import time
import random
from datetime import datetime
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db


class Scraper(BaseScraper):
    """Dummy scraper for testing purposes that simulates paper downloading."""

    # Simulated failure reasons; one is picked at random on the ~20% error path.
    _ERROR_MESSAGES = (
        "Paper not found in database",
        "Access denied by publisher",
        "Rate limit exceeded",
        "Network timeout",
        "Invalid DOI format",
    )

    def scrape(self, doi: str) -> ScrapeResult:
        """Simulate scraping a paper with realistic timing and random success/failure.

        Looks up the paper by DOI, sleeps 1-3 seconds to mimic work, then
        succeeds ~80% of the time. Success marks the paper "Done" and records
        a fake file path; failure marks it "Failed" with a random error
        message. Either outcome is logged via ActivityLog and the session is
        committed.

        Args:
            doi: DOI identifying the paper in PaperMetadata.

        Returns:
            A ScrapeResult carrying status, message, optional data payload,
            elapsed duration, and a timestamp.
        """
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            # No row to update or log against - report the error immediately
            # without committing anything.
            return self._make_result(
                start_time,
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
            )

        # Simulate processing time (1-3 seconds)
        time.sleep(random.uniform(1, 3))

        # Simulate 80% success rate
        if random.random() < 0.8:
            result = self._simulate_success(paper, doi, start_time)
        else:
            result = self._simulate_failure(paper, doi, start_time)

        # Persist the status/file_path/error_msg changes made above.
        db.session.commit()
        return result

    def _simulate_success(self, paper, doi: str, start_time: float) -> ScrapeResult:
        """Mark *paper* as downloaded, log the success, and build the result."""
        # Get download path and simulate file creation (no file is actually
        # written - this is a dummy scraper).
        download_path = DownloadPathConfig.get_path()
        file_name = f"{doi.replace('/', '_')}.pdf"
        file_path = f"{download_path}/{file_name}"

        # Update paper status
        paper.status = "Done"
        paper.file_path = file_path
        paper.error_msg = None

        # Log success
        ActivityLog.log_scraper_activity(
            action="dummy_scrape",
            status="success",
            description=f"Successfully scraped {doi}",
            paper_id=paper.id,
        )

        return self._make_result(
            start_time,
            status="success",
            message=f"Successfully scraped {doi}",
            data={
                "file_path": file_path,
                "title": paper.title,
                "journal": paper.journal,
            },
        )

    def _simulate_failure(self, paper, doi: str, start_time: float) -> ScrapeResult:
        """Mark *paper* as failed with a random error, log it, and build the result."""
        error_msg = random.choice(self._ERROR_MESSAGES)

        paper.status = "Failed"
        paper.error_msg = error_msg

        # Log failure
        ActivityLog.log_scraper_activity(
            action="dummy_scrape",
            status="error",
            description=f"Failed to scrape {doi}: {error_msg}",
            paper_id=paper.id,
        )

        return self._make_result(
            start_time,
            status="error",
            message=f"Failed to scrape {doi}: {error_msg}",
            data={"error_code": "dummy_error"},
        )

    @staticmethod
    def _make_result(start_time: float, *, status: str, message: str, data) -> ScrapeResult:
        """Build a ScrapeResult, stamping duration and timestamp uniformly."""
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # kept naive here for consistency with existing callers - consider a
        # project-wide move to datetime.now(timezone.utc).
        return ScrapeResult(
            status=status,
            message=message,
            data=data,
            duration=time.time() - start_time,
            timestamp=datetime.utcnow(),
        )
|