104 lines
3.7 KiB
Python

import time
import random
import os
from datetime import datetime
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
class Scraper(BaseScraper):
    """Retry scraper that re-processes papers previously marked "Failed".

    Simulates a second-chance download attempt: a paper in the "Failed"
    state is retried with a 60% simulated success rate; on success a dummy
    PDF file is written into the configured download directory.
    """

    # This scraper specifically targets "Failed" papers and retries them
    INPUT_STATUSES = ["Failed"]
    OUTPUT_STATUS_SUCCESS = "Done"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "Retrying"

    def scrape(self, doi: str) -> ScrapeResult:
        """Retry scraping a failed paper with enhanced error handling.

        Args:
            doi: DOI of the paper to retry.

        Returns:
            ScrapeResult with status "success" (``data`` holds the written
            file path) or "error" (``data`` is ``None``). The ``duration``
            field always reflects wall-clock time spent in this call.
        """
        start_time = time.time()

        # Guard clause: nothing to retry if the DOI is unknown.
        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )

        # Log start of retry
        self.log_scrape_start(doi, paper.id)

        # Simulate longer processing time for retry (2-5 seconds)
        time.sleep(random.uniform(2, 5))

        # Simulate 60% success rate on retry (lower than initial attempt)
        if random.random() < 0.6:
            # Get download path and create dummy file. os.path.join replaces
            # the previous manual "/" concatenation so the path is built
            # correctly on every platform.
            download_path = DownloadPathConfig.get_path()
            file_name = f"{doi.replace('/', '_')}_retry.pdf"
            file_path = os.path.join(download_path, file_name)
            try:
                # Ensure directory exists
                os.makedirs(download_path, exist_ok=True)
                # Create a dummy PDF file
                with open(file_path, "w") as f:
                    f.write(f"Dummy PDF content for retry of {doi}")

                # Log success
                self.log_scrape_success(
                    doi, f"Successfully retried {doi} on second attempt", paper.id
                )
                return ScrapeResult(
                    status="success",
                    message=f"Successfully retried paper {doi}",
                    data={"file_path": file_path},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow(),
                )
            except Exception as e:
                # Broad catch kept deliberately: any failure while writing or
                # logging is reported as a scrape error, never propagated.
                error_msg = f"Failed to save retry file: {str(e)}"
                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data=None,
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow(),
                )

        # Retry failed - generate different error message
        error_messages = [
            "Retry failed: Still no access to publisher",
            "Retry failed: Alternative download methods exhausted",
            "Retry failed: DOI appears permanently inaccessible",
            "Retry failed: Network timeout persists",
        ]
        error_msg = random.choice(error_messages)
        self.log_scrape_failure(doi, error_msg, paper.id)
        return ScrapeResult(
            status="error",
            message=error_msg,
            data=None,
            duration=time.time() - start_time,
            timestamp=datetime.utcnow(),
        )