# Retry scraper module (~124 lines, 4.4 KiB, Python)

import time
import random
import os
from datetime import datetime
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
class Scraper(BaseScraper):
    """Retry scraper that attempts to re-process failed papers with different strategies."""

    # This scraper specifically targets "Failed" papers and retries them
    INPUT_STATUSES = ["Failed"]
    OUTPUT_STATUS_SUCCESS = "Done"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "Retrying"

    def scrape(self, doi: str) -> ScrapeResult:
        """Retry scraping a previously failed paper with enhanced error handling.

        Looks up the paper by DOI, logs the retry, simulates a longer retry
        delay (2-5 s) with a 60% simulated success rate, and on success writes
        a dummy PDF into the configured download directory.

        Args:
            doi: DOI of the previously failed paper to retry.

        Returns:
            ScrapeResult with status "success" (data contains "file_path")
            or "error" (data is None), plus duration and timestamp.
        """
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            # Guard clause: nothing to retry if the DOI is unknown.
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Log retry attempt
        ActivityLog.log_scraper_activity(
            action="retry_failed_paper",
            status="info",
            description=f"Retrying failed paper: {paper.title}",
            paper_id=paper.id
        )

        # Simulate longer processing time for retry (2-5 seconds)
        processing_time = random.uniform(2, 5)
        time.sleep(processing_time)

        # Simulate 60% success rate on retry (lower than initial attempt)
        success = random.random() < 0.6

        if success:
            # Get download path and create dummy file
            download_path = DownloadPathConfig.get_path()
            file_name = f"{doi.replace('/', '_')}_retry.pdf"
            # os.path.join is portable across platforms (was a hard-coded "/" f-string).
            file_path = os.path.join(download_path, file_name)
            try:
                # Ensure directory exists
                os.makedirs(download_path, exist_ok=True)

                # Create a dummy PDF file
                with open(file_path, 'w') as f:
                    f.write(f"Dummy PDF content for retry of {doi}")

                # Log success
                ActivityLog.log_scraper_activity(
                    action="retry_scrape_success",
                    status="success",
                    description=f"Successfully retried {doi} on second attempt",
                    paper_id=paper.id
                )

                result = ScrapeResult(
                    status="success",
                    message=f"Successfully retried paper {doi}",
                    data={"file_path": file_path},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )
            except Exception as e:
                # File could not be written; surface the OS error in the result.
                error_msg = f"Failed to save retry file: {str(e)}"
                ActivityLog.log_scraper_activity(
                    action="retry_scrape_file_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id
                )
                result = ScrapeResult(
                    status="error",
                    message=error_msg,
                    data=None,
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )
        else:
            # Retry failed - generate different error message
            error_messages = [
                "Retry failed: Still no access to publisher",
                "Retry failed: Alternative download methods exhausted",
                "Retry failed: DOI appears permanently inaccessible",
                "Retry failed: Network timeout persists"
            ]
            error_msg = random.choice(error_messages)
            ActivityLog.log_scraper_activity(
                action="retry_scrape_failure",
                status="error",
                description=f"Retry failed for {doi}: {error_msg}",
                paper_id=paper.id
            )
            result = ScrapeResult(
                status="error",
                message=error_msg,
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        return result