import time
import random
import os
from datetime import datetime

from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db


class Scraper(BaseScraper):
    """Retry scraper that attempts to re-process failed papers with different strategies.

    Simulates a second-chance scrape for papers whose first attempt failed:
    it sleeps to mimic work, flips a weighted coin for the outcome, and on
    "success" writes a dummy PDF into the configured download directory.
    """

    # This scraper specifically targets "Failed" papers and retries them
    INPUT_STATUSES = ["Failed"]
    OUTPUT_STATUS_SUCCESS = "Done"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "Retrying"

    def scrape(self, doi: str) -> ScrapeResult:
        """Retry scraping a failed paper with enhanced error handling.

        Args:
            doi: DOI of the paper to retry; must match a ``PaperMetadata`` row.

        Returns:
            ScrapeResult with status "success" (dummy file written) or
            "error" (no such paper, file-write failure, or simulated retry
            failure). Never raises; all errors are folded into the result.
        """
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                # NOTE(review): datetime.utcnow() is deprecated since 3.12;
                # kept (naive UTC) for consistency with the rest of the app.
                timestamp=datetime.utcnow()
            )

        # Log retry attempt
        ActivityLog.log_scraper_activity(
            action="retry_failed_paper",
            status="info",
            description=f"Retrying failed paper: {paper.title}",
            paper_id=paper.id
        )

        # Simulate longer processing time for retry (2-5 seconds)
        processing_time = random.uniform(2, 5)
        time.sleep(processing_time)

        # Simulate 60% success rate on retry (lower than initial attempt)
        success = random.random() < 0.6

        if success:
            result = self._retry_success(doi, paper, start_time)
        else:
            result = self._retry_failure(doi, paper, start_time)
        return result

    def _retry_success(self, doi: str, paper, start_time: float) -> ScrapeResult:
        """Handle the simulated-success branch: write a dummy PDF and log it."""
        download_path = DownloadPathConfig.get_path()
        file_name = f"{doi.replace('/', '_')}_retry.pdf"
        # Use os.path.join instead of string concatenation so the path is
        # correct on every platform (the original hard-coded "/").
        file_path = os.path.join(download_path, file_name)

        try:
            # Ensure directory exists
            os.makedirs(download_path, exist_ok=True)

            # Create a dummy PDF file
            with open(file_path, 'w') as f:
                f.write(f"Dummy PDF content for retry of {doi}")

            result_data = {"file_path": file_path}

            # Log success
            ActivityLog.log_scraper_activity(
                action="retry_scrape_success",
                status="success",
                description=f"Successfully retried {doi} on second attempt",
                paper_id=paper.id
            )

            return ScrapeResult(
                status="success",
                message=f"Successfully retried paper {doi}",
                data=result_data,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )
        except Exception as e:
            # File-system failure during the dummy write: log and fold into
            # an error result rather than propagating.
            error_msg = f"Failed to save retry file: {str(e)}"
            ActivityLog.log_scraper_activity(
                action="retry_scrape_file_error",
                status="error",
                description=error_msg,
                paper_id=paper.id
            )
            return ScrapeResult(
                status="error",
                message=error_msg,
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

    def _retry_failure(self, doi: str, paper, start_time: float) -> ScrapeResult:
        """Handle the simulated-failure branch: pick a canned error and log it."""
        error_messages = [
            "Retry failed: Still no access to publisher",
            "Retry failed: Alternative download methods exhausted",
            "Retry failed: DOI appears permanently inaccessible",
            "Retry failed: Network timeout persists"
        ]
        error_msg = random.choice(error_messages)

        ActivityLog.log_scraper_activity(
            action="retry_scrape_failure",
            status="error",
            description=f"Retry failed for {doi}: {error_msg}",
            paper_id=paper.id
        )

        return ScrapeResult(
            status="error",
            message=error_msg,
            data=None,
            duration=time.time() - start_time,
            timestamp=datetime.utcnow()
        )