import os
import random
import time
from datetime import datetime

from flask import current_app

from .base import BaseScraper, ScrapeResult
from ..db import db
from ..models import ActivityLog, DownloadPathConfig, PaperMetadata


class Scraper(BaseScraper):
    """Retry scraper that attempts to re-process failed papers with different strategies."""

    # This scraper specifically targets papers marked "Failed" and retries them.
    INPUT_STATUSES = ["Failed"]
    OUTPUT_STATUS_SUCCESS = "Done"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "Retrying"
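
    # NOTE (assumption): the handling of the constants above lives in
    # BaseScraper, which this file does not show. The usual pattern would be
    # for the runner to select papers whose status is in INPUT_STATUSES, mark
    # them OUTPUT_STATUS_PROCESSING while scrape() runs, then persist
    # OUTPUT_STATUS_SUCCESS or OUTPUT_STATUS_FAILURE from the ScrapeResult.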
    def scrape(self, doi: str) -> ScrapeResult:
        """Retry scraping a failed paper with enhanced error handling."""
        start_time = time.time()
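
        # Look up the paper record for this DOI; without it there is nothing to retry.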
        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Log the retry attempt before any work is done.
        ActivityLog.log_scraper_activity(
            action="retry_failed_paper",
            status="info",
            description=f"Retrying failed paper: {paper.title}",
            paper_id=paper.id
        )

        # Simulate longer processing time for a retry (2-5 seconds).
        processing_time = random.uniform(2, 5)
        time.sleep(processing_time)

        # Simulate a 60% success rate on retry (lower than the initial attempt).
        success = random.random() < 0.6

        result_data = {}
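
        # (Assumption about intent: the sleep and coin flip above stand in for
        # real work; a production retry would attempt alternative download
        # strategies here instead.)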
        if success:
            # Get the download path and create a dummy file.
            download_path = DownloadPathConfig.get_path()
            file_name = f"{doi.replace('/', '_')}_retry.pdf"
            # os.path.join keeps the path portable across platforms.
            file_path = os.path.join(download_path, file_name)

            try:
                # Ensure the download directory exists.
                os.makedirs(download_path, exist_ok=True)

                # Create a dummy PDF file in place of a real download.
                with open(file_path, 'w') as f:
                    f.write(f"Dummy PDF content for retry of {doi}")

                result_data = {"file_path": file_path}

                # Log success.
                ActivityLog.log_scraper_activity(
                    action="retry_scrape_success",
                    status="success",
                    description=f"Successfully retried {doi} after a failed attempt",
                    paper_id=paper.id
                )

                result = ScrapeResult(
                    status="success",
                    message=f"Successfully retried paper {doi}",
                    data=result_data,
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            except Exception as e:
                error_msg = f"Failed to save retry file: {e}"
                ActivityLog.log_scraper_activity(
                    action="retry_scrape_file_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id
                )

                result = ScrapeResult(
                    status="error",
                    message=error_msg,
                    data=None,
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )
        else:
            # Retry failed: pick one of several simulated error messages.
            error_messages = [
                "Retry failed: Still no access to publisher",
                "Retry failed: Alternative download methods exhausted",
                "Retry failed: DOI appears permanently inaccessible",
                "Retry failed: Network timeout persists"
            ]
            error_msg = random.choice(error_messages)

            ActivityLog.log_scraper_activity(
                action="retry_scrape_failure",
                status="error",
                description=f"{error_msg} (DOI: {doi})",
                paper_id=paper.id
            )

            result = ScrapeResult(
                status="error",
                message=error_msg,
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        return result
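

# Usage sketch (illustrative only): how this scraper might be driven manually.
# `create_app` and the import paths below are assumptions, not names confirmed
# by this file; substitute the project's real app factory and module path. The
# DB queries in scrape() require an active Flask application context.
#
#     from myapp import create_app                  # hypothetical factory
#     from myapp.scrapers.retry import Scraper      # hypothetical module path
#
#     app = create_app()
#     with app.app_context():
#         result = Scraper().scrape("10.1234/example-doi")
#         print(result.status, result.message)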