import os
import random
import time
from datetime import datetime

from .base import BaseScraper, ScrapeResult
from ..db import db
from ..models import ActivityLog, DownloadPathConfig, PaperMetadata
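
# This module constructs `ScrapeResult` (imported from .base) as a record with
# the fields used below; a minimal sketch of the assumed shape, for reference
# (the real definition lives in .base and may differ):
#
#     @dataclass
#     class ScrapeResult:
#         status: str          # "success" or "error"
#         message: str
#         data: dict | None    # extra context, e.g. {"error_code": ...}
#         duration: float      # seconds elapsed
#         timestamp: datetime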


class Scraper(BaseScraper):
    """Dummy scraper for testing purposes that simulates paper downloading."""

    # This scraper processes "New" papers and outputs "Done"/"Failed";
    # "Pending" marks papers that are currently being processed.
    INPUT_STATUSES = ["New"]
    OUTPUT_STATUS_SUCCESS = "Done"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "Pending"
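
    # A minimal sketch of how a consumer might select work based on these
    # constants (illustrative only; the real scheduling query lives outside
    # this module):
    #
    #     papers = PaperMetadata.query.filter(
    #         PaperMetadata.status.in_(Scraper.INPUT_STATUSES)
    #     ).all()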

    def scrape(self, doi: str) -> ScrapeResult:
        """Simulate scraping a paper with realistic timing and random success/failure."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )

        # Log start of scraping
        self.log_scrape_start(doi, paper.id)

        # Simulate processing time (1-3 seconds)
        processing_time = random.uniform(1, 3)
        time.sleep(processing_time)

        # Simulate an 80% success rate
        success = random.random() < 0.8

        if success:
            # Get the configured download path and build the target file path
            download_path = DownloadPathConfig.get_path()
            file_name = f"{doi.replace('/', '_')}.pdf"
            file_path = os.path.join(download_path, file_name)

            # Create the download directory if it doesn't exist yet
            if not os.path.exists(download_path):
                try:
                    os.makedirs(download_path, exist_ok=True)
                except OSError as e:
                    error_msg = f"Failed to create download directory: {e}"
                    paper.status = "Failed"
                    paper.error_msg = error_msg

                    ActivityLog.log_scraper_activity(
                        action="dummy_scrape_path_error",
                        status="error",
                        description=error_msg,
                        paper_id=paper.id,
                    )

                    # Persist the "Failed" status before returning early
                    db.session.commit()
                    return ScrapeResult(
                        status="error",
                        message=error_msg,
                        data={"error_code": "path_creation_error"},
                        duration=time.time() - start_time,
                        timestamp=datetime.utcnow(),
                    )

            # Check that the download path is readable
            if not os.access(download_path, os.R_OK):
                error_msg = f"Download path '{download_path}' is not readable"
                paper.status = "Failed"
                paper.error_msg = error_msg

                ActivityLog.log_scraper_activity(
                    action="dummy_scrape_path_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id,
                )

                # Persist the "Failed" status before returning early
                db.session.commit()
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "path_read_error"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow(),
                )

            # Check that the download path is writable
            if not os.access(download_path, os.W_OK):
                error_msg = f"Download path '{download_path}' is not writable"
                paper.status = "Failed"
                paper.error_msg = error_msg

                ActivityLog.log_scraper_activity(
                    action="dummy_scrape_path_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id,
                )

                # Persist the "Failed" status before returning early
                db.session.commit()
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "path_write_error"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow(),
                )

            # Create a simple dummy PDF file (a plain-text stand-in)
            try:
                with open(file_path, "w") as f:
                    f.write(f"Dummy PDF file for paper with DOI: {doi}\n")
                    f.write(f"Title: {paper.title}\n")
                    f.write(f"Journal: {paper.journal}\n")
                    f.write(f"Generated: {datetime.utcnow().isoformat()}\n")
                    f.write("\nThis is a dummy file created by the SciPaperLoader dummy scraper.\n")

                # Update paper status
                paper.status = "Done"
                paper.file_path = file_path
                paper.error_msg = None
            except Exception as e:
                # Handle file creation errors
                error_msg = f"Failed to create dummy file: {e}"
                paper.status = "Failed"
                paper.error_msg = error_msg

                ActivityLog.log_scraper_activity(
                    action="dummy_scrape_file_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id,
                )

                # Persist the "Failed" status before returning early
                db.session.commit()
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "file_creation_error"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow(),
                )

            # Log success
            self.log_scrape_success(doi, f"Successfully scraped {doi}", paper.id)

            result = ScrapeResult(
                status="success",
                message=f"Successfully scraped {doi}",
                data={
                    "file_path": file_path,
                    "title": paper.title,
                    "journal": paper.journal,
                },
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )
        else:
            # Simulate a failure with a randomly chosen error message
            error_messages = [
                "Paper not found in database",
                "Access denied by publisher",
                "Rate limit exceeded",
                "Network timeout",
                "Invalid DOI format",
            ]
            error_msg = random.choice(error_messages)

            paper.status = "Failed"
            paper.error_msg = error_msg

            # Log failure
            self.log_scrape_failure(doi, error_msg, paper.id)

            result = ScrapeResult(
                status="error",
                message=f"Failed to scrape {doi}: {error_msg}",
                data={"error_code": "dummy_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )

        # Persist the final paper status
        db.session.commit()
        return result
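

# Example usage (an illustrative sketch, not part of the module's API):
# `create_app` is assumed to be the package's application factory; adjust the
# import to the project's actual entry point. A Flask application context is
# required because the scraper queries the database.
#
#     from scipaperloader import create_app
#
#     app = create_app()
#     with app.app_context():
#         result = Scraper().scrape("10.1000/example")
#         print(result.status, result.message)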