from abc import ABC, abstractmethod
|
|
from typing import NamedTuple, Optional, Dict, List
|
|
from datetime import datetime
|
|
|
|
class ScrapeResult(NamedTuple):
    """Outcome of a single scraper run.

    Attributes:
        status: One of "success", "error", or "skipped".
        message: Human-readable status description.
        data: Any extra payload (file_path, metadata, etc.); defaults to
            None so minimal results can be built as ScrapeResult(status, message).
        duration: Processing time in seconds, if measured.
        timestamp: When the operation completed, if recorded.
    """

    status: str  # "success", "error", "skipped"
    message: str  # human-readable status
    data: Optional[Dict] = None  # any extra payload (file_path, metadata, etc.)
    duration: Optional[float] = None  # processing time in seconds
    timestamp: Optional[datetime] = None  # when the operation completed
|
|
|
|
class BaseScraper(ABC):
    """Base class for all scraper implementations.

    Subclasses implement :meth:`scrape` and may override the class-level
    status constants to plug into different paper-processing pipelines.
    """

    # Default input/output statuses - can be overridden by subclasses
    INPUT_STATUSES = ["New"]  # Which paper statuses this scraper will process
    OUTPUT_STATUS_SUCCESS = "Done"  # Status to set on successful scraping
    OUTPUT_STATUS_FAILURE = "Failed"  # Status to set on failed scraping
    OUTPUT_STATUS_PROCESSING = "Pending"  # Status to set while processing

    def __init__(self) -> None:
        """Initialize the scraper, caching the lowercase name used in log actions."""
        self.scraper_name = self.get_name().lower()

    def _log(self, event: str, status: str, description: str,
             paper_id: Optional[int] = None) -> None:
        """Record one scraper event via ActivityLog (shared by the log_* methods).

        The import is deferred to call time, matching the original code —
        presumably to avoid a circular import with the models module.
        """
        from ..models import ActivityLog

        ActivityLog.log_scraper_activity(
            action=f"{self.scraper_name}_{event}",
            status=status,
            description=description,
            paper_id=paper_id,
        )

    def log_scrape_start(self, doi: str, paper_id: Optional[int] = None) -> None:
        """Log the start of a scraping operation."""
        self._log(
            "scrape_start",
            "info",
            f"Starting {self.get_name()} for DOI: {doi}",
            paper_id,
        )

    def log_scrape_success(self, doi: str, message: str, paper_id: Optional[int] = None) -> None:
        """Log successful completion of scraping."""
        self._log(
            "scrape_success",
            "success",
            f"{self.get_name()} completed successfully for DOI: {doi} - {message}",
            paper_id,
        )

    def log_scrape_failure(self, doi: str, message: str, paper_id: Optional[int] = None) -> None:
        """Log failed scraping operation."""
        self._log(
            "scrape_failure",
            "error",
            f"{self.get_name()} failed for DOI: {doi} - {message}",
            paper_id,
        )

    @abstractmethod
    def scrape(self, doi: str) -> "ScrapeResult":
        """
        Fetch metadata and/or download paper for the given DOI.

        Args:
            doi: The DOI of the paper to scrape

        Returns:
            ScrapeResult with status, message, and optional data
        """

    def get_name(self) -> str:
        """Return the name of this scraper (the concrete class name)."""
        return self.__class__.__name__

    def get_description(self) -> str:
        """Return a description of this scraper.

        Note: ``__doc__`` always exists on a class (it is None when no
        docstring is written), so the previous ``getattr(..., default)``
        could never fall back — undocumented subclasses returned None.
        """
        return self.__class__.__doc__ or "No description available"

    def get_input_statuses(self) -> List[str]:
        """Return the paper statuses this scraper can process.

        A copy is returned so callers cannot mutate the shared
        class-level INPUT_STATUSES list.
        """
        return list(self.INPUT_STATUSES)

    def get_output_statuses(self) -> Dict[str, str]:
        """Return mapping of result types ("success"/"failure"/"processing")
        to the paper statuses this scraper sets."""
        return {
            "success": self.OUTPUT_STATUS_SUCCESS,
            "failure": self.OUTPUT_STATUS_FAILURE,
            "processing": self.OUTPUT_STATUS_PROCESSING,
        }
|