from abc import ABC, abstractmethod
from typing import NamedTuple, Optional, Dict, List
from datetime import datetime


class ScrapeResult(NamedTuple):
    """Outcome of a single scrape operation."""

    # One of "success", "error", "skipped".
    status: str
    # Human-readable status message.
    message: str
    # Any extra payload (file_path, metadata, etc.); may be None.
    data: Optional[Dict]
    # Processing time in seconds.
    duration: Optional[float] = None
    # When the operation completed.
    timestamp: Optional[datetime] = None


class BaseScraper(ABC):
    """Base class for all scraper implementations."""

    # Default input/output statuses - can be overridden by subclasses.
    # Which paper statuses this scraper will process.
    INPUT_STATUSES = ["New"]
    # Status to set on successful scraping.
    OUTPUT_STATUS_SUCCESS = "Done"
    # Status to set on failed scraping.
    OUTPUT_STATUS_FAILURE = "Failed"
    # Status to set while processing.
    OUTPUT_STATUS_PROCESSING = "Pending"

    @abstractmethod
    def scrape(self, doi: str) -> ScrapeResult:
        """
        Fetch metadata and/or download paper for the given DOI.

        Args:
            doi: The DOI of the paper to scrape

        Returns:
            ScrapeResult with status, message, and optional data
        """

    def get_name(self) -> str:
        """Return the name of this scraper (its concrete class name)."""
        return self.__class__.__name__

    def get_description(self) -> str:
        """Return a description of this scraper.

        The description is the concrete class's own docstring. When the
        class has no docstring (or an empty one), the fixed fallback
        "No description available" is returned.
        """
        # A class always has a ``__doc__`` attribute; it is simply ``None``
        # when no docstring was written, so a ``getattr`` default would never
        # be used. Fall back explicitly so the return value is always a str.
        return self.__class__.__doc__ or "No description available"

    def get_input_statuses(self) -> List[str]:
        """Return list of paper statuses this scraper can process."""
        # Hand out a copy so callers mutating the result cannot alter the
        # class-level INPUT_STATUSES shared by every instance.
        return list(self.INPUT_STATUSES)

    def get_output_statuses(self) -> Dict[str, str]:
        """Return mapping of result types to output statuses.

        Keys are "success", "failure" and "processing"; values are the
        corresponding OUTPUT_STATUS_* attributes of this scraper.
        """
        return {
            "success": self.OUTPUT_STATUS_SUCCESS,
            "failure": self.OUTPUT_STATUS_FAILURE,
            "processing": self.OUTPUT_STATUS_PROCESSING,
        }