from abc import ABC, abstractmethod
from typing import NamedTuple, Optional, Dict, List
from datetime import datetime


class ScrapeResult(NamedTuple):
    status: str                           # "success", "error", "skipped"
    message: str                          # human-readable status
    data: Optional[Dict]                  # any extra payload (file_path, metadata, etc.)
    duration: Optional[float] = None      # processing time in seconds
    timestamp: Optional[datetime] = None  # when the operation completed


class BaseScraper(ABC):
    """Base class for all scraper implementations."""

    # Default input/output statuses - can be overridden by subclasses
    INPUT_STATUSES = ["New"]              # Which paper statuses this scraper will process
    OUTPUT_STATUS_SUCCESS = "Done"        # Status to set on successful scraping
    OUTPUT_STATUS_FAILURE = "Failed"      # Status to set on failed scraping
    OUTPUT_STATUS_PROCESSING = "Pending"  # Status to set while processing

    @abstractmethod
    def scrape(self, doi: str) -> ScrapeResult:
        """
        Fetch metadata and/or download the paper for the given DOI.

        Args:
            doi: The DOI of the paper to scrape

        Returns:
            ScrapeResult with status, message, and optional data
        """
        pass

    def get_name(self) -> str:
        """Return the name of this scraper."""
        return self.__class__.__name__

    def get_description(self) -> str:
        """Return a description of this scraper."""
        # Every class has a __doc__ attribute (possibly None), so getattr's
        # default would never apply; fall back explicitly instead.
        return self.__class__.__doc__ or "No description available"

    def get_input_statuses(self) -> List[str]:
        """Return list of paper statuses this scraper can process."""
        return self.INPUT_STATUSES

    def get_output_statuses(self) -> Dict[str, str]:
        """Return mapping of result types to output statuses."""
        return {
            "success": self.OUTPUT_STATUS_SUCCESS,
            "failure": self.OUTPUT_STATUS_FAILURE,
            "processing": self.OUTPUT_STATUS_PROCESSING,
        }
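

# --- Usage sketch (illustrative only) ---------------------------------------
# ExampleMetadataScraper below is an assumption for demonstration, not part of
# the original module: it shows the expected subclassing pattern and how a
# caller maps a ScrapeResult back to a paper status.
class ExampleMetadataScraper(BaseScraper):
    """Illustrative scraper that pretends to fetch metadata for a DOI."""

    # Override the default: also pick up papers marked for retry.
    INPUT_STATUSES = ["New", "Retry"]

    def scrape(self, doi: str) -> ScrapeResult:
        start = datetime.now()
        if not doi.startswith("10."):
            return ScrapeResult(
                status="error",
                message=f"Not a valid DOI: {doi}",
                data=None,
                timestamp=datetime.now(),
            )
        # A real implementation would call an external API or download a PDF here.
        return ScrapeResult(
            status="success",
            message=f"Fetched metadata for {doi}",
            data={"metadata": {"doi": doi}},
            duration=(datetime.now() - start).total_seconds(),
            timestamp=datetime.now(),
        )


if __name__ == "__main__":
    scraper = ExampleMetadataScraper()
    result = scraper.scrape("10.1000/example")
    outcome = "success" if result.status == "success" else "failure"
    print(f"{scraper.get_name()}: {result.message} "
          f"-> paper status '{scraper.get_output_statuses()[outcome]}'")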