from abc import ABC, abstractmethod
|
|
from typing import NamedTuple, Optional, Dict, List
|
|
from datetime import datetime
|
|
|
|
class ScrapeResult(NamedTuple):
    """Outcome of a single scraper run.

    Attributes:
        status: One of "success", "error", or "skipped".
        message: Human-readable status description.
        data: Any extra payload (file_path, metadata, etc.); defaults to
            None so minimal results can be built as ScrapeResult(status, message).
        duration: Processing time in seconds, if measured.
        timestamp: When the operation completed, if recorded.
    """

    status: str  # "success", "error", "skipped"
    message: str  # human-readable status
    data: Optional[Dict] = None  # any extra payload (file_path, metadata, etc.)
    duration: Optional[float] = None  # processing time in seconds
    timestamp: Optional[datetime] = None  # when the operation completed
|
|
|
|
class BaseScraper(ABC):
    """Base class for all scraper implementations.

    Subclasses implement :meth:`scrape` and may override the class-level
    status constants to plug into different paper-processing pipelines.
    """

    # Default input/output statuses - can be overridden by subclasses
    INPUT_STATUSES = ["New"]  # Which paper statuses this scraper will process
    OUTPUT_STATUS_SUCCESS = "Done"  # Status to set on successful scraping
    OUTPUT_STATUS_FAILURE = "Failed"  # Status to set on failed scraping
    OUTPUT_STATUS_PROCESSING = "Pending"  # Status to set while processing

    def __init__(self) -> None:
        """Initialize the scraper, caching the lowercase name used in log actions."""
        self.scraper_name = self.get_name().lower()

    def _log(self, event: str, status: str, description: str,
             paper_id: Optional[int] = None) -> None:
        """Record one scraper event via ActivityLog (shared by the log_* methods).

        The import is deferred to call time, matching the original code —
        presumably to avoid a circular import with the models module.
        """
        from ..models import ActivityLog

        ActivityLog.log_scraper_activity(
            action=f"{self.scraper_name}_{event}",
            status=status,
            description=description,
            paper_id=paper_id,
        )

    def log_scrape_start(self, doi: str, paper_id: Optional[int] = None) -> None:
        """Log the start of a scraping operation."""
        self._log(
            "scrape_start",
            "info",
            f"Starting {self.get_name()} for DOI: {doi}",
            paper_id,
        )

    def log_scrape_success(self, doi: str, message: str, paper_id: Optional[int] = None) -> None:
        """Log successful completion of scraping."""
        self._log(
            "scrape_success",
            "success",
            f"{self.get_name()} completed successfully for DOI: {doi} - {message}",
            paper_id,
        )

    def log_scrape_failure(self, doi: str, message: str, paper_id: Optional[int] = None) -> None:
        """Log failed scraping operation."""
        self._log(
            "scrape_failure",
            "error",
            f"{self.get_name()} failed for DOI: {doi} - {message}",
            paper_id,
        )

    @abstractmethod
    def scrape(self, doi: str) -> "ScrapeResult":
        """
        Fetch metadata and/or download paper for the given DOI.

        Args:
            doi: The DOI of the paper to scrape

        Returns:
            ScrapeResult with status, message, and optional data
        """

    def get_name(self) -> str:
        """Return the name of this scraper (the concrete class name)."""
        return self.__class__.__name__

    def get_description(self) -> str:
        """Return a description of this scraper.

        Note: ``__doc__`` always exists on a class (it is None when no
        docstring is written), so the previous ``getattr(..., default)``
        could never fall back — undocumented subclasses returned None.
        """
        return self.__class__.__doc__ or "No description available"

    def get_input_statuses(self) -> List[str]:
        """Return the paper statuses this scraper can process.

        A copy is returned so callers cannot mutate the shared
        class-level INPUT_STATUSES list.
        """
        return list(self.INPUT_STATUSES)

    def get_output_statuses(self) -> Dict[str, str]:
        """Return mapping of result types ("success"/"failure"/"processing")
        to the paper statuses this scraper sets."""
        return {
            "success": self.OUTPUT_STATUS_SUCCESS,
            "failure": self.OUTPUT_STATUS_FAILURE,
            "processing": self.OUTPUT_STATUS_PROCESSING,
        }
|