from abc import ABC, abstractmethod
from typing import Dict, Optional, List
from dataclasses import dataclass


@dataclass
class ParsedContent:
    """Container for parsed content from a publisher's HTML.

    Only ``full_text`` is required; every other field defaults to ``None``.

    Attributes:
        full_text: The extracted full text.
        title: Title, if extracted.
        abstract: Abstract, if extracted.
        authors: Author names, if extracted.
        keywords: Keywords, if extracted.
        sections: Mapping of section title to section content.
        references: Reference entries, if extracted.
        doi: DOI, if known.
        journal: Journal name, if extracted.
        publication_date: Publication date as a string (format is not
            constrained by this class).
        metadata: Additional metadata specific to the publisher.
    """

    full_text: str
    title: Optional[str] = None
    abstract: Optional[str] = None
    authors: Optional[List[str]] = None
    keywords: Optional[List[str]] = None
    sections: Optional[Dict[str, str]] = None  # section_title -> section_content
    references: Optional[List[str]] = None
    doi: Optional[str] = None
    journal: Optional[str] = None
    publication_date: Optional[str] = None
    metadata: Optional[Dict] = None  # Additional metadata specific to publisher


class BaseParser(ABC):
    """Base class for all publisher-specific parsers.

    Subclasses must implement :meth:`can_parse` and :meth:`parse`.
    """

    def __init__(self):
        # Derive a short name from the class name: lowercase it and drop
        # every occurrence of "parser" (e.g. ``ElsevierParser`` -> "elsevier").
        self.parser_name = self.__class__.__name__.lower().replace('parser', '')

    @abstractmethod
    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """
        Check if this parser can handle the given HTML content.

        Args:
            html_content: The HTML content to check
            url: Optional URL of the content (for additional context)

        Returns:
            True if this parser can handle the content, False otherwise
        """

    @abstractmethod
    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """
        Parse HTML content and extract structured information.

        Args:
            html_content: The HTML content to parse
            doi: Optional DOI of the paper

        Returns:
            ParsedContent object with extracted information

        Raises:
            ParseError: If parsing fails
        """

    def get_name(self) -> str:
        """Return the name of this parser (set in ``__init__``)."""
        return self.parser_name

    def get_description(self) -> str:
        """
        Return a description of this parser.

        Returns:
            The concrete class's own docstring, or
            "No description available" when the class has none.
        """
        # A class always has a ``__doc__`` attribute; it is None when no
        # docstring was written (docstrings are not inherited), so a
        # getattr() default would never be used. Fall back explicitly.
        return type(self).__doc__ or "No description available"

    def validate_content(self, content: ParsedContent) -> bool:
        """
        Validate the parsed content to ensure it meets minimum requirements.

        Args:
            content: The parsed content to validate

        Returns:
            True if content is valid, False otherwise
        """
        # Basic validation - must have at least 100 characters of full text
        # once surrounding whitespace is stripped.
        text = content.full_text
        if not text:
            return False
        return len(text.strip()) >= 100


class ParseError(Exception):
    """Exception raised when parsing fails."""