# SciPaperLoader/scipaperloader/parsers/base_parser.py

from abc import ABC, abstractmethod
from typing import Dict, Optional, List
from dataclasses import dataclass

@dataclass
class ParsedContent:
    """Container for parsed content from a publisher's HTML.

    Only ``full_text`` is required when constructing an instance; every
    other field is optional and defaults to ``None`` when a parser could
    not (or did not try to) extract it.
    """
    # Required: the extracted body text of the document.
    full_text: str
    # Optional bibliographic fields; None means "not extracted".
    title: Optional[str] = None
    abstract: Optional[str] = None
    authors: Optional[List[str]] = None
    keywords: Optional[List[str]] = None
    # Mapping of section title -> section content.
    sections: Optional[Dict[str, str]] = None
    references: Optional[List[str]] = None
    doi: Optional[str] = None
    journal: Optional[str] = None
    # Kept as a plain string; no particular date format is enforced here.
    publication_date: Optional[str] = None
    # Additional publisher-specific metadata (free-form dictionary).
    metadata: Optional[Dict] = None

class BaseParser(ABC):
    """Base class for all publisher-specific parsers.

    Subclasses must implement :meth:`can_parse` and :meth:`parse`. The
    remaining methods provide shared behaviour (naming, description and
    basic validation of the parsed result).
    """

    # Minimum number of non-whitespace-trimmed characters ``full_text`` must
    # contain for ``validate_content`` to accept it. Subclasses may override.
    MIN_FULL_TEXT_LENGTH: int = 100

    def __init__(self):
        # Derive a short name from the class name: lower-case it and drop
        # every occurrence of "parser" (e.g. "ElsevierParser" -> "elsevier").
        self.parser_name = self.__class__.__name__.lower().replace('parser', '')

    @abstractmethod
    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """
        Check if this parser can handle the given HTML content.

        Args:
            html_content: The HTML content to check
            url: Optional URL of the content (for additional context)

        Returns:
            True if this parser can handle the content, False otherwise
        """

    @abstractmethod
    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """
        Parse HTML content and extract structured information.

        Args:
            html_content: The HTML content to parse
            doi: Optional DOI of the paper

        Returns:
            ParsedContent object with extracted information

        Raises:
            ParseError: If parsing fails
        """

    def get_name(self) -> str:
        """Return the name of this parser."""
        return self.parser_name

    def get_description(self) -> str:
        """
        Return a description of this parser.

        Returns:
            The class docstring of the concrete parser, or the string
            "No description available" when the class has no docstring.
        """
        # A class always has a ``__doc__`` attribute; it is ``None`` when no
        # docstring was written (docstrings are not inherited) or when Python
        # runs with -OO. ``getattr`` with a default therefore never falls
        # back, so the fallback must be applied to the value itself.
        return self.__class__.__doc__ or "No description available"

    def validate_content(self, content: ParsedContent) -> bool:
        """
        Validate the parsed content to ensure it meets minimum requirements.

        Args:
            content: The parsed content to validate

        Returns:
            True if ``content.full_text`` is non-empty and, after stripping
            surrounding whitespace, has at least ``MIN_FULL_TEXT_LENGTH``
            characters; False otherwise
        """
        text = content.full_text
        # Missing or empty text can never be valid.
        if not text:
            return False

        # Surrounding whitespace does not count towards the minimum length.
        return len(text.strip()) >= self.MIN_FULL_TEXT_LENGTH

class ParseError(Exception):
    """Signal that a parser was unable to process the supplied content.

    Carries no extra state beyond what ``Exception`` already provides; the
    message passed at construction time describes the failure.
    """