# 84 lines
# 2.6 KiB
# Python

from abc import ABC, abstractmethod
from typing import Dict, Optional, List
from dataclasses import dataclass
@dataclass
class ParsedContent:
    """Container for parsed content from a publisher's HTML.

    Only ``full_text`` is required; every other field defaults to ``None``
    so a parser can fill in whatever it manages to extract.
    """

    # Required: the complete extracted text.
    full_text: str

    # Optional bibliographic fields.
    title: Optional[str] = None
    abstract: Optional[str] = None
    authors: Optional[List[str]] = None
    keywords: Optional[List[str]] = None
    sections: Optional[Dict[str, str]] = None  # section_title -> section_content
    references: Optional[List[str]] = None
    doi: Optional[str] = None
    journal: Optional[str] = None
    publication_date: Optional[str] = None
    metadata: Optional[Dict] = None  # Additional metadata specific to publisher
class BaseParser(ABC):
    """Base class for all publisher-specific parsers.

    Subclasses must implement ``can_parse`` and ``parse``; the remaining
    methods provide naming, description and basic validation helpers.

    Attributes:
        MIN_FULL_TEXT_LENGTH: Minimum number of characters (after stripping
            surrounding whitespace) that ``full_text`` must contain for
            ``validate_content`` to accept it. Subclasses may override it.
        parser_name: Lower-cased class name with every ``'parser'``
            substring removed; set in ``__init__``.
    """

    MIN_FULL_TEXT_LENGTH = 100

    def __init__(self) -> None:
        self.parser_name = self.__class__.__name__.lower().replace('parser', '')

    @abstractmethod
    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """
        Check if this parser can handle the given HTML content.

        Args:
            html_content: The HTML content to check
            url: Optional URL of the content (for additional context)

        Returns:
            True if this parser can handle the content, False otherwise
        """

    @abstractmethod
    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """
        Parse HTML content and extract structured information.

        Args:
            html_content: The HTML content to parse
            doi: Optional DOI of the paper

        Returns:
            ParsedContent object with extracted information

        Raises:
            ParseError: If parsing fails
        """

    def get_name(self) -> str:
        """Return the name of this parser."""
        return self.parser_name

    def get_description(self) -> str:
        """Return a description of this parser.

        Returns:
            The concrete class's docstring, or "No description available"
            when the class has no (or an empty) docstring.
        """
        # A class without a docstring still has a ``__doc__`` attribute (set
        # to None), so a getattr() default would never be used; fall back
        # explicitly to keep the ``str`` return type.
        return self.__class__.__doc__ or "No description available"

    def validate_content(self, content: ParsedContent) -> bool:
        """
        Validate the parsed content to ensure it meets minimum requirements.

        Args:
            content: The parsed content to validate

        Returns:
            True if content is valid, False otherwise
        """
        # Basic validation - must have some full text
        text = content.full_text
        if not text:
            return False
        return len(text.strip()) >= self.MIN_FULL_TEXT_LENGTH
class ParseError(Exception):
    """Exception raised when parsing fails."""