84 lines
2.6 KiB
Python
84 lines
2.6 KiB
Python
from abc import ABC, abstractmethod
|
|
from typing import Dict, Optional, List
|
|
from dataclasses import dataclass
|
|
|
|
@dataclass
class ParsedContent:
    """Container for parsed content from a publisher's HTML.

    ``full_text`` is the only required field; every other field is
    optional and defaults to ``None``. Field order is part of the
    positional constructor signature generated by ``@dataclass``.
    """

    full_text: str
    title: Optional[str] = None
    abstract: Optional[str] = None
    authors: Optional[List[str]] = None
    keywords: Optional[List[str]] = None
    sections: Optional[Dict[str, str]] = None  # section_title -> section_content
    references: Optional[List[str]] = None
    doi: Optional[str] = None
    journal: Optional[str] = None
    publication_date: Optional[str] = None
    metadata: Optional[Dict] = None  # Additional metadata specific to publisher
|
|
|
|
class BaseParser(ABC):
    """Base class for all publisher-specific parsers.

    Subclasses must implement ``can_parse`` and ``parse``. The remaining
    methods provide a derived parser name, a description taken from the
    subclass docstring, and a minimal sanity check on parsed output.
    """

    # Minimum length (after stripping surrounding whitespace) that
    # ``full_text`` must have for ``validate_content`` to accept it.
    # Subclasses may override this.
    MIN_FULL_TEXT_LENGTH = 100

    def __init__(self):
        # Lower-cased class name with every occurrence of "parser" removed,
        # e.g. a class named ``FooParser`` gets the name ``"foo"``.
        self.parser_name = self.__class__.__name__.lower().replace('parser', '')

    @abstractmethod
    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """
        Check if this parser can handle the given HTML content.

        Args:
            html_content: The HTML content to check
            url: Optional URL of the content (for additional context)

        Returns:
            True if this parser can handle the content, False otherwise
        """

    @abstractmethod
    def parse(self, html_content: str, doi: Optional[str] = None) -> "ParsedContent":
        """
        Parse HTML content and extract structured information.

        Args:
            html_content: The HTML content to parse
            doi: Optional DOI of the paper

        Returns:
            ParsedContent object with extracted information

        Raises:
            ParseError: If parsing fails
        """

    def get_name(self) -> str:
        """Return the name of this parser."""
        return self.parser_name

    def get_description(self) -> str:
        """Return a description of this parser.

        The description is the concrete class's own docstring. A class
        without a docstring has ``__doc__`` set to ``None`` (docstrings are
        not inherited), so the fallback text is returned in that case.
        """
        doc = type(self).__doc__
        if doc and doc.strip():
            return doc
        return "No description available"

    def validate_content(self, content: "ParsedContent") -> bool:
        """
        Validate the parsed content to ensure it meets minimum requirements.

        Args:
            content: The parsed content to validate

        Returns:
            True if content is valid, False otherwise. ``None`` content and
            missing/empty ``full_text`` are treated as invalid.
        """
        # Basic validation - must have some full text
        if content is None or not content.full_text:
            return False

        # Surrounding whitespace does not count towards the minimum length.
        return len(content.full_text.strip()) >= self.MIN_FULL_TEXT_LENGTH
|
|
|
|
class ParseError(Exception):
    """Exception raised when parsing fails.

    Named in the ``Raises`` section of ``BaseParser.parse`` as the error
    signalling a failed parse. Adds no behaviour beyond ``Exception``.
    """
|