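"""Parser for arXiv abstract pages.

Extracts title, abstract, authors, subject classifications, the arXiv
identifier, and the submission date from an arXiv abs page. The full text
of a paper is only available in the PDF, so full_text here is the abstract
plus a note to that effect.
"""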

import re
from typing import List, Optional

from bs4 import BeautifulSoup

from .base_parser import BaseParser, ParsedContent, ParseError


class ArxivParser(BaseParser):
    """Parser for arXiv papers."""

    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """Check whether this looks like an arXiv page."""
        html_lower = html_content.lower()
        # Check for arXiv indicators in the raw HTML
        indicators = [
            'arxiv.org',
            'export.arxiv.org',
            'arxiv:',
            'meta name="citation_publisher" content="arxiv"',
        ]
        return any(indicator in html_lower for indicator in indicators)

    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """Parse arXiv HTML content."""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            # Extract the main metadata fields
            title = self._extract_title(soup)
            abstract = self._extract_abstract(soup)
            authors = self._extract_authors(soup)
            # arXiv abs pages usually carry only the abstract; the full
            # text lives in the PDF
            full_text = self._extract_full_text(soup, abstract)
            keywords = self._extract_subjects(soup)
            arxiv_id = self._extract_arxiv_id(soup)
            if not full_text or len(full_text.strip()) < 50:
                raise ParseError("Could not extract meaningful content from arXiv page")
            return ParsedContent(
                full_text=full_text,
                title=title,
                abstract=abstract,
                authors=authors,
                keywords=keywords,
                sections=None,  # arXiv abs pages don't expose full sections
                references=None,  # References are typically only in the PDF
                doi=doi,
                journal="arXiv",
                publication_date=self._extract_submission_date(soup),
                metadata={
                    'parser': 'arxiv',
                    'arxiv_id': arxiv_id,
                    'source': 'arxiv.org',
                },
            )
        except ParseError:
            # Re-raise our own errors without wrapping them a second time
            raise
        except Exception as e:
            raise ParseError(f"Failed to parse arXiv content: {e}") from e

    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the paper title."""
        # Try multiple title sources for arXiv, most specific first
        selectors = ['h1.title', 'meta[name="citation_title"]', 'title']
        for selector in selectors:
            if selector.startswith('meta'):
                element = soup.find('meta', attrs={'name': 'citation_title'})
                # Skip empty meta tags instead of returning an empty string
                if element and element.get('content'):
                    return element['content'].strip()
            else:
                element = soup.select_one(selector)
                if element:
                    text = element.get_text(strip=True)
                    # Strip the "Title:" descriptor and the "[<id>]" prefix
                    # that arXiv uses in its <title> tag
                    text = re.sub(r'^Title:\s*', '', text)
                    text = re.sub(r'^\[[^\]]+\]\s*', '', text)
                    return text
        return None

    def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the paper abstract."""
        # arXiv abstract sources, most specific first
        selectors = ['blockquote.abstract', 'div.abstract', 'meta[name="citation_abstract"]']
        for selector in selectors:
            if selector.startswith('meta'):
                element = soup.find('meta', attrs={'name': 'citation_abstract'})
                if element and element.get('content'):
                    return element['content'].strip()
            else:
                element = soup.select_one(selector)
                if element:
                    # Use a separator so inline tags don't run words together
                    text = element.get_text(separator=' ', strip=True)
                    # Remove the "Abstract:" descriptor if present
                    text = re.sub(r'^Abstract:\s*', '', text)
                    return text
        return None

    def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract author names."""
        authors: List[str] = []
        # Prefer the citation_author meta tags
        author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
        if author_metas:
            authors = [meta.get('content', '').strip() for meta in author_metas]
        # Fall back to the arXiv authors div
        if not authors:
            authors_div = soup.select_one('div.authors')
            if authors_div:
                # Author names are usually rendered as links
                author_links = authors_div.find_all('a')
                if author_links:
                    authors = [link.get_text(strip=True) for link in author_links]
                else:
                    # Fall back to parsing the raw text
                    text = authors_div.get_text()
                    # Remove the "Authors:" prefix and split on commas
                    text = re.sub(r'^Authors?:\s*', '', text)
                    authors = [author.strip() for author in text.split(',')]
        # Drop any empty entries left over from stripping
        authors = [author for author in authors if author]
        return authors if authors else None

    def _extract_full_text(self, soup: BeautifulSoup, abstract: Optional[str] = None) -> str:
        """Extract the main content (usually just the abstract for arXiv pages)."""
        content_parts = []
        # The arXiv abs page typically only contains the abstract and
        # metadata; the full text is in the PDF
        if abstract:
            content_parts.append(f"Abstract\n{abstract}")
        # Include the optional comments section if present
        comments_section = soup.select_one('td.comments')
        if comments_section:
            comments = comments_section.get_text(strip=True)
            if comments:
                content_parts.append(f"Comments\n{comments}")
        # Point readers at the PDF for the full text
        content_parts.append(
            "\nNote: This is the abstract and metadata from the arXiv HTML page. "
            "The full text is available in the PDF version."
        )
        return '\n\n'.join(content_parts)

    def _extract_subjects(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract subject classifications."""
        subjects: List[str] = []
        subjects_td = soup.select_one('td.subjects')
        if subjects_td:
            subjects_text = subjects_td.get_text(strip=True)
            # Parse subjects (format: "Primary: subject1; Secondary: subject2")
            subjects = [subj.strip() for subj in re.split(r'[;,]', subjects_text)]
            # Clean up "Primary:"/"Secondary:" prefixes and drop empty strings
            subjects = [re.sub(r'^(Primary|Secondary):\s*', '', subj) for subj in subjects]
            subjects = [subj for subj in subjects if subj]
        return subjects if subjects else None

    def _extract_arxiv_id(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the arXiv identifier (new-style IDs such as 2101.00001v2)."""
        # Ordered from most to least specific: the bare-number pattern can
        # false-positive on other "NNNN.NNNNN" strings in the page text
        arxiv_id_patterns = [
            r'arXiv:(\d{4}\.\d{4,5}(?:v\d+)?)',
            r'(\d{4}\.\d{4,5}(?:v\d+)?)',
        ]
        # Search the visible page text first
        page_text = soup.get_text()
        for pattern in arxiv_id_patterns:
            match = re.search(pattern, page_text)
            if match:
                return match.group(1)
        # Fall back to the canonical link
        canonical_link = soup.find('link', attrs={'rel': 'canonical'})
        if canonical_link:
            href = canonical_link.get('href', '')
            for pattern in arxiv_id_patterns:
                match = re.search(pattern, href)
                if match:
                    return match.group(1)
        return None

    def _extract_submission_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the submission date."""
        # Look in the submission history block first
        submission_td = soup.select_one('td.submission-history')
        if submission_td:
            date_text = submission_td.get_text()
            # Dates are rendered like "1 Jan 2021"; the exact format varies
            date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', date_text)
            if date_match:
                return date_match.group(1)
        # Fall back to the citation_date meta tag
        date_meta = soup.find('meta', attrs={'name': 'citation_date'})
        if date_meta:
            return date_meta.get('content', '').strip()
        return None
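

# --- Usage sketch -----------------------------------------------------------
# A minimal, self-contained demonstration of the parser on a stripped-down
# arXiv abs page. The HTML fragment below is hypothetical and exists only for
# illustration; running this also assumes the sibling .base_parser module
# (BaseParser, ParsedContent, ParseError) is importable, that ArxivParser
# takes no constructor arguments, and that ParsedContent exposes its fields
# as attributes.
if __name__ == "__main__":
    sample_html = """
    <html>
      <head>
        <title>[2101.00001] A Hypothetical Paper on Parsing</title>
        <meta name="citation_title" content="A Hypothetical Paper on Parsing">
        <meta name="citation_author" content="Doe, Jane">
        <link rel="canonical" href="https://arxiv.org/abs/2101.00001">
      </head>
      <body>
        <h1 class="title">Title: A Hypothetical Paper on Parsing</h1>
        <div class="authors">Authors: <a href="#">Jane Doe</a></div>
        <blockquote class="abstract">
          Abstract: We describe a parser that extracts metadata from arXiv
          abstract pages and falls back to the PDF for the full text.
        </blockquote>
      </body>
    </html>
    """
    parser = ArxivParser()
    if parser.can_parse(sample_html, url="https://arxiv.org/abs/2101.00001"):
        parsed = parser.parse(sample_html)
        # Expected: "A Hypothetical Paper on Parsing"
        print(parsed.title)
        # Expected: ["Doe, Jane"] (from the citation_author meta tag)
        print(parsed.authors)
        # Expected: "2101.00001" (matched from the page text)
        print(parsed.metadata.get('arxiv_id'))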