import re
from bs4 import BeautifulSoup
from typing import Optional, List

from .base_parser import BaseParser, ParsedContent, ParseError


class ArxivParser(BaseParser):
    """Parser for arXiv papers."""

    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """Check if this is an arXiv page."""
        html_lower = html_content.lower()
        # Check for arXiv indicators
        indicators = [
            'arxiv.org',
            'export.arxiv.org',
            'arxiv:',
            'meta name="citation_publisher" content="arxiv"',
        ]
        return any(indicator in html_lower for indicator in indicators)

    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """Parse arXiv HTML content."""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract title
            title = self._extract_title(soup)

            # Extract abstract
            abstract = self._extract_abstract(soup)

            # Extract authors
            authors = self._extract_authors(soup)

            # Extract full text (arXiv usually just has the abstract on the HTML page)
            full_text = self._extract_full_text(soup, abstract)

            # Extract keywords/subjects
            keywords = self._extract_subjects(soup)

            # Extract arXiv ID
            arxiv_id = self._extract_arxiv_id(soup)

            if not full_text or len(full_text.strip()) < 50:
                raise ParseError("Could not extract meaningful content from arXiv page")

            return ParsedContent(
                full_text=full_text,
                title=title,
                abstract=abstract,
                authors=authors,
                keywords=keywords,
                sections=None,    # arXiv HTML pages don't usually have full sections
                references=None,  # References are typically in the PDF
                doi=doi,
                journal="arXiv",
                publication_date=self._extract_submission_date(soup),
                metadata={
                    'parser': 'arxiv',
                    'arxiv_id': arxiv_id,
                    'source': 'arxiv.org'
                }
            )
        except ParseError:
            # Don't re-wrap our own, more specific error.
            raise
        except Exception as e:
            raise ParseError(f"Failed to parse arXiv content: {e}") from e

    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract paper title."""
        # Try multiple title selectors for arXiv
        selectors = [
            'h1.title',
            'meta[name="citation_title"]',
            'title'
        ]

        for selector in selectors:
            if 'meta' in selector:
                element = soup.find('meta', attrs={'name': 'citation_title'})
                if element:
                    return element.get('content', '').strip()
            else:
                element = soup.select_one(selector)
                if element:
                    text = element.get_text(strip=True)
                    # Remove "Title:" prefix if present
                    text = re.sub(r'^Title:\s*', '', text)
                    return text

        return None

    def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract paper abstract."""
        # arXiv abstract selectors
        selectors = [
            'blockquote.abstract',
            'div.abstract',
            'meta[name="citation_abstract"]'
        ]

        for selector in selectors:
            if 'meta' in selector:
                element = soup.find('meta', attrs={'name': 'citation_abstract'})
                if element:
                    return element.get('content', '').strip()
            else:
                element = soup.select_one(selector)
                if element:
                    text = element.get_text(strip=True)
                    # Remove "Abstract:" prefix if present
                    text = re.sub(r'^Abstract:\s*', '', text)
                    return text

        return None

    def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract author names."""
        authors = []

        # Try author meta tags
        author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
        if author_metas:
            authors = [meta.get('content', '').strip() for meta in author_metas]

        # Try arXiv author div
        if not authors:
            authors_div = soup.select_one('div.authors')
            if authors_div:
                # Extract author links or text
                author_links = authors_div.find_all('a')
                if author_links:
                    authors = [link.get_text(strip=True) for link in author_links]
                else:
                    # Fallback to text parsing
                    text = authors_div.get_text()
                    # Remove "Authors:" prefix and split by commas
                    text = re.sub(r'^Authors?:\s*', '', text)
                    authors = [author.strip() for author in text.split(',')]

        return authors if authors else None
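    # The selectors above and below assume markup roughly like an arXiv abs
    # page. This is a sketch of the expected structure, not the exact arXiv
    # template (class names have shifted across arXiv redesigns, so treat it
    # as illustrative only):
    #
    #   <h1 class="title">Title: Attention Is All You Need</h1>
    #   <div class="authors">Authors: <a href="...">A. Vaswani</a>, ...</div>
    #   <blockquote class="abstract">Abstract: The dominant ...</blockquote>
    #   <td class="tablecell subjects">Computation and Language (cs.CL)</td>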
    def _extract_full_text(self, soup: BeautifulSoup, abstract: Optional[str] = None) -> str:
        """Extract main content (usually just the abstract for arXiv HTML pages)."""
        content_parts = []

        # For arXiv, the HTML page typically only contains the abstract and
        # metadata; the full text is in the PDF.
        if abstract:
            content_parts.append(f"Abstract\n{abstract}")

        # Look for any additional content sections
        comments_section = soup.select_one('td.comments')
        if comments_section:
            comments = comments_section.get_text(strip=True)
            if comments:
                content_parts.append(f"Comments\n{comments}")

        # Add note about PDF availability
        content_parts.append(
            "\nNote: This is the abstract and metadata from the arXiv HTML page. "
            "The full text is available in the PDF version."
        )

        return '\n\n'.join(content_parts)

    def _extract_subjects(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract subject classifications."""
        subjects = []

        # Look for subject classification
        subjects_td = soup.select_one('td.subjects')
        if subjects_td:
            subjects_text = subjects_td.get_text(strip=True)
            # Parse subjects (format: "Primary: subject1; Secondary: subject2")
            subjects = [subj.strip() for subj in re.split(r'[;,]', subjects_text)]
            # Clean up prefixes
            subjects = [re.sub(r'^(Primary|Secondary):\s*', '', subj) for subj in subjects]
            subjects = [subj for subj in subjects if subj]  # Remove empty strings

        return subjects if subjects else None

    def _extract_arxiv_id(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract arXiv ID."""
        # Look for arXiv ID in various places
        arxiv_id_patterns = [
            r'arXiv:(\d+\.\d+(?:v\d+)?)',
            r'(\d{4}\.\d{4,5}(?:v\d+)?)',
        ]

        # Search in page text
        page_text = soup.get_text()
        for pattern in arxiv_id_patterns:
            match = re.search(pattern, page_text)
            if match:
                return match.group(1)

        # Search in URL or meta tags
        canonical_link = soup.find('link', attrs={'rel': 'canonical'})
        if canonical_link:
            href = canonical_link.get('href', '')
            for pattern in arxiv_id_patterns:
                match = re.search(pattern, href)
                if match:
                    return match.group(1)

        return None

    def _extract_submission_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract submission date."""
        # Look for submission date
        submission_td = soup.select_one('td.submission-history')
        if submission_td:
            date_text = submission_td.get_text()
            # Extract date (format varies)
            date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', date_text)
            if date_match:
                return date_match.group(1)

        # Try meta tag
        date_meta = soup.find('meta', attrs={'name': 'citation_date'})
        if date_meta:
            return date_meta.get('content', '').strip()

        return None
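

if __name__ == "__main__":
    # Minimal usage sketch, not part of the parser itself. It assumes the
    # third-party `requests` package is installed, that this file lives in a
    # package (so the relative import above resolves when run via
    # `python -m <package>.arxiv_parser`), and that ParsedContent exposes the
    # constructor arguments used in parse() as attributes.
    import requests

    url = "https://arxiv.org/abs/1706.03762"  # any arXiv abs page works here
    html = requests.get(url, timeout=30).text

    parser = ArxivParser()
    if parser.can_parse(html, url=url):
        parsed = parser.parse(html)
        print(parsed.title)
        print(parsed.authors)
        print(parsed.metadata.get('arxiv_id'))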