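"""Parser for arXiv abstract pages.

Extracts title, abstract, authors, subject classifications, the arXiv
identifier, and the submission date from an arXiv abs page. The full text
of a paper is only available in the PDF, so full_text here is the abstract
plus a note to that effect.
"""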

import re
from typing import List, Optional

from bs4 import BeautifulSoup

from .base_parser import BaseParser, ParsedContent, ParseError


class ArxivParser(BaseParser):
    """Parser for arXiv papers."""

    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """Check whether this looks like an arXiv page."""
        html_lower = html_content.lower()
        # Check for arXiv indicators in the raw HTML
        indicators = [
            'arxiv.org',
            'export.arxiv.org',
            'arxiv:',
            'meta name="citation_publisher" content="arxiv"',
        ]
        return any(indicator in html_lower for indicator in indicators)

    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """Parse arXiv HTML content."""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            # Extract the main metadata fields
            title = self._extract_title(soup)
            abstract = self._extract_abstract(soup)
            authors = self._extract_authors(soup)
            # arXiv abs pages usually carry only the abstract; the full
            # text lives in the PDF
            full_text = self._extract_full_text(soup, abstract)
            keywords = self._extract_subjects(soup)
            arxiv_id = self._extract_arxiv_id(soup)
            if not full_text or len(full_text.strip()) < 50:
                raise ParseError("Could not extract meaningful content from arXiv page")
            return ParsedContent(
                full_text=full_text,
                title=title,
                abstract=abstract,
                authors=authors,
                keywords=keywords,
                sections=None,  # arXiv abs pages don't expose full sections
                references=None,  # References are typically only in the PDF
                doi=doi,
                journal="arXiv",
                publication_date=self._extract_submission_date(soup),
                metadata={
                    'parser': 'arxiv',
                    'arxiv_id': arxiv_id,
                    'source': 'arxiv.org',
                },
            )
        except ParseError:
            # Re-raise our own errors without wrapping them a second time
            raise
        except Exception as e:
            raise ParseError(f"Failed to parse arXiv content: {e}") from e

    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the paper title."""
        # Try multiple title sources for arXiv, most specific first
        selectors = ['h1.title', 'meta[name="citation_title"]', 'title']
        for selector in selectors:
            if selector.startswith('meta'):
                element = soup.find('meta', attrs={'name': 'citation_title'})
                # Skip empty meta tags instead of returning an empty string
                if element and element.get('content'):
                    return element['content'].strip()
            else:
                element = soup.select_one(selector)
                if element:
                    text = element.get_text(strip=True)
                    # Strip the "Title:" descriptor and the "[<id>]" prefix
                    # that arXiv uses in its <title> tag
                    text = re.sub(r'^Title:\s*', '', text)
                    text = re.sub(r'^\[[^\]]+\]\s*', '', text)
                    return text
        return None

    def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the paper abstract."""
        # arXiv abstract sources, most specific first
        selectors = ['blockquote.abstract', 'div.abstract', 'meta[name="citation_abstract"]']
        for selector in selectors:
            if selector.startswith('meta'):
                element = soup.find('meta', attrs={'name': 'citation_abstract'})
                if element and element.get('content'):
                    return element['content'].strip()
            else:
                element = soup.select_one(selector)
                if element:
                    # Use a separator so inline tags don't run words together
                    text = element.get_text(separator=' ', strip=True)
                    # Remove the "Abstract:" descriptor if present
                    text = re.sub(r'^Abstract:\s*', '', text)
                    return text
        return None

    def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract author names."""
        authors: List[str] = []
        # Prefer the citation_author meta tags
        author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
        if author_metas:
            authors = [meta.get('content', '').strip() for meta in author_metas]
        # Fall back to the arXiv authors div
        if not authors:
            authors_div = soup.select_one('div.authors')
            if authors_div:
                # Author names are usually rendered as links
                author_links = authors_div.find_all('a')
                if author_links:
                    authors = [link.get_text(strip=True) for link in author_links]
                else:
                    # Fall back to parsing the raw text
                    text = authors_div.get_text()
                    # Remove the "Authors:" prefix and split on commas
                    text = re.sub(r'^Authors?:\s*', '', text)
                    authors = [author.strip() for author in text.split(',')]
        # Drop any empty entries left over from stripping
        authors = [author for author in authors if author]
        return authors if authors else None

    def _extract_full_text(self, soup: BeautifulSoup, abstract: Optional[str] = None) -> str:
        """Extract the main content (usually just the abstract for arXiv pages)."""
        content_parts = []
        # The arXiv abs page typically only contains the abstract and
        # metadata; the full text is in the PDF
        if abstract:
            content_parts.append(f"Abstract\n{abstract}")
        # Include the optional comments section if present
        comments_section = soup.select_one('td.comments')
        if comments_section:
            comments = comments_section.get_text(strip=True)
            if comments:
                content_parts.append(f"Comments\n{comments}")
        # Point readers at the PDF for the full text
        content_parts.append(
            "\nNote: This is the abstract and metadata from the arXiv HTML page. "
            "The full text is available in the PDF version."
        )
        return '\n\n'.join(content_parts)

    def _extract_subjects(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract subject classifications."""
        subjects: List[str] = []
        subjects_td = soup.select_one('td.subjects')
        if subjects_td:
            subjects_text = subjects_td.get_text(strip=True)
            # Parse subjects (format: "Primary: subject1; Secondary: subject2")
            subjects = [subj.strip() for subj in re.split(r'[;,]', subjects_text)]
            # Clean up "Primary:"/"Secondary:" prefixes and drop empty strings
            subjects = [re.sub(r'^(Primary|Secondary):\s*', '', subj) for subj in subjects]
            subjects = [subj for subj in subjects if subj]
        return subjects if subjects else None

    def _extract_arxiv_id(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the arXiv identifier (new-style IDs such as 2101.00001v2)."""
        # Ordered from most to least specific: the bare-number pattern can
        # false-positive on other "NNNN.NNNNN" strings in the page text
        arxiv_id_patterns = [
            r'arXiv:(\d{4}\.\d{4,5}(?:v\d+)?)',
            r'(\d{4}\.\d{4,5}(?:v\d+)?)',
        ]
        # Search the visible page text first
        page_text = soup.get_text()
        for pattern in arxiv_id_patterns:
            match = re.search(pattern, page_text)
            if match:
                return match.group(1)
        # Fall back to the canonical link
        canonical_link = soup.find('link', attrs={'rel': 'canonical'})
        if canonical_link:
            href = canonical_link.get('href', '')
            for pattern in arxiv_id_patterns:
                match = re.search(pattern, href)
                if match:
                    return match.group(1)
        return None

    def _extract_submission_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the submission date."""
        # Look in the submission history block first
        submission_td = soup.select_one('td.submission-history')
        if submission_td:
            date_text = submission_td.get_text()
            # Dates are rendered like "1 Jan 2021"; the exact format varies
            date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', date_text)
            if date_match:
                return date_match.group(1)
        # Fall back to the citation_date meta tag
        date_meta = soup.find('meta', attrs={'name': 'citation_date'})
        if date_meta:
            return date_meta.get('content', '').strip()
        return None
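

# --- Usage sketch -----------------------------------------------------------
# A minimal, self-contained demonstration of the parser on a stripped-down
# arXiv abs page. The HTML fragment below is hypothetical and exists only for
# illustration; running this also assumes the sibling .base_parser module
# (BaseParser, ParsedContent, ParseError) is importable, that ArxivParser
# takes no constructor arguments, and that ParsedContent exposes its fields
# as attributes.
if __name__ == "__main__":
    sample_html = """
    <html>
      <head>
        <title>[2101.00001] A Hypothetical Paper on Parsing</title>
        <meta name="citation_title" content="A Hypothetical Paper on Parsing">
        <meta name="citation_author" content="Doe, Jane">
        <link rel="canonical" href="https://arxiv.org/abs/2101.00001">
      </head>
      <body>
        <h1 class="title">Title: A Hypothetical Paper on Parsing</h1>
        <div class="authors">Authors: <a href="#">Jane Doe</a></div>
        <blockquote class="abstract">
          Abstract: We describe a parser that extracts metadata from arXiv
          abstract pages and falls back to the PDF for the full text.
        </blockquote>
      </body>
    </html>
    """
    parser = ArxivParser()
    if parser.can_parse(sample_html, url="https://arxiv.org/abs/2101.00001"):
        parsed = parser.parse(sample_html)
        # Expected: "A Hypothetical Paper on Parsing"
        print(parsed.title)
        # Expected: ["Doe, Jane"] (from the citation_author meta tag)
        print(parsed.authors)
        # Expected: "2101.00001" (matched from the page text)
        print(parsed.metadata.get('arxiv_id'))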