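"""Parser for arXiv abstract pages.

The HTML abstract page carries only the title, abstract, authors, and
subject metadata; the full paper text lives in the PDF. ``parse`` therefore
returns the abstract plus metadata rather than the full body text.
"""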
import re
from typing import List, Optional

from bs4 import BeautifulSoup

from .base_parser import BaseParser, ParsedContent, ParseError


class ArxivParser(BaseParser):
    """Parser for arXiv papers."""

    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """Check if this is an arXiv page."""
        html_lower = html_content.lower()

        # Check for arXiv indicators
        indicators = [
            'arxiv.org',
            'export.arxiv.org',
            'arxiv:',
            'meta name="citation_publisher" content="arxiv"',
        ]

        return any(indicator in html_lower for indicator in indicators)

    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """Parse arXiv HTML content."""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract title
            title = self._extract_title(soup)

            # Extract abstract
            abstract = self._extract_abstract(soup)

            # Extract authors
            authors = self._extract_authors(soup)

            # Extract full text (arXiv HTML pages usually carry only the abstract)
            full_text = self._extract_full_text(soup, abstract)

            # Extract keywords/subjects
            keywords = self._extract_subjects(soup)

            # Extract arXiv ID
            arxiv_id = self._extract_arxiv_id(soup)

            if not full_text or len(full_text.strip()) < 50:
                raise ParseError("Could not extract meaningful content from arXiv page")

            return ParsedContent(
                full_text=full_text,
                title=title,
                abstract=abstract,
                authors=authors,
                keywords=keywords,
                sections=None,  # arXiv HTML pages don't usually have full sections
                references=None,  # References are typically in the PDF
                doi=doi,
                journal="arXiv",
                publication_date=self._extract_submission_date(soup),
                metadata={
                    'parser': 'arxiv',
                    'arxiv_id': arxiv_id,
                    'source': 'arxiv.org'
                }
            )

        except ParseError:
            # Re-raise the specific error from the length check above unchanged
            raise
        except Exception as e:
            raise ParseError(f"Failed to parse arXiv content: {e}") from e

    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract paper title."""
        # Try multiple title selectors for arXiv
        selectors = [
            'h1.title',
            'meta[name="citation_title"]',
            'title'
        ]

        for selector in selectors:
            if 'meta' in selector:
                element = soup.find('meta', attrs={'name': 'citation_title'})
                # Skip empty meta tags so the remaining selectors still get tried
                if element and element.get('content'):
                    return element['content'].strip()
            else:
                element = soup.select_one(selector)
                if element:
                    text = element.get_text(strip=True)
                    # Remove "Title:" prefix if present
                    text = re.sub(r'^Title:\s*', '', text)
                    return text

        return None

    def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract paper abstract."""
        # arXiv abstract selectors
        selectors = [
            'blockquote.abstract',
            'div.abstract',
            'meta[name="citation_abstract"]'
        ]

        for selector in selectors:
            if 'meta' in selector:
                element = soup.find('meta', attrs={'name': 'citation_abstract'})
                # Skip empty meta tags so the remaining selectors still get tried
                if element and element.get('content'):
                    return element['content'].strip()
            else:
                element = soup.select_one(selector)
                if element:
                    text = element.get_text(strip=True)
                    # Remove "Abstract:" prefix if present
                    text = re.sub(r'^Abstract:\s*', '', text)
                    return text

        return None

    def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract author names."""
        authors = []

        # Try author meta tags
        author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
        if author_metas:
            authors = [meta.get('content', '').strip() for meta in author_metas]

        # Try arXiv author div
        if not authors:
            authors_div = soup.select_one('div.authors')
            if authors_div:
                # Extract author links or text
                author_links = authors_div.find_all('a')
                if author_links:
                    authors = [link.get_text(strip=True) for link in author_links]
                else:
                    # Fall back to text parsing: strip the "Authors:" prefix
                    # and split on commas
                    text = authors_div.get_text()
                    text = re.sub(r'^Authors?:\s*', '', text)
                    authors = [author.strip() for author in text.split(',')]

        # Drop empty entries left by blank meta tags or stray commas
        authors = [author for author in authors if author]
        return authors if authors else None

    def _extract_full_text(self, soup: BeautifulSoup, abstract: Optional[str] = None) -> str:
        """Extract main content (usually just the abstract for arXiv HTML pages)."""
        content_parts = []

        # For arXiv, the HTML page typically contains only the abstract and
        # metadata; the full text is in the PDF.
        if abstract:
            content_parts.append(f"Abstract\n{abstract}")

        # Look for any additional content sections
        comments_section = soup.select_one('td.comments')
        if comments_section:
            comments = comments_section.get_text(strip=True)
            if comments:
                content_parts.append(f"Comments\n{comments}")

        # Add a note about PDF availability
        content_parts.append(
            "\nNote: This is the abstract and metadata from the arXiv HTML page. "
            "The full text is available in the PDF version."
        )

        return '\n\n'.join(content_parts)

    def _extract_subjects(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract subject classifications."""
        subjects = []

        # Look for the subject classification cell
        subjects_td = soup.select_one('td.subjects')
        if subjects_td:
            subjects_text = subjects_td.get_text(strip=True)
            # Parse subjects (format: "Primary: subject1; Secondary: subject2")
            subjects = [subj.strip() for subj in re.split(r'[;,]', subjects_text)]
            # Clean up prefixes
            subjects = [re.sub(r'^(Primary|Secondary):\s*', '', subj) for subj in subjects]
            subjects = [subj for subj in subjects if subj]  # Remove empty strings

        return subjects if subjects else None

    def _extract_arxiv_id(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the arXiv ID (new-style IDs such as 2301.12345v2)."""
        # Look for the arXiv ID in various places
        arxiv_id_patterns = [
            r'arXiv:(\d+\.\d+(?:v\d+)?)',
            r'(\d{4}\.\d{4,5}(?:v\d+)?)',
        ]

        # Search in page text
        page_text = soup.get_text()
        for pattern in arxiv_id_patterns:
            match = re.search(pattern, page_text)
            if match:
                return match.group(1)

        # Search in the canonical URL
        canonical_link = soup.find('link', attrs={'rel': 'canonical'})
        if canonical_link:
            href = canonical_link.get('href', '')
            for pattern in arxiv_id_patterns:
                match = re.search(pattern, href)
                if match:
                    return match.group(1)

        return None

    def _extract_submission_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract submission date."""
        # Look for the submission history cell
        submission_td = soup.select_one('td.submission-history')
        if submission_td:
            date_text = submission_td.get_text()
            # Extract date (format varies)
            date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', date_text)
            if date_match:
                return date_match.group(1)

        # Try meta tag
        date_meta = soup.find('meta', attrs={'name': 'citation_date'})
        if date_meta:
            return date_meta.get('content', '').strip()

        return None
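

# Minimal usage sketch (illustrative only): feed the saved HTML of an arXiv
# abstract page through the parser. It assumes BaseParser needs no constructor
# arguments and that ParsedContent exposes its fields as attributes. Because of
# the relative import above, run it as a module, e.g.
# `python -m <package>.arxiv_parser saved_abstract_page.html`.
if __name__ == "__main__":
    import sys

    # Path to a locally saved arXiv abstract page
    with open(sys.argv[1], encoding="utf-8") as fh:
        html = fh.read()

    parser = ArxivParser()
    if parser.can_parse(html):
        result = parser.parse(html)
        print(result.title)
        print(result.metadata.get("arxiv_id"))
    else:
        print("Not recognized as an arXiv page")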