# 253 lines
# 9.3 KiB
# Python

import re
from bs4 import BeautifulSoup
from typing import Dict, Optional, List
from .base_parser import BaseParser, ParsedContent, ParseError
class ElsevierParser(BaseParser):
    """Parser for Elsevier/ScienceDirect article HTML.

    Detects ScienceDirect/Elsevier pages via publisher fingerprints and
    extracts title, abstract, authors, body text, sections, keywords,
    references, journal name and publication date into a ``ParsedContent``.
    """

    # Literal fingerprints looked up as substrings of the lowercased HTML.
    # NOTE: the attribute-order-sensitive meta string is fragile, but kept
    # for backward compatibility with pages it already matched.
    _INDICATORS = (
        'sciencedirect.com',
        'elsevier.com',
        'meta name="citation_publisher" content="elsevier"',
        'sciencedirect',
    )

    # BUGFIX: the original listed this regex pattern among the plain
    # substrings, so `'copyright.*elsevier' in html_lower` could never
    # match. It is now compiled once and applied with re.search().
    _COPYRIGHT_PATTERN = re.compile(r'copyright.*elsevier')

    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """Return True if the HTML looks like an Elsevier/ScienceDirect page.

        Args:
            html_content: Raw HTML of the candidate page.
            url: Unused; accepted for interface compatibility.
        """
        html_lower = html_content.lower()
        if any(indicator in html_lower for indicator in self._INDICATORS):
            return True
        # Fall back to the copyright-notice regex (see _COPYRIGHT_PATTERN).
        return bool(self._COPYRIGHT_PATTERN.search(html_lower))

    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """Parse Elsevier/ScienceDirect HTML content into a ParsedContent.

        Args:
            html_content: Raw HTML of the article page.
            doi: Optional DOI to attach to the result (not extracted here).

        Raises:
            ParseError: if parsing fails or no meaningful body text is found.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            title = self._extract_title(soup)
            abstract = self._extract_abstract(soup)
            authors = self._extract_authors(soup)
            full_text = self._extract_full_text(soup)
            sections = self._extract_sections(soup)
            keywords = self._extract_keywords(soup)
            references = self._extract_references(soup)
            journal = self._extract_journal(soup)
            publication_date = self._extract_publication_date(soup)

            # Prefer structured sections (plus abstract) over the raw body dump.
            if sections:
                full_text = self._combine_sections(sections, abstract)

            if not full_text or len(full_text.strip()) < 100:
                raise ParseError("Could not extract meaningful full text content")

            return ParsedContent(
                full_text=full_text,
                title=title,
                abstract=abstract,
                authors=authors,
                keywords=keywords,
                sections=sections,
                references=references,
                doi=doi,
                journal=journal,
                publication_date=publication_date,
                metadata={
                    'parser': 'elsevier',
                    'source': 'sciencedirect'
                }
            )
        except ParseError:
            # Don't re-wrap our own sentinel: the original wrapped it again,
            # burying the specific failure message inside a generic one.
            raise
        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise ParseError(f"Failed to parse Elsevier content: {str(e)}") from e

    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the article title, or None if no selector matches."""
        selectors = [
            'h1.title-text',
            'h1[data-testid="title"]',
            'h1.article-title',
            'meta[name="citation_title"]',
            'title'
        ]
        for selector in selectors:
            if 'meta' in selector:
                element = soup.find('meta', attrs={'name': 'citation_title'})
                # Skip empty content= values so later selectors still get a chance.
                if element and element.get('content', '').strip():
                    return element.get('content', '').strip()
            else:
                element = soup.select_one(selector)
                if element:
                    return element.get_text(strip=True)
        return None

    def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the article abstract, or None if no selector matches."""
        selectors = [
            'div.abstract-content',
            'div[data-testid="abstract"]',
            'div.abstract',
            'section.abstract',
            'div#abstract'
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)
        return None

    def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract author names from meta tags, falling back to DOM elements."""
        # Preferred source: one citation_author meta tag per author.
        author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
        authors = [
            meta.get('content', '').strip()
            for meta in author_metas
            if meta.get('content', '').strip()  # drop empty content= values
        ]
        # Fallback: author markup in the page body.
        if not authors:
            author_elements = soup.select('div.author a, span.author, .author-name')
            authors = [elem.get_text(strip=True) for elem in author_elements]
        return authors if authors else None

    def _extract_full_text(self, soup: BeautifulSoup) -> str:
        """Extract the main article body as plain text (may be empty)."""
        content_parts = []
        main_selectors = [
            'div.article-content',
            'div.body-content',
            'main.article-body',
            'div[data-testid="article-body"]',
            'section.article-section'
        ]
        for selector in main_selectors:
            for element in soup.select(selector):
                # Strip scripts, styles, and page chrome before reading text.
                for unwanted in element.find_all(['script', 'style', 'nav', 'footer', 'header']):
                    unwanted.decompose()
                text = element.get_text(separator='\n', strip=True)
                if text and len(text) > 50:  # Only keep substantial content
                    content_parts.append(text)
        return '\n\n'.join(content_parts)

    def _extract_sections(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
        """Extract {heading: body} pairs for section-classed h2/h3/h4 headings.

        Only walks the heading's *siblings*, so content nested in a sibling
        container is captured via get_text of that container.
        """
        sections: Dict[str, str] = {}
        section_elements = soup.find_all(['h2', 'h3', 'h4'], class_=re.compile(r'section|heading'))
        for heading in section_elements:
            section_title = heading.get_text(strip=True)
            # Collect sibling content until the next heading of similar rank.
            content_parts = []
            current = heading.next_sibling
            while current and current.name not in ['h1', 'h2', 'h3', 'h4']:
                if hasattr(current, 'get_text'):
                    text = current.get_text(strip=True)
                    if text:
                        content_parts.append(text)
                current = current.next_sibling
            if content_parts:
                sections[section_title] = '\n'.join(content_parts)
        return sections if sections else None

    def _extract_keywords(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract keywords from meta tags, falling back to keyword sections."""
        keywords: List[str] = []
        # citation_keywords meta tags hold comma-separated lists.
        for meta in soup.find_all('meta', attrs={'name': 'citation_keywords'}):
            content = meta.get('content', '')
            if content:
                keywords.extend([kw.strip() for kw in content.split(',')])
        # Fallback: visible keyword blocks, also comma-separated.
        if not keywords:
            for section in soup.select('div.keywords, section.keywords'):
                text = section.get_text()
                keywords.extend([kw.strip() for kw in text.split(',') if kw.strip()])
        return keywords if keywords else None

    def _extract_references(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract reference strings from reference lists/containers."""
        references: List[str] = []
        ref_sections = soup.select('section.references, div.references, ol.references li')
        for section in ref_sections:
            if section.name == 'li':
                # Matched an individual <li> from 'ol.references li'.
                references.append(section.get_text(strip=True))
            else:
                # Matched a container: collect its reference-classed children.
                ref_items = section.find_all(['li', 'div'], class_=re.compile(r'reference'))
                references.extend([item.get_text(strip=True) for item in ref_items])
        return references if references else None

    def _extract_journal(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the journal title from its citation meta tag."""
        journal_meta = soup.find('meta', attrs={'name': 'citation_journal_title'})
        if journal_meta:
            return journal_meta.get('content', '').strip()
        return None

    def _extract_publication_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the publication date string from its citation meta tag."""
        date_meta = soup.find('meta', attrs={'name': 'citation_publication_date'})
        if date_meta:
            return date_meta.get('content', '').strip()
        return None

    def _combine_sections(self, sections: Dict[str, str], abstract: Optional[str] = None) -> str:
        """Join the abstract (if any) and all sections into one text blob."""
        full_text_parts = []
        if abstract:
            full_text_parts.append(f"Abstract\n{abstract}")
        for section_title, section_content in sections.items():
            full_text_parts.append(f"{section_title}\n{section_content}")
        return '\n\n'.join(full_text_parts)