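"""Parser for Elsevier/ScienceDirect article HTML pages."""
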
import re
from typing import Dict, List, Optional

from bs4 import BeautifulSoup

from .base_parser import BaseParser, ParsedContent, ParseError


class ElsevierParser(BaseParser):
    """Parser for Elsevier/ScienceDirect articles."""

    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """Check if this is an Elsevier/ScienceDirect page."""
        # The URL alone can settle it for known publisher domains
        if url and ('sciencedirect.com' in url.lower() or 'elsevier.com' in url.lower()):
            return True

        html_lower = html_content.lower()

        # Literal substrings that indicate an Elsevier/ScienceDirect page
        # ('sciencedirect' also covers 'sciencedirect.com')
        indicators = [
            'sciencedirect',
            'elsevier.com',
            'meta name="citation_publisher" content="elsevier"',
        ]
        if any(indicator in html_lower for indicator in indicators):
            return True

        # 'copyright.*elsevier' is a regex pattern, so it must go through
        # re.search; as a plain `in` substring test it would never match
        return bool(re.search(r'copyright.*elsevier', html_lower))

    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """Parse Elsevier/ScienceDirect HTML content."""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract metadata and content
            title = self._extract_title(soup)
            abstract = self._extract_abstract(soup)
            authors = self._extract_authors(soup)
            full_text = self._extract_full_text(soup)
            sections = self._extract_sections(soup)
            keywords = self._extract_keywords(soup)
            references = self._extract_references(soup)
            journal = self._extract_journal(soup)
            publication_date = self._extract_publication_date(soup)

            # Prefer the structured sections (plus abstract) as full text
            if sections:
                full_text = self._combine_sections(sections, abstract)

            if not full_text or len(full_text.strip()) < 100:
                raise ParseError("Could not extract meaningful full text content")

            return ParsedContent(
                full_text=full_text,
                title=title,
                abstract=abstract,
                authors=authors,
                keywords=keywords,
                sections=sections,
                references=references,
                doi=doi,
                journal=journal,
                publication_date=publication_date,
                metadata={
                    'parser': 'elsevier',
                    'source': 'sciencedirect'
                }
            )

        except ParseError:
            # Preserve deliberate parse errors rather than re-wrapping them
            raise
        except Exception as e:
            raise ParseError(f"Failed to parse Elsevier content: {e}") from e

    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract article title."""
        # Try multiple title selectors, most specific first
        selectors = [
            'h1.title-text',
            'h1[data-testid="title"]',
            'h1.article-title',
            'meta[name="citation_title"]',
            'title'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            # Meta tags carry the title in their content attribute
            if selector.startswith('meta'):
                title = element.get('content', '').strip()
            else:
                title = element.get_text(strip=True)
            if title:
                return title

        return None

    def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract article abstract."""
        selectors = [
            'div.abstract-content',
            'div[data-testid="abstract"]',
            'div.abstract',
            'section.abstract',
            'div#abstract'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)

        return None

    def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract author names."""
        authors = []

        # Try author meta tags first
        author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
        if author_metas:
            authors = [meta.get('content', '').strip() for meta in author_metas]

        # Fall back to author div/span elements
        if not authors:
            author_elements = soup.select('div.author a, span.author, .author-name')
            authors = [elem.get_text(strip=True) for elem in author_elements]

        # Drop empty entries left by blank tags
        authors = [author for author in authors if author]
        return authors if authors else None

    def _extract_full_text(self, soup: BeautifulSoup) -> str:
        """Extract main article content."""
        content_parts = []

        # Try main content selectors
        main_selectors = [
            'div.article-content',
            'div.body-content',
            'main.article-body',
            'div[data-testid="article-body"]',
            'section.article-section'
        ]

        for selector in main_selectors:
            elements = soup.select(selector)
            for element in elements:
                # Remove script, style, and navigation elements
                for unwanted in element.find_all(['script', 'style', 'nav', 'footer', 'header']):
                    unwanted.decompose()

                text = element.get_text(separator='\n', strip=True)
                if text and len(text) > 50:  # Only add substantial content
                    content_parts.append(text)

        return '\n\n'.join(content_parts)

    def _extract_sections(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
        """Extract article sections with headings."""
        sections = {}

        # Look for section headings and content
        section_elements = soup.find_all(['h2', 'h3', 'h4'], class_=re.compile(r'section|heading'))

        for heading in section_elements:
            section_title = heading.get_text(strip=True)

            # Collect content after this heading until the next heading
            content_parts = []
            current = heading.next_sibling

            # `is not None` matters here: an empty text node is falsy but
            # should not end the walk early
            while current is not None and current.name not in ['h1', 'h2', 'h3', 'h4']:
                if hasattr(current, 'get_text'):
                    text = current.get_text(strip=True)
                    if text:
                        content_parts.append(text)
                current = current.next_sibling

            if content_parts:
                sections[section_title] = '\n'.join(content_parts)

        return sections if sections else None

    def _extract_keywords(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract article keywords."""
        keywords = []

        # Try keyword meta tags
        keyword_metas = soup.find_all('meta', attrs={'name': 'citation_keywords'})
        if keyword_metas:
            for meta in keyword_metas:
                content = meta.get('content', '')
                if content:
                    keywords.extend([kw.strip() for kw in content.split(',')])

        # Try keyword sections
        if not keywords:
            keyword_sections = soup.select('div.keywords, section.keywords')
            for section in keyword_sections:
                text = section.get_text()
                keywords.extend([kw.strip() for kw in text.split(',') if kw.strip()])

        return keywords if keywords else None

    def _extract_references(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract references."""
        references = []

        # The selector matches both reference containers and bare list items
        ref_sections = soup.select('section.references, div.references, ol.references li')
        for section in ref_sections:
            if section.name == 'li':
                references.append(section.get_text(strip=True))
            else:
                ref_items = section.find_all(['li', 'div'], class_=re.compile(r'reference'))
                references.extend([item.get_text(strip=True) for item in ref_items])

        return references if references else None

    def _extract_journal(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract journal name."""
        journal_meta = soup.find('meta', attrs={'name': 'citation_journal_title'})
        if journal_meta:
            return journal_meta.get('content', '').strip()

        return None

    def _extract_publication_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract publication date."""
        date_meta = soup.find('meta', attrs={'name': 'citation_publication_date'})
        if date_meta:
            return date_meta.get('content', '').strip()

        return None

    def _combine_sections(self, sections: Dict[str, str], abstract: Optional[str] = None) -> str:
        """Combine all sections into full text."""
        full_text_parts = []

        if abstract:
            full_text_parts.append(f"Abstract\n{abstract}")

        for section_title, section_content in sections.items():
            full_text_parts.append(f"{section_title}\n{section_content}")

        return '\n\n'.join(full_text_parts)
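

# Minimal usage sketch, not part of the parser itself. Assumes a ScienceDirect
# page saved locally (the "article.html" path, URL, and DOI below are all
# hypothetical) and that ParsedContent exposes its constructor fields as
# attributes. Because of the relative import above, run this as a module,
# e.g. `python -m <package>.elsevier_parser`, not as a standalone script.
if __name__ == "__main__":
    with open("article.html", encoding="utf-8") as fh:
        html = fh.read()

    parser = ElsevierParser()
    if parser.can_parse(html, url="https://www.sciencedirect.com/science/article/pii/EXAMPLE"):
        parsed = parser.parse(html, doi="10.1016/j.example.2024.0001")  # hypothetical DOI
        print(parsed.title)
        print(parsed.full_text[:500])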