import re
from typing import Dict, List, Optional

from bs4 import BeautifulSoup

from .base_parser import BaseParser, ParsedContent, ParseError


class ElsevierParser(BaseParser):
    """Parser for Elsevier/ScienceDirect article HTML pages.

    Extracts title, abstract, authors, keywords, sections, references,
    journal name and publication date from ScienceDirect-style markup,
    combining them into a single ``ParsedContent`` record.
    """

    # Literal substrings whose presence (case-insensitively) marks an
    # Elsevier/ScienceDirect page.
    _SUBSTRING_INDICATORS = (
        'sciencedirect.com',
        'elsevier.com',
        'meta name="citation_publisher" content="elsevier"',
        'sciencedirect',
    )

    # Regex indicator. BUG FIX: this pattern was previously placed in the
    # substring list and tested with ``in``, which treats ``.*`` literally
    # and therefore never matched; it must be evaluated with re.search.
    _COPYRIGHT_RE = re.compile(r'copyright.*elsevier')

    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """Return True if *html_content* looks like an Elsevier/ScienceDirect page.

        Args:
            html_content: Raw HTML of the page.
            url: Unused; accepted for interface compatibility with other parsers.
        """
        html_lower = html_content.lower()
        if any(ind in html_lower for ind in self._SUBSTRING_INDICATORS):
            return True
        return self._COPYRIGHT_RE.search(html_lower) is not None

    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """Parse Elsevier/ScienceDirect HTML into a ParsedContent record.

        Args:
            html_content: Raw HTML of the article page.
            doi: Optional DOI to attach to the result (not derived from the HTML).

        Returns:
            ParsedContent with full text plus whatever metadata could be found.

        Raises:
            ParseError: If no meaningful full text could be extracted, or if
                parsing fails for any other reason.
        """
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            title = self._extract_title(soup)
            abstract = self._extract_abstract(soup)
            authors = self._extract_authors(soup)
            full_text = self._extract_full_text(soup)
            sections = self._extract_sections(soup)
            keywords = self._extract_keywords(soup)
            references = self._extract_references(soup)
            journal = self._extract_journal(soup)
            publication_date = self._extract_publication_date(soup)

            # Structured sections (when found) give a cleaner full text than
            # the raw body scrape, so prefer them.
            if sections:
                full_text = self._combine_sections(sections, abstract)

            if not full_text or len(full_text.strip()) < 100:
                raise ParseError("Could not extract meaningful full text content")

            return ParsedContent(
                full_text=full_text,
                title=title,
                abstract=abstract,
                authors=authors,
                keywords=keywords,
                sections=sections,
                references=references,
                doi=doi,
                journal=journal,
                publication_date=publication_date,
                metadata={
                    'parser': 'elsevier',
                    'source': 'sciencedirect',
                },
            )
        except ParseError:
            # BUG FIX: previously our own ParseError was re-wrapped by the
            # generic handler below, doubling the message. Re-raise as-is.
            raise
        except Exception as e:
            raise ParseError(f"Failed to parse Elsevier content: {str(e)}") from e

    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the article title, trying page-specific selectors first.

        Falls back to the ``citation_title`` meta tag and finally the
        document ``<title>``. Returns None when nothing matches.
        """
        selectors = [
            'h1.title-text',
            'h1[data-testid="title"]',
            'h1.article-title',
            'meta[name="citation_title"]',
            'title',
        ]
        for selector in selectors:
            if 'meta' in selector:
                element = soup.find('meta', attrs={'name': 'citation_title'})
                if element:
                    return element.get('content', '').strip()
            else:
                element = soup.select_one(selector)
                if element:
                    return element.get_text(strip=True)
        return None

    def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the abstract text from any of the known abstract containers."""
        selectors = [
            'div.abstract-content',
            'div[data-testid="abstract"]',
            'div.abstract',
            'section.abstract',
            'div#abstract',
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)
        return None

    def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract author names.

        Prefers ``citation_author`` meta tags; falls back to author-styled
        DOM elements. Returns None when no authors are found.
        """
        author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
        authors = [meta.get('content', '').strip() for meta in author_metas]
        if not authors:
            author_elements = soup.select('div.author a, span.author, .author-name')
            authors = [elem.get_text(strip=True) for elem in author_elements]
        return authors if authors else None

    def _extract_full_text(self, soup: BeautifulSoup) -> str:
        """Scrape the main article body text.

        Strips script/style/navigation elements from each matched container
        and keeps only substantial (>50 char) text chunks, joined with blank
        lines.
        """
        content_parts = []
        main_selectors = [
            'div.article-content',
            'div.body-content',
            'main.article-body',
            'div[data-testid="article-body"]',
            'section.article-section',
        ]
        for selector in main_selectors:
            for element in soup.select(selector):
                # Drop non-content elements before reading the text.
                for unwanted in element.find_all(
                    ['script', 'style', 'nav', 'footer', 'header']
                ):
                    unwanted.decompose()
                text = element.get_text(separator='\n', strip=True)
                if text and len(text) > 50:  # only add substantial content
                    content_parts.append(text)
        return '\n\n'.join(content_parts)

    def _extract_sections(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
        """Extract article sections keyed by their heading text.

        For each h2/h3/h4 whose class mentions 'section' or 'heading',
        collects sibling text until the next heading. Returns None when no
        sections are found.
        """
        sections: Dict[str, str] = {}
        section_elements = soup.find_all(
            ['h2', 'h3', 'h4'], class_=re.compile(r'section|heading')
        )
        for heading in section_elements:
            section_title = heading.get_text(strip=True)
            content_parts = []
            current = heading.next_sibling
            # Walk siblings until the next heading terminates the section.
            while current and current.name not in ['h1', 'h2', 'h3', 'h4']:
                if hasattr(current, 'get_text'):
                    text = current.get_text(strip=True)
                    if text:
                        content_parts.append(text)
                current = current.next_sibling
            if content_parts:
                sections[section_title] = '\n'.join(content_parts)
        return sections if sections else None

    def _extract_keywords(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract keywords from meta tags or keyword sections (comma-split)."""
        keywords: List[str] = []
        keyword_metas = soup.find_all('meta', attrs={'name': 'citation_keywords'})
        for meta in keyword_metas:
            content = meta.get('content', '')
            if content:
                keywords.extend(kw.strip() for kw in content.split(','))
        if not keywords:
            for section in soup.select('div.keywords, section.keywords'):
                text = section.get_text()
                keywords.extend(kw.strip() for kw in text.split(',') if kw.strip())
        return keywords if keywords else None

    def _extract_references(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract reference entries from reference lists/sections."""
        references: List[str] = []
        ref_sections = soup.select(
            'section.references, div.references, ol.references li'
        )
        for section in ref_sections:
            if section.name == 'li':
                # Matched an individual list item directly.
                references.append(section.get_text(strip=True))
            else:
                ref_items = section.find_all(
                    ['li', 'div'], class_=re.compile(r'reference')
                )
                references.extend(item.get_text(strip=True) for item in ref_items)
        return references if references else None

    def _extract_journal(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the journal name from the citation_journal_title meta tag."""
        journal_meta = soup.find('meta', attrs={'name': 'citation_journal_title'})
        if journal_meta:
            return journal_meta.get('content', '').strip()
        return None

    def _extract_publication_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract the publication date from the citation_publication_date meta tag."""
        date_meta = soup.find('meta', attrs={'name': 'citation_publication_date'})
        if date_meta:
            return date_meta.get('content', '').strip()
        return None

    def _combine_sections(
        self, sections: Dict[str, str], abstract: Optional[str] = None
    ) -> str:
        """Join the abstract (if any) and all sections into one full-text string.

        Each part is rendered as "<heading>\\n<content>" and parts are
        separated by blank lines.
        """
        full_text_parts = []
        if abstract:
            full_text_parts.append(f"Abstract\n{abstract}")
        for section_title, section_content in sections.items():
            full_text_parts.append(f"{section_title}\n{section_content}")
        return '\n\n'.join(full_text_parts)