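"""Parser for Elsevier/ScienceDirect article HTML pages."""
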
import re
from typing import Dict, List, Optional

from bs4 import BeautifulSoup

from .base_parser import BaseParser, ParsedContent, ParseError


class ElsevierParser(BaseParser):
    """Parser for Elsevier/ScienceDirect articles."""

    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """Check if this is an Elsevier/ScienceDirect page."""
        # The URL alone can settle it for known publisher domains
        if url and ('sciencedirect.com' in url.lower() or 'elsevier.com' in url.lower()):
            return True

        html_lower = html_content.lower()

        # Literal substrings that indicate an Elsevier/ScienceDirect page
        # ('sciencedirect' also covers 'sciencedirect.com')
        indicators = [
            'sciencedirect',
            'elsevier.com',
            'meta name="citation_publisher" content="elsevier"',
        ]
        if any(indicator in html_lower for indicator in indicators):
            return True

        # 'copyright.*elsevier' is a regex pattern, so it must go through
        # re.search; as a plain `in` substring test it would never match
        return bool(re.search(r'copyright.*elsevier', html_lower))

    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """Parse Elsevier/ScienceDirect HTML content."""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract metadata and content
            title = self._extract_title(soup)
            abstract = self._extract_abstract(soup)
            authors = self._extract_authors(soup)
            full_text = self._extract_full_text(soup)
            sections = self._extract_sections(soup)
            keywords = self._extract_keywords(soup)
            references = self._extract_references(soup)
            journal = self._extract_journal(soup)
            publication_date = self._extract_publication_date(soup)

            # Prefer the structured sections (plus abstract) as full text
            if sections:
                full_text = self._combine_sections(sections, abstract)

            if not full_text or len(full_text.strip()) < 100:
                raise ParseError("Could not extract meaningful full text content")

            return ParsedContent(
                full_text=full_text,
                title=title,
                abstract=abstract,
                authors=authors,
                keywords=keywords,
                sections=sections,
                references=references,
                doi=doi,
                journal=journal,
                publication_date=publication_date,
                metadata={
                    'parser': 'elsevier',
                    'source': 'sciencedirect'
                }
            )

        except ParseError:
            # Preserve deliberate parse errors rather than re-wrapping them
            raise
        except Exception as e:
            raise ParseError(f"Failed to parse Elsevier content: {e}") from e

    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract article title."""
        # Try multiple title selectors, most specific first
        selectors = [
            'h1.title-text',
            'h1[data-testid="title"]',
            'h1.article-title',
            'meta[name="citation_title"]',
            'title'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            # Meta tags carry the title in their content attribute
            if selector.startswith('meta'):
                title = element.get('content', '').strip()
            else:
                title = element.get_text(strip=True)
            if title:
                return title

        return None

    def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract article abstract."""
        selectors = [
            'div.abstract-content',
            'div[data-testid="abstract"]',
            'div.abstract',
            'section.abstract',
            'div#abstract'
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(strip=True)

        return None

    def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract author names."""
        authors = []

        # Try author meta tags first
        author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
        if author_metas:
            authors = [meta.get('content', '').strip() for meta in author_metas]

        # Fall back to author div/span elements
        if not authors:
            author_elements = soup.select('div.author a, span.author, .author-name')
            authors = [elem.get_text(strip=True) for elem in author_elements]

        # Drop empty entries left by blank tags
        authors = [author for author in authors if author]
        return authors if authors else None

    def _extract_full_text(self, soup: BeautifulSoup) -> str:
        """Extract main article content."""
        content_parts = []

        # Try main content selectors
        main_selectors = [
            'div.article-content',
            'div.body-content',
            'main.article-body',
            'div[data-testid="article-body"]',
            'section.article-section'
        ]

        for selector in main_selectors:
            elements = soup.select(selector)
            for element in elements:
                # Remove script, style, and navigation elements
                for unwanted in element.find_all(['script', 'style', 'nav', 'footer', 'header']):
                    unwanted.decompose()

                text = element.get_text(separator='\n', strip=True)
                if text and len(text) > 50:  # Only add substantial content
                    content_parts.append(text)

        return '\n\n'.join(content_parts)

    def _extract_sections(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
        """Extract article sections with headings."""
        sections = {}

        # Look for section headings and content
        section_elements = soup.find_all(['h2', 'h3', 'h4'], class_=re.compile(r'section|heading'))

        for heading in section_elements:
            section_title = heading.get_text(strip=True)

            # Collect content after this heading until the next heading
            content_parts = []
            current = heading.next_sibling

            # `is not None` matters here: an empty text node is falsy but
            # should not end the walk early
            while current is not None and current.name not in ['h1', 'h2', 'h3', 'h4']:
                if hasattr(current, 'get_text'):
                    text = current.get_text(strip=True)
                    if text:
                        content_parts.append(text)
                current = current.next_sibling

            if content_parts:
                sections[section_title] = '\n'.join(content_parts)

        return sections if sections else None

    def _extract_keywords(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract article keywords."""
        keywords = []

        # Try keyword meta tags
        keyword_metas = soup.find_all('meta', attrs={'name': 'citation_keywords'})
        if keyword_metas:
            for meta in keyword_metas:
                content = meta.get('content', '')
                if content:
                    keywords.extend([kw.strip() for kw in content.split(',')])

        # Try keyword sections
        if not keywords:
            keyword_sections = soup.select('div.keywords, section.keywords')
            for section in keyword_sections:
                text = section.get_text()
                keywords.extend([kw.strip() for kw in text.split(',') if kw.strip()])

        return keywords if keywords else None

    def _extract_references(self, soup: BeautifulSoup) -> Optional[List[str]]:
        """Extract references."""
        references = []

        # The selector matches both reference containers and bare list items
        ref_sections = soup.select('section.references, div.references, ol.references li')
        for section in ref_sections:
            if section.name == 'li':
                references.append(section.get_text(strip=True))
            else:
                ref_items = section.find_all(['li', 'div'], class_=re.compile(r'reference'))
                references.extend([item.get_text(strip=True) for item in ref_items])

        return references if references else None

    def _extract_journal(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract journal name."""
        journal_meta = soup.find('meta', attrs={'name': 'citation_journal_title'})
        if journal_meta:
            return journal_meta.get('content', '').strip()

        return None

    def _extract_publication_date(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract publication date."""
        date_meta = soup.find('meta', attrs={'name': 'citation_publication_date'})
        if date_meta:
            return date_meta.get('content', '').strip()

        return None

    def _combine_sections(self, sections: Dict[str, str], abstract: Optional[str] = None) -> str:
        """Combine all sections into full text."""
        full_text_parts = []

        if abstract:
            full_text_parts.append(f"Abstract\n{abstract}")

        for section_title, section_content in sections.items():
            full_text_parts.append(f"{section_title}\n{section_content}")

        return '\n\n'.join(full_text_parts)
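

# Minimal usage sketch, not part of the parser itself. Assumes a ScienceDirect
# page saved locally (the "article.html" path, URL, and DOI below are all
# hypothetical) and that ParsedContent exposes its constructor fields as
# attributes. Because of the relative import above, run this as a module,
# e.g. `python -m <package>.elsevier_parser`, not as a standalone script.
if __name__ == "__main__":
    with open("article.html", encoding="utf-8") as fh:
        html = fh.read()

    parser = ElsevierParser()
    if parser.can_parse(html, url="https://www.sciencedirect.com/science/article/pii/EXAMPLE"):
        parsed = parser.parse(html, doi="10.1016/j.example.2024.0001")  # hypothetical DOI
        print(parsed.title)
        print(parsed.full_text[:500])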