import time
import os
from datetime import datetime
from typing import Optional

from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
from ..parsers.base_parser import BaseParser, ParseError
from ..parsers.elsevier_parser import ElsevierParser
from ..parsers.arxiv_parser import ArxivParser


class Scraper(BaseScraper):
    """Full text extraction scraper that uses publisher-specific parsers."""

    # This scraper processes papers with HTML content and outputs "TextExtracted"/"Failed"
    INPUT_STATUSES = ["WebContentDownloaded", "PublisherDetected"]
    OUTPUT_STATUS_SUCCESS = "TextExtracted"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "ExtractingText"

    def __init__(self):
        super().__init__()
        # Registry of available parsers
        self.parsers = [
            ElsevierParser(),
            ArxivParser(),
            # Add more parsers here as you create them
            # SpringerParser(),
            # WileyParser(),
            # IEEEParser(),
        ]

    def scrape(self, doi: str) -> ScrapeResult:
        """Extract full text using appropriate publisher parser."""
        start_time = time.time()
        paper = PaperMetadata.query.filter_by(doi=doi).first()

        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Log start of scraping
        self.log_scrape_start(doi, paper.id)

        # Update status to processing
        paper.status = self.OUTPUT_STATUS_PROCESSING
        db.session.commit()

        # Check if HTML file exists
        if not paper.file_path or not os.path.exists(paper.file_path):
            error_msg = f"HTML file not found for DOI {doi}. Expected at: {paper.file_path}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "html_file_not_found"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        try:
            # Read HTML content
            with open(paper.file_path, 'r', encoding='utf-8') as f:
                html_content = f.read()

            # Find appropriate parser
            parser = self._select_parser(html_content)
            if not parser:
                error_msg = f"No suitable parser found for DOI {doi}"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()
                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "no_parser_available"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Parse content
            parsed_content = parser.parse(html_content, doi)

            # Validate parsed content
            if not parser.validate_content(parsed_content):
                error_msg = f"Parsed content validation failed for DOI {doi}"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()
                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "content_validation_failed"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Save extracted text to file
            text_file_path = self._save_extracted_text(parsed_content, doi)

            # Update paper status to success
            paper.status = self.OUTPUT_STATUS_SUCCESS
            paper.error_msg = None
            # You might want to add a text_file_path field to store the text file location
            # paper.text_file_path = text_file_path
            db.session.commit()

            success_msg = f"Successfully extracted text using {parser.get_name()} parser"
            self.log_scrape_success(doi, success_msg, paper.id)

            return ScrapeResult(
                status="success",
                message=f"Successfully extracted full text for {doi}",
                data={
                    "text_file_path": text_file_path,
                    "parser_used": parser.get_name(),
                    "title": parsed_content.title,
                    "word_count": len(parsed_content.full_text.split()),
                    "has_abstract": bool(parsed_content.abstract),
                    "has_sections": bool(parsed_content.sections),
                    "author_count": len(parsed_content.authors) if parsed_content.authors else 0,
                    "keyword_count": len(parsed_content.keywords) if parsed_content.keywords else 0,
                    "reference_count": len(parsed_content.references) if parsed_content.references else 0
                },
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        except ParseError as e:
            error_msg = f"Parser error for DOI {doi}: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "parser_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        except Exception as e:
            error_msg = f"Unexpected error extracting text for DOI {doi}: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "extraction_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

    def _select_parser(self, html_content: str) -> Optional[BaseParser]:
        """
        Select the most appropriate parser for the HTML content.

        Args:
            html_content: The HTML content to analyze

        Returns:
            The best parser for this content, or None if no parser can handle it
        """
        for parser in self.parsers:
            if parser.can_parse(html_content):
                return parser
        return None

    def _save_extracted_text(self, parsed_content, doi: str) -> str:
        """
        Save extracted text to a file.

        Args:
            parsed_content: The parsed content object
            doi: The DOI of the paper

        Returns:
            Path to the saved text file
        """
        download_path = DownloadPathConfig.get_path()
        text_file_name = f"{doi.replace('/', '_')}_fulltext.txt"
        text_file_path = os.path.join(download_path, text_file_name)

        with open(text_file_path, 'w', encoding='utf-8') as f:
            # Write structured content
            f.write(f"DOI: {parsed_content.doi or doi}\n")
            f.write(f"Title: {parsed_content.title or 'Unknown'}\n")
            f.write(f"Journal: {parsed_content.journal or 'Unknown'}\n")
            f.write(f"Publication Date: {parsed_content.publication_date or 'Unknown'}\n")

            if parsed_content.authors:
                f.write(f"Authors: {', '.join(parsed_content.authors)}\n")

            if parsed_content.keywords:
                f.write(f"Keywords: {', '.join(parsed_content.keywords)}\n")

            f.write(f"Extracted: {datetime.utcnow().isoformat()}\n")
            f.write("=" * 80 + "\n\n")

            # Write full text
            f.write(parsed_content.full_text)

            # Optionally write references at the end
            if parsed_content.references:
                f.write("\n\n" + "=" * 80 + "\n")
                f.write("REFERENCES\n")
                f.write("=" * 80 + "\n")
                for i, ref in enumerate(parsed_content.references, 1):
                    f.write(f"{i}. {ref}\n")

        return text_file_path
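
# Usage sketch: assumes an active Flask application context and an existing
# PaperMetadata row whose downloaded HTML sits at paper.file_path with a
# status listed in INPUT_STATUSES. The `app` object and the DOI below are
# hypothetical placeholders, not part of this module.
#
#     with app.app_context():
#         scraper = Scraper()
#         result = scraper.scrape("10.1016/j.example.2024.123456")
#         if result.status == "success":
#             print(result.data["parser_used"], result.data["text_file_path"])
#         else:
#             print(result.message)
#
# A new parser plugs into self.parsers by implementing the four methods this
# scraper calls on it: can_parse(html_content), parse(html_content, doi),
# validate_content(parsed_content), and get_name(). For example, the
# hypothetical SpringerParser mentioned in the registry would subclass
# BaseParser and return True from can_parse() only for Springer HTML.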