import os
import time
from datetime import datetime
from typing import Optional

from flask import current_app

from .base import BaseScraper, ScrapeResult
from ..db import db
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..parsers.base_parser import BaseParser, ParseError
from ..parsers.elsevier_parser import ElsevierParser
from ..parsers.arxiv_parser import ArxivParser


class Scraper(BaseScraper):
"""Full text extraction scraper that uses publisher-specific parsers."""
# This scraper processes papers with HTML content and outputs "TextExtracted"/"Failed"
INPUT_STATUSES = ["WebContentDownloaded", "PublisherDetected"]
OUTPUT_STATUS_SUCCESS = "TextExtracted"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "ExtractingText"
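
    # Status pipeline, for reference:
    #   WebContentDownloaded / PublisherDetected
    #       -> ExtractingText -> TextExtracted (success) or Failed (error)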

    def __init__(self):
        super().__init__()
        # Registry of available parsers. _select_parser() returns the first
        # parser whose can_parse() accepts the HTML, so order is priority.
        self.parsers = [
            ElsevierParser(),
            ArxivParser(),
            # Add more parsers here as you create them
            # SpringerParser(),
            # WileyParser(),
            # IEEEParser(),
        ]
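
        # A minimal sketch of what a new parser would need to provide,
        # inferred from how parsers are used in this module; the method
        # names (can_parse / parse / validate_content / get_name) are
        # assumed to be the BaseParser interface, and SpringerParser is
        # hypothetical:
        #
        #   class SpringerParser(BaseParser):
        #       def get_name(self) -> str:
        #           return "springer"
        #
        #       def can_parse(self, html_content: str) -> bool:
        #           return "springerlink" in html_content.lower()
        #
        #       def parse(self, html_content: str, doi: str):
        #           ...  # return a parsed-content object
        #
        #       def validate_content(self, parsed_content) -> bool:
        #           return bool(parsed_content.full_text)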

    def scrape(self, doi: str) -> ScrapeResult:
        """Extract full text using the appropriate publisher parser."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Log start of scraping
        self.log_scrape_start(doi, paper.id)

        # Update status to processing
        paper.status = self.OUTPUT_STATUS_PROCESSING
        db.session.commit()

        # Check that the downloaded HTML file exists
        if not paper.file_path or not os.path.exists(paper.file_path):
            error_msg = f"HTML file not found for DOI {doi}. Expected at: {paper.file_path}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "html_file_not_found"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        try:
            # Read HTML content
            with open(paper.file_path, 'r', encoding='utf-8') as f:
                html_content = f.read()

            # Find an appropriate parser
            parser = self._select_parser(html_content)
            if not parser:
                error_msg = f"No suitable parser found for DOI {doi}"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()
                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "no_parser_available"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Parse content
            parsed_content = parser.parse(html_content, doi)

            # Validate parsed content
            if not parser.validate_content(parsed_content):
                error_msg = f"Parsed content validation failed for DOI {doi}"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()
                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "content_validation_failed"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Save extracted text to file
            text_file_path = self._save_extracted_text(parsed_content, doi)

            # Update paper status to success
            paper.status = self.OUTPUT_STATUS_SUCCESS
            paper.error_msg = None
            # You might want to add a text_file_path field to store the text file location
            # paper.text_file_path = text_file_path
            db.session.commit()

            success_msg = f"Successfully extracted text using {parser.get_name()} parser"
            self.log_scrape_success(doi, success_msg, paper.id)

            return ScrapeResult(
                status="success",
                message=f"Successfully extracted full text for {doi}",
                data={
                    "text_file_path": text_file_path,
                    "parser_used": parser.get_name(),
                    "title": parsed_content.title,
                    "word_count": len(parsed_content.full_text.split()),
                    "has_abstract": bool(parsed_content.abstract),
                    "has_sections": bool(parsed_content.sections),
                    "author_count": len(parsed_content.authors) if parsed_content.authors else 0,
                    "keyword_count": len(parsed_content.keywords) if parsed_content.keywords else 0,
                    "reference_count": len(parsed_content.references) if parsed_content.references else 0
                },
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        except ParseError as e:
            error_msg = f"Parser error for DOI {doi}: {e}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "parser_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )
        except Exception as e:
            error_msg = f"Unexpected error extracting text for DOI {doi}: {e}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "extraction_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )
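
    # Error codes scrape() can return in ScrapeResult.data["error_code"]:
    #   html_file_not_found, no_parser_available, content_validation_failed,
    #   parser_error, extraction_error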

    def _select_parser(self, html_content: str) -> Optional[BaseParser]:
        """
        Select a parser for the given HTML content.

        Args:
            html_content: The HTML content to analyze

        Returns:
            The first registered parser that reports it can handle this
            content, or None if no parser can
        """
        for parser in self.parsers:
            if parser.can_parse(html_content):
                return parser
        return None
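
    # Note: selection is first-match, so the registration order in __init__
    # acts as a priority order. If two parsers could both accept a page,
    # register the more specific one first, e.g. (GenericHTMLParser is
    # hypothetical, shown only to illustrate ordering):
    #
    #   self.parsers = [ElsevierParser(), ArxivParser(), GenericHTMLParser()]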

    def _save_extracted_text(self, parsed_content, doi: str) -> str:
        """
        Save extracted text to a file.

        Args:
            parsed_content: The parsed content object
            doi: The DOI of the paper

        Returns:
            Path to the saved text file
        """
        download_path = DownloadPathConfig.get_path()
        # Replace '/' so the DOI is safe to use as a file name
        text_file_name = f"{doi.replace('/', '_')}_fulltext.txt"
        text_file_path = os.path.join(download_path, text_file_name)

        # Make sure the download directory exists before writing
        os.makedirs(download_path, exist_ok=True)

        with open(text_file_path, 'w', encoding='utf-8') as f:
            # Write structured metadata header
            f.write(f"DOI: {parsed_content.doi or doi}\n")
            f.write(f"Title: {parsed_content.title or 'Unknown'}\n")
            f.write(f"Journal: {parsed_content.journal or 'Unknown'}\n")
            f.write(f"Publication Date: {parsed_content.publication_date or 'Unknown'}\n")
            if parsed_content.authors:
                f.write(f"Authors: {', '.join(parsed_content.authors)}\n")
            if parsed_content.keywords:
                f.write(f"Keywords: {', '.join(parsed_content.keywords)}\n")
            f.write(f"Extracted: {datetime.utcnow().isoformat()}\n")
            f.write("=" * 80 + "\n\n")

            # Write full text
            f.write(parsed_content.full_text)

            # Optionally write references at the end
            if parsed_content.references:
                f.write("\n\n" + "=" * 80 + "\n")
                f.write("REFERENCES\n")
                f.write("=" * 80 + "\n")
                for i, ref in enumerate(parsed_content.references, 1):
                    f.write(f"{i}. {ref}\n")

        return text_file_path
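

# --- Example usage (hypothetical sketch) ---
# Driving this scraper by hand requires a Flask application context, since
# the SQLAlchemy queries above need one. The `create_app` factory and its
# import path are assumptions; adjust them to match your project layout.
#
#   from app import create_app
#
#   app = create_app()
#   with app.app_context():
#       result = Scraper().scrape("10.1234/example-doi")
#       print(result.status, result.message, result.data)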