import time
import os
from datetime import datetime
from typing import Optional

from .base import BaseScraper, ScrapeResult
from ..models import PaperMetadata, DownloadPathConfig
from ..db import db
from ..parsers.base_parser import BaseParser, ParseError
from ..parsers.elsevier_parser import ElsevierParser
from ..parsers.arxiv_parser import ArxivParser

class Scraper(BaseScraper):
    """Full text extraction scraper that uses publisher-specific parsers."""

    # Papers enter in one of the INPUT_STATUSES, are marked
    # OUTPUT_STATUS_PROCESSING while extraction runs, and finish as either
    # OUTPUT_STATUS_SUCCESS ("TextExtracted") or OUTPUT_STATUS_FAILURE ("Failed")
    INPUT_STATUSES = ["WebContentDownloaded", "PublisherDetected"]
    OUTPUT_STATUS_SUCCESS = "TextExtracted"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "ExtractingText"
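
    # A scheduler outside this class presumably selects eligible papers with
    # a query along these lines (a sketch, not code from this repository):
    #   PaperMetadata.query.filter(PaperMetadata.status.in_(Scraper.INPUT_STATUSES)).all()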

    def __init__(self):
        super().__init__()
        # Registry of available parsers; order matters, because the first
        # parser whose can_parse() accepts the HTML wins (see _select_parser)
        self.parsers = [
            ElsevierParser(),
            ArxivParser(),
            # Add more parsers here as you create them, e.g.:
            # SpringerParser(),
            # WileyParser(),
            # IEEEParser(),
        ]
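
    # A new parser only needs the BaseParser hooks this scraper calls:
    # can_parse(), parse(), validate_content(), and get_name(). A minimal,
    # hypothetical sketch (the marker string is made up):
    #
    #   class SpringerParser(BaseParser):
    #       def can_parse(self, html_content: str) -> bool:
    #           return "springer.com" in html_content
    #
    #       def parse(self, html_content: str, doi: str):
    #           ...  # return a parsed-content object (title, full_text, etc.)
    #
    #       def get_name(self) -> str:
    #           return "Springer"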

    def scrape(self, doi: str) -> ScrapeResult:
        """Extract full text using the appropriate publisher parser."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Log start of scraping
        self.log_scrape_start(doi, paper.id)

        # Update status to processing
        paper.status = self.OUTPUT_STATUS_PROCESSING
        db.session.commit()

        # Check that the downloaded HTML file exists
        if not paper.file_path or not os.path.exists(paper.file_path):
            error_msg = f"HTML file not found for DOI {doi}. Expected at: {paper.file_path}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()

            self.log_scrape_failure(doi, error_msg, paper.id)

            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "html_file_not_found"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        try:
            # Read HTML content
            with open(paper.file_path, 'r', encoding='utf-8') as f:
                html_content = f.read()

            # Find appropriate parser
            parser = self._select_parser(html_content)

            if not parser:
                error_msg = f"No suitable parser found for DOI {doi}"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()

                self.log_scrape_failure(doi, error_msg, paper.id)

                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "no_parser_available"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Parse content
            parsed_content = parser.parse(html_content, doi)

            # Validate parsed content
            if not parser.validate_content(parsed_content):
                error_msg = f"Parsed content validation failed for DOI {doi}"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()

                self.log_scrape_failure(doi, error_msg, paper.id)

                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "content_validation_failed"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Save extracted text to file
            text_file_path = self._save_extracted_text(parsed_content, doi)

            # Update paper status to success
            paper.status = self.OUTPUT_STATUS_SUCCESS
            paper.error_msg = None
            # Consider adding a text_file_path column to PaperMetadata so the
            # location of the text file is persisted with the paper:
            # paper.text_file_path = text_file_path
            db.session.commit()

            success_msg = f"Successfully extracted text using {parser.get_name()} parser"
            self.log_scrape_success(doi, success_msg, paper.id)

            return ScrapeResult(
                status="success",
                message=f"Successfully extracted full text for {doi}",
                data={
                    "text_file_path": text_file_path,
                    "parser_used": parser.get_name(),
                    "title": parsed_content.title,
                    "word_count": len(parsed_content.full_text.split()),
                    "has_abstract": bool(parsed_content.abstract),
                    "has_sections": bool(parsed_content.sections),
                    "author_count": len(parsed_content.authors) if parsed_content.authors else 0,
                    "keyword_count": len(parsed_content.keywords) if parsed_content.keywords else 0,
                    "reference_count": len(parsed_content.references) if parsed_content.references else 0
                },
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        except ParseError as e:
            # A parser matched the content but failed during extraction
            error_msg = f"Parser error for DOI {doi}: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()

            self.log_scrape_failure(doi, error_msg, paper.id)

            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "parser_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        except Exception as e:
            # Catch-all so a single bad paper cannot crash the pipeline
            error_msg = f"Unexpected error extracting text for DOI {doi}: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()

            self.log_scrape_failure(doi, error_msg, paper.id)

            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "extraction_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

    def _select_parser(self, html_content: str) -> Optional[BaseParser]:
        """
        Select the most appropriate parser for the HTML content.

        Parsers are tried in registration order, so more specific parsers
        should be registered before more general ones.

        Args:
            html_content: The HTML content to analyze

        Returns:
            The first parser that can handle this content, or None if no
            registered parser can parse it
        """
        for parser in self.parsers:
            if parser.can_parse(html_content):
                return parser

        return None

    def _save_extracted_text(self, parsed_content, doi: str) -> str:
        """
        Save extracted text to a file.

        Args:
            parsed_content: The parsed content object
            doi: The DOI of the paper

        Returns:
            Path to the saved text file
        """
        download_path = DownloadPathConfig.get_path()
        # Only '/' is replaced here; DOIs containing other characters that
        # are unsafe in filenames would need additional sanitization.
        text_file_name = f"{doi.replace('/', '_')}_fulltext.txt"
        text_file_path = os.path.join(download_path, text_file_name)
        # Ensure the download directory exists before writing
        os.makedirs(download_path, exist_ok=True)
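
        # The resulting file layout: a short metadata header, a "=" separator
        # line, the full text, and (when available) a numbered REFERENCES
        # section appended at the end.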
        with open(text_file_path, 'w', encoding='utf-8') as f:
            # Write structured content
            f.write(f"DOI: {parsed_content.doi or doi}\n")
            f.write(f"Title: {parsed_content.title or 'Unknown'}\n")
            f.write(f"Journal: {parsed_content.journal or 'Unknown'}\n")
            f.write(f"Publication Date: {parsed_content.publication_date or 'Unknown'}\n")

            if parsed_content.authors:
                f.write(f"Authors: {', '.join(parsed_content.authors)}\n")

            if parsed_content.keywords:
                f.write(f"Keywords: {', '.join(parsed_content.keywords)}\n")

            f.write(f"Extracted: {datetime.utcnow().isoformat()}\n")
            f.write("=" * 80 + "\n\n")

            # Write full text
            f.write(parsed_content.full_text)

            # Optionally write references at the end
            if parsed_content.references:
                f.write("\n\n" + "=" * 80 + "\n")
                f.write("REFERENCES\n")
                f.write("=" * 80 + "\n")
                for i, ref in enumerate(parsed_content.references, 1):
                    f.write(f"{i}. {ref}\n")

        return text_file_path