import os
import time
from datetime import datetime
from urllib.parse import urlparse

import requests
from flask import current_app

from .base import BaseScraper, ScrapeResult
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db


class Scraper(BaseScraper):
    """Web fetcher scraper that downloads HTML content from DOI URLs."""

    # This scraper processes "New" papers and outputs "WebContentDownloaded"/"Failed"
    INPUT_STATUSES = ["New"]
    OUTPUT_STATUS_SUCCESS = "WebContentDownloaded"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "FetchingWebContent"

    def scrape(self, doi: str) -> ScrapeResult:
        """Fetch HTML content from DOI and save to download path."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Log start of scraping
        self.log_scrape_start(doi, paper.id)

        # Update status to processing
        paper.status = self.OUTPUT_STATUS_PROCESSING
        db.session.commit()

        # Prepare file paths
        download_path = DownloadPathConfig.get_path()
        file_name = f"{doi.replace('/', '_')}.html"
        file_path = os.path.join(download_path, file_name)

        # Check/create download directory
        if not os.path.exists(download_path):
            try:
                os.makedirs(download_path, exist_ok=True)
            except OSError as e:
                error_msg = f"Failed to create download directory: {str(e)}"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()
                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "path_creation_error"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

        # Check path permissions
        if not os.access(download_path, os.W_OK):
            error_msg = f"Download path '{download_path}' is not writable"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "path_write_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        try:
            # Fetch HTML from DOI
            doi_url = f"https://doi.org/{doi}"
            headers = {
                'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1'
            }

            response = requests.get(
                doi_url,
                headers=headers,
                timeout=30,
                allow_redirects=True,
                verify=True
            )

            # Check for invalid DOI (404) or other HTTP errors
            if response.status_code == 404:
                error_msg = f"Invalid DOI: {doi} not found (404)"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()
                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "invalid_doi"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Check for other HTTP errors
            response.raise_for_status()

            # Save HTML content
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(response.text)

            # Extract final URL after redirects (for publisher detection)
            final_url = response.url

            # Update paper status to success
            paper.status = self.OUTPUT_STATUS_SUCCESS
            paper.file_path = file_path
            paper.error_msg = None
            db.session.commit()

            # Log success
            success_msg = f"Successfully fetched HTML content for {doi} from {final_url}"
            self.log_scrape_success(doi, success_msg, paper.id)

            return ScrapeResult(
                status="success",
                message=f"Successfully fetched HTML for {doi}",
                data={
                    "file_path": file_path,
                    "final_url": final_url,
                    "content_length": len(response.text),
                    "content_type": response.headers.get('content-type', 'unknown'),
                    "title": paper.title,
                    "domain": urlparse(final_url).netloc if final_url else None
                },
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        except requests.exceptions.HTTPError as e:
            error_msg = f"HTTP error fetching {doi_url}: {e.response.status_code} - {e}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "http_error", "status_code": e.response.status_code},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        except requests.exceptions.RequestException as e:
            error_msg = f"Network error fetching {doi_url}: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "network_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        except Exception as e:
            error_msg = f"Failed to save HTML file: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "file_creation_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )