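"""Web-content scraper stage for SciPaperLoader: resolves a DOI via
https://doi.org and saves the publisher's HTML landing page to disk."""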

import os
import time
from datetime import datetime
from urllib.parse import urlparse

import requests
from flask import current_app

from .base import BaseScraper, ScrapeResult
from ..db import db
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig

class Scraper(BaseScraper):
    """Web fetcher scraper that downloads HTML content from DOI URLs."""

    # This scraper processes "New" papers and outputs
    # "WebContentDownloaded" on success or "Failed" on error.
    INPUT_STATUSES = ["New"]
    OUTPUT_STATUS_SUCCESS = "WebContentDownloaded"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "FetchingWebContent"
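    #
    # Status flow driven by the constants above:
    #   New --> FetchingWebContent --> WebContentDownloaded   (success)
    #                              `-> Failed                 (any error branch)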

    def scrape(self, doi: str) -> ScrapeResult:
        """Fetch HTML content from a DOI and save it to the download path."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Log start of scraping
        self.log_scrape_start(doi, paper.id)

        # Update status to processing
        paper.status = self.OUTPUT_STATUS_PROCESSING
        db.session.commit()

        # Prepare file paths
        download_path = DownloadPathConfig.get_path()
        file_name = f"{doi.replace('/', '_')}.html"
        file_path = os.path.join(download_path, file_name)

        # Ensure the download directory exists (exist_ok makes this idempotent)
        try:
            os.makedirs(download_path, exist_ok=True)
        except OSError as e:
            error_msg = f"Failed to create download directory: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "path_creation_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Check path permissions
        if not os.access(download_path, os.W_OK):
            error_msg = f"Download path '{download_path}' is not writable"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "path_write_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        try:
            # Resolve the DOI via doi.org; redirects end at the publisher's page
            doi_url = f"https://doi.org/{doi}"
            headers = {
                'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1'
            }
            response = requests.get(
                doi_url,
                headers=headers,
                timeout=30,
                allow_redirects=True,
                verify=True
            )

            # A 404 from doi.org means the DOI does not resolve
            if response.status_code == 404:
                error_msg = f"Invalid DOI: {doi} not found (404)"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()
                self.log_scrape_failure(doi, error_msg, paper.id)
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "invalid_doi"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Raise on any other HTTP error status
            response.raise_for_status()

            # Save HTML content
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(response.text)

            # Final URL after redirects (used for publisher detection)
            final_url = response.url

            # Update paper status to success
            paper.status = self.OUTPUT_STATUS_SUCCESS
            paper.file_path = file_path
            paper.error_msg = None
            db.session.commit()

            # Log success
            success_msg = f"Successfully fetched HTML content for {doi} from {final_url}"
            self.log_scrape_success(doi, success_msg, paper.id)

            return ScrapeResult(
                status="success",
                message=f"Successfully fetched HTML for {doi}",
                data={
                    "file_path": file_path,
                    "final_url": final_url,
                    "content_length": len(response.text),
                    "content_type": response.headers.get('content-type', 'unknown'),
                    "title": paper.title,
                    "domain": urlparse(final_url).netloc if final_url else None
                },
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )
        except requests.exceptions.HTTPError as e:
            error_msg = f"HTTP error fetching {doi_url}: {e.response.status_code} - {e}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "http_error", "status_code": e.response.status_code},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )
        except requests.exceptions.RequestException as e:
            error_msg = f"Network error fetching {doi_url}: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "network_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )
        except Exception as e:
            # Anything else at this point is most likely a file-write failure
            error_msg = f"Failed to save HTML file: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()
            self.log_scrape_failure(doi, error_msg, paper.id)
            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "file_creation_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )
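
# Example usage: scrape() must run inside a Flask application context because
# it queries the database through PaperMetadata. The `create_app` factory
# named below is an assumed name for this project's app factory, not a
# confirmed API, and the DOI is a placeholder.
#
#     from scipaperloader import create_app  # hypothetical factory name
#
#     app = create_app()
#     with app.app_context():
#         result = Scraper().scrape("10.1234/example-doi")
#         print(result.status, result.message)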