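"""HTML fetcher scraper: resolves a DOI via doi.org and saves the page HTML for further processing."""
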
import os
import time
from datetime import datetime

import requests

from .base import BaseScraper, ScrapeResult
from ..db import db
from ..models import ActivityLog, DownloadPathConfig, PaperMetadata


class Scraper(BaseScraper):
    """Scraper that fetches HTML content from a DOI and saves it for further processing."""

    # This scraper processes "New" papers and outputs "HtmlDownloaded"/"Failed".
    INPUT_STATUSES = ["New"]
    OUTPUT_STATUS_SUCCESS = "HtmlDownloaded"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "FetchingHtml"
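
    # Status flow (derived from the constants above):
    #   New -> FetchingHtml -> HtmlDownloaded   on success
    #   New -> FetchingHtml -> Failed           on any error path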

    def scrape(self, doi: str) -> ScrapeResult:
        """Fetch HTML content from the DOI and save it to the download path."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Log start of scraping
        self.log_scrape_start(doi, paper.id)

        # Update status to processing
        paper.status = self.OUTPUT_STATUS_PROCESSING
        db.session.commit()

        # Prepare file paths
        download_path = DownloadPathConfig.get_path()
        file_name = f"{doi.replace('/', '_')}.html"
        file_path = os.path.join(download_path, file_name)
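        # Example (illustrative DOI): "10.1234/abc.5678" would be saved as
        # "<download_path>/10.1234_abc.5678.html".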

        # Check/create download directory (same pattern as dummy)
        if not os.path.exists(download_path):
            try:
                os.makedirs(download_path, exist_ok=True)
            except OSError as e:
                error_msg = f"Failed to create download directory: {str(e)}"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()

                self.log_scrape_failure(doi, error_msg, paper.id)

                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "path_creation_error"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

        # Check path permissions (same pattern as dummy)
        if not os.access(download_path, os.W_OK):
            error_msg = f"Download path '{download_path}' is not writable"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()

            ActivityLog.log_scraper_activity(
                action="html_fetch_path_error",
                status="error",
                description=error_msg,
                paper_id=paper.id
            )

            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "path_write_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )
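
        # Note: os.access is an advisory check (TOCTOU); the write further below
        # can still fail after it passes, which the generic handler at the end reports.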

        try:
            # Fetch HTML by resolving the DOI (doi.org redirects to the publisher's page)
            doi_url = f"https://doi.org/{doi}"
            headers = {'User-Agent': 'SciPaperLoader/1.0'}
            response = requests.get(doi_url, headers=headers, timeout=30, allow_redirects=True)

            # Check for invalid DOI (404) or other HTTP errors
            if response.status_code == 404:
                error_msg = f"Invalid DOI: {doi} not found"
                paper.status = self.OUTPUT_STATUS_FAILURE
                paper.error_msg = error_msg
                db.session.commit()

                self.log_scrape_failure(doi, error_msg, paper.id)

                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "invalid_doi"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            response.raise_for_status()  # Raise for other HTTP errors
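
            # response.text is decoded using the encoding requests detects for the
            # response; the file below is written out re-encoded as UTF-8.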

            # Save HTML content
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(response.text)

            # Update paper status to success
            paper.status = self.OUTPUT_STATUS_SUCCESS
            paper.file_path = file_path
            paper.error_msg = None
            db.session.commit()

            # Log success
            self.log_scrape_success(doi, f"Successfully fetched HTML for {doi}", paper.id)

            return ScrapeResult(
                status="success",
                message=f"Successfully fetched HTML for {doi}",
                data={
                    "file_path": file_path,
                    "url": response.url,  # Final URL after redirects
                    "title": paper.title
                },
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        except requests.exceptions.RequestException as e:
            error_msg = f"Failed to fetch HTML from DOI {doi}: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()

            ActivityLog.log_scraper_activity(
                action="html_fetch",
                status="error",
                description=error_msg,
                paper_id=paper.id
            )

            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "network_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        except Exception as e:
            error_msg = f"Failed to save HTML file: {str(e)}"
            paper.status = self.OUTPUT_STATUS_FAILURE
            paper.error_msg = error_msg
            db.session.commit()

            self.log_scrape_failure(doi, error_msg, paper.id)

            return ScrapeResult(
                status="error",
                message=error_msg,
                data={"error_code": "file_creation_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )
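

# Minimal usage sketch (illustrative only; the package path and app factory name
# below are assumptions, and Scraper() assumes BaseScraper takes no constructor
# arguments -- none of this is confirmed by this module):
#
#     from scipaperloader import create_app                     # hypothetical factory
#     from scipaperloader.scrapers.html_fetcher import Scraper  # hypothetical path
#
#     app = create_app()
#     with app.app_context():  # DB access requires an active Flask app context
#         result = Scraper().scrape("10.1234/abc.5678")
#         print(result.status, result.message)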