
import os
import random
import time
from datetime import datetime

from .base import BaseScraper, ScrapeResult
from ..models import ActivityLog, DownloadPathConfig, PaperMetadata
from ..db import db


class Scraper(BaseScraper):
"""Dummy scraper for testing purposes that simulates paper downloading."""
# This scraper processes "New" papers and outputs "Done"/"Failed"
INPUT_STATUSES = ["New"]
OUTPUT_STATUS_SUCCESS = "Done"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "Pending"
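
    # The status constants above form the contract with the scraper pipeline.
    # A minimal sketch of how a manager loop might select work for this
    # scraper (the manager itself is an assumption, not defined in this file):
    #
    #     candidates = PaperMetadata.query.filter(
    #         PaperMetadata.status.in_(Scraper.INPUT_STATUSES)
    #     ).all()
    #     for paper in candidates:
    #         Scraper().scrape(paper.doi)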
    def scrape(self, doi: str) -> ScrapeResult:
        """Simulate scraping a paper with realistic timing and random success/failure."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )

        # Log start of scraping
        self.log_scrape_start(doi, paper.id)

        # Simulate processing time (1-3 seconds)
        processing_time = random.uniform(1, 3)
        time.sleep(processing_time)

        # Simulate an 80% success rate
        success = random.random() < 0.8
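
        # Testing note (an assumption about how this dummy is used in tests):
        # seed the RNG up front, e.g. random.seed(42), to get a reproducible
        # pass/fail sequence when asserting on scraper behavior.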
        if success:
            # Get the configured download path and derive a target file name
            download_path = DownloadPathConfig.get_path()
            file_name = f"{doi.replace('/', '_')}.pdf"
            file_path = os.path.join(download_path, file_name)

            # Create the download directory if it doesn't exist yet
            if not os.path.exists(download_path):
                try:
                    os.makedirs(download_path, exist_ok=True)
                except OSError as e:
                    error_msg = f"Failed to create download directory: {e}"
                    paper.status = "Failed"
                    paper.error_msg = error_msg
                    ActivityLog.log_scraper_activity(
                        action="dummy_scrape_path_error",
                        status="error",
                        description=error_msg,
                        paper_id=paper.id,
                    )
                    db.session.commit()  # persist the failure before returning
                    return ScrapeResult(
                        status="error",
                        message=error_msg,
                        data={"error_code": "path_creation_error"},
                        duration=time.time() - start_time,
                        timestamp=datetime.utcnow(),
                    )
            # Check that the path is readable
            if not os.access(download_path, os.R_OK):
                error_msg = f"Download path '{download_path}' is not readable"
                paper.status = "Failed"
                paper.error_msg = error_msg
                ActivityLog.log_scraper_activity(
                    action="dummy_scrape_path_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id,
                )
                db.session.commit()  # persist the failure before returning
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "path_read_error"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow(),
                )

            # Check that the path is writable
            if not os.access(download_path, os.W_OK):
                error_msg = f"Download path '{download_path}' is not writable"
                paper.status = "Failed"
                paper.error_msg = error_msg
                ActivityLog.log_scraper_activity(
                    action="dummy_scrape_path_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id,
                )
                db.session.commit()  # persist the failure before returning
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "path_write_error"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow(),
                )
            # Create a simple dummy "PDF" (a plain-text stand-in) file
            try:
                with open(file_path, "w") as f:
                    f.write(f"Dummy PDF file for paper with DOI: {doi}\n")
                    f.write(f"Title: {paper.title}\n")
                    f.write(f"Journal: {paper.journal}\n")
                    f.write(f"Generated: {datetime.utcnow().isoformat()}\n")
                    f.write("\nThis is a dummy file created by the SciPaperLoader dummy scraper.\n")

                # Update paper status
                paper.status = "Done"
                paper.file_path = file_path
                paper.error_msg = None
            except Exception as e:
                # Handle file creation errors
                error_msg = f"Failed to create dummy file: {e}"
                paper.status = "Failed"
                paper.error_msg = error_msg
                ActivityLog.log_scraper_activity(
                    action="dummy_scrape_file_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id,
                )
                db.session.commit()  # persist the failure before returning
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "file_creation_error"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow(),
                )
            # Log success
            self.log_scrape_success(doi, f"Successfully scraped {doi}", paper.id)

            result = ScrapeResult(
                status="success",
                message=f"Successfully scraped {doi}",
                data={
                    "file_path": file_path,
                    "title": paper.title,
                    "journal": paper.journal,
                },
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )
        else:
            # Simulate a failure with a randomly chosen realistic error
            error_messages = [
                "Paper not found in database",
                "Access denied by publisher",
                "Rate limit exceeded",
                "Network timeout",
                "Invalid DOI format",
            ]
            error_msg = random.choice(error_messages)
            paper.status = "Failed"
            paper.error_msg = error_msg

            # Log failure
            self.log_scrape_failure(doi, error_msg, paper.id)

            result = ScrapeResult(
                status="error",
                message=f"Failed to scrape {doi}: {error_msg}",
                data={"error_code": "dummy_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )
        # Persist the final paper status reached by either branch
        db.session.commit()
        return result
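

# Usage sketch (hypothetical): the app factory name and module paths below
# are assumptions about the surrounding SciPaperLoader project, not part of
# this file's contract; adjust them to the actual package layout.
#
#     from scipaperloader import create_app
#     from scipaperloader.scrapers.dummy import Scraper
#
#     app = create_app()
#     with app.app_context():
#         result = Scraper().scrape("10.1000/example-doi")
#         print(result.status, result.message)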