import time
import random
from datetime import datetime

from flask import current_app

from .base import BaseScraper, ScrapeResult
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db


class Scraper(BaseScraper):
    """Dummy scraper for testing purposes that simulates paper downloading."""

    def scrape(self, doi: str) -> ScrapeResult:
        """Simulate scraping a paper with realistic timing and random success/failure.

        The paper is looked up by DOI. If it exists, the method sleeps for a
        random 1-3 seconds and then succeeds with probability 0.8. Either
        outcome updates the paper's ``status`` / ``error_msg`` (and
        ``file_path`` on success), records an activity-log entry and commits
        the database session. No file is written here; only the path string
        is computed and stored.

        Args:
            doi: DOI of the paper to simulate scraping for.

        Returns:
            A ScrapeResult with ``status="success"`` on a simulated success,
            or ``status="error"`` when no paper matches the DOI or the
            simulated scrape fails.

        Raises:
            Exception: Whatever ``db.session.commit()`` raises is re-raised
                unchanged, after the session has been rolled back.
        """
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            # Nothing to update or log against: report the miss right away.
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )

        # Simulate processing time (1-3 seconds)
        processing_time = random.uniform(1, 3)
        time.sleep(processing_time)

        # Simulate 80% success rate
        success = random.random() < 0.8

        if success:
            # Build the path the "downloaded" file would have; '/' in the DOI
            # is replaced so the DOI maps to a single file name.
            download_path = DownloadPathConfig.get_path()
            file_name = f"{doi.replace('/', '_')}.pdf"
            file_path = f"{download_path}/{file_name}"

            # Update paper status
            paper.status = "Done"
            paper.file_path = file_path
            paper.error_msg = None

            # Log success
            ActivityLog.log_scraper_activity(
                action="dummy_scrape",
                status="success",
                description=f"Successfully scraped {doi}",
                paper_id=paper.id,
            )

            result = ScrapeResult(
                status="success",
                message=f"Successfully scraped {doi}",
                data={
                    "file_path": file_path,
                    "title": paper.title,
                    "journal": paper.journal,
                },
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )
        else:
            # Simulate failure with one of a fixed set of canned reasons.
            error_messages = [
                "Paper not found in database",
                "Access denied by publisher",
                "Rate limit exceeded",
                "Network timeout",
                "Invalid DOI format",
            ]
            error_msg = random.choice(error_messages)

            paper.status = "Failed"
            paper.error_msg = error_msg

            # Log failure
            ActivityLog.log_scraper_activity(
                action="dummy_scrape",
                status="error",
                description=f"Failed to scrape {doi}: {error_msg}",
                paper_id=paper.id,
            )

            result = ScrapeResult(
                status="error",
                message=f"Failed to scrape {doi}: {error_msg}",
                data={"error_code": "dummy_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )

        try:
            db.session.commit()
        except Exception:
            # A failed commit leaves the session unusable until it is rolled
            # back; reset it so later work on this session is not poisoned,
            # then let the original error propagate to the caller.
            db.session.rollback()
            raise

        return result