import os
import random
import time
from datetime import datetime

from .base import BaseScraper, ScrapeResult
from ..db import db
from ..models import ActivityLog, DownloadPathConfig, PaperMetadata


class Scraper(BaseScraper):
    """Dummy scraper for testing purposes that simulates paper downloading."""

    def scrape(self, doi: str) -> ScrapeResult:
        """Simulate scraping a paper with realistic timing and random success/failure."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )

        # Simulate processing time (1-3 seconds)
        processing_time = random.uniform(1, 3)
        time.sleep(processing_time)

        # Simulate an 80% success rate
        success = random.random() < 0.8

        if success:
            # Get the configured download path and derive a filesystem-safe file name
            download_path = DownloadPathConfig.get_path()
            file_name = f"{doi.replace('/', '_')}.pdf"
            file_path = os.path.join(download_path, file_name)

            # Create the download directory if it doesn't exist
            os.makedirs(download_path, exist_ok=True)

            # Create a simple dummy "PDF" file (plain text stand-in)
            try:
                with open(file_path, "w") as f:
                    f.write(f"Dummy PDF file for paper with DOI: {doi}\n")
                    f.write(f"Title: {paper.title}\n")
                    f.write(f"Journal: {paper.journal}\n")
                    f.write(f"Generated: {datetime.utcnow().isoformat()}\n")
                    f.write("\nThis is a dummy file created by the SciPaperLoader dummy scraper.\n")

                # Update paper status
                paper.status = "Done"
                paper.file_path = file_path
                paper.error_msg = None
            except Exception as e:
                # Handle file creation errors
                error_msg = f"Failed to create dummy file: {str(e)}"
                paper.status = "Failed"
                paper.error_msg = error_msg
                ActivityLog.log_scraper_activity(
                    action="dummy_scrape_file_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id,
                )
                # Commit here so the failure status is persisted despite the early return
                db.session.commit()
                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "file_creation_error"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow(),
                )

            # Log success
            ActivityLog.log_scraper_activity(
                action="dummy_scrape",
                status="success",
                description=f"Successfully scraped {doi}",
                paper_id=paper.id,
            )

            result = ScrapeResult(
                status="success",
                message=f"Successfully scraped {doi}",
                data={
                    "file_path": file_path,
                    "title": paper.title,
                    "journal": paper.journal,
                },
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )
        else:
            # Simulate failure with a randomly chosen, plausible error message
            error_messages = [
                "Paper not found in database",
                "Access denied by publisher",
                "Rate limit exceeded",
                "Network timeout",
                "Invalid DOI format",
            ]
            error_msg = random.choice(error_messages)
            paper.status = "Failed"
            paper.error_msg = error_msg

            # Log failure
            ActivityLog.log_scraper_activity(
                action="dummy_scrape",
                status="error",
                description=f"Failed to scrape {doi}: {error_msg}",
                paper_id=paper.id,
            )

            result = ScrapeResult(
                status="error",
                message=f"Failed to scrape {doi}: {error_msg}",
                data={"error_code": "dummy_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )

        # Persist the status/error updates for both outcomes
        db.session.commit()
        return result
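
# A minimal usage sketch, assuming a standard Flask application factory named
# `create_app` exposed by the package root and that the target DOI already has
# a PaperMetadata row; the factory name, import path, and example DOI are
# assumptions for illustration, not confirmed parts of this codebase. Because
# the module uses relative imports, run it as a module (e.g.
# `python -m scipaperloader.scrapers.dummy`) rather than as a script, and note
# that database access requires an active app context.
if __name__ == "__main__":
    from scipaperloader import create_app  # hypothetical factory location

    app = create_app()
    with app.app_context():
        # Any DOI present in PaperMetadata works; this one is a placeholder.
        result = Scraper().scrape("10.1000/example-doi")
        # Assumes ScrapeResult exposes its constructor fields as attributes.
        print(result.status, result.message, f"({result.duration:.2f}s)")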