""" APScheduler-based task functions that replace Celery tasks for paper processing. """ import random from datetime import datetime, timedelta from typing import Optional from flask import current_app from ..models import ScraperState, ActivityLog, PaperMetadata from .manager import ScraperManager def hourly_scraper_scheduler(): """ Hourly task that schedules paper processing at random times within the hour. This task runs at the beginning of each hour and: 1. Calculates how many papers to process this hour 2. Schedules individual paper processing tasks at random times within the hour """ try: # Check if scraper is active scraper_state = ScraperState.get_current_state() if not scraper_state.is_active: ActivityLog.log_scraper_activity( action="hourly_scheduler", status="info", description="Hourly scheduler skipped - scraper not active" ) return {"status": "inactive", "papers_scheduled": 0} if scraper_state.is_paused: ActivityLog.log_scraper_activity( action="hourly_scheduler", status="info", description="Hourly scheduler skipped - scraper paused" ) return {"status": "paused", "papers_scheduled": 0} # Initialize scraper manager manager = ScraperManager() # Get papers to process this hour papers = manager.select_papers_for_processing() if not papers: ActivityLog.log_scraper_activity( action="hourly_scheduler", status="info", description="No papers available for processing this hour" ) return {"status": "empty", "papers_scheduled": 0} # Get scheduler from Flask app config scheduler = current_app.config.get('SCHEDULER') if not scheduler: ActivityLog.log_error( error_message="APScheduler not available for paper scheduling", source="hourly_scraper_scheduler" ) return {"status": "error", "message": "APScheduler not available"} # Schedule papers at random times within the hour (0-3600 seconds) scheduled_count = 0 current_time = datetime.now() for paper in papers: # Random delay between 1 second and 58 minutes delay_seconds = random.randint(1, 3480) # Up to 58 minutes run_date = current_time + timedelta(seconds=delay_seconds) # Schedule the task using APScheduler job_id = f"paper_process_{paper.id}_{int(current_time.timestamp())}" scheduler.add_job( func=process_single_paper, trigger='date', run_date=run_date, args=[paper.id], id=job_id, replace_existing=True ) scheduled_count += 1 # Log each scheduled paper ActivityLog.log_scraper_activity( action="schedule_paper", paper_id=paper.id, status="info", description=f"Scheduled paper {paper.doi} for processing at {run_date.strftime('%H:%M:%S')}" ) ActivityLog.log_scraper_activity( action="hourly_scheduler", status="success", description=f"Scheduled {scheduled_count} papers for random processing within this hour" ) return {"status": "success", "papers_scheduled": scheduled_count} except Exception as e: ActivityLog.log_error( error_message=f"Hourly scheduler error: {str(e)}", source="hourly_scraper_scheduler" ) return {"status": "error", "message": str(e)} def process_single_paper(paper_id: int): """ Process a single paper. This task is scheduled at random times within each hour. Args: paper_id: ID of the paper to process """ try: # ENHANCED RACE CONDITION PROTECTION: Check scraper state multiple times # Initial check before any processing scraper_state = ScraperState.get_current_state() if not scraper_state.is_active: ActivityLog.log_scraper_activity( action="process_single_paper", paper_id=paper_id, status="skipped", description="Task skipped - scraper not active (initial check)" ) return {"status": "inactive", "paper_id": paper_id} if scraper_state.is_paused: ActivityLog.log_scraper_activity( action="process_single_paper", paper_id=paper_id, status="skipped", description="Task skipped - scraper paused (initial check)" ) return {"status": "paused", "paper_id": paper_id} # Brief pause to allow stop commands to take effect import time time.sleep(0.1) # Second check after brief delay scraper_state = ScraperState.get_current_state() if not scraper_state.is_active: ActivityLog.log_scraper_activity( action="process_single_paper", paper_id=paper_id, status="skipped", description="Task skipped - scraper not active (secondary check)" ) return {"status": "inactive", "paper_id": paper_id} if scraper_state.is_paused: ActivityLog.log_scraper_activity( action="process_single_paper", paper_id=paper_id, status="skipped", description="Task skipped - scraper paused (secondary check)" ) return {"status": "paused", "paper_id": paper_id} # Get the paper paper = PaperMetadata.query.get(paper_id) if not paper: return {"status": "error", "message": f"Paper {paper_id} not found"} # Third check before starting actual processing scraper_state = ScraperState.get_current_state() if not scraper_state.is_active: ActivityLog.log_scraper_activity( action="process_single_paper", paper_id=paper_id, status="skipped", description="Task skipped - scraper not active (pre-processing check)" ) return {"status": "inactive", "paper_id": paper_id} # Process the paper using scraper manager manager = ScraperManager() result = manager.process_paper(paper) return result except Exception as e: ActivityLog.log_error( error_message=f"Error processing paper {paper_id}: {str(e)}", source="process_single_paper" ) return {"status": "error", "paper_id": paper_id, "message": str(e)} manager = ScraperManager() result = manager.process_paper(paper) return result except Exception as e: ActivityLog.log_error( error_message=f"Error processing paper {paper_id}: {str(e)}", source="process_single_paper" ) return {"status": "error", "paper_id": paper_id, "message": str(e)} def process_papers_batch(paper_ids: list, scraper_module: Optional[str] = None): """ Process multiple papers in a batch for immediate processing. Args: paper_ids: List of paper IDs to process scraper_module: Optional specific scraper module to use """ try: results = [] manager = ScraperManager() for paper_id in paper_ids: paper = PaperMetadata.query.get(paper_id) if paper: result = manager.process_paper(paper) results.append(result) else: results.append({ "paper_id": paper_id, "status": "error", "message": "Paper not found" }) return {"results": results, "total_processed": len(results)} except Exception as e: ActivityLog.log_error( error_message=f"Error processing batch: {str(e)}", source="process_papers_batch" ) return {"status": "error", "message": str(e)}