""" Hourly scheduler task that processes papers at random times within each hour. """ import random from datetime import datetime, timedelta from typing import Optional from celery import shared_task from ..models import ScraperState, ActivityLog from .manager import ScraperManager @shared_task(bind=True) def hourly_scraper_scheduler(self): """ Hourly task that schedules paper processing at random times within the hour. This task runs at the beginning of each hour and: 1. Calculates how many papers to process this hour 2. Schedules individual paper processing tasks at random times within the hour """ try: # Check if scraper is active scraper_state = ScraperState.get_current_state() if not scraper_state.is_active: ActivityLog.log_scraper_activity( action="hourly_scheduler", status="info", description="Hourly scheduler skipped - scraper not active" ) # Disable retries for inactive scheduler self.retry = False return {"status": "inactive", "papers_scheduled": 0} if scraper_state.is_paused: ActivityLog.log_scraper_activity( action="hourly_scheduler", status="info", description="Hourly scheduler skipped - scraper paused" ) # Disable retries for paused scheduler self.retry = False return {"status": "paused", "papers_scheduled": 0} # Initialize scraper manager manager = ScraperManager() # Get papers to process this hour papers = manager.select_papers_for_processing() if not papers: ActivityLog.log_scraper_activity( action="hourly_scheduler", status="info", description="No papers available for processing this hour" ) return {"status": "empty", "papers_scheduled": 0} # Schedule papers at random times within the hour (0-3600 seconds) scheduled_count = 0 current_time = datetime.now() for paper in papers: # Random delay between 1 second and 58 minutes delay_seconds = random.randint(1, 3480) # Up to 58 minutes # Schedule the task using Celery's task registry to avoid circular import issues from ..celery import celery celery.send_task( 'scipaperloader.scrapers.tasks.process_single_paper', args=[paper.id], countdown=delay_seconds ) scheduled_count += 1 # Log each scheduled paper schedule_time = current_time + timedelta(seconds=delay_seconds) ActivityLog.log_scraper_activity( action="schedule_paper", paper_id=paper.id, status="info", description=f"Scheduled paper {paper.doi} for processing at {schedule_time.strftime('%H:%M:%S')}" ) ActivityLog.log_scraper_activity( action="hourly_scheduler", status="success", description=f"Scheduled {scheduled_count} papers for random processing within this hour" ) return {"status": "success", "papers_scheduled": scheduled_count} except Exception as e: ActivityLog.log_error( error_message=f"Hourly scheduler error: {str(e)}", source="hourly_scraper_scheduler" ) return {"status": "error", "message": str(e)} @shared_task(bind=True) def process_single_paper(self, paper_id: int): """ Process a single paper. This task is scheduled at random times within each hour. 
@shared_task(bind=True)
def process_single_paper(self, paper_id: int):
    """
    Process a single paper. This task is scheduled at random times within each hour.

    Args:
        paper_id: ID of the paper to process
    """
    try:
        # ENHANCED RACE CONDITION PROTECTION: check the scraper state several
        # times during execution so stop/pause commands issued after this task
        # was scheduled still take effect. Returning early without raising
        # also means Celery will not retry the task.

        # Initial check before any processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (initial check)"
            )
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper paused (initial check)"
            )
            return {"status": "paused", "paper_id": paper_id}

        # Check if this specific task has been revoked
        try:
            from ..celery import celery

            # Check if the current task is in the revoked list
            if hasattr(self, 'request') and self.request.id:
                revoked_tasks = celery.control.inspect().revoked()
                if revoked_tasks:
                    for worker, tasks in revoked_tasks.items():
                        if self.request.id in tasks:
                            ActivityLog.log_scraper_activity(
                                action="process_single_paper",
                                paper_id=paper_id,
                                status="skipped",
                                description=f"Task skipped - task ID {self.request.id} was revoked"
                            )
                            return {"status": "revoked", "paper_id": paper_id, "task_id": self.request.id}
        except Exception:
            # Don't fail on revocation check issues, just continue with state checks
            pass

        # Brief pause to allow stop commands to take effect
        time.sleep(0.1)

        # Second check after the brief delay
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (secondary check)"
            )
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper paused (secondary check)"
            )
            return {"status": "paused", "paper_id": paper_id}

        # Get the paper (deferred import to avoid circular imports)
        from ..models import PaperMetadata
        paper = PaperMetadata.query.get(paper_id)
        if not paper:
            return {"status": "error", "message": f"Paper {paper_id} not found"}

        # Third check before starting actual processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (pre-processing check)"
            )
            return {"status": "inactive", "paper_id": paper_id}

        # Process the paper using the scraper manager
        manager = ScraperManager()
        result = manager.process_paper(paper)

        return result

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing paper {paper_id}: {str(e)}",
            source="process_single_paper"
        )
        return {"status": "error", "paper_id": paper_id, "message": str(e)}
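
# For reference, a minimal sketch of the revocation side that the check above
# guards against. A stop handler elsewhere in the app (hypothetical here;
# this module does not define it) could revoke pending tasks by ID using
# Celery's standard control API:
#
#     from ..celery import celery
#
#     def revoke_pending_paper_tasks(task_ids):
#         """Revoke scheduled process_single_paper tasks (assumed helper)."""
#         for task_id in task_ids:
#             # revoke() only stops tasks that have not started yet; tasks
#             # already running are caught by the state checks above
#             celery.control.revoke(task_id)
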
@shared_task(bind=True)
def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] = None):
    """
    Process multiple papers in a batch for immediate processing.

    Args:
        paper_ids: List of paper IDs to process
        scraper_module: Optional specific scraper module to use
            (currently unused; ScraperManager selects the module itself)
    """
    try:
        results = []
        manager = ScraperManager()

        # Deferred import to avoid circular imports
        from ..models import PaperMetadata

        for paper_id in paper_ids:
            paper = PaperMetadata.query.get(paper_id)
            if paper:
                result = manager.process_paper(paper)
                results.append(result)
            else:
                results.append({
                    "paper_id": paper_id,
                    "status": "error",
                    "message": "Paper not found"
                })

        return {"results": results, "total_processed": len(results)}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing batch: {str(e)}",
            source="process_papers_batch"
        )
        return {"status": "error", "message": str(e)}
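
# For reference, a minimal usage sketch (assumed call site, not part of this
# module): enqueue a hand-picked set of papers for immediate processing,
# bypassing the hourly randomized schedule.
#
#     from scipaperloader.scrapers.tasks import process_papers_batch
#
#     # Fire-and-forget; the aggregated result dict is retrievable via the
#     # returned AsyncResult if a result backend is configured
#     async_result = process_papers_batch.delay([1, 2, 3])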