"""
Hourly scheduler task that processes papers at random times within each hour.
"""
import random
from datetime import datetime, timedelta
from typing import Optional

from celery import shared_task

from ..models import ScraperState, ActivityLog
from .manager import ScraperManager


@shared_task(bind=True)
def hourly_scraper_scheduler(self):
    """
    Hourly task that schedules paper processing at random times within the hour.

    This task runs at the beginning of each hour and:
    1. Calculates how many papers to process this hour
    2. Schedules individual paper processing tasks at random times within the hour
    """
    try:
        # Check if scraper is active
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper not active"
            )
            # Disable retries for inactive scheduler
            self.retry = False
            return {"status": "inactive", "papers_scheduled": 0}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper paused"
            )
            # Disable retries for paused scheduler
            self.retry = False
            return {"status": "paused", "papers_scheduled": 0}

        # Initialize scraper manager
        manager = ScraperManager()

        # Get papers to process this hour
        papers = manager.select_papers_for_processing()

        if not papers:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="No papers available for processing this hour"
            )
            return {"status": "empty", "papers_scheduled": 0}

        # Schedule papers at random times within the hour
        scheduled_count = 0
        current_time = datetime.now()

        # Import the Celery app locally and dispatch by task name (send_task)
        # to avoid circular import issues with this module
        from ..celery import celery

        for paper in papers:
            # Random delay between 1 second and 58 minutes (3480 seconds),
            # leaving a small buffer before the next hourly run
            delay_seconds = random.randint(1, 3480)

            celery.send_task(
                'scipaperloader.scrapers.tasks.process_single_paper',
                args=[paper.id],
                countdown=delay_seconds
            )
            scheduled_count += 1

            # Log each scheduled paper
            schedule_time = current_time + timedelta(seconds=delay_seconds)
            ActivityLog.log_scraper_activity(
                action="schedule_paper",
                paper_id=paper.id,
                status="info",
                description=f"Scheduled paper {paper.doi} for processing at {schedule_time.strftime('%H:%M:%S')}"
            )

        ActivityLog.log_scraper_activity(
            action="hourly_scheduler",
            status="success",
            description=f"Scheduled {scheduled_count} papers for random processing within this hour"
        )

        return {"status": "success", "papers_scheduled": scheduled_count}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Hourly scheduler error: {str(e)}",
            source="hourly_scraper_scheduler"
        )
        return {"status": "error", "message": str(e)}


@shared_task(bind=True)
def process_single_paper(self, paper_id: int):
    """
    Process a single paper. This task is scheduled at random times within each hour.

    Args:
        paper_id: ID of the paper to process
    """
    try:
        # ENHANCED RACE CONDITION PROTECTION: check scraper state multiple times

        # Initial check before any processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (initial check)"
            )
            self.retry = False
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper paused (initial check)"
            )
            self.retry = False
            return {"status": "paused", "paper_id": paper_id}

        # Check if this specific task has been revoked
        try:
            from ..celery import celery

            # Check whether the current task ID appears in any worker's revoked list
            if hasattr(self, 'request') and self.request.id:
                revoked_tasks = celery.control.inspect().revoked()
                if revoked_tasks:
                    for worker, tasks in revoked_tasks.items():
                        if self.request.id in tasks:
                            ActivityLog.log_scraper_activity(
                                action="process_single_paper",
                                paper_id=paper_id,
                                status="skipped",
                                description=f"Task skipped - task ID {self.request.id} was revoked"
                            )
                            return {"status": "revoked", "paper_id": paper_id, "task_id": self.request.id}
        except Exception:
            # Don't fail on revocation check issues, just continue with state checks
            pass

        # Brief pause to allow stop commands to take effect
        import time
        time.sleep(0.1)

        # Second check after brief delay
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (secondary check)"
            )
            self.retry = False
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper paused (secondary check)"
            )
            self.retry = False
            return {"status": "paused", "paper_id": paper_id}

        # Get the paper
        from ..models import PaperMetadata
        paper = PaperMetadata.query.get(paper_id)
        if not paper:
            return {"status": "error", "message": f"Paper {paper_id} not found"}

        # Third check before starting actual processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (pre-processing check)"
            )
            self.retry = False
            return {"status": "inactive", "paper_id": paper_id}

        # Process the paper using scraper manager
        manager = ScraperManager()
        result = manager.process_paper(paper)

        return result

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing paper {paper_id}: {str(e)}",
            source="process_single_paper"
        )
        return {"status": "error", "paper_id": paper_id, "message": str(e)}


@shared_task(bind=True)
def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] = None):
    """
    Process multiple papers in a batch for immediate processing.

    Args:
        paper_ids: List of paper IDs to process
        scraper_module: Optional specific scraper module to use (currently unused)
    """
    try:
        from ..models import PaperMetadata

        results = []
        manager = ScraperManager()

        for paper_id in paper_ids:
            paper = PaperMetadata.query.get(paper_id)
            if paper:
                result = manager.process_paper(paper)
                results.append(result)
            else:
                results.append({
                    "paper_id": paper_id,
                    "status": "error",
                    "message": "Paper not found"
                })

        return {"results": results, "total_processed": len(results)}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing batch: {str(e)}",
            source="process_papers_batch"
        )
        return {"status": "error", "message": str(e)}