
"""
Hourly scheduler task that processes papers at random times within each hour.
"""
import random
from datetime import datetime, timedelta
from typing import Optional
from celery import shared_task
from ..models import ScraperState, ActivityLog
from .manager import ScraperManager


@shared_task(bind=True)
def hourly_scraper_scheduler(self):
    """
    Hourly task that schedules paper processing at random times within the hour.

    This task runs at the beginning of each hour and:
    1. Calculates how many papers to process this hour
    2. Schedules individual paper processing tasks at random times within the hour
    """
    try:
        # Check if scraper is active
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper not active"
            )
            # Disable retries for inactive scheduler
            self.retry = False
            return {"status": "inactive", "papers_scheduled": 0}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper paused"
            )
            # Disable retries for paused scheduler
            self.retry = False
            return {"status": "paused", "papers_scheduled": 0}
        # Initialize scraper manager
        manager = ScraperManager()

        # Get papers to process this hour
        papers = manager.select_papers_for_processing()
        if not papers:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="No papers available for processing this hour"
            )
            return {"status": "empty", "papers_scheduled": 0}
        # Schedule papers at random times within the hour
        scheduled_count = 0
        current_time = datetime.now()

        for paper in papers:
            # Random delay between 1 second and 58 minutes (3480 seconds)
            delay_seconds = random.randint(1, 3480)
            # Schedule the task using Celery's task registry to avoid circular import issues
            from ..celery import celery
            celery.send_task(
                'scipaperloader.scrapers.tasks.process_single_paper',
                args=[paper.id],
                countdown=delay_seconds
            )
            scheduled_count += 1

            # Log each scheduled paper
            schedule_time = current_time + timedelta(seconds=delay_seconds)
            ActivityLog.log_scraper_activity(
                action="schedule_paper",
                paper_id=paper.id,
                status="info",
                description=f"Scheduled paper {paper.doi} for processing at {schedule_time.strftime('%H:%M:%S')}"
            )

        ActivityLog.log_scraper_activity(
            action="hourly_scheduler",
            status="success",
            description=f"Scheduled {scheduled_count} papers for random processing within this hour"
        )
        return {"status": "success", "papers_scheduled": scheduled_count}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Hourly scheduler error: {str(e)}",
            source="hourly_scraper_scheduler"
        )
        return {"status": "error", "message": str(e)}
@shared_task(bind=True)
def process_single_paper(self, paper_id: int):
    """
    Process a single paper. This task is scheduled at random times within each hour.

    Args:
        paper_id: ID of the paper to process
    """
    try:
        # Double-check scraper state before processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Skipped processing - scraper not active"
            )
            # Don't retry; report the inactive state and finish without error
            self.retry = False
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Skipped processing - scraper paused"
            )
            # Don't retry while the scraper is paused either
            self.retry = False
            return {"status": "paused", "paper_id": paper_id}
        # Get the paper
        from ..models import PaperMetadata
        paper = PaperMetadata.query.get(paper_id)
        if not paper:
            return {"status": "error", "message": f"Paper {paper_id} not found"}

        # Process the paper using the scraper manager
        manager = ScraperManager()
        result = manager.process_paper(paper)
        return result

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing paper {paper_id}: {str(e)}",
            source="process_single_paper"
        )
        return {"status": "error", "paper_id": paper_id, "message": str(e)}
@shared_task(bind=True)
def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] = None):
    """
    Process multiple papers in a batch for immediate processing.

    Args:
        paper_ids: List of paper IDs to process
        scraper_module: Optional specific scraper module to use
    """
    try:
        from ..models import PaperMetadata

        results = []
        manager = ScraperManager()

        for paper_id in paper_ids:
            paper = PaperMetadata.query.get(paper_id)
            if paper:
                result = manager.process_paper(paper)
                results.append(result)
            else:
                results.append({
                    "paper_id": paper_id,
                    "status": "error",
                    "message": "Paper not found"
                })

        return {"results": results, "total_processed": len(results)}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing batch: {str(e)}",
            source="process_papers_batch"
        )
        return {"status": "error", "message": str(e)}