"""
|
|
Hourly scheduler task that processes papers at random times within each hour.
|
|
"""
|
|
|
|
import random
|
|
from datetime import datetime, timedelta
|
|
from typing import Optional
|
|
from celery import shared_task
|
|
|
|
from ..models import ScraperState, ActivityLog
|
|
from .manager import ScraperManager
|
|
|
|
|
|
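# A minimal sketch (an assumption, not code from this project) of how
# hourly_scraper_scheduler could be wired into Celery beat so it fires at the
# top of every hour; the actual schedule may be configured elsewhere:
#
#     from celery.schedules import crontab
#
#     celery.conf.beat_schedule = {
#         "hourly-scraper-scheduler": {
#             "task": "scipaperloader.scrapers.tasks.hourly_scraper_scheduler",
#             "schedule": crontab(minute=0),
#         },
#     }
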
@shared_task(bind=True)
def hourly_scraper_scheduler(self):
    """
    Hourly task that schedules paper processing at random times within the hour.

    This task runs at the beginning of each hour and:
    1. Calculates how many papers to process this hour
    2. Schedules individual paper processing tasks at random times within the hour
    """
    try:
        # Check if scraper is active
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper not active"
            )
            # Return normally; since no exception is raised, Celery will not retry
            return {"status": "inactive", "papers_scheduled": 0}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper paused"
            )
            # Return normally; since no exception is raised, Celery will not retry
            return {"status": "paused", "papers_scheduled": 0}

        # Initialize scraper manager
        manager = ScraperManager()

        # Get papers to process this hour
        papers = manager.select_papers_for_processing()

        if not papers:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="No papers available for processing this hour"
            )
            return {"status": "empty", "papers_scheduled": 0}

        # Schedule papers at random times within the hour (1-3480 seconds,
        # i.e. up to 58 minutes, leaving a margin before the next hourly run)
        scheduled_count = 0
        current_time = datetime.now()

        # Import the Celery app once, outside the loop, and dispatch by task name
        # through the task registry to avoid circular import issues
        from ..celery import celery

        for paper in papers:
            # Random delay between 1 second and 58 minutes
            delay_seconds = random.randint(1, 3480)

            # Schedule the processing task by its registered name
            celery.send_task(
                'scipaperloader.scrapers.tasks.process_single_paper',
                args=[paper.id],
                countdown=delay_seconds
            )

            scheduled_count += 1

            # Log each scheduled paper
            schedule_time = current_time + timedelta(seconds=delay_seconds)
            ActivityLog.log_scraper_activity(
                action="schedule_paper",
                paper_id=paper.id,
                status="info",
                description=f"Scheduled paper {paper.doi} for processing at {schedule_time.strftime('%H:%M:%S')}"
            )

        ActivityLog.log_scraper_activity(
            action="hourly_scheduler",
            status="success",
            description=f"Scheduled {scheduled_count} papers for random processing within this hour"
        )

        return {"status": "success", "papers_scheduled": scheduled_count}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Hourly scheduler error: {str(e)}",
            source="hourly_scraper_scheduler"
        )
        return {"status": "error", "message": str(e)}

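# Usage sketch (not code from this project): for manual runs, e.g. from a
# shell or an admin view, the scheduler can be queued directly:
#
#     hourly_scraper_scheduler.delay()
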
@shared_task(bind=True)
def process_single_paper(self, paper_id: int):
    """
    Process a single paper. This task is scheduled at random times within each hour.

    Args:
        paper_id: ID of the paper to process
    """
    try:
        # Double-check scraper state before processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Skipped processing - scraper not active"
            )
            # Return normally so Celery records the task as completed, not retried
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Skipped processing - scraper paused"
            )
            # Return normally so Celery records the task as completed, not retried
            return {"status": "paused", "paper_id": paper_id}

        # Get the paper
        from ..models import PaperMetadata
        paper = PaperMetadata.query.get(paper_id)
        if not paper:
            return {"status": "error", "message": f"Paper {paper_id} not found"}

        # Process the paper using the scraper manager
        manager = ScraperManager()
        result = manager.process_paper(paper)

        return result

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing paper {paper_id}: {str(e)}",
            source="process_single_paper"
        )
        return {"status": "error", "paper_id": paper_id, "message": str(e)}

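# Where circular imports are not a concern, dispatching by name via
# celery.send_task (as hourly_scraper_scheduler does above) is broadly
# equivalent to calling this task directly; a usage sketch with an assumed
# paper id and delay:
#
#     process_single_paper.apply_async(args=[paper_id], countdown=120)
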
@shared_task(bind=True)
def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] = None):
    """
    Process multiple papers in a batch for immediate processing.

    Args:
        paper_ids: List of paper IDs to process
        scraper_module: Optional specific scraper module to use
    """
    try:
        # Deferred import, kept local as in process_single_paper above
        from ..models import PaperMetadata

        results = []
        manager = ScraperManager()

        for paper_id in paper_ids:
            paper = PaperMetadata.query.get(paper_id)
            if paper:
                result = manager.process_paper(paper)
                results.append(result)
            else:
                results.append({
                    "paper_id": paper_id,
                    "status": "error",
                    "message": "Paper not found"
                })

        return {"results": results, "total_processed": len(results)}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing batch: {str(e)}",
            source="process_papers_batch"
        )
        return {"status": "error", "message": str(e)}
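
# Usage sketch for immediate batch processing (the id list is illustrative):
#
#     process_papers_batch.delay([1, 2, 3])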