"""
|
|
Hourly scheduler task that processes papers at random times within each hour.
|
|
"""
|
|
|
|
import random
|
|
from datetime import datetime, timedelta
|
|
from typing import Optional
|
|
from celery import shared_task
|
|
|
|
from ..models import ScraperState, ActivityLog
|
|
from .manager import ScraperManager
|
|
|
|
|
|
@shared_task(bind=True)
def hourly_scraper_scheduler(self):
    """
    Hourly task that schedules paper processing at random times within the hour.

    This task runs at the beginning of each hour and:
    1. Calculates how many papers to process this hour
    2. Schedules individual paper processing tasks at random times within the hour
    """
    try:
        # Check if scraper is active
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper not active"
            )
            # Disable retries for inactive scheduler
            self.retry = False
            return {"status": "inactive", "papers_scheduled": 0}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper paused"
            )
            # Disable retries for paused scheduler
            self.retry = False
            return {"status": "paused", "papers_scheduled": 0}

        # Initialize scraper manager
        manager = ScraperManager()

        # Get papers to process this hour
        papers = manager.select_papers_for_processing()

        if not papers:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="No papers available for processing this hour"
            )
            return {"status": "empty", "papers_scheduled": 0}

        # Schedule papers at random times within the hour (1-3480 seconds,
        # leaving a small buffer before the next hourly run)
        scheduled_count = 0
        current_time = datetime.now()

        for paper in papers:
            # Random delay between 1 second and 58 minutes
            delay_seconds = random.randint(1, 3480)

            # Schedule the task using Celery's task registry to avoid circular import issues
            from ..celery import celery
            celery.send_task(
                'scipaperloader.scrapers.tasks.process_single_paper',
                args=[paper.id],
                countdown=delay_seconds
            )

            scheduled_count += 1

            # Log each scheduled paper
            schedule_time = current_time + timedelta(seconds=delay_seconds)
            ActivityLog.log_scraper_activity(
                action="schedule_paper",
                paper_id=paper.id,
                status="info",
                description=f"Scheduled paper {paper.doi} for processing at {schedule_time.strftime('%H:%M:%S')}"
            )

        ActivityLog.log_scraper_activity(
            action="hourly_scheduler",
            status="success",
            description=f"Scheduled {scheduled_count} papers for random processing within this hour"
        )

        return {"status": "success", "papers_scheduled": scheduled_count}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Hourly scheduler error: {str(e)}",
            source="hourly_scraper_scheduler"
        )
        return {"status": "error", "message": str(e)}


@shared_task(bind=True)
def process_single_paper(self, paper_id: int):
    """
    Process a single paper. This task is scheduled at random times within each hour.

    Args:
        paper_id: ID of the paper to process
    """
    try:
        # ENHANCED RACE CONDITION PROTECTION: Check scraper state multiple times

        # Initial check before any processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (initial check)"
            )
            self.retry = False
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper paused (initial check)"
            )
            self.retry = False
            return {"status": "paused", "paper_id": paper_id}

        # Check if this specific task has been revoked
        try:
            from ..celery import celery

            # Check if the current task is in the revoked list
            if hasattr(self, 'request') and self.request.id:
                revoked_tasks = celery.control.inspect().revoked()
                if revoked_tasks:
                    for worker, tasks in revoked_tasks.items():
                        if self.request.id in tasks:
                            ActivityLog.log_scraper_activity(
                                action="process_single_paper",
                                paper_id=paper_id,
                                status="skipped",
                                description=f"Task skipped - task ID {self.request.id} was revoked"
                            )
                            return {"status": "revoked", "paper_id": paper_id, "task_id": self.request.id}
        except Exception:
            # Don't fail on revocation check issues, just continue with state checks
            pass

        # Brief pause to allow stop commands to take effect
        import time
        time.sleep(0.1)

        # Second check after brief delay
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (secondary check)"
            )
            self.retry = False
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper paused (secondary check)"
            )
            self.retry = False
            return {"status": "paused", "paper_id": paper_id}

        # Get the paper
        from ..models import PaperMetadata
        paper = PaperMetadata.query.get(paper_id)
        if not paper:
            return {"status": "error", "message": f"Paper {paper_id} not found"}

        # Third check before starting actual processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (pre-processing check)"
            )
            self.retry = False
            return {"status": "inactive", "paper_id": paper_id}

        # Process the paper using scraper manager
        manager = ScraperManager()
        result = manager.process_paper(paper)

        return result

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing paper {paper_id}: {str(e)}",
            source="process_single_paper"
        )
        return {"status": "error", "paper_id": paper_id, "message": str(e)}


@shared_task(bind=True)
def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] = None):
    """
    Process multiple papers in a batch for immediate processing.

    Args:
        paper_ids: List of paper IDs to process
        scraper_module: Optional specific scraper module to use (currently not
            forwarded to the manager, which applies its default selection)
    """
    try:
        results = []
        manager = ScraperManager()

        from ..models import PaperMetadata

        for paper_id in paper_ids:
            paper = PaperMetadata.query.get(paper_id)
            if paper:
                result = manager.process_paper(paper)
                results.append(result)
            else:
                results.append({
                    "paper_id": paper_id,
                    "status": "error",
                    "message": "Paper not found"
                })

        return {"results": results, "total_processed": len(results)}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing batch: {str(e)}",
            source="process_papers_batch"
        )
        return {"status": "error", "message": str(e)}