
"""
APScheduler-based task functions that replace Celery tasks for paper processing.
"""
import random
from datetime import datetime, timedelta
from typing import Optional
from flask import current_app
from ..models import ScraperState, ActivityLog, PaperMetadata
from .manager import ScraperManager
def hourly_scraper_scheduler():
    """
    Hourly task that schedules paper processing at random times within the hour.

    Runs at the top of each hour and:
    1. Asks the ScraperManager which papers should be processed this hour.
    2. Schedules one APScheduler job per paper at a random offset within the hour.

    Returns:
        dict: A status summary, e.g. ``{"status": "success", "papers_scheduled": N}``.
    """
    try:
        # Bail out early unless the scraper is both active and not paused.
        state = ScraperState.get_current_state()
        if not state.is_active:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper not active",
            )
            return {"status": "inactive", "papers_scheduled": 0}
        if state.is_paused:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper paused",
            )
            return {"status": "paused", "papers_scheduled": 0}

        # Ask the manager for this hour's batch of papers.
        papers = ScraperManager().select_papers_for_processing()
        if not papers:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="No papers available for processing this hour",
            )
            return {"status": "empty", "papers_scheduled": 0}

        # The APScheduler instance is stashed in the Flask app config.
        scheduler = current_app.config.get('SCHEDULER')
        if not scheduler:
            ActivityLog.log_error(
                error_message="APScheduler not available for paper scheduling",
                source="hourly_scraper_scheduler",
            )
            return {"status": "error", "message": "APScheduler not available"}

        # Spread the papers over the hour: each gets a random delay of
        # 1 second up to 3480 seconds (58 minutes), leaving slack at the end.
        base_time = datetime.now()
        count = 0
        for paper in papers:
            offset = random.randint(1, 3480)
            run_date = base_time + timedelta(seconds=offset)

            # Job id embeds the paper id and the scheduling timestamp so that
            # re-runs within the same second replace rather than duplicate.
            job_id = f"paper_process_{paper.id}_{int(base_time.timestamp())}"
            scheduler.add_job(
                func=process_single_paper,
                trigger='date',
                run_date=run_date,
                args=[paper.id],
                id=job_id,
                replace_existing=True,
            )
            count += 1

            ActivityLog.log_scraper_activity(
                action="schedule_paper",
                paper_id=paper.id,
                status="info",
                description=f"Scheduled paper {paper.doi} for processing at {run_date.strftime('%H:%M:%S')}",
            )

        ActivityLog.log_scraper_activity(
            action="hourly_scheduler",
            status="success",
            description=f"Scheduled {count} papers for random processing within this hour",
        )
        return {"status": "success", "papers_scheduled": count}
    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Hourly scheduler error: {str(e)}",
            source="hourly_scraper_scheduler",
        )
        return {"status": "error", "message": str(e)}
def _check_scraper_state(paper_id: int, check_name: str, check_paused: bool = True):
    """Return an early-exit result dict if the scraper is not runnable, else None.

    Args:
        paper_id: ID of the paper being processed (used for logging only).
        check_name: Label identifying which of the repeated checks this is,
            interpolated into the log message.
        check_paused: Whether a paused scraper should also abort processing.

    Returns:
        dict | None: ``{"status": "inactive"/"paused", "paper_id": ...}`` when
        processing must stop, or ``None`` when it may continue.
    """
    scraper_state = ScraperState.get_current_state()
    if not scraper_state.is_active:
        ActivityLog.log_scraper_activity(
            action="process_single_paper",
            paper_id=paper_id,
            status="skipped",
            description=f"Task skipped - scraper not active ({check_name})"
        )
        return {"status": "inactive", "paper_id": paper_id}
    if check_paused and scraper_state.is_paused:
        ActivityLog.log_scraper_activity(
            action="process_single_paper",
            paper_id=paper_id,
            status="skipped",
            description=f"Task skipped - scraper paused ({check_name})"
        )
        return {"status": "paused", "paper_id": paper_id}
    return None


def process_single_paper(paper_id: int):
    """
    Process a single paper. This task is scheduled at random times within each hour.

    Args:
        paper_id: ID of the paper to process

    Returns:
        dict: The ScraperManager result, or an early-exit/error status dict.
    """
    try:
        # ENHANCED RACE CONDITION PROTECTION: re-check scraper state several
        # times so that a stop/pause issued while this job sat in the queue
        # wins over the queued work.
        early_exit = _check_scraper_state(paper_id, "initial check")
        if early_exit:
            return early_exit

        # Brief pause to allow stop commands to take effect
        import time
        time.sleep(0.1)

        early_exit = _check_scraper_state(paper_id, "secondary check")
        if early_exit:
            return early_exit

        # Get the paper
        paper = PaperMetadata.query.get(paper_id)
        if not paper:
            return {"status": "error", "message": f"Paper {paper_id} not found"}

        # Final check before the actual work; the paused state is deliberately
        # not re-tested here, matching the original two-flag/one-flag pattern.
        early_exit = _check_scraper_state(
            paper_id, "pre-processing check", check_paused=False
        )
        if early_exit:
            return early_exit

        # Process the paper using scraper manager
        manager = ScraperManager()
        result = manager.process_paper(paper)
        return result
    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing paper {paper_id}: {str(e)}",
            source="process_single_paper"
        )
        return {"status": "error", "paper_id": paper_id, "message": str(e)}
def process_papers_batch(paper_ids: list, scraper_module: Optional[str] = None):
    """
    Process multiple papers in a batch for immediate processing.

    Args:
        paper_ids: List of paper IDs to process
        scraper_module: Optional specific scraper module to use
            (NOTE(review): accepted for API compatibility but not consulted
            by this function — ScraperManager picks the module itself)

    Returns:
        dict: Per-paper results plus a total count, or an error status dict.
    """
    try:
        manager = ScraperManager()
        outcomes = []
        for pid in paper_ids:
            record = PaperMetadata.query.get(pid)
            if record is None:
                # Unknown id: record an error entry instead of aborting the batch.
                outcomes.append({
                    "paper_id": pid,
                    "status": "error",
                    "message": "Paper not found"
                })
            else:
                outcomes.append(manager.process_paper(record))
        return {"results": outcomes, "total_processed": len(outcomes)}
    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing batch: {str(e)}",
            source="process_papers_batch"
        )
        return {"status": "error", "message": str(e)}