"""
APScheduler-based scheduling system to replace complex Celery delayed task management.

This provides clean job scheduling and revocation without manual Redis manipulation.
"""
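
# Minimal wiring sketch (illustrative; `create_app` and the import path are
# assumptions about the surrounding application, not defined in this module):
#
#     from .scheduler import ScraperScheduler   # adjust to this module's real path
#
#     app = create_app()
#     scheduler = ScraperScheduler(app)           # init_app() starts the scheduler
#     scheduler.schedule_paper_processing(42)     # process paper 42 immediately
#     scheduler.revoke_all_scraper_jobs()         # drop pending per-paper jobs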

import logging
import random
from datetime import datetime, timedelta, timezone
from typing import List, Optional

from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
from apscheduler.executors.pool import ThreadPoolExecutor
from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR, EVENT_JOB_MISSED

# Keep APScheduler's own logging quiet; activity is logged through ActivityLog instead
logging.getLogger('apscheduler').setLevel(logging.WARNING)

# Global scheduler instance and Flask app reference, set by ScraperScheduler.init_app()
_scheduler = None
_flask_app = None


def _get_flask_app():
    """Get the Flask app instance."""
    global _flask_app
    if _flask_app:
        return _flask_app

    try:
        from flask import current_app
        # Unwrap the proxy; _get_current_object() raises RuntimeError when no
        # application context is active (returning the bare proxy would not)
        return current_app._get_current_object()
    except RuntimeError:
        return None


def _hourly_scraper_scheduler():
    """Standalone function for the hourly scheduling logic."""
    app = _get_flask_app()
    if not app:
        return

    with app.app_context():
        try:
            from .models import ScraperState, ActivityLog

            # Check if the scraper is active
            scraper_state = ScraperState.get_current_state()
            if not scraper_state.is_active:
                ActivityLog.log_scraper_activity(
                    action="hourly_scheduler_apscheduler",
                    status="info",
                    description="Hourly scheduler skipped - scraper not active"
                )
                return {"status": "inactive", "papers_scheduled": 0}

            if scraper_state.is_paused:
                ActivityLog.log_scraper_activity(
                    action="hourly_scheduler_apscheduler",
                    status="info",
                    description="Hourly scheduler skipped - scraper paused"
                )
                return {"status": "paused", "papers_scheduled": 0}

            # Get papers to process this hour
            from .scrapers.manager import ScraperManager
            manager = ScraperManager()
            papers = manager.select_papers_for_processing()

            if not papers:
                ActivityLog.log_scraper_activity(
                    action="hourly_scheduler_apscheduler",
                    status="info",
                    description="No papers available for processing this hour"
                )
                return {"status": "empty", "papers_scheduled": 0}

            # Schedule papers at random times within the hour. Use a
            # timezone-aware UTC timestamp: the scheduler is created with
            # timezone='UTC', so naive local times could be misinterpreted.
            scheduled_count = 0
            current_time = datetime.now(timezone.utc)

            for paper in papers:
                # Random delay between 1 second and 58 minutes (3480 seconds),
                # leaving slack before the next hourly run
                delay_seconds = random.randint(1, 3480)
                run_time = current_time + timedelta(seconds=delay_seconds)

                # Schedule the individual paper processing job
                job_id = f"process_paper_{paper.id}_{int(current_time.timestamp())}"

                if _scheduler:
                    _scheduler.add_job(
                        func=_process_single_paper,
                        trigger='date',
                        run_date=run_time,
                        args=[paper.id],
                        id=job_id,
                        replace_existing=False,
                        name=f"Process Paper {paper.doi}"
                    )

                    scheduled_count += 1

                    # Log each scheduled paper
                    ActivityLog.log_scraper_activity(
                        action="schedule_paper_apscheduler",
                        paper_id=paper.id,
                        status="info",
                        description=f"Scheduled paper {paper.doi} for processing at {run_time.strftime('%H:%M:%S')} (Job ID: {job_id})"
                    )

            ActivityLog.log_scraper_activity(
                action="hourly_scheduler_apscheduler",
                status="success",
                description=f"Scheduled {scheduled_count} papers for random processing within this hour using APScheduler"
            )

            return {"status": "success", "papers_scheduled": scheduled_count}

        except Exception as e:
            from .models import ActivityLog
            ActivityLog.log_error(
                error_message=f"APScheduler hourly scheduler error: {str(e)}",
                source="_hourly_scraper_scheduler"
            )
            return {"status": "error", "message": str(e)}


def _process_single_paper(paper_id: int):
    """Standalone function to process a single paper."""
    app = _get_flask_app()
    if not app:
        return

    with app.app_context():
        try:
            from .models import ScraperState, ActivityLog, PaperMetadata

            # Enhanced race condition protection: re-check the scraper state,
            # since it may have changed since the job was scheduled
            scraper_state = ScraperState.get_current_state()
            if not scraper_state.is_active:
                ActivityLog.log_scraper_activity(
                    action="process_single_paper_apscheduler",
                    paper_id=paper_id,
                    status="skipped",
                    description="Task skipped - scraper not active (APScheduler)"
                )
                return {"status": "inactive", "paper_id": paper_id}

            if scraper_state.is_paused:
                ActivityLog.log_scraper_activity(
                    action="process_single_paper_apscheduler",
                    paper_id=paper_id,
                    status="skipped",
                    description="Task skipped - scraper paused (APScheduler)"
                )
                return {"status": "paused", "paper_id": paper_id}

            # Get the paper
            paper = PaperMetadata.query.get(paper_id)
            if not paper:
                return {"status": "error", "message": f"Paper {paper_id} not found"}

            # Final check before processing
            scraper_state = ScraperState.get_current_state()
            if not scraper_state.is_active:
                ActivityLog.log_scraper_activity(
                    action="process_single_paper_apscheduler",
                    paper_id=paper_id,
                    status="skipped",
                    description="Task skipped - scraper not active (pre-processing check)"
                )
                return {"status": "inactive", "paper_id": paper_id}

            # Process the paper using the scraper manager
            from .scrapers.manager import ScraperManager
            manager = ScraperManager()
            result = manager.process_paper(paper)

            return result

        except Exception as e:
            from .models import ActivityLog
            ActivityLog.log_error(
                error_message=f"Error processing paper {paper_id} in APScheduler: {str(e)}",
                source="_process_single_paper"
            )
            return {"status": "error", "paper_id": paper_id, "message": str(e)}


def _job_listener(event):
    """Listen to job execution events."""
    app = _get_flask_app()
    if not app:
        return

    with app.app_context():
        try:
            from .models import ActivityLog

            job_id = event.job_id

            if event.exception:
                ActivityLog.log_error(
                    error_message=f"APScheduler job {job_id} failed: {str(event.exception)}",
                    source="ScraperScheduler.job_listener"
                )
            elif hasattr(event, 'retval') and event.retval:
                # Job completed successfully
                if job_id.startswith('process_paper_'):
                    ActivityLog.log_scraper_activity(
                        action="apscheduler_job_complete",
                        status="success",
                        description=f"Job {job_id} completed successfully"
                    )
        except Exception as e:
            # Don't let logging errors break the scheduler
            print(f"Error in job listener: {str(e)}")


class ScraperScheduler:
    """APScheduler-based scraper task scheduler."""

    def __init__(self, app=None):
        self.app = app
        if app:
            self.init_app(app)

    @property
    def scheduler(self):
        """Expose the global _scheduler instance."""
        return _scheduler

    def init_app(self, app):
        """Initialize the scheduler with the Flask app context."""
        global _scheduler, _flask_app
        _flask_app = app
        self.app = app

        # Initialize the scheduler within an app context so db.engine is accessible
        with app.app_context():
            # Reuse the existing Flask-SQLAlchemy database engine for APScheduler
            from .db import db

            # Configure the job store to use the existing database engine
            jobstores = {
                'default': SQLAlchemyJobStore(engine=db.engine)
            }

            # Configure the thread pool executor
            executors = {
                'default': ThreadPoolExecutor(max_workers=50)  # Increased from 20 to 50
            }

            # Job defaults
            job_defaults = {
                'coalesce': False,         # Run each missed execution rather than coalescing them into one
                'max_instances': 3,        # Allow up to 3 concurrent instances of the same job
                'misfire_grace_time': 30   # 30-second grace period for missed jobs
            }

            # Create the scheduler
            _scheduler = BackgroundScheduler(
                jobstores=jobstores,
                executors=executors,
                job_defaults=job_defaults,
                timezone='UTC'
            )

            # Add event listeners
            _scheduler.add_listener(_job_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR | EVENT_JOB_MISSED)

            # Start the scheduler FIRST, which auto-creates the job store tables
            _scheduler.start()

            # THEN add the hourly scraper job
            _scheduler.add_job(
                func=_hourly_scraper_scheduler,
                trigger='cron',
                minute=0,  # Run at the start of every hour
                id='hourly_scraper_main',
                replace_existing=True,
                name='Hourly Scraper Scheduler'
            )

            try:
                from .models import ActivityLog
                ActivityLog.log_scraper_activity(
                    action="apscheduler_init",
                    status="success",
                    description="APScheduler initialized with database job store and hourly scheduling"
                )
            except Exception:
                # Fall back to stdout if activity logging is not available yet
                print("✅ APScheduler initialized successfully")

    def revoke_all_scraper_jobs(self) -> int:
        """Clean replacement for the complex _clear_delayed_tasks_from_redis method."""
        if not _scheduler:
            try:
                from .models import ActivityLog
                ActivityLog.log_error(
                    error_message="Scheduler not initialized - cannot revoke jobs",
                    source="ScraperScheduler.revoke_all_scraper_jobs"
                )
            except Exception:
                print("❌ Scheduler not initialized - cannot revoke jobs")
            return 0

        revoked_count = 0

        try:
            # Get all jobs
            jobs = _scheduler.get_jobs()

            for job in jobs:
                # Remove any job that processes papers or uploads (but keep the main hourly scheduler)
                if ('paper_process_' in job.id or 'test_paper_process_' in job.id or
                        'process_paper_' in job.id or 'csv_upload_' in job.id):
                    _scheduler.remove_job(job.id)
                    revoked_count += 1

                    try:
                        from .models import ActivityLog
                        ActivityLog.log_scraper_activity(
                            action="revoke_apscheduler_job",
                            status="success",
                            description=f"Revoked APScheduler job: {job.name} (ID: {job.id})"
                        )
                    except Exception:
                        print(f"✅ Revoked APScheduler job: {job.id}")

            if revoked_count > 0:
                try:
                    from .models import ActivityLog
                    ActivityLog.log_scraper_activity(
                        action="revoke_all_scraper_jobs_apscheduler",
                        status="success",
                        description=f"Successfully revoked {revoked_count} APScheduler jobs"
                    )
                except Exception:
                    print(f"✅ Successfully revoked {revoked_count} APScheduler jobs")

            return revoked_count

        except Exception as e:
            try:
                from .models import ActivityLog
                ActivityLog.log_error(
                    error_message=f"Error revoking APScheduler jobs: {str(e)}",
                    source="ScraperScheduler.revoke_all_scraper_jobs"
                )
            except Exception:
                print(f"❌ Error revoking APScheduler jobs: {str(e)}")
            return 0

    def get_job_count(self) -> int:
        """Get the number of scheduled jobs."""
        if not _scheduler:
            return 0
        return len(_scheduler.get_jobs())

    def get_paper_jobs(self) -> List[dict]:
        """Get information about scheduled paper processing jobs."""
        if not _scheduler:
            return []

        jobs = []
        all_jobs = _scheduler.get_jobs()

        for job in all_jobs:
            # Match jobs that contain paper processing patterns
            if ('process_paper_' in job.id or 'paper_process_' in job.id or
                    'test_paper_process_' in job.id):
                job_info = {
                    'id': job.id,
                    'name': job.name,
                    'next_run_time': job.next_run_time.isoformat() if job.next_run_time else None,
                    'args': job.args
                }
                jobs.append(job_info)

        return jobs

    def shutdown(self):
        """Gracefully shut down the scheduler."""
        global _scheduler
        if _scheduler:
            try:
                from .models import ActivityLog
                ActivityLog.log_scraper_activity(
                    action="apscheduler_shutdown",
                    status="info",
                    description="Shutting down APScheduler"
                )
            except Exception:
                print("🔄 Shutting down APScheduler")

            _scheduler.shutdown(wait=False)
            _scheduler = None

    def schedule_paper_processing(self, paper_id: int, delay_seconds: int = 0, job_id: Optional[str] = None) -> str:
        """Schedule a paper for processing with APScheduler.

        Args:
            paper_id: ID of the paper to process
            delay_seconds: Delay in seconds before processing (default: 0 for immediate)
            job_id: Optional custom job ID (generated if not provided)

        Returns:
            str: The job ID of the scheduled job
        """
        if not _scheduler:
            raise RuntimeError("APScheduler not initialized")

        # Generate a job ID if not provided
        if not job_id:
            job_id = f"process_paper_{paper_id}_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}"

        # Calculate the run time (timezone-aware UTC, matching the scheduler's timezone)
        run_time = datetime.now(timezone.utc) + timedelta(seconds=delay_seconds)

        # Schedule the job
        _scheduler.add_job(
            func=_process_single_paper,
            trigger='date',
            run_date=run_time,
            args=[paper_id],
            id=job_id,
            name=f"Process Paper {paper_id}",
            replace_existing=True
        )

        # Log the scheduling
        try:
            from .models import ActivityLog
            ActivityLog.log_scraper_activity(
                action="schedule_paper_processing_apscheduler",
                paper_id=paper_id,
                status="info",
                description=f"Scheduled paper {paper_id} for processing at {run_time.strftime('%H:%M:%S')} (Job ID: {job_id})"
            )
        except Exception:
            print(f"✅ Scheduled paper {paper_id} for processing (Job ID: {job_id})")

        return job_id
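
# Illustrative usage (hedged: `scheduler` and `paper_ids` are assumed to exist in
# the calling code; they are not defined in this module): spread a batch of
# papers one minute apart instead of firing them all at once.
#
#     for offset, pid in enumerate(paper_ids):
#         scheduler.schedule_paper_processing(pid, delay_seconds=offset * 60)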