237 lines
8.4 KiB
Python
237 lines
8.4 KiB
Python
"""
|
|
APScheduler-based task functions that replace Celery tasks for paper processing.
|
|
"""
|
|
|
|
import random
|
|
from datetime import datetime, timedelta
|
|
from typing import Optional
|
|
from flask import current_app
|
|
|
|
from ..models import ScraperState, ActivityLog, PaperMetadata
|
|
from .manager import ScraperManager
|
|
|
|
|
|
def hourly_scraper_scheduler():
|
|
"""
|
|
Hourly task that schedules paper processing at random times within the hour.
|
|
|
|
This task runs at the beginning of each hour and:
|
|
1. Calculates how many papers to process this hour
|
|
2. Schedules individual paper processing tasks at random times within the hour
|
|
"""
|
|
try:
|
|
# Check if scraper is active
|
|
scraper_state = ScraperState.get_current_state()
|
|
if not scraper_state.is_active:
|
|
ActivityLog.log_scraper_activity(
|
|
action="hourly_scheduler",
|
|
status="info",
|
|
description="Hourly scheduler skipped - scraper not active"
|
|
)
|
|
return {"status": "inactive", "papers_scheduled": 0}
|
|
|
|
if scraper_state.is_paused:
|
|
ActivityLog.log_scraper_activity(
|
|
action="hourly_scheduler",
|
|
status="info",
|
|
description="Hourly scheduler skipped - scraper paused"
|
|
)
|
|
return {"status": "paused", "papers_scheduled": 0}
|
|
|
|
# Initialize scraper manager
|
|
manager = ScraperManager()
|
|
|
|
# Get papers to process this hour
|
|
papers = manager.select_papers_for_processing()
|
|
|
|
if not papers:
|
|
ActivityLog.log_scraper_activity(
|
|
action="hourly_scheduler",
|
|
status="info",
|
|
description="No papers available for processing this hour"
|
|
)
|
|
return {"status": "empty", "papers_scheduled": 0}
|
|
|
|
# Get scheduler from Flask app config
|
|
scheduler = current_app.config.get('SCHEDULER')
|
|
if not scheduler:
|
|
ActivityLog.log_error(
|
|
error_message="APScheduler not available for paper scheduling",
|
|
source="hourly_scraper_scheduler"
|
|
)
|
|
return {"status": "error", "message": "APScheduler not available"}
|
|
|
|
# Schedule papers at random times within the hour (0-3600 seconds)
|
|
scheduled_count = 0
|
|
current_time = datetime.now()
|
|
|
|
for paper in papers:
|
|
# Random delay between 1 second and 58 minutes
|
|
delay_seconds = random.randint(1, 3480) # Up to 58 minutes
|
|
run_date = current_time + timedelta(seconds=delay_seconds)
|
|
|
|
# Schedule the task using APScheduler
|
|
job_id = f"paper_process_{paper.id}_{int(current_time.timestamp())}"
|
|
scheduler.add_job(
|
|
func=process_single_paper,
|
|
trigger='date',
|
|
run_date=run_date,
|
|
args=[paper.id],
|
|
id=job_id,
|
|
replace_existing=True
|
|
)
|
|
|
|
scheduled_count += 1
|
|
|
|
# Log each scheduled paper
|
|
ActivityLog.log_scraper_activity(
|
|
action="schedule_paper",
|
|
paper_id=paper.id,
|
|
status="info",
|
|
description=f"Scheduled paper {paper.doi} for processing at {run_date.strftime('%H:%M:%S')}"
|
|
)
|
|
|
|
ActivityLog.log_scraper_activity(
|
|
action="hourly_scheduler",
|
|
status="success",
|
|
description=f"Scheduled {scheduled_count} papers for random processing within this hour"
|
|
)
|
|
|
|
return {"status": "success", "papers_scheduled": scheduled_count}
|
|
|
|
except Exception as e:
|
|
ActivityLog.log_error(
|
|
error_message=f"Hourly scheduler error: {str(e)}",
|
|
source="hourly_scraper_scheduler"
|
|
)
|
|
return {"status": "error", "message": str(e)}
|
|
|
|
|
|
def process_single_paper(paper_id: int):
|
|
"""
|
|
Process a single paper. This task is scheduled at random times within each hour.
|
|
|
|
Args:
|
|
paper_id: ID of the paper to process
|
|
"""
|
|
try:
|
|
# ENHANCED RACE CONDITION PROTECTION: Check scraper state multiple times
|
|
|
|
# Initial check before any processing
|
|
scraper_state = ScraperState.get_current_state()
|
|
if not scraper_state.is_active:
|
|
ActivityLog.log_scraper_activity(
|
|
action="process_single_paper",
|
|
paper_id=paper_id,
|
|
status="skipped",
|
|
description="Task skipped - scraper not active (initial check)"
|
|
)
|
|
return {"status": "inactive", "paper_id": paper_id}
|
|
|
|
if scraper_state.is_paused:
|
|
ActivityLog.log_scraper_activity(
|
|
action="process_single_paper",
|
|
paper_id=paper_id,
|
|
status="skipped",
|
|
description="Task skipped - scraper paused (initial check)"
|
|
)
|
|
return {"status": "paused", "paper_id": paper_id}
|
|
|
|
# Brief pause to allow stop commands to take effect
|
|
import time
|
|
time.sleep(0.1)
|
|
|
|
# Second check after brief delay
|
|
scraper_state = ScraperState.get_current_state()
|
|
if not scraper_state.is_active:
|
|
ActivityLog.log_scraper_activity(
|
|
action="process_single_paper",
|
|
paper_id=paper_id,
|
|
status="skipped",
|
|
description="Task skipped - scraper not active (secondary check)"
|
|
)
|
|
return {"status": "inactive", "paper_id": paper_id}
|
|
|
|
if scraper_state.is_paused:
|
|
ActivityLog.log_scraper_activity(
|
|
action="process_single_paper",
|
|
paper_id=paper_id,
|
|
status="skipped",
|
|
description="Task skipped - scraper paused (secondary check)"
|
|
)
|
|
return {"status": "paused", "paper_id": paper_id}
|
|
|
|
# Get the paper
|
|
paper = PaperMetadata.query.get(paper_id)
|
|
if not paper:
|
|
return {"status": "error", "message": f"Paper {paper_id} not found"}
|
|
|
|
# Third check before starting actual processing
|
|
scraper_state = ScraperState.get_current_state()
|
|
if not scraper_state.is_active:
|
|
ActivityLog.log_scraper_activity(
|
|
action="process_single_paper",
|
|
paper_id=paper_id,
|
|
status="skipped",
|
|
description="Task skipped - scraper not active (pre-processing check)"
|
|
)
|
|
return {"status": "inactive", "paper_id": paper_id}
|
|
|
|
# Process the paper using scraper manager
|
|
manager = ScraperManager()
|
|
result = manager.process_paper(paper)
|
|
|
|
return result
|
|
|
|
except Exception as e:
|
|
ActivityLog.log_error(
|
|
error_message=f"Error processing paper {paper_id}: {str(e)}",
|
|
source="process_single_paper"
|
|
)
|
|
return {"status": "error", "paper_id": paper_id, "message": str(e)}
|
|
manager = ScraperManager()
|
|
result = manager.process_paper(paper)
|
|
|
|
return result
|
|
|
|
except Exception as e:
|
|
ActivityLog.log_error(
|
|
error_message=f"Error processing paper {paper_id}: {str(e)}",
|
|
source="process_single_paper"
|
|
)
|
|
return {"status": "error", "paper_id": paper_id, "message": str(e)}
|
|
|
|
|
|
def process_papers_batch(paper_ids: list, scraper_module: Optional[str] = None):
|
|
"""
|
|
Process multiple papers in a batch for immediate processing.
|
|
|
|
Args:
|
|
paper_ids: List of paper IDs to process
|
|
scraper_module: Optional specific scraper module to use
|
|
"""
|
|
try:
|
|
results = []
|
|
manager = ScraperManager()
|
|
|
|
for paper_id in paper_ids:
|
|
paper = PaperMetadata.query.get(paper_id)
|
|
if paper:
|
|
result = manager.process_paper(paper)
|
|
results.append(result)
|
|
else:
|
|
results.append({
|
|
"paper_id": paper_id,
|
|
"status": "error",
|
|
"message": "Paper not found"
|
|
})
|
|
|
|
return {"results": results, "total_processed": len(results)}
|
|
|
|
except Exception as e:
|
|
ActivityLog.log_error(
|
|
error_message=f"Error processing batch: {str(e)}",
|
|
source="process_papers_batch"
|
|
)
|
|
return {"status": "error", "message": str(e)}
|