"""
Simplified scraper management system with hourly quota scheduling.
Uses APScheduler for all task processing - no Celery dependencies.
"""
import random
import math
from datetime import datetime, timedelta, UTC
from typing import List, Dict, Optional

from sqlalchemy import func
from flask import current_app

from ..models import (
    PaperMetadata,
    ScheduleConfig,
    VolumeConfig,
    ScraperState,
    ActivityLog,
    ScraperModuleConfig,
)
from ..db import db
from ..cache_utils import get_cached_hourly_quota
from .factory import get_scraper, get_available_scrapers
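
# Typical usage sketch (illustrative only - the actual call sites live elsewhere
# in the application, e.g. the Flask routes and the APScheduler jobs):
#
#     manager = ScraperManager()
#     manager.start_scraper()     # activate state, schedule papers for the current hour
#     manager.get_status()        # paper counts, hourly quota, active/paused flags
#     manager.stop_scraper()      # revoke scheduled jobs, revert "processing" papers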


class ScraperManager:
    """Manages scraper operations with hourly quota-based scheduling."""

    def __init__(self):
        self.current_scraper = None
        self.pending_papers = []  # Track papers being processed
        # No more Redis client initialization - using APScheduler now

    def _get_scheduler(self):
        """Get the ScraperScheduler instance from Flask app config."""
        try:
            return current_app.config.get('SCHEDULER')
        except RuntimeError:
            # Outside application context
            return None

    def _get_raw_scheduler(self):
        """Get the raw APScheduler instance for direct job scheduling."""
        try:
            scheduler_wrapper = current_app.config.get('SCHEDULER')
            if scheduler_wrapper:
                return scheduler_wrapper.scheduler
            return None
        except RuntimeError:
            return None

    def _clear_delayed_tasks_from_apscheduler(self) -> int:
        """Clear delayed tasks from APScheduler - clean replacement for Redis manipulation.

        Returns:
            int: Number of delayed tasks cleared
        """
        scheduler = self._get_scheduler()
        if not scheduler:
            try:
                ActivityLog.log_error(
                    error_message="APScheduler not available - cannot clear delayed tasks",
                    source="ScraperManager._clear_delayed_tasks_from_apscheduler"
                )
            except RuntimeError:
                print("❌ APScheduler not available - cannot clear delayed tasks")
            return 0

        try:
            cleared_count = scheduler.revoke_all_scraper_jobs()

            # Summary logging
            if cleared_count > 0:
                try:
                    ActivityLog.log_scraper_activity(
                        action="clear_delayed_tasks_complete_apscheduler",
                        status="success",
                        description=f"Total delayed scraper tasks cleared from APScheduler: {cleared_count}"
                    )
                except RuntimeError:
                    print(f"✅ Total delayed scraper tasks cleared from APScheduler: {cleared_count}")
            else:
                try:
                    ActivityLog.log_scraper_activity(
                        action="clear_delayed_tasks_complete_apscheduler",
                        status="info",
                        description="No delayed scraper tasks found to clear in APScheduler"
                    )
                except RuntimeError:
                    print("No delayed scraper tasks found to clear in APScheduler")

            return cleared_count
        except Exception as e:
            try:
                ActivityLog.log_error(
                    error_message=f"Failed to clear delayed tasks from APScheduler: {str(e)}",
                    source="ScraperManager._clear_delayed_tasks_from_apscheduler"
                )
            except RuntimeError:
                print(f"❌ Failed to clear delayed tasks from APScheduler: {str(e)}")
            return 0

    def start_scraper(self) -> Dict[str, str]:
        """Start the scraper system and immediately schedule papers for the current hour."""
        try:
            # Get current scraper
            self.current_scraper = get_scraper()

            # Activate scraper state
            ScraperState.set_active(True)
            ScraperState.set_paused(False)

            scraper_name = self.current_scraper.get_name()

            # Immediately schedule papers for the remaining time in the current hour
            immediate_scheduled_count = self._schedule_papers_for_current_hour()

            if immediate_scheduled_count > 0:
                ActivityLog.log_scraper_command(
                    action="start_scraper",
                    status="success",
                    description=f"Started scraper: {scraper_name}. Immediately scheduled {immediate_scheduled_count} papers for the remaining time in this hour."
                )
                return {"status": "success", "message": f"Scraper started successfully. Immediately scheduled {immediate_scheduled_count} papers for processing in the remaining time this hour."}
            else:
                ActivityLog.log_scraper_command(
                    action="start_scraper",
                    status="success",
                    description=f"Started scraper: {scraper_name}. No papers available for immediate scheduling in the current hour."
                )
                return {"status": "success", "message": "Scraper started successfully. No papers available for immediate scheduling this hour."}
        except Exception as e:
            ActivityLog.log_error(
                error_message=f"Failed to start scraper: {str(e)}",
                source="ScraperManager.start_scraper"
            )
            return {"status": "error", "message": str(e)}

    def pause_scraper(self) -> Dict[str, str]:
        """Pause the scraper system."""
        try:
            ScraperState.set_paused(True)
            ActivityLog.log_scraper_command(
                action="pause_scraper",
                status="success",
                description="Scraper paused - processing will halt"
            )
            return {"status": "success", "message": "Scraper paused"}
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def resume_scraper(self) -> Dict[str, str]:
        """Resume the scraper system."""
        try:
            ScraperState.set_paused(False)
            ActivityLog.log_scraper_command(
                action="resume_scraper",
                status="success",
                description="Scraper resumed - processing will continue"
            )
            return {"status": "success", "message": "Scraper resumed"}
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def stop_scraper(self) -> Dict[str, str]:
        """Stop the scraper, revoke all APScheduler jobs, and revert pending papers."""
        try:
            # STEP 1: Immediately set scraper as inactive - this is critical for race condition prevention
            ScraperState.set_active(False)
            ScraperState.set_paused(False)

            ActivityLog.log_scraper_command(
                action="stop_scraper_start",
                status="info",
                description="Scraper stop initiated - marked as inactive. Beginning APScheduler job revocation."
            )

            # STEP 2: Brief pause to allow running jobs to see the inactive state
            import time
            time.sleep(0.2)

            # STEP 3: Revoke all APScheduler jobs
            delayed_cleared_count = self._clear_delayed_tasks_from_apscheduler()

            # STEP 4: Wait a bit for any remaining jobs to finish their checks and exit
            time.sleep(1.0)

            # STEP 5: Revert papers from processing status
            scraper = get_scraper()
            input_statuses = scraper.get_input_statuses()

            # Find papers that are currently being processed
            processing_status = scraper.get_output_statuses()["processing"]
            pending_papers = PaperMetadata.query.filter_by(status=processing_status).all()

            # Revert their status to the first input status
            reverted_count = 0
            if pending_papers and input_statuses:
                revert_status = input_statuses[0]  # Use first input status as default
                for paper in pending_papers:
                    # Try to use previous_status if available, otherwise use first input status
                    if hasattr(paper, 'previous_status') and paper.previous_status:
                        paper.status = paper.previous_status
                    else:
                        paper.status = revert_status
                    paper.updated_at = datetime.now(UTC)
                    reverted_count += 1

                db.session.commit()

                ActivityLog.log_scraper_activity(
                    action="revert_pending_papers",
                    status="success",
                    description=f"Reverted {reverted_count} papers from '{processing_status}' to previous status"
                )

            ActivityLog.log_scraper_command(
                action="stop_scraper",
                status="success",
                description=f"Scraper stopped completely. Cleared {delayed_cleared_count} APScheduler jobs and reverted {reverted_count} papers."
            )

            return {
                "status": "success",
                "message": f"Scraper stopped. Cleared {delayed_cleared_count} APScheduler jobs and reverted {reverted_count} papers to previous status."
            }
        except Exception as e:
            ActivityLog.log_error(
                error_message=f"Failed to stop scraper: {str(e)}",
                source="ScraperManager.stop_scraper"
            )
            return {"status": "error", "message": str(e)}

    def reset_scraper(self) -> Dict[str, str]:
        """Reset scraper state, revoke all APScheduler jobs, and clear all processing statuses."""
        try:
            ActivityLog.log_scraper_command(
                action="reset_scraper_start",
                status="info",
                description="Beginning scraper reset process with APScheduler job revocation"
            )

            # Clear all APScheduler jobs
            delayed_cleared_count = self._clear_delayed_tasks_from_apscheduler()

            # Get current scraper configuration
            scraper = get_scraper()
            input_statuses = scraper.get_input_statuses()
            processing_status = scraper.get_output_statuses()["processing"]

            # Reset all papers in processing status
            pending_papers = PaperMetadata.query.filter_by(status=processing_status).all()
            reverted_count = 0
            if pending_papers and input_statuses:
                revert_status = input_statuses[0]
                for paper in pending_papers:
                    # Try to use previous_status if available, otherwise use first input status
                    if hasattr(paper, 'previous_status') and paper.previous_status:
                        paper.status = paper.previous_status
                    else:
                        paper.status = revert_status
                    paper.updated_at = datetime.now(UTC)
                    paper.error_msg = None  # Clear any error messages
                    reverted_count += 1

                db.session.commit()

            # Reset scraper state
            ScraperState.set_active(False)
            ScraperState.set_paused(False)

            ActivityLog.log_scraper_command(
                action="reset_scraper",
                status="success",
                description=f"Scraper reset. Cleared {delayed_cleared_count} APScheduler jobs and reverted {reverted_count} papers."
            )

            return {
                "status": "success",
                "message": f"Scraper reset. Cleared {delayed_cleared_count} APScheduler jobs and reverted {reverted_count} papers to original status."
            }
        except Exception as e:
            return {"status": "error", "message": str(e)}

    def get_current_hour_quota(self) -> int:
        """Calculate papers to process in current hour based on schedule."""
        try:
            return get_cached_hourly_quota(self._calculate_papers_for_current_hour)
        except Exception as e:
            ActivityLog.log_error(
                error_message=f"Error calculating hourly quota: {str(e)}",
                source="ScraperManager.get_current_hour_quota"
            )
            return 0

    def _calculate_papers_for_current_hour(self) -> int:
        """Internal method to calculate hourly quota."""
        try:
            # Get current hour and volume config
            current_hour = datetime.now().hour
            volume_config = VolumeConfig.get_current_volume()
            daily_volume = volume_config if volume_config else 100

            # Get schedule config for current hour
            schedule_config = ScheduleConfig.query.filter_by(hour=current_hour).first()
            current_weight = schedule_config.weight if schedule_config else 1.0

            # Get total weight across all hours
            total_weight = db.session.query(func.sum(ScheduleConfig.weight)).scalar() or 24.0

            # Calculate quota: (current_weight / total_weight) * daily_volume
            quota = math.ceil((current_weight / total_weight) * daily_volume)

            ActivityLog.log_scraper_activity(
                action="calculate_hourly_quota",
                status="info",
                description=f"Hour {current_hour}: quota={quota} (weight={current_weight}, total_weight={total_weight}, daily_volume={daily_volume})"
            )

            return max(1, quota)  # Ensure at least 1 paper per hour
        except Exception as e:
            ActivityLog.log_error(
                error_message=f"Error in quota calculation: {str(e)}",
                source="ScraperManager._calculate_papers_for_current_hour"
            )
            return 1  # Fallback to 1 paper per hour
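
    # Worked example of the quota formula above (numbers are illustrative):
    # with daily_volume=100, current_weight=2.0 and total_weight=30.0,
    # quota = ceil((2.0 / 30.0) * 100) = ceil(6.67) = 7 papers for this hour.
    # With the default weight of 1.0 for each of the 24 hours,
    # quota = ceil((1.0 / 24.0) * 100) = ceil(4.17) = 5.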

    def select_papers_for_processing(self, limit: Optional[int] = None) -> List[PaperMetadata]:
        """Select papers for processing based on current scraper configuration."""
        try:
            scraper = get_scraper()
            input_statuses = scraper.get_input_statuses()
            if not input_statuses:
                return []

            # Use provided limit or calculate from hourly quota
            papers_needed = limit if limit is not None else self.get_current_hour_quota()

            # Query papers with input statuses, randomize selection
            papers = (PaperMetadata.query
                      .filter(PaperMetadata.status.in_(input_statuses))
                      .order_by(func.random())
                      .limit(papers_needed)
                      .all())

            try:
                ActivityLog.log_scraper_activity(
                    action="select_papers",
                    status="info",
                    description=f"Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})"
                )
            except RuntimeError:
                # Outside application context - use print fallback
                print(f"📋 Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})")

            return papers
        except Exception as e:
            try:
                ActivityLog.log_error(
                    error_message=f"Error selecting papers: {str(e)}",
                    source="ScraperManager.select_papers_for_processing"
                )
            except RuntimeError:
                # Outside application context - use print fallback
                print(f"❌ Error selecting papers: {str(e)}")
            return []
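
    # The query above translates roughly to the following SQL (a sketch: the table
    # name is assumed from the PaperMetadata model, and func.random() maps to
    # RANDOM() on SQLite/PostgreSQL but RAND() on MySQL):
    #
    #     SELECT * FROM paper_metadata
    #     WHERE status IN (<input_statuses>)
    #     ORDER BY RANDOM()
    #     LIMIT <papers_needed>;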

    def process_paper(self, paper: PaperMetadata) -> Dict:
        """Process a single paper using the current scraper."""
        try:
            # **RACE CONDITION FIX**: Double-check scraper state before proceeding
            scraper_state = ScraperState.get_current_state()
            if not scraper_state.is_active:
                ActivityLog.log_scraper_activity(
                    action="process_paper",
                    paper_id=paper.id,
                    status="skipped",
                    description="Skipped processing - scraper deactivated during task execution"
                )
                return {"paper_id": paper.id, "status": "skipped", "message": "Scraper not active"}

            if scraper_state.is_paused:
                ActivityLog.log_scraper_activity(
                    action="process_paper",
                    paper_id=paper.id,
                    status="skipped",
                    description="Skipped processing - scraper paused during task execution"
                )
                return {"paper_id": paper.id, "status": "skipped", "message": "Scraper paused"}

            scraper = get_scraper()
            output_statuses = scraper.get_output_statuses()

            # Store the previous status before changing it
            previous_status = paper.status

            # Update paper status to processing
            paper.previous_status = previous_status
            paper.status = output_statuses["processing"]
            paper.updated_at = datetime.now(UTC)
            db.session.commit()

            # **ADDITIONAL RACE CONDITION CHECK**: Verify scraper is still active before expensive scraping operation
            scraper_state = ScraperState.get_current_state()
            if not scraper_state.is_active:
                # Scraper was deactivated after we marked paper as processing - revert and exit
                paper.status = previous_status
                paper.updated_at = datetime.now(UTC)
                db.session.commit()

                ActivityLog.log_scraper_activity(
                    action="process_paper",
                    paper_id=paper.id,
                    status="cancelled",
                    description="Cancelled processing - scraper deactivated after paper marked as processing"
                )
                return {"paper_id": paper.id, "status": "cancelled", "message": "Scraper deactivated during processing"}

            # Perform scraping
            result = scraper.scrape(paper.doi)

            # Update paper status based on result
            if result.status == "success":
                paper.status = output_statuses["success"]
                paper.error_msg = None
                if result.data and "file_path" in result.data:
                    paper.file_path = result.data["file_path"]
            else:
                paper.status = output_statuses["failure"]
                paper.error_msg = result.message

            paper.updated_at = datetime.now(UTC)
            db.session.commit()

            # Log result
            ActivityLog.log_scraper_activity(
                action="process_paper",
                paper_id=paper.id,
                status=result.status,
                description=f"Processed {paper.doi}: {result.message}"
            )

            return {
                "paper_id": paper.id,
                "status": result.status,
                "message": result.message,
                "duration": result.duration
            }
        except Exception as e:
            # Revert paper status on error
            try:
                input_statuses = get_scraper().get_input_statuses()
                if input_statuses:
                    paper.status = input_statuses[0]
                    paper.error_msg = f"Processing error: {str(e)}"
                    paper.updated_at = datetime.now(UTC)
                    db.session.commit()
            except Exception:
                pass  # Don't fail if reversion fails

            ActivityLog.log_error(
                error_message=f"Error processing paper {paper.id}: {str(e)}",
                source="ScraperManager.process_paper"
            )
            return {"paper_id": paper.id, "status": "error", "message": str(e)}

    def process_paper_manual(self, paper: PaperMetadata, scraper_name: Optional[str] = None) -> Dict:
        """Process a single paper manually, bypassing scraper state checks."""
        try:
            # Get scraper configuration but skip state validation for manual processing
            if scraper_name:
                # Use the specified scraper
                import importlib
                from .base import BaseScraper
                try:
                    module = importlib.import_module(f"scipaperloader.scrapers.{scraper_name}")
                    scraper_cls = getattr(module, "Scraper")
                    if not issubclass(scraper_cls, BaseScraper):
                        raise TypeError(f"Scraper class in module '{scraper_name}' does not inherit from BaseScraper")
                    scraper = scraper_cls()
                except (ImportError, AttributeError, TypeError) as e:
                    ActivityLog.log_error(
                        error_message=f"Failed to load specified scraper '{scraper_name}': {str(e)}. Falling back to system default.",
                        source="ScraperManager.process_paper_manual"
                    )
                    scraper = get_scraper()
            else:
                # Use system default scraper
                scraper = get_scraper()

            output_statuses = scraper.get_output_statuses()

            # Store the previous status before changing it
            previous_status = paper.status

            # Update paper status to processing
            paper.previous_status = previous_status
            paper.status = output_statuses["processing"]
            paper.updated_at = datetime.now(UTC)
            db.session.commit()

            # Perform scraping (no state checks for manual processing)
            result = scraper.scrape(paper.doi)

            # Update paper status based on result
            if result.status == "success":
                paper.status = output_statuses["success"]
                paper.error_msg = None
                if result.data and "file_path" in result.data:
                    paper.file_path = result.data["file_path"]
            else:
                paper.status = output_statuses["failure"]
                paper.error_msg = result.message

            paper.updated_at = datetime.now(UTC)
            db.session.commit()

            # Log result
            ActivityLog.log_scraper_activity(
                action="process_paper_manual",
                paper_id=paper.id,
                status=result.status,
                description=f"Manually processed {paper.doi}: {result.message}"
            )

            return {
                "paper_id": paper.id,
                "status": result.status,
                "message": result.message,
                "duration": result.duration
            }
        except Exception as e:
            # Revert paper status on error
            try:
                input_statuses = get_scraper().get_input_statuses()
                if input_statuses:
                    paper.status = input_statuses[0]
                    paper.error_msg = f"Manual processing error: {str(e)}"
                    paper.updated_at = datetime.now(UTC)
                    db.session.commit()
            except Exception:
                pass  # Don't fail if reversion fails

            ActivityLog.log_error(
                error_message=f"Error manually processing paper {paper.id}: {str(e)}",
                source="ScraperManager.process_paper_manual"
            )
            return {"paper_id": paper.id, "status": "error", "message": str(e)}

    def get_status(self) -> Dict:
        """Get current scraper status."""
        scraper_state = ScraperState.get_current_state()
        scraper = get_scraper()

        # Count papers by status
        input_statuses = scraper.get_input_statuses()
        output_statuses = scraper.get_output_statuses()

        available_count = (PaperMetadata.query
                           .filter(PaperMetadata.status.in_(input_statuses))
                           .count())
        processing_count = (PaperMetadata.query
                            .filter_by(status=output_statuses["processing"])
                            .count())

        return {
            "active": scraper_state.is_active,
            "paused": scraper_state.is_paused,
            "current_scraper": scraper.get_name(),
            "input_statuses": input_statuses,
            "output_statuses": output_statuses,
            "available_papers": available_count,
            "processing_papers": processing_count,
            "current_hour_quota": self.get_current_hour_quota()
        }
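
    # Example of the dictionary returned by get_status() (all values are
    # illustrative; the actual status names and scraper name depend on the
    # configured scraper module):
    #
    #     {
    #         "active": True,
    #         "paused": False,
    #         "current_scraper": "dummy",
    #         "input_statuses": ["New"],
    #         "output_statuses": {"processing": "Processing", "success": "Done", "failure": "Failed"},
    #         "available_papers": 42,
    #         "processing_papers": 3,
    #         "current_hour_quota": 5,
    #     }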

    def _schedule_papers_for_current_hour(self) -> int:
        """Schedule papers for processing in the remaining time of the current hour.

        Returns:
            int: Number of papers scheduled
        """
        try:
            # Get papers that should be processed this hour
            papers = self.select_papers_for_processing()
            if not papers:
                return 0

            # Get raw APScheduler instance for direct job scheduling
            scheduler = self._get_raw_scheduler()
            if not scheduler:
                ActivityLog.log_error(
                    error_message="Raw APScheduler not available for immediate paper scheduling",
                    source="ScraperManager._schedule_papers_for_current_hour"
                )
                return 0

            # Calculate remaining time in current hour
            current_time = datetime.now()
            next_hour = current_time.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)
            remaining_seconds = int((next_hour - current_time).total_seconds())

            # Don't schedule if less than 2 minutes remaining
            if remaining_seconds < 120:
                ActivityLog.log_scraper_activity(
                    action="start_scraper_immediate_scheduling",
                    status="info",
                    description=f"Skipping immediate scheduling - only {remaining_seconds} seconds remaining in current hour"
                )
                return 0

            # Schedule papers at random times within the remaining time
            scheduled_count = 0
            scheduled_papers = []

            for paper in papers:
                try:
                    # Random delay between 1 second and remaining time minus 60 seconds buffer
                    max_delay = max(1, remaining_seconds - 60)
                    delay_seconds = random.randint(1, max_delay)
                    run_time = current_time + timedelta(seconds=delay_seconds)

                    # Generate unique job ID
                    import uuid
                    job_id = f"startup_paper_{paper.id}_{int(current_time.timestamp())}_{uuid.uuid4().hex[:8]}"

                    # Schedule the job
                    from ..scheduler import _process_single_paper
                    scheduler.add_job(
                        func=_process_single_paper,
                        trigger='date',
                        run_date=run_time,
                        args=[paper.id],
                        id=job_id,
                        name=f"Startup Process Paper {paper.id}",
                        replace_existing=True
                    )
                    scheduled_count += 1

                    # Collect paper info for logging
                    paper_info = {
                        "paper_id": paper.id,
                        "paper_doi": paper.doi,
                        "job_id": job_id,
                        "scheduled_time": run_time.isoformat(),
                        "delay_seconds": delay_seconds
                    }
                    scheduled_papers.append(paper_info)
                except Exception as e:
                    ActivityLog.log_error(
                        error_message=f"Failed to schedule paper {paper.id} during startup: {str(e)}",
                        source="ScraperManager._schedule_papers_for_current_hour"
                    )

            # Create single comprehensive log entry
            if scheduled_papers:
                try:
                    import json
                    scheduling_data = {
                        "total_scheduled": scheduled_count,
                        "scheduled_papers": scheduled_papers,
                        "timestamp": current_time.isoformat(),
                        "remaining_time_seconds": remaining_seconds,
                        "trigger": "startup_immediate_scheduling"
                    }
                    ActivityLog.log_scraper_activity(
                        action="startup_immediate_scheduling",
                        status="success",
                        description=f"Scheduled {scheduled_count} papers for immediate processing during startup for remaining {remaining_seconds}s in current hour. See extra_data for details.",
                        **{"scheduling_details": json.dumps(scheduling_data)}
                    )
                except Exception:
                    # Fallback to simple logging
                    ActivityLog.log_scraper_activity(
                        action="startup_immediate_scheduling",
                        status="success",
                        description=f"Scheduled {scheduled_count} papers for immediate processing during startup"
                    )

            return scheduled_count
        except Exception as e:
            ActivityLog.log_error(
                error_message=f"Error in startup immediate scheduling: {str(e)}",
                source="ScraperManager._schedule_papers_for_current_hour"
            )
            return 0
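
    # Example of the delay window used above (illustrative numbers): if the scraper
    # starts at 14:20:00, remaining_seconds is 2400, so each paper gets a random
    # delay between 1 and max(1, 2400 - 60) = 2340 seconds, i.e. it is scheduled
    # somewhere between 14:20:01 and 14:59:00, leaving a one-minute buffer before
    # the next hourly quota run.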