fixes logging and scraper start/stop task planning

Michael Beck 2025-06-13 13:56:46 +02:00
parent 24f9eb5766
commit fe202b56d0
2 changed files with 238 additions and 59 deletions

View File

@@ -78,6 +78,7 @@ def _hourly_scraper_scheduler():
     # Schedule papers at random times within the hour
     scheduled_count = 0
     current_time = datetime.now()
+    scheduled_papers = []

     for paper in papers:
         # Random delay between 1 second and 58 minutes
@@ -103,19 +104,41 @@ def _hourly_scraper_scheduler():
         scheduled_count += 1

-        # Log each scheduled paper
-        ActivityLog.log_scraper_activity(
-            action="schedule_paper_apscheduler",
-            paper_id=paper.id,
-            status="info",
-            description=f"Scheduled paper {paper.doi} for processing at {run_time.strftime('%H:%M:%S')} (Job ID: {job_id})"
-        )
+        # Collect paper info for single log entry
+        paper_info = {
+            "paper_id": paper.id,
+            "paper_doi": paper.doi,
+            "job_id": job_id,
+            "scheduled_time": run_time.isoformat(),
+            "delay_seconds": delay_seconds
+        }
+        scheduled_papers.append(paper_info)

-    ActivityLog.log_scraper_activity(
-        action="hourly_scheduler_apscheduler",
-        status="success",
-        description=f"Scheduled {scheduled_count} papers for random processing within this hour using APScheduler"
-    )
+    # Create single comprehensive log entry with JSON data
+    try:
+        import json
+        from .models import ActivityLog
+
+        scheduling_data = {
+            "total_scheduled": scheduled_count,
+            "scheduled_papers": scheduled_papers,
+            "timestamp": datetime.now().isoformat(),
+            "hour_range": f"{current_time.strftime('%H:%M')} - {(current_time + timedelta(hours=1)).strftime('%H:%M')}"
+        }
+
+        ActivityLog.log_scraper_activity(
+            action="hourly_scheduler_apscheduler",
+            status="success",
+            description=f"Scheduled {scheduled_count} papers for random processing within this hour using APScheduler. See extra_data for details.",
+            **{"scheduling_details": json.dumps(scheduling_data)}
+        )
+    except Exception:
+        # Fallback to simple logging
+        ActivityLog.log_scraper_activity(
+            action="hourly_scheduler_apscheduler",
+            status="success",
+            description=f"Scheduled {scheduled_count} papers for random processing within this hour using APScheduler"
+        )

     return {"status": "success", "papers_scheduled": scheduled_count}
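A minimal sketch (not part of the commit) of how the consolidated entry could be read back for debugging, assuming ActivityLog persists extra keyword arguments in an extra_data JSON column keyed by the kwarg name; the helper name and storage layout are assumptions:

    import json

    def scheduled_papers_from_log(log_entry):
        """Return the per-paper schedule recorded in a consolidated hourly-scheduler log entry."""
        # Assumption: extra_data holds the kwargs passed to log_scraper_activity as a JSON object.
        extra = json.loads(log_entry.extra_data or "{}")
        details = json.loads(extra.get("scheduling_details", "{}"))
        return details.get("scheduled_papers", [])

    # Usage (illustrative): papers = scheduled_papers_from_log(latest_hourly_entry)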
@@ -347,6 +370,9 @@ class ScraperScheduler:
             return 0

         revoked_count = 0
+        revoked_jobs = []
+        already_gone_jobs = []
+        failed_jobs = []
         try:
             # Get all jobs
@@ -355,51 +381,58 @@ class ScraperScheduler:
             for job in jobs:
                 # Remove any job that processes papers or uploads (but keep the main hourly scheduler)
                 if ('paper_process_' in job.id or 'test_paper_process_' in job.id or
-                        'process_paper_' in job.id or 'csv_upload_' in job.id or 'manual_paper_' in job.id):
+                        'process_paper_' in job.id or 'csv_upload_' in job.id or 'manual_paper_' in job.id or
+                        'startup_paper_' in job.id):
                     try:
                         _scheduler.remove_job(job.id)
                         revoked_count += 1

-                        try:
-                            from .models import ActivityLog
-                            ActivityLog.log_scraper_activity(
-                                action="revoke_apscheduler_job",
-                                status="success",
-                                description=f"Revoked APScheduler job: {job.name} (ID: {job.id})"
-                            )
-                        except Exception:
-                            print(f"✅ Revoked APScheduler job: {job.id}")
+                        # Collect job info for single log entry
+                        job_info = {
+                            "job_id": job.id,
+                            "job_name": job.name,
+                            "next_run_time": job.next_run_time.isoformat() if job.next_run_time else None,
+                            "args": job.args
+                        }
+                        revoked_jobs.append(job_info)
+                        print(f"✅ Revoked APScheduler job: {job.id}")

                     except JobLookupError as e:
-                        # Job already removed/completed - this is normal, just log it
-                        try:
-                            from .models import ActivityLog
-                            ActivityLog.log_scraper_activity(
-                                action="revoke_apscheduler_job_already_gone",
-                                status="info",
-                                description=f"Job {job.id} was already completed or removed: {str(e)}"
-                            )
-                        except Exception:
-                            print(f" Job {job.id} was already completed or removed")
+                        # Job already removed/completed - this is normal
+                        already_gone_jobs.append({
+                            "job_id": job.id,
+                            "reason": str(e)
+                        })
+                        print(f" Job {job.id} was already completed or removed")

                     except Exception as e:
                         # Other error - log it but continue
-                        try:
-                            from .models import ActivityLog
-                            ActivityLog.log_error(
-                                error_message=f"Error removing job {job.id}: {str(e)}",
-                                source="ScraperScheduler.revoke_all_scraper_jobs"
-                            )
-                        except Exception:
-                            print(f"❌ Error removing job {job.id}: {str(e)}")
+                        failed_jobs.append({
+                            "job_id": job.id,
+                            "error": str(e)
+                        })
+                        print(f"❌ Error removing job {job.id}: {str(e)}")

-            if revoked_count > 0:
+            # Create single comprehensive log entry with JSON data
+            if revoked_jobs or already_gone_jobs or failed_jobs:
                 try:
+                    import json
                     from .models import ActivityLog
+
+                    revocation_data = {
+                        "total_revoked": revoked_count,
+                        "revoked_jobs": revoked_jobs,
+                        "already_gone_jobs": already_gone_jobs,
+                        "failed_jobs": failed_jobs,
+                        "timestamp": datetime.now().isoformat()
+                    }
+
                     ActivityLog.log_scraper_activity(
                         action="revoke_all_scraper_jobs_apscheduler",
                         status="success",
-                        description=f"Successfully revoked {revoked_count} APScheduler jobs"
+                        description=f"Successfully revoked {revoked_count} APScheduler jobs. See extra_data for details.",
+                        **{"revocation_details": json.dumps(revocation_data)}
                     )
                 except Exception:
                     print(f"✅ Successfully revoked {revoked_count} APScheduler jobs")
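For reference, the prefix matching used when revoking jobs can be summarised as a small helper. This is an illustrative sketch only, not code from the commit; the prefix list mirrors the condition above, including the new 'startup_paper_' jobs created when the scraper starts:

    SCRAPER_JOB_PREFIXES = (
        "paper_process_", "test_paper_process_", "process_paper_",
        "csv_upload_", "manual_paper_", "startup_paper_",
    )

    def is_scraper_job(job_id: str) -> bool:
        """True for any job that processes papers or uploads; the main hourly scheduler job is left alone."""
        return any(prefix in job_id for prefix in SCRAPER_JOB_PREFIXES)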

View File

@@ -32,13 +32,23 @@ class ScraperManager:
         # No more Redis client initialization - using APScheduler now

     def _get_scheduler(self):
-        """Get the APScheduler instance from Flask app config."""
+        """Get the ScraperScheduler instance from Flask app config."""
         try:
             return current_app.config.get('SCHEDULER')
         except RuntimeError:
             # Outside application context
             return None

+    def _get_raw_scheduler(self):
+        """Get the raw APScheduler instance for direct job scheduling."""
+        try:
+            scheduler_wrapper = current_app.config.get('SCHEDULER')
+            if scheduler_wrapper:
+                return scheduler_wrapper.scheduler
+            return None
+        except RuntimeError:
+            return None
+
     def _clear_delayed_tasks_from_apscheduler(self) -> int:
         """Clear delayed tasks from APScheduler - clean replacement for Redis manipulation.
@@ -93,7 +103,7 @@ class ScraperManager:
             return 0

     def start_scraper(self) -> Dict[str, str]:
-        """Start the scraper system."""
+        """Start the scraper system and immediately schedule papers for the current hour."""
         try:
             # Get current scraper
             self.current_scraper = get_scraper()
@@ -104,13 +114,25 @@ class ScraperManager:
             scraper_name = self.current_scraper.get_name()

-            ActivityLog.log_scraper_command(
-                action="start_scraper",
-                status="success",
-                description=f"Started scraper: {scraper_name}. Use /trigger-immediate endpoint to immediately schedule papers instead of waiting for the next hourly boundary."
-            )
-
-            return {"status": "success", "message": "Scraper started successfully. Papers will be scheduled at the next hourly boundary, or use /trigger-immediate to schedule immediately."}
+            # Immediately schedule papers for the remaining time in the current hour
+            immediate_scheduled_count = self._schedule_papers_for_current_hour()
+
+            if immediate_scheduled_count > 0:
+                ActivityLog.log_scraper_command(
+                    action="start_scraper",
+                    status="success",
+                    description=f"Started scraper: {scraper_name}. Immediately scheduled {immediate_scheduled_count} papers for the remaining time in this hour."
+                )
+                return {"status": "success", "message": f"Scraper started successfully. Immediately scheduled {immediate_scheduled_count} papers for processing in the remaining time this hour."}
+            else:
+                ActivityLog.log_scraper_command(
+                    action="start_scraper",
+                    status="success",
+                    description=f"Started scraper: {scraper_name}. No papers available for immediate scheduling in the current hour."
+                )
+                return {"status": "success", "message": "Scraper started successfully. No papers available for immediate scheduling this hour."}

         except Exception as e:
             ActivityLog.log_error(
@@ -339,19 +361,27 @@ class ScraperManager:
                      .limit(papers_needed)
                      .all())

-            ActivityLog.log_scraper_activity(
-                action="select_papers",
-                status="info",
-                description=f"Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})"
-            )
+            try:
+                ActivityLog.log_scraper_activity(
+                    action="select_papers",
+                    status="info",
+                    description=f"Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})"
+                )
+            except RuntimeError:
+                # Outside application context - use print fallback
+                print(f"📋 Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})")

             return papers

         except Exception as e:
-            ActivityLog.log_error(
-                error_message=f"Error selecting papers: {str(e)}",
-                source="ScraperManager.select_papers_for_processing"
-            )
+            try:
+                ActivityLog.log_error(
+                    error_message=f"Error selecting papers: {str(e)}",
+                    source="ScraperManager.select_papers_for_processing"
+                )
+            except RuntimeError:
+                # Outside application context - use print fallback
+                print(f"❌ Error selecting papers: {str(e)}")
             return []

     def process_paper(self, paper: PaperMetadata) -> Dict:
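The pattern introduced here (log via ActivityLog when a Flask application context is available, fall back to print otherwise) appears in both branches of this hunk. A hedged sketch of how it could be factored out; the helper name is hypothetical and not part of the commit:

    def _log_or_print(self, log_fn, fallback_message: str) -> None:
        """Run the given logging closure; fall back to print when outside an app context."""
        try:
            log_fn()
        except RuntimeError:
            # Outside application context - use print fallback
            print(fallback_message)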
@@ -567,3 +597,119 @@ class ScraperManager:
             "processing_papers": processing_count,
             "current_hour_quota": self.get_current_hour_quota()
         }
+
+    def _schedule_papers_for_current_hour(self) -> int:
+        """Schedule papers for processing in the remaining time of the current hour.
+
+        Returns:
+            int: Number of papers scheduled
+        """
+        try:
+            # Get papers that should be processed this hour
+            papers = self.select_papers_for_processing()
+            if not papers:
+                return 0
+
+            # Get raw APScheduler instance for direct job scheduling
+            scheduler = self._get_raw_scheduler()
+            if not scheduler:
+                ActivityLog.log_error(
+                    error_message="Raw APScheduler not available for immediate paper scheduling",
+                    source="ScraperManager._schedule_papers_for_current_hour"
+                )
+                return 0
+
+            # Calculate remaining time in current hour
+            current_time = datetime.now()
+            next_hour = current_time.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)
+            remaining_seconds = int((next_hour - current_time).total_seconds())
+
+            # Don't schedule if less than 2 minutes remaining
+            if remaining_seconds < 120:
+                ActivityLog.log_scraper_activity(
+                    action="start_scraper_immediate_scheduling",
+                    status="info",
+                    description=f"Skipping immediate scheduling - only {remaining_seconds} seconds remaining in current hour"
+                )
+                return 0
+
+            # Schedule papers at random times within the remaining time
+            scheduled_count = 0
+            scheduled_papers = []
+
+            for paper in papers:
+                try:
+                    # Random delay between 1 second and remaining time minus 60 seconds buffer
+                    max_delay = max(1, remaining_seconds - 60)
+                    delay_seconds = random.randint(1, max_delay)
+                    run_time = current_time + timedelta(seconds=delay_seconds)
+
+                    # Generate unique job ID
+                    import uuid
+                    job_id = f"startup_paper_{paper.id}_{int(current_time.timestamp())}_{uuid.uuid4().hex[:8]}"
+
+                    # Schedule the job
+                    from ..scheduler import _process_single_paper
+                    scheduler.add_job(
+                        func=_process_single_paper,
+                        trigger='date',
+                        run_date=run_time,
+                        args=[paper.id],
+                        id=job_id,
+                        name=f"Startup Process Paper {paper.id}",
+                        replace_existing=True
+                    )
+
+                    scheduled_count += 1
+
+                    # Collect paper info for logging
+                    paper_info = {
+                        "paper_id": paper.id,
+                        "paper_doi": paper.doi,
+                        "job_id": job_id,
+                        "scheduled_time": run_time.isoformat(),
+                        "delay_seconds": delay_seconds
+                    }
+                    scheduled_papers.append(paper_info)
+
+                except Exception as e:
+                    ActivityLog.log_error(
+                        error_message=f"Failed to schedule paper {paper.id} during startup: {str(e)}",
+                        source="ScraperManager._schedule_papers_for_current_hour"
+                    )
+
+            # Create single comprehensive log entry
+            if scheduled_papers:
+                try:
+                    import json
+                    scheduling_data = {
+                        "total_scheduled": scheduled_count,
+                        "scheduled_papers": scheduled_papers,
+                        "timestamp": current_time.isoformat(),
+                        "remaining_time_seconds": remaining_seconds,
+                        "trigger": "startup_immediate_scheduling"
+                    }
+                    ActivityLog.log_scraper_activity(
+                        action="startup_immediate_scheduling",
+                        status="success",
+                        description=f"Scheduled {scheduled_count} papers for immediate processing during startup for remaining {remaining_seconds}s in current hour. See extra_data for details.",
+                        **{"scheduling_details": json.dumps(scheduling_data)}
+                    )
+                except Exception:
+                    # Fallback to simple logging
+                    ActivityLog.log_scraper_activity(
+                        action="startup_immediate_scheduling",
+                        status="success",
+                        description=f"Scheduled {scheduled_count} papers for immediate processing during startup"
+                    )
+
+            return scheduled_count
+
+        except Exception as e:
+            ActivityLog.log_error(
+                error_message=f"Error in startup immediate scheduling: {str(e)}",
+                source="ScraperManager._schedule_papers_for_current_hour"
+            )
+            return 0
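As a worked example of the timing math above (values illustrative, using this commit's timestamp): at 13:56:46 the next hour boundary is 14:00:00, so remaining_seconds is 194, which clears the 120-second floor; max_delay becomes 194 - 60 = 134, and each startup job is scheduled between 1 and 134 seconds later, i.e. before roughly 13:59:00.

    from datetime import datetime, timedelta

    current_time = datetime(2025, 6, 13, 13, 56, 46)
    next_hour = current_time.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)
    remaining_seconds = int((next_hour - current_time).total_seconds())  # 194
    max_delay = max(1, remaining_seconds - 60)                           # 134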