SciPaperLoader/tools/diagnostics/diagnose_scraper.py

"""
Diagnose and fix scraper stopping issues.
"""
from scipaperloader import create_app
from scipaperloader.celery import celery
from scipaperloader.models import ScraperState, ActivityLog
from scipaperloader.scrapers.factory import get_scraper

app = create_app()


def check_scraper_status():
    """Check the current status of the scraper in the database."""
    with app.app_context():
        scraper_state = ScraperState.query.first()
        if scraper_state:
            print(f"Scraper state in DB: active={scraper_state.is_active}, paused={scraper_state.is_paused}")
        else:
            print("No scraper state found in database")


def check_celery_tasks():
    """Check currently running Celery tasks."""
    i = celery.control.inspect()
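
    # The inspect() calls return None when no workers reply, hence the
    # `or {}` fallbacks below.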
print("\n=== ACTIVE TASKS ===")
active_tasks = i.active() or {}
for worker, tasks in active_tasks.items():
for task in tasks:
print(f"Worker: {worker}, Task: {task.get('name', 'Unknown')}, ID: {task.get('id', 'Unknown')}")
print("\n=== SCHEDULED TASKS ===")
scheduled_tasks = i.scheduled() or {}
for worker, tasks in scheduled_tasks.items():
for task in tasks:
print(f"Worker: {worker}, Task: {task.get('name', 'Unknown')}, ID: {task.get('id', 'Unknown')}")


def check_recent_logs():
    """Check recent activity logs for clues."""
    with app.app_context():
        logs = ActivityLog.query.filter_by(category='scraper_command').order_by(ActivityLog.timestamp.desc()).limit(5).all()
        print("\n=== RECENT COMMAND LOGS ===")
        for log in logs:
            print(f"[{log.timestamp}] {log.action}: {log.description}")

        activity_logs = ActivityLog.query.filter_by(category='scraper_activity').order_by(ActivityLog.timestamp.desc()).limit(5).all()
        print("\n=== RECENT ACTIVITY LOGS ===")
        for log in activity_logs:
            print(f"[{log.timestamp}] {log.action}: {log.description}")


def force_stop_scraper():
    """Force stop the scraper by setting the state and revoking all tasks."""
    with app.app_context():
        # Update scraper state
        scraper_state = ScraperState.query.first()
        if scraper_state:
            scraper_state.is_active = False
            scraper_state.is_paused = False
            from scipaperloader.db import db
            db.session.commit()
            print("Set scraper state to inactive")

        # Revoke all tasks
        i = celery.control.inspect()
        revoked_ids = []

        # Check all queues
        for queue_name, queue_func in [
            ("scheduled", i.scheduled),
            ("active", i.active),
            ("reserved", i.reserved)
        ]:
            queue = queue_func() or {}
            for worker, tasks in queue.items():
                for task in tasks:
                    task_id = task.get('id')
                    if task_id and task_id not in revoked_ids:
                        celery.control.revoke(task_id, terminate=True)
                        revoked_ids.append(task_id)
                        print(f"Revoked task: {task_id}")
        # Purge all queues
        celery.control.purge()
        print("Purged all task queues")

        # Log the action
        ActivityLog.log_scraper_command(
            action="force_stop_scraper",
            status="success",
            description=f"Force stopped scraper, revoked {len(revoked_ids)} tasks"
        )

        print(f"\nRevoked {len(revoked_ids)} tasks in total")


if __name__ == "__main__":
    print("=== SCRAPER STATUS DIAGNOSTIC TOOL ===")
    check_scraper_status()
    check_celery_tasks()
    check_recent_logs()

    stop_confirmation = input("\nDo you want to force stop the scraper? (y/n): ")
    if stop_confirmation.lower() == 'y':
        force_stop_scraper()
        print("\nScraper force stopped. Current state:")
        check_scraper_status()
    else:
        print("No changes made.")