diff --git a/check_state.py b/check_state.py
new file mode 100644
index 0000000..9a1e95c
--- /dev/null
+++ b/check_state.py
@@ -0,0 +1,11 @@
+from scipaperloader.models import ScraperState
+from scipaperloader import create_app
+
+app = create_app()
+
+with app.app_context():
+    scraper_state = ScraperState.query.first()
+    if scraper_state:
+        print(f"Active: {scraper_state.is_active}, Paused: {scraper_state.is_paused}")
+    else:
+        print("No scraper state found in database")
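Note: check_state.py (and the scripts below) assumes a ScraperState model exposing boolean is_active and is_paused columns. That model is not part of this diff; the following is only a rough sketch of the assumed shape, not the project's actual definition:

    # Hypothetical sketch -- the real model lives in scipaperloader/models.py and may differ.
    from scipaperloader.db import db

    class ScraperState(db.Model):
        """Single-row table recording whether the scraper is running or paused."""
        id = db.Column(db.Integer, primary_key=True)
        is_active = db.Column(db.Boolean, default=False, nullable=False)
        is_paused = db.Column(db.Boolean, default=False, nullable=False)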
diff --git a/diagnose_scraper.py b/diagnose_scraper.py
new file mode 100644
index 0000000..b823dcb
--- /dev/null
+++ b/diagnose_scraper.py
@@ -0,0 +1,106 @@
+"""
+Diagnose and fix scraper stopping issues.
+"""
+
+from scipaperloader import create_app
+from scipaperloader.celery import celery
+from scipaperloader.models import ScraperState, ActivityLog
+from scipaperloader.scrapers.factory import get_scraper
+
+app = create_app()
+
+def check_scraper_status():
+    """Check the current status of the scraper in the database."""
+    with app.app_context():
+        scraper_state = ScraperState.query.first()
+        if scraper_state:
+            print(f"Scraper state in DB: active={scraper_state.is_active}, paused={scraper_state.is_paused}")
+        else:
+            print("No scraper state found in database")
+
+def check_celery_tasks():
+    """Check currently running Celery tasks."""
+    i = celery.control.inspect()
+
+    print("\n=== ACTIVE TASKS ===")
+    active_tasks = i.active() or {}
+    for worker, tasks in active_tasks.items():
+        for task in tasks:
+            print(f"Worker: {worker}, Task: {task.get('name', 'Unknown')}, ID: {task.get('id', 'Unknown')}")
+
+    print("\n=== SCHEDULED TASKS ===")
+    scheduled_tasks = i.scheduled() or {}
+    for worker, tasks in scheduled_tasks.items():
+        for task in tasks:
+            print(f"Worker: {worker}, Task: {task.get('name', 'Unknown')}, ID: {task.get('id', 'Unknown')}")
+
+def check_recent_logs():
+    """Check recent activity logs for clues."""
+    with app.app_context():
+        logs = ActivityLog.query.filter_by(category='scraper_command').order_by(ActivityLog.timestamp.desc()).limit(5).all()
+        print("\n=== RECENT COMMAND LOGS ===")
+        for log in logs:
+            print(f"[{log.timestamp}] {log.action}: {log.description}")
+
+        activity_logs = ActivityLog.query.filter_by(category='scraper_activity').order_by(ActivityLog.timestamp.desc()).limit(5).all()
+        print("\n=== RECENT ACTIVITY LOGS ===")
+        for log in activity_logs:
+            print(f"[{log.timestamp}] {log.action}: {log.description}")
+
+def force_stop_scraper():
+    """Force stop the scraper by setting the state and revoking all tasks."""
+    with app.app_context():
+        # Update scraper state
+        scraper_state = ScraperState.query.first()
+        if scraper_state:
+            scraper_state.is_active = False
+            scraper_state.is_paused = False
+            from scipaperloader.db import db
+            db.session.commit()
+            print("Set scraper state to inactive")
+
+        # Revoke all tasks
+        i = celery.control.inspect()
+        revoked_ids = []
+
+        # Check all queues
+        for queue_name, queue_func in [
+            ("scheduled", i.scheduled),
+            ("active", i.active),
+            ("reserved", i.reserved)
+        ]:
+            queue = queue_func() or {}
+            for worker, tasks in queue.items():
+                for task in tasks:
+                    task_id = task.get('id')
+                    if task_id and task_id not in revoked_ids:
+                        celery.control.revoke(task_id, terminate=True)
+                        revoked_ids.append(task_id)
+                        print(f"Revoked task: {task_id}")
+
+        # Purge all queues
+        celery.control.purge()
+        print("Purged all task queues")
+
+        # Log the action
+        ActivityLog.log_scraper_command(
+            action="force_stop_scraper",
+            status="success",
+            description=f"Force stopped scraper, revoked {len(revoked_ids)} tasks"
+        )
+
+        print(f"\nRevoked {len(revoked_ids)} tasks in total")
+
+if __name__ == "__main__":
+    print("=== SCRAPER STATUS DIAGNOSTIC TOOL ===")
+    check_scraper_status()
+    check_celery_tasks()
+    check_recent_logs()
+
+    stop_confirmation = input("\nDo you want to force stop the scraper? (y/n): ")
+    if stop_confirmation.lower() == 'y':
+        force_stop_scraper()
+        print("\nScraper force stopped. Current state:")
+        check_scraper_status()
+    else:
+        print("No changes made.")
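Note: both scripts record their actions through ActivityLog.log_scraper_command and ActivityLog.log_scraper_activity, called with action/status/description keyword arguments and queried above by category and timestamp. Those helpers are also outside this diff; the following is a minimal sketch of the interface the scripts rely on, an assumption rather than the actual implementation:

    # Hypothetical sketch of the logging helper assumed by diagnose_scraper.py and emergency_stop.py.
    from datetime import datetime
    from scipaperloader.db import db

    class ActivityLog(db.Model):
        id = db.Column(db.Integer, primary_key=True)
        timestamp = db.Column(db.DateTime, default=datetime.utcnow)
        category = db.Column(db.String(64))
        action = db.Column(db.String(64))
        status = db.Column(db.String(64))
        description = db.Column(db.Text)

        @classmethod
        def log_scraper_command(cls, action, status=None, description=None, **extra):
            """Persist one 'scraper_command' entry and commit it."""
            entry = cls(category="scraper_command", action=action,
                        status=status, description=description)
            db.session.add(entry)
            db.session.commit()
            return entry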
diff --git a/emergency_stop.py b/emergency_stop.py
new file mode 100755
index 0000000..941e692
--- /dev/null
+++ b/emergency_stop.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+"""
+Emergency force stop utility for the scraper.
+
+This script will:
+1. Set the scraper state to inactive
+2. Revoke all running/scheduled tasks
+3. Purge task queues
+4. Revert any papers in "Pending" state to their previous status
+
+Use this to recover from a misbehaving scraper or when the web UI is unresponsive.
+"""
+
+import os
+import sys
+import time
+from datetime import datetime
+
+# Add project root to path
+sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
+
+# Import required modules
+from scipaperloader import create_app
+from scipaperloader.db import db
+from scipaperloader.models import PaperMetadata, ActivityLog, ScraperState
+from scipaperloader.celery import celery
+
+app = create_app()
+
+def emergency_stop():
+    """Force stop the scraper and revert all pending papers"""
+    with app.app_context():
+        print("Emergency Scraper Stop")
+        print("-" * 50)
+
+        # 1. Set scraper state to inactive
+        ScraperState.set_active(False)
+        ScraperState.set_paused(False)
+        print("✓ Set scraper state to inactive")
+
+        # 2. Revoke all tasks
+        print("\nRevoking running tasks...")
+        revoked_count = 0
+
+        try:
+            i = celery.control.inspect()
+            active = i.active() or {}
+            scheduled = i.scheduled() or {}
+            reserved = i.reserved() or {}
+
+            # Revoke active tasks
+            for worker, tasks in active.items():
+                for task in tasks:
+                    if 'id' in task:
+                        celery.control.revoke(task['id'], terminate=True)
+                        revoked_count += 1
+                        print(f"  Revoked active task: {task.get('name', 'unknown')}")
+
+            # Revoke scheduled tasks
+            for worker, tasks in scheduled.items():
+                for task in tasks:
+                    if 'id' in task:
+                        celery.control.revoke(task['id'], terminate=True)
+                        revoked_count += 1
+
+            # Revoke reserved tasks
+            for worker, tasks in reserved.items():
+                for task in tasks:
+                    if 'id' in task:
+                        celery.control.revoke(task['id'], terminate=True)
+                        revoked_count += 1
+
+            print(f"✓ Revoked {revoked_count} tasks")
+
+            # 3. Purge queues
+            celery.control.purge()
+            print("✓ Purged all task queues")
+
+        except Exception as e:
+            print(f"⚠ Error revoking tasks: {str(e)}")
+
+        # 4. Revert papers in "Pending" status
+        try:
+            print("\nReverting papers from 'Pending' status...")
+            pending_papers = PaperMetadata.query.filter_by(status="Pending").all()
+            reverted_count = 0
+
+            for paper in pending_papers:
+                # Get previous status or use "New" as fallback
+                previous_status = paper.previous_status if hasattr(paper, 'previous_status') and paper.previous_status else "New"
+                paper.status = previous_status
+
+                ActivityLog.log_scraper_activity(
+                    action="emergency_revert",
+                    paper_id=paper.id,
+                    status="info",
+                    description=f"Emergency reversion from 'Pending' to '{previous_status}'",
+                )
+                reverted_count += 1
+                print(f"  Reverted paper ID {paper.id}: {paper.title} -> {previous_status}")
+
+            # Commit changes
+            db.session.commit()
+            print(f"✓ Reverted {reverted_count} papers")
+
+            ActivityLog.log_scraper_command(
+                action="emergency_stop",
+                status="success",
+                description=f"Emergency stop performed. Revoked {revoked_count} tasks and reverted {reverted_count} papers."
+            )
+
+        except Exception as e:
+            db.session.rollback()
+            print(f"⚠ Error reverting papers: {str(e)}")
+
+        print("\nEmergency stop completed!")
+        print(f"Current time: {datetime.now()}")
+
+if __name__ == "__main__":
+    emergency_stop()
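The revert step in emergency_stop.py only works if each paper remembers where it came from; otherwise it falls back to "New". A small illustrative sketch of how the scraper side could record previous_status before flipping a paper to "Pending" (hypothetical helper name; the actual scraper task is not shown in this diff):

    # Hypothetical helper -- records previous_status so emergency_stop.py can restore it.
    from scipaperloader.db import db

    def mark_paper_pending(paper):
        """Move a paper into 'Pending' while remembering its prior status for recovery."""
        paper.previous_status = paper.status  # what the emergency revert falls back on
        paper.status = "Pending"
        db.session.commit()

Both utilities should be run from the project root (emergency_stop.py inserts its own directory into sys.path before importing scipaperloader).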