#!/usr/bin/env python3
"""
Test script for verifying the paper reversion fix with APScheduler.
This script:
1. Creates test papers and simulates processing
2. Tests the stop_scraper functionality
3. Checks that all pending papers were reverted to their previous status
4. Ensures all running tasks were terminated
"""
import os
import sys
from datetime import datetime, UTC

# Add project root to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

# Import the app and models
from scipaperloader import create_app
from scipaperloader.db import db
from scipaperloader.models import PaperMetadata, ScraperState
from scipaperloader.scrapers.factory import get_scraper
from scipaperloader.scrapers.manager import ScraperManager

print("[DEBUG] Initializing Flask app...")
app = create_app()
print("[DEBUG] Flask app initialized.")


def test_stop_scraper():
    """Test the stop_scraper functionality with proper APScheduler integration"""
    print("[DEBUG] Entering app context...")
    with app.app_context():
        print("[DEBUG] App context entered.")

        # Clear existing test data
        print("[DEBUG] Clearing existing test data...")
        PaperMetadata.query.filter(PaperMetadata.doi.like('10.1234/test%')).delete()
        db.session.commit()
        print("[DEBUG] Existing test data cleared.")
        # Get scraper configuration
        scraper = get_scraper()
        input_statuses = scraper.get_input_statuses()
        output_statuses = scraper.get_output_statuses()

        if not input_statuses:
            print("❌ No input statuses found for current scraper")
            return

        input_status = input_statuses[0]  # Use first input status
        processing_status = output_statuses.get("processing", "Processing")
        print(f"[DEBUG] Using input status: {input_status}")
        print(f"[DEBUG] Using processing status: {processing_status}")

        # Create test papers in input status
        test_papers = []
        print("[DEBUG] Creating test papers...")
        for i in range(3):
            test_paper = PaperMetadata()
            test_paper.title = f"Test Paper {i + 1}"
            test_paper.doi = f"10.1234/test{i + 1}"
            test_paper.status = input_status
            test_paper.created_at = datetime.now(UTC)
            test_paper.updated_at = datetime.now(UTC)
            db.session.add(test_paper)
            test_papers.append(test_paper)
        db.session.commit()
        print(f"[DEBUG] Created {len(test_papers)} test papers in '{input_status}' status.")

        # Simulate some papers being moved to processing status
        print("[DEBUG] Simulating papers in processing...")
        for paper in test_papers[:2]:  # Move first 2 papers to processing
            paper.previous_status = paper.status  # Store previous status
            paper.status = processing_status
            paper.updated_at = datetime.now(UTC)
        db.session.commit()
        print(f"[DEBUG] Moved 2 papers to '{processing_status}' status.")

        # Check current scraper state
        scraper_state = ScraperState.get_current_state()
        print(f"[DEBUG] Current scraper state: active={scraper_state.is_active}, paused={scraper_state.is_paused}")

        # Check paper counts before stopping
        input_count = PaperMetadata.query.filter_by(status=input_status).count()
        processing_count = PaperMetadata.query.filter_by(status=processing_status).count()
        print(f"[DEBUG] Papers before stopping: {input_count} in '{input_status}', {processing_count} in '{processing_status}'")

        # Test APScheduler job management
        scheduler = app.config.get('SCHEDULER')
        if scheduler:
            print("[DEBUG] Testing APScheduler job management...")

            # Create some test jobs using the correct API
            for paper in test_papers:
                job_id = scheduler.schedule_paper_processing(
                    paper_id=paper.id,
                    delay_seconds=60,  # 1 minute from now
                    job_id=f"test_paper_process_{paper.id}"
                )
                print(f"[DEBUG] Scheduled job {job_id} for paper {paper.id}")

            jobs_before = len(scheduler.get_paper_jobs())
            print(f"[DEBUG] Created {jobs_before} test jobs in APScheduler")
            # Test the manager's stop_scraper method
            print("[DEBUG] Testing ScraperManager.stop_scraper()...")
            manager = ScraperManager()
            result = manager.stop_scraper()
            print(f"[DEBUG] stop_scraper result: {result}")

            # Check jobs after stopping
            jobs_after = len(scheduler.get_paper_jobs())
            print(f"[DEBUG] Jobs after stopping: {jobs_after} (should be 0)")
            if jobs_after == 0:
                print("✅ All APScheduler jobs successfully revoked")
            else:
                print(f"❌ {jobs_after} jobs still exist after revocation")
        else:
            print("❌ APScheduler not found in app config")
            return  # Without the scheduler, stop_scraper() was never called

        # Check paper counts after stopping
        input_count_after = PaperMetadata.query.filter_by(status=input_status).count()
        processing_count_after = PaperMetadata.query.filter_by(status=processing_status).count()
        print(f"[DEBUG] Papers after stopping: {input_count_after} in '{input_status}', {processing_count_after} in '{processing_status}'")

        # Verify that processing papers were reverted to the input status
        if processing_count_after == 0 and input_count_after == input_count + processing_count:
            print("✅ Papers successfully reverted from processing to previous status")
        else:
            print(f"❌ Paper reversion failed: expected 0 processing and {input_count + processing_count} input papers, got {processing_count_after} and {input_count_after}")

        # Check scraper state after stopping
        scraper_state_after = ScraperState.get_current_state()
        print(f"[DEBUG] Scraper state after stopping: active={scraper_state_after.is_active}, paused={scraper_state_after.is_paused}")
        if not scraper_state_after.is_active and not scraper_state_after.is_paused:
            print("✅ Scraper state correctly set to inactive")
        else:
            print("❌ Scraper state not properly updated")

        # Clean up test data
        print("[DEBUG] Cleaning up test data...")
        PaperMetadata.query.filter(PaperMetadata.doi.like('10.1234/test%')).delete()
        db.session.commit()
        print("[DEBUG] Test data cleaned up.")
print("[DEBUG] Starting test_stop_scraper...")
test_stop_scraper()
print("[DEBUG] test_stop_scraper completed.")