SciPaperLoader/scipaperloader/scrapers/tasks.py

"""
APScheduler-based task functions that replace Celery tasks for paper processing.
"""

import random
from datetime import datetime, timedelta
from typing import Optional
from flask import current_app

from ..models import ScraperState, ActivityLog, PaperMetadata
from .manager import ScraperManager


def hourly_scraper_scheduler():
    """
    Hourly task that schedules paper processing at random times within the hour.

    This task runs at the beginning of each hour and:
    1. Calculates how many papers to process this hour
    2. Schedules individual paper processing tasks at random times within the hour
    """
    try:
        # Check if scraper is active
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper not active"
            )
            return {"status": "inactive", "papers_scheduled": 0}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper paused"
            )
            return {"status": "paused", "papers_scheduled": 0}

        # Initialize scraper manager
        manager = ScraperManager()

        # Get papers to process this hour
        papers = manager.select_papers_for_processing()

        if not papers:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="No papers available for processing this hour"
            )
            return {"status": "empty", "papers_scheduled": 0}

        # Get scheduler from Flask app config
        scheduler = current_app.config.get('SCHEDULER')
        if not scheduler:
            ActivityLog.log_error(
                error_message="APScheduler not available for paper scheduling",
                source="hourly_scraper_scheduler"
            )
            return {"status": "error", "message": "APScheduler not available"}

        # Schedule papers at random times within the hour (0-3600 seconds)
        scheduled_count = 0
        current_time = datetime.now()

        for paper in papers:
            # Random delay between 1 second and 58 minutes
            delay_seconds = random.randint(1, 3480)  # Up to 58 minutes
            run_date = current_time + timedelta(seconds=delay_seconds)

            # Schedule the task using APScheduler
            job_id = f"paper_process_{paper.id}_{int(current_time.timestamp())}"
            scheduler.add_job(
                func=process_single_paper,
                trigger='date',
                run_date=run_date,
                args=[paper.id],
                id=job_id,
                replace_existing=True
            )

            scheduled_count += 1

            # Log each scheduled paper
            ActivityLog.log_scraper_activity(
                action="schedule_paper",
                paper_id=paper.id,
                status="info",
                description=f"Scheduled paper {paper.doi} for processing at {run_date.strftime('%H:%M:%S')}"
            )

        ActivityLog.log_scraper_activity(
            action="hourly_scheduler",
            status="success",
            description=f"Scheduled {scheduled_count} papers for random processing within this hour"
        )

        return {"status": "success", "papers_scheduled": scheduled_count}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Hourly scheduler error: {str(e)}",
            source="hourly_scraper_scheduler"
        )
        return {"status": "error", "message": str(e)}


def process_single_paper(paper_id: int):
    """
    Process a single paper. This task is scheduled at random times within each hour.

    Args:
        paper_id: ID of the paper to process
    """
    try:
        # ENHANCED RACE CONDITION PROTECTION: Check scraper state multiple times

        # Initial check before any processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (initial check)"
            )
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper paused (initial check)"
            )
            return {"status": "paused", "paper_id": paper_id}

        # Brief pause to allow stop commands to take effect
        import time
        time.sleep(0.1)

        # Second check after brief delay
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (secondary check)"
            )
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper paused (secondary check)"
            )
            return {"status": "paused", "paper_id": paper_id}

        # Get the paper
        paper = PaperMetadata.query.get(paper_id)
        if not paper:
            return {"status": "error", "message": f"Paper {paper_id} not found"}

        # Third check before starting actual processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (pre-processing check)"
            )
            return {"status": "inactive", "paper_id": paper_id}

        # Process the paper using scraper manager
        manager = ScraperManager()
        result = manager.process_paper(paper)

        return result

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing paper {paper_id}: {str(e)}",
            source="process_single_paper"
        )
        return {"status": "error", "paper_id": paper_id, "message": str(e)}
        manager = ScraperManager()
        result = manager.process_paper(paper)

        return result

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing paper {paper_id}: {str(e)}",
            source="process_single_paper"
        )
        return {"status": "error", "paper_id": paper_id, "message": str(e)}


def process_papers_batch(paper_ids: list, scraper_module: Optional[str] = None):
    """
    Process multiple papers in a batch for immediate processing.

    Args:
        paper_ids: List of paper IDs to process
        scraper_module: Optional specific scraper module to use
    """
    try:
        results = []
        manager = ScraperManager()

        for paper_id in paper_ids:
            paper = PaperMetadata.query.get(paper_id)
            if paper:
                result = manager.process_paper(paper)
                results.append(result)
            else:
                results.append({
                    "paper_id": paper_id,
                    "status": "error",
                    "message": "Paper not found"
                })

        return {"results": results, "total_processed": len(results)}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing batch: {str(e)}",
            source="process_papers_batch"
        )
        return {"status": "error", "message": str(e)}