"""
|
|
Hourly scheduler task that processes papers at random times within each hour.
|
|
"""
|
|
|
|
import random
|
|
from datetime import datetime, timedelta
|
|
from typing import Optional
|
|
from celery import shared_task
|
|
|
|
from ..models import ScraperState, ActivityLog
|
|
from .manager import ScraperManager
|
|
|
|
|
|
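# A minimal sketch (an assumption, not code from this project) of how
# hourly_scraper_scheduler could be wired into Celery beat so it fires at the
# top of every hour; the actual schedule may be configured elsewhere:
#
#     from celery.schedules import crontab
#
#     celery.conf.beat_schedule = {
#         "hourly-scraper-scheduler": {
#             "task": "scipaperloader.scrapers.tasks.hourly_scraper_scheduler",
#             "schedule": crontab(minute=0),
#         },
#     }
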
@shared_task(bind=True)
def hourly_scraper_scheduler(self):
    """
    Hourly task that schedules paper processing at random times within the hour.

    This task runs at the beginning of each hour and:
    1. Calculates how many papers to process this hour
    2. Schedules individual paper processing tasks at random times within the hour
    """
    try:
        # Check if scraper is active
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper not active"
            )
            # Return normally; since no exception is raised, Celery will not retry
            return {"status": "inactive", "papers_scheduled": 0}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper paused"
            )
            # Return normally; since no exception is raised, Celery will not retry
            return {"status": "paused", "papers_scheduled": 0}

        # Initialize scraper manager
        manager = ScraperManager()

        # Get papers to process this hour
        papers = manager.select_papers_for_processing()

        if not papers:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="No papers available for processing this hour"
            )
            return {"status": "empty", "papers_scheduled": 0}

        # Schedule papers at random times within the hour (1-3480 seconds,
        # i.e. up to 58 minutes, leaving a margin before the next hourly run)
        scheduled_count = 0
        current_time = datetime.now()

        # Import the Celery app once, outside the loop, and dispatch by task name
        # through the task registry to avoid circular import issues
        from ..celery import celery

        for paper in papers:
            # Random delay between 1 second and 58 minutes
            delay_seconds = random.randint(1, 3480)

            # Schedule the processing task by its registered name
            celery.send_task(
                'scipaperloader.scrapers.tasks.process_single_paper',
                args=[paper.id],
                countdown=delay_seconds
            )

            scheduled_count += 1

            # Log each scheduled paper
            schedule_time = current_time + timedelta(seconds=delay_seconds)
            ActivityLog.log_scraper_activity(
                action="schedule_paper",
                paper_id=paper.id,
                status="info",
                description=f"Scheduled paper {paper.doi} for processing at {schedule_time.strftime('%H:%M:%S')}"
            )

        ActivityLog.log_scraper_activity(
            action="hourly_scheduler",
            status="success",
            description=f"Scheduled {scheduled_count} papers for random processing within this hour"
        )

        return {"status": "success", "papers_scheduled": scheduled_count}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Hourly scheduler error: {str(e)}",
            source="hourly_scraper_scheduler"
        )
        return {"status": "error", "message": str(e)}

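# Usage sketch (not code from this project): for manual runs, e.g. from a
# shell or an admin view, the scheduler can be queued directly:
#
#     hourly_scraper_scheduler.delay()
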
@shared_task(bind=True)
def process_single_paper(self, paper_id: int):
    """
    Process a single paper. This task is scheduled at random times within each hour.

    Args:
        paper_id: ID of the paper to process
    """
    try:
        # Double-check scraper state before processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Skipped processing - scraper not active"
            )
            # Return normally so Celery records the task as completed, not retried
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Skipped processing - scraper paused"
            )
            # Return normally so Celery records the task as completed, not retried
            return {"status": "paused", "paper_id": paper_id}

        # Get the paper
        from ..models import PaperMetadata
        paper = PaperMetadata.query.get(paper_id)
        if not paper:
            return {"status": "error", "message": f"Paper {paper_id} not found"}

        # Process the paper using the scraper manager
        manager = ScraperManager()
        result = manager.process_paper(paper)

        return result

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing paper {paper_id}: {str(e)}",
            source="process_single_paper"
        )
        return {"status": "error", "paper_id": paper_id, "message": str(e)}

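# Where circular imports are not a concern, dispatching by name via
# celery.send_task (as hourly_scraper_scheduler does above) is broadly
# equivalent to calling this task directly; a usage sketch with an assumed
# paper id and delay:
#
#     process_single_paper.apply_async(args=[paper_id], countdown=120)
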
@shared_task(bind=True)
def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] = None):
    """
    Process multiple papers in a batch for immediate processing.

    Args:
        paper_ids: List of paper IDs to process
        scraper_module: Optional specific scraper module to use
    """
    try:
        # Deferred import, kept local as in process_single_paper above
        from ..models import PaperMetadata

        results = []
        manager = ScraperManager()

        for paper_id in paper_ids:
            paper = PaperMetadata.query.get(paper_id)
            if paper:
                result = manager.process_paper(paper)
                results.append(result)
            else:
                results.append({
                    "paper_id": paper_id,
                    "status": "error",
                    "message": "Paper not found"
                })

        return {"results": results, "total_processed": len(results)}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing batch: {str(e)}",
            source="process_papers_batch"
        )
        return {"status": "error", "message": str(e)}
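
# Usage sketch for immediate batch processing (the id list is illustrative):
#
#     process_papers_batch.delay([1, 2, 3])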