"""
|
|
Hourly scheduler task that processes papers at random times within each hour.
|
|
"""
|
|
|
|
import random
|
|
from datetime import datetime, timedelta
|
|
from typing import Optional
|
|
from celery import shared_task
|
|
|
|
from ..models import ScraperState, ActivityLog
|
|
from .manager import ScraperManager
|
|
|
|
|
|
@shared_task(bind=True)
def hourly_scraper_scheduler(self):
    """
    Hourly task that schedules paper processing at random times within the hour.

    This task runs at the beginning of each hour and:
    1. Calculates how many papers to process this hour
    2. Schedules individual paper processing tasks at random times within the hour
    """
    try:
        # Check if scraper is active
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper not active"
            )
            # Disable retries for inactive scheduler
            self.retry = False
            return {"status": "inactive", "papers_scheduled": 0}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper paused"
            )
            # Disable retries for paused scheduler
            self.retry = False
            return {"status": "paused", "papers_scheduled": 0}

        # Initialize scraper manager
        manager = ScraperManager()

        # Get papers to process this hour
        papers = manager.select_papers_for_processing()

        if not papers:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="No papers available for processing this hour"
            )
            return {"status": "empty", "papers_scheduled": 0}

        # Schedule papers at random times within the hour (1-3480 seconds,
        # leaving a small buffer before the next hourly run)
        scheduled_count = 0
        current_time = datetime.now()

        for paper in papers:
            # Random delay between 1 second and 58 minutes
            delay_seconds = random.randint(1, 3480)

            # Schedule the task using Celery's task registry to avoid circular import issues
            from ..celery import celery
            celery.send_task(
                'scipaperloader.scrapers.tasks.process_single_paper',
                args=[paper.id],
                countdown=delay_seconds
            )

            scheduled_count += 1

            # Log each scheduled paper
            schedule_time = current_time + timedelta(seconds=delay_seconds)
            ActivityLog.log_scraper_activity(
                action="schedule_paper",
                paper_id=paper.id,
                status="info",
                description=f"Scheduled paper {paper.doi} for processing at {schedule_time.strftime('%H:%M:%S')}"
            )

        ActivityLog.log_scraper_activity(
            action="hourly_scheduler",
            status="success",
            description=f"Scheduled {scheduled_count} papers for random processing within this hour"
        )

        return {"status": "success", "papers_scheduled": scheduled_count}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Hourly scheduler error: {str(e)}",
            source="hourly_scraper_scheduler"
        )
        return {"status": "error", "message": str(e)}


@shared_task(bind=True)
def process_single_paper(self, paper_id: int):
    """
    Process a single paper. This task is scheduled at random times within each hour.

    Args:
        paper_id: ID of the paper to process
    """
    try:
        # ENHANCED RACE CONDITION PROTECTION: Check scraper state multiple times

        # Initial check before any processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (initial check)"
            )
            self.retry = False
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper paused (initial check)"
            )
            self.retry = False
            return {"status": "paused", "paper_id": paper_id}

        # Check if this specific task has been revoked
        try:
            from ..celery import celery

            # Check if the current task is in the revoked list
            if hasattr(self, 'request') and self.request.id:
                revoked_tasks = celery.control.inspect().revoked()
                if revoked_tasks:
                    for worker, tasks in revoked_tasks.items():
                        if self.request.id in tasks:
                            ActivityLog.log_scraper_activity(
                                action="process_single_paper",
                                paper_id=paper_id,
                                status="skipped",
                                description=f"Task skipped - task ID {self.request.id} was revoked"
                            )
                            return {"status": "revoked", "paper_id": paper_id, "task_id": self.request.id}
        except Exception:
            # Don't fail on revocation check issues, just continue with state checks
            pass

        # Brief pause to allow stop commands to take effect
        import time
        time.sleep(0.1)

        # Second check after brief delay
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (secondary check)"
            )
            self.retry = False
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper paused (secondary check)"
            )
            self.retry = False
            return {"status": "paused", "paper_id": paper_id}

        # Get the paper
        from ..models import PaperMetadata
        paper = PaperMetadata.query.get(paper_id)
        if not paper:
            return {"status": "error", "message": f"Paper {paper_id} not found"}

        # Third check before starting actual processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Task skipped - scraper not active (pre-processing check)"
            )
            self.retry = False
            return {"status": "inactive", "paper_id": paper_id}

        # Process the paper using scraper manager
        manager = ScraperManager()
        result = manager.process_paper(paper)

        return result

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing paper {paper_id}: {str(e)}",
            source="process_single_paper"
        )
        return {"status": "error", "paper_id": paper_id, "message": str(e)}


@shared_task(bind=True)
def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] = None):
    """
    Process multiple papers in a batch for immediate processing.

    Args:
        paper_ids: List of paper IDs to process
        scraper_module: Optional specific scraper module to use (currently not
            forwarded to the manager, which applies its default selection)
    """
    try:
        results = []
        manager = ScraperManager()

        from ..models import PaperMetadata

        for paper_id in paper_ids:
            paper = PaperMetadata.query.get(paper_id)
            if paper:
                result = manager.process_paper(paper)
                results.append(result)
            else:
                results.append({
                    "paper_id": paper_id,
                    "status": "error",
                    "message": "Paper not found"
                })

        return {"results": results, "total_processed": len(results)}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing batch: {str(e)}",
            source="process_papers_batch"
        )
        return {"status": "error", "message": str(e)}