fixes scheduling: prevent race conditions between scraper stop and in-flight tasks
parent 1e97a9cc7b
commit 3b42010fab
@@ -46,10 +46,15 @@ class ScraperManager:
             # Test connection
             self.redis_client.ping()
         except Exception as e:
-            ActivityLog.log_error(
-                error_message=f"Failed to initialize Redis client: {str(e)}",
-                source="ScraperManager._init_redis_client"
-            )
+            # Only log if we're in an application context
+            try:
+                ActivityLog.log_error(
+                    error_message=f"Failed to initialize Redis client: {str(e)}",
+                    source="ScraperManager._init_redis_client"
+                )
+            except RuntimeError:
+                # Outside application context - just print to console
+                print(f"Warning: Failed to initialize Redis client: {str(e)}")
             self.redis_client = None

     def _clear_delayed_tasks_from_redis(self) -> int:
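The new fallback matters because ActivityLog.log_error presumably writes through the Flask-SQLAlchemy session, and Flask raises RuntimeError when that happens outside an application context (for example, when ScraperManager is constructed in a bare worker process). A minimal sketch of the same guard pattern, with log_to_db as a hypothetical stand-in for ActivityLog.log_error:

def log_to_db(message: str) -> None:
    """Hypothetical stand-in for ActivityLog.log_error; Flask-SQLAlchemy
    raises RuntimeError when used outside an application context."""
    raise RuntimeError("Working outside of application context.")

def safe_log(message: str) -> None:
    # Prefer the database log, but degrade to stdout when no app
    # context exists (e.g. during worker startup).
    try:
        log_to_db(message)
    except RuntimeError:
        print(f"Warning: {message}")

safe_log("Failed to initialize Redis client: connection refused")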
@@ -320,16 +325,24 @@ class ScraperManager:
     def stop_scraper(self) -> Dict[str, str]:
         """Stop the scraper, revoke all running tasks, and revert pending papers."""
         try:
-            # First, revoke all running tasks
-            revoked_count = 0
-            delayed_cleared_count = 0
+            # STEP 1: Immediately set scraper as inactive - this is critical for race condition prevention
+            ScraperState.set_active(False)
+            ScraperState.set_paused(False)

             ActivityLog.log_scraper_command(
                 action="stop_scraper_start",
                 status="info",
-                description="Beginning scraper stop process with task revocation and delayed task clearing"
+                description="Scraper stop initiated - marked as inactive. Beginning task revocation and delayed task clearing."
             )

+            # STEP 2: Brief pause to allow running tasks to see the inactive state
+            import time
+            time.sleep(0.2)
+
+            # STEP 3: Revoke all running tasks
+            revoked_count = 0
+            delayed_cleared_count = 0
+
             try:
                 # Get Celery inspector to check for running tasks
                 i = celery.control.inspect()
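The reordering here is the heart of the fix: the inactive flag is flipped before any Celery revocation, so tasks that poll ScraperState can bail out on their own even if revocation misses them. A small sketch of that "flag first, revoke later" ordering, using a threading.Event in place of the database-backed ScraperState (illustrative only, not the project's code):

import threading
import time

active = threading.Event()
active.set()  # scraper running

def task(task_id: int) -> None:
    # Each task re-checks the shared flag before doing real work,
    # mirroring the checks added to process_single_paper below.
    if not active.is_set():
        print(f"task {task_id}: skipped - scraper inactive")
        return
    print(f"task {task_id}: processing")

def stop_scraper() -> None:
    active.clear()   # STEP 1: flip the flag first
    time.sleep(0.2)  # STEP 2: let running tasks observe it
    # STEP 3 would revoke whatever is still queued (Celery-specific).

stop_scraper()
threads = [threading.Thread(target=task, args=(i,)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()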
@@ -381,7 +394,7 @@ class ScraperManager:
                     description="Purged all task queues"
                 )

-            # **NEW: Clear delayed tasks from Redis sorted sets**
+            # STEP 4: Clear delayed tasks from Redis sorted sets
             delayed_cleared_count = self._clear_delayed_tasks_from_redis()

             # Additional cleanup: revoke any remaining scraper-related tasks by name pattern
@@ -434,7 +447,10 @@ class ScraperManager:
                 )
                 # Continue with paper reversion even if task revocation fails

-            # Get current scraper to know what status to revert to
+            # STEP 5: Wait a bit longer for any remaining tasks to finish their checks and exit
+            time.sleep(1.0)
+
+            # STEP 6: Revert papers from processing status
             scraper = get_scraper()
             input_statuses = scraper.get_input_statuses()

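Two grace periods now bracket the revocation: 0.2 s between flipping the flag and revoking (STEP 2), and 1.0 s before reverting paper statuses (STEP 5), so a task that slipped past revocation can still see the flag and exit before its paper is rewritten underneath it. The sleeps are heuristics, not synchronization; they shrink the window rather than close it. The stop-side ordering, sketched with hypothetical revoke_all/revert_papers stand-ins:

import time

def stop(flag: dict, revoke_all, revert_papers) -> None:
    flag["active"] = False  # STEP 1: polling tasks start bailing out
    time.sleep(0.2)         # STEP 2: short grace period
    revoke_all()            # STEPs 3-4: revoke queued and delayed work
    time.sleep(1.0)         # STEP 5: let stragglers finish their checks
    revert_papers()         # STEP 6: now safe(r) to rewrite paper statuses

# No-op stand-ins, just to show the call shape:
stop({"active": True}, revoke_all=lambda: None, revert_papers=lambda: None)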
@@ -464,14 +480,10 @@ class ScraperManager:
                     description=f"Reverted {reverted_count} papers from '{processing_status}' to previous status"
                 )

-            # Deactivate scraper
-            ScraperState.set_active(False)
-            ScraperState.set_paused(False)
-
             ActivityLog.log_scraper_command(
                 action="stop_scraper",
                 status="success",
-                description=f"Scraper stopped. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers."
+                description=f"Scraper stopped completely. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers."
             )

             return {
@@ -656,6 +668,26 @@ class ScraperManager:
     def process_paper(self, paper: PaperMetadata) -> Dict:
         """Process a single paper using the current scraper."""
         try:
+            # **RACE CONDITION FIX**: Double-check scraper state before proceeding
+            scraper_state = ScraperState.get_current_state()
+            if not scraper_state.is_active:
+                ActivityLog.log_scraper_activity(
+                    action="process_paper",
+                    paper_id=paper.id,
+                    status="skipped",
+                    description="Skipped processing - scraper deactivated during task execution"
+                )
+                return {"paper_id": paper.id, "status": "skipped", "message": "Scraper not active"}
+
+            if scraper_state.is_paused:
+                ActivityLog.log_scraper_activity(
+                    action="process_paper",
+                    paper_id=paper.id,
+                    status="skipped",
+                    description="Skipped processing - scraper paused during task execution"
+                )
+                return {"paper_id": paper.id, "status": "skipped", "message": "Scraper paused"}
+
             scraper = get_scraper()
             output_statuses = scraper.get_output_statuses()

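Both guards above are check-then-act: the state can still flip between get_current_state() and the work that follows. The next hunk adds the compensating re-check after the paper is marked as processing. A toy illustration of the residual window, with a plain dict standing in for ScraperState:

state = {"active": True}

def guarded_work() -> str:
    if not state["active"]:     # check
        return "skipped"
    # Anything can happen here: another worker may flip the flag.
    state["active"] = False     # simulate a concurrent stop_scraper()
    return "worked anyway"      # act - the single check did not prevent this

print(guarded_work())  # -> "worked anyway", hence the second check below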
@@ -668,6 +700,22 @@ class ScraperManager:
             paper.updated_at = datetime.utcnow()
             db.session.commit()

+            # **ADDITIONAL RACE CONDITION CHECK**: Verify scraper is still active before expensive scraping operation
+            scraper_state = ScraperState.get_current_state()
+            if not scraper_state.is_active:
+                # Scraper was deactivated after we marked paper as processing - revert and exit
+                paper.status = previous_status
+                paper.updated_at = datetime.utcnow()
+                db.session.commit()
+
+                ActivityLog.log_scraper_activity(
+                    action="process_paper",
+                    paper_id=paper.id,
+                    status="cancelled",
+                    description="Cancelled processing - scraper deactivated after paper marked as processing"
+                )
+                return {"paper_id": paper.id, "status": "cancelled", "message": "Scraper deactivated during processing"}
+
             # Perform scraping
             result = scraper.scrape(paper.doi)

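This second check pairs the optimistic write ("processing") with a compensating revert, so a stop that lands mid-task leaves the paper in its previous status instead of stranded. The mark/re-check/revert shape in isolation (dict-based sketch, not the real models):

state = {"active": True}

def process(paper: dict) -> str:
    previous = paper["status"]
    paper["status"] = "processing"  # optimistic write, committed to the DB
    if not state["active"]:         # re-check after the write
        paper["status"] = previous  # compensate: revert the optimistic write
        return "cancelled"
    return "done"

paper = {"status": "pending"}
state["active"] = False             # a stop arrives mid-task
print(process(paper), paper)        # -> cancelled {'status': 'pending'}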
@@ -109,16 +109,17 @@ def process_single_paper(self, paper_id: int):
         paper_id: ID of the paper to process
     """
     try:
-        # Double-check scraper state before processing
+        # ENHANCED RACE CONDITION PROTECTION: Check scraper state multiple times
+
+        # Initial check before any processing
         scraper_state = ScraperState.get_current_state()
         if not scraper_state.is_active:
             ActivityLog.log_scraper_activity(
                 action="process_single_paper",
                 paper_id=paper_id,
                 status="skipped",
-                description="Skipped processing - scraper not active"
+                description="Task skipped - scraper not active (initial check)"
             )
-            # Use Celery's ignore to mark this task as completed without error
             self.retry = False
             return {"status": "inactive", "paper_id": paper_id}

@@ -127,9 +128,55 @@ def process_single_paper(self, paper_id: int):
                 action="process_single_paper",
                 paper_id=paper_id,
                 status="skipped",
-                description="Skipped processing - scraper paused"
+                description="Task skipped - scraper paused (initial check)"
+            )
+            self.retry = False
+            return {"status": "paused", "paper_id": paper_id}
+
+        # Check if this specific task has been revoked
+        try:
+            from ..celery import celery
+
+            # Check if the current task is in the revoked list
+            if hasattr(self, 'request') and self.request.id:
+                revoked_tasks = celery.control.inspect().revoked()
+                if revoked_tasks:
+                    for worker, tasks in revoked_tasks.items():
+                        if self.request.id in tasks:
+                            ActivityLog.log_scraper_activity(
+                                action="process_single_paper",
+                                paper_id=paper_id,
+                                status="skipped",
+                                description=f"Task skipped - task ID {self.request.id} was revoked"
+                            )
+                            return {"status": "revoked", "paper_id": paper_id, "task_id": self.request.id}
+        except Exception:
+            # Don't fail on revocation check issues, just continue with state checks
+            pass
+
+        # Brief pause to allow stop commands to take effect
+        import time
+        time.sleep(0.1)
+
+        # Second check after brief delay
+        scraper_state = ScraperState.get_current_state()
+        if not scraper_state.is_active:
+            ActivityLog.log_scraper_activity(
+                action="process_single_paper",
+                paper_id=paper_id,
+                status="skipped",
+                description="Task skipped - scraper not active (secondary check)"
+            )
+            self.retry = False
+            return {"status": "inactive", "paper_id": paper_id}
+
+        if scraper_state.is_paused:
+            ActivityLog.log_scraper_activity(
+                action="process_single_paper",
+                paper_id=paper_id,
+                status="skipped",
+                description="Task skipped - scraper paused (secondary check)"
             )
-            # Use Celery's ignore for paused state too
             self.retry = False
             return {"status": "paused", "paper_id": paper_id}

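A caveat on the revocation check added above: celery.control.inspect() broadcasts to every worker and blocks until the reply timeout (1.0 s by default), so running it inside each task adds per-task latency; the surrounding try/except keeps a slow or silent broker from failing the task. A hedged helper showing the same lookup with a shorter timeout (app name and broker URL are assumptions, not from this repo):

from celery import Celery

app = Celery("scipaperloader", broker="redis://localhost:6379/0")  # assumed

def is_revoked(task_id: str, timeout: float = 0.5) -> bool:
    # inspect().revoked() returns {worker_name: [task_ids]}, or None
    # if no worker replies within `timeout` seconds.
    replies = app.control.inspect(timeout=timeout).revoked() or {}
    return any(task_id in tasks for tasks in replies.values())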
@@ -139,6 +186,18 @@ def process_single_paper(self, paper_id: int):
         if not paper:
             return {"status": "error", "message": f"Paper {paper_id} not found"}

+        # Third check before starting actual processing
+        scraper_state = ScraperState.get_current_state()
+        if not scraper_state.is_active:
+            ActivityLog.log_scraper_activity(
+                action="process_single_paper",
+                paper_id=paper_id,
+                status="skipped",
+                description="Task skipped - scraper not active (pre-processing check)"
+            )
+            self.retry = False
+            return {"status": "inactive", "paper_id": paper_id}
+
         # Process the paper using scraper manager
         manager = ScraperManager()
         result = manager.process_paper(paper)