refines modular scraping system. adds another dummy scraper

This commit is contained in:
Michael Beck 2025-05-26 16:13:42 +02:00
parent ac348696b5
commit 1e97a9cc7b
14 changed files with 1801 additions and 981 deletions

View File

@ -1,9 +1,11 @@
from scipaperloader.celery import celery, configure_celery
# Import all task modules to ensure they are registered with Celery
import scipaperloader.scrapers.tasks # Import new scheduler tasks
import scipaperloader.blueprints.scraper # Import the scraper module with our tasks
# Configure celery with Flask app
configure_celery()
if __name__ == '__main__':
celery.start()
# Start the Celery worker
celery.start(['worker', '--loglevel=info', '--concurrency=2'])
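The worker entry point above only consumes tasks; for the hourly beat_schedule configured further down to fire, a separate beat process also has to run. One possible companion entry point, sketched here as an assumption rather than part of this commit, mirrors the same pattern:

# Hypothetical beat entry point (assumption, not in this commit)
from scipaperloader.celery import celery, configure_celery

configure_celery()

if __name__ == '__main__':
    celery.start(['beat', '--loglevel=info'])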

View File

@ -9,7 +9,7 @@ bp = Blueprint("api", __name__, url_prefix="/api")
def get_activity_logs():
"""Get activity logs with filtering options."""
# Get query parameters
category = request.args.get("category")
categories = request.args.getlist("category") # Changed to getlist for multiple values
action = request.args.get("action")
after = request.args.get("after")
limit = request.args.get("limit", 20, type=int)
@ -17,8 +17,9 @@ def get_activity_logs():
# Build query
query = ActivityLog.query
if category:
query = query.filter(ActivityLog.category == category)
if categories:
# Filter by multiple categories using in_() for SQL IN clause
query = query.filter(ActivityLog.category.in_(categories))
if action:
query = query.filter(ActivityLog.action == action)
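With getlist() and in_(), the endpoint now accepts the category parameter multiple times and combines the values into a single SQL IN clause. A minimal sketch of exercising the filter (host and port are assumptions; the dashboard JavaScript later in this commit calls the same endpoint):

# Example (not part of the commit): query logs across two categories at once
import requests

resp = requests.get(
    "http://localhost:5000/api/activity_logs",
    params=[("category", "scraper_activity"), ("category", "scraper_command"), ("limit", 20)],
)
print(resp.json())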

View File

@ -34,21 +34,8 @@ def _update_volume(new_volume):
if new_volume <= 0 or new_volume > MAX_VOLUME:
return False, f"Volume must be between 1 and {MAX_VOLUME}", None
volume_config = VolumeConfig.query.first()
if not volume_config:
volume_config = VolumeConfig(volume=new_volume)
db.session.add(volume_config)
else:
old_value = volume_config.volume
volume_config.volume = new_volume
ActivityLog.log_config_change(
config_key="scraper_volume",
old_value=old_value,
new_value=new_volume,
description="Updated scraper volume"
)
db.session.commit()
# Use the new class method to set the volume
volume_config = VolumeConfig.set_volume(new_volume)
# Invalidate and recalculate the hourly quota cache
try:

File diff suppressed because it is too large.

View File

@ -32,8 +32,8 @@ def configure_celery(app=None):
task_reject_on_worker_lost=True, # Requeue tasks if worker dies
# Configure Beat schedule for periodic tasks
beat_schedule={
'scheduled-scraper-hourly': {
'task': 'scipaperloader.blueprints.scraper.dummy_scheduled_scraper',
'hourly-scraper-scheduler': {
'task': 'scipaperloader.scrapers.tasks.hourly_scraper_scheduler',
'schedule': crontab(minute=0), # Run at the start of every hour
'options': {'expires': 3600}
},

View File

@ -91,12 +91,13 @@ class ActivityLog(db.Model):
return log
@classmethod
def log_scraper_command(cls, action, status=None, user_id=None, **extra):
def log_scraper_command(cls, action, status=None, description=None, user_id=None, **extra):
"""Log a scraper command (start/stop/pause)."""
log = cls(
category=ActivityCategory.SCRAPER_COMMAND.value,
action=action,
status=status,
description=description,
user_id=user_id
)
log.set_extra_data(extra)
@ -191,6 +192,7 @@ class PaperMetadata(db.Model):
language = db.Column(db.String(50))
published_online = db.Column(db.Date) # or DateTime/String
status = db.Column(db.String(10)) # 'Pending','Done','Failed'
previous_status = db.Column(db.String(10), nullable=True) # Store previous status for reversion
file_path = db.Column(db.Text)
error_msg = db.Column(db.Text)
created_at = db.Column(db.DateTime, default=db.func.current_timestamp())
@ -209,6 +211,35 @@ class ScheduleConfig(db.Model):
class VolumeConfig(db.Model):
id = db.Column(db.Integer, primary_key=True)
volume = db.Column(db.Float) # volume of papers to scrape per day
@classmethod
def get_current_volume(cls):
"""Get the current volume configuration, creating default if needed."""
config = cls.query.first()
if not config:
config = cls(volume=100)
db.session.add(config)
db.session.commit()
return config.volume
@classmethod
def set_volume(cls, new_volume):
"""Set the volume configuration."""
config = cls.query.first()
if not config:
config = cls(volume=new_volume)
db.session.add(config)
else:
old_value = config.volume
config.volume = new_volume
ActivityLog.log_config_change(
config_key="scraper_volume",
old_value=old_value,
new_value=new_volume,
description="Updated scraper volume configuration"
)
db.session.commit()
return config
class DownloadPathConfig(db.Model):
"""Model to store the base path for downloaded files."""
@ -220,7 +251,7 @@ class DownloadPathConfig(db.Model):
"""Get the configured download path, creating default if needed."""
config = cls.query.first()
if not config:
config = cls(path="/path/to/dummy/papers") # Ensure default exists
config = cls(path="/tmp/") # Ensure default exists
db.session.add(config)
db.session.commit()
return config.path
@ -341,6 +372,7 @@ def init_schedule_config():
default_volume = VolumeConfig(volume=100)
db.session.add(default_volume)
db.session.commit()
# Initialize DownloadPathConfig if it doesn't exist
if DownloadPathConfig.query.count() == 0:
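The new VolumeConfig class methods, together with the existing DownloadPathConfig.get_path(), give callers one place to read and update configuration without touching the session directly. A rough usage sketch (requires a Flask application context; the values shown are illustrative):

# Example (not part of the commit): reading and updating config via the class methods
from scipaperloader.models import VolumeConfig, DownloadPathConfig

print(VolumeConfig.get_current_volume())   # e.g. 100.0, created on first access
VolumeConfig.set_volume(250)               # also logs a scraper_volume config change
print(DownloadPathConfig.get_path())       # e.g. "/tmp/", created on first access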

View File

@ -1,2 +1,18 @@
# This package contains all scraper modules.
# Each scraper should implement the BaseScraper interface from base.py.
from .base import BaseScraper, ScrapeResult
from .factory import get_scraper, get_available_scrapers
from .manager import ScraperManager
from .dummy import Scraper as DummyScraper
from .failed_retry import Scraper as FailedRetryScraper
__all__ = [
'BaseScraper',
'ScrapeResult',
'get_scraper',
'get_available_scrapers',
'ScraperManager',
'DummyScraper',
'FailedRetryScraper'
]

View File

@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import NamedTuple, Optional, Dict
from typing import NamedTuple, Optional, Dict, List
from datetime import datetime
class ScrapeResult(NamedTuple):
@ -12,6 +12,12 @@ class ScrapeResult(NamedTuple):
class BaseScraper(ABC):
"""Base class for all scraper implementations."""
# Default input/output statuses - can be overridden by subclasses
INPUT_STATUSES = ["New"] # Which paper statuses this scraper will process
OUTPUT_STATUS_SUCCESS = "Done" # Status to set on successful scraping
OUTPUT_STATUS_FAILURE = "Failed" # Status to set on failed scraping
OUTPUT_STATUS_PROCESSING = "Pending" # Status to set while processing
@abstractmethod
def scrape(self, doi: str) -> ScrapeResult:
"""
@ -32,3 +38,15 @@ class BaseScraper(ABC):
def get_description(self) -> str:
"""Return a description of this scraper."""
return getattr(self.__class__, "__doc__", "No description available")
def get_input_statuses(self) -> List[str]:
"""Return list of paper statuses this scraper can process."""
return self.INPUT_STATUSES
def get_output_statuses(self) -> Dict[str, str]:
"""Return mapping of result types to output statuses."""
return {
"success": self.OUTPUT_STATUS_SUCCESS,
"failure": self.OUTPUT_STATUS_FAILURE,
"processing": self.OUTPUT_STATUS_PROCESSING
}
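The new status attributes turn BaseScraper into a small pipeline contract: each scraper declares which paper statuses it consumes and which it emits, and the manager uses get_input_statuses() / get_output_statuses() to select papers and revert them on stop. A sketch of a hypothetical third stage that would slot into this contract (module name and status values are assumptions):

# Example (not part of the commit): a hypothetical scraper that post-processes "Done" papers
import time
from datetime import datetime
from scipaperloader.scrapers.base import BaseScraper, ScrapeResult

class Scraper(BaseScraper):
    """Hypothetical enrichment stage that runs after successful downloads."""

    INPUT_STATUSES = ["Done"]
    OUTPUT_STATUS_SUCCESS = "Enriched"
    OUTPUT_STATUS_FAILURE = "Done"          # leave the paper retrievable on failure
    OUTPUT_STATUS_PROCESSING = "Enriching"

    def scrape(self, doi: str) -> ScrapeResult:
        start_time = time.time()
        # ... real enrichment work would go here ...
        return ScrapeResult(
            status="success",
            message=f"Enriched metadata for {doi}",
            data={},
            duration=time.time() - start_time,
            timestamp=datetime.utcnow(),
        )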

View File

@ -10,6 +10,12 @@ from ..db import db
class Scraper(BaseScraper):
"""Dummy scraper for testing purposes that simulates paper downloading."""
# This scraper processes "New" papers and outputs "Done"/"Failed"
INPUT_STATUSES = ["New"]
OUTPUT_STATUS_SUCCESS = "Done"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "Pending"
def scrape(self, doi: str) -> ScrapeResult:
"""Simulate scraping a paper with realistic timing and random success/failure."""
start_time = time.time()

View File

@ -1,5 +1,4 @@
import importlib
from flask import current_app
from .base import BaseScraper
def get_scraper() -> BaseScraper:
@ -7,10 +6,16 @@ def get_scraper() -> BaseScraper:
from ..models import ScraperModuleConfig, ActivityLog
try:
# Get module name from database first, fallback to config
# Get module name from database first, fallback to dummy
name = ScraperModuleConfig.get_current_module()
if not name:
name = current_app.config.get("SCRAPER_MODULE", "dummy")
# Only try to access Flask config if we're in app context
try:
from flask import current_app
name = current_app.config.get("SCRAPER_MODULE", "dummy")
except RuntimeError:
# No app context, use dummy
name = "dummy"
module = importlib.import_module(f"scipaperloader.scrapers.{name}")
cls = getattr(module, "Scraper")
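A small usage sketch of the factory and the package exports (run inside the Flask application context; the return shape of get_available_scrapers() is an assumption):

# Example (not part of the commit): resolving the configured scraper
from scipaperloader.scrapers import get_scraper, get_available_scrapers

scraper = get_scraper()              # falls back to the dummy module when nothing is configured
print(scraper.get_name())            # e.g. "dummy"
print(scraper.get_input_statuses())  # ["New"]
print(scraper.get_output_statuses()) # {"success": "Done", "failure": "Failed", "processing": "Pending"}
print(get_available_scrapers())      # listing of importable scraper modules (shape assumed)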

View File

@ -0,0 +1,123 @@
import time
import random
import os
from datetime import datetime
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
class Scraper(BaseScraper):
"""Retry scraper that attempts to re-process failed papers with different strategies."""
# This scraper specifically targets "Failed" papers and retries them
INPUT_STATUSES = ["Failed"]
OUTPUT_STATUS_SUCCESS = "Done"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "Retrying"
def scrape(self, doi: str) -> ScrapeResult:
"""Retry scraping a failed paper with enhanced error handling."""
start_time = time.time()
paper = PaperMetadata.query.filter_by(doi=doi).first()
if not paper:
return ScrapeResult(
status="error",
message=f"No paper found for DOI {doi}",
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Log retry attempt
ActivityLog.log_scraper_activity(
action="retry_failed_paper",
status="info",
description=f"Retrying failed paper: {paper.title}",
paper_id=paper.id
)
# Simulate longer processing time for retry (2-5 seconds)
processing_time = random.uniform(2, 5)
time.sleep(processing_time)
# Simulate 60% success rate on retry (lower than initial attempt)
success = random.random() < 0.6
result_data = {}
if success:
# Get download path and create dummy file
download_path = DownloadPathConfig.get_path()
file_name = f"{doi.replace('/', '_')}_retry.pdf"
file_path = f"{download_path}/{file_name}"
try:
# Ensure directory exists
os.makedirs(download_path, exist_ok=True)
# Create a dummy PDF file
with open(file_path, 'w') as f:
f.write(f"Dummy PDF content for retry of {doi}")
result_data = {"file_path": file_path}
# Log success
ActivityLog.log_scraper_activity(
action="retry_scrape_success",
status="success",
description=f"Successfully retried {doi} on second attempt",
paper_id=paper.id
)
result = ScrapeResult(
status="success",
message=f"Successfully retried paper {doi}",
data=result_data,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except Exception as e:
error_msg = f"Failed to save retry file: {str(e)}"
ActivityLog.log_scraper_activity(
action="retry_scrape_file_error",
status="error",
description=error_msg,
paper_id=paper.id
)
result = ScrapeResult(
status="error",
message=error_msg,
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
else:
# Retry failed - generate different error message
error_messages = [
"Retry failed: Still no access to publisher",
"Retry failed: Alternative download methods exhausted",
"Retry failed: DOI appears permanently inaccessible",
"Retry failed: Network timeout persists"
]
error_msg = random.choice(error_messages)
ActivityLog.log_scraper_activity(
action="retry_scrape_failure",
status="error",
description=f"Retry failed for {doi}: {error_msg}",
paper_id=paper.id
)
result = ScrapeResult(
status="error",
message=error_msg,
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
return result

View File

@ -0,0 +1,747 @@
"""
Simplified scraper management system with hourly quota scheduling.
"""
import random
import math
import redis
from datetime import datetime, timedelta
from typing import List, Dict, Optional
from sqlalchemy import func
from ..models import (
PaperMetadata,
ScheduleConfig,
VolumeConfig,
ScraperState,
ActivityLog,
ScraperModuleConfig
)
from ..db import db
from ..cache_utils import get_cached_hourly_quota
from .factory import get_scraper, get_available_scrapers
from ..celery import celery
class ScraperManager:
"""Manages scraper operations with hourly quota-based scheduling."""
def __init__(self):
self.current_scraper = None
self.pending_papers = [] # Track papers being processed
# Initialize Redis client for delayed task management
self.redis_client = None
self._init_redis_client()
def _init_redis_client(self):
"""Initialize Redis client for delayed task management."""
try:
# Use same Redis configuration as Celery
self.redis_client = redis.Redis(
host='localhost',
port=6379,
db=0,
decode_responses=True
)
# Test connection
self.redis_client.ping()
except Exception as e:
ActivityLog.log_error(
error_message=f"Failed to initialize Redis client: {str(e)}",
source="ScraperManager._init_redis_client"
)
self.redis_client = None
def _clear_delayed_tasks_from_redis(self) -> int:
"""Clear delayed tasks from Redis structures used by Celery.
Based on analysis, Celery stores delayed tasks in:
- 'unacked_index': Sorted set containing task IDs with execution timestamps
- 'unacked': Hash containing task data keyed by task ID
Returns:
int: Number of delayed tasks cleared
"""
if not self.redis_client:
try:
ActivityLog.log_error(
error_message="Redis client not available - cannot clear delayed tasks",
source="ScraperManager._clear_delayed_tasks_from_redis"
)
except RuntimeError:
# Working outside application context - just print instead
print("❌ Redis client not available - cannot clear delayed tasks")
return 0
cleared_count = 0
try:
# Define scraper task patterns to identify our tasks
scraper_patterns = [
'process_single_paper',
'process_papers_batch',
'hourly_scraper_scheduler'
]
try:
ActivityLog.log_scraper_activity(
action="check_delayed_tasks",
status="info",
description="Checking Celery delayed task structures (unacked_index, unacked)"
)
except RuntimeError:
print("🔍 Checking Celery delayed task structures (unacked_index, unacked)")
# Check 'unacked_index' (sorted set with task IDs and timestamps)
unacked_index_cleared = 0
if self.redis_client.exists('unacked_index'):
try:
# Get all task IDs from the sorted set
task_ids = self.redis_client.zrange('unacked_index', 0, -1)
if task_ids:
try:
ActivityLog.log_scraper_activity(
action="scan_unacked_index",
status="info",
description=f"Found {len(task_ids)} tasks in 'unacked_index'"
)
except RuntimeError:
print(f"📋 Found {len(task_ids)} tasks in 'unacked_index'")
# Check each task ID against the 'unacked' hash to get task details
scraper_task_ids = []
for task_id in task_ids:
try:
# Get task data from 'unacked' hash
task_data = self.redis_client.hget('unacked', task_id)
if task_data:
# Check if this task contains any of our scraper patterns
if any(pattern in str(task_data) for pattern in scraper_patterns):
scraper_task_ids.append(task_id)
except Exception:
# Skip individual task errors
continue
# Remove scraper task IDs from both structures
for task_id in scraper_task_ids:
try:
# Remove from unacked_index (sorted set)
removed_from_index = self.redis_client.zrem('unacked_index', task_id)
# Remove from unacked (hash)
removed_from_hash = self.redis_client.hdel('unacked', task_id)
if removed_from_index or removed_from_hash:
unacked_index_cleared += 1
except Exception as e:
try:
ActivityLog.log_error(
error_message=f"Error removing delayed task {task_id}: {str(e)}",
source="ScraperManager._clear_delayed_tasks_from_redis"
)
except RuntimeError:
print(f"❌ Error removing delayed task {task_id}: {str(e)}")
continue
cleared_count += unacked_index_cleared
if unacked_index_cleared > 0:
try:
ActivityLog.log_scraper_activity(
action="clear_unacked_tasks",
status="success",
description=f"Cleared {unacked_index_cleared} scraper tasks from unacked structures"
)
except RuntimeError:
print(f"✅ Cleared {unacked_index_cleared} scraper tasks from unacked structures")
else:
try:
ActivityLog.log_scraper_activity(
action="check_unacked_index",
status="info",
description="No tasks found in 'unacked_index'"
)
except RuntimeError:
print(" No tasks found in 'unacked_index'")
except Exception as e:
try:
ActivityLog.log_error(
error_message=f"Error accessing 'unacked_index': {str(e)}",
source="ScraperManager._clear_delayed_tasks_from_redis"
)
except RuntimeError:
print(f"❌ Error accessing 'unacked_index': {str(e)}")
else:
try:
ActivityLog.log_scraper_activity(
action="check_unacked_index",
status="info",
description="'unacked_index' key does not exist - no delayed tasks"
)
except RuntimeError:
print(" 'unacked_index' key does not exist - no delayed tasks")
# Also check the 'celery' queue for immediate tasks (backup check)
celery_cleared = 0
try:
queue_length = self.redis_client.llen('celery')
if queue_length and queue_length > 0:
# Scan for any scraper tasks in the immediate queue
scraper_tasks = []
for i in range(queue_length):
try:
task_data = self.redis_client.lindex('celery', i)
if task_data and any(pattern in str(task_data) for pattern in scraper_patterns):
scraper_tasks.append(task_data)
except Exception:
continue
# Remove scraper tasks from celery queue
for task_data in scraper_tasks:
try:
removed_count = self.redis_client.lrem('celery', 0, task_data)
celery_cleared += removed_count
except Exception:
continue
cleared_count += celery_cleared
if celery_cleared > 0:
try:
ActivityLog.log_scraper_activity(
action="clear_celery_tasks",
status="success",
description=f"Cleared {celery_cleared} scraper tasks from 'celery' queue"
)
except RuntimeError:
print(f"✅ Cleared {celery_cleared} scraper tasks from 'celery' queue")
except Exception as e:
try:
ActivityLog.log_error(
error_message=f"Error checking 'celery' queue: {str(e)}",
source="ScraperManager._clear_delayed_tasks_from_redis"
)
except RuntimeError:
print(f"❌ Error checking 'celery' queue: {str(e)}")
# Summary
if cleared_count > 0:
try:
ActivityLog.log_scraper_activity(
action="clear_delayed_tasks_complete",
status="success",
description=f"Total delayed scraper tasks cleared from Redis: {cleared_count} (unacked: {unacked_index_cleared}, celery: {celery_cleared})"
)
except RuntimeError:
print(f"✅ Total delayed scraper tasks cleared from Redis: {cleared_count} (unacked: {unacked_index_cleared}, celery: {celery_cleared})")
else:
try:
ActivityLog.log_scraper_activity(
action="clear_delayed_tasks_complete",
status="info",
description="No delayed scraper tasks found to clear in Redis"
)
except RuntimeError:
print(" No delayed scraper tasks found to clear in Redis")
return cleared_count
except Exception as e:
try:
ActivityLog.log_error(
error_message=f"Failed to clear delayed tasks from Redis: {str(e)}",
source="ScraperManager._clear_delayed_tasks_from_redis"
)
except RuntimeError:
print(f"❌ Failed to clear delayed tasks from Redis: {str(e)}")
return 0
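# For debugging, the same structures this method walks ('unacked_index', 'unacked' and the
# 'celery' list) can be inspected directly; a minimal sketch (not part of the commit), assuming
# the same localhost Redis instance used above:
import redis

r = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
for task_id, eta in r.zrange('unacked_index', 0, -1, withscores=True):
    print(task_id, eta, (r.hget('unacked', task_id) or '')[:80])
print('immediate queue length:', r.llen('celery'))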
def start_scraper(self) -> Dict[str, str]:
"""Start the scraper system."""
try:
# Get current scraper
self.current_scraper = get_scraper()
# Activate scraper state
ScraperState.set_active(True)
ScraperState.set_paused(False)
scraper_name = self.current_scraper.get_name()
ActivityLog.log_scraper_command(
action="start_scraper",
status="success",
description=f"Started scraper: {scraper_name}. Use /trigger-immediate endpoint to immediately schedule papers instead of waiting for the next hourly boundary."
)
return {"status": "success", "message": "Scraper started successfully. Papers will be scheduled at the next hourly boundary, or use /trigger-immediate to schedule immediately."}
except Exception as e:
ActivityLog.log_error(
error_message=f"Failed to start scraper: {str(e)}",
source="ScraperManager.start_scraper"
)
return {"status": "error", "message": str(e)}
def pause_scraper(self) -> Dict[str, str]:
"""Pause the scraper system."""
try:
ScraperState.set_paused(True)
ActivityLog.log_scraper_command(
action="pause_scraper",
status="success",
description="Scraper paused - processing will halt"
)
return {"status": "success", "message": "Scraper paused"}
except Exception as e:
return {"status": "error", "message": str(e)}
def resume_scraper(self) -> Dict[str, str]:
"""Resume the scraper system."""
try:
ScraperState.set_paused(False)
ActivityLog.log_scraper_command(
action="resume_scraper",
status="success",
description="Scraper resumed - processing will continue"
)
return {"status": "success", "message": "Scraper resumed"}
except Exception as e:
return {"status": "error", "message": str(e)}
def stop_scraper(self) -> Dict[str, str]:
"""Stop the scraper, revoke all running tasks, and revert pending papers."""
try:
# First, revoke all running tasks
revoked_count = 0
delayed_cleared_count = 0
ActivityLog.log_scraper_command(
action="stop_scraper_start",
status="info",
description="Beginning scraper stop process with task revocation and delayed task clearing"
)
try:
# Get Celery inspector to check for running tasks
i = celery.control.inspect()
active = i.active() or {}
scheduled = i.scheduled() or {}
reserved = i.reserved() or {}
# Revoke active tasks
for worker, tasks in active.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
ActivityLog.log_scraper_activity(
action="revoke_task",
status="success",
description=f"Revoked active task: {task.get('name', 'unknown')} (ID: {task['id']})"
)
# Revoke scheduled tasks
for worker, tasks in scheduled.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
ActivityLog.log_scraper_activity(
action="revoke_task",
status="success",
description=f"Revoked scheduled task: {task.get('name', 'unknown')} (ID: {task['id']})"
)
# Revoke reserved tasks
for worker, tasks in reserved.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
ActivityLog.log_scraper_activity(
action="revoke_task",
status="success",
description=f"Revoked reserved task: {task.get('name', 'unknown')} (ID: {task['id']})"
)
# Purge all task queues
celery.control.purge()
ActivityLog.log_scraper_activity(
action="purge_queues",
status="success",
description="Purged all task queues"
)
# **NEW: Clear delayed tasks from Redis sorted sets**
delayed_cleared_count = self._clear_delayed_tasks_from_redis()
# Additional cleanup: revoke any remaining scraper-related tasks by name pattern
try:
# Use broadcast to revoke tasks that match scraper patterns
scraper_task_patterns = [
'process_single_paper',
'process_papers_batch',
'hourly_scraper_scheduler'
]
# Get a fresh inspection of tasks after purge
fresh_inspect = celery.control.inspect()
all_tasks = {}
all_tasks.update(fresh_inspect.active() or {})
all_tasks.update(fresh_inspect.scheduled() or {})
all_tasks.update(fresh_inspect.reserved() or {})
additional_revoked = 0
for worker, tasks in all_tasks.items():
for task in tasks:
task_name = task.get('name', '')
task_id = task.get('id', '')
if any(pattern in task_name for pattern in scraper_task_patterns) and task_id:
celery.control.revoke(task_id, terminate=True)
additional_revoked += 1
ActivityLog.log_scraper_activity(
action="revoke_scraper_task",
status="success",
description=f"Revoked lingering scraper task: {task_name} (ID: {task_id})"
)
if additional_revoked > 0:
ActivityLog.log_scraper_activity(
action="cleanup_scraper_tasks",
status="success",
description=f"Additional cleanup: revoked {additional_revoked} lingering scraper tasks"
)
except Exception as e:
ActivityLog.log_error(
error_message=f"Error during additional scraper task cleanup: {str(e)}",
source="ScraperManager.stop_scraper.cleanup"
)
except Exception as e:
ActivityLog.log_error(
error_message=f"Error revoking tasks: {str(e)}",
source="ScraperManager.stop_scraper"
)
# Continue with paper reversion even if task revocation fails
# Get current scraper to know what status to revert to
scraper = get_scraper()
input_statuses = scraper.get_input_statuses()
# Find papers that are currently being processed
processing_status = scraper.get_output_statuses()["processing"]
pending_papers = PaperMetadata.query.filter_by(status=processing_status).all()
# Revert their status to the first input status
reverted_count = 0
if pending_papers and input_statuses:
revert_status = input_statuses[0] # Use first input status as default
for paper in pending_papers:
# Try to use previous_status if available, otherwise use first input status
if hasattr(paper, 'previous_status') and paper.previous_status:
paper.status = paper.previous_status
else:
paper.status = revert_status
paper.updated_at = datetime.utcnow()
reverted_count += 1
db.session.commit()
ActivityLog.log_scraper_activity(
action="revert_pending_papers",
status="success",
description=f"Reverted {reverted_count} papers from '{processing_status}' to previous status"
)
# Deactivate scraper
ScraperState.set_active(False)
ScraperState.set_paused(False)
ActivityLog.log_scraper_command(
action="stop_scraper",
status="success",
description=f"Scraper stopped. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers."
)
return {
"status": "success",
"message": f"Scraper stopped. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers to previous status."
}
except Exception as e:
ActivityLog.log_error(
error_message=f"Failed to stop scraper: {str(e)}",
source="ScraperManager.stop_scraper"
)
return {"status": "error", "message": str(e)}
def reset_scraper(self) -> Dict[str, str]:
"""Reset scraper state, revoke all running tasks, and clear all processing statuses."""
try:
# First, revoke all running tasks (similar to stop_scraper)
revoked_count = 0
ActivityLog.log_scraper_command(
action="reset_scraper_start",
status="info",
description="Beginning scraper reset process with task revocation"
)
try:
# Get Celery inspector to check for running tasks
i = celery.control.inspect()
active = i.active() or {}
scheduled = i.scheduled() or {}
reserved = i.reserved() or {}
# Revoke all tasks (active, scheduled, reserved)
for queue_name, queue_tasks in [("active", active), ("scheduled", scheduled), ("reserved", reserved)]:
for worker, tasks in queue_tasks.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
ActivityLog.log_scraper_activity(
action="revoke_task",
status="success",
description=f"Revoked {queue_name} task: {task.get('name', 'unknown')} (ID: {task['id']})"
)
# Purge all task queues
celery.control.purge()
ActivityLog.log_scraper_activity(
action="purge_queues",
status="success",
description="Purged all task queues during reset"
)
except Exception as e:
ActivityLog.log_error(
error_message=f"Error revoking tasks during reset: {str(e)}",
source="ScraperManager.reset_scraper"
)
# Continue with paper reversion even if task revocation fails
# Get current scraper configuration
scraper = get_scraper()
input_statuses = scraper.get_input_statuses()
processing_status = scraper.get_output_statuses()["processing"]
# Reset all papers in processing status
pending_papers = PaperMetadata.query.filter_by(status=processing_status).all()
reverted_count = 0
if pending_papers and input_statuses:
revert_status = input_statuses[0]
for paper in pending_papers:
# Try to use previous_status if available, otherwise use first input status
if hasattr(paper, 'previous_status') and paper.previous_status:
paper.status = paper.previous_status
else:
paper.status = revert_status
paper.updated_at = datetime.utcnow()
paper.error_msg = None # Clear any error messages
reverted_count += 1
db.session.commit()
# Reset scraper state
ScraperState.set_active(False)
ScraperState.set_paused(False)
ActivityLog.log_scraper_command(
action="reset_scraper",
status="success",
description=f"Scraper reset. Revoked {revoked_count} tasks and reverted {reverted_count} papers."
)
return {
"status": "success",
"message": f"Scraper reset. Revoked {revoked_count} tasks and reverted {reverted_count} papers to original status."
}
except Exception as e:
return {"status": "error", "message": str(e)}
def get_current_hour_quota(self) -> int:
"""Calculate papers to process in current hour based on schedule."""
try:
return get_cached_hourly_quota(self._calculate_papers_for_current_hour)
except Exception as e:
ActivityLog.log_error(
error_message=f"Error calculating hourly quota: {str(e)}",
source="ScraperManager.get_current_hour_quota"
)
return 0
def _calculate_papers_for_current_hour(self) -> int:
"""Internal method to calculate hourly quota."""
try:
# Get current hour and volume config
current_hour = datetime.now().hour
volume_config = VolumeConfig.get_current_volume()
daily_volume = volume_config if volume_config else 100
# Get schedule config for current hour
schedule_config = ScheduleConfig.query.filter_by(hour=current_hour).first()
current_weight = schedule_config.weight if schedule_config else 1.0
# Get total weight across all hours
total_weight = db.session.query(func.sum(ScheduleConfig.weight)).scalar() or 24.0
# Calculate quota: (current_weight / total_weight) * daily_volume
quota = math.ceil((current_weight / total_weight) * daily_volume)
ActivityLog.log_scraper_activity(
action="calculate_hourly_quota",
status="info",
description=f"Hour {current_hour}: quota={quota} (weight={current_weight}, total_weight={total_weight}, daily_volume={daily_volume})"
)
return max(1, quota) # Ensure at least 1 paper per hour
except Exception as e:
ActivityLog.log_error(
error_message=f"Error in quota calculation: {str(e)}",
source="ScraperManager._calculate_papers_for_current_hour"
)
return 1 # Fallback to 1 paper per hour
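# Worked example of the quota formula above (illustrative numbers, not part of the commit):
# daily_volume = 100 papers/day, current-hour weight = 2.0, total weight across 24 hours = 30.0
#     math.ceil((2.0 / 30.0) * 100)  ->  math.ceil(6.67)  ->  7 papers scheduled for this hour
# The max(1, quota) guard above only kicks in when the current hour's weight is configured as 0.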
def select_papers_for_processing(self, limit: Optional[int] = None) -> List[PaperMetadata]:
"""Select papers for processing based on current scraper configuration."""
try:
scraper = get_scraper()
input_statuses = scraper.get_input_statuses()
if not input_statuses:
return []
# Use provided limit or calculate from hourly quota
papers_needed = limit if limit is not None else self.get_current_hour_quota()
# Query papers with input statuses, randomize selection
papers = (PaperMetadata.query
.filter(PaperMetadata.status.in_(input_statuses))
.order_by(func.random())
.limit(papers_needed)
.all())
ActivityLog.log_scraper_activity(
action="select_papers",
status="info",
description=f"Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})"
)
return papers
except Exception as e:
ActivityLog.log_error(
error_message=f"Error selecting papers: {str(e)}",
source="ScraperManager.select_papers_for_processing"
)
return []
def process_paper(self, paper: PaperMetadata) -> Dict:
"""Process a single paper using the current scraper."""
try:
scraper = get_scraper()
output_statuses = scraper.get_output_statuses()
# Store the previous status before changing it
previous_status = paper.status
# Update paper status to processing
paper.previous_status = previous_status
paper.status = output_statuses["processing"]
paper.updated_at = datetime.utcnow()
db.session.commit()
# Perform scraping
result = scraper.scrape(paper.doi)
# Update paper status based on result
if result.status == "success":
paper.status = output_statuses["success"]
paper.error_msg = None
if result.data and "file_path" in result.data:
paper.file_path = result.data["file_path"]
else:
paper.status = output_statuses["failure"]
paper.error_msg = result.message
paper.updated_at = datetime.utcnow()
db.session.commit()
# Log result
ActivityLog.log_scraper_activity(
action="process_paper",
paper_id=paper.id,
status=result.status,
description=f"Processed {paper.doi}: {result.message}"
)
return {
"paper_id": paper.id,
"status": result.status,
"message": result.message,
"duration": result.duration
}
except Exception as e:
# Revert paper status on error
try:
input_statuses = get_scraper().get_input_statuses()
if input_statuses:
paper.status = input_statuses[0]
paper.error_msg = f"Processing error: {str(e)}"
paper.updated_at = datetime.utcnow()
db.session.commit()
except:
pass # Don't fail if reversion fails
ActivityLog.log_error(
error_message=f"Error processing paper {paper.id}: {str(e)}",
source="ScraperManager.process_paper"
)
return {"paper_id": paper.id, "status": "error", "message": str(e)}
def get_status(self) -> Dict:
"""Get current scraper status."""
scraper_state = ScraperState.get_current_state()
scraper = get_scraper()
# Count papers by status
input_statuses = scraper.get_input_statuses()
output_statuses = scraper.get_output_statuses()
available_count = (PaperMetadata.query
.filter(PaperMetadata.status.in_(input_statuses))
.count())
processing_count = (PaperMetadata.query
.filter_by(status=output_statuses["processing"])
.count())
return {
"active": scraper_state.is_active,
"paused": scraper_state.is_paused,
"current_scraper": scraper.get_name(),
"input_statuses": input_statuses,
"output_statuses": output_statuses,
"available_papers": available_count,
"processing_papers": processing_count,
"current_hour_quota": self.get_current_hour_quota()
}
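Taken together, the manager exposes a small imperative surface that the Flask blueprint and the Celery tasks both drive. A rough end-to-end sketch (requires an application context, a reachable Redis, and at least one paper in an input status; output shapes are abbreviated):

# Example (not part of the commit): driving the manager directly
from scipaperloader.scrapers.manager import ScraperManager

manager = ScraperManager()
print(manager.start_scraper())             # {"status": "success", ...}
print(manager.get_current_hour_quota())    # papers allotted to the current hour

for paper in manager.select_papers_for_processing(limit=2):
    print(manager.process_paper(paper))    # per-paper status / message / duration

print(manager.get_status())                # available/processing counts and current scraper
print(manager.stop_scraper())              # revokes tasks, reverts papers left in the processing status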

View File

@ -0,0 +1,189 @@
"""
Hourly scheduler task that processes papers at random times within each hour.
"""
import random
from datetime import datetime, timedelta
from typing import Optional
from celery import shared_task
from ..models import ScraperState, ActivityLog
from .manager import ScraperManager
@shared_task(bind=True)
def hourly_scraper_scheduler(self):
"""
Hourly task that schedules paper processing at random times within the hour.
This task runs at the beginning of each hour and:
1. Calculates how many papers to process this hour
2. Schedules individual paper processing tasks at random times within the hour
"""
try:
# Check if scraper is active
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active:
ActivityLog.log_scraper_activity(
action="hourly_scheduler",
status="info",
description="Hourly scheduler skipped - scraper not active"
)
# Disable retries for inactive scheduler
self.retry = False
return {"status": "inactive", "papers_scheduled": 0}
if scraper_state.is_paused:
ActivityLog.log_scraper_activity(
action="hourly_scheduler",
status="info",
description="Hourly scheduler skipped - scraper paused"
)
# Disable retries for paused scheduler
self.retry = False
return {"status": "paused", "papers_scheduled": 0}
# Initialize scraper manager
manager = ScraperManager()
# Get papers to process this hour
papers = manager.select_papers_for_processing()
if not papers:
ActivityLog.log_scraper_activity(
action="hourly_scheduler",
status="info",
description="No papers available for processing this hour"
)
return {"status": "empty", "papers_scheduled": 0}
# Schedule papers at random times within the hour (1 second to ~58 minutes in, leaving a buffer before the next run)
scheduled_count = 0
current_time = datetime.now()
for paper in papers:
# Random delay between 1 second and 58 minutes
delay_seconds = random.randint(1, 3480) # Up to 58 minutes
# Schedule the task using Celery's task registry to avoid circular import issues
from ..celery import celery
celery.send_task(
'scipaperloader.scrapers.tasks.process_single_paper',
args=[paper.id],
countdown=delay_seconds
)
scheduled_count += 1
# Log each scheduled paper
schedule_time = current_time + timedelta(seconds=delay_seconds)
ActivityLog.log_scraper_activity(
action="schedule_paper",
paper_id=paper.id,
status="info",
description=f"Scheduled paper {paper.doi} for processing at {schedule_time.strftime('%H:%M:%S')}"
)
ActivityLog.log_scraper_activity(
action="hourly_scheduler",
status="success",
description=f"Scheduled {scheduled_count} papers for random processing within this hour"
)
return {"status": "success", "papers_scheduled": scheduled_count}
except Exception as e:
ActivityLog.log_error(
error_message=f"Hourly scheduler error: {str(e)}",
source="hourly_scraper_scheduler"
)
return {"status": "error", "message": str(e)}
@shared_task(bind=True)
def process_single_paper(self, paper_id: int):
"""
Process a single paper. This task is scheduled at random times within each hour.
Args:
paper_id: ID of the paper to process
"""
try:
# Double-check scraper state before processing
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active:
ActivityLog.log_scraper_activity(
action="process_single_paper",
paper_id=paper_id,
status="skipped",
description="Skipped processing - scraper not active"
)
# Use Celery's ignore to mark this task as completed without error
self.retry = False
return {"status": "inactive", "paper_id": paper_id}
if scraper_state.is_paused:
ActivityLog.log_scraper_activity(
action="process_single_paper",
paper_id=paper_id,
status="skipped",
description="Skipped processing - scraper paused"
)
# Use Celery's ignore for paused state too
self.retry = False
return {"status": "paused", "paper_id": paper_id}
# Get the paper
from ..models import PaperMetadata
paper = PaperMetadata.query.get(paper_id)
if not paper:
return {"status": "error", "message": f"Paper {paper_id} not found"}
# Process the paper using scraper manager
manager = ScraperManager()
result = manager.process_paper(paper)
return result
except Exception as e:
ActivityLog.log_error(
error_message=f"Error processing paper {paper_id}: {str(e)}",
source="process_single_paper"
)
return {"status": "error", "paper_id": paper_id, "message": str(e)}
@shared_task(bind=True)
def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] = None):
"""
Process multiple papers in a batch for immediate processing.
Args:
paper_ids: List of paper IDs to process
scraper_module: Optional specific scraper module to use
"""
try:
results = []
manager = ScraperManager()
for paper_id in paper_ids:
from ..models import PaperMetadata
paper = PaperMetadata.query.get(paper_id)
if paper:
result = manager.process_paper(paper)
results.append(result)
else:
results.append({
"paper_id": paper_id,
"status": "error",
"message": "Paper not found"
})
return {"results": results, "total_processed": len(results)}
except Exception as e:
ActivityLog.log_error(
error_message=f"Error processing batch: {str(e)}",
source="process_papers_batch"
)
return {"status": "error", "message": str(e)}

View File

@ -29,6 +29,11 @@
height: 400px;
}
.chart-wrapper {
position: relative;
height: 400px;
}
.notification {
position: fixed;
bottom: 20px;
@ -100,132 +105,137 @@
<div class="form-group">
<label for="volumeInput">Papers per day:</label>
<input type="number" class="form-control" id="volumeInput"
value="{{ volume_config.volume if volume_config else 100 }}" min="1"
max="{{ max_volume }}">
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
value="{{ volume_config if volume_config else 100 }}" min="1" max="{{ max_volume }}">
<button type="submit" class="btn btn-primary mt-2">
<i class="fas fa-save"></i> Update Volume
</button>
</div>
<button type="submit" class="btn btn-primary mt-2">Update Volume</button>
</form>
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
</div>
</form>
</div>
</div>
</div>
</div>
<!-- New row for single paper processing -->
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h5>Process Single Paper</h5>
</div>
<div class="card-body">
<div class="row">
<div class="col-md-6">
<form id="searchPaperForm" class="mb-3">
<div class="input-group">
<input type="text" id="paperSearchInput" class="form-control"
placeholder="Search paper by title, DOI, or ID...">
<button class="btn btn-outline-secondary" type="submit">Search</button>
</div>
</form>
</div>
<div class="col-md-6">
<div class="form-group">
<label for="scraperSelect">Scraper Module:</label>
<select class="form-control" id="scraperSelect">
<option value="">Use default system scraper</option>
<!-- Available scrapers will be populated here -->
</select>
<div class="form-text">
Select which scraper to use for processing the paper
</div>
<!-- New row for single paper processing -->
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h5>Process Single Paper</h5>
</div>
<div class="card-body">
<div class="row">
<div class="col-md-6">
<form id="searchPaperForm" class="mb-3">
<div class="input-group">
<input type="text" id="paperSearchInput" class="form-control"
placeholder="Search paper by title, DOI, or ID...">
<button class="btn btn-outline-secondary" type="submit">Search</button>
</div>
</form>
</div>
<div class="col-md-6">
<div class="form-group">
<label for="scraperSelect">Scraper Module:</label>
<select class="form-control" id="scraperSelect">
<option value="">Use default system scraper</option>
<!-- Available scrapers will be populated here -->
</select>
<div class="form-text">
Select which scraper to use for processing the paper
</div>
</div>
</div>
<div id="searchResults" class="mt-3 search-results-container d-none">
<table class="table table-hover table-striped">
<thead>
<tr>
<th>ID</th>
<th>Title</th>
<th>DOI</th>
<th>Status</th>
<th>Actions</th>
</tr>
</thead>
<tbody id="paperSearchResults">
<!-- Search results will be populated here -->
</tbody>
</table>
</div>
<div id="processingStatus" class="alert alert-info mt-3 d-none"></div>
</div>
<div id="searchResults" class="mt-3 search-results-container d-none">
<table class="table table-hover table-striped">
<thead>
<tr>
<th>ID</th>
<th>Title</th>
<th>DOI</th>
<th>Status</th>
<th>Actions</th>
</tr>
</thead>
<tbody id="paperSearchResults">
<!-- Search results will be populated here -->
</tbody>
</table>
</div>
<div id="processingStatus" class="alert alert-info mt-3 d-none"></div>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header d-flex justify-content-between align-items-center">
<h5>Scraping Activity</h5>
<div>
<div class="form-check form-switch">
<input class="form-check-input" type="checkbox" id="notificationsToggle" checked>
<label class="form-check-label" for="notificationsToggle">Show Notifications</label>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header d-flex justify-content-between align-items-center">
<h5>Scraping Activity</h5>
<div>
<div class="form-check form-switch">
<input class="form-check-input" type="checkbox" id="notificationsToggle" checked>
<label class="form-check-label" for="notificationsToggle">Show Notifications</label>
</div>
</div>
<div class="card-body">
<div class="btn-group mb-3">
<button class="btn btn-outline-secondary time-range-btn" data-hours="6">Last 6
hours</button>
<button class="btn btn-outline-secondary time-range-btn active" data-hours="24">Last 24
hours</button>
<button class="btn btn-outline-secondary time-range-btn" data-hours="72">Last 3
days</button>
</div>
<div class="stats-chart" id="activityChart"></div>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h5>Recent Activity</h5>
<div class="card-body">
<div class="btn-group mb-3">
<button class="btn btn-outline-secondary time-range-btn" data-hours="6">Last 6
hours</button>
<button class="btn btn-outline-secondary time-range-btn active" data-hours="24">Last 24
hours</button>
<button class="btn btn-outline-secondary time-range-btn" data-hours="72">Last 3
days</button>
</div>
<div class="card-body">
<div class="table-responsive">
<table class="table table-striped">
<thead>
<tr>
<th>Time</th>
<th>Action</th>
<th>Status</th>
<th>Description</th>
</tr>
</thead>
<tbody id="activityLog">
<tr>
<td colspan="4" class="text-center">Loading activities...</td>
</tr>
</tbody>
</table>
</div>
<div class="chart-wrapper">
<canvas id="activityChart"></canvas>
</div>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h5>Recent Activity</h5>
</div>
<div class="card-body">
<div class="table-responsive">
<table class="table table-striped">
<thead>
<tr>
<th>Time</th>
<th>Action</th>
<th>Status</th>
<th>Description</th>
</tr>
</thead>
<tbody id="activityLog">
<tr>
<td colspan="4" class="text-center">Loading activities...</td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
</div>
</div>
</div>
{% endblock content %}
{% block scripts %}
{{ super() }}
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script>
// Global variables for the scraper dashboard
let notificationsEnabled = true;
@ -251,10 +261,14 @@
// Initialize the page
document.addEventListener('DOMContentLoaded', function () {
initStatusPolling();
loadActivityStats(currentTimeRange);
loadRecentActivity();
loadAvailableScrapers();
// Load chart data after a short delay to ensure Chart.js is loaded
setTimeout(() => {
loadActivityStats(currentTimeRange);
}, 100);
// Initialize event listeners
startButton.addEventListener('click', startScraper);
pauseButton.addEventListener('click', togglePauseScraper);
@ -470,13 +484,21 @@
fetch('/scraper/status')
.then(response => response.json())
.then(data => {
if (data.active) {
if (data.paused) {
statusIndicator.className = 'status-indicator status-paused';
console.log('Status data received:', data); // Debug log
// Remove all status classes first
statusIndicator.classList.remove('status-active', 'status-paused', 'status-inactive');
// Handle the new JSON structure with scraper_state
const scraperState = data.scraper_state || data; // Fallback for old structure
if (scraperState.active) {
if (scraperState.paused) {
statusIndicator.classList.add('status-paused');
statusText.textContent = 'Paused';
pauseButton.textContent = 'Resume';
} else {
statusIndicator.className = 'status-indicator status-active';
statusIndicator.classList.add('status-active');
statusText.textContent = 'Active';
pauseButton.textContent = 'Pause';
}
@ -485,13 +507,20 @@
stopButton.disabled = false;
resetButton.disabled = false; // Enable reset when active
} else {
statusIndicator.className = 'status-indicator status-inactive';
statusIndicator.classList.add('status-inactive');
statusText.textContent = 'Inactive';
startButton.disabled = false;
pauseButton.disabled = true;
stopButton.disabled = true;
resetButton.disabled = false; // Enable reset when inactive too
}
})
.catch(error => {
console.error('Error fetching status:', error);
// On error, show inactive state
statusIndicator.classList.remove('status-active', 'status-paused', 'status-inactive');
statusIndicator.classList.add('status-inactive');
statusText.textContent = 'Error';
});
}
@ -499,7 +528,13 @@
function startScraper() {
console.log("Start button clicked - sending request to /scraper/start");
fetch('/scraper/start', { method: 'POST' })
fetch('/scraper/start', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({})
})
.then(response => {
console.log("Response received:", response);
return response.json();
@ -521,7 +556,13 @@
}
function togglePauseScraper() {
fetch('/scraper/pause', { method: 'POST' })
fetch('/scraper/pause', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({})
})
.then(response => response.json())
.then(data => {
if (data.success) {
@ -535,7 +576,13 @@
}
function stopScraper() {
fetch('/scraper/stop', { method: 'POST' })
fetch('/scraper/stop', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({})
})
.then(response => response.json())
.then(data => {
if (data.success) {
@ -706,14 +753,28 @@
// Load data functions
function loadActivityStats(hours) {
fetch(`/scraper/stats?hours=${hours}`)
.then(response => response.json())
.then(response => {
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
return response.json();
})
.then(data => {
console.log('Stats data loaded:', data);
renderActivityChart(data);
})
.catch(error => {
console.error('Failed to load activity stats:', error);
// Hide the chart or show an error message
const chartContainer = document.getElementById('activityChart').parentElement;
if (chartContainer) {
chartContainer.innerHTML = '<p class="text-muted">Chart data unavailable</p>';
}
});
}
function loadRecentActivity() {
fetch('/api/activity_logs?category=scraper_activity&limit=20')
fetch('/api/activity_logs?category=scraper_activity&category=scraper_command&limit=50')
.then(response => response.json())
.then(data => {
renderActivityLog(data);
@ -728,7 +789,19 @@
// Rendering functions
function renderActivityChart(data) {
const ctx = document.getElementById('activityChart').getContext('2d');
// Check if Chart.js is available
if (typeof Chart === 'undefined') {
console.error('Chart.js is not loaded');
return;
}
const chartElement = document.getElementById('activityChart');
if (!chartElement) {
console.error('Chart canvas element not found');
return;
}
const ctx = chartElement.getContext('2d');
// Extract the data for the chart
const labels = data.map(item => `${item.hour}:00`);
@ -857,7 +930,7 @@
let lastPaperTimestamp = new Date().toISOString();
function checkForNewPapers() {
fetch(`/api/activity_logs?category=scraper_activity&action=scrape_paper&after=${lastPaperTimestamp}&limit=5`)
fetch(`/api/activity_logs?category=scraper_activity&category=scraper_command&action=scrape_paper&after=${lastPaperTimestamp}&limit=5`)
.then(response => response.json())
.then(data => {
if (data && data.length > 0) {