From 36ba835980da9bf99c91b06e7e3d9a910316f64c Mon Sep 17 00:00:00 2001
From: Michael Beck <ich@mischbeck.de>
Date: Fri, 23 May 2025 19:07:40 +0200
Subject: [PATCH] adds cache management

---
 scipaperloader/blueprints/config.py  | 14 +++++++
 scipaperloader/blueprints/scraper.py | 22 +++++++---
 scipaperloader/cache_utils.py        | 61 ++++++++++++++++++++++++++++
 3 files changed, 92 insertions(+), 5 deletions(-)
 create mode 100644 scipaperloader/cache_utils.py

diff --git a/scipaperloader/blueprints/config.py b/scipaperloader/blueprints/config.py
index 43d7e65..4a69c5e 100644
--- a/scipaperloader/blueprints/config.py
+++ b/scipaperloader/blueprints/config.py
@@ -5,7 +5,10 @@ from ..db import db
 from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig, PaperMetadata
 from ..defaults import MAX_VOLUME
 import os # Import os for path validation
+import sys
 from scipaperloader.scrapers import __path__ as scrapers_path
+# Import the cache invalidation function from our new module
+from ..cache_utils import invalidate_hourly_quota_cache
 
 bp = Blueprint("config", __name__, url_prefix="/config")
 
@@ -166,6 +169,17 @@ def _update_schedule(schedule_data):
                 )
         
         db.session.commit()
+        
+        # Invalidate hourly quota cache using the cache_utils module
+        try:
+            invalidate_hourly_quota_cache()
+        except Exception as e:
+            # Log the error but don't fail the update
+            ActivityLog.log_error(
+                error_message=f"Error invalidating hourly quota cache: {str(e)}",
+                source="_update_schedule"
+            )
+            
         return True, "Schedule updated successfully!"
         
     except Exception as e:
diff --git a/scipaperloader/blueprints/scraper.py b/scipaperloader/blueprints/scraper.py
index 902a6dc..07b955a 100644
--- a/scipaperloader/blueprints/scraper.py
+++ b/scipaperloader/blueprints/scraper.py
@@ -10,6 +10,7 @@ from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory,
 from ..db import db
 from ..celery import celery
 from ..defaults import MAX_VOLUME
+from ..cache_utils import get_cached_hourly_quota, invalidate_hourly_quota_cache
 from celery.schedules import crontab
 from sqlalchemy import func
 from scipaperloader.scrapers.factory import get_scraper, get_available_scrapers
@@ -360,6 +361,9 @@ def update_config():
                         description="Updated scraper volume"
                     )
 
+                    # Invalidate hourly quota cache when volume changes
+                    invalidate_hourly_quota_cache()
+
                 db.session.commit()
             except (ValueError, TypeError):
                 return jsonify({
@@ -441,7 +445,8 @@ def dummy_scheduled_scraper():
         )
         return False # Stop if not active/paused
 
-    papers_to_select = calculate_papers_for_current_hour()
+    # Use cached hourly quota instead of calculating each time
+    papers_to_select = get_cached_hourly_quota(calculate_papers_for_current_hour)
 
     if papers_to_select <= 0:
         ActivityLog.log_scraper_activity(
@@ -463,11 +468,18 @@ def dummy_scheduled_scraper():
             ActivityLog.log_scraper_activity(
                 action="dummy_scheduled_scraper_info",
                 status="info",
-                description="No 'New' papers found in the database to select."
+                description="No 'New' papers found in the database. Stopping scraper."
             )
-            # Optional: Depending on requirements, you might want to check later
-            # or handle this case differently. For now, we just log and exit.
-            return True 
+            
+            # Stop the scraper since there are no more papers to process
+            ScraperState.set_active(False)
+            ActivityLog.log_scraper_command(
+                action="auto_stop_scraper",
+                status="success",
+                description="Scraper automatically stopped due to no 'New' papers left to process."
+            )
+            
+            return True
 
         selected_paper_ids = [p.id for p in new_papers]
         
diff --git a/scipaperloader/cache_utils.py b/scipaperloader/cache_utils.py
new file mode 100644
index 0000000..b8743fe
--- /dev/null
+++ b/scipaperloader/cache_utils.py
@@ -0,0 +1,61 @@
+"""
+Utility module for cache management in the SciPaperLoader application.
+It manages the hourly quota cache, which is held in module-level (per-process) state.
+"""
+from datetime import datetime
+from .models import ActivityLog
+
+# Module-level cache for the hourly quota (state is local to this process)
+HOURLY_QUOTA_CACHE = {
+    'hour': None,  # Hour of day (datetime.now().hour) the quota was calculated for
+    'quota': None,  # Value returned by the calculate function for that hour
+    'last_config_update': None,  # When the cache was last filled; None = invalidated
+}
+
+def invalidate_hourly_quota_cache():
+    """Mark the hourly quota cache as stale after a configuration change."""
+    # Clearing the timestamp is sufficient: get_cached_hourly_quota treats a
+    # missing 'last_config_update' as a miss and recalculates on its next call.
+    HOURLY_QUOTA_CACHE.update(last_config_update=None)
+
+    ActivityLog.log_scraper_activity(
+        description="Hourly quota cache was invalidated due to configuration changes",
+        status="info",
+        action="cache_invalidated",
+    )
+
+def get_cached_hourly_quota(calculate_function):
+    """
+    Return the hourly quota, recalculating when the cached entry is stale.
+
+    The entry is stale when it was invalidated, never filled, or filled in a
+    different hour or on a different calendar day than the current one.
+
+    Args:
+        calculate_function: Zero-argument callable returning the quota.
+
+    Returns:
+        The cached or freshly calculated result of calculate_function.
+    """
+    now = datetime.now()  # single reading so hour, date and timestamp agree
+    cached_at = HOURLY_QUOTA_CACHE['last_config_update']  # None = invalidated
+
+    # Hour-of-day alone repeats every 24 hours, so the fill date is compared
+    # as well; otherwise yesterday's entry for the same hour looks fresh.
+    if (cached_at is not None and
+            cached_at.date() == now.date() and
+            HOURLY_QUOTA_CACHE['hour'] == now.hour and
+            HOURLY_QUOTA_CACHE['quota'] is not None):
+        return HOURLY_QUOTA_CACHE['quota']
+
+    quota = calculate_function()
+    HOURLY_QUOTA_CACHE['hour'] = now.hour
+    HOURLY_QUOTA_CACHE['quota'] = quota
+    HOURLY_QUOTA_CACHE['last_config_update'] = now
+
+    ActivityLog.log_scraper_activity(
+        action="cache_updated",
+        status="info",
+        description=f"Hourly quota cache updated for hour {now.hour}: {quota} papers"
+    )
+    return quota