From 36ba835980da9bf99c91b06e7e3d9a910316f64c Mon Sep 17 00:00:00 2001 From: Michael Beck Date: Fri, 23 May 2025 19:07:40 +0200 Subject: [PATCH] adds cache management --- scipaperloader/blueprints/config.py | 14 +++++++ scipaperloader/blueprints/scraper.py | 22 +++++++--- scipaperloader/cache_utils.py | 61 ++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 scipaperloader/cache_utils.py diff --git a/scipaperloader/blueprints/config.py b/scipaperloader/blueprints/config.py index 43d7e65..4a69c5e 100644 --- a/scipaperloader/blueprints/config.py +++ b/scipaperloader/blueprints/config.py @@ -5,7 +5,10 @@ from ..db import db from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig, PaperMetadata from ..defaults import MAX_VOLUME import os # Import os for path validation +import sys from scipaperloader.scrapers import __path__ as scrapers_path +# Import the cache invalidation function from our new module +from ..cache_utils import invalidate_hourly_quota_cache bp = Blueprint("config", __name__, url_prefix="/config") @@ -166,6 +169,17 @@ def _update_schedule(schedule_data): ) db.session.commit() + + # Invalidate hourly quota cache using the cache_utils module + try: + invalidate_hourly_quota_cache() + except Exception as e: + # Log the error but don't fail the update + ActivityLog.log_error( + error_message=f"Error invalidating hourly quota cache: {str(e)}", + source="_update_schedule" + ) + return True, "Schedule updated successfully!" 
except Exception as e: diff --git a/scipaperloader/blueprints/scraper.py b/scipaperloader/blueprints/scraper.py index 902a6dc..07b955a 100644 --- a/scipaperloader/blueprints/scraper.py +++ b/scipaperloader/blueprints/scraper.py @@ -10,6 +10,7 @@ from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory, from ..db import db from ..celery import celery from ..defaults import MAX_VOLUME +from ..cache_utils import get_cached_hourly_quota, invalidate_hourly_quota_cache from celery.schedules import crontab from sqlalchemy import func from scipaperloader.scrapers.factory import get_scraper, get_available_scrapers @@ -360,6 +361,9 @@ def update_config(): description="Updated scraper volume" ) + # Invalidate hourly quota cache when volume changes + invalidate_hourly_quota_cache() + db.session.commit() except (ValueError, TypeError): return jsonify({ @@ -441,7 +445,8 @@ def dummy_scheduled_scraper(): ) return False # Stop if not active/paused - papers_to_select = calculate_papers_for_current_hour() + # Use cached hourly quota instead of calculating each time + papers_to_select = get_cached_hourly_quota(calculate_papers_for_current_hour) if papers_to_select <= 0: ActivityLog.log_scraper_activity( @@ -463,11 +468,18 @@ def dummy_scheduled_scraper(): ActivityLog.log_scraper_activity( action="dummy_scheduled_scraper_info", status="info", - description="No 'New' papers found in the database to select." + description="No 'New' papers found in the database. Stopping scraper." ) - # Optional: Depending on requirements, you might want to check later - # or handle this case differently. For now, we just log and exit. - return True + + # Stop the scraper since there are no more papers to process + ScraperState.set_active(False) + ActivityLog.log_scraper_command( + action="auto_stop_scraper", + status="success", + description="Scraper automatically stopped due to no 'New' papers left to process." 
+ ) + + return True selected_paper_ids = [p.id for p in new_papers]
diff --git a/scipaperloader/cache_utils.py b/scipaperloader/cache_utils.py
new file mode 100644
index 0000000..b8743fe
--- /dev/null
+++ b/scipaperloader/cache_utils.py
@@ -0,0 +1,85 @@
+"""
+Utility module for cache management in the SciPaperLoader application.
+This module contains functions for managing the hourly quota cache and other caching mechanisms.
+"""
+from datetime import datetime
+from typing import Callable
+
+from .models import ActivityLog
+
+# Global cache for hourly quota. It lives in module memory, so every Python
+# process that imports this module holds its own independent copy.
+# NOTE(review): if the scheduled scraper runs in a different process than the
+# web app (e.g. a separate worker), an invalidation triggered by a config
+# update will not reach the scraper's copy -- confirm against the deployment.
+HOURLY_QUOTA_CACHE = {
+    'hour': None,  # Hour of day (0-23) the cached quota was calculated for
+    'quota': None,  # Calculated quota
+    # Time of the last recalculation; None forces a recalculation (set by
+    # invalidate_hourly_quota_cache when volume or schedule config changes)
+    'last_config_update': None,
+}
+
+
+def invalidate_hourly_quota_cache() -> None:
+    """Invalidate the hourly quota cache when configuration changes.
+
+    Resets ``last_config_update`` so that the next call to
+    ``get_cached_hourly_quota`` recalculates the quota, then records the
+    invalidation in the activity log.
+    """
+    HOURLY_QUOTA_CACHE['last_config_update'] = None
+
+    # Log the cache invalidation
+    ActivityLog.log_scraper_activity(
+        action="cache_invalidated",
+        status="info",
+        description="Hourly quota cache was invalidated due to configuration changes"
+    )
+
+
+def get_cached_hourly_quota(calculate_function: Callable[[], int]) -> int:
+    """
+    Get the cached hourly quota if it's still valid, or recalculate if needed.
+
+    The cached value is reused only if it was calculated during the current
+    hour of the current day and has not been invalidated since.
+
+    Args:
+        calculate_function: Function to call when recalculation is needed
+
+    Returns:
+        int: Number of papers to download this hour
+    """
+    # Take a single timestamp so the hour, the date and the stored update time
+    # cannot disagree when the call straddles an hour boundary.
+    now = datetime.now()
+    current_hour = now.hour
+    cached_at = HOURLY_QUOTA_CACHE['last_config_update']
+
+    cache_is_fresh = (
+        HOURLY_QUOTA_CACHE['quota'] is not None
+        and cached_at is not None
+        and HOURLY_QUOTA_CACHE['hour'] == current_hour
+        # The hour alone repeats every 24h; the same hour on an earlier day
+        # must not count as a hit.
+        and cached_at.date() == now.date()
+    )
+    if cache_is_fresh:
+        # Use cached value
+        return HOURLY_QUOTA_CACHE['quota']
+
+    # Recalculate and update cache
+    quota = calculate_function()
+    HOURLY_QUOTA_CACHE['hour'] = current_hour
+    HOURLY_QUOTA_CACHE['quota'] = quota
+    HOURLY_QUOTA_CACHE['last_config_update'] = now
+
+    # Log cache update
+    ActivityLog.log_scraper_activity(
+        action="cache_updated",
+        status="info",
+        description=f"Hourly quota cache updated for hour {current_hour}: {quota} papers"
+    )
+
+    return quota