adds cache management

This commit is contained in:
Michael Beck 2025-05-23 19:07:40 +02:00
parent 987c76969b
commit 36ba835980
3 changed files with 92 additions and 5 deletions

View File

@ -5,7 +5,10 @@ from ..db import db
from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig, PaperMetadata
from ..defaults import MAX_VOLUME
import os # Import os for path validation
import sys
from scipaperloader.scrapers import __path__ as scrapers_path
# Import the cache invalidation function from our new module
from ..cache_utils import invalidate_hourly_quota_cache
bp = Blueprint("config", __name__, url_prefix="/config")
@ -166,6 +169,17 @@ def _update_schedule(schedule_data):
)
db.session.commit()
# Invalidate hourly quota cache using the cache_utils module
try:
invalidate_hourly_quota_cache()
except Exception as e:
# Log the error but don't fail the update
ActivityLog.log_error(
error_message=f"Error invalidating hourly quota cache: {str(e)}",
source="_update_schedule"
)
return True, "Schedule updated successfully!"
except Exception as e:

View File

@ -10,6 +10,7 @@ from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory,
from ..db import db
from ..celery import celery
from ..defaults import MAX_VOLUME
from ..cache_utils import get_cached_hourly_quota, invalidate_hourly_quota_cache
from celery.schedules import crontab
from sqlalchemy import func
from scipaperloader.scrapers.factory import get_scraper, get_available_scrapers
@ -360,6 +361,9 @@ def update_config():
description="Updated scraper volume"
)
# Invalidate hourly quota cache when volume changes
invalidate_hourly_quota_cache()
db.session.commit()
except (ValueError, TypeError):
return jsonify({
@ -441,7 +445,8 @@ def dummy_scheduled_scraper():
)
return False # Stop if not active/paused
papers_to_select = calculate_papers_for_current_hour()
# Use cached hourly quota instead of calculating each time
papers_to_select = get_cached_hourly_quota(calculate_papers_for_current_hour)
if papers_to_select <= 0:
ActivityLog.log_scraper_activity(
@ -463,11 +468,18 @@ def dummy_scheduled_scraper():
ActivityLog.log_scraper_activity(
action="dummy_scheduled_scraper_info",
status="info",
description="No 'New' papers found in the database to select."
description="No 'New' papers found in the database. Stopping scraper."
)
# Optional: Depending on requirements, you might want to check later
# or handle this case differently. For now, we just log and exit.
return True
# Stop the scraper since there are no more papers to process
ScraperState.set_active(False)
ActivityLog.log_scraper_command(
action="auto_stop_scraper",
status="success",
description="Scraper automatically stopped due to no 'New' papers left to process."
)
return True
selected_paper_ids = [p.id for p in new_papers]

View File

@ -0,0 +1,61 @@
"""
Utility module for cache management in the SciPaperLoader application.
This module contains functions for reading and invalidating the in-memory hourly quota cache.
"""
from datetime import datetime
from .models import ActivityLog
# Module-level cache for the hourly download quota.
# NOTE(review): as a plain module global this dict exists once per Python
# process; if the web app and the task worker run as separate processes they
# would not share it -- confirm against the deployment.
HOURLY_QUOTA_CACHE = dict(
    # Hour of day (0-23) for which the cached quota was calculated.
    hour=None,
    # Quota returned by the calculation callback for that hour.
    quota=None,
    # Despite the name, this holds the time the cache was last refreshed;
    # it is reset to None when the cache is invalidated.
    last_config_update=None,
)
def invalidate_hourly_quota_cache():
    """Mark the hourly quota cache as stale and record it in the activity log.

    Only the ``last_config_update`` entry is reset to ``None``; the cached
    ``hour`` and ``quota`` entries are left untouched. ``get_cached_hourly_quota``
    recalculates whenever ``last_config_update`` is ``None``.
    """
    # The dict is mutated in place, so no ``global`` declaration is required.
    HOURLY_QUOTA_CACHE.update(last_config_update=None)

    # Leave a trace of the invalidation in the activity log.
    log_fields = {
        "action": "cache_invalidated",
        "status": "info",
        "description": "Hourly quota cache was invalidated due to configuration changes",
    }
    ActivityLog.log_scraper_activity(**log_fields)
def get_cached_hourly_quota(calculate_function):
    """
    Return the hourly quota, recalculating it only when the cache is stale.

    The cached value is reused only while all of the following hold:

    * a quota is stored (``quota`` is not ``None``),
    * it was stored for the current hour of day,
    * it was stored on the current calendar day, and
    * the cache has not been invalidated since (``last_config_update`` is
      not ``None``).

    Otherwise ``calculate_function`` is called, its result is stored together
    with the current hour and timestamp, and the refresh is written to the
    activity log.

    Args:
        calculate_function: Zero-argument callable invoked when a
            recalculation is needed.

    Returns:
        The quota produced by ``calculate_function`` (number of papers to
        download this hour), either freshly calculated or from the cache.
    """
    # Read the clock once so the hour key and the refresh timestamp always
    # agree, even when the call straddles an hour boundary.
    now = datetime.now()
    current_hour = now.hour
    refreshed_at = HOURLY_QUOTA_CACHE['last_config_update']

    cache_is_fresh = (
        HOURLY_QUOTA_CACHE['quota'] is not None
        and HOURLY_QUOTA_CACHE['hour'] == current_hour
        # None means the cache was invalidated or has never been filled.
        and refreshed_at is not None
        # The hour key only ranges over 0-23, so a value stored at this hour
        # on an earlier day would otherwise be mistaken for a current one.
        and refreshed_at.date() == now.date()
    )
    if cache_is_fresh:
        return HOURLY_QUOTA_CACHE['quota']

    # Recalculate and update cache
    quota = calculate_function()
    HOURLY_QUOTA_CACHE['hour'] = current_hour
    HOURLY_QUOTA_CACHE['quota'] = quota
    HOURLY_QUOTA_CACHE['last_config_update'] = now

    # Log cache update
    ActivityLog.log_scraper_activity(
        action="cache_updated",
        status="info",
        description=f"Hourly quota cache updated for hour {current_hour}: {quota} papers"
    )
    return quota