Compare commits

...

5 Commits

Author SHA1 Message Date
Michael Beck
243e24e100 fixes quota recalculation 2025-05-23 22:22:49 +02:00
Michael Beck
74e713e8a6 adds option to set paper status when adding dummy papers 2025-05-23 21:36:41 +02:00
Michael Beck
4f0539f4b0 changes config nav to bootstrap pills 2025-05-23 21:11:40 +02:00
Michael Beck
f42be483d6 adds db config page and an option to add test papers 2025-05-23 20:03:39 +02:00
Michael Beck
36ba835980 adds cache management 2025-05-23 19:07:40 +02:00
9 changed files with 506 additions and 109 deletions

View File

@ -5,7 +5,15 @@ from ..db import db
from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig, PaperMetadata
from ..defaults import MAX_VOLUME
import os # Import os for path validation
import sys
from scipaperloader.scrapers import __path__ as scrapers_path
# Import the cache invalidation function from our new module
from ..cache_utils import invalidate_hourly_quota_cache
import random
from datetime import datetime, timedelta
from uuid import uuid4
bp = Blueprint("config", __name__, url_prefix="/config")
@ -41,6 +49,19 @@ def _update_volume(new_volume):
)
db.session.commit()
# Invalidate and recalculate the hourly quota cache
try:
# Import the calculation function from the scraper module
from ..blueprints.scraper import calculate_papers_for_current_hour
invalidate_hourly_quota_cache(calculate_papers_for_current_hour)
except Exception as e:
# Log the error but don't fail the update
ActivityLog.log_error(
error_message=f"Error invalidating hourly quota cache: {str(e)}",
source="_update_volume"
)
return True, "Volume updated successfully!", volume_config
except (ValueError, TypeError) as e:
@ -166,6 +187,19 @@ def _update_schedule(schedule_data):
)
db.session.commit()
# Invalidate hourly quota cache and immediately recalculate
try:
# Import the calculation function from the scraper module
from ..blueprints.scraper import calculate_papers_for_current_hour
invalidate_hourly_quota_cache(calculate_papers_for_current_hour)
except Exception as e:
# Log the error but don't fail the update
ActivityLog.log_error(
error_message=f"Error invalidating hourly quota cache: {str(e)}",
source="_update_schedule"
)
return True, "Schedule updated successfully!"
except Exception as e:
@ -234,12 +268,118 @@ def schedule():
app_title="Configuration"
)
@bp.route("/database")
def database():
    """Render the configuration page with the database tab selected."""
    template_context = {
        "active_tab": "database",
        "app_title": "Configuration",
    }
    return render_template("config/index.html.jinja", **template_context)
@bp.route("/generate_test_papers", methods=["POST"])
def generate_test_papers():
    """Generate random test papers and store them in the database.

    Reads two fields from the submitted form:

    * ``paper_count`` -- how many papers to create. Clamped to 1..1000;
      falls back to 100 when the value is missing or not an integer.
    * ``dummy_paper_status`` -- ``"new"`` gives every paper the status
      ``"New"``; any other value draws a random status for *each* paper.

    Papers whose randomly generated DOI already exists are skipped, so fewer
    papers than requested may be created. The outcome is reported through a
    flash message and an ActivityLog entry.

    Returns:
        A redirect to the database configuration page, on success and on
        failure alike.
    """
    statuses = ["New", "Pending", "Done", "Failed"]
    batch_size = 100

    try:
        # Requested number of papers, clamped to the range the form allows.
        try:
            paper_count = int(request.form.get("paper_count", "100"))
        except (ValueError, TypeError):
            paper_count = 100
        paper_count = max(1, min(paper_count, 1000))

        # Only the explicit "new" option pins the status; everything else is
        # treated as "randomly mixed" and resolved per paper inside the loop.
        new_only = request.form.get("dummy_paper_status") == "new"

        # Base directory used to build the fake file paths.
        download_path = DownloadPathConfig.get_path()

        # Sample journal names for realistic test data
        journals = [
            "Nature", "Science", "Cell", "PNAS", "Journal of Biological Chemistry",
            "IEEE Transactions on Neural Networks", "Artificial Intelligence",
            "Machine Learning", "Neural Computation", "Journal of Machine Learning Research",
            "Journal of Artificial Intelligence Research", "Data Mining and Knowledge Discovery",
            "Pattern Recognition", "Neural Networks", "Journal of Physical Chemistry",
        ]
        # Sample paper types
        paper_types = ["Article", "Review", "Conference", "Preprint", "Book Chapter"]
        # Sample languages
        languages = ["English", "German", "French", "Chinese", "Spanish", "Japanese"]

        papers_added = 0
        for i in range(paper_count):
            # Generate a random DOI and skip it if it is already taken.
            doi = f"10.{random.randint(1000, 9999)}/{uuid4().hex[:8]}"
            if PaperMetadata.query.filter_by(doi=doi).first():
                continue

            # Random publishing date within the last 5 years
            days_ago = random.randint(0, 5 * 365)
            pub_date = datetime.now() - timedelta(days=days_ago)

            # Drawn per paper so that "Randomly Mixed" really yields a mix
            # instead of one random status shared by the whole batch.
            status = "New" if new_only else random.choice(statuses)

            paper = PaperMetadata(
                title=f"Test Paper {i+1}: {''.join(random.choice('ABCDEFGHIJKLMNOPQRSTUVWXYZ') for _ in range(5))}",
                doi=doi,
                alt_id=f"ALT-{random.randint(10000, 99999)}",
                issn=f"{random.randint(1000, 9999)}-{random.randint(1000, 9999)}",
                journal=random.choice(journals),
                type=random.choice(paper_types),
                language=random.choice(languages),
                published_online=pub_date.date(),
                status=status,
                file_path=f"{download_path}/test_paper_{i+1}.pdf" if random.random() > 0.3 else None,
                error_msg="Download failed: connection timeout" if random.random() < 0.1 else None,
                created_at=datetime.now() - timedelta(days=random.randint(0, 30)),
            )
            db.session.add(paper)
            papers_added += 1

            # Commit after every full batch of added papers. Counting added
            # papers (not the loop index) avoids a commit on the very first
            # iteration and keeps batches full when DOIs are skipped.
            if papers_added % batch_size == 0:
                db.session.commit()

        # Final commit for the last, possibly partial, batch.
        db.session.commit()

        # Log the action using the existing log_import_activity method
        ActivityLog.log_import_activity(
            action="generate_test_papers",
            status="success",
            description=f"Generated {papers_added} test papers for the database"
        )
        flash(f"Successfully generated {papers_added} test papers.", "success")
    except Exception as e:
        # Only the uncommitted batch is rolled back; batches committed
        # earlier in the loop remain in the database.
        db.session.rollback()
        flash(f"Failed to generate test papers: {str(e)}", "error")
        ActivityLog.log_error(
            error_message=f"Failed to generate test papers: {str(e)}",
            exception=e,
            source="config.generate_test_papers"
        )

    return redirect(url_for("config.database"))
# Add new route to handle general settings form
@bp.route("/update/general", methods=["POST"])
def update_general():
"""Update general configuration (Volume and Download Path)."""

View File

@ -10,6 +10,7 @@ from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory,
from ..db import db
from ..celery import celery
from ..defaults import MAX_VOLUME
from ..cache_utils import get_cached_hourly_quota, invalidate_hourly_quota_cache
from celery.schedules import crontab
from sqlalchemy import func
from scipaperloader.scrapers.factory import get_scraper, get_available_scrapers
@ -360,6 +361,9 @@ def update_config():
description="Updated scraper volume"
)
# Invalidate hourly quota cache when volume changes
invalidate_hourly_quota_cache()
db.session.commit()
except (ValueError, TypeError):
return jsonify({
@ -441,7 +445,8 @@ def dummy_scheduled_scraper():
)
return False # Stop if not active/paused
papers_to_select = calculate_papers_for_current_hour()
# Use cached hourly quota instead of calculating each time
papers_to_select = get_cached_hourly_quota(calculate_papers_for_current_hour)
if papers_to_select <= 0:
ActivityLog.log_scraper_activity(
@ -463,10 +468,17 @@ def dummy_scheduled_scraper():
ActivityLog.log_scraper_activity(
action="dummy_scheduled_scraper_info",
status="info",
description="No 'New' papers found in the database to select."
description="No 'New' papers found in the database. Stopping scraper."
)
# Optional: Depending on requirements, you might want to check later
# or handle this case differently. For now, we just log and exit.
# Stop the scraper since there are no more papers to process
ScraperState.set_active(False)
ActivityLog.log_scraper_command(
action="auto_stop_scraper",
status="success",
description="Scraper automatically stopped due to no 'New' papers left to process."
)
return True
selected_paper_ids = [p.id for p in new_papers]

View File

@ -0,0 +1,81 @@
"""
Utility module for cache management in the SciPaperLoader application.
This module contains functions for managing the hourly quota cache and other caching mechanisms.
"""
from datetime import datetime
from .models import ActivityLog
# Global cache for hourly quota.
# NOTE(review): this is a module-level dict, so each Python process that
# imports this module holds its own copy -- confirm that invalidation done in
# one process is expected to reach the process that reads the cache.
HOURLY_QUOTA_CACHE = {
'hour': None, # Hour of day (datetime.now().hour) the cached quota was calculated in
'quota': None, # Value returned by the calculation function; None until first calculated
'last_config_update': None, # None after invalidation; otherwise a datetime taken when the quota was (re)calculated
}
def invalidate_hourly_quota_cache(calculate_function=None):
    """
    Mark the hourly quota cache as stale after a configuration change.

    Args:
        calculate_function (callable, optional): When given, it is called right
            away and its result is stored as the quota for the current hour.
            When omitted, the cache simply stays invalid until the next
            get_cached_hourly_quota() call recalculates it.
    """
    cache = HOURLY_QUOTA_CACHE
    # A missing timestamp is what get_cached_hourly_quota() treats as "stale".
    cache['last_config_update'] = None

    if not calculate_function:
        # Nothing to recalculate with: only record the invalidation.
        ActivityLog.log_scraper_activity(
            action="cache_invalidated",
            status="info",
            description="Hourly quota cache was invalidated due to configuration changes"
        )
        return

    # Recalculate immediately and store the fresh value.
    hour_now = datetime.now().hour
    fresh_quota = calculate_function()
    cache['hour'] = hour_now
    cache['quota'] = fresh_quota
    cache['last_config_update'] = datetime.now()

    ActivityLog.log_scraper_activity(
        action="cache_recalculated",
        status="info",
        description=f"Hourly quota immediately recalculated after config change: {fresh_quota} papers"
    )
def get_cached_hourly_quota(calculate_function):
    """
    Return the cached hourly quota if it is still valid, otherwise recalculate.

    The cached value is reused only when all of the following hold:
    a quota is stored, the cache has not been invalidated, and the value was
    calculated during the current hour of the current day. Comparing the
    calendar date as well as the hour prevents a value cached at, say, 14:xx
    yesterday from being served at 14:xx today.

    Args:
        calculate_function: Callable without arguments that returns the quota;
            called only when recalculation is needed.

    Returns:
        int: Number of papers to download this hour
    """
    # One clock reading, so the stored hour and timestamp always agree.
    now = datetime.now()
    current_hour = now.hour
    cache = HOURLY_QUOTA_CACHE
    calculated_at = cache['last_config_update']

    cache_is_fresh = (
        calculated_at is not None
        and cache['quota'] is not None
        and cache['hour'] == current_hour
        and calculated_at.date() == now.date()
    )
    if cache_is_fresh:
        # Use cached value
        return cache['quota']

    # Recalculate and update cache
    quota = calculate_function()
    cache['hour'] = current_hour
    cache['quota'] = quota
    cache['last_config_update'] = now

    # Log cache update
    ActivityLog.log_scraper_activity(
        action="cache_updated",
        status="info",
        description=f"Hourly quota cache updated for hour {current_hour}: {quota} papers"
    )
    return quota

View File

@ -0,0 +1,87 @@
<!-- Database Configuration Tab -->
<div class="tab-pane active">
<div class="config-form">
<div class="card">
<div class="card-header">
<h5>Database Configuration</h5>
</div>
<div class="card-body">
<!-- include flash messages template -->
{% include "partials/flash_messages.html.jinja" %}
<!-- Generate Test Papers Section -->
<div class="row mt-4">
<div class="col-12">
<div class="card border-primary">
<div class="card-header bg-primary text-white">
<h5>Generate Test Papers</h5>
</div>
<div class="card-body">
<div class="form-section">
<h6>Add Test Papers for Testing</h6>
<p class="text-muted">Generate random test papers to populate your database for
testing purposes.</p>
<form method="post" action="{{ url_for('config.generate_test_papers') }}"
class="mt-3">
<div class="form-group row">
<label for="paper_count" class="col-sm-3 col-form-label">Number of
Papers:</label>
<div class="col-sm-4">
<input type="number" class="form-control" id="paper_count"
name="paper_count" min="1" max="1000" value="100" required>
<small class="form-text text-muted">Enter a number between 1 and
1000</small>
</div>
</div>
<div class="form-group row">
<label for="dummy_paper_status" class="col-sm-3 col-form-label">Paper
Status:</label>
<div class="col-sm-4">
<select id="dummy_paper_status" class="form-control"
name="dummy_paper_status">
<option value="new">New Only</option>
<option value="mixed-random">Randomly Mixed</option>
</select>
</div>
<div class="col-sm-5">
<button type="submit" class="btn btn-primary">
<i class="fas fa-plus-circle"></i> Generate Test Papers
</button>
</div>
</div>
</form>
</div>
</div>
</div>
</div>
</div>
<!-- Database Management Section -->
<div class="row mt-4">
<div class="col-12">
<div class="card border-danger">
<div class="card-header bg-danger text-white">
<h5>Database Management</h5>
</div>
<div class="card-body">
<div class="form-section">
<h6>Delete All Papers</h6>
<p class="text-muted">This action will permanently delete all paper records from the
database. This cannot be undone.</p>
<form method="post" action="{{ url_for('config.delete_all_papers') }}" class="mt-3"
onsubmit="return confirm('WARNING: You are about to delete ALL papers from the database. This action cannot be undone. Are you sure you want to proceed?');">
<button type="submit" class="btn btn-danger">
<i class="fas fa-trash-alt"></i> Delete All Papers
</button>
</form>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>

View File

@ -91,30 +91,6 @@
</div>
</div>
<!-- Database Management Section -->
<div class="row mt-4">
<div class="col-12">
<div class="card border-danger">
<div class="card-header bg-danger text-white">
<h5>Database Management</h5>
</div>
<div class="card-body">
<div class="form-section">
<h6>Delete All Papers</h6>
<p class="text-muted">This action will permanently delete all paper records from the
database. This cannot be undone.</p>
<form method="post" action="{{ url_for('config.delete_all_papers') }}" class="mt-3"
onsubmit="return confirm('WARNING: You are about to delete ALL papers from the database. This action cannot be undone. Are you sure you want to proceed?');">
<button type="submit" class="btn btn-danger">
<i class="fas fa-trash-alt"></i> Delete All Papers
</button>
</form>
</div>
</div>
</div>
</div>
</div>
</div>
</div>
</div>

View File

@ -28,7 +28,7 @@
<div class="container mt-4">
<h1>Configuration</h1>
<ul class="nav nav-tabs mb-4">
<ul class="nav nav-pills nav-fill mb-4">
<li class="nav-item">
<a class="nav-link {% if active_tab == 'general' %}active{% endif %}"
href="{{ url_for('config.general') }}">General</a>
@ -37,6 +37,10 @@
<a class="nav-link {% if active_tab == 'schedule' %}active{% endif %}"
href="{{ url_for('config.schedule') }}">Schedule</a>
</li>
<li class="nav-item">
<a class="nav-link {% if active_tab == 'database' %}active{% endif %}"
href="{{ url_for('config.database') }}">Database</a>
</li>
</ul>
<div class="tab-content">
@ -44,6 +48,8 @@
{% include "config/general.html.jinja" %}
{% elif active_tab == 'schedule' %}
{% include "config/schedule.html.jinja" %}
{% elif active_tab == 'database' %}
{% include "config/database.html.jinja" %}
{% endif %}
</div>
</div>

View File

@ -30,6 +30,13 @@
font-size: 0.7rem;
margin-top: 2px;
}
.weight-gradient {
width: 50px;
height: 15px;
background: linear-gradient(to right, hsl(210, 10%, 95%), hsl(210, 10%, 30%));
border-radius: 2px;
}
</style>
<script>
@ -39,63 +46,135 @@
<div x-data="scheduleManager(initialSchedule, totalVolume)" class="tab-pane active">
<div class="card">
<div class="card-header">
<div class="card-header d-flex justify-content-between">
<h5>Scheduling Configuration</h5>
<span>
<button class="btn btn-sm btn-outline-secondary" type="button" data-bs-toggle="collapse"
data-bs-target="#helpContent">
<i class="fas fa-question-circle"></i> Help
</button>
</span>
</div>
<div class="card-body">
<!-- include flash messages template -->
{% include "partials/flash_messages.html.jinja" %}
<!-- Content -->
<div class="mb-3">
<h3>How it Works</h3>
<p class="text-muted mb-0">
This page allows you to configure the daily volume of papers to be
downloaded and the hourly download weights for the papers. The weights
determine how many papers will be downloaded during each hour of the day.
The total volume (<strong x-text="volume"></strong> papers/day) is split
across all hours based on their relative weights. Each weight controls the
proportion of papers downloaded during that hour. Click to select one or
more hours below. Then assign a weight to them using the input and apply
it. Color indicates relative intensity. The total daily volume will be
split proportionally across these weights.
<strong>Don't forget to submit the changes!</strong>
</p>
<h3>Example</h3>
<p class="text-muted mb-0">
If the total volume is <strong>240 papers</strong> and hours are
<strong>weighted as 1.0, 2.0, and 3.0</strong>, they will receive
<strong>40, 80, and 120 papers</strong> respectively.
</p>
<!-- Collapsible Help Content -->
<div class="collapse mt-3" id="helpContent">
<div class="card card-body">
<ul class="nav nav-tabs" id="helpTabs" role="tablist">
<li class="nav-item" role="presentation">
<button class="nav-link active" id="calculation-tab" data-bs-toggle="tab"
data-bs-target="#calculation" type="button">Calculation</button>
</li>
<li class="nav-item" role="presentation">
<button class="nav-link" id="usage-tab" data-bs-toggle="tab" data-bs-target="#usage"
type="button">Usage</button>
</li>
<li class="nav-item" role="presentation">
<button class="nav-link" id="example-tab" data-bs-toggle="tab" data-bs-target="#example"
type="button">Example</button>
</li>
</ul>
<div class="tab-content p-3 border border-top-0 rounded-bottom">
<!-- Calculation Tab -->
<div class="tab-pane fade show active" id="calculation" role="tabpanel">
<h5>Quota Calculation</h5>
<p>Each hour's quota is calculated as:</p>
<div class="bg-light p-2 mb-2 rounded">
<code>Papers per hour = (Hour Weight ÷ Total Weight) × Daily Volume</code>
</div>
<p class="small mb-0">Changes to either volume or schedule weights will immediately
recalculate all hourly quotas.</p>
</div>
<h2 class="mt-4">Volume</h2>
<!-- Usage Instructions Tab -->
<div class="tab-pane fade" id="usage" role="tabpanel">
<h5>Usage Instructions</h5>
<ol class="mb-0">
<li>Click to select one or more hour blocks (use drag to select multiple)</li>
<li>Adjust the weight value for selected hours (0.1-5.0)</li>
<li>Click "Apply to Selected" to set the weights</li>
<li>Click "Save Schedule" to commit your changes</li>
</ol>
</div>
<div class="align-items-start flex-wrap gap-2">
<p class="text-muted">
The total volume of data to be downloaded each day is
<strong x-text="volume"></strong> papers.
<!-- Example Tab -->
<div class="tab-pane fade" id="example" role="tabpanel">
<h5>Example</h5>
<p class="mb-0">
With a daily volume of <strong>240 papers</strong> and hour weights of
<strong>1.0, 2.0, and 3.0</strong>, the distribution will be
<strong>40, 80, and 120 papers</strong> respectively
(based on ratios 1:2:3 of the total weight 6.0).
</p>
<div class="d-flex align-items-center mb-3" x-data="{ volumeValue: volume }">
<div class="input-group w-50">
<label class="input-group-text">Papers per day:</label>
<input type="number" class="form-control" x-model="volumeValue" min="1" max="{{ max_volume }}"
required />
</div>
</div>
</div>
</div>
<!-- Volume and Schedule Controls -->
<div class="row g-3 mb-3">
<!-- Daily Volume Column -->
<div class="col-md-4" x-data="{ volumeValue: volume }">
<div class="card h-100">
<div class="card-header bg-light">
<h5 class="mb-0"><i class="fas fa-chart-line"></i> Daily Volume</h5>
</div>
<div class="card-body">
<p class="lead mb-2">
<strong x-text="volume"></strong> papers/day
</p>
<div class="input-group input-group-sm mb-2">
<input type="number" class="form-control" x-model="volumeValue" min="1"
max="{{ max_volume }}" required />
<button type="button" class="btn btn-primary" @click="updateVolume()">
Update Volume
<i class="fas fa-save"></i> Update
</button>
</div>
<small class="text-muted">Range: 1-{{ max_volume }}</small>
</div>
</div>
</div>
<h2 class="mt-4">Current Schedule</h2>
<form x-data id="scheduleForm">
<!-- Legend Column -->
<div class="col-md-8">
<div class="card h-100">
<div class="card-header bg-light">
<h5 class="mb-0"><i class="fas fa-info-circle"></i> Quick Guide</h5>
</div>
<div class="card-body">
<div class="d-flex align-items-center justify-content-between mb-2">
<div class="d-flex align-items-center">
<div class="weight-gradient me-2"></div>
<small>Darker blocks = higher weight</small>
</div>
<div class="badge bg-info">Formula: (Weight ÷ Total) × Volume</div>
</div>
<div class="d-flex align-items-center small text-muted">
<div class="me-3"><i class="fas fa-mouse-pointer"></i> Click to select hours</div>
<div class="me-3"><i class="fas fa-arrows-alt-h"></i> Drag to select multiple</div>
<div><i class="fas fa-save"></i> Save after changes</div>
</div>
</div>
</div>
</div>
</div>
<!-- 24-Hour Schedule -->
<form id="scheduleForm">
<div class="card">
<div class="card-header d-flex justify-content-between align-items-center bg-light">
<h5 class="mb-0"><i class="fas fa-clock"></i> 24-Hour Schedule</h5>
<span class="badge bg-info"
x-text="selectedHours.length ? selectedHours.length + ' hours selected' : ''"></span>
</div>
<div class="card-body">
<div class="timeline mb-3" @mouseup="endDrag()" @mouseleave="endDrag()">
<template x-for="hour in Object.keys(schedule)" :key="hour">
<div class="hour-block" :id="'hour-' + hour" :data-hour="hour" :style="getBackgroundStyle(hour)"
:class="{'selected': isSelected(hour)}" @mousedown="startDrag($event, hour)"
@mouseover="dragSelect(hour)">
<div class="hour-block" :id="'hour-' + hour" :data-hour="hour"
:style="getBackgroundStyle(hour)" :class="{'selected': isSelected(hour)}"
@mousedown="startDrag($event, hour)" @mouseover="dragSelect(hour)">
<div><strong x-text="formatHour(hour)"></strong></div>
<div class="weight"><span x-text="schedule[hour]"></span></div>
<div class="papers">
@ -106,19 +185,29 @@
</template>
</div>
<div class="input-group mb-4 w-50">
<label class="input-group-text">Set Weight:</label>
<input type="number" step="0.1" min="0" max="5" x-model="newWeight" class="form-control" />
<button type="button" class="btn btn-outline-primary" @click="applyWeight()">
Apply to Selected
<div class="input-group mb-2">
<label class="input-group-text"><i class="fas fa-weight"></i> Weight</label>
<input type="number" step="0.1" min="0.1" max="5" x-model="newWeight"
class="form-control" />
<button type="button" class="btn btn-primary" @click="applyWeight()"
:disabled="selectedHours.length === 0">
Apply to <span x-text="selectedHours.length"></span> Selected
</button>
</div>
</div>
<div class="card-footer">
<div class="d-flex justify-content-between">
<a href="{{ url_for('config.general') }}" class="btn btn-outline-secondary">⬅ Back</a>
<button type="button" class="btn btn-success" @click="saveSchedule()">💾 Save Schedule</button>
<a href="{{ url_for('config.general') }}" class="btn btn-outline-secondary">
<i class="fas fa-arrow-left"></i> Back
</a>
<button type="button" class="btn btn-success" @click="saveSchedule()">
<i class="fas fa-save"></i> Save Schedule
</button>
</div>
</div>
</div>
</form>
</div>
</div>
</div>

View File

@ -16,6 +16,12 @@
<label>Status</label>
<select name="status" class="form-select">
<option value="">All</option>
{% if request.args.get('status') == 'New' %}
<option value="New" selected>New</option>
{% else %}
<option value="New">New</option>
{% endif %}
{% if request.args.get('status') == 'Pending' %}
<option value="Pending" selected>Pending</option>
{% else %}