
"""Configuration management blueprint."""
from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify, current_app
from ..db import db
# Import the new model
from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig, PaperMetadata
from ..defaults import MAX_VOLUME
import os # Import os for path validation
import sys
from scipaperloader.scrapers import __path__ as scrapers_path
# Import the cache invalidation function from our new module
from ..cache_utils import invalidate_hourly_quota_cache
import random
from datetime import datetime, timedelta
from uuid import uuid4
bp = Blueprint("config", __name__, url_prefix="/config")
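# All routes below are mounted under the /config prefix (e.g. /config/general, /config/schedule, /config/database).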
# Helper functions for configuration updates
def _update_volume(new_volume):
"""
Helper function to update volume configuration.
Args:
new_volume (float): The new volume value
Returns:
tuple: (success, message, volume_config)
"""
try:
new_volume = float(new_volume)
        if new_volume <= 0 or new_volume > MAX_VOLUME:
            return False, f"Volume must be greater than 0 and at most {MAX_VOLUME}", None
volume_config = VolumeConfig.query.first()
if not volume_config:
volume_config = VolumeConfig(volume=new_volume)
db.session.add(volume_config)
else:
old_value = volume_config.volume
volume_config.volume = new_volume
ActivityLog.log_config_change(
config_key="scraper_volume",
old_value=old_value,
new_value=new_volume,
description="Updated scraper volume"
)
db.session.commit()
return True, "Volume updated successfully!", volume_config
    except Exception as e:
        db.session.rollback()
        return False, f"Error updating volume: {str(e)}", None
# Add helper for download path
def _update_download_path(new_path):
"""
Helper function to update download path configuration.
Args:
new_path (str): The new download path
Returns:
tuple: (success, message, download_path_config)
"""
try:
# Basic validation: check if it's a non-empty string
if not new_path or not isinstance(new_path, str):
return False, "Download path cannot be empty.", None
# --- Add more validation like checking if path exists or is writable ---
# Check if the path exists and is a directory
if not os.path.isdir(new_path):
# Try to create it if it doesn't exist
try:
os.makedirs(new_path, exist_ok=True)
ActivityLog.log_scraper_activity(
action="create_directory",
status="info",
description=f"Created download directory: {new_path}"
)
except OSError as e:
ActivityLog.log_error(
error_message=f"Failed to create download directory: {new_path}, Error: {str(e)}",
source="update_download_path"
)
return False, f"Path '{new_path}' is not a valid directory and could not be created: {e}", None
# Check if the path is readable
if not os.access(new_path, os.R_OK):
ActivityLog.log_error(
error_message=f"Download path '{new_path}' is not readable.",
source="check_directory_permissions"
)
return False, f"Path '{new_path}' exists but is not readable by the application.", None
# Check if the path is writable
if not os.access(new_path, os.W_OK):
ActivityLog.log_error(
error_message=f"Download path '{new_path}' is not writable.",
source="check_directory_permissions"
)
return False, f"Path '{new_path}' exists but is not writable by the application.", None
# --- End of validation ---
config = DownloadPathConfig.query.first()
if not config:
config = DownloadPathConfig(path=new_path)
db.session.add(config)
else:
old_value = config.path
config.path = new_path
ActivityLog.log_config_change(
config_key="download_path",
old_value=old_value,
new_value=new_path,
description="Updated download path"
)
db.session.commit()
return True, "Download path updated successfully!", config
except Exception as e:
db.session.rollback()
return False, f"Error updating download path: {str(e)}", None
def _update_schedule(schedule_data):
"""
Helper function to update schedule configuration.
Args:
schedule_data (dict): Dictionary with hour:weight pairs
Returns:
tuple: (success, message)
"""
try:
# Validate all entries first
for hour_str, weight in schedule_data.items():
try:
hour = int(hour_str)
weight = float(weight)
if hour < 0 or hour > 23:
return False, f"Hour value must be between 0 and 23, got {hour}"
if weight < 0.1 or weight > 5:
return False, f"Weight for hour {hour} must be between 0.1 and 5, got {weight}"
except ValueError:
return False, f"Invalid data format for hour {hour_str}"
# Update schedule after validation
for hour_str, weight in schedule_data.items():
hour = int(hour_str)
weight = float(weight)
config = ScheduleConfig.query.get(hour)
if not config:
config = ScheduleConfig(hour=hour, weight=weight)
db.session.add(config)
else:
old_value = config.weight
config.weight = weight
ActivityLog.log_config_change(
config_key=f"schedule_hour_{hour}",
old_value=old_value,
new_value=weight,
description=f"Updated schedule weight for hour {hour}"
)
db.session.commit()
# Invalidate hourly quota cache using the cache_utils module
try:
invalidate_hourly_quota_cache()
except Exception as e:
# Log the error but don't fail the update
ActivityLog.log_error(
error_message=f"Error invalidating hourly quota cache: {str(e)}",
source="_update_schedule"
)
return True, "Schedule updated successfully!"
except Exception as e:
db.session.rollback()
return False, f"Error updating schedule: {str(e)}"
@bp.route("/")
@bp.route("/general")
def general():
"""Show general configuration page."""
volume_config = VolumeConfig.query.first()
if not volume_config:
volume_config = VolumeConfig(volume=100) # Default value
db.session.add(volume_config)
db.session.commit()
# Fetch download path config
download_path_config = DownloadPathConfig.query.first()
if not download_path_config:
download_path_config = DownloadPathConfig() # Use default from model
db.session.add(download_path_config)
db.session.commit()
return render_template(
"config/index.html.jinja",
active_tab="general",
volume_config=volume_config,
download_path_config=download_path_config, # Pass to template
max_volume=MAX_VOLUME,
app_title="Configuration"
)
@bp.route("/schedule")
def schedule():
"""Show schedule configuration page."""
# Ensure we have schedule config for all hours
existing_hours = {record.hour: record for record in ScheduleConfig.query.all()}
schedule_config = {}
for hour in range(24):
if hour in existing_hours:
schedule_config[hour] = existing_hours[hour].weight
else:
# Create default schedule entry (weight 1.0)
new_config = ScheduleConfig(hour=hour, weight=1.0)
db.session.add(new_config)
schedule_config[hour] = 1.0
if len(existing_hours) < 24:
db.session.commit()
volume_config = VolumeConfig.query.first()
if not volume_config:
volume_config = VolumeConfig(volume=100) # Default value
db.session.add(volume_config)
db.session.commit()
return render_template(
"config/index.html.jinja",
active_tab="schedule",
schedule=schedule_config,
volume=volume_config.volume,
max_volume=MAX_VOLUME,
app_title="Configuration"
)
@bp.route("/database")
def database():
"""Show database configuration page."""
return render_template(
"config/index.html.jinja",
active_tab="database",
app_title="Configuration"
)
@bp.route("/generate_test_papers", methods=["POST"])
def generate_test_papers():
"""Generate random test papers for the database."""
try:
# Get the requested number of papers (with validation)
try:
paper_count = int(request.form.get("paper_count", "100"))
if paper_count < 1:
paper_count = 1
elif paper_count > 1000:
paper_count = 1000
except (ValueError, TypeError):
paper_count = 100
        # Get the requested status: "new" marks every paper as "New", otherwise pick one status at random for the batch
        dummy_paper_status = request.form.get("dummy_paper_status")
        if dummy_paper_status == "new":
            dummy_paper_status = "New"
        else:
            dummy_paper_status = random.choice(["New", "Pending", "Done", "Failed"])
# Get the download path for file paths
download_path = DownloadPathConfig.get_path()
# Sample journal names for realistic test data
journals = [
"Nature", "Science", "Cell", "PNAS", "Journal of Biological Chemistry",
"IEEE Transactions on Neural Networks", "Artificial Intelligence",
"Machine Learning", "Neural Computation", "Journal of Machine Learning Research",
"Journal of Artificial Intelligence Research", "Data Mining and Knowledge Discovery",
"Pattern Recognition", "Neural Networks", "Journal of Physical Chemistry"
]
# Sample paper types
paper_types = ["Article", "Review", "Conference", "Preprint", "Book Chapter"]
# Sample languages
languages = ["English", "German", "French", "Chinese", "Spanish", "Japanese"]
# Generate random papers
papers_added = 0
for i in range(paper_count):
# Generate a random DOI
doi = f"10.{random.randint(1000, 9999)}/{uuid4().hex[:8]}"
# Skip if DOI already exists
if PaperMetadata.query.filter_by(doi=doi).first():
continue
# Random publishing date within the last 5 years
days_ago = random.randint(0, 5 * 365)
pub_date = datetime.now() - timedelta(days=days_ago)
# Create paper
paper = PaperMetadata(
title=f"Test Paper {i+1}: {''.join(random.choice('ABCDEFGHIJKLMNOPQRSTUVWXYZ') for _ in range(5))}",
doi=doi,
alt_id=f"ALT-{random.randint(10000, 99999)}",
issn=f"{random.randint(1000, 9999)}-{random.randint(1000, 9999)}",
journal=random.choice(journals),
type=random.choice(paper_types),
language=random.choice(languages),
published_online=pub_date.date(),
status=dummy_paper_status,
file_path=f"{download_path}/test_paper_{i+1}.pdf" if random.random() > 0.3 else None,
error_msg="Download failed: connection timeout" if random.random() < 0.1 else None,
created_at=datetime.now() - timedelta(days=random.randint(0, 30))
)
db.session.add(paper)
papers_added += 1
            # Commit in batches of 100 to keep the session small
            if (i + 1) % 100 == 0:
                db.session.commit()
# Final commit
db.session.commit()
# Log the action using the existing log_import_activity method
ActivityLog.log_import_activity(
action="generate_test_papers",
status="success",
description=f"Generated {papers_added} test papers for the database"
)
flash(f"Successfully generated {papers_added} test papers.", "success")
except Exception as e:
db.session.rollback()
flash(f"Failed to generate test papers: {str(e)}", "error")
ActivityLog.log_error(
error_message=f"Failed to generate test papers: {str(e)}",
exception=e,
source="config.generate_test_papers"
)
return redirect(url_for("config.database"))
@bp.route("/update/general", methods=["POST"])
def update_general():
"""Update general configuration (Volume and Download Path)."""
volume_success, volume_message = True, ""
path_success, path_message = True, ""
# Update Volume
new_volume = request.form.get("total_volume")
if new_volume is not None:
volume_success, volume_message, _ = _update_volume(new_volume)
if volume_success:
flash(volume_message, "success")
else:
flash(volume_message, "error")
# Update Download Path
new_path = request.form.get("download_path")
if new_path is not None:
path_success, path_message, _ = _update_download_path(new_path)
if path_success:
flash(path_message, "success")
else:
flash(path_message, "error")
return redirect(url_for("config.general"))
@bp.route("/update/schedule", methods=["POST"])
def update_schedule():
"""Update schedule configuration."""
schedule_data = {}
for hour in range(24):
key = f"hour_{hour}"
if key not in request.form:
flash(f"Missing data for hour {hour}", "error")
return redirect(url_for("config.schedule"))
schedule_data[str(hour)] = request.form.get(key, 0)
success, message = _update_schedule(schedule_data)
if success:
flash(message, "success")
else:
flash(message, "error")
return redirect(url_for("config.schedule"))
@bp.route("/update/scraper_module", methods=["POST"])
def update_scraper_module():
"""Update the scraper module configuration."""
from ..models import ScraperModuleConfig
new_scraper_module = request.form.get("scraper_module")
if not new_scraper_module:
flash("Scraper module cannot be empty.", "error")
return redirect(url_for("config.general"))
# Validate that the module exists and is valid
from scipaperloader.scrapers.factory import get_available_scrapers
available_modules = [m["name"] for m in get_available_scrapers()]
if new_scraper_module not in available_modules:
flash(f"Invalid scraper module: {new_scraper_module}", "error")
return redirect(url_for("config.general"))
# Update the database configuration
ScraperModuleConfig.set_module(new_scraper_module)
flash(f"Scraper module updated to '{new_scraper_module}'.", "success")
return redirect(url_for("config.general"))
@bp.context_processor
def inject_scraper_modules():
"""Inject available scraper modules into the template context."""
from scipaperloader.scrapers.factory import get_available_scrapers
from ..models import ScraperModuleConfig
available_scrapers = get_available_scrapers()
current_module = ScraperModuleConfig.get_current_module()
return {
"available_scraper_modules": [s["name"] for s in available_scrapers],
"current_scraper_module": current_module,
"scraper_details": {s["name"]: s for s in available_scrapers}
}
@bp.route("/api/schedule/stats")
def schedule_stats():
"""Get statistics about the current schedule configuration."""
volume_config = VolumeConfig.query.first()
if not volume_config:
return jsonify({"error": "No volume configuration found"})
total_volume = volume_config.volume
schedule_configs = ScheduleConfig.query.all()
if not schedule_configs:
return jsonify({"error": "No schedule configuration found"})
# Calculate total weight
total_weight = sum(config.weight for config in schedule_configs)
# Calculate papers per hour
papers_per_hour = {}
hourly_weights = {}
for config in schedule_configs:
weight_ratio = config.weight / total_weight if total_weight > 0 else 0
papers = weight_ratio * total_volume
papers_per_hour[config.hour] = papers
hourly_weights[config.hour] = config.weight
return jsonify({
"total_volume": total_volume,
"total_weight": total_weight,
"papers_per_hour": papers_per_hour,
"hourly_weights": hourly_weights
})
@bp.route("/api/update_config", methods=["POST"])
def api_update_config():
"""API endpoint to update configuration."""
    data = request.get_json(silent=True)
    if data is None:
        return jsonify({"success": False, "message": "Request body must be JSON"}), 400
response = {"success": True, "updates": []}
try:
# Update volume if provided
if "volume" in data:
success, message, _ = _update_volume(data["volume"])
response["updates"].append({
"type": "volume",
"success": success,
"message": message
})
if not success:
response["success"] = False
# Update download path if provided
if "download_path" in data:
success, message, _ = _update_download_path(data["download_path"])
response["updates"].append({
"type": "download_path",
"success": success,
"message": message
})
if not success:
response["success"] = False
# Update schedule if provided
if "schedule" in data:
success, message = _update_schedule(data["schedule"])
response["updates"].append({
"type": "schedule",
"success": success,
"message": message
})
if not success:
response["success"] = False
return jsonify(response)
except Exception as e:
db.session.rollback()
return jsonify({
"success": False,
"message": f"Unexpected error: {str(e)}"
})
@bp.route("/delete_all_papers", methods=["POST"])
def delete_all_papers():
"""Delete all paper records from the database."""
try:
# Count papers before deletion for logging purposes
paper_count = PaperMetadata.query.count()
# Delete all records from the PaperMetadata table
PaperMetadata.query.delete()
db.session.commit()
# Log the action
ActivityLog.log_config_change(
config_key="database",
old_value=f"{paper_count} papers",
new_value="0 papers",
description=f"Deleted all {paper_count} papers from the database"
)
flash(f"Successfully deleted all {paper_count} papers from the database.", "success")
except Exception as e:
db.session.rollback()
flash(f"Failed to delete papers: {str(e)}", "error")
ActivityLog.log_error(
error_message=f"Failed to delete all papers: {str(e)}",
exception=e,
source="config.delete_all_papers"
)
return redirect(url_for("config.general"))