
"""Configuration management blueprint."""
from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify, current_app
from ..db import db
# Import the new model
from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig, PaperMetadata
from ..defaults import MAX_VOLUME
import os # Import os for path validation
import sys
from scipaperloader.scrapers import __path__ as scrapers_path
# Import the cache invalidation function from our new module
from ..cache_utils import invalidate_hourly_quota_cache
import random
from datetime import datetime, timedelta
from uuid import uuid4
bp = Blueprint("config", __name__, url_prefix="/config")
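# All routes below are mounted under the /config prefix (e.g. /config/general, /config/schedule, /config/database).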
# Helper functions for configuration updates
def _update_volume(new_volume):
"""
Helper function to update volume configuration.
Args:
new_volume (float): The new volume value
Returns:
tuple: (success, message, volume_config)
"""
try:
new_volume = float(new_volume)
        if new_volume <= 0 or new_volume > MAX_VOLUME:
            return False, f"Volume must be greater than 0 and at most {MAX_VOLUME}", None
volume_config = VolumeConfig.query.first()
if not volume_config:
volume_config = VolumeConfig(volume=new_volume)
db.session.add(volume_config)
else:
old_value = volume_config.volume
volume_config.volume = new_volume
ActivityLog.log_config_change(
config_key="scraper_volume",
old_value=old_value,
new_value=new_volume,
description="Updated scraper volume"
)
db.session.commit()
return True, "Volume updated successfully!", volume_config
    except Exception as e:
        db.session.rollback()
        return False, f"Error updating volume: {str(e)}", None
# Add helper for download path
def _update_download_path(new_path):
"""
Helper function to update download path configuration.
Args:
new_path (str): The new download path
Returns:
tuple: (success, message, download_path_config)
"""
try:
# Basic validation: check if it's a non-empty string
if not new_path or not isinstance(new_path, str):
return False, "Download path cannot be empty.", None
# --- Add more validation like checking if path exists or is writable ---
# Check if the path exists and is a directory
if not os.path.isdir(new_path):
# Try to create it if it doesn't exist
try:
os.makedirs(new_path, exist_ok=True)
ActivityLog.log_scraper_activity(
action="create_directory",
status="info",
description=f"Created download directory: {new_path}"
)
except OSError as e:
ActivityLog.log_error(
error_message=f"Failed to create download directory: {new_path}, Error: {str(e)}",
source="update_download_path"
)
return False, f"Path '{new_path}' is not a valid directory and could not be created: {e}", None
# Check if the path is readable
if not os.access(new_path, os.R_OK):
ActivityLog.log_error(
error_message=f"Download path '{new_path}' is not readable.",
source="check_directory_permissions"
)
return False, f"Path '{new_path}' exists but is not readable by the application.", None
# Check if the path is writable
if not os.access(new_path, os.W_OK):
ActivityLog.log_error(
error_message=f"Download path '{new_path}' is not writable.",
source="check_directory_permissions"
)
return False, f"Path '{new_path}' exists but is not writable by the application.", None
# --- End of validation ---
config = DownloadPathConfig.query.first()
if not config:
config = DownloadPathConfig(path=new_path)
db.session.add(config)
else:
old_value = config.path
config.path = new_path
ActivityLog.log_config_change(
config_key="download_path",
old_value=old_value,
new_value=new_path,
description="Updated download path"
)
db.session.commit()
return True, "Download path updated successfully!", config
except Exception as e:
db.session.rollback()
return False, f"Error updating download path: {str(e)}", None
def _update_schedule(schedule_data):
"""
Helper function to update schedule configuration.
Args:
schedule_data (dict): Dictionary with hour:weight pairs
Returns:
tuple: (success, message)
"""
try:
# Validate all entries first
for hour_str, weight in schedule_data.items():
try:
hour = int(hour_str)
weight = float(weight)
if hour < 0 or hour > 23:
return False, f"Hour value must be between 0 and 23, got {hour}"
if weight < 0.1 or weight > 5:
return False, f"Weight for hour {hour} must be between 0.1 and 5, got {weight}"
except ValueError:
return False, f"Invalid data format for hour {hour_str}"
# Update schedule after validation
for hour_str, weight in schedule_data.items():
hour = int(hour_str)
weight = float(weight)
config = ScheduleConfig.query.get(hour)
if not config:
config = ScheduleConfig(hour=hour, weight=weight)
db.session.add(config)
else:
old_value = config.weight
config.weight = weight
ActivityLog.log_config_change(
config_key=f"schedule_hour_{hour}",
old_value=old_value,
new_value=weight,
description=f"Updated schedule weight for hour {hour}"
)
db.session.commit()
# Invalidate hourly quota cache using the cache_utils module
try:
invalidate_hourly_quota_cache()
except Exception as e:
# Log the error but don't fail the update
ActivityLog.log_error(
error_message=f"Error invalidating hourly quota cache: {str(e)}",
source="_update_schedule"
)
return True, "Schedule updated successfully!"
except Exception as e:
db.session.rollback()
return False, f"Error updating schedule: {str(e)}"
@bp.route("/")
@bp.route("/general")
def general():
"""Show general configuration page."""
volume_config = VolumeConfig.query.first()
if not volume_config:
volume_config = VolumeConfig(volume=100) # Default value
db.session.add(volume_config)
db.session.commit()
# Fetch download path config
download_path_config = DownloadPathConfig.query.first()
if not download_path_config:
download_path_config = DownloadPathConfig() # Use default from model
db.session.add(download_path_config)
db.session.commit()
return render_template(
"config/index.html.jinja",
active_tab="general",
volume_config=volume_config,
download_path_config=download_path_config, # Pass to template
max_volume=MAX_VOLUME,
app_title="Configuration"
)
@bp.route("/schedule")
def schedule():
"""Show schedule configuration page."""
# Ensure we have schedule config for all hours
existing_hours = {record.hour: record for record in ScheduleConfig.query.all()}
schedule_config = {}
for hour in range(24):
if hour in existing_hours:
schedule_config[hour] = existing_hours[hour].weight
else:
# Create default schedule entry (weight 1.0)
new_config = ScheduleConfig(hour=hour, weight=1.0)
db.session.add(new_config)
schedule_config[hour] = 1.0
if len(existing_hours) < 24:
db.session.commit()
volume_config = VolumeConfig.query.first()
if not volume_config:
volume_config = VolumeConfig(volume=100) # Default value
db.session.add(volume_config)
db.session.commit()
return render_template(
"config/index.html.jinja",
active_tab="schedule",
schedule=schedule_config,
volume=volume_config.volume,
max_volume=MAX_VOLUME,
app_title="Configuration"
)
@bp.route("/database")
def database():
"""Show database configuration page."""
return render_template(
"config/index.html.jinja",
active_tab="database",
app_title="Configuration"
)
@bp.route("/generate_test_papers", methods=["POST"])
def generate_test_papers():
"""Generate random test papers for the database."""
try:
# Get the requested number of papers (with validation)
try:
paper_count = int(request.form.get("paper_count", "100"))
if paper_count < 1:
paper_count = 1
elif paper_count > 1000:
paper_count = 1000
except (ValueError, TypeError):
paper_count = 100
        # Get the requested status: "new" marks every paper as "New", otherwise pick one status at random for the batch
        dummy_paper_status = request.form.get("dummy_paper_status")
        if dummy_paper_status == "new":
            dummy_paper_status = "New"
        else:
            dummy_paper_status = random.choice(["New", "Pending", "Done", "Failed"])
# Get the download path for file paths
download_path = DownloadPathConfig.get_path()
# Sample journal names for realistic test data
journals = [
"Nature", "Science", "Cell", "PNAS", "Journal of Biological Chemistry",
"IEEE Transactions on Neural Networks", "Artificial Intelligence",
"Machine Learning", "Neural Computation", "Journal of Machine Learning Research",
"Journal of Artificial Intelligence Research", "Data Mining and Knowledge Discovery",
"Pattern Recognition", "Neural Networks", "Journal of Physical Chemistry"
]
# Sample paper types
paper_types = ["Article", "Review", "Conference", "Preprint", "Book Chapter"]
# Sample languages
languages = ["English", "German", "French", "Chinese", "Spanish", "Japanese"]
# Generate random papers
papers_added = 0
for i in range(paper_count):
# Generate a random DOI
doi = f"10.{random.randint(1000, 9999)}/{uuid4().hex[:8]}"
# Skip if DOI already exists
if PaperMetadata.query.filter_by(doi=doi).first():
continue
# Random publishing date within the last 5 years
days_ago = random.randint(0, 5 * 365)
pub_date = datetime.now() - timedelta(days=days_ago)
# Create paper
paper = PaperMetadata(
title=f"Test Paper {i+1}: {''.join(random.choice('ABCDEFGHIJKLMNOPQRSTUVWXYZ') for _ in range(5))}",
doi=doi,
alt_id=f"ALT-{random.randint(10000, 99999)}",
issn=f"{random.randint(1000, 9999)}-{random.randint(1000, 9999)}",
journal=random.choice(journals),
type=random.choice(paper_types),
language=random.choice(languages),
published_online=pub_date.date(),
status=dummy_paper_status,
file_path=f"{download_path}/test_paper_{i+1}.pdf" if random.random() > 0.3 else None,
error_msg="Download failed: connection timeout" if random.random() < 0.1 else None,
created_at=datetime.now() - timedelta(days=random.randint(0, 30))
)
db.session.add(paper)
papers_added += 1
            # Commit in batches of 100 to keep the session small
            if (i + 1) % 100 == 0:
                db.session.commit()
# Final commit
db.session.commit()
# Log the action using the existing log_import_activity method
ActivityLog.log_import_activity(
action="generate_test_papers",
status="success",
description=f"Generated {papers_added} test papers for the database"
)
flash(f"Successfully generated {papers_added} test papers.", "success")
except Exception as e:
db.session.rollback()
flash(f"Failed to generate test papers: {str(e)}", "error")
ActivityLog.log_error(
error_message=f"Failed to generate test papers: {str(e)}",
exception=e,
source="config.generate_test_papers"
)
return redirect(url_for("config.database"))
@bp.route("/update/general", methods=["POST"])
def update_general():
"""Update general configuration (Volume and Download Path)."""
volume_success, volume_message = True, ""
path_success, path_message = True, ""
# Update Volume
new_volume = request.form.get("total_volume")
if new_volume is not None:
volume_success, volume_message, _ = _update_volume(new_volume)
if volume_success:
flash(volume_message, "success")
else:
flash(volume_message, "error")
# Update Download Path
new_path = request.form.get("download_path")
if new_path is not None:
path_success, path_message, _ = _update_download_path(new_path)
if path_success:
flash(path_message, "success")
else:
flash(path_message, "error")
return redirect(url_for("config.general"))
@bp.route("/update/schedule", methods=["POST"])
def update_schedule():
"""Update schedule configuration."""
schedule_data = {}
for hour in range(24):
key = f"hour_{hour}"
if key not in request.form:
flash(f"Missing data for hour {hour}", "error")
return redirect(url_for("config.schedule"))
schedule_data[str(hour)] = request.form.get(key, 0)
success, message = _update_schedule(schedule_data)
if success:
flash(message, "success")
else:
flash(message, "error")
return redirect(url_for("config.schedule"))
@bp.route("/update/scraper_module", methods=["POST"])
def update_scraper_module():
"""Update the scraper module configuration."""
from ..models import ScraperModuleConfig
new_scraper_module = request.form.get("scraper_module")
if not new_scraper_module:
flash("Scraper module cannot be empty.", "error")
return redirect(url_for("config.general"))
# Validate that the module exists and is valid
from scipaperloader.scrapers.factory import get_available_scrapers
available_modules = [m["name"] for m in get_available_scrapers()]
if new_scraper_module not in available_modules:
flash(f"Invalid scraper module: {new_scraper_module}", "error")
return redirect(url_for("config.general"))
# Update the database configuration
ScraperModuleConfig.set_module(new_scraper_module)
flash(f"Scraper module updated to '{new_scraper_module}'.", "success")
return redirect(url_for("config.general"))
@bp.context_processor
def inject_scraper_modules():
"""Inject available scraper modules into the template context."""
from scipaperloader.scrapers.factory import get_available_scrapers
from ..models import ScraperModuleConfig
available_scrapers = get_available_scrapers()
current_module = ScraperModuleConfig.get_current_module()
return {
"available_scraper_modules": [s["name"] for s in available_scrapers],
"current_scraper_module": current_module,
"scraper_details": {s["name"]: s for s in available_scrapers}
}
@bp.route("/api/schedule/stats")
def schedule_stats():
"""Get statistics about the current schedule configuration."""
volume_config = VolumeConfig.query.first()
if not volume_config:
return jsonify({"error": "No volume configuration found"})
total_volume = volume_config.volume
schedule_configs = ScheduleConfig.query.all()
if not schedule_configs:
return jsonify({"error": "No schedule configuration found"})
# Calculate total weight
total_weight = sum(config.weight for config in schedule_configs)
# Calculate papers per hour
papers_per_hour = {}
hourly_weights = {}
for config in schedule_configs:
weight_ratio = config.weight / total_weight if total_weight > 0 else 0
papers = weight_ratio * total_volume
papers_per_hour[config.hour] = papers
hourly_weights[config.hour] = config.weight
return jsonify({
"total_volume": total_volume,
"total_weight": total_weight,
"papers_per_hour": papers_per_hour,
"hourly_weights": hourly_weights
})
@bp.route("/api/update_config", methods=["POST"])
def api_update_config():
"""API endpoint to update configuration."""
    data = request.get_json(silent=True)
    if data is None:
        return jsonify({"success": False, "message": "Request body must be JSON"}), 400
response = {"success": True, "updates": []}
try:
# Update volume if provided
if "volume" in data:
success, message, _ = _update_volume(data["volume"])
response["updates"].append({
"type": "volume",
"success": success,
"message": message
})
if not success:
response["success"] = False
# Update download path if provided
if "download_path" in data:
success, message, _ = _update_download_path(data["download_path"])
response["updates"].append({
"type": "download_path",
"success": success,
"message": message
})
if not success:
response["success"] = False
# Update schedule if provided
if "schedule" in data:
success, message = _update_schedule(data["schedule"])
response["updates"].append({
"type": "schedule",
"success": success,
"message": message
})
if not success:
response["success"] = False
return jsonify(response)
except Exception as e:
db.session.rollback()
return jsonify({
"success": False,
"message": f"Unexpected error: {str(e)}"
})
@bp.route("/delete_all_papers", methods=["POST"])
def delete_all_papers():
"""Delete all paper records from the database."""
try:
# Count papers before deletion for logging purposes
paper_count = PaperMetadata.query.count()
# Delete all records from the PaperMetadata table
PaperMetadata.query.delete()
db.session.commit()
# Log the action
ActivityLog.log_config_change(
config_key="database",
old_value=f"{paper_count} papers",
new_value="0 papers",
description=f"Deleted all {paper_count} papers from the database"
)
flash(f"Successfully deleted all {paper_count} papers from the database.", "success")
except Exception as e:
db.session.rollback()
flash(f"Failed to delete papers: {str(e)}", "error")
ActivityLog.log_error(
error_message=f"Failed to delete all papers: {str(e)}",
exception=e,
source="config.delete_all_papers"
)
return redirect(url_for("config.general"))