diff --git a/scipaperloader/blueprints/scraper.py b/scipaperloader/blueprints/scraper.py index 499174b..a5ae271 100644 --- a/scipaperloader/blueprints/scraper.py +++ b/scipaperloader/blueprints/scraper.py @@ -1,7 +1,7 @@ import random import json from datetime import datetime -from flask import Blueprint, jsonify, render_template, request, current_app +from flask import Blueprint, jsonify, render_template, request, current_app, flash from ..models import ScheduleConfig, VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory from ..db import db from ..celery import celery @@ -16,7 +16,28 @@ SCRAPER_PAUSED = False def index(): """Render the scraper control panel.""" volume_config = VolumeConfig.query.first() - schedule_config = {record.hour: record.weight for record in ScheduleConfig.query.all()} + + # Ensure we have volume config + if not volume_config: + volume_config = VolumeConfig(volume=100) # Default value + db.session.add(volume_config) + db.session.commit() + + # Ensure we have schedule config for all hours + existing_hours = {record.hour: record for record in ScheduleConfig.query.all()} + schedule_config = {} + + for hour in range(24): + if hour in existing_hours: + schedule_config[hour] = existing_hours[hour].weight + else: + # Create default schedule entry (weight 1.0) + new_config = ScheduleConfig(hour=hour, weight=1.0) + db.session.add(new_config) + schedule_config[hour] = 1.0 + + if len(existing_hours) < 24: + db.session.commit() return render_template( "scraper.html.jinja", @@ -179,36 +200,71 @@ def update_config(): """Update scraper configuration.""" data = request.json - if "volume" in data: - try: - new_volume = float(data["volume"]) - volume_config = VolumeConfig.query.first() - if not volume_config: - volume_config = VolumeConfig(volume=new_volume) - db.session.add(volume_config) - else: - old_value = volume_config.volume - volume_config.volume = new_volume - ActivityLog.log_config_change( - config_key="scraper_volume", - old_value=old_value, - new_value=new_volume, - description="Updated scraper volume" - ) - - db.session.commit() - except (ValueError, TypeError): - return jsonify({"success": False, "message": "Invalid volume value"}) - - if "schedule" in data: - try: - schedule = data["schedule"] - - for hour_str, weight in schedule.items(): - hour = int(hour_str) - weight = float(weight) + try: + if "volume" in data: + try: + new_volume = float(data["volume"]) - if 0 <= hour <= 23 and weight >= 0: + # Validate volume value (from schedule.py) + if new_volume <= 0 or new_volume > 1000: + return jsonify({ + "success": False, + "message": "Volume must be between 1 and 1000" + }) + + volume_config = VolumeConfig.query.first() + if not volume_config: + volume_config = VolumeConfig(volume=new_volume) + db.session.add(volume_config) + else: + old_value = volume_config.volume + volume_config.volume = new_volume + ActivityLog.log_config_change( + config_key="scraper_volume", + old_value=old_value, + new_value=new_volume, + description="Updated scraper volume" + ) + + db.session.commit() + except (ValueError, TypeError): + return jsonify({ + "success": False, + "message": "Invalid volume value" + }) + + if "schedule" in data: + try: + schedule = data["schedule"] + + # Validate entire schedule + for hour_str, weight in schedule.items(): + try: + hour = int(hour_str) + weight = float(weight) + + if hour < 0 or hour > 23: + return jsonify({ + "success": False, + "message": f"Hour value must be between 0 and 23, got {hour}" + }) + + if weight < 0.1 or weight > 5: + return jsonify({ + "success": False, + "message": f"Weight for hour {hour} must be between 0.1 and 5, got {weight}" + }) + except ValueError: + return jsonify({ + "success": False, + "message": f"Invalid data format for hour {hour_str}" + }) + + # Update schedule after validation + for hour_str, weight in schedule.items(): + hour = int(hour_str) + weight = float(weight) + schedule_config = ScheduleConfig.query.get(hour) if not schedule_config: schedule_config = ScheduleConfig(hour=hour, weight=weight) @@ -222,12 +278,124 @@ def update_config(): new_value=weight, description=f"Updated schedule weight for hour {hour}" ) - - db.session.commit() - except (ValueError, TypeError): - return jsonify({"success": False, "message": "Invalid schedule format"}) + + db.session.commit() + except Exception as e: + db.session.rollback() + return jsonify({ + "success": False, + "message": f"Error updating schedule: {str(e)}" + }) + + return jsonify({"success": True, "message": "Configuration updated"}) - return jsonify({"success": True, "message": "Configuration updated"}) + except Exception as e: + db.session.rollback() + return jsonify({"success": False, "message": f"Unexpected error: {str(e)}"}) + +@bp.route("/schedule", methods=["GET", "POST"]) +def schedule(): + """Legacy route to maintain compatibility with the schedule blueprint.""" + # For GET requests, redirect to the scraper index with the schedule tab active + if request.method == "GET": + return index() + + # For POST requests, handle form data and process like the original schedule blueprint + if request.method == "POST": + try: + # Check if we're updating volume or schedule + if "total_volume" in request.form: + # Volume update + try: + new_volume = float(request.form.get("total_volume", 0)) + if new_volume <= 0 or new_volume > 1000: + raise ValueError("Volume must be between 1 and 1000") + + volume_config = VolumeConfig.query.first() + if not volume_config: + volume_config = VolumeConfig(volume=new_volume) + db.session.add(volume_config) + else: + volume_config.volume = new_volume + + db.session.commit() + flash("Volume updated successfully!", "success") + + except ValueError as e: + db.session.rollback() + flash(f"Error updating volume: {str(e)}", "error") + else: + # Schedule update logic + # Validate form data + for hour in range(24): + key = f"hour_{hour}" + if key not in request.form: + raise ValueError(f"Missing data for hour {hour}") + + try: + weight = float(request.form.get(key, 0)) + if weight < 0 or weight > 5: + raise ValueError( + f"Weight for hour {hour} must be between 0 and 5" + ) + except ValueError: + raise ValueError(f"Invalid weight value for hour {hour}") + + # Update database if validation passes + for hour in range(24): + key = f"hour_{hour}" + weight = float(request.form.get(key, 0)) + config = ScheduleConfig.query.get(hour) + if config: + config.weight = weight + else: + db.session.add(ScheduleConfig(hour=hour, weight=weight)) + + db.session.commit() + flash("Schedule updated successfully!", "success") + + except ValueError as e: + db.session.rollback() + flash(f"Error updating schedule: {str(e)}", "error") + + # Redirect back to the scraper page + return index() + +# Calculate schedule information for visualization/decision making +def get_schedule_stats(): + """Get statistics about the current schedule configuration.""" + volume_config = VolumeConfig.query.first() + if not volume_config: + return {"error": "No volume configuration found"} + + total_volume = volume_config.volume + schedule_configs = ScheduleConfig.query.all() + + if not schedule_configs: + return {"error": "No schedule configuration found"} + + # Calculate total weight + total_weight = sum(config.weight for config in schedule_configs) + + # Calculate papers per hour + papers_per_hour = {} + for config in schedule_configs: + weight_ratio = config.weight / total_weight if total_weight > 0 else 0 + papers = weight_ratio * total_volume + papers_per_hour[config.hour] = papers + + return { + "total_volume": total_volume, + "total_weight": total_weight, + "papers_per_hour": papers_per_hour + } + +# Enhanced API route to get schedule information +@bp.route("/schedule_info") +def schedule_info(): + """Get information about the current schedule configuration.""" + stats = get_schedule_stats() + return jsonify(stats) # Define the Celery tasks @celery.task(bind=True) diff --git a/scipaperloader/templates/scraper.html.jinja b/scipaperloader/templates/scraper.html.jinja index b89c2d7..0f5c770 100644 --- a/scipaperloader/templates/scraper.html.jinja +++ b/scipaperloader/templates/scraper.html.jinja @@ -37,36 +37,49 @@ z-index: 1050; } - .schedule-grid { - display: grid; - grid-template-columns: repeat(6, 1fr); - gap: 10px; + /* Enhanced scheduler styles */ + .timeline { + display: flex; + flex-wrap: wrap; + gap: 3px; + user-select: none; } .hour-block { - padding: 10px; + width: 49px; + height: 70px; border-radius: 5px; text-align: center; + line-height: 1.2; + font-size: 0.9rem; + padding-top: 6px; + cursor: pointer; + user-select: none; + transition: background-color 0.2s ease-in-out; + margin: 1px; } - .weight-1 { - background-color: #d4edda; + .hour-block.selected { + outline: 2px solid #4584b8; } - .weight-0-7 { - background-color: #d1ecf1; + .papers { + font-size: 0.7rem; + margin-top: 2px; } - .weight-0-5 { - background-color: #fff3cd; + /* Tab styles */ + .nav-tabs .nav-link { + color: #495057; } - .weight-0-2 { - background-color: #f8d7da; + .nav-tabs .nav-link.active { + font-weight: bold; + color: #007bff; } - .weight-0-1 { - background-color: #f5c6cb; + .tab-pane { + padding-top: 1rem; } {% endblock styles %} @@ -75,135 +88,193 @@
Time | +Action | +Status | +Description | +
---|---|---|---|
Loading activities... | +
+ Configure the daily volume of papers to be downloaded and the hourly download weights. + The weights determine how many papers will be downloaded during each hour of the day. + The total volume ( papers/day) is split across all hours based on + their relative weights. + Lower weights result in higher scraping rates for that hour. +
++ Click to select one or more hours below. Then assign a weight to them using the input and apply it. + Color indicates relative intensity. Changes are saved immediately when you click "Update Schedule". +
++ The total volume of data to be downloaded each day is + papers. +
Time | -Action | -Status | -Description | -
---|---|---|---|
Loading activities... | -