whoopsie forgot the scraper logic

This commit is contained in:
Michael Beck 2025-04-16 12:29:42 +02:00
parent b09c6f1b9b
commit e15867c9a6
2 changed files with 536 additions and 194 deletions

View File

@ -1,7 +1,7 @@
import random
import json
from datetime import datetime
from flask import Blueprint, jsonify, render_template, request, current_app
from flask import Blueprint, jsonify, render_template, request, current_app, flash
from ..models import ScheduleConfig, VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory
from ..db import db
from ..celery import celery
@ -16,7 +16,28 @@ SCRAPER_PAUSED = False
def index():
"""Render the scraper control panel."""
volume_config = VolumeConfig.query.first()
schedule_config = {record.hour: record.weight for record in ScheduleConfig.query.all()}
# Ensure we have volume config
if not volume_config:
volume_config = VolumeConfig(volume=100) # Default value
db.session.add(volume_config)
db.session.commit()
# Ensure we have schedule config for all hours
existing_hours = {record.hour: record for record in ScheduleConfig.query.all()}
schedule_config = {}
for hour in range(24):
if hour in existing_hours:
schedule_config[hour] = existing_hours[hour].weight
else:
# Create default schedule entry (weight 1.0)
new_config = ScheduleConfig(hour=hour, weight=1.0)
db.session.add(new_config)
schedule_config[hour] = 1.0
if len(existing_hours) < 24:
db.session.commit()
return render_template(
"scraper.html.jinja",
@ -179,36 +200,71 @@ def update_config():
"""Update scraper configuration."""
data = request.json
if "volume" in data:
try:
new_volume = float(data["volume"])
volume_config = VolumeConfig.query.first()
if not volume_config:
volume_config = VolumeConfig(volume=new_volume)
db.session.add(volume_config)
else:
old_value = volume_config.volume
volume_config.volume = new_volume
ActivityLog.log_config_change(
config_key="scraper_volume",
old_value=old_value,
new_value=new_volume,
description="Updated scraper volume"
)
db.session.commit()
except (ValueError, TypeError):
return jsonify({"success": False, "message": "Invalid volume value"})
if "schedule" in data:
try:
schedule = data["schedule"]
for hour_str, weight in schedule.items():
hour = int(hour_str)
weight = float(weight)
try:
if "volume" in data:
try:
new_volume = float(data["volume"])
if 0 <= hour <= 23 and weight >= 0:
# Validate volume value (from schedule.py)
if new_volume <= 0 or new_volume > 1000:
return jsonify({
"success": False,
"message": "Volume must be between 1 and 1000"
})
volume_config = VolumeConfig.query.first()
if not volume_config:
volume_config = VolumeConfig(volume=new_volume)
db.session.add(volume_config)
else:
old_value = volume_config.volume
volume_config.volume = new_volume
ActivityLog.log_config_change(
config_key="scraper_volume",
old_value=old_value,
new_value=new_volume,
description="Updated scraper volume"
)
db.session.commit()
except (ValueError, TypeError):
return jsonify({
"success": False,
"message": "Invalid volume value"
})
if "schedule" in data:
try:
schedule = data["schedule"]
# Validate entire schedule
for hour_str, weight in schedule.items():
try:
hour = int(hour_str)
weight = float(weight)
if hour < 0 or hour > 23:
return jsonify({
"success": False,
"message": f"Hour value must be between 0 and 23, got {hour}"
})
if weight < 0.1 or weight > 5:
return jsonify({
"success": False,
"message": f"Weight for hour {hour} must be between 0.1 and 5, got {weight}"
})
except ValueError:
return jsonify({
"success": False,
"message": f"Invalid data format for hour {hour_str}"
})
# Update schedule after validation
for hour_str, weight in schedule.items():
hour = int(hour_str)
weight = float(weight)
schedule_config = ScheduleConfig.query.get(hour)
if not schedule_config:
schedule_config = ScheduleConfig(hour=hour, weight=weight)
@ -222,12 +278,124 @@ def update_config():
new_value=weight,
description=f"Updated schedule weight for hour {hour}"
)
db.session.commit()
except (ValueError, TypeError):
return jsonify({"success": False, "message": "Invalid schedule format"})
db.session.commit()
except Exception as e:
db.session.rollback()
return jsonify({
"success": False,
"message": f"Error updating schedule: {str(e)}"
})
return jsonify({"success": True, "message": "Configuration updated"})
return jsonify({"success": True, "message": "Configuration updated"})
except Exception as e:
db.session.rollback()
return jsonify({"success": False, "message": f"Unexpected error: {str(e)}"})
@bp.route("/schedule", methods=["GET", "POST"])
def schedule():
    """Legacy route kept for compatibility with the former schedule blueprint.

    GET renders the scraper control panel by delegating to ``index()``.

    POST processes submitted form data. If the form contains ``total_volume``
    the daily volume is updated; otherwise all 24 ``hour_<n>`` weights are
    validated and stored. The outcome is reported through ``flash`` and the
    control panel is rendered again by calling ``index()`` directly (no HTTP
    redirect is issued).
    """
    if request.method == "POST":
        if "total_volume" in request.form:
            _apply_volume_form(request.form)
        else:
            _apply_schedule_form(request.form)

    # GET and POST both finish by rendering the control panel.
    return index()


def _apply_volume_form(form):
    """Validate ``total_volume`` from *form* and persist it, flashing the result."""
    try:
        new_volume = float(form.get("total_volume", 0))
        # Phrased as a positive range test so that NaN, which fails every
        # comparison, is rejected instead of slipping through.
        if not 0 < new_volume <= 1000:
            raise ValueError("Volume must be between 1 and 1000")

        volume_config = VolumeConfig.query.first()
        if volume_config:
            volume_config.volume = new_volume
        else:
            db.session.add(VolumeConfig(volume=new_volume))

        db.session.commit()
        flash("Volume updated successfully!", "success")
    except ValueError as e:
        db.session.rollback()
        flash(f"Error updating volume: {e}", "error")


def _apply_schedule_form(form):
    """Validate all 24 ``hour_<n>`` weights in *form*, then persist them together."""
    try:
        # Validate everything first so a bad value never leaves the schedule
        # half-updated.
        weights = {}
        for hour in range(24):
            key = f"hour_{hour}"
            if key not in form:
                raise ValueError(f"Missing data for hour {hour}")

            # Only the conversion is guarded here; the range error below must
            # reach the user with its own message rather than being rewritten.
            try:
                weight = float(form[key])
            except ValueError:
                raise ValueError(f"Invalid weight value for hour {hour}") from None

            # NOTE(review): the JSON endpoint in this module requires 0.1-5;
            # this legacy form still accepts 0. Confirm whether that is intended.
            if not 0 <= weight <= 5:
                raise ValueError(f"Weight for hour {hour} must be between 0 and 5")
            weights[hour] = weight

        # One query for all rows instead of one lookup per hour.
        existing = {record.hour: record for record in ScheduleConfig.query.all()}
        for hour, weight in weights.items():
            record = existing.get(hour)
            if record:
                record.weight = weight
            else:
                db.session.add(ScheduleConfig(hour=hour, weight=weight))

        db.session.commit()
        flash("Schedule updated successfully!", "success")
    except ValueError as e:
        db.session.rollback()
        flash(f"Error updating schedule: {e}", "error")
# Work out how the daily volume is distributed over the configured hours
def get_schedule_stats():
    """Summarize the stored volume and hourly weights.

    Returns:
        A dict with ``total_volume``, ``total_weight`` and ``papers_per_hour``
        (hour -> that hour's share of the volume, proportional to its weight).
        When no VolumeConfig row or no ScheduleConfig rows exist, a dict with
        a single ``error`` message is returned instead.

    NOTE(review): with this formula a larger weight yields more papers for
    that hour, while the control-panel help text says lower weights give
    higher scraping rates - confirm which meaning is intended.
    """
    volume_row = VolumeConfig.query.first()
    if not volume_row:
        return {"error": "No volume configuration found"}
    daily_total = volume_row.volume

    hour_rows = ScheduleConfig.query.all()
    if not hour_rows:
        return {"error": "No schedule configuration found"}

    weight_sum = sum(row.weight for row in hour_rows)

    def share_of(row):
        # With a non-positive weight sum no ratio can be formed; fall back to 0.
        ratio = row.weight / weight_sum if weight_sum > 0 else 0
        return ratio * daily_total

    return {
        "total_volume": daily_total,
        "total_weight": weight_sum,
        "papers_per_hour": {row.hour: share_of(row) for row in hour_rows},
    }
# JSON endpoint exposing the schedule statistics
@bp.route("/schedule_info")
def schedule_info():
    """Return the result of ``get_schedule_stats()`` serialized as JSON."""
    return jsonify(get_schedule_stats())
# Define the Celery tasks
@celery.task(bind=True)

View File

@ -37,36 +37,49 @@
z-index: 1050;
}
.schedule-grid {
display: grid;
grid-template-columns: repeat(6, 1fr);
gap: 10px;
/* Enhanced scheduler styles */
.timeline {
display: flex;
flex-wrap: wrap;
gap: 3px;
user-select: none;
}
.hour-block {
padding: 10px;
width: 49px;
height: 70px;
border-radius: 5px;
text-align: center;
line-height: 1.2;
font-size: 0.9rem;
padding-top: 6px;
cursor: pointer;
user-select: none;
transition: background-color 0.2s ease-in-out;
margin: 1px;
}
.weight-1 {
background-color: #d4edda;
.hour-block.selected {
outline: 2px solid #4584b8;
}
.weight-0-7 {
background-color: #d1ecf1;
.papers {
font-size: 0.7rem;
margin-top: 2px;
}
.weight-0-5 {
background-color: #fff3cd;
/* Tab styles */
.nav-tabs .nav-link {
color: #495057;
}
.weight-0-2 {
background-color: #f8d7da;
.nav-tabs .nav-link.active {
font-weight: bold;
color: #007bff;
}
.weight-0-1 {
background-color: #f5c6cb;
.tab-pane {
padding-top: 1rem;
}
</style>
{% endblock styles %}
@ -75,135 +88,193 @@
<div class="container mt-4">
<h1>Paper Scraper Control Panel</h1>
<div class="row mb-4">
<div class="col-md-6">
<div class="card">
<!-- Navigation tabs -->
<ul class="nav nav-tabs mb-4" id="scraperTabs" role="tablist">
<li class="nav-item" role="presentation">
<button class="nav-link active" id="dashboard-tab" data-bs-toggle="tab" data-bs-target="#dashboard"
type="button" role="tab" aria-controls="dashboard" aria-selected="true">
Dashboard
</button>
</li>
<li class="nav-item" role="presentation">
<button class="nav-link" id="schedule-tab" data-bs-toggle="tab" data-bs-target="#schedule" type="button"
role="tab" aria-controls="schedule" aria-selected="false">
Schedule Configuration
</button>
</li>
</ul>
<div class="tab-content" id="scraperTabsContent">
<!-- Dashboard Tab -->
<div class="tab-pane fade show active" id="dashboard" role="tabpanel" aria-labelledby="dashboard-tab">
<div class="row mb-4">
<div class="col-md-6">
<div class="card">
<div class="card-header">
<h5>Scraper Status</h5>
</div>
<div class="card-body">
<div class="d-flex align-items-center mb-3">
<div id="statusIndicator" class="status-indicator status-inactive"></div>
<span id="statusText">Inactive</span>
</div>
<div class="btn-group" role="group">
<button id="startButton" class="btn btn-success">Start</button>
<button id="pauseButton" class="btn btn-warning" disabled>Pause</button>
<button id="stopButton" class="btn btn-danger" disabled>Stop</button>
</div>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card">
<div class="card-header">
<h5>Volume Configuration</h5>
</div>
<div class="card-body">
<form id="volumeForm">
<div class="form-group">
<label for="volumeInput">Papers per day:</label>
<input type="number" class="form-control" id="volumeInput"
value="{{ volume_config.volume if volume_config else 100 }}">
</div>
<button type="submit" class="btn btn-primary mt-2">Update Volume</button>
</form>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header d-flex justify-content-between align-items-center">
<h5>Scraping Activity</h5>
<div>
<div class="form-check form-switch">
<input class="form-check-input" type="checkbox" id="notificationsToggle" checked>
<label class="form-check-label" for="notificationsToggle">Show Notifications</label>
</div>
</div>
</div>
<div class="card-body">
<div class="btn-group mb-3">
<button class="btn btn-outline-secondary time-range-btn" data-hours="6">Last 6
hours</button>
<button class="btn btn-outline-secondary time-range-btn active" data-hours="24">Last 24
hours</button>
<button class="btn btn-outline-secondary time-range-btn" data-hours="72">Last 3
days</button>
</div>
<div class="stats-chart" id="activityChart"></div>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h5>Recent Activity</h5>
</div>
<div class="card-body">
<div class="table-responsive">
<table class="table table-striped">
<thead>
<tr>
<th>Time</th>
<th>Action</th>
<th>Status</th>
<th>Description</th>
</tr>
</thead>
<tbody id="activityLog">
<tr>
<td colspan="4" class="text-center">Loading activities...</td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- Schedule Configuration Tab -->
<div class="tab-pane fade" id="schedule" role="tabpanel" aria-labelledby="schedule-tab"
x-data="scheduleManager({{ schedule_config | tojson }}, {{ volume_config.volume if volume_config else 100 }})">
<div class="mb-3">
<h3>How it Works</h3>
<p class="text-muted mb-0">
Configure the daily volume of papers to be downloaded and the hourly download weights.
The weights determine how many papers will be downloaded during each hour of the day.
The total volume (<strong x-text="volume"></strong> papers/day) is split across all hours based on
their relative weights.
<strong>Lower weights result in higher scraping rates</strong> for that hour.
</p>
<h5 class="mt-3">Instructions:</h5>
<p class="text-muted">
Click to select one or more hours below. Then assign a weight to them using the input and apply it.
Color indicates relative intensity. Changes are saved immediately when you click "Update Schedule".
</p>
</div>
<div class="card mb-4">
<div class="card-header">
<h5>Scraper Status</h5>
<h4 class="m-0">Volume Configuration</h4>
</div>
<div class="card-body">
<p class="text-muted">
The total volume of data to be downloaded each day is
<strong x-text="volume"></strong> papers.
</p>
<div class="d-flex align-items-center mb-3">
<div id="statusIndicator" class="status-indicator status-inactive"></div>
<span id="statusText">Inactive</span>
</div>
<div class="btn-group" role="group">
<button id="startButton" class="btn btn-success">Start</button>
<button id="pauseButton" class="btn btn-warning" disabled>Pause</button>
<button id="stopButton" class="btn btn-danger" disabled>Stop</button>
</div>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card">
<div class="card-header">
<h5>Volume Configuration</h5>
</div>
<div class="card-body">
<form id="volumeForm">
<div class="form-group">
<label for="volumeInput">Papers per day:</label>
<input type="number" class="form-control" id="volumeInput"
value="{{ volume_config.volume }}">
</div>
<button type="submit" class="btn btn-primary mt-2">Update Volume</button>
</form>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h5>Schedule Configuration</h5>
<small class="text-muted">Weight factor for each hour (lower value = higher scraping rate)</small>
</div>
<div class="card-body">
<div class="schedule-grid">
{% for hour in range(24) %}
{% set weight = schedule_config.get(hour, 1.0) %}
{% set weight_class = "weight-1" %}
{% if weight == 0.1 %}
{% set weight_class = "weight-0-1" %}
{% elif weight == 0.2 %}
{% set weight_class = "weight-0-2" %}
{% elif weight == 0.5 %}
{% set weight_class = "weight-0-5" %}
{% elif weight == 0.7 %}
{% set weight_class = "weight-0-7" %}
{% endif %}
<div class="hour-block border {{ weight_class }}" data-hour="{{ hour }}">
<div class="hour-label">{{ "%02d:00"|format(hour) }}</div>
<select class="form-control hour-weight mt-1" data-hour="{{ hour }}">
<option value="0.1" {% if weight==0.1 %}selected{% endif %}>Very High</option>
<option value="0.2" {% if weight==0.2 %}selected{% endif %}>High</option>
<option value="0.5" {% if weight==0.5 %}selected{% endif %}>Medium</option>
<option value="0.7" {% if weight==0.7 %}selected{% endif %}>Low</option>
<option value="1.0" {% if weight==1.0 %}selected{% endif %}>Very Low</option>
</select>
</div>
{% endfor %}
</div>
<button id="updateScheduleButton" class="btn btn-primary mt-3">Update Schedule</button>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header d-flex justify-content-between align-items-center">
<h5>Scraping Activity</h5>
<div>
<div class="form-check form-switch">
<input class="form-check-input" type="checkbox" id="notificationsToggle" checked>
<label class="form-check-label" for="notificationsToggle">Show Notifications</label>
<div class="input-group">
<span class="input-group-text">Papers per day:</span>
<input type="number" class="form-control" x-model="volume" min="1" max="1000" />
<button type="button" class="btn btn-primary" @click="updateVolume()">
Update Volume
</button>
</div>
</div>
</div>
<div class="card-body">
<div class="btn-group mb-3">
<button class="btn btn-outline-secondary time-range-btn" data-hours="6">Last 6 hours</button>
<button class="btn btn-outline-secondary time-range-btn active" data-hours="24">Last 24
hours</button>
<button class="btn btn-outline-secondary time-range-btn" data-hours="72">Last 3 days</button>
</div>
<div class="stats-chart" id="activityChart"></div>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h5>Recent Activity</h5>
<h4 class="m-0">Hourly Weights</h4>
</div>
<div class="card-body">
<div class="table-responsive">
<table class="table table-striped">
<thead>
<tr>
<th>Time</th>
<th>Action</th>
<th>Status</th>
<th>Description</th>
</tr>
</thead>
<tbody id="activityLog">
<tr>
<td colspan="4" class="text-center">Loading activities...</td>
</tr>
</tbody>
</table>
<div class="timeline mb-3" @mouseup="endDrag()" @mouseleave="endDrag()">
<template x-for="hour in Object.keys(schedule)" :key="hour">
<div class="hour-block" :id="'hour-' + hour" :data-hour="hour"
:style="getBackgroundStyle(hour)" :class="{'selected': isSelected(hour)}"
@mousedown="startDrag($event, hour)" @mouseover="dragSelect(hour)">
<div><strong x-text="formatHour(hour)"></strong></div>
<div class="weight"><span x-text="schedule[hour]"></span></div>
<div class="papers">
<span x-text="getPapersPerHour(hour)"></span> p.
</div>
</div>
</template>
</div>
<div class="input-group mb-4 w-50">
<span class="input-group-text">Set Weight:</span>
<input type="number" step="0.1" min="0.1" max="5" x-model="newWeight" class="form-control" />
<button type="button" class="btn btn-outline-primary" @click="applyWeight()">
Apply to Selected
</button>
</div>
<button type="button" class="btn btn-success" @click="updateSchedule()">
💾 Update Schedule
</button>
</div>
</div>
</div>
@ -217,8 +288,138 @@
{% block scripts %}
{{ super() }}
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js" defer></script>
<script>
// Global variables
// Alpine.js scheduler component
/**
 * Alpine.js component factory for the schedule configuration tab.
 *
 * @param {Object} initial - Map of hour -> weight used as the starting schedule.
 * @param {number} volume  - Papers per day used as the starting volume.
 * @returns {Object} Component state plus the handlers bound in the template.
 */
function scheduleManager(initial, volume) {
    // Send a partial configuration to the backend and resolve with its JSON
    // reply. Rejections (network failure, non-JSON body) are left for the
    // caller to catch.
    const postConfig = (payload) =>
        fetch('/scraper/update_config', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify(payload)
        }).then((response) => response.json());

    return {
        schedule: initial || {},
        volume: volume,
        selectedHours: [],
        newWeight: 1.0,

        isDragging: false,
        dragOperation: null,

        // Render an hour number as "HH:00".
        formatHour(h) {
            return String(h).padStart(2, "0") + ":00";
        },

        // Shade a block from light (low weight) to dark (weight >= maxWeight).
        getBackgroundStyle(hour) {
            const weight = parseFloat(this.schedule[hour]);
            const maxWeight = 2.5; // weights at or above this get the darkest shade

            // Normalize to 0..1; a missing or unparsable weight counts as 0 so
            // the generated CSS is always valid.
            const ratio = Number.isFinite(weight) ? weight / maxWeight : 0;
            const t = Math.min(Math.max(ratio, 0), 1.0);

            // Interpolate HSL lightness: 95% (light) down to 30% (dark)
            const lightness = 95 - t * 65;
            const backgroundColor = `hsl(210, 10%, ${lightness}%)`;

            // Switch to white text once the background gets dark.
            const textColor = t > 0.65 ? "white" : "black";

            return {
                backgroundColor,
                color: textColor,
            };
        },

        // Begin a drag; the first block decides whether the drag adds or removes.
        startDrag(event, hour) {
            event.preventDefault();
            this.isDragging = true;
            this.dragOperation = this.isSelected(hour) ? "remove" : "add";
            this.toggleSelect(hour);
        },

        // While dragging, apply the drag's add/remove operation to hovered blocks.
        dragSelect(hour) {
            if (!this.isDragging) return;
            const selected = this.isSelected(hour);
            if (this.dragOperation === "add" && !selected) {
                this.selectedHours.push(hour);
            } else if (this.dragOperation === "remove" && selected) {
                this.selectedHours = this.selectedHours.filter((h) => h !== hour);
            }
        },

        endDrag() {
            this.isDragging = false;
        },

        toggleSelect(hour) {
            if (this.isSelected(hour)) {
                this.selectedHours = this.selectedHours.filter((h) => h !== hour);
            } else {
                this.selectedHours.push(hour);
            }
        },

        isSelected(hour) {
            return this.selectedHours.includes(hour);
        },

        // Assign newWeight to every selected hour (local state only; nothing is
        // sent to the server until updateSchedule() runs).
        applyWeight() {
            const weight = parseFloat(this.newWeight);
            // Same 0.1-5 range as the weight input and /scraper/update_config;
            // also keeps "NaN" from an empty input out of the schedule.
            if (!Number.isFinite(weight) || weight < 0.1 || weight > 5) {
                showNotification('Weight must be a number between 0.1 and 5', 'danger');
                return;
            }
            const formatted = weight.toFixed(1);
            this.selectedHours.forEach((hour) => {
                this.schedule[hour] = formatted;
            });
        },

        getTotalWeight() {
            return Object.values(this.schedule).reduce(
                (sum, w) => sum + parseFloat(w),
                0
            );
        },

        // Share of the daily volume for one hour, proportional to its weight.
        getPapersPerHour(hour) {
            const total = this.getTotalWeight();
            // Also covers a NaN total, which would otherwise render as "NaN".
            if (!(total > 0)) return 0;
            return (
                (parseFloat(this.schedule[hour]) / total) *
                this.volume
            ).toFixed(1);
        },

        // Persist the daily volume and mirror it into the dashboard input.
        updateVolume() {
            const volume = parseFloat(this.volume);
            // Same bounds the server applies; reject early with its message.
            if (!(volume > 0 && volume <= 1000)) {
                showNotification('Volume must be between 1 and 1000', 'danger');
                return;
            }

            postConfig({ volume: volume })
                .then((data) => {
                    if (data.success) {
                        showNotification('Volume updated successfully', 'success');
                        // Keep the dashboard tab's input in sync when it exists.
                        const dashboardInput = document.getElementById('volumeInput');
                        if (dashboardInput) {
                            dashboardInput.value = this.volume;
                        }
                    } else {
                        showNotification(data.message, 'danger');
                    }
                })
                .catch((error) => {
                    // Network failure or a non-JSON reply (e.g. an error page).
                    showNotification(`Failed to update volume: ${error.message}`, 'danger');
                });
        },

        // Persist the whole hour -> weight map.
        updateSchedule() {
            postConfig({ schedule: this.schedule })
                .then((data) => {
                    if (data.success) {
                        showNotification('Schedule updated successfully', 'success');
                        this.selectedHours = []; // Clear selections after update
                    } else {
                        showNotification(data.message, 'danger');
                    }
                })
                .catch((error) => {
                    // Network failure or a non-JSON reply (e.g. an error page).
                    showNotification(`Failed to update schedule: ${error.message}`, 'danger');
                });
        }
    };
}
// Global variables for the scraper dashboard
let notificationsEnabled = true;
let activityChart = null;
let currentTimeRange = 24;
@ -249,8 +450,6 @@
updateVolume();
});
document.getElementById('updateScheduleButton').addEventListener('click', updateSchedule);
document.querySelectorAll('.time-range-btn').forEach(btn => {
btn.addEventListener('click', function () {
document.querySelectorAll('.time-range-btn').forEach(b => b.classList.remove('active'));
@ -357,31 +556,6 @@
});
}
/**
 * Collect the value of every `.hour-weight` select, keyed by its data-hour
 * attribute, and POST the resulting schedule to /scraper/update_config.
 * The outcome is reported through showNotification.
 */
function updateSchedule() {
    const schedule = {};

    document.querySelectorAll('.hour-weight').forEach((select) => {
        schedule[select.dataset.hour] = select.value;
    });

    fetch('/scraper/update_config', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify({ schedule: schedule })
    })
        .then((response) => response.json())
        .then((data) => {
            if (data.success) {
                showNotification('Schedule updated successfully', 'success');
            } else {
                showNotification(data.message, 'danger');
            }
        })
        .catch((error) => {
            // Network failure or a non-JSON reply would otherwise go unnoticed.
            showNotification(`Failed to update schedule: ${error.message}`, 'danger');
        });
}
// Copy the toggle's checked state into the notificationsEnabled flag.
// NOTE(review): notificationsToggle is not declared in the visible code;
// presumably it refers to the #notificationsToggle checkbox - confirm.
function toggleNotifications() {
    const { checked } = notificationsToggle;
    notificationsEnabled = checked;
}