adds initial scraper logic

Michael Beck 2025-04-16 12:28:39 +02:00
parent 6e119f1412
commit b09c6f1b9b
7 changed files with 1000 additions and 9 deletions

View File

@@ -18,7 +18,7 @@ def create_app(test_config=None):
app.config.update(test_config)
db.init_app(app)
migrate = Migrate(app, db) # Add this line to initialize Flask-Migrate
migrate = Migrate(app, db)
with app.app_context():
db.create_all()

View File

@@ -6,6 +6,8 @@ from .papers import bp as papers_bp
from .upload import bp as upload_bp
from .schedule import bp as schedule_bp
from .logger import bp as logger_bp
from .api import bp as api_bp
from .scraper import bp as scraper_bp
def register_blueprints(app: Flask):
@@ -15,3 +17,5 @@ def register_blueprints(app: Flask):
app.register_blueprint(upload_bp, url_prefix='/upload')
app.register_blueprint(schedule_bp, url_prefix='/schedule')
app.register_blueprint(logger_bp, url_prefix='/logs')
app.register_blueprint(api_bp, url_prefix='/api')
app.register_blueprint(scraper_bp, url_prefix='/scraper')

View File

@@ -0,0 +1,50 @@
from datetime import datetime
from flask import Blueprint, jsonify, request
from ..models import ActivityLog, ActivityCategory
bp = Blueprint("api", __name__, url_prefix="/api")
@bp.route("/activity_logs")
def get_activity_logs():
"""Get activity logs with filtering options."""
# Get query parameters
category = request.args.get("category")
action = request.args.get("action")
after = request.args.get("after")
limit = request.args.get("limit", 20, type=int)
# Build query
query = ActivityLog.query
if category:
query = query.filter(ActivityLog.category == category)
if action:
query = query.filter(ActivityLog.action == action)
if after:
try:
after_date = datetime.fromisoformat(after.replace("Z", "+00:00"))
query = query.filter(ActivityLog.timestamp > after_date)
except (ValueError, TypeError):
pass
# Order by most recent first and limit results
logs = query.order_by(ActivityLog.timestamp.desc()).limit(limit).all()
# Format the results
result = []
for log in logs:
log_data = {
"id": log.id,
"timestamp": log.timestamp.isoformat(),
"category": log.category,
"action": log.action,
"description": log.description,
"status": log.status,
"paper_id": log.paper_id,
"extra_data": log.extra_data
}
result.append(log_data)
return jsonify(result)
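
A minimal sketch of exercising this endpoint with Flask's test client, assuming the create_app() factory from the first hunk; the import path is a placeholder, not the project's real package name:

from datetime import datetime, timedelta
from yourapp import create_app  # placeholder import path (hypothetical)

app = create_app()
with app.test_client() as client:
    since = (datetime.utcnow() - timedelta(hours=1)).isoformat()
    resp = client.get("/api/activity_logs", query_string={
        "category": "scraper_activity",  # same value the control-panel JS passes
        "after": since,
        "limit": 5,
    })
    for entry in resp.get_json():
        print(entry["timestamp"], entry["action"], entry["status"])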

View File

@@ -0,0 +1,344 @@
import random
import json
from datetime import datetime, timedelta
from flask import Blueprint, jsonify, render_template, request, current_app
from ..models import ScheduleConfig, VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory
from ..db import db
from ..celery import celery
bp = Blueprint("scraper", __name__, url_prefix="/scraper")
# Global variables to track scraper state
SCRAPER_ACTIVE = False
SCRAPER_PAUSED = False
@bp.route("/")
def index():
"""Render the scraper control panel."""
volume_config = VolumeConfig.query.first()
schedule_config = {record.hour: record.weight for record in ScheduleConfig.query.all()}
return render_template(
"scraper.html.jinja",
volume_config=volume_config,
schedule_config=schedule_config,
scraper_active=SCRAPER_ACTIVE,
scraper_paused=SCRAPER_PAUSED
)
@bp.route("/start", methods=["POST"])
def start_scraper():
"""Start the scraper."""
global SCRAPER_ACTIVE, SCRAPER_PAUSED
if not SCRAPER_ACTIVE:
SCRAPER_ACTIVE = True
SCRAPER_PAUSED = False
# Log the action
ActivityLog.log_scraper_command(
action="start_scraper",
status="success",
description="Scraper started manually"
)
# Start the scheduler task
task = dummy_scraper_scheduler.delay()
return jsonify({
"success": True,
"message": "Scraper started",
"task_id": task.id
})
else:
return jsonify({
"success": False,
"message": "Scraper is already running"
})
@bp.route("/stop", methods=["POST"])
def stop_scraper():
"""Stop the scraper."""
global SCRAPER_ACTIVE, SCRAPER_PAUSED
if SCRAPER_ACTIVE:
SCRAPER_ACTIVE = False
SCRAPER_PAUSED = False
ActivityLog.log_scraper_command(
action="stop_scraper",
status="success",
description="Scraper stopped manually"
)
return jsonify({
"success": True,
"message": "Scraper stopped"
})
else:
return jsonify({
"success": False,
"message": "Scraper is not running"
})
@bp.route("/pause", methods=["POST"])
def pause_scraper():
"""Pause the scraper."""
global SCRAPER_ACTIVE, SCRAPER_PAUSED
if SCRAPER_ACTIVE and not SCRAPER_PAUSED:
SCRAPER_PAUSED = True
ActivityLog.log_scraper_command(
action="pause_scraper",
status="success",
description="Scraper paused manually"
)
return jsonify({
"success": True,
"message": "Scraper paused"
})
elif SCRAPER_ACTIVE and SCRAPER_PAUSED:
SCRAPER_PAUSED = False
ActivityLog.log_scraper_command(
action="resume_scraper",
status="success",
description="Scraper resumed manually"
)
return jsonify({
"success": True,
"message": "Scraper resumed"
})
else:
return jsonify({
"success": False,
"message": "Scraper is not running"
})
@bp.route("/status")
def scraper_status():
"""Get the current status of the scraper."""
return jsonify({
"active": SCRAPER_ACTIVE,
"paused": SCRAPER_PAUSED,
"current_hour": datetime.now().hour,
})
@bp.route("/stats")
def scraper_stats():
"""Get scraper statistics for the dashboard."""
# Get the last 24 hours of activity
hours = 24
if request.args.get('hours'):
try:
hours = int(request.args.get('hours'))
except ValueError:
pass
cutoff_time = datetime.utcnow().replace(
minute=0, second=0, microsecond=0
)
# Get activity logs for scraper actions
logs = ActivityLog.query.filter(
ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value,
ActivityLog.timestamp >= cutoff_time - timedelta(hours=hours)
).all()
# Group by hour and status
stats = {}
for hour in range(hours):
target_hour = (cutoff_time.hour - hour) % 24
stats[target_hour] = {
"success": 0,
"error": 0,
"pending": 0,
"hour": target_hour,
}
for log in logs:
hour = log.timestamp.hour
if hour in stats:
if log.status == "success":
stats[hour]["success"] += 1
elif log.status == "error":
stats[hour]["error"] += 1
elif log.status == "pending":
stats[hour]["pending"] += 1
# Convert to list for easier consumption by JavaScript
result = [stats[hour] for hour in sorted(stats.keys())]
return jsonify(result)
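
For orientation, the endpoint returns a list of per-hour buckets with exactly the keys built above; the control panel's renderActivityChart() consumes them as-is. The numbers below are illustrative only:

# Illustrative /scraper/stats response shape (values are made up)
example_stats = [
    {"hour": 9, "success": 4, "error": 1, "pending": 0},
    {"hour": 10, "success": 6, "error": 0, "pending": 2},
]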
@bp.route("/update_config", methods=["POST"])
def update_config():
"""Update scraper configuration."""
data = request.json
if "volume" in data:
try:
new_volume = float(data["volume"])
volume_config = VolumeConfig.query.first()
if not volume_config:
volume_config = VolumeConfig(volume=new_volume)
db.session.add(volume_config)
else:
old_value = volume_config.volume
volume_config.volume = new_volume
ActivityLog.log_config_change(
config_key="scraper_volume",
old_value=old_value,
new_value=new_volume,
description="Updated scraper volume"
)
db.session.commit()
except (ValueError, TypeError):
return jsonify({"success": False, "message": "Invalid volume value"})
if "schedule" in data:
try:
schedule = data["schedule"]
for hour_str, weight in schedule.items():
hour = int(hour_str)
weight = float(weight)
if 0 <= hour <= 23 and weight >= 0:
schedule_config = ScheduleConfig.query.get(hour)
if not schedule_config:
schedule_config = ScheduleConfig(hour=hour, weight=weight)
db.session.add(schedule_config)
else:
old_value = schedule_config.weight
schedule_config.weight = weight
ActivityLog.log_config_change(
config_key=f"schedule_hour_{hour}",
old_value=old_value,
new_value=weight,
description=f"Updated schedule weight for hour {hour}"
)
db.session.commit()
except (ValueError, TypeError):
return jsonify({"success": False, "message": "Invalid schedule format"})
return jsonify({"success": True, "message": "Configuration updated"})
# Define the Celery tasks
@celery.task(bind=True)
def dummy_scraper_scheduler(self):
"""Main scheduler task for the dummy scraper."""
global SCRAPER_ACTIVE, SCRAPER_PAUSED
if not SCRAPER_ACTIVE:
return {"status": "Scraper not active"}
if SCRAPER_PAUSED:
return {"status": "Scraper paused"}
# Calculate how many papers to scrape based on current hour and configuration
current_hour = datetime.now().hour
hour_config = ScheduleConfig.query.get(current_hour)
volume_config = VolumeConfig.query.first()
if not hour_config or not volume_config:
return {"status": "Missing configuration"}
# Calculate papers to scrape this hour
hourly_rate = volume_config.volume / 24 # Base rate per hour
adjusted_rate = hourly_rate * (1 / hour_config.weight) # Adjust by weight
papers_to_scrape = int(adjusted_rate)
# Log the scheduling decision
ActivityLog.log_scraper_activity(
action="schedule_papers",
status="success",
description=f"Scheduled {papers_to_scrape} papers for scraping at hour {current_hour}",
hourly_rate=hourly_rate,
weight=hour_config.weight,
adjusted_rate=adjusted_rate,
)
# Launch individual scraping tasks
for _ in range(papers_to_scrape):
if not SCRAPER_ACTIVE or SCRAPER_PAUSED:
break
# Schedule a new paper to be scraped
dummy_scrape_paper.delay()
# Schedule the next run in 5 minutes if still active
if SCRAPER_ACTIVE:
dummy_scraper_scheduler.apply_async(countdown=300) # 5 minutes
return {"status": "success", "papers_scheduled": papers_to_scrape}
@celery.task(bind=True)
def dummy_scrape_paper(self):
"""Simulate scraping a single paper."""
# Simulate success or failure
success = random.random() > 0.3 # 70% success rate
# Simulate processing time
import time
time.sleep(random.randint(2, 5)) # 2-5 seconds
if success:
# Create a dummy paper
new_paper = PaperMetadata(
title=f"Dummy Paper {random.randint(1000, 9999)}",
doi=f"10.1234/dummy.{random.randint(1000, 9999)}",
journal=random.choice([
"Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
"Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
]),
type="article",
language="en",
published_online=datetime.now().date(),
status="Done",
file_path="/path/to/dummy/paper.pdf"
)
db.session.add(new_paper)
db.session.commit()
# Log the successful scrape
ActivityLog.log_scraper_activity(
action="scrape_paper",
paper_id=new_paper.id,
status="success",
description=f"Successfully scraped paper {new_paper.doi}"
)
return {
"success": True,
"paper_id": new_paper.id,
"title": new_paper.title,
"doi": new_paper.doi
}
else:
# Log the failed scrape
error_message = random.choice([
"Connection timeout",
"404 Not Found",
"Access denied",
"Invalid DOI format",
"PDF download failed",
"Rate limited by publisher"
])
ActivityLog.log_scraper_activity(
action="scrape_paper",
status="error",
description=f"Failed to scrape paper: {error_message}"
)
return {
"success": False,
"error": error_message
}
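
A minimal sketch of driving the control flow over HTTP with Flask's test client; the import path is a placeholder, and a reachable Celery broker is assumed because /scraper/start enqueues the scheduler task:

from yourapp import create_app  # placeholder import path (hypothetical)

app = create_app()
with app.test_client() as client:
    print(client.post("/scraper/start").get_json())  # {"success": True, "message": "Scraper started", ...}
    print(client.get("/scraper/status").get_json())  # {"active": True, "paused": False, ...}
    print(client.post("/scraper/pause").get_json())  # toggles between pause and resume
    print(client.post("/scraper/stop").get_json())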

View File

@@ -7,6 +7,9 @@
</button>
<div class="collapse navbar-collapse" id="navbarSupportedContent">
<ul class="navbar-nav me-auto mb-2 mb-lg-0">
<li class="nav-item">
<a class="nav-link" href="{{ url_for('scraper.index') }}">Scraper</a>
</li>
<li class="nav-item">
<a class="nav-link" href="{{ url_for('upload.upload') }}">Import CSV</a>
</li>

View File

@@ -144,13 +144,13 @@
</th>
<th>
{% set params = request.args.to_dict() %}
{% set params = params.update({'sort_by': 'journal', 'sort_dir': journal_sort}) or params %}
<a href="{{ url_for('papers.list_papers', **params) }}">Journal</a>
{% set params = params.update({'sort_by': 'doi', 'sort_dir': doi_sort}) or params %}
<a href="{{ url_for('papers.list_papers', **params) }}">DOI</a>
</th>
<th>
{% set params = request.args.to_dict() %}
{% set params = params.update({'sort_by': 'doi', 'sort_dir': doi_sort}) or params %}
<a href="{{ url_for('papers.list_papers', **params) }}">DOI</a>
{% set params = params.update({'sort_by': 'journal', 'sort_dir': journal_sort}) or params %}
<a href="{{ url_for('papers.list_papers', **params) }}">Journal</a>
</th>
<th>
{% set params = request.args.to_dict() %}
@@ -186,10 +186,9 @@
<path
d="M9.5 1a.5.5 0 0 1 .5.5v1a.5.5 0 0 1-.5.5h-3a.5.5 0 0 1-.5-.5v-1a.5.5 0 0 1 .5-.5h3zm-3-1A1.5 1.5 0 0 0 5 1.5v1A1.5 1.5 0 0 0 6.5 4h3A1.5 1.5 0 0 0 11 2.5v-1A1.5 1.5 0 0 0 9.5 0h-3z" />
</svg>
{{ paper.title }}
{{ paper.title|escape }}
</a>
</td>
<td>{{ paper.journal }}</td>
<td>
<a href="https://doi.org/{{ paper.doi }}" target="_blank" class="icon-link icon-link-hover">
{{ paper.doi }}
@@ -199,7 +198,17 @@
</svg>
</a>
</td>
<td>{{ paper.issn }}</td>
<td>{{ paper.journal }}</td>
<td>
<a href="https://search.worldcat.org/search?q=issn:{{ paper.issn }}" target="_blank"
class="icon-link icon-link-hover">
{{ paper.issn }}
<svg xmlns="http://www.w3.org/2000/svg" class="bi" viewBox="0 0 16 16" aria-hidden="true">
<path
d="M1 8a.5.5 0 0 1 .5-.5h11.793l-3.147-3.146a.5.5 0 0 1 .708-.708l4 4a.5.5 0 0 1 0 .708l-4 4a.5.5 0 0 1-.708-.708L13.293 8.5H1.5A.5.5 0 0 1 1 8z" />
</svg>
</a>
</td>
<td>{{ paper.status }}</td>
<td>{{ paper.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
<td>{{ paper.updated_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>

View File

@@ -0,0 +1,581 @@
{% extends "base.html.jinja" %}
{% block title %}Paper Scraper Control Panel{% endblock title %}
{% block styles %}
{{ super() }}
<style>
.status-indicator {
width: 15px;
height: 15px;
border-radius: 50%;
display: inline-block;
margin-right: 5px;
}
.status-active {
background-color: #28a745;
}
.status-paused {
background-color: #ffc107;
}
.status-inactive {
background-color: #dc3545;
}
.stats-chart {
height: 400px;
}
.notification {
position: fixed;
bottom: 20px;
right: 20px;
max-width: 350px;
z-index: 1050;
}
.schedule-grid {
display: grid;
grid-template-columns: repeat(6, 1fr);
gap: 10px;
}
.hour-block {
padding: 10px;
border-radius: 5px;
text-align: center;
}
.weight-1 {
background-color: #d4edda;
}
.weight-0-7 {
background-color: #d1ecf1;
}
.weight-0-5 {
background-color: #fff3cd;
}
.weight-0-2 {
background-color: #f8d7da;
}
.weight-0-1 {
background-color: #f5c6cb;
}
</style>
{% endblock styles %}
{% block content %}
<div class="container mt-4">
<h1>Paper Scraper Control Panel</h1>
<div class="row mb-4">
<div class="col-md-6">
<div class="card">
<div class="card-header">
<h5>Scraper Status</h5>
</div>
<div class="card-body">
<div class="d-flex align-items-center mb-3">
<div id="statusIndicator" class="status-indicator status-inactive"></div>
<span id="statusText">Inactive</span>
</div>
<div class="btn-group" role="group">
<button id="startButton" class="btn btn-success">Start</button>
<button id="pauseButton" class="btn btn-warning" disabled>Pause</button>
<button id="stopButton" class="btn btn-danger" disabled>Stop</button>
</div>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card">
<div class="card-header">
<h5>Volume Configuration</h5>
</div>
<div class="card-body">
<form id="volumeForm">
<div class="form-group">
<label for="volumeInput">Papers per day:</label>
<input type="number" class="form-control" id="volumeInput"
value="{{ volume_config.volume }}">
</div>
<button type="submit" class="btn btn-primary mt-2">Update Volume</button>
</form>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h5>Schedule Configuration</h5>
<small class="text-muted">Weight factor for each hour (lower value = higher scraping rate)</small>
</div>
<div class="card-body">
<div class="schedule-grid">
{% for hour in range(24) %}
{% set weight = schedule_config.get(hour, 1.0) %}
{% set weight_class = "weight-1" %}
{% if weight == 0.1 %}
{% set weight_class = "weight-0-1" %}
{% elif weight == 0.2 %}
{% set weight_class = "weight-0-2" %}
{% elif weight == 0.5 %}
{% set weight_class = "weight-0-5" %}
{% elif weight == 0.7 %}
{% set weight_class = "weight-0-7" %}
{% endif %}
<div class="hour-block border {{ weight_class }}" data-hour="{{ hour }}">
<div class="hour-label">{{ "%02d:00"|format(hour) }}</div>
<select class="form-control hour-weight mt-1" data-hour="{{ hour }}">
<option value="0.1" {% if weight==0.1 %}selected{% endif %}>Very High</option>
<option value="0.2" {% if weight==0.2 %}selected{% endif %}>High</option>
<option value="0.5" {% if weight==0.5 %}selected{% endif %}>Medium</option>
<option value="0.7" {% if weight==0.7 %}selected{% endif %}>Low</option>
<option value="1.0" {% if weight==1.0 %}selected{% endif %}>Very Low</option>
</select>
</div>
{% endfor %}
</div>
<button id="updateScheduleButton" class="btn btn-primary mt-3">Update Schedule</button>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header d-flex justify-content-between align-items-center">
<h5>Scraping Activity</h5>
<div>
<div class="form-check form-switch">
<input class="form-check-input" type="checkbox" id="notificationsToggle" checked>
<label class="form-check-label" for="notificationsToggle">Show Notifications</label>
</div>
</div>
</div>
<div class="card-body">
<div class="btn-group mb-3">
<button class="btn btn-outline-secondary time-range-btn" data-hours="6">Last 6 hours</button>
<button class="btn btn-outline-secondary time-range-btn active" data-hours="24">Last 24
hours</button>
<button class="btn btn-outline-secondary time-range-btn" data-hours="72">Last 3 days</button>
</div>
<div class="stats-chart" id="activityChart"></div>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h5>Recent Activity</h5>
</div>
<div class="card-body">
<div class="table-responsive">
<table class="table table-striped">
<thead>
<tr>
<th>Time</th>
<th>Action</th>
<th>Status</th>
<th>Description</th>
</tr>
</thead>
<tbody id="activityLog">
<tr>
<td colspan="4" class="text-center">Loading activities...</td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
</div>
</div>
</div>
<!-- Notification template -->
<div id="notificationContainer"></div>
{% endblock content %}
{% block scripts %}
{{ super() }}
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script>
// Global variables
let notificationsEnabled = true;
let activityChart = null;
let currentTimeRange = 24;
// DOM elements
const statusIndicator = document.getElementById('statusIndicator');
const statusText = document.getElementById('statusText');
const startButton = document.getElementById('startButton');
const pauseButton = document.getElementById('pauseButton');
const stopButton = document.getElementById('stopButton');
const notificationsToggle = document.getElementById('notificationsToggle');
const activityLog = document.getElementById('activityLog');
// Initialize the page
document.addEventListener('DOMContentLoaded', function () {
initStatusPolling();
loadActivityStats(currentTimeRange);
loadRecentActivity();
// Initialize event listeners
startButton.addEventListener('click', startScraper);
pauseButton.addEventListener('click', togglePauseScraper);
stopButton.addEventListener('click', stopScraper);
notificationsToggle.addEventListener('click', toggleNotifications);
document.getElementById('volumeForm').addEventListener('submit', function (e) {
e.preventDefault();
updateVolume();
});
document.getElementById('updateScheduleButton').addEventListener('click', updateSchedule);
document.querySelectorAll('.time-range-btn').forEach(btn => {
btn.addEventListener('click', function () {
document.querySelectorAll('.time-range-btn').forEach(b => b.classList.remove('active'));
this.classList.add('active');
currentTimeRange = parseInt(this.dataset.hours);
loadActivityStats(currentTimeRange);
});
});
});
// Status polling
function initStatusPolling() {
updateStatus();
setInterval(updateStatus, 5000); // Poll every 5 seconds
}
function updateStatus() {
fetch('/scraper/status')
.then(response => response.json())
.then(data => {
if (data.active) {
if (data.paused) {
statusIndicator.className = 'status-indicator status-paused';
statusText.textContent = 'Paused';
pauseButton.textContent = 'Resume';
} else {
statusIndicator.className = 'status-indicator status-active';
statusText.textContent = 'Active';
pauseButton.textContent = 'Pause';
}
startButton.disabled = true;
pauseButton.disabled = false;
stopButton.disabled = false;
} else {
statusIndicator.className = 'status-indicator status-inactive';
statusText.textContent = 'Inactive';
startButton.disabled = false;
pauseButton.disabled = true;
stopButton.disabled = true;
}
});
}
// Action functions
function startScraper() {
fetch('/scraper/start', { method: 'POST' })
.then(response => response.json())
.then(data => {
if (data.success) {
showNotification('Scraper started successfully', 'success');
updateStatus();
setTimeout(() => { loadRecentActivity(); }, 1000);
} else {
showNotification(data.message, 'danger');
}
});
}
function togglePauseScraper() {
fetch('/scraper/pause', { method: 'POST' })
.then(response => response.json())
.then(data => {
if (data.success) {
showNotification(data.message, 'info');
updateStatus();
setTimeout(() => { loadRecentActivity(); }, 1000);
} else {
showNotification(data.message, 'danger');
}
});
}
function stopScraper() {
fetch('/scraper/stop', { method: 'POST' })
.then(response => response.json())
.then(data => {
if (data.success) {
showNotification('Scraper stopped successfully', 'warning');
updateStatus();
setTimeout(() => { loadRecentActivity(); }, 1000);
} else {
showNotification(data.message, 'danger');
}
});
}
function updateVolume() {
const volume = document.getElementById('volumeInput').value;
fetch('/scraper/update_config', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({ volume: volume })
})
.then(response => response.json())
.then(data => {
if (data.success) {
showNotification('Volume updated successfully', 'success');
} else {
showNotification(data.message, 'danger');
}
});
}
function updateSchedule() {
const schedule = {};
document.querySelectorAll('.hour-weight').forEach(select => {
const hour = select.dataset.hour;
const weight = select.value;
schedule[hour] = weight;
});
fetch('/scraper/update_config', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({ schedule: schedule })
})
.then(response => response.json())
.then(data => {
if (data.success) {
showNotification('Schedule updated successfully', 'success');
} else {
showNotification(data.message, 'danger');
}
});
}
function toggleNotifications() {
notificationsEnabled = notificationsToggle.checked;
}
// Load data functions
function loadActivityStats(hours) {
fetch(`/scraper/stats?hours=${hours}`)
.then(response => response.json())
.then(data => {
renderActivityChart(data);
});
}
function loadRecentActivity() {
fetch('/api/activity_logs?category=scraper_activity&limit=20')
.then(response => response.json())
.then(data => {
renderActivityLog(data);
})
.catch(() => {
// If the API endpoint doesn't exist, just show a message
activityLog.innerHTML = '<tr><td colspan="4" class="text-center">Activity log API not available</td></tr>';
});
}
// Rendering functions
function renderActivityChart(data) {
const ctx = document.getElementById('activityChart').getContext('2d');
// Extract the data for the chart
const labels = data.map(item => `${item.hour}:00`);
const successData = data.map(item => item.success);
const errorData = data.map(item => item.error);
const pendingData = data.map(item => item.pending);
if (activityChart) {
activityChart.destroy();
}
activityChart = new Chart(ctx, {
type: 'bar',
data: {
labels: labels,
datasets: [
{
label: 'Success',
data: successData,
backgroundColor: '#28a745',
stack: 'Stack 0'
},
{
label: 'Error',
data: errorData,
backgroundColor: '#dc3545',
stack: 'Stack 0'
},
{
label: 'Pending',
data: pendingData,
backgroundColor: '#ffc107',
stack: 'Stack 0'
}
]
},
options: {
responsive: true,
maintainAspectRatio: false,
scales: {
x: {
stacked: true,
title: {
display: true,
text: 'Hour'
}
},
y: {
stacked: true,
beginAtZero: true,
title: {
display: true,
text: 'Papers Scraped'
}
}
}
}
});
}
function renderActivityLog(logs) {
activityLog.innerHTML = '';
if (!logs || logs.length === 0) {
activityLog.innerHTML = '<tr><td colspan="4" class="text-center">No recent activity</td></tr>';
return;
}
logs.forEach(log => {
const row = document.createElement('tr');
// Format timestamp
const date = new Date(log.timestamp);
const timeStr = date.toLocaleTimeString();
// Create status badge
let statusBadge = '';
if (log.status === 'success') {
statusBadge = '<span class="badge bg-success">Success</span>';
} else if (log.status === 'error') {
statusBadge = '<span class="badge bg-danger">Error</span>';
} else if (log.status === 'pending') {
statusBadge = '<span class="badge bg-warning text-dark">Pending</span>';
} else {
statusBadge = `<span class="badge bg-secondary">${log.status || 'Unknown'}</span>`;
}
row.innerHTML = `
<td>${timeStr}</td>
<td>${log.action}</td>
<td>${statusBadge}</td>
<td>${log.description || ''}</td>
`;
activityLog.appendChild(row);
});
}
// Notification functions
function showNotification(message, type) {
if (!notificationsEnabled && type !== 'danger') {
return;
}
const container = document.getElementById('notificationContainer');
const notification = document.createElement('div');
notification.className = `alert alert-${type} notification shadow-sm`;
notification.innerHTML = `
${message}
<button type="button" class="btn-close float-end" aria-label="Close"></button>
`;
container.appendChild(notification);
// Add close handler
notification.querySelector('.btn-close').addEventListener('click', () => {
notification.remove();
});
// Auto-close after 5 seconds
setTimeout(() => {
notification.classList.add('fade');
setTimeout(() => {
notification.remove();
}, 500);
}, 5000);
}
// WebSocket for real-time notifications
function setupWebSocket() {
// If WebSocket is available, implement it here
// For now we'll poll the server periodically for new papers
setInterval(checkForNewPapers, 10000); // Check every 10 seconds
}
let lastPaperTimestamp = new Date().toISOString();
function checkForNewPapers() {
fetch(`/api/activity_logs?category=scraper_activity&action=scrape_paper&after=${lastPaperTimestamp}&limit=5`)
.then(response => response.json())
.then(data => {
if (data && data.length > 0) {
// Update the timestamp
lastPaperTimestamp = new Date().toISOString();
// Show notifications for new papers
data.forEach(log => {
const extraData = log.extra_data ? JSON.parse(log.extra_data) : {};
if (log.status === 'success') {
showNotification(`New paper scraped: ${extraData.title || 'Unknown title'}`, 'success');
} else if (log.status === 'error') {
showNotification(`Failed to scrape paper: ${log.description}`, 'danger');
}
});
// Refresh the activity chart and log
loadActivityStats(currentTimeRange);
loadRecentActivity();
}
})
.catch(() => {
// If the API endpoint doesn't exist, do nothing
});
}
// Start checking for new papers
setupWebSocket();
</script>
{% endblock scripts %}