adds initial scraper logic
This commit is contained in:
parent
6e119f1412
commit
b09c6f1b9b
@ -18,7 +18,7 @@ def create_app(test_config=None):
|
||||
app.config.update(test_config)
|
||||
|
||||
db.init_app(app)
|
||||
migrate = Migrate(app, db) # Add this line to initialize Flask-Migrate
|
||||
migrate = Migrate(app, db)
|
||||
|
||||
with app.app_context():
|
||||
db.create_all()
|
||||
|
@ -6,6 +6,8 @@ from .papers import bp as papers_bp
|
||||
from .upload import bp as upload_bp
|
||||
from .schedule import bp as schedule_bp
|
||||
from .logger import bp as logger_bp
|
||||
from .api import bp as api_bp
|
||||
from .scraper import bp as scraper_bp
|
||||
|
||||
|
||||
def register_blueprints(app: Flask):
|
||||
@ -14,4 +16,6 @@ def register_blueprints(app: Flask):
|
||||
app.register_blueprint(papers_bp, url_prefix='/papers')
|
||||
app.register_blueprint(upload_bp, url_prefix='/upload')
|
||||
app.register_blueprint(schedule_bp, url_prefix='/schedule')
|
||||
app.register_blueprint(logger_bp, url_prefix='/logs')
|
||||
app.register_blueprint(logger_bp, url_prefix='/logs')
|
||||
app.register_blueprint(api_bp, url_prefix='/api')
|
||||
app.register_blueprint(scraper_bp, url_prefix='/scraper')
|
50
scipaperloader/blueprints/api.py
Normal file
50
scipaperloader/blueprints/api.py
Normal file
@ -0,0 +1,50 @@
|
||||
from datetime import datetime
|
||||
from flask import Blueprint, jsonify, request
|
||||
from ..models import ActivityLog, ActivityCategory
|
||||
|
||||
bp = Blueprint("api", __name__, url_prefix="/api")
|
||||
|
||||
@bp.route("/activity_logs")
def get_activity_logs():
    """Return activity logs as a JSON list, newest first.

    Query parameters:
        category: keep only logs whose ``category`` equals this value.
        action: keep only logs whose ``action`` equals this value.
        after: ISO-8601 timestamp; keep only logs strictly newer than it.
            A trailing ``Z`` is accepted as UTC.
        limit: maximum number of rows (default 20, clamped to 0..500).

    Returns:
        A JSON list of log dictionaries, or a 400 JSON error object when
        ``after`` cannot be parsed as an ISO-8601 timestamp.
    """
    from datetime import timezone  # local import: the file only imports ``datetime``

    max_limit = 500

    category = request.args.get("category")
    action = request.args.get("action")
    after = request.args.get("after")

    # ``type=int`` falls back to the default on unparsable input; the clamp
    # additionally rejects negative values and unbounded result sets.
    limit = request.args.get("limit", 20, type=int)
    limit = max(0, min(limit, max_limit))

    query = ActivityLog.query

    if category:
        query = query.filter(ActivityLog.category == category)

    if action:
        query = query.filter(ActivityLog.action == action)

    if after:
        try:
            after_date = datetime.fromisoformat(after.replace("Z", "+00:00"))
        except (ValueError, TypeError):
            # Ignoring a bad cursor would silently return unfiltered logs.
            return jsonify({"error": "Invalid 'after' timestamp"}), 400

        if after_date.tzinfo is not None:
            # NOTE(review): assumes ActivityLog.timestamp stores naive UTC
            # values -- confirm against the model definition.
            after_date = after_date.astimezone(timezone.utc).replace(tzinfo=None)

        query = query.filter(ActivityLog.timestamp > after_date)

    # Most recent entries first, capped at ``limit`` rows.
    logs = query.order_by(ActivityLog.timestamp.desc()).limit(limit).all()

    result = [
        {
            "id": log.id,
            "timestamp": log.timestamp.isoformat(),
            "category": log.category,
            "action": log.action,
            "description": log.description,
            "status": log.status,
            "paper_id": log.paper_id,
            "extra_data": log.extra_data,
        }
        for log in logs
    ]

    return jsonify(result)
|
344
scipaperloader/blueprints/scraper.py
Normal file
344
scipaperloader/blueprints/scraper.py
Normal file
@ -0,0 +1,344 @@
|
||||
import random
|
||||
import json
|
||||
from datetime import datetime
|
||||
from flask import Blueprint, jsonify, render_template, request, current_app
|
||||
from ..models import ScheduleConfig, VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory
|
||||
from ..db import db
|
||||
from ..celery import celery
|
||||
|
||||
bp = Blueprint("scraper", __name__, url_prefix="/scraper")
|
||||
|
||||
# Global variables to track scraper state
|
||||
SCRAPER_ACTIVE = False
|
||||
SCRAPER_PAUSED = False
|
||||
|
||||
@bp.route("/")
def index():
    """Show the scraper control panel page.

    Passes the first ``VolumeConfig`` row, an hour -> weight mapping built
    from every ``ScheduleConfig`` row, and the current scraper state flags
    to the ``scraper.html.jinja`` template.
    """
    first_volume = VolumeConfig.query.first()

    weights_by_hour = {}
    for entry in ScheduleConfig.query.all():
        weights_by_hour[entry.hour] = entry.weight

    template_context = {
        "volume_config": first_volume,
        "schedule_config": weights_by_hour,
        "scraper_active": SCRAPER_ACTIVE,
        "scraper_paused": SCRAPER_PAUSED,
    }
    return render_template("scraper.html.jinja", **template_context)
|
||||
|
||||
@bp.route("/start", methods=["POST"])
def start_scraper():
    """Start the scraper and enqueue the scheduler task.

    Returns:
        JSON with ``success``, ``message`` and, on success, ``task_id``.
        When the scraper is already active the response has
        ``success: False``. When the scheduler task cannot be enqueued the
        state flags are reset and a 500 response with ``success: False`` is
        returned.
    """
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if SCRAPER_ACTIVE:
        return jsonify({
            "success": False,
            "message": "Scraper is already running"
        })

    # The flags are set before enqueueing because the scheduler task checks
    # them on entry.
    # NOTE(review): these are module-level globals; a Celery worker running
    # in a separate process would have its own copy -- confirm how the state
    # is meant to be shared.
    SCRAPER_ACTIVE = True
    SCRAPER_PAUSED = False

    try:
        task = dummy_scraper_scheduler.delay()
    except Exception:
        # Request boundary: without this reset a failed enqueue would leave
        # the scraper marked as active and impossible to start again.
        SCRAPER_ACTIVE = False
        current_app.logger.exception("Failed to enqueue scraper scheduler task")
        ActivityLog.log_scraper_command(
            action="start_scraper",
            status="error",
            description="Scraper could not be started: scheduler task was not queued"
        )
        return jsonify({
            "success": False,
            "message": "Scraper could not be started"
        }), 500

    # Log only once the task is actually queued.
    ActivityLog.log_scraper_command(
        action="start_scraper",
        status="success",
        description="Scraper started manually"
    )

    return jsonify({
        "success": True,
        "message": "Scraper started",
        "task_id": task.id
    })
|
||||
|
||||
@bp.route("/stop", methods=["POST"])
def stop_scraper():
    """Deactivate the scraper.

    Clears both state flags and records the command in the activity log.
    Responds with ``success: False`` when the scraper was not active.
    """
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    # Guard clause: nothing to stop.
    if not SCRAPER_ACTIVE:
        return jsonify({
            "success": False,
            "message": "Scraper is not running"
        })

    SCRAPER_ACTIVE = SCRAPER_PAUSED = False

    command_record = {
        "action": "stop_scraper",
        "status": "success",
        "description": "Scraper stopped manually",
    }
    ActivityLog.log_scraper_command(**command_record)

    return jsonify({
        "success": True,
        "message": "Scraper stopped"
    })
|
||||
|
||||
@bp.route("/pause", methods=["POST"])
def pause_scraper():
    """Toggle the scraper between paused and running.

    An active, unpaused scraper becomes paused; an active, paused scraper is
    resumed. Either transition is written to the activity log. Responds with
    ``success: False`` when the scraper is not active.
    """
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if not SCRAPER_ACTIVE:
        return jsonify({
            "success": False,
            "message": "Scraper is not running"
        })

    if SCRAPER_PAUSED:
        SCRAPER_PAUSED = False
        action_name = "resume_scraper"
        log_text = "Scraper resumed manually"
        reply_text = "Scraper resumed"
    else:
        SCRAPER_PAUSED = True
        action_name = "pause_scraper"
        log_text = "Scraper paused manually"
        reply_text = "Scraper paused"

    ActivityLog.log_scraper_command(
        action=action_name,
        status="success",
        description=log_text
    )

    return jsonify({
        "success": True,
        "message": reply_text
    })
|
||||
|
||||
@bp.route("/status")
def scraper_status():
    """Report the scraper state flags and the server's current hour as JSON."""
    snapshot = {
        "active": SCRAPER_ACTIVE,
        "paused": SCRAPER_PAUSED,
        "current_hour": datetime.now().hour,
    }
    return jsonify(snapshot)
|
||||
|
||||
@bp.route("/stats")
def scraper_stats():
    """Return per-hour scraper activity counts for the dashboard chart.

    Query parameters:
        hours: number of hourly buckets to return, ending with the current
            hour (default 24; invalid input falls back to 24; the value is
            clamped to 1..168).

    Returns:
        A JSON list in chronological order (oldest first). Each item has
        ``success``, ``error`` and ``pending`` counts, ``hour`` (hour of day
        of the bucket) and ``timestamp`` (ISO start of the bucket, so that
        ranges longer than a day remain distinguishable).
    """
    from datetime import timedelta  # local import: the file only imports ``datetime``

    default_hours = 24
    max_hours = 24 * 7

    try:
        hours = int(request.args.get("hours", default_hours))
    except (TypeError, ValueError):
        hours = default_hours
    hours = max(1, min(hours, max_hours))

    # Start of the current hour. The window covers this hour plus the
    # preceding ``hours - 1`` full hours. ``timedelta`` arithmetic is used
    # because ``datetime.replace(hour=...)`` raises for values outside 0..23.
    current_hour_start = datetime.utcnow().replace(
        minute=0, second=0, microsecond=0
    )
    window_start = current_hour_start - timedelta(hours=hours - 1)

    # One bucket per real hour, keyed by its start datetime so that ranges
    # longer than 24 hours do not collapse onto the same hour of day.
    buckets = {}
    for offset in range(hours):
        bucket_start = window_start + timedelta(hours=offset)
        buckets[bucket_start] = {
            "success": 0,
            "error": 0,
            "pending": 0,
            "hour": bucket_start.hour,
            "timestamp": bucket_start.isoformat(),
        }

    logs = ActivityLog.query.filter(
        ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value,
        ActivityLog.timestamp >= window_start
    ).all()

    counted_statuses = ("success", "error", "pending")
    for log in logs:
        # NOTE(review): assumes timestamps are stored as UTC -- confirm
        # against the model; tzinfo is dropped so keys compare as naive.
        bucket_key = log.timestamp.replace(
            minute=0, second=0, microsecond=0, tzinfo=None
        )
        bucket = buckets.get(bucket_key)
        if bucket is not None and log.status in counted_statuses:
            bucket[log.status] += 1

    # Dicts preserve insertion order, so this list is already chronological.
    return jsonify(list(buckets.values()))
|
||||
|
||||
@bp.route("/update_config", methods=["POST"])
def update_config():
    """Update the scraper volume and/or the hourly schedule weights.

    Expects a JSON object with optional keys:
        volume: a finite, non-negative number.
        schedule: an object mapping hour (0..23) to a finite weight > 0.
            Zero is rejected because the scheduler divides by the weight.

    The whole payload is validated before anything is written, so a bad
    entry never leaves a partially applied update.

    Returns:
        JSON with ``success`` and ``message``; validation failures are
        answered with HTTP 400.
    """
    import math  # local import: the module does not import math at file level

    def _reject(message):
        """Build the 400 response for a validation failure."""
        return jsonify({"success": False, "message": message}), 400

    # ``silent=True`` yields None instead of raising on a missing or
    # malformed JSON body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return _reject("Invalid JSON payload")

    # ---- validate -------------------------------------------------------
    new_volume = None
    if "volume" in data:
        try:
            new_volume = float(data["volume"])
        except (ValueError, TypeError):
            return _reject("Invalid volume value")
        if not math.isfinite(new_volume) or new_volume < 0:
            return _reject("Invalid volume value")

    new_weights = {}
    if "schedule" in data:
        schedule = data["schedule"]
        if not isinstance(schedule, dict):
            return _reject("Invalid schedule format")
        for hour_key, raw_weight in schedule.items():
            try:
                hour = int(hour_key)
                weight = float(raw_weight)
            except (ValueError, TypeError):
                return _reject("Invalid schedule format")
            if not 0 <= hour <= 23 or not math.isfinite(weight) or weight <= 0:
                return _reject("Invalid schedule format")
            new_weights[hour] = weight

    # ---- apply ----------------------------------------------------------
    try:
        if new_volume is not None:
            volume_config = VolumeConfig.query.first()
            if not volume_config:
                volume_config = VolumeConfig(volume=new_volume)
                db.session.add(volume_config)
            else:
                old_value = volume_config.volume
                volume_config.volume = new_volume
                ActivityLog.log_config_change(
                    config_key="scraper_volume",
                    old_value=old_value,
                    new_value=new_volume,
                    description="Updated scraper volume"
                )

        for hour, weight in new_weights.items():
            schedule_config = ScheduleConfig.query.get(hour)
            if not schedule_config:
                schedule_config = ScheduleConfig(hour=hour, weight=weight)
                db.session.add(schedule_config)
            else:
                old_value = schedule_config.weight
                schedule_config.weight = weight
                ActivityLog.log_config_change(
                    config_key=f"schedule_hour_{hour}",
                    old_value=old_value,
                    new_value=weight,
                    description=f"Updated schedule weight for hour {hour}"
                )

        db.session.commit()
    except Exception:
        # Leave the session clean, then let the error surface unchanged.
        db.session.rollback()
        raise

    return jsonify({"success": True, "message": "Configuration updated"})
|
||||
|
||||
# Define the Celery tasks
|
||||
@celery.task(bind=True)
def dummy_scraper_scheduler(self):
    """Scheduler task for the dummy scraper.

    Reads the weight for the current hour and the configured volume,
    enqueues that many ``dummy_scrape_paper`` tasks, and re-enqueues itself
    to run again in five minutes while the scraper is active.

    The task keeps re-enqueueing itself while the scraper is paused, while
    the configuration is missing, and while the weight is unusable, so that
    scraping picks up again once the condition is resolved. It stops
    re-enqueueing only when the scraper is no longer active.

    Returns:
        A dict with a ``status`` key and, on a normal run,
        ``papers_scheduled``.
    """
    # NOTE(review): SCRAPER_ACTIVE / SCRAPER_PAUSED are module globals; a
    # worker in a separate process would not see changes made by the web
    # process -- confirm how the state is meant to be shared.
    rerun_delay_seconds = 300  # 5 minutes between scheduler runs

    if not SCRAPER_ACTIVE:
        return {"status": "Scraper not active"}

    if SCRAPER_PAUSED:
        # Keep the chain alive; otherwise resuming would never restart it.
        dummy_scraper_scheduler.apply_async(countdown=rerun_delay_seconds)
        return {"status": "Scraper paused"}

    # Look up the configuration that applies to the current hour.
    current_hour = datetime.now().hour
    hour_config = ScheduleConfig.query.get(current_hour)
    volume_config = VolumeConfig.query.first()

    if not hour_config or not volume_config:
        dummy_scraper_scheduler.apply_async(countdown=rerun_delay_seconds)
        return {"status": "Missing configuration"}

    weight = hour_config.weight
    if not weight or weight <= 0:
        # The rate below divides by the weight; a zero or negative weight
        # cannot be turned into a rate, so skip this run.
        ActivityLog.log_scraper_activity(
            action="schedule_papers",
            status="error",
            description=f"Invalid schedule weight for hour {current_hour}; no papers scheduled",
            weight=weight,
        )
        dummy_scraper_scheduler.apply_async(countdown=rerun_delay_seconds)
        return {"status": "Invalid weight"}

    # NOTE(review): this per-hour rate is scheduled on every run, and runs
    # happen every five minutes -- confirm that this is the intended volume.
    hourly_rate = volume_config.volume / 24  # Base rate per hour
    adjusted_rate = hourly_rate * (1 / weight)  # Lower weight = higher rate
    papers_to_scrape = int(adjusted_rate)

    ActivityLog.log_scraper_activity(
        action="schedule_papers",
        status="success",
        description=f"Scheduled {papers_to_scrape} papers for scraping at hour {current_hour}",
        hourly_rate=hourly_rate,
        weight=weight,
        adjusted_rate=adjusted_rate,
    )

    # Launch the individual scraping tasks, stopping early on stop/pause.
    for _ in range(papers_to_scrape):
        if not SCRAPER_ACTIVE or SCRAPER_PAUSED:
            break
        dummy_scrape_paper.delay()

    if SCRAPER_ACTIVE:
        dummy_scraper_scheduler.apply_async(countdown=rerun_delay_seconds)

    return {"status": "success", "papers_scheduled": papers_to_scrape}
|
||||
|
||||
@celery.task(bind=True)
def dummy_scrape_paper(self):
    """Simulate scraping one paper.

    Sleeps for 2-5 seconds, then either stores a dummy ``PaperMetadata`` row
    and logs a success, or logs a randomly chosen error message. The outcome
    is drawn so that roughly 70% of calls succeed.

    Returns:
        On success: ``success``, ``paper_id``, ``title`` and ``doi``.
        On failure: ``success`` and ``error``.
    """
    import time

    # The outcome is drawn before the delay, keeping the order of random draws.
    succeeded = random.random() > 0.3  # 70% success rate
    time.sleep(random.randint(2, 5))  # 2-5 seconds of simulated work

    if not succeeded:
        failure_reasons = [
            "Connection timeout",
            "404 Not Found",
            "Access denied",
            "Invalid DOI format",
            "PDF download failed",
            "Rate limited by publisher"
        ]
        error_message = random.choice(failure_reasons)

        ActivityLog.log_scraper_activity(
            action="scrape_paper",
            status="error",
            description=f"Failed to scrape paper: {error_message}"
        )

        return {"success": False, "error": error_message}

    journal_names = [
        "Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
        "Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
    ]
    paper_title = f"Dummy Paper {random.randint(1000, 9999)}"
    paper_doi = f"10.1234/dummy.{random.randint(1000, 9999)}"
    paper_journal = random.choice(journal_names)

    new_paper = PaperMetadata(
        title=paper_title,
        doi=paper_doi,
        journal=paper_journal,
        type="article",
        language="en",
        published_online=datetime.now().date(),
        status="Done",
        file_path="/path/to/dummy/paper.pdf"
    )

    db.session.add(new_paper)
    db.session.commit()

    ActivityLog.log_scraper_activity(
        action="scrape_paper",
        paper_id=new_paper.id,
        status="success",
        description=f"Successfully scraped paper {new_paper.doi}"
    )

    outcome = {"success": True}
    outcome["paper_id"] = new_paper.id
    outcome["title"] = new_paper.title
    outcome["doi"] = new_paper.doi
    return outcome
|
@ -7,6 +7,9 @@
|
||||
</button>
|
||||
<div class="collapse navbar-collapse" id="navbarSupportedContent">
|
||||
<ul class="navbar-nav me-auto mb-2 mb-lg-0">
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="{{ url_for('scraper.index') }}">Scraper</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="{{ url_for('upload.upload') }}">Import CSV</a>
|
||||
</li>
|
||||
|
@ -144,13 +144,13 @@
|
||||
</th>
|
||||
<th>
|
||||
{% set params = request.args.to_dict() %}
|
||||
{% set params = params.update({'sort_by': 'journal', 'sort_dir': journal_sort}) or params %}
|
||||
<a href="{{ url_for('papers.list_papers', **params) }}">Journal</a>
|
||||
{% set params = params.update({'sort_by': 'doi', 'sort_dir': doi_sort}) or params %}
|
||||
<a href="{{ url_for('papers.list_papers', **params) }}">DOI</a>
|
||||
</th>
|
||||
<th>
|
||||
{% set params = request.args.to_dict() %}
|
||||
{% set params = params.update({'sort_by': 'doi', 'sort_dir': doi_sort}) or params %}
|
||||
<a href="{{ url_for('papers.list_papers', **params) }}">DOI</a>
|
||||
{% set params = params.update({'sort_by': 'journal', 'sort_dir': journal_sort}) or params %}
|
||||
<a href="{{ url_for('papers.list_papers', **params) }}">Journal</a>
|
||||
</th>
|
||||
<th>
|
||||
{% set params = request.args.to_dict() %}
|
||||
@ -186,10 +186,9 @@
|
||||
<path
|
||||
d="M9.5 1a.5.5 0 0 1 .5.5v1a.5.5 0 0 1-.5.5h-3a.5.5 0 0 1-.5-.5v-1a.5.5 0 0 1 .5-.5h3zm-3-1A1.5 1.5 0 0 0 5 1.5v1A1.5 1.5 0 0 0 6.5 4h3A1.5 1.5 0 0 0 11 2.5v-1A1.5 1.5 0 0 0 9.5 0h-3z" />
|
||||
</svg>
|
||||
{{ paper.title }}
|
||||
{{ paper.title|escape }}
|
||||
</a>
|
||||
</td>
|
||||
<td>{{ paper.journal }}</td>
|
||||
<td>
|
||||
<a href="https://doi.org/{{ paper.doi }}" target="_blank" class="icon-link icon-link-hover">
|
||||
{{ paper.doi }}
|
||||
@ -199,7 +198,17 @@
|
||||
</svg>
|
||||
</a>
|
||||
</td>
|
||||
<td>{{ paper.issn }}</td>
|
||||
<td>{{ paper.journal }}</td>
|
||||
<td>
|
||||
<a href="https://search.worldcat.org/search?q=issn:{{ paper.issn }}" target="_blank"
|
||||
class="icon-link icon-link-hover">
|
||||
{{ paper.issn }}
|
||||
<svg xmlns="http://www.w3.org/2000/svg" class="bi" viewBox="0 0 16 16" aria-hidden="true">
|
||||
<path
|
||||
d="M1 8a.5.5 0 0 1 .5-.5h11.793l-3.147-3.146a.5.5 0 0 1 .708-.708l4 4a.5.5 0 0 1 0 .708l-4 4a.5.5 0 0 1-.708-.708L13.293 8.5H1.5A.5.5 0 0 1 1 8z" />
|
||||
</svg>
|
||||
</a>
|
||||
</td>
|
||||
<td>{{ paper.status }}</td>
|
||||
<td>{{ paper.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
|
||||
<td>{{ paper.updated_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
|
||||
|
581
scipaperloader/templates/scraper.html.jinja
Normal file
581
scipaperloader/templates/scraper.html.jinja
Normal file
@ -0,0 +1,581 @@
|
||||
{% extends "base.html.jinja" %}
|
||||
|
||||
{% block title %}Paper Scraper Control Panel{% endblock title %}
|
||||
|
||||
{% block styles %}
|
||||
{{ super() }}
|
||||
<style>
|
||||
.status-indicator {
|
||||
width: 15px;
|
||||
height: 15px;
|
||||
border-radius: 50%;
|
||||
display: inline-block;
|
||||
margin-right: 5px;
|
||||
}
|
||||
|
||||
.status-active {
|
||||
background-color: #28a745;
|
||||
}
|
||||
|
||||
.status-paused {
|
||||
background-color: #ffc107;
|
||||
}
|
||||
|
||||
.status-inactive {
|
||||
background-color: #dc3545;
|
||||
}
|
||||
|
||||
.stats-chart {
|
||||
height: 400px;
|
||||
}
|
||||
|
||||
.notification {
|
||||
position: fixed;
|
||||
bottom: 20px;
|
||||
right: 20px;
|
||||
max-width: 350px;
|
||||
z-index: 1050;
|
||||
}
|
||||
|
||||
.schedule-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(6, 1fr);
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.hour-block {
|
||||
padding: 10px;
|
||||
border-radius: 5px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.weight-1 {
|
||||
background-color: #d4edda;
|
||||
}
|
||||
|
||||
.weight-0-7 {
|
||||
background-color: #d1ecf1;
|
||||
}
|
||||
|
||||
.weight-0-5 {
|
||||
background-color: #fff3cd;
|
||||
}
|
||||
|
||||
.weight-0-2 {
|
||||
background-color: #f8d7da;
|
||||
}
|
||||
|
||||
.weight-0-1 {
|
||||
background-color: #f5c6cb;
|
||||
}
|
||||
</style>
|
||||
{% endblock styles %}
|
||||
|
||||
{% block content %}
|
||||
<div class="container mt-4">
|
||||
<h1>Paper Scraper Control Panel</h1>
|
||||
|
||||
<div class="row mb-4">
|
||||
<div class="col-md-6">
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h5>Scraper Status</h5>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="d-flex align-items-center mb-3">
|
||||
<div id="statusIndicator" class="status-indicator status-inactive"></div>
|
||||
<span id="statusText">Inactive</span>
|
||||
</div>
|
||||
|
||||
<div class="btn-group" role="group">
|
||||
<button id="startButton" class="btn btn-success">Start</button>
|
||||
<button id="pauseButton" class="btn btn-warning" disabled>Pause</button>
|
||||
<button id="stopButton" class="btn btn-danger" disabled>Stop</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col-md-6">
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h5>Volume Configuration</h5>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<form id="volumeForm">
|
||||
<div class="form-group">
|
||||
<label for="volumeInput">Papers per day:</label>
|
||||
<input type="number" class="form-control" id="volumeInput"
|
||||
value="{{ volume_config.volume }}">
|
||||
</div>
|
||||
<button type="submit" class="btn btn-primary mt-2">Update Volume</button>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mb-4">
|
||||
<div class="col-12">
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h5>Schedule Configuration</h5>
|
||||
<small class="text-muted">Weight factor for each hour (lower value = higher scraping rate)</small>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="schedule-grid">
|
||||
{% for hour in range(24) %}
|
||||
{% set weight = schedule_config.get(hour, 1.0) %}
|
||||
{% set weight_class = "weight-1" %}
|
||||
{% if weight == 0.1 %}
|
||||
{% set weight_class = "weight-0-1" %}
|
||||
{% elif weight == 0.2 %}
|
||||
{% set weight_class = "weight-0-2" %}
|
||||
{% elif weight == 0.5 %}
|
||||
{% set weight_class = "weight-0-5" %}
|
||||
{% elif weight == 0.7 %}
|
||||
{% set weight_class = "weight-0-7" %}
|
||||
{% endif %}
|
||||
|
||||
<div class="hour-block border {{ weight_class }}" data-hour="{{ hour }}">
|
||||
<div class="hour-label">{{ "%02d:00"|format(hour) }}</div>
|
||||
<select class="form-control hour-weight mt-1" data-hour="{{ hour }}">
|
||||
<option value="0.1" {% if weight==0.1 %}selected{% endif %}>Very High</option>
|
||||
<option value="0.2" {% if weight==0.2 %}selected{% endif %}>High</option>
|
||||
<option value="0.5" {% if weight==0.5 %}selected{% endif %}>Medium</option>
|
||||
<option value="0.7" {% if weight==0.7 %}selected{% endif %}>Low</option>
|
||||
<option value="1.0" {% if weight==1.0 %}selected{% endif %}>Very Low</option>
|
||||
</select>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
<button id="updateScheduleButton" class="btn btn-primary mt-3">Update Schedule</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mb-4">
|
||||
<div class="col-12">
|
||||
<div class="card">
|
||||
<div class="card-header d-flex justify-content-between align-items-center">
|
||||
<h5>Scraping Activity</h5>
|
||||
<div>
|
||||
<div class="form-check form-switch">
|
||||
<input class="form-check-input" type="checkbox" id="notificationsToggle" checked>
|
||||
<label class="form-check-label" for="notificationsToggle">Show Notifications</label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="btn-group mb-3">
|
||||
<button class="btn btn-outline-secondary time-range-btn" data-hours="6">Last 6 hours</button>
|
||||
<button class="btn btn-outline-secondary time-range-btn active" data-hours="24">Last 24
|
||||
hours</button>
|
||||
<button class="btn btn-outline-secondary time-range-btn" data-hours="72">Last 3 days</button>
|
||||
</div>
|
||||
<div class="stats-chart" id="activityChart"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mb-4">
|
||||
<div class="col-12">
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h5>Recent Activity</h5>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="table-responsive">
|
||||
<table class="table table-striped">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Time</th>
|
||||
<th>Action</th>
|
||||
<th>Status</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="activityLog">
|
||||
<tr>
|
||||
<td colspan="4" class="text-center">Loading activities...</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Notification template -->
|
||||
<div id="notificationContainer"></div>
|
||||
{% endblock content %}
|
||||
|
||||
{% block scripts %}
|
||||
{{ super() }}
|
||||
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||
<script>
|
||||
// Global variables
|
||||
let notificationsEnabled = true;
|
||||
let activityChart = null;
|
||||
let currentTimeRange = 24;
|
||||
|
||||
// DOM elements
|
||||
const statusIndicator = document.getElementById('statusIndicator');
|
||||
const statusText = document.getElementById('statusText');
|
||||
const startButton = document.getElementById('startButton');
|
||||
const pauseButton = document.getElementById('pauseButton');
|
||||
const stopButton = document.getElementById('stopButton');
|
||||
const notificationsToggle = document.getElementById('notificationsToggle');
|
||||
const activityLog = document.getElementById('activityLog');
|
||||
|
||||
// Initialize the page
|
||||
// Page bootstrap: start polling, load the initial data, and wire up controls.
document.addEventListener('DOMContentLoaded', () => {
    initStatusPolling();
    loadActivityStats(currentTimeRange);
    loadRecentActivity();

    // Scraper control buttons and the notification switch
    const clickHandlers = [
        [startButton, startScraper],
        [pauseButton, togglePauseScraper],
        [stopButton, stopScraper],
        [notificationsToggle, toggleNotifications]
    ];
    for (const [element, handler] of clickHandlers) {
        element.addEventListener('click', handler);
    }

    // The volume form is submitted via fetch instead of a page reload
    const volumeForm = document.getElementById('volumeForm');
    volumeForm.addEventListener('submit', (event) => {
        event.preventDefault();
        updateVolume();
    });

    const scheduleButton = document.getElementById('updateScheduleButton');
    scheduleButton.addEventListener('click', updateSchedule);

    // Time-range buttons: mark the clicked one active and reload the chart
    for (const button of document.querySelectorAll('.time-range-btn')) {
        button.addEventListener('click', () => {
            for (const other of document.querySelectorAll('.time-range-btn')) {
                other.classList.remove('active');
            }
            button.classList.add('active');
            currentTimeRange = parseInt(button.dataset.hours);
            loadActivityStats(currentTimeRange);
        });
    }
});
|
||||
|
||||
// Status polling
|
||||
// Fetch the scraper status once immediately, then again every 5 seconds.
function initStatusPolling() {
    const pollIntervalMs = 5000;
    updateStatus();
    setInterval(updateStatus, pollIntervalMs);
}
|
||||
|
||||
// Query /scraper/status and mirror the result in the indicator and buttons.
function updateStatus() {
    const applyStatus = (data) => {
        // Inactive: only the start button is usable.
        if (!data.active) {
            statusIndicator.className = 'status-indicator status-inactive';
            statusText.textContent = 'Inactive';
            startButton.disabled = false;
            pauseButton.disabled = true;
            stopButton.disabled = true;
            return;
        }

        // Active: the pause button doubles as a resume button while paused.
        const isPaused = Boolean(data.paused);
        statusIndicator.className = isPaused
            ? 'status-indicator status-paused'
            : 'status-indicator status-active';
        statusText.textContent = isPaused ? 'Paused' : 'Active';
        pauseButton.textContent = isPaused ? 'Resume' : 'Pause';

        startButton.disabled = true;
        pauseButton.disabled = false;
        stopButton.disabled = false;
    };

    fetch('/scraper/status')
        .then(response => response.json())
        .then(applyStatus);
}
|
||||
|
||||
// Action functions
|
||||
// POST /scraper/start, then report the outcome and refresh the page state.
function startScraper() {
    const handleResult = (data) => {
        if (!data.success) {
            showNotification(data.message, 'danger');
            return;
        }
        showNotification('Scraper started successfully', 'success');
        updateStatus();
        // Reload the log after a short delay
        setTimeout(loadRecentActivity, 1000);
    };

    fetch('/scraper/start', { method: 'POST' })
        .then(response => response.json())
        .then(handleResult);
}
|
||||
|
||||
// POST /scraper/pause (the server toggles pause/resume) and show its message.
function togglePauseScraper() {
    const handleResult = (data) => {
        if (!data.success) {
            showNotification(data.message, 'danger');
            return;
        }
        showNotification(data.message, 'info');
        updateStatus();
        // Reload the log after a short delay
        setTimeout(loadRecentActivity, 1000);
    };

    fetch('/scraper/pause', { method: 'POST' })
        .then(response => response.json())
        .then(handleResult);
}
|
||||
|
||||
// POST /scraper/stop, then report the outcome and refresh the page state.
function stopScraper() {
    const handleResult = (data) => {
        if (!data.success) {
            showNotification(data.message, 'danger');
            return;
        }
        showNotification('Scraper stopped successfully', 'warning');
        updateStatus();
        // Reload the log after a short delay
        setTimeout(loadRecentActivity, 1000);
    };

    fetch('/scraper/stop', { method: 'POST' })
        .then(response => response.json())
        .then(handleResult);
}
|
||||
|
||||
// Send the value of the volume input to /scraper/update_config as JSON.
function updateVolume() {
    const volumeInput = document.getElementById('volumeInput');
    const requestOptions = {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ volume: volumeInput.value })
    };

    fetch('/scraper/update_config', requestOptions)
        .then(response => response.json())
        .then(data => {
            if (!data.success) {
                showNotification(data.message, 'danger');
                return;
            }
            showNotification('Volume updated successfully', 'success');
        });
}
|
||||
|
||||
// Collect every hour's selected weight and send the map to /scraper/update_config.
function updateSchedule() {
    // Each select carries its hour in data-hour; the value is the weight.
    const schedule = {};
    for (const select of document.querySelectorAll('.hour-weight')) {
        schedule[select.dataset.hour] = select.value;
    }

    const requestOptions = {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ schedule: schedule })
    };

    fetch('/scraper/update_config', requestOptions)
        .then(response => response.json())
        .then(data => {
            if (!data.success) {
                showNotification(data.message, 'danger');
                return;
            }
            showNotification('Schedule updated successfully', 'success');
        });
}
|
||||
|
||||
// Mirror the "Show Notifications" switch into the notificationsEnabled flag.
function toggleNotifications() {
    const { checked } = notificationsToggle;
    notificationsEnabled = checked;
}
|
||||
|
||||
// Load data functions
|
||||
// Fetch hourly statistics for the given number of hours and draw the chart.
function loadActivityStats(hours) {
    const statsUrl = '/scraper/stats?hours=' + hours;
    fetch(statsUrl)
        .then(response => response.json())
        .then(stats => renderActivityChart(stats));
}
|
||||
|
||||
// Fetch the 20 most recent scraper log entries and render the table.
function loadRecentActivity() {
    const showUnavailable = () => {
        // Shown when the request or the JSON parsing fails
        activityLog.innerHTML = '<tr><td colspan="4" class="text-center">Activity log API not available</td></tr>';
    };

    fetch('/api/activity_logs?category=scraper_activity&limit=20')
        .then(response => response.json())
        .then(entries => {
            renderActivityLog(entries);
        })
        .catch(showUnavailable);
}
|
||||
|
||||
// Rendering functions
|
||||
// Draw the stacked bar chart of success / error / pending counts per hour.
//
// `data` is expected to be the array returned by /scraper/stats, with items
// carrying `hour`, `success`, `error` and `pending`. Anything else (for
// example an error object) is rendered as an empty chart.
function renderActivityChart(data) {
    const rows = Array.isArray(data) ? data : [];

    // Chart.js draws on a <canvas>. The template declares #activityChart as
    // a <div>, so a canvas is created inside it on first use and reused on
    // later calls.
    const target = document.getElementById('activityChart');
    let canvas = target;
    if (!(target instanceof HTMLCanvasElement)) {
        canvas = target.querySelector('canvas');
        if (!canvas) {
            canvas = document.createElement('canvas');
            target.appendChild(canvas);
        }
    }
    const ctx = canvas.getContext('2d');

    // Extract the series for the chart
    const labels = rows.map(item => `${item.hour}:00`);
    const successData = rows.map(item => item.success);
    const errorData = rows.map(item => item.error);
    const pendingData = rows.map(item => item.pending);

    // A canvas can host one chart at a time; drop the previous one first.
    if (activityChart) {
        activityChart.destroy();
    }

    activityChart = new Chart(ctx, {
        type: 'bar',
        data: {
            labels: labels,
            datasets: [
                {
                    label: 'Success',
                    data: successData,
                    backgroundColor: '#28a745',
                    stack: 'Stack 0'
                },
                {
                    label: 'Error',
                    data: errorData,
                    backgroundColor: '#dc3545',
                    stack: 'Stack 0'
                },
                {
                    label: 'Pending',
                    data: pendingData,
                    backgroundColor: '#ffc107',
                    stack: 'Stack 0'
                }
            ]
        },
        options: {
            responsive: true,
            maintainAspectRatio: false,
            scales: {
                x: {
                    stacked: true,
                    title: {
                        display: true,
                        text: 'Hour'
                    }
                },
                y: {
                    stacked: true,
                    beginAtZero: true,
                    title: {
                        display: true,
                        text: 'Papers Scraped'
                    }
                }
            }
        }
    });
}
|
||||
|
||||
// Fill the "Recent Activity" table body with one row per log entry.
//
// `logs` is expected to be the array returned by /api/activity_logs. All
// server-provided text is written with textContent so it is never parsed as
// HTML.
function renderActivityLog(logs) {
    activityLog.innerHTML = '';

    if (!Array.isArray(logs) || logs.length === 0) {
        activityLog.innerHTML = '<tr><td colspan="4" class="text-center">No recent activity</td></tr>';
        return;
    }

    // Badge appearance for the known status values
    const badgeStyles = {
        success: { className: 'badge bg-success', label: 'Success' },
        error: { className: 'badge bg-danger', label: 'Error' },
        pending: { className: 'badge bg-warning text-dark', label: 'Pending' }
    };

    const makeCell = (text) => {
        const cell = document.createElement('td');
        cell.textContent = text;
        return cell;
    };

    logs.forEach(log => {
        const row = document.createElement('tr');

        // Format timestamp
        const timeStr = new Date(log.timestamp).toLocaleTimeString();

        // Create status badge; unknown statuses fall back to a grey badge
        const hasKnownStyle = Object.prototype.hasOwnProperty.call(badgeStyles, log.status);
        const badge = document.createElement('span');
        if (hasKnownStyle) {
            badge.className = badgeStyles[log.status].className;
            badge.textContent = badgeStyles[log.status].label;
        } else {
            badge.className = 'badge bg-secondary';
            badge.textContent = log.status || 'Unknown';
        }
        const statusCell = document.createElement('td');
        statusCell.appendChild(badge);

        row.appendChild(makeCell(timeStr));
        row.appendChild(makeCell(log.action || ''));
        row.appendChild(statusCell);
        row.appendChild(makeCell(log.description || ''));

        activityLog.appendChild(row);
    });
}
|
||||
|
||||
// Notification functions
|
||||
// Show a dismissible alert in the notification container.
//
// `message` is the text to display; it may contain server-provided log text,
// so it is inserted with textContent and never parsed as HTML.
// `type` is the Bootstrap alert variant ('success', 'info', 'warning',
// 'danger'). While notifications are switched off only 'danger' alerts are
// shown.
function showNotification(message, type) {
    if (!notificationsEnabled && type !== 'danger') {
        return;
    }

    const container = document.getElementById('notificationContainer');
    const notification = document.createElement('div');
    notification.className = `alert alert-${type} notification shadow-sm`;

    const text = document.createElement('span');
    text.textContent = message;

    const closeButton = document.createElement('button');
    closeButton.type = 'button';
    closeButton.className = 'btn-close float-end';
    closeButton.setAttribute('aria-label', 'Close');
    closeButton.addEventListener('click', () => {
        notification.remove();
    });

    notification.appendChild(text);
    notification.appendChild(closeButton);
    container.appendChild(notification);

    // Auto-close after 5 seconds: add the fade class, then remove the element
    setTimeout(() => {
        notification.classList.add('fade');
        setTimeout(() => {
            notification.remove();
        }, 500);
    }, 5000);
}
|
||||
|
||||
// WebSocket for real-time notifications
|
||||
// Placeholder for real-time notifications.
// No WebSocket is opened yet; the server is polled for new papers instead.
function setupWebSocket() {
    const newPaperPollMs = 10000; // Check every 10 seconds
    setInterval(checkForNewPapers, newPaperPollMs);
}
|
||||
|
||||
let lastPaperTimestamp = new Date().toISOString();
|
||||
|
||||
// Poll for scrape_paper log entries newer than the last one seen, show a
// notification for each, and refresh the chart and the activity table.
function checkForNewPapers() {
    // URLSearchParams encodes the values, so a timestamp containing '+' or
    // other reserved characters reaches the server intact.
    const params = new URLSearchParams();
    params.set('category', 'scraper_activity');
    params.set('action', 'scrape_paper');
    params.set('after', lastPaperTimestamp);
    params.set('limit', '5');

    fetch(`/api/activity_logs?${params.toString()}`)
        .then(response => response.json())
        .then(data => {
            if (!Array.isArray(data) || data.length === 0) {
                return;
            }

            // Advance the cursor to the newest entry returned (the API
            // orders newest first). Using the server's own timestamp avoids
            // missing or repeating entries when the client and server
            // clocks differ.
            lastPaperTimestamp = data[0].timestamp || new Date().toISOString();

            // Show notifications for new papers
            data.forEach(log => {
                // extra_data is parsed defensively so that one malformed
                // entry does not abort the remaining notifications.
                let extraData = {};
                if (typeof log.extra_data === 'string' && log.extra_data) {
                    try {
                        extraData = JSON.parse(log.extra_data) || {};
                    } catch (parseError) {
                        extraData = {};
                    }
                } else if (log.extra_data && typeof log.extra_data === 'object') {
                    extraData = log.extra_data;
                }

                if (log.status === 'success') {
                    showNotification(`New paper scraped: ${extraData.title || 'Unknown title'}`, 'success');
                } else if (log.status === 'error') {
                    showNotification(`Failed to scrape paper: ${log.description}`, 'danger');
                }
            });

            // Refresh the activity chart and log
            loadActivityStats(currentTimeRange);
            loadRecentActivity();
        })
        .catch(() => {
            // If the API endpoint is unavailable, do nothing
        });
}
|
||||
|
||||
// Start checking for new papers
|
||||
setupWebSocket();
|
||||
</script>
|
||||
{% endblock scripts %}
|
Loading…
x
Reference in New Issue
Block a user