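"""Scraper control blueprint: routes to start, stop, pause and resume the
scraper, status and statistics endpoints, volume configuration, and a dummy
Celery task that simulates scraping a single paper."""
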
import random
import json
from datetime import datetime, timedelta

from flask import Blueprint, jsonify, render_template, request, current_app, flash

from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory
from ..db import db
from ..celery import celery

bp = Blueprint("scraper", __name__, url_prefix="/scraper")

# Global variables to track scraper state
SCRAPER_ACTIVE = False
SCRAPER_PAUSED = False
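# Note: these module-level flags live in a single Python process; they reset on
# restart and are not shared across multiple worker processes (if any).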


@bp.route("/")
def index():
    """Render the scraper control panel."""
    volume_config = VolumeConfig.query.first()

    # Ensure we have volume config
    if not volume_config:
        volume_config = VolumeConfig(volume=100)  # Default value
        db.session.add(volume_config)
        db.session.commit()

    return render_template(
        "scraper.html.jinja",
        volume_config=volume_config,
        scraper_active=SCRAPER_ACTIVE,
        scraper_paused=SCRAPER_PAUSED
    )


@bp.route("/start", methods=["POST"])
def start_scraper():
    """Start the scraper."""
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if not SCRAPER_ACTIVE:
        SCRAPER_ACTIVE = True
        SCRAPER_PAUSED = False

        # Log the action
        ActivityLog.log_scraper_command(
            action="start_scraper",
            status="success",
            description="Scraper started manually"
        )

        # Trigger the schedule.py to start actual scheduling
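        # A minimal sketch of that trigger, assuming a scheduler task exists in
        # a separate schedule module (the names below are placeholders, not
        # part of this file):
        #   from ..schedule import start_scheduling  # hypothetical helper task
        #   start_scheduling.delay()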
        return jsonify({
            "success": True,
            "message": "Scraper started"
        })
    else:
        return jsonify({
            "success": False,
            "message": "Scraper is already running"
        })


@bp.route("/stop", methods=["POST"])
def stop_scraper():
    """Stop the scraper."""
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if SCRAPER_ACTIVE:
        SCRAPER_ACTIVE = False
        SCRAPER_PAUSED = False

        ActivityLog.log_scraper_command(
            action="stop_scraper",
            status="success",
            description="Scraper stopped manually"
        )

        return jsonify({
            "success": True,
            "message": "Scraper stopped"
        })
    else:
        return jsonify({
            "success": False,
            "message": "Scraper is not running"
        })


@bp.route("/pause", methods=["POST"])
def pause_scraper():
    """Pause the scraper, or resume it if it is already paused."""
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if SCRAPER_ACTIVE and not SCRAPER_PAUSED:
        SCRAPER_PAUSED = True

        ActivityLog.log_scraper_command(
            action="pause_scraper",
            status="success",
            description="Scraper paused manually"
        )

        return jsonify({
            "success": True,
            "message": "Scraper paused"
        })
    elif SCRAPER_ACTIVE and SCRAPER_PAUSED:
        SCRAPER_PAUSED = False

        ActivityLog.log_scraper_command(
            action="resume_scraper",
            status="success",
            description="Scraper resumed manually"
        )

        return jsonify({
            "success": True,
            "message": "Scraper resumed"
        })
    else:
        return jsonify({
            "success": False,
            "message": "Scraper is not running"
        })


@bp.route("/status")
def scraper_status():
    """Get the current status of the scraper."""
    return jsonify({
        "active": SCRAPER_ACTIVE,
        "paused": SCRAPER_PAUSED,
        "current_hour": datetime.now().hour,
    })


@bp.route("/stats")
def scraper_stats():
    """Get scraper statistics for the dashboard."""
    # Get the last 24 hours of activity
    hours = 24
    if request.args.get('hours'):
        try:
            hours = int(request.args.get('hours'))
        except ValueError:
            pass
    cutoff_time = datetime.utcnow().replace(
        minute=0, second=0, microsecond=0
    )
    # Subtract the window with timedelta rather than replace(hour=...), which
    # raises ValueError when the result would be a negative hour (e.g. for the
    # default 24-hour window)
    window_start = cutoff_time - timedelta(hours=hours)

    # Get activity logs for scraper actions
    logs = ActivityLog.query.filter(
        ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value,
        ActivityLog.timestamp >= window_start
    ).all()
    # Group by hour and status
    stats = {}
    for hour in range(hours):
        target_hour = (cutoff_time.hour - hour) % 24
        stats[target_hour] = {
            "success": 0,
            "error": 0,
            "pending": 0,
            "hour": target_hour,
        }

    for log in logs:
        hour = log.timestamp.hour
        if hour in stats:
            if log.status == "success":
                stats[hour]["success"] += 1
            elif log.status == "error":
                stats[hour]["error"] += 1
            elif log.status == "pending":
                stats[hour]["pending"] += 1

    # Convert to list for easier consumption by JavaScript
    result = [stats[hour] for hour in sorted(stats.keys())]
    return jsonify(result)


@bp.route("/update_config", methods=["POST"])
def update_config():
    """Update scraper configuration."""
    data = request.json
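    # Expected JSON body (an assumption, based on the handling below): {"volume": <number>}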
    try:
        if "volume" in data:
            try:
                new_volume = float(data["volume"])

                # Validate volume value
                if new_volume <= 0 or new_volume > 1000:
                    return jsonify({
                        "success": False,
                        "message": "Volume must be greater than 0 and at most 1000"
                    })
                volume_config = VolumeConfig.query.first()
                if not volume_config:
                    volume_config = VolumeConfig(volume=new_volume)
                    db.session.add(volume_config)
                else:
                    old_value = volume_config.volume
                    volume_config.volume = new_volume

                    ActivityLog.log_config_change(
                        config_key="scraper_volume",
                        old_value=old_value,
                        new_value=new_volume,
                        description="Updated scraper volume"
                    )

                db.session.commit()
            except (ValueError, TypeError):
                return jsonify({
                    "success": False,
                    "message": "Invalid volume value"
                })

        return jsonify({"success": True, "message": "Configuration updated"})
    except Exception as e:
        db.session.rollback()
        return jsonify({"success": False, "message": f"Unexpected error: {str(e)}"})
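

# The Celery task below simulates scraping a single paper. A minimal usage
# sketch (an assumption: it would be dispatched by a scheduler or the /start
# route, neither of which is shown in this file):
#   dummy_scrape_paper.delay()                      # fire-and-forget
#   dummy_scrape_paper.apply_async(countdown=60)    # run in 60 seconds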
@celery.task(bind=True)
def dummy_scrape_paper(self):
    """Simulate scraping a single paper."""
    # Simulate success or failure
    success = random.random() > 0.3  # 70% success rate

    # Simulate processing time
    import time
    time.sleep(random.randint(2, 5))  # 2-5 seconds

    if success:
        # Create a dummy paper
        new_paper = PaperMetadata(
            title=f"Dummy Paper {random.randint(1000, 9999)}",
            doi=f"10.1234/dummy.{random.randint(1000, 9999)}",
            journal=random.choice([
                "Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
                "Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
            ]),
            type="article",
            language="en",
            published_online=datetime.now().date(),
            status="Done",
            file_path="/path/to/dummy/paper.pdf"
        )
        db.session.add(new_paper)
        db.session.commit()

        # Log the successful scrape
        ActivityLog.log_scraper_activity(
            action="scrape_paper",
            paper_id=new_paper.id,
            status="success",
            description=f"Successfully scraped paper {new_paper.doi}"
        )

        return {
            "success": True,
            "paper_id": new_paper.id,
            "title": new_paper.title,
            "doi": new_paper.doi
        }
    else:
        # Log the failed scrape
        error_message = random.choice([
            "Connection timeout",
            "404 Not Found",
            "Access denied",
            "Invalid DOI format",
            "PDF download failed",
            "Rate limited by publisher"
        ])

        ActivityLog.log_scraper_activity(
            action="scrape_paper",
            status="error",
            description=f"Failed to scrape paper: {error_message}"
        )

        return {
            "success": False,
            "error": error_message
        }