import random
import json
import time
from datetime import datetime, timedelta

from flask import Blueprint, jsonify, render_template, request, current_app

from ..models import ScheduleConfig, VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory
from ..db import db
from ..celery import celery

bp = Blueprint("scraper", __name__, url_prefix="/scraper")

# Global variables to track scraper state
SCRAPER_ACTIVE = False
SCRAPER_PAUSED = False


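# Note: these module-level flags live only in the process that imported this
# module. A Celery worker runs in its own process, so it sees its own copy of
# SCRAPER_ACTIVE and SCRAPER_PAUSED; shared state (for example a database flag)
# would be needed for workers to reliably observe the routes below.

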
@bp.route("/")
|
|
def index():
|
|
"""Render the scraper control panel."""
|
|
volume_config = VolumeConfig.query.first()
|
|
schedule_config = {record.hour: record.weight for record in ScheduleConfig.query.all()}
|
|
|
|
return render_template(
|
|
"scraper.html.jinja",
|
|
volume_config=volume_config,
|
|
schedule_config=schedule_config,
|
|
scraper_active=SCRAPER_ACTIVE,
|
|
scraper_paused=SCRAPER_PAUSED
|
|
)
|
|
|
|
@bp.route("/start", methods=["POST"])
|
|
def start_scraper():
|
|
"""Start the scraper."""
|
|
global SCRAPER_ACTIVE, SCRAPER_PAUSED
|
|
|
|
if not SCRAPER_ACTIVE:
|
|
SCRAPER_ACTIVE = True
|
|
SCRAPER_PAUSED = False
|
|
|
|
# Log the action
|
|
ActivityLog.log_scraper_command(
|
|
action="start_scraper",
|
|
status="success",
|
|
description="Scraper started manually"
|
|
)
|
|
|
|
# Start the scheduler task
|
|
task = dummy_scraper_scheduler.delay()
|
|
|
|
return jsonify({
|
|
"success": True,
|
|
"message": "Scraper started",
|
|
"task_id": task.id
|
|
})
|
|
else:
|
|
return jsonify({
|
|
"success": False,
|
|
"message": "Scraper is already running"
|
|
})
|
|
|
|
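# Illustrative invocation (the path comes from the blueprint prefix and route
# above; host and port are assumed):
#   curl -X POST http://localhost:5000/scraper/start

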
@bp.route("/stop", methods=["POST"])
|
|
def stop_scraper():
|
|
"""Stop the scraper."""
|
|
global SCRAPER_ACTIVE, SCRAPER_PAUSED
|
|
|
|
if SCRAPER_ACTIVE:
|
|
SCRAPER_ACTIVE = False
|
|
SCRAPER_PAUSED = False
|
|
|
|
ActivityLog.log_scraper_command(
|
|
action="stop_scraper",
|
|
status="success",
|
|
description="Scraper stopped manually"
|
|
)
|
|
|
|
return jsonify({
|
|
"success": True,
|
|
"message": "Scraper stopped"
|
|
})
|
|
else:
|
|
return jsonify({
|
|
"success": False,
|
|
"message": "Scraper is not running"
|
|
})
|
|
|
|
@bp.route("/pause", methods=["POST"])
|
|
def pause_scraper():
|
|
"""Pause the scraper."""
|
|
global SCRAPER_ACTIVE, SCRAPER_PAUSED
|
|
|
|
if SCRAPER_ACTIVE and not SCRAPER_PAUSED:
|
|
SCRAPER_PAUSED = True
|
|
|
|
ActivityLog.log_scraper_command(
|
|
action="pause_scraper",
|
|
status="success",
|
|
description="Scraper paused manually"
|
|
)
|
|
|
|
return jsonify({
|
|
"success": True,
|
|
"message": "Scraper paused"
|
|
})
|
|
elif SCRAPER_ACTIVE and SCRAPER_PAUSED:
|
|
SCRAPER_PAUSED = False
|
|
|
|
ActivityLog.log_scraper_command(
|
|
action="resume_scraper",
|
|
status="success",
|
|
description="Scraper resumed manually"
|
|
)
|
|
|
|
return jsonify({
|
|
"success": True,
|
|
"message": "Scraper resumed"
|
|
})
|
|
else:
|
|
return jsonify({
|
|
"success": False,
|
|
"message": "Scraper is not running"
|
|
})
|
|
|
|
@bp.route("/status")
|
|
def scraper_status():
|
|
"""Get the current status of the scraper."""
|
|
return jsonify({
|
|
"active": SCRAPER_ACTIVE,
|
|
"paused": SCRAPER_PAUSED,
|
|
"current_hour": datetime.now().hour,
|
|
})
|
|
|
|
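# Illustrative /scraper/status response (keys from the handler above; values
# depend on runtime state):
#   {"active": true, "paused": false, "current_hour": 14}

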
@bp.route("/stats")
|
|
def scraper_stats():
|
|
"""Get scraper statistics for the dashboard."""
|
|
# Get the last 24 hours of activity
|
|
hours = 24
|
|
if request.args.get('hours'):
|
|
try:
|
|
hours = int(request.args.get('hours'))
|
|
except ValueError:
|
|
pass
|
|
|
|
cutoff_time = datetime.utcnow().replace(
|
|
minute=0, second=0, microsecond=0
|
|
)
|
|
|
|
# Get activity logs for scraper actions
|
|
logs = ActivityLog.query.filter(
|
|
ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value,
|
|
ActivityLog.timestamp >= cutoff_time.replace(hour=cutoff_time.hour - hours)
|
|
).all()
|
|
|
|
# Group by hour and status
|
|
stats = {}
|
|
for hour in range(hours):
|
|
target_hour = (cutoff_time.hour - hour) % 24
|
|
stats[target_hour] = {
|
|
"success": 0,
|
|
"error": 0,
|
|
"pending": 0,
|
|
"hour": target_hour,
|
|
}
|
|
|
|
for log in logs:
|
|
hour = log.timestamp.hour
|
|
if hour in stats:
|
|
if log.status == "success":
|
|
stats[hour]["success"] += 1
|
|
elif log.status == "error":
|
|
stats[hour]["error"] += 1
|
|
elif log.status == "pending":
|
|
stats[hour]["pending"] += 1
|
|
|
|
# Convert to list for easier consumption by JavaScript
|
|
result = [stats[hour] for hour in sorted(stats.keys())]
|
|
|
|
return jsonify(result)
|
|
|
|
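# Illustrative /scraper/stats response, one entry per hour bucket built above
# (counts are examples only):
#   [{"success": 3, "error": 1, "pending": 0, "hour": 0}, ...]

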
@bp.route("/update_config", methods=["POST"])
|
|
def update_config():
|
|
"""Update scraper configuration."""
|
|
data = request.json
|
|
|
|
if "volume" in data:
|
|
try:
|
|
new_volume = float(data["volume"])
|
|
volume_config = VolumeConfig.query.first()
|
|
if not volume_config:
|
|
volume_config = VolumeConfig(volume=new_volume)
|
|
db.session.add(volume_config)
|
|
else:
|
|
old_value = volume_config.volume
|
|
volume_config.volume = new_volume
|
|
ActivityLog.log_config_change(
|
|
config_key="scraper_volume",
|
|
old_value=old_value,
|
|
new_value=new_volume,
|
|
description="Updated scraper volume"
|
|
)
|
|
|
|
db.session.commit()
|
|
except (ValueError, TypeError):
|
|
return jsonify({"success": False, "message": "Invalid volume value"})
|
|
|
|
if "schedule" in data:
|
|
try:
|
|
schedule = data["schedule"]
|
|
|
|
for hour_str, weight in schedule.items():
|
|
hour = int(hour_str)
|
|
weight = float(weight)
|
|
|
|
if 0 <= hour <= 23 and weight >= 0:
|
|
schedule_config = ScheduleConfig.query.get(hour)
|
|
if not schedule_config:
|
|
schedule_config = ScheduleConfig(hour=hour, weight=weight)
|
|
db.session.add(schedule_config)
|
|
else:
|
|
old_value = schedule_config.weight
|
|
schedule_config.weight = weight
|
|
ActivityLog.log_config_change(
|
|
config_key=f"schedule_hour_{hour}",
|
|
old_value=old_value,
|
|
new_value=weight,
|
|
description=f"Updated schedule weight for hour {hour}"
|
|
)
|
|
|
|
db.session.commit()
|
|
except (ValueError, TypeError):
|
|
return jsonify({"success": False, "message": "Invalid schedule format"})
|
|
|
|
return jsonify({"success": True, "message": "Configuration updated"})
|
|
|
|
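# Illustrative payload for POST /scraper/update_config (keys taken from the
# handler above; the numbers are example values only):
#   {"volume": 100, "schedule": {"0": 0.5, "12": 2.0, "23": 0.1}}

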
# Define the Celery tasks
@celery.task(bind=True)
def dummy_scraper_scheduler(self):
    """Main scheduler task for the dummy scraper."""
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if not SCRAPER_ACTIVE:
        return {"status": "Scraper not active"}

    if SCRAPER_PAUSED:
        return {"status": "Scraper paused"}

    # Calculate how many papers to scrape based on current hour and configuration
    current_hour = datetime.now().hour
    hour_config = ScheduleConfig.query.get(current_hour)
    volume_config = VolumeConfig.query.first()

    if not hour_config or not volume_config:
        return {"status": "Missing configuration"}

    # Calculate papers to scrape this hour
    hourly_rate = volume_config.volume / 24  # Base rate per hour
    # Guard against a zero weight so the division below cannot raise
    weight = hour_config.weight if hour_config.weight > 0 else 1.0
    adjusted_rate = hourly_rate * (1 / weight)  # Adjust by weight
    papers_to_scrape = int(adjusted_rate)

    # Log the scheduling decision
    ActivityLog.log_scraper_activity(
        action="schedule_papers",
        status="success",
        description=f"Scheduled {papers_to_scrape} papers for scraping at hour {current_hour}",
        hourly_rate=hourly_rate,
        weight=hour_config.weight,
        adjusted_rate=adjusted_rate,
    )

    # Launch individual scraping tasks
    for _ in range(papers_to_scrape):
        if not SCRAPER_ACTIVE or SCRAPER_PAUSED:
            break

        # Schedule a new paper to be scraped
        dummy_scrape_paper.delay()

    # Schedule the next run in 5 minutes if still active
    if SCRAPER_ACTIVE:
        dummy_scraper_scheduler.apply_async(countdown=300)  # 5 minutes

    return {"status": "success", "papers_scheduled": papers_to_scrape}


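# Worked example of the scheduling arithmetic above (illustrative numbers):
# with volume=240 and a weight of 2.0 for the current hour,
# hourly_rate = 240 / 24 = 10 and adjusted_rate = 10 * (1 / 2.0) = 5.0,
# so 5 dummy_scrape_paper tasks are queued for this run.

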
@celery.task(bind=True)
def dummy_scrape_paper(self):
    """Simulate scraping a single paper."""
    # Simulate success or failure
    success = random.random() > 0.3  # 70% success rate

    # Simulate processing time
    time.sleep(random.randint(2, 5))  # 2-5 seconds

    if success:
        # Create a dummy paper
        new_paper = PaperMetadata(
            title=f"Dummy Paper {random.randint(1000, 9999)}",
            doi=f"10.1234/dummy.{random.randint(1000, 9999)}",
            journal=random.choice([
                "Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
                "Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
            ]),
            type="article",
            language="en",
            published_online=datetime.now().date(),
            status="Done",
            file_path="/path/to/dummy/paper.pdf"
        )

        db.session.add(new_paper)
        db.session.commit()

        # Log the successful scrape
        ActivityLog.log_scraper_activity(
            action="scrape_paper",
            paper_id=new_paper.id,
            status="success",
            description=f"Successfully scraped paper {new_paper.doi}"
        )

        return {
            "success": True,
            "paper_id": new_paper.id,
            "title": new_paper.title,
            "doi": new_paper.doi
        }
    else:
        # Log the failed scrape
        error_message = random.choice([
            "Connection timeout",
            "404 Not Found",
            "Access denied",
            "Invalid DOI format",
            "PDF download failed",
            "Rate limited by publisher"
        ])

        ActivityLog.log_scraper_activity(
            action="scrape_paper",
            status="error",
            description=f"Failed to scrape paper: {error_message}"
        )

        return {
            "success": False,
            "error": error_message
        }