import random import json from datetime import datetime from flask import Blueprint, jsonify, render_template, request, current_app from ..models import ScheduleConfig, VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory from ..db import db from ..celery import celery bp = Blueprint("scraper", __name__, url_prefix="/scraper") # Global variables to track scraper state SCRAPER_ACTIVE = False SCRAPER_PAUSED = False @bp.route("/") def index(): """Render the scraper control panel.""" volume_config = VolumeConfig.query.first() schedule_config = {record.hour: record.weight for record in ScheduleConfig.query.all()} return render_template( "scraper.html.jinja", volume_config=volume_config, schedule_config=schedule_config, scraper_active=SCRAPER_ACTIVE, scraper_paused=SCRAPER_PAUSED ) @bp.route("/start", methods=["POST"]) def start_scraper(): """Start the scraper.""" global SCRAPER_ACTIVE, SCRAPER_PAUSED if not SCRAPER_ACTIVE: SCRAPER_ACTIVE = True SCRAPER_PAUSED = False # Log the action ActivityLog.log_scraper_command( action="start_scraper", status="success", description="Scraper started manually" ) # Start the scheduler task task = dummy_scraper_scheduler.delay() return jsonify({ "success": True, "message": "Scraper started", "task_id": task.id }) else: return jsonify({ "success": False, "message": "Scraper is already running" }) @bp.route("/stop", methods=["POST"]) def stop_scraper(): """Stop the scraper.""" global SCRAPER_ACTIVE, SCRAPER_PAUSED if SCRAPER_ACTIVE: SCRAPER_ACTIVE = False SCRAPER_PAUSED = False ActivityLog.log_scraper_command( action="stop_scraper", status="success", description="Scraper stopped manually" ) return jsonify({ "success": True, "message": "Scraper stopped" }) else: return jsonify({ "success": False, "message": "Scraper is not running" }) @bp.route("/pause", methods=["POST"]) def pause_scraper(): """Pause the scraper.""" global SCRAPER_ACTIVE, SCRAPER_PAUSED if SCRAPER_ACTIVE and not SCRAPER_PAUSED: SCRAPER_PAUSED = True ActivityLog.log_scraper_command( action="pause_scraper", status="success", description="Scraper paused manually" ) return jsonify({ "success": True, "message": "Scraper paused" }) elif SCRAPER_ACTIVE and SCRAPER_PAUSED: SCRAPER_PAUSED = False ActivityLog.log_scraper_command( action="resume_scraper", status="success", description="Scraper resumed manually" ) return jsonify({ "success": True, "message": "Scraper resumed" }) else: return jsonify({ "success": False, "message": "Scraper is not running" }) @bp.route("/status") def scraper_status(): """Get the current status of the scraper.""" return jsonify({ "active": SCRAPER_ACTIVE, "paused": SCRAPER_PAUSED, "current_hour": datetime.now().hour, }) @bp.route("/stats") def scraper_stats(): """Get scraper statistics for the dashboard.""" # Get the last 24 hours of activity hours = 24 if request.args.get('hours'): try: hours = int(request.args.get('hours')) except ValueError: pass cutoff_time = datetime.utcnow().replace( minute=0, second=0, microsecond=0 ) # Get activity logs for scraper actions logs = ActivityLog.query.filter( ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value, ActivityLog.timestamp >= cutoff_time.replace(hour=cutoff_time.hour - hours) ).all() # Group by hour and status stats = {} for hour in range(hours): target_hour = (cutoff_time.hour - hour) % 24 stats[target_hour] = { "success": 0, "error": 0, "pending": 0, "hour": target_hour, } for log in logs: hour = log.timestamp.hour if hour in stats: if log.status == "success": stats[hour]["success"] += 1 elif log.status == "error": stats[hour]["error"] += 1 elif log.status == "pending": stats[hour]["pending"] += 1 # Convert to list for easier consumption by JavaScript result = [stats[hour] for hour in sorted(stats.keys())] return jsonify(result) @bp.route("/update_config", methods=["POST"]) def update_config(): """Update scraper configuration.""" data = request.json if "volume" in data: try: new_volume = float(data["volume"]) volume_config = VolumeConfig.query.first() if not volume_config: volume_config = VolumeConfig(volume=new_volume) db.session.add(volume_config) else: old_value = volume_config.volume volume_config.volume = new_volume ActivityLog.log_config_change( config_key="scraper_volume", old_value=old_value, new_value=new_volume, description="Updated scraper volume" ) db.session.commit() except (ValueError, TypeError): return jsonify({"success": False, "message": "Invalid volume value"}) if "schedule" in data: try: schedule = data["schedule"] for hour_str, weight in schedule.items(): hour = int(hour_str) weight = float(weight) if 0 <= hour <= 23 and weight >= 0: schedule_config = ScheduleConfig.query.get(hour) if not schedule_config: schedule_config = ScheduleConfig(hour=hour, weight=weight) db.session.add(schedule_config) else: old_value = schedule_config.weight schedule_config.weight = weight ActivityLog.log_config_change( config_key=f"schedule_hour_{hour}", old_value=old_value, new_value=weight, description=f"Updated schedule weight for hour {hour}" ) db.session.commit() except (ValueError, TypeError): return jsonify({"success": False, "message": "Invalid schedule format"}) return jsonify({"success": True, "message": "Configuration updated"}) # Define the Celery tasks @celery.task(bind=True) def dummy_scraper_scheduler(self): """Main scheduler task for the dummy scraper.""" global SCRAPER_ACTIVE, SCRAPER_PAUSED if not SCRAPER_ACTIVE: return {"status": "Scraper not active"} if SCRAPER_PAUSED: return {"status": "Scraper paused"} # Calculate how many papers to scrape based on current hour and configuration current_hour = datetime.now().hour hour_config = ScheduleConfig.query.get(current_hour) volume_config = VolumeConfig.query.first() if not hour_config or not volume_config: return {"status": "Missing configuration"} # Calculate papers to scrape this hour hourly_rate = volume_config.volume / 24 # Base rate per hour adjusted_rate = hourly_rate * (1 / hour_config.weight) # Adjust by weight papers_to_scrape = int(adjusted_rate) # Log the scheduling decision ActivityLog.log_scraper_activity( action="schedule_papers", status="success", description=f"Scheduled {papers_to_scrape} papers for scraping at hour {current_hour}", hourly_rate=hourly_rate, weight=hour_config.weight, adjusted_rate=adjusted_rate, ) # Launch individual scraping tasks for _ in range(papers_to_scrape): if not SCRAPER_ACTIVE or SCRAPER_PAUSED: break # Schedule a new paper to be scraped dummy_scrape_paper.delay() # Schedule the next run in 5 minutes if still active if SCRAPER_ACTIVE: dummy_scraper_scheduler.apply_async(countdown=300) # 5 minutes return {"status": "success", "papers_scheduled": papers_to_scrape} @celery.task(bind=True) def dummy_scrape_paper(self): """Simulate scraping a single paper.""" # Simulate success or failure success = random.random() > 0.3 # 70% success rate # Simulate processing time import time time.sleep(random.randint(2, 5)) # 2-5 seconds if success: # Create a dummy paper new_paper = PaperMetadata( title=f"Dummy Paper {random.randint(1000, 9999)}", doi=f"10.1234/dummy.{random.randint(1000, 9999)}", journal=random.choice([ "Nature", "Science", "PLOS ONE", "Journal of Dummy Research", "Proceedings of the Dummy Society", "Cell", "Dummy Review Letters" ]), type="article", language="en", published_online=datetime.now().date(), status="Done", file_path="/path/to/dummy/paper.pdf" ) db.session.add(new_paper) db.session.commit() # Log the successful scrape ActivityLog.log_scraper_activity( action="scrape_paper", paper_id=new_paper.id, status="success", description=f"Successfully scraped paper {new_paper.doi}" ) return { "success": True, "paper_id": new_paper.id, "title": new_paper.title, "doi": new_paper.doi } else: # Log the failed scrape error_message = random.choice([ "Connection timeout", "404 Not Found", "Access denied", "Invalid DOI format", "PDF download failed", "Rate limited by publisher" ]) ActivityLog.log_scraper_activity( action="scrape_paper", status="error", description=f"Failed to scrape paper: {error_message}" ) return { "success": False, "error": error_message }