import random
import time
from datetime import datetime, timedelta

from flask import Blueprint, jsonify, render_template, request, flash

from ..models import (
    ScheduleConfig,
    VolumeConfig,
    ActivityLog,
    PaperMetadata,
    ActivityCategory,
)
from ..db import db
from ..celery import celery

bp = Blueprint("scraper", __name__, url_prefix="/scraper")

# Global flags tracking scraper state. NOTE: these are per-process, so the
# Flask app and each Celery worker hold their own independent copy.
SCRAPER_ACTIVE = False
SCRAPER_PAUSED = False


@bp.route("/")
def index():
    """Render the scraper control panel."""
    volume_config = VolumeConfig.query.first()

    # Ensure we have a volume config
    if not volume_config:
        volume_config = VolumeConfig(volume=100)  # Default value
        db.session.add(volume_config)
        db.session.commit()

    # Ensure we have a schedule config for all 24 hours
    existing_hours = {record.hour: record for record in ScheduleConfig.query.all()}
    schedule_config = {}

    for hour in range(24):
        if hour in existing_hours:
            schedule_config[hour] = existing_hours[hour].weight
        else:
            # Create default schedule entry (weight 1.0)
            new_config = ScheduleConfig(hour=hour, weight=1.0)
            db.session.add(new_config)
            schedule_config[hour] = 1.0

    if len(existing_hours) < 24:
        db.session.commit()

    return render_template(
        "scraper.html.jinja",
        volume_config=volume_config,
        schedule_config=schedule_config,
        scraper_active=SCRAPER_ACTIVE,
        scraper_paused=SCRAPER_PAUSED,
    )


@bp.route("/start", methods=["POST"])
def start_scraper():
    """Start the scraper."""
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if not SCRAPER_ACTIVE:
        SCRAPER_ACTIVE = True
        SCRAPER_PAUSED = False

        # Log the action
        ActivityLog.log_scraper_command(
            action="start_scraper",
            status="success",
            description="Scraper started manually",
        )

        # Start the scheduler task
        task = dummy_scraper_scheduler.delay()

        return jsonify({
            "success": True,
            "message": "Scraper started",
            "task_id": task.id,
        })
    else:
        return jsonify({
            "success": False,
            "message": "Scraper is already running",
        })


@bp.route("/stop", methods=["POST"])
def stop_scraper():
    """Stop the scraper."""
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if SCRAPER_ACTIVE:
        SCRAPER_ACTIVE = False
        SCRAPER_PAUSED = False

        ActivityLog.log_scraper_command(
            action="stop_scraper",
            status="success",
            description="Scraper stopped manually",
        )

        return jsonify({
            "success": True,
            "message": "Scraper stopped",
        })
    else:
        return jsonify({
            "success": False,
            "message": "Scraper is not running",
        })


@bp.route("/pause", methods=["POST"])
def pause_scraper():
    """Pause or resume the scraper."""
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if SCRAPER_ACTIVE and not SCRAPER_PAUSED:
        SCRAPER_PAUSED = True

        ActivityLog.log_scraper_command(
            action="pause_scraper",
            status="success",
            description="Scraper paused manually",
        )

        return jsonify({
            "success": True,
            "message": "Scraper paused",
        })
    elif SCRAPER_ACTIVE and SCRAPER_PAUSED:
        SCRAPER_PAUSED = False

        ActivityLog.log_scraper_command(
            action="resume_scraper",
            status="success",
            description="Scraper resumed manually",
        )

        return jsonify({
            "success": True,
            "message": "Scraper resumed",
        })
    else:
        return jsonify({
            "success": False,
            "message": "Scraper is not running",
        })


@bp.route("/status")
def scraper_status():
    """Get the current status of the scraper."""
    return jsonify({
        "active": SCRAPER_ACTIVE,
        "paused": SCRAPER_PAUSED,
        "current_hour": datetime.now().hour,
    })
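
# Illustrative usage of the control endpoints above. The host and port are
# assumptions (a default local dev server), not part of this module:
#
#   curl -X POST http://localhost:5000/scraper/start
#   curl -X POST http://localhost:5000/scraper/pause   # a second call resumes
#   curl -X POST http://localhost:5000/scraper/stop
#   curl http://localhost:5000/scraper/status
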
@bp.route("/stats")
def scraper_stats():
    """Get scraper statistics for the dashboard."""
    # Default to the last 24 hours of activity. The stats are keyed by hour
    # of day, so the window is capped at 24 to avoid colliding buckets.
    hours = 24
    if request.args.get("hours"):
        try:
            hours = int(request.args.get("hours"))
        except ValueError:
            pass
    hours = max(1, min(hours, 24))

    cutoff_time = datetime.utcnow().replace(minute=0, second=0, microsecond=0)

    # Get activity logs for scraper actions
    logs = ActivityLog.query.filter(
        ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value,
        ActivityLog.timestamp >= cutoff_time - timedelta(hours=hours),
    ).all()

    # Group by hour and status
    stats = {}
    for hour in range(hours):
        target_hour = (cutoff_time.hour - hour) % 24
        stats[target_hour] = {
            "success": 0,
            "error": 0,
            "pending": 0,
            "hour": target_hour,
        }

    for log in logs:
        hour = log.timestamp.hour
        if hour in stats:
            if log.status == "success":
                stats[hour]["success"] += 1
            elif log.status == "error":
                stats[hour]["error"] += 1
            elif log.status == "pending":
                stats[hour]["pending"] += 1

    # Convert to a list for easier consumption by JavaScript
    result = [stats[hour] for hour in sorted(stats.keys())]
    return jsonify(result)


@bp.route("/update_config", methods=["POST"])
def update_config():
    """Update scraper configuration."""
    data = request.json

    try:
        if "volume" in data:
            try:
                new_volume = float(data["volume"])

                # Validate volume value (range carried over from schedule.py)
                if new_volume < 1 or new_volume > 1000:
                    return jsonify({
                        "success": False,
                        "message": "Volume must be between 1 and 1000",
                    })

                volume_config = VolumeConfig.query.first()
                if not volume_config:
                    volume_config = VolumeConfig(volume=new_volume)
                    db.session.add(volume_config)
                else:
                    old_value = volume_config.volume
                    volume_config.volume = new_volume
                    ActivityLog.log_config_change(
                        config_key="scraper_volume",
                        old_value=old_value,
                        new_value=new_volume,
                        description="Updated scraper volume",
                    )
                db.session.commit()
            except (ValueError, TypeError):
                return jsonify({
                    "success": False,
                    "message": "Invalid volume value",
                })

        if "schedule" in data:
            try:
                schedule = data["schedule"]

                # Validate the entire schedule before writing anything
                for hour_str, weight in schedule.items():
                    try:
                        hour = int(hour_str)
                        weight = float(weight)

                        if hour < 0 or hour > 23:
                            return jsonify({
                                "success": False,
                                "message": f"Hour value must be between 0 and 23, got {hour}",
                            })

                        if weight < 0.1 or weight > 5:
                            return jsonify({
                                "success": False,
                                "message": f"Weight for hour {hour} must be between 0.1 and 5, got {weight}",
                            })
                    except ValueError:
                        return jsonify({
                            "success": False,
                            "message": f"Invalid data format for hour {hour_str}",
                        })

                # Update the schedule only after validation passes
                for hour_str, weight in schedule.items():
                    hour = int(hour_str)
                    weight = float(weight)

                    schedule_config = ScheduleConfig.query.get(hour)
                    if not schedule_config:
                        schedule_config = ScheduleConfig(hour=hour, weight=weight)
                        db.session.add(schedule_config)
                    else:
                        old_value = schedule_config.weight
                        schedule_config.weight = weight
                        ActivityLog.log_config_change(
                            config_key=f"schedule_hour_{hour}",
                            old_value=old_value,
                            new_value=weight,
                            description=f"Updated schedule weight for hour {hour}",
                        )
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                return jsonify({
                    "success": False,
                    "message": f"Error updating schedule: {str(e)}",
                })

        return jsonify({"success": True, "message": "Configuration updated"})
    except Exception as e:
        db.session.rollback()
        return jsonify({"success": False, "message": f"Unexpected error: {str(e)}"})
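
# Illustrative payloads for the two JSON endpoints above (values are
# examples only, not defaults):
#
# GET /scraper/stats?hours=24 returns one entry per hour of day, e.g.:
#
#   [{"hour": 0, "success": 3, "error": 1, "pending": 0}, ...]
#
# POST /scraper/update_config accepts either or both keys:
#
#   {
#       "volume": 150,
#       "schedule": {"0": 0.5, "8": 2.0, "23": 1.0}
#   }
#
# Schedule keys are hour strings mapping to weights in [0.1, 5]; hours not
# present in the payload keep their current weight.
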
@bp.route("/schedule", methods=["GET", "POST"])
def schedule():
    """Legacy route to maintain compatibility with the schedule blueprint."""
    # For GET requests, render the scraper index with the schedule tab active
    if request.method == "GET":
        return index()

    # For POST requests, handle form data like the original schedule blueprint
    if request.method == "POST":
        try:
            # Check whether we're updating the volume or the schedule
            if "total_volume" in request.form:
                # Volume update
                try:
                    new_volume = float(request.form.get("total_volume", 0))
                    if new_volume < 1 or new_volume > 1000:
                        raise ValueError("Volume must be between 1 and 1000")

                    volume_config = VolumeConfig.query.first()
                    if not volume_config:
                        volume_config = VolumeConfig(volume=new_volume)
                        db.session.add(volume_config)
                    else:
                        volume_config.volume = new_volume

                    db.session.commit()
                    flash("Volume updated successfully!", "success")
                except ValueError as e:
                    db.session.rollback()
                    flash(f"Error updating volume: {str(e)}", "error")
            else:
                # Schedule update: validate all form data first
                for hour in range(24):
                    key = f"hour_{hour}"
                    if key not in request.form:
                        raise ValueError(f"Missing data for hour {hour}")

                    # Parse and range-check separately so the range error
                    # message is not swallowed by the parse handler
                    try:
                        weight = float(request.form.get(key, 0))
                    except ValueError:
                        raise ValueError(f"Invalid weight value for hour {hour}")
                    if weight < 0 or weight > 5:
                        raise ValueError(
                            f"Weight for hour {hour} must be between 0 and 5"
                        )

                # Update the database only if validation passed
                for hour in range(24):
                    key = f"hour_{hour}"
                    weight = float(request.form.get(key, 0))

                    config = ScheduleConfig.query.get(hour)
                    if config:
                        config.weight = weight
                    else:
                        db.session.add(ScheduleConfig(hour=hour, weight=weight))

                db.session.commit()
                flash("Schedule updated successfully!", "success")
        except ValueError as e:
            db.session.rollback()
            flash(f"Error updating schedule: {str(e)}", "error")

        # Re-render the scraper page
        return index()


# Calculate schedule information for visualization/decision making
def get_schedule_stats():
    """Get statistics about the current schedule configuration."""
    volume_config = VolumeConfig.query.first()
    if not volume_config:
        return {"error": "No volume configuration found"}

    total_volume = volume_config.volume

    schedule_configs = ScheduleConfig.query.all()
    if not schedule_configs:
        return {"error": "No schedule configuration found"}

    # Calculate total weight
    total_weight = sum(config.weight for config in schedule_configs)

    # Calculate papers per hour, proportional to each hour's weight
    papers_per_hour = {}
    for config in schedule_configs:
        weight_ratio = config.weight / total_weight if total_weight > 0 else 0
        papers = weight_ratio * total_volume
        papers_per_hour[config.hour] = papers

    return {
        "total_volume": total_volume,
        "total_weight": total_weight,
        "papers_per_hour": papers_per_hour,
    }


# Enhanced API route to get schedule information
@bp.route("/schedule_info")
def schedule_info():
    """Get information about the current schedule configuration."""
    stats = get_schedule_stats()
    return jsonify(stats)
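
# Worked example for get_schedule_stats(): with total_volume = 100 and all 24
# weights at the default 1.0, total_weight = 24 and every hour gets
# (1.0 / 24) * 100 ≈ 4.17 papers. Raising one hour's weight to 2.0 makes
# total_weight = 25, so that hour gets (2.0 / 25) * 100 = 8.0 papers while
# each remaining hour drops to (1.0 / 25) * 100 = 4.0.
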
# Define the Celery tasks
@celery.task(bind=True)
def dummy_scraper_scheduler(self):
    """Main scheduler task for the dummy scraper."""
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if not SCRAPER_ACTIVE:
        return {"status": "Scraper not active"}

    if SCRAPER_PAUSED:
        return {"status": "Scraper paused"}

    # Calculate how many papers to scrape based on the current hour and config
    current_hour = datetime.now().hour
    hour_config = ScheduleConfig.query.get(current_hour)
    volume_config = VolumeConfig.query.first()

    if not hour_config or not volume_config:
        return {"status": "Missing configuration"}

    # Calculate papers to scrape this hour: scale the base hourly rate by the
    # hour's weight, so higher-weighted hours scrape more papers (consistent
    # with get_schedule_stats above)
    hourly_rate = volume_config.volume / 24  # Base rate per hour
    adjusted_rate = hourly_rate * hour_config.weight
    papers_to_scrape = int(adjusted_rate)

    # Log the scheduling decision
    ActivityLog.log_scraper_activity(
        action="schedule_papers",
        status="success",
        description=f"Scheduled {papers_to_scrape} papers for scraping at hour {current_hour}",
        hourly_rate=hourly_rate,
        weight=hour_config.weight,
        adjusted_rate=adjusted_rate,
    )

    # Launch individual scraping tasks
    for _ in range(papers_to_scrape):
        if not SCRAPER_ACTIVE or SCRAPER_PAUSED:
            break

        # Schedule a new paper to be scraped
        dummy_scrape_paper.delay()

    # Schedule the next run in 5 minutes if still active
    if SCRAPER_ACTIVE:
        dummy_scraper_scheduler.apply_async(countdown=300)  # 5 minutes

    return {"status": "success", "papers_scheduled": papers_to_scrape}


@celery.task(bind=True)
def dummy_scrape_paper(self):
    """Simulate scraping a single paper."""
    # Simulate success or failure
    success = random.random() > 0.3  # 70% success rate

    # Simulate processing time
    time.sleep(random.randint(2, 5))  # 2-5 seconds

    if success:
        # Create a dummy paper
        new_paper = PaperMetadata(
            title=f"Dummy Paper {random.randint(1000, 9999)}",
            doi=f"10.1234/dummy.{random.randint(1000, 9999)}",
            journal=random.choice([
                "Nature", "Science", "PLOS ONE",
                "Journal of Dummy Research",
                "Proceedings of the Dummy Society",
                "Cell", "Dummy Review Letters",
            ]),
            type="article",
            language="en",
            published_online=datetime.now().date(),
            status="Done",
            file_path="/path/to/dummy/paper.pdf",
        )
        db.session.add(new_paper)
        db.session.commit()

        # Log the successful scrape
        ActivityLog.log_scraper_activity(
            action="scrape_paper",
            paper_id=new_paper.id,
            status="success",
            description=f"Successfully scraped paper {new_paper.doi}",
        )

        return {
            "success": True,
            "paper_id": new_paper.id,
            "title": new_paper.title,
            "doi": new_paper.doi,
        }
    else:
        # Log the failed scrape
        error_message = random.choice([
            "Connection timeout",
            "404 Not Found",
            "Access denied",
            "Invalid DOI format",
            "PDF download failed",
            "Rate limited by publisher",
        ])

        ActivityLog.log_scraper_activity(
            action="scrape_paper",
            status="error",
            description=f"Failed to scrape paper: {error_message}",
        )

        return {
            "success": False,
            "error": error_message,
        }
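
# Illustrative smoke test: run one scrape task synchronously in-process,
# without a Celery worker. The app-factory import (`create_app` in the
# package root) is an assumption, not defined in this module:
#
#   from .. import create_app   # hypothetical factory location
#   app = create_app()
#   with app.app_context():
#       result = dummy_scrape_paper.apply().get()
#       print(result)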