import random
import time
from datetime import datetime, timedelta

from flask import Blueprint, jsonify, render_template, request

from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory
from ..db import db
from ..celery import celery

bp = Blueprint("scraper", __name__, url_prefix="/scraper")

# Global variables to track scraper state
SCRAPER_ACTIVE = False
SCRAPER_PAUSED = False


@bp.route("/")
def index():
    """Render the scraper control panel."""
    volume_config = VolumeConfig.query.first()

    # Ensure we have a volume config
    if not volume_config:
        volume_config = VolumeConfig(volume=100)  # Default value
        db.session.add(volume_config)
        db.session.commit()

    return render_template(
        "scraper.html.jinja",
        volume_config=volume_config,
        scraper_active=SCRAPER_ACTIVE,
        scraper_paused=SCRAPER_PAUSED,
    )


@bp.route("/start", methods=["POST"])
def start_scraper():
    """Start the scraper."""
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if not SCRAPER_ACTIVE:
        SCRAPER_ACTIVE = True
        SCRAPER_PAUSED = False

        # Log the action
        ActivityLog.log_scraper_command(
            action="start_scraper",
            status="success",
            description="Scraper started manually",
        )

        # Trigger the schedule.py to start actual scheduling
        return jsonify({"success": True, "message": "Scraper started"})
    else:
        return jsonify({"success": False, "message": "Scraper is already running"})


@bp.route("/stop", methods=["POST"])
def stop_scraper():
    """Stop the scraper."""
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if SCRAPER_ACTIVE:
        SCRAPER_ACTIVE = False
        SCRAPER_PAUSED = False

        ActivityLog.log_scraper_command(
            action="stop_scraper",
            status="success",
            description="Scraper stopped manually",
        )

        return jsonify({"success": True, "message": "Scraper stopped"})
    else:
        return jsonify({"success": False, "message": "Scraper is not running"})


@bp.route("/pause", methods=["POST"])
def pause_scraper():
    """Pause or resume the scraper."""
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if SCRAPER_ACTIVE and not SCRAPER_PAUSED:
        SCRAPER_PAUSED = True

        ActivityLog.log_scraper_command(
            action="pause_scraper",
            status="success",
            description="Scraper paused manually",
        )

        return jsonify({"success": True, "message": "Scraper paused"})
    elif SCRAPER_ACTIVE and SCRAPER_PAUSED:
        SCRAPER_PAUSED = False

        ActivityLog.log_scraper_command(
            action="resume_scraper",
            status="success",
            description="Scraper resumed manually",
        )

        return jsonify({"success": True, "message": "Scraper resumed"})
    else:
        return jsonify({"success": False, "message": "Scraper is not running"})


@bp.route("/status")
def scraper_status():
    """Get the current status of the scraper."""
    return jsonify({
        "active": SCRAPER_ACTIVE,
        "paused": SCRAPER_PAUSED,
        "current_hour": datetime.now().hour,
    })


@bp.route("/stats")
def scraper_stats():
    """Get scraper statistics for the dashboard."""
    # Default to the last 24 hours of activity
    hours = 24
    if request.args.get("hours"):
        try:
            hours = int(request.args.get("hours"))
        except ValueError:
            pass

    cutoff_time = datetime.utcnow().replace(minute=0, second=0, microsecond=0)

    # Get activity logs for scraper actions since the cutoff. Subtracting a
    # timedelta avoids the ValueError that .replace(hour=...) raises when
    # the computed hour falls outside 0-23 (which it always would for 24h).
    logs = ActivityLog.query.filter(
        ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value,
        ActivityLog.timestamp >= cutoff_time - timedelta(hours=hours),
    ).all()

    # Group by hour of day and status
    stats = {}
    for hour in range(hours):
        target_hour = (cutoff_time.hour - hour) % 24
        stats[target_hour] = {
            "success": 0,
            "error": 0,
            "pending": 0,
            "hour": target_hour,
        }

    for log in logs:
        hour = log.timestamp.hour
        if hour in stats:
            if log.status == "success":
                stats[hour]["success"] += 1
            elif log.status == "error":
                stats[hour]["error"] += 1
            elif log.status == "pending":
                stats[hour]["pending"] += 1

    # Convert to a list for easier consumption by JavaScript
    result = [stats[hour] for hour in sorted(stats.keys())]
    return jsonify(result)
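
# A minimal sketch of how a client might consume the stats endpoint above.
# The response structure matches the dicts built in scraper_stats; the host,
# port, and use of the `requests` library are assumptions, not part of this
# module:
#
#     import requests
#
#     resp = requests.get("http://localhost:5000/scraper/stats",
#                         params={"hours": 12})
#     for bucket in resp.json():
#         print(f"{bucket['hour']:02d}:00  ok={bucket['success']}  "
#               f"err={bucket['error']}  pending={bucket['pending']}")
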
"error": stats[hour]["error"] += 1 elif log.status == "pending": stats[hour]["pending"] += 1 # Convert to list for easier consumption by JavaScript result = [stats[hour] for hour in sorted(stats.keys())] return jsonify(result) @bp.route("/update_config", methods=["POST"]) def update_config(): """Update scraper configuration.""" data = request.json try: if "volume" in data: try: new_volume = float(data["volume"]) # Validate volume value if new_volume <= 0 or new_volume > 1000: return jsonify({ "success": False, "message": "Volume must be between 1 and 1000" }) volume_config = VolumeConfig.query.first() if not volume_config: volume_config = VolumeConfig(volume=new_volume) db.session.add(volume_config) else: old_value = volume_config.volume volume_config.volume = new_volume ActivityLog.log_config_change( config_key="scraper_volume", old_value=old_value, new_value=new_volume, description="Updated scraper volume" ) db.session.commit() except (ValueError, TypeError): return jsonify({ "success": False, "message": "Invalid volume value" }) return jsonify({"success": True, "message": "Configuration updated"}) except Exception as e: db.session.rollback() return jsonify({"success": False, "message": f"Unexpected error: {str(e)}"}) @celery.task(bind=True) def dummy_scrape_paper(self): """Simulate scraping a single paper.""" # Simulate success or failure success = random.random() > 0.3 # 70% success rate # Simulate processing time import time time.sleep(random.randint(2, 5)) # 2-5 seconds if success: # Create a dummy paper new_paper = PaperMetadata( title=f"Dummy Paper {random.randint(1000, 9999)}", doi=f"10.1234/dummy.{random.randint(1000, 9999)}", journal=random.choice([ "Nature", "Science", "PLOS ONE", "Journal of Dummy Research", "Proceedings of the Dummy Society", "Cell", "Dummy Review Letters" ]), type="article", language="en", published_online=datetime.now().date(), status="Done", file_path="/path/to/dummy/paper.pdf" ) db.session.add(new_paper) db.session.commit() # Log the successful scrape ActivityLog.log_scraper_activity( action="scrape_paper", paper_id=new_paper.id, status="success", description=f"Successfully scraped paper {new_paper.doi}" ) return { "success": True, "paper_id": new_paper.id, "title": new_paper.title, "doi": new_paper.doi } else: # Log the failed scrape error_message = random.choice([ "Connection timeout", "404 Not Found", "Access denied", "Invalid DOI format", "PDF download failed", "Rate limited by publisher" ]) ActivityLog.log_scraper_activity( action="scrape_paper", status="error", description=f"Failed to scrape paper: {error_message}" ) return { "success": False, "error": error_message }