""" Simplified scraper blueprint using the new ScraperManager and hourly scheduling system. """ from flask import Blueprint, jsonify, render_template, request, current_app from ..models import ActivityLog, PaperMetadata, ScraperState, VolumeConfig from ..scrapers.manager import ScraperManager from ..scrapers.factory import get_available_scrapers from ..db import db from ..defaults import MAX_VOLUME from datetime import datetime, timedelta bp = Blueprint("scraper", __name__, url_prefix="/scraper") # Initialize the scraper manager scraper_manager = ScraperManager() @bp.route("/") def index(): """Main scraper page.""" # Get current scraper state scraper_state = ScraperState.get_current_state() # Get available scrapers available_scrapers = get_available_scrapers() # Get recent activity logs recent_logs = ActivityLog.query.order_by(ActivityLog.timestamp.desc()).limit(50).all() # Get volume configuration volume_config = VolumeConfig.get_current_volume() # Get paper counts by status paper_counts = { 'new': PaperMetadata.query.filter_by(status='New').count(), 'processing': PaperMetadata.query.filter_by(status='Processing').count(), 'done': PaperMetadata.query.filter_by(status='Done').count(), 'failed': PaperMetadata.query.filter_by(status='Failed').count(), 'pending': PaperMetadata.query.filter_by(status='Pending').count(), 'retrying': PaperMetadata.query.filter_by(status='Retrying').count(), } return render_template( "scraper.html.jinja", scraper_state=scraper_state, available_scrapers=available_scrapers, recent_logs=recent_logs, paper_counts=paper_counts, volume_config=volume_config, max_volume=MAX_VOLUME ) @bp.route("/start", methods=["POST"]) def start_scraper(): """Start the hourly scraper scheduling.""" try: # Handle both JSON and form data if request.is_json: data = request.get_json() or {} else: data = request.form.to_dict() scraper_name = data.get('scraper_name', 'dummy') # Start the scraper using manager result = scraper_manager.start_scraper() if result["status"] == "success": ActivityLog.log_scraper_command( action="start_scraper", status="success", description="Started scraper with hourly scheduling" ) return jsonify({ "success": True, "message": result["message"] }) else: return jsonify({ "success": False, "message": result["message"] }), 400 except Exception as e: ActivityLog.log_scraper_command( action="start_scraper", status="error", description=f"Failed to start scraper: {str(e)}" ) return jsonify({ "success": False, "message": f"Error starting scraper: {str(e)}" }), 500 @bp.route("/pause", methods=["POST"]) def pause_scraper(): """Pause the scraper.""" try: result = scraper_manager.pause_scraper() if result["status"] == "success": ActivityLog.log_scraper_command( action="pause_scraper", status="success", description="Scraper paused successfully" ) return jsonify({ "success": True, "message": result["message"] }) else: return jsonify({ "success": False, "message": result["message"] }), 400 except Exception as e: ActivityLog.log_scraper_command( action="pause_scraper", status="error", description=f"Failed to pause scraper: {str(e)}" ) return jsonify({ "success": False, "message": f"Error pausing scraper: {str(e)}" }), 500 @bp.route("/stop", methods=["POST"]) def stop_scraper(): """Stop the scraper and revert processing papers.""" try: result = scraper_manager.stop_scraper() # Add debugging to see what the manager returns print(f"DEBUG: stop_scraper result: {result}") # Always log the stop attempt regardless of result ActivityLog.log_scraper_command( action="stop_scraper_attempt", 
status=result.get("status", "unknown"), description=f"Stop scraper called - result: {result}" ) if result["status"] == "success": ActivityLog.log_scraper_command( action="stop_scraper", status="success", description="Scraper stopped and papers reverted to original status" ) return jsonify({ "success": True, "message": result["message"] }) else: return jsonify({ "success": False, "message": result["message"] }), 400 except Exception as e: ActivityLog.log_scraper_command( action="stop_scraper", status="error", description=f"Failed to stop scraper: {str(e)}" ) return jsonify({ "success": False, "message": f"Error stopping scraper: {str(e)}" }), 500 @bp.route("/reset", methods=["POST"]) def reset_scraper(): """Reset the scraper state and revert all processing papers.""" try: result = scraper_manager.reset_scraper() if result["status"] == "success": ActivityLog.log_scraper_command( action="reset_scraper", status="success", description="Scraper reset and all processing papers reverted" ) return jsonify({ "success": True, "message": result["message"] }) else: return jsonify({ "success": False, "message": result["message"] }), 400 except Exception as e: ActivityLog.log_scraper_command( action="reset_scraper", status="error", description=f"Failed to reset scraper: {str(e)}" ) return jsonify({ "success": False, "message": f"Error resetting scraper: {str(e)}" }), 500 @bp.route("/status") def get_status(): """Get current scraper status and statistics.""" try: scraper_state = ScraperState.get_current_state() # Get paper counts by status paper_counts = { 'new': PaperMetadata.query.filter_by(status='New').count(), 'processing': PaperMetadata.query.filter_by(status='Processing').count(), 'done': PaperMetadata.query.filter_by(status='Done').count(), 'failed': PaperMetadata.query.filter_by(status='Failed').count(), 'pending': PaperMetadata.query.filter_by(status='Pending').count(), 'retrying': PaperMetadata.query.filter_by(status='Retrying').count(), } # Get current hour quota info current_quota = scraper_manager.get_current_hour_quota() return jsonify({ "success": True, "scraper_state": { "active": scraper_state.is_active, "paused": scraper_state.is_paused, "last_updated": scraper_state.last_updated.isoformat() if scraper_state.last_updated else None }, "paper_counts": paper_counts, "current_quota": current_quota }) except Exception as e: return jsonify({ "success": False, "message": f"Error getting status: {str(e)}" }), 500 @bp.route("/logs") def get_logs(): """Get recent activity logs.""" try: limit = request.args.get('limit', 50, type=int) logs = ActivityLog.query.order_by(ActivityLog.timestamp.desc()).limit(limit).all() return jsonify({ "success": True, "logs": [{ "id": log.id, "timestamp": log.timestamp.isoformat(), "action": log.action, "status": log.status, "description": log.description, "category": log.category.name if log.category else None } for log in logs] }) except Exception as e: return jsonify({ "success": False, "message": f"Error getting logs: {str(e)}" }), 500 @bp.route("/scrapers") def get_scrapers(): """Get available scrapers and their configurations.""" try: available_scrapers = get_available_scrapers() scraper_info = [] for scraper_dict in available_scrapers: try: scraper_class = scraper_dict["class"] scraper_info.append({ "name": scraper_dict["name"], "description": scraper_dict["description"], "input_statuses": list(scraper_class.INPUT_STATUSES), "output_status_success": scraper_class.OUTPUT_STATUS_SUCCESS, "output_status_failure": scraper_class.OUTPUT_STATUS_FAILURE, 
"output_status_processing": scraper_class.OUTPUT_STATUS_PROCESSING }) except Exception as e: scraper_info.append({ "name": scraper_dict.get("name", "unknown"), "error": f"Failed to load scraper info: {str(e)}" }) return jsonify({ "success": True, "scrapers": scraper_info }) except Exception as e: return jsonify({ "success": False, "message": f"Error getting scrapers: {str(e)}" }), 500 @bp.route("/process-papers", methods=["POST"]) def process_papers_manually(): """Manually trigger paper processing for current hour.""" try: data = request.get_json() or {} scraper_name = data.get('scraper_name') if not scraper_name: return jsonify({ "success": False, "message": "Scraper name is required" }), 400 # Process papers for current hour papers = scraper_manager.select_papers_for_processing() processed_count = len(papers) if papers else 0 result_msg = f"Manual processing triggered - {processed_count} papers selected for processing" ActivityLog.log_scraper_command( action="manual_process", status="success", description=result_msg ) return jsonify({ "success": True, "message": result_msg, "processed_count": processed_count }) except Exception as e: ActivityLog.log_scraper_command( action="manual_process", status="error", description=f"Failed to manually process papers: {str(e)}" ) return jsonify({ "success": False, "message": f"Error processing papers: {str(e)}" }), 500 @bp.route("/trigger-immediate", methods=["POST"]) def trigger_immediate_processing(): """Trigger immediate processing of papers without waiting for hourly schedule.""" try: # Get papers that should be processed this hour manager = ScraperManager() papers = manager.select_papers_for_processing() if not papers: return jsonify({ "success": True, "message": "No papers available for immediate processing", "papers_scheduled": 0 }) # Get APScheduler instance scheduler = current_app.config.get('SCHEDULER') if not scheduler: return jsonify({ "success": False, "message": "APScheduler not available" }), 500 # Schedule papers for immediate processing via APScheduler scheduled_count = 0 for paper in papers: try: job_id = f"immediate_paper_{paper.id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" scheduler.schedule_paper_processing(paper.id, delay_seconds=1, job_id=job_id) scheduled_count += 1 except Exception as e: ActivityLog.log_error( error_message=f"Failed to schedule paper {paper.id}: {str(e)}", source="trigger_immediate_processing" ) ActivityLog.log_scraper_command( action="trigger_immediate_processing", status="success", description=f"Triggered immediate processing of {scheduled_count} papers via APScheduler" ) return jsonify({ "success": True, "message": f"Immediate processing started for {scheduled_count} papers", "papers_scheduled": scheduled_count }) except Exception as e: ActivityLog.log_scraper_command( action="trigger_immediate_processing", status="error", description=f"Failed to trigger immediate processing: {str(e)}" ) return jsonify({ "success": False, "message": f"Error triggering immediate processing: {str(e)}" }), 500 @bp.route("/available_scrapers") def get_available_scrapers_endpoint(): """Get available scrapers for the UI dropdown.""" try: available_scrapers = get_available_scrapers() return jsonify({ "success": True, "scrapers": [{ "name": scraper["name"], "description": scraper["description"], "is_current": False # Could implement current scraper detection } for scraper in available_scrapers] }) except Exception as e: return jsonify({ "success": False, "message": f"Error getting scrapers: {str(e)}" }), 500 @bp.route("/stats") 
@bp.route("/stats")
def get_stats():
    """Get scraper statistics for the dashboard."""
    try:
        hours = int(request.args.get('hours', 24))
        current_time = datetime.utcnow()
        cutoff_time = current_time.replace(minute=0, second=0, microsecond=0)

        # Get activity logs for scraper actions in the last N hours
        from ..models import ActivityCategory

        start_time = cutoff_time - timedelta(hours=hours)
        logs = ActivityLog.query.filter(
            ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value,
            ActivityLog.timestamp >= start_time
        ).all()

        # Group by hour of day and status (for hours > 24, different days
        # fold onto the same 24 hour-of-day buckets)
        stats = {}
        for hour_offset in range(hours):
            target_hour = (current_time.hour - hour_offset) % 24
            stats[target_hour] = {
                "success": 0,
                "error": 0,
                "pending": 0,
                "hour": target_hour,
            }

        for log in logs:
            hour = log.timestamp.hour
            if hour in stats:
                if log.status == "success":
                    stats[hour]["success"] += 1
                elif log.status == "error":
                    stats[hour]["error"] += 1
                elif log.status in ("pending", "info"):
                    stats[hour]["pending"] += 1

        # Convert to a list for easier consumption by JavaScript
        result = [stats[hour] for hour in sorted(stats.keys())]
        return jsonify(result)

    except Exception as e:
        return jsonify({
            "success": False,
            "message": f"Error getting stats: {str(e)}"
        }), 500


@bp.route("/process_single/<int:paper_id>", methods=["POST"])
def process_single_paper_endpoint(paper_id):
    """Process a single paper by ID."""
    try:
        data = request.get_json() or {}
        scraper_name = data.get('scraper_module')

        # Get the paper
        paper = PaperMetadata.query.get(paper_id)
        if not paper:
            return jsonify({
                "success": False,
                "message": "Paper not found"
            }), 404

        # Get the APScheduler instance
        scheduler = current_app.config.get('SCHEDULER')
        if not scheduler:
            return jsonify({
                "success": False,
                "message": "APScheduler not available"
            }), 500

        # Schedule the paper for immediate processing via APScheduler
        job_id = f"manual_paper_{paper_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        try:
            scheduler.schedule_paper_processing(paper_id, delay_seconds=1, job_id=job_id)
            ActivityLog.log_scraper_command(
                action="manual_process_single",
                status="success",
                description=f"Scheduled manual processing for paper {paper.doi} via APScheduler"
            )
            return jsonify({
                "success": True,
                "message": f"Processing scheduled for paper {paper.doi}",
                "paper_id": paper_id
            })
        except Exception as e:
            return jsonify({
                "success": False,
                "message": f"Failed to schedule processing: {str(e)}"
            }), 500

    except Exception as e:
        ActivityLog.log_scraper_command(
            action="manual_process_single",
            status="error",
            description=f"Failed to process paper {paper_id}: {str(e)}"
        )
        return jsonify({
            "success": False,
            "message": f"Error processing paper: {str(e)}"
        }), 500
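# Usage sketch for single-paper processing (paper id 123 is hypothetical;
# use the id of an existing PaperMetadata row):
#
#     curl -X POST http://localhost:5000/scraper/process_single/123 \
#          -H "Content-Type: application/json" \
#          -d '{"scraper_module": "dummy"}'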
description=f"Failed to update scraper config: {str(e)}" ) return jsonify({ "success": False, "message": f"Error updating scraper config: {str(e)}" }), 500