""" Simplified scraper blueprint using the new ScraperManager and hourly scheduling system. """ from flask import Blueprint, jsonify, render_template, request, current_app from ..models import ActivityLog, PaperMetadata, ScraperState, VolumeConfig from ..scrapers.manager import ScraperManager from ..scrapers.factory import get_available_scrapers from ..db import db from ..defaults import MAX_VOLUME from datetime import datetime, timedelta bp = Blueprint("scraper", __name__, url_prefix="/scraper") # Initialize the scraper manager scraper_manager = ScraperManager() @bp.route("/") def index(): """Main scraper page.""" # Get current scraper state scraper_state = ScraperState.get_current_state() # Get available scrapers available_scrapers = get_available_scrapers() # Get recent activity logs recent_logs = ActivityLog.query.order_by(ActivityLog.timestamp.desc()).limit(50).all() # Get volume configuration volume_config = VolumeConfig.get_current_volume() # Get scraper module configuration from ..models import ScraperModuleConfig current_scraper_module = ScraperModuleConfig.get_current_module() # Get paper counts by status paper_counts = { 'new': PaperMetadata.query.filter_by(status='New').count(), 'processing': PaperMetadata.query.filter_by(status='Processing').count(), 'done': PaperMetadata.query.filter_by(status='Done').count(), 'failed': PaperMetadata.query.filter_by(status='Failed').count(), 'pending': PaperMetadata.query.filter_by(status='Pending').count(), 'retrying': PaperMetadata.query.filter_by(status='Retrying').count(), } return render_template( "scraper.html.jinja", scraper_state=scraper_state, available_scrapers=available_scrapers, recent_logs=recent_logs, paper_counts=paper_counts, volume_config=volume_config, max_volume=MAX_VOLUME, current_scraper_module=current_scraper_module, available_scraper_modules=[s["name"] for s in available_scrapers], scraper_details={s["name"]: s for s in available_scrapers} ) @bp.route("/start", methods=["POST"]) def start_scraper(): """Start the hourly scraper scheduling.""" try: # Handle both JSON and form data if request.is_json: data = request.get_json() # Allow empty JSON payload for start requests if data is None: data = {} else: return jsonify({"success": False, "message": "Invalid payload format. Expected JSON."}), 400 # Start the scraper using manager result = scraper_manager.start_scraper() if result["status"] == "success": ActivityLog.log_scraper_command( action="start_scraper", status="success", description="Scraper started successfully." ) return jsonify({"success": True, "message": result["message"]}) else: ActivityLog.log_scraper_command( action="start_scraper", status="failure", description=f"Failed to start scraper: {result['message']}" ) return jsonify({"success": False, "message": result["message"]}), 400 except Exception as e: ActivityLog.log_scraper_command( action="start_scraper", status="error", description=f"Failed to start scraper: {str(e)}" ) return jsonify({"success": False, "message": f"An error occurred: {str(e)}"}), 500 @bp.route("/pause", methods=["POST"]) def pause_scraper(): """Pause the scraper.""" try: result = scraper_manager.pause_scraper() if result["status"] == "success": ActivityLog.log_scraper_command( action="pause_scraper", status="success", description="Scraper paused successfully" ) return jsonify({ "success": True, "message": result["message"] }) else: return jsonify({ "success": False, "message": result["message"] }), 400 except Exception as e: ActivityLog.log_scraper_command( action="pause_scraper", status="error", description=f"Failed to pause scraper: {str(e)}" ) return jsonify({ "success": False, "message": f"Error pausing scraper: {str(e)}" }), 500 @bp.route("/stop", methods=["POST"]) def stop_scraper(): """Stop the scraper and revert processing papers.""" try: result = scraper_manager.stop_scraper() # Add debugging to see what the manager returns print(f"DEBUG: stop_scraper result: {result}") # Always log the stop attempt regardless of result ActivityLog.log_scraper_command( action="stop_scraper_attempt", status=result.get("status", "unknown"), description=f"Stop scraper called - result: {result}" ) if result["status"] == "success": ActivityLog.log_scraper_command( action="stop_scraper", status="success", description="Scraper stopped and papers reverted to original status" ) return jsonify({ "success": True, "message": result["message"] }) else: return jsonify({ "success": False, "message": result["message"] }), 400 except Exception as e: ActivityLog.log_scraper_command( action="stop_scraper", status="error", description=f"Failed to stop scraper: {str(e)}" ) return jsonify({ "success": False, "message": f"Error stopping scraper: {str(e)}" }), 500 @bp.route("/reset", methods=["POST"]) def reset_scraper(): """Reset the scraper state and revert all processing papers.""" try: result = scraper_manager.reset_scraper() if result["status"] == "success": ActivityLog.log_scraper_command( action="reset_scraper", status="success", description="Scraper reset and all processing papers reverted" ) return jsonify({ "success": True, "message": result["message"] }) else: return jsonify({ "success": False, "message": result["message"] }), 400 except Exception as e: ActivityLog.log_scraper_command( action="reset_scraper", status="error", description=f"Failed to reset scraper: {str(e)}" ) return jsonify({ "success": False, "message": f"Error resetting scraper: {str(e)}" }), 500 @bp.route("/status") def get_status(): """Get current scraper status and statistics.""" try: scraper_state = ScraperState.get_current_state() # Get paper counts by status paper_counts = { 'new': PaperMetadata.query.filter_by(status='New').count(), 'processing': PaperMetadata.query.filter_by(status='Processing').count(), 'done': PaperMetadata.query.filter_by(status='Done').count(), 'failed': PaperMetadata.query.filter_by(status='Failed').count(), 'pending': PaperMetadata.query.filter_by(status='Pending').count(), 'retrying': PaperMetadata.query.filter_by(status='Retrying').count(), } # Get current hour quota info current_quota = scraper_manager.get_current_hour_quota() # Get current scraper module configuration from ..models import ScraperModuleConfig current_scraper_module = ScraperModuleConfig.get_current_module() # Get volume configuration current_volume = VolumeConfig.get_current_volume() return jsonify({ "success": True, "scraper_state": { "active": scraper_state.is_active, "paused": scraper_state.is_paused, "last_updated": scraper_state.last_updated.isoformat() if scraper_state.last_updated else None }, "paper_counts": paper_counts, "current_quota": current_quota, "current_scraper_module": current_scraper_module, "volume_config": current_volume }) except Exception as e: return jsonify({ "success": False, "message": f"Error getting status: {str(e)}" }), 500 @bp.route("/logs") def get_logs(): """Get recent activity logs with pagination support.""" try: # Pagination parameters page = request.args.get('page', 1, type=int) per_page = request.args.get('per_page', 20, type=int) # Legacy limit parameter for backward compatibility limit = request.args.get('limit', type=int) if limit and not request.args.get('page'): # Legacy mode: use limit without pagination logs = ActivityLog.query.order_by(ActivityLog.timestamp.desc()).limit(limit).all() return jsonify({ "success": True, "logs": [{ "id": log.id, "timestamp": log.timestamp.isoformat(), "action": log.action, "status": log.status, "description": log.description, "category": log.category } for log in logs] }) # Ensure reasonable per_page limits per_page = min(per_page, 100) # Cap at 100 items per page # Build query with optional filtering query = ActivityLog.query # Filter by categories if specified categories = request.args.getlist('category') if categories: query = query.filter(ActivityLog.category.in_(categories)) # Filter by status if specified status = request.args.get('status') if status: query = query.filter(ActivityLog.status == status) # Order by most recent first and paginate pagination = query.order_by(ActivityLog.timestamp.desc()).paginate( page=page, per_page=per_page, error_out=False ) return jsonify({ "success": True, "logs": [{ "id": log.id, "timestamp": log.timestamp.isoformat(), "action": log.action, "status": log.status, "description": log.description, "category": log.category } for log in pagination.items], "pagination": { "page": pagination.page, "pages": pagination.pages, "per_page": pagination.per_page, "total": pagination.total, "has_next": pagination.has_next, "has_prev": pagination.has_prev, "next_num": pagination.next_num if pagination.has_next else None, "prev_num": pagination.prev_num if pagination.has_prev else None } }) except Exception as e: return jsonify({ "success": False, "message": f"Error getting logs: {str(e)}" }), 500 @bp.route("/scrapers") def get_scrapers(): """Get available scrapers and their configurations.""" try: available_scrapers = get_available_scrapers() scraper_info = [] for scraper_dict in available_scrapers: try: scraper_class = scraper_dict["class"] scraper_info.append({ "name": scraper_dict["name"], "description": scraper_dict["description"], "input_statuses": list(scraper_class.INPUT_STATUSES), "output_status_success": scraper_class.OUTPUT_STATUS_SUCCESS, "output_status_failure": scraper_class.OUTPUT_STATUS_FAILURE, "output_status_processing": scraper_class.OUTPUT_STATUS_PROCESSING }) except Exception as e: scraper_info.append({ "name": scraper_dict.get("name", "unknown"), "error": f"Failed to load scraper info: {str(e)}" }) return jsonify({ "success": True, "scrapers": scraper_info }) except Exception as e: return jsonify({ "success": False, "message": f"Error getting scrapers: {str(e)}" }), 500 @bp.route("/process-papers", methods=["POST"]) def process_papers_manually(): """Manually trigger paper processing for current hour.""" try: data = request.get_json() or {} scraper_name = data.get('scraper_name') if not scraper_name: return jsonify({ "success": False, "message": "Scraper name is required" }), 400 # Process papers for current hour papers = scraper_manager.select_papers_for_processing() processed_count = len(papers) if papers else 0 result_msg = f"Manual processing triggered - {processed_count} papers selected for processing" ActivityLog.log_scraper_command( action="manual_process", status="success", description=result_msg ) return jsonify({ "success": True, "message": result_msg, "processed_count": processed_count }) except Exception as e: ActivityLog.log_scraper_command( action="manual_process", status="error", description=f"Failed to manually process papers: {str(e)}" ) return jsonify({ "success": False, "message": f"Error processing papers: {str(e)}" }), 500 @bp.route("/trigger-immediate", methods=["POST"]) def trigger_immediate_processing(): """Trigger immediate processing of papers without waiting for hourly schedule.""" try: # Get papers that should be processed this hour manager = ScraperManager() papers = manager.select_papers_for_processing() if not papers: return jsonify({ "success": True, "message": "No papers available for immediate processing", "papers_scheduled": 0 }) # Get APScheduler instance scheduler = current_app.config.get('SCHEDULER') if not scheduler: return jsonify({ "success": False, "message": "APScheduler not available" }), 500 # Schedule papers for immediate processing via APScheduler scheduled_count = 0 for paper in papers: try: import uuid job_id = f"immediate_paper_{paper.id}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{uuid.uuid4().hex[:8]}" scheduler.schedule_paper_processing(paper.id, delay_seconds=1, job_id=job_id) scheduled_count += 1 except Exception as e: ActivityLog.log_error( error_message=f"Failed to schedule paper {paper.id}: {str(e)}", source="trigger_immediate_processing" ) ActivityLog.log_scraper_command( action="trigger_immediate_processing", status="success", description=f"Triggered immediate processing of {scheduled_count} papers via APScheduler" ) return jsonify({ "success": True, "message": f"Immediate processing started for {scheduled_count} papers", "papers_scheduled": scheduled_count }) except Exception as e: ActivityLog.log_scraper_command( action="trigger_immediate_processing", status="error", description=f"Failed to trigger immediate processing: {str(e)}" ) return jsonify({ "success": False, "message": f"Error triggering immediate processing: {str(e)}" }), 500 @bp.route("/available_scrapers") def get_available_scrapers_endpoint(): """Get available scrapers for the UI dropdown.""" try: available_scrapers = get_available_scrapers() return jsonify({ "success": True, "scrapers": [{ "name": scraper["name"], "description": scraper["description"], "is_current": False # Could implement current scraper detection } for scraper in available_scrapers] }) except Exception as e: return jsonify({ "success": False, "message": f"Error getting scrapers: {str(e)}" }), 500 @bp.route("/stats") def get_stats(): """Get scraper statistics for the dashboard.""" try: hours = int(request.args.get('hours', 24)) current_time = datetime.utcnow() # Get activity logs for scraper actions in the last N hours from ..models import ActivityCategory start_time = current_time - timedelta(hours=hours) logs = ActivityLog.query.filter( ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value, ActivityLog.timestamp >= start_time ).all() # Get scraper command logs for state changes in the same time period state_logs = ActivityLog.query.filter( ActivityLog.category == ActivityCategory.SCRAPER_COMMAND.value, ActivityLog.action.in_(['start_scraper', 'pause_scraper', 'stop_scraper', 'reset_scraper']), ActivityLog.timestamp >= start_time ).order_by(ActivityLog.timestamp.asc()).all() # Group by chronological hour buckets (not hour of day) stats = [] for hour_offset in range(hours): # Calculate the hour bucket (most recent hour first when hour_offset=0) bucket_end_time = current_time - timedelta(hours=hour_offset) bucket_start_time = bucket_end_time - timedelta(hours=1) # Format hour label for display (e.g., "14:00-15:00" or "14:00" for simplicity) hour_label = bucket_start_time.strftime("%H:%M") # Initialize counters for this hour bucket bucket_stats = { "success": 0, "error": 0, "pending": 0, "hour": hour_label, "hour_offset": hour_offset, # For sorting "bucket_start": bucket_start_time, "bucket_end": bucket_end_time, "scraper_active": 0 # Default to inactive } # Count logs that fall within this hour bucket for log in logs: if bucket_start_time <= log.timestamp < bucket_end_time: if log.status == "success": bucket_stats["success"] += 1 elif log.status == "error": bucket_stats["error"] += 1 elif log.status in ("pending", "info"): bucket_stats["pending"] += 1 # Determine scraper status for this hour by checking if scraper was active # For simplicity, check if there were any successful scrapes in this hour # If there were scrapes, assume scraper was active bucket_stats["scraper_active"] = 1 if bucket_stats["success"] > 0 else 0 stats.append(bucket_stats) # Reverse so oldest hour comes first (better for chronological chart display) stats.reverse() # Prepare precise scraper state changes for timeline scraper_timeline = [] for log in state_logs: # Calculate hours ago from current time time_diff = current_time - log.timestamp hours_ago = time_diff.total_seconds() / 3600 # Only include logs within our time range if hours_ago <= hours: scraper_timeline.append({ "timestamp": log.timestamp.isoformat(), "hours_ago": hours_ago, "action": log.action, "status": log.status, "active": 1 if log.action == "start_scraper" and log.status == "success" else 0 }) # Clean up the response (remove internal fields) result = [] for stat in stats: result.append({ "success": stat["success"], "error": stat["error"], "pending": stat["pending"], "hour": stat["hour"], "scraper_active": stat["scraper_active"] }) return jsonify({ "hourly_stats": result, "scraper_timeline": scraper_timeline }) except Exception as e: return jsonify({ "success": False, "message": f"Error getting stats: {str(e)}" }), 500 @bp.route("/process_single/", methods=["POST"]) def process_single_paper_endpoint(paper_id): """Process a single paper by ID.""" try: data = request.get_json() or {} scraper_name = data.get('scraper_module') # Get the paper paper = PaperMetadata.query.get(paper_id) if not paper: return jsonify({ "success": False, "message": "Paper not found" }), 404 # Get APScheduler instance scheduler = current_app.config.get('SCHEDULER') if not scheduler: return jsonify({ "success": False, "message": "APScheduler not available" }), 500 # Schedule the paper for immediate manual processing via APScheduler # Use UUID suffix to ensure unique job IDs import uuid job_id = f"manual_paper_{paper_id}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{uuid.uuid4().hex[:8]}" try: scheduler.schedule_manual_paper_processing(paper_id, scraper_name=scraper_name, delay_seconds=1, job_id=job_id) ActivityLog.log_scraper_command( action="manual_process_single", status="success", description=f"Scheduled manual processing for paper {paper.doi} via APScheduler" + (f" using scraper '{scraper_name}'" if scraper_name else " using system default scraper") ) return jsonify({ "success": True, "message": f"Processing scheduled for paper {paper.doi}" + (f" using {scraper_name} scraper" if scraper_name else " using system default scraper"), "paper_id": paper_id }) except Exception as e: return jsonify({ "success": False, "message": f"Failed to schedule processing: {str(e)}" }), 500 except Exception as e: ActivityLog.log_scraper_command( action="manual_process_single", status="error", description=f"Failed to process paper {paper_id}: {str(e)}" ) return jsonify({ "success": False, "message": f"Error processing paper: {str(e)}" }), 500 @bp.route("/update_config", methods=["POST"]) def update_scraper_config(): """Update scraper configuration.""" try: data = request.get_json() or {} # Handle volume configuration updates for daily quota if "volume" in data: # Import the helper function from config module from .config import _update_volume new_volume = data["volume"] success, message, volume_config = _update_volume(new_volume) if success: ActivityLog.log_scraper_command( action="update_volume_config", status="success", description=f"Updated daily volume to {new_volume} papers per day" ) return jsonify({ "success": True, "message": message }) else: return jsonify({ "success": False, "message": message }), 400 # Handle scraper module configuration updates if "scraper_module" in data: from ..models import ScraperModuleConfig new_module = data["scraper_module"] # Validate that the module exists and is valid available_modules = [m["name"] for m in get_available_scrapers()] if new_module not in available_modules: return jsonify({ "success": False, "message": f"Invalid scraper module: {new_module}" }), 400 # Update the database configuration ScraperModuleConfig.set_module(new_module) ActivityLog.log_scraper_command( action="update_scraper_module", status="success", description=f"Updated scraper module to '{new_module}'" ) return jsonify({ "success": True, "message": f"Scraper module updated to '{new_module}' successfully" }) # Handle other configuration updates here if needed in the future return jsonify({ "success": True, "message": "Configuration updated successfully" }) except Exception as e: ActivityLog.log_scraper_command( action="update_scraper_config", status="error", description=f"Failed to update scraper config: {str(e)}" ) return jsonify({ "success": False, "message": f"Error updating scraper config: {str(e)}" }), 500 @bp.route("/publishers") def get_publishers(): """Get publisher overview data for the scraper overview modal.""" try: import os import glob # Get available parser modules parsers_dir = os.path.join(current_app.root_path, 'parsers') parser_files = glob.glob(os.path.join(parsers_dir, '*_parser.py')) available_parsers = [] for parser_file in parser_files: filename = os.path.basename(parser_file) if filename != 'base_parser.py': # Skip the base parser parser_name = filename.replace('_parser.py', '') available_parsers.append(parser_name) # Get publishers from database (papers that have publisher detected) publisher_query = db.session.query( PaperMetadata.publisher, db.func.count(PaperMetadata.id).label('paper_count') ).filter( PaperMetadata.publisher.isnot(None), PaperMetadata.publisher != '' ).group_by(PaperMetadata.publisher).all() publishers_data = [] for publisher, count in publisher_query: # Check if a parser exists for this publisher has_parser = publisher in available_parsers publishers_data.append({ 'name': publisher, 'paper_count': count, 'has_parser': has_parser, 'parser_status': 'available' if has_parser else 'missing' }) # Sort by paper count descending publishers_data.sort(key=lambda x: x['paper_count'], reverse=True) # Get totals total_papers_with_publisher = sum(p['paper_count'] for p in publishers_data) total_papers_without_publisher = PaperMetadata.query.filter( db.or_(PaperMetadata.publisher.is_(None), PaperMetadata.publisher == '') ).count() return jsonify({ 'success': True, 'data': { 'publishers': publishers_data, 'available_parsers': available_parsers, 'stats': { 'total_publishers': len(publishers_data), 'publishers_with_parsers': len([p for p in publishers_data if p['has_parser']]), 'publishers_without_parsers': len([p for p in publishers_data if not p['has_parser']]), 'total_papers_with_publisher': total_papers_with_publisher, 'total_papers_without_publisher': total_papers_without_publisher } } }) except Exception as e: return jsonify({ 'success': False, 'message': f'Error getting publisher data: {str(e)}' }), 500