"""
Simplified scraper blueprint using the new ScraperManager and hourly scheduling system.
"""
from flask import Blueprint, jsonify, render_template, request, current_app
from ..models import ActivityLog, PaperMetadata, ScraperState, VolumeConfig
from ..scrapers.manager import ScraperManager
from ..scrapers.factory import get_available_scrapers
from ..db import db
from ..defaults import MAX_VOLUME
from datetime import datetime, timedelta
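# Routes exposed under the /scraper prefix: /, /start, /pause, /stop, /reset,
# /status, /logs, /scrapers, /process-papers, /trigger-immediate,
# /available_scrapers, /stats, /process_single/<id>, /update_config, /publishers.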
bp = Blueprint("scraper", __name__, url_prefix="/scraper")
# Initialize the scraper manager
scraper_manager = ScraperManager()
@bp.route("/")
def index():
"""Main scraper page."""
# Get current scraper state
scraper_state = ScraperState.get_current_state()
# Get available scrapers
available_scrapers = get_available_scrapers()
# Get recent activity logs
recent_logs = ActivityLog.query.order_by(ActivityLog.timestamp.desc()).limit(50).all()
# Get volume configuration
volume_config = VolumeConfig.get_current_volume()
# Get scraper module configuration
from ..models import ScraperModuleConfig
current_scraper_module = ScraperModuleConfig.get_current_module()
# Get paper counts by status
paper_counts = {
'new': PaperMetadata.query.filter_by(status='New').count(),
'processing': PaperMetadata.query.filter_by(status='Processing').count(),
'done': PaperMetadata.query.filter_by(status='Done').count(),
'failed': PaperMetadata.query.filter_by(status='Failed').count(),
'pending': PaperMetadata.query.filter_by(status='Pending').count(),
'retrying': PaperMetadata.query.filter_by(status='Retrying').count(),
}
return render_template(
"scraper.html.jinja",
scraper_state=scraper_state,
available_scrapers=available_scrapers,
recent_logs=recent_logs,
paper_counts=paper_counts,
volume_config=volume_config,
max_volume=MAX_VOLUME,
current_scraper_module=current_scraper_module,
available_scraper_modules=[s["name"] for s in available_scrapers],
scraper_details={s["name"]: s for s in available_scrapers}
)
@bp.route("/start", methods=["POST"])
def start_scraper():
"""Start the hourly scraper scheduling."""
try:
        # Require a JSON payload; an empty or missing body is treated as {}
        if request.is_json:
            data = request.get_json(silent=True)
            if data is None:
                data = {}
else:
return jsonify({"success": False, "message": "Invalid payload format. Expected JSON."}), 400
# Start the scraper using manager
result = scraper_manager.start_scraper()
if result["status"] == "success":
ActivityLog.log_scraper_command(
action="start_scraper",
status="success",
description="Scraper started successfully."
)
return jsonify({"success": True, "message": result["message"]})
else:
ActivityLog.log_scraper_command(
action="start_scraper",
status="failure",
description=f"Failed to start scraper: {result['message']}"
)
return jsonify({"success": False, "message": result["message"]}), 400
except Exception as e:
ActivityLog.log_scraper_command(
action="start_scraper",
status="error",
description=f"Failed to start scraper: {str(e)}"
)
return jsonify({"success": False, "message": f"An error occurred: {str(e)}"}), 500
@bp.route("/pause", methods=["POST"])
def pause_scraper():
"""Pause the scraper."""
try:
result = scraper_manager.pause_scraper()
if result["status"] == "success":
ActivityLog.log_scraper_command(
action="pause_scraper",
status="success",
description="Scraper paused successfully"
)
return jsonify({
"success": True,
"message": result["message"]
})
else:
return jsonify({
"success": False,
"message": result["message"]
}), 400
except Exception as e:
ActivityLog.log_scraper_command(
action="pause_scraper",
status="error",
description=f"Failed to pause scraper: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error pausing scraper: {str(e)}"
}), 500
@bp.route("/stop", methods=["POST"])
def stop_scraper():
"""Stop the scraper and revert processing papers."""
try:
result = scraper_manager.stop_scraper()
        # Log what the manager returned (debug-level, instead of printing to stdout)
        current_app.logger.debug(f"stop_scraper result: {result}")
# Always log the stop attempt regardless of result
ActivityLog.log_scraper_command(
action="stop_scraper_attempt",
status=result.get("status", "unknown"),
description=f"Stop scraper called - result: {result}"
)
if result["status"] == "success":
ActivityLog.log_scraper_command(
action="stop_scraper",
status="success",
description="Scraper stopped and papers reverted to original status"
)
return jsonify({
"success": True,
"message": result["message"]
})
else:
return jsonify({
"success": False,
"message": result["message"]
}), 400
except Exception as e:
ActivityLog.log_scraper_command(
action="stop_scraper",
status="error",
description=f"Failed to stop scraper: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error stopping scraper: {str(e)}"
}), 500
@bp.route("/reset", methods=["POST"])
def reset_scraper():
"""Reset the scraper state and revert all processing papers."""
try:
result = scraper_manager.reset_scraper()
if result["status"] == "success":
ActivityLog.log_scraper_command(
action="reset_scraper",
status="success",
description="Scraper reset and all processing papers reverted"
)
return jsonify({
"success": True,
"message": result["message"]
})
else:
return jsonify({
"success": False,
"message": result["message"]
}), 400
except Exception as e:
ActivityLog.log_scraper_command(
action="reset_scraper",
status="error",
description=f"Failed to reset scraper: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error resetting scraper: {str(e)}"
}), 500
@bp.route("/status")
def get_status():
"""Get current scraper status and statistics."""
try:
scraper_state = ScraperState.get_current_state()
# Get paper counts by status
paper_counts = {
'new': PaperMetadata.query.filter_by(status='New').count(),
'processing': PaperMetadata.query.filter_by(status='Processing').count(),
'done': PaperMetadata.query.filter_by(status='Done').count(),
'failed': PaperMetadata.query.filter_by(status='Failed').count(),
'pending': PaperMetadata.query.filter_by(status='Pending').count(),
'retrying': PaperMetadata.query.filter_by(status='Retrying').count(),
}
# Get current hour quota info
current_quota = scraper_manager.get_current_hour_quota()
# Get current scraper module configuration
from ..models import ScraperModuleConfig
current_scraper_module = ScraperModuleConfig.get_current_module()
# Get volume configuration
current_volume = VolumeConfig.get_current_volume()
return jsonify({
"success": True,
"scraper_state": {
"active": scraper_state.is_active,
"paused": scraper_state.is_paused,
"last_updated": scraper_state.last_updated.isoformat() if scraper_state.last_updated else None
},
"paper_counts": paper_counts,
"current_quota": current_quota,
"current_scraper_module": current_scraper_module,
"volume_config": current_volume
})
except Exception as e:
return jsonify({
"success": False,
"message": f"Error getting status: {str(e)}"
}), 500
@bp.route("/logs")
def get_logs():
"""Get recent activity logs with pagination support."""
try:
# Pagination parameters
page = request.args.get('page', 1, type=int)
per_page = request.args.get('per_page', 20, type=int)
# Legacy limit parameter for backward compatibility
limit = request.args.get('limit', type=int)
if limit and not request.args.get('page'):
# Legacy mode: use limit without pagination
logs = ActivityLog.query.order_by(ActivityLog.timestamp.desc()).limit(limit).all()
return jsonify({
"success": True,
"logs": [{
"id": log.id,
"timestamp": log.timestamp.isoformat(),
"action": log.action,
"status": log.status,
"description": log.description,
"category": log.category
} for log in logs]
})
# Ensure reasonable per_page limits
per_page = min(per_page, 100) # Cap at 100 items per page
# Build query with optional filtering
query = ActivityLog.query
# Filter by categories if specified
categories = request.args.getlist('category')
if categories:
query = query.filter(ActivityLog.category.in_(categories))
# Filter by status if specified
status = request.args.get('status')
if status:
query = query.filter(ActivityLog.status == status)
# Order by most recent first and paginate
pagination = query.order_by(ActivityLog.timestamp.desc()).paginate(
page=page,
per_page=per_page,
error_out=False
)
return jsonify({
"success": True,
"logs": [{
"id": log.id,
"timestamp": log.timestamp.isoformat(),
"action": log.action,
"status": log.status,
"description": log.description,
"category": log.category
} for log in pagination.items],
"pagination": {
"page": pagination.page,
"pages": pagination.pages,
"per_page": pagination.per_page,
"total": pagination.total,
"has_next": pagination.has_next,
"has_prev": pagination.has_prev,
"next_num": pagination.next_num if pagination.has_next else None,
"prev_num": pagination.prev_num if pagination.has_prev else None
}
})
except Exception as e:
return jsonify({
"success": False,
"message": f"Error getting logs: {str(e)}"
}), 500
@bp.route("/scrapers")
def get_scrapers():
"""Get available scrapers and their configurations."""
try:
available_scrapers = get_available_scrapers()
scraper_info = []
for scraper_dict in available_scrapers:
try:
scraper_class = scraper_dict["class"]
scraper_info.append({
"name": scraper_dict["name"],
"description": scraper_dict["description"],
"input_statuses": list(scraper_class.INPUT_STATUSES),
"output_status_success": scraper_class.OUTPUT_STATUS_SUCCESS,
"output_status_failure": scraper_class.OUTPUT_STATUS_FAILURE,
"output_status_processing": scraper_class.OUTPUT_STATUS_PROCESSING
})
except Exception as e:
scraper_info.append({
"name": scraper_dict.get("name", "unknown"),
"error": f"Failed to load scraper info: {str(e)}"
})
return jsonify({
"success": True,
"scrapers": scraper_info
})
except Exception as e:
return jsonify({
"success": False,
"message": f"Error getting scrapers: {str(e)}"
}), 500
@bp.route("/process-papers", methods=["POST"])
def process_papers_manually():
"""Manually trigger paper processing for current hour."""
try:
        data = request.get_json(silent=True) or {}
scraper_name = data.get('scraper_name')
if not scraper_name:
return jsonify({
"success": False,
"message": "Scraper name is required"
}), 400
# Process papers for current hour
papers = scraper_manager.select_papers_for_processing()
processed_count = len(papers) if papers else 0
result_msg = f"Manual processing triggered - {processed_count} papers selected for processing"
ActivityLog.log_scraper_command(
action="manual_process",
status="success",
description=result_msg
)
return jsonify({
"success": True,
"message": result_msg,
"processed_count": processed_count
})
except Exception as e:
ActivityLog.log_scraper_command(
action="manual_process",
status="error",
description=f"Failed to manually process papers: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error processing papers: {str(e)}"
}), 500
@bp.route("/trigger-immediate", methods=["POST"])
def trigger_immediate_processing():
"""Trigger immediate processing of papers without waiting for hourly schedule."""
try:
# Get papers that should be processed this hour
        papers = scraper_manager.select_papers_for_processing()
if not papers:
return jsonify({
"success": True,
"message": "No papers available for immediate processing",
"papers_scheduled": 0
})
# Get APScheduler instance
scheduler = current_app.config.get('SCHEDULER')
if not scheduler:
return jsonify({
"success": False,
"message": "APScheduler not available"
}), 500
# Schedule papers for immediate processing via APScheduler
        import uuid
        scheduled_count = 0
        for paper in papers:
            try:
                # Timestamp plus random UUID suffix keeps each job ID unique
                job_id = f"immediate_paper_{paper.id}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{uuid.uuid4().hex[:8]}"
scheduler.schedule_paper_processing(paper.id, delay_seconds=1, job_id=job_id)
scheduled_count += 1
except Exception as e:
ActivityLog.log_error(
error_message=f"Failed to schedule paper {paper.id}: {str(e)}",
source="trigger_immediate_processing"
)
ActivityLog.log_scraper_command(
action="trigger_immediate_processing",
status="success",
description=f"Triggered immediate processing of {scheduled_count} papers via APScheduler"
)
return jsonify({
"success": True,
"message": f"Immediate processing started for {scheduled_count} papers",
"papers_scheduled": scheduled_count
})
except Exception as e:
ActivityLog.log_scraper_command(
action="trigger_immediate_processing",
status="error",
description=f"Failed to trigger immediate processing: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error triggering immediate processing: {str(e)}"
}), 500
@bp.route("/available_scrapers")
def get_available_scrapers_endpoint():
"""Get available scrapers for the UI dropdown."""
try:
        available_scrapers = get_available_scrapers()
        # Mark the currently configured module so the UI dropdown can preselect it
        from ..models import ScraperModuleConfig
        current_module = ScraperModuleConfig.get_current_module()
        return jsonify({
            "success": True,
            "scrapers": [{
                "name": scraper["name"],
                "description": scraper["description"],
                "is_current": scraper["name"] == current_module
            } for scraper in available_scrapers]
})
except Exception as e:
return jsonify({
"success": False,
"message": f"Error getting scrapers: {str(e)}"
}), 500
@bp.route("/stats")
def get_stats():
"""Get scraper statistics for the dashboard."""
try:
        hours = request.args.get('hours', 24, type=int)
current_time = datetime.utcnow()
# Get activity logs for scraper actions in the last N hours
from ..models import ActivityCategory
start_time = current_time - timedelta(hours=hours)
logs = ActivityLog.query.filter(
ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value,
ActivityLog.timestamp >= start_time
).all()
# Get scraper command logs for state changes in the same time period
state_logs = ActivityLog.query.filter(
ActivityLog.category == ActivityCategory.SCRAPER_COMMAND.value,
ActivityLog.action.in_(['start_scraper', 'pause_scraper', 'stop_scraper', 'reset_scraper']),
ActivityLog.timestamp >= start_time
).order_by(ActivityLog.timestamp.asc()).all()
# Group by chronological hour buckets (not hour of day)
stats = []
for hour_offset in range(hours):
# Calculate the hour bucket (most recent hour first when hour_offset=0)
bucket_end_time = current_time - timedelta(hours=hour_offset)
bucket_start_time = bucket_end_time - timedelta(hours=1)
            # Label the bucket by its start time for display, e.g. "14:00"
hour_label = bucket_start_time.strftime("%H:%M")
# Initialize counters for this hour bucket
bucket_stats = {
"success": 0,
"error": 0,
"pending": 0,
"hour": hour_label,
"hour_offset": hour_offset, # For sorting
"bucket_start": bucket_start_time,
"bucket_end": bucket_end_time,
"scraper_active": 0 # Default to inactive
}
# Count logs that fall within this hour bucket
for log in logs:
if bucket_start_time <= log.timestamp < bucket_end_time:
if log.status == "success":
bucket_stats["success"] += 1
elif log.status == "error":
bucket_stats["error"] += 1
elif log.status in ("pending", "info"):
bucket_stats["pending"] += 1
            # Approximate activity: treat any successful scrape in this bucket as "scraper active"
            bucket_stats["scraper_active"] = 1 if bucket_stats["success"] > 0 else 0
stats.append(bucket_stats)
# Reverse so oldest hour comes first (better for chronological chart display)
stats.reverse()
# Prepare precise scraper state changes for timeline
scraper_timeline = []
for log in state_logs:
# Calculate hours ago from current time
time_diff = current_time - log.timestamp
hours_ago = time_diff.total_seconds() / 3600
# Only include logs within our time range
if hours_ago <= hours:
scraper_timeline.append({
"timestamp": log.timestamp.isoformat(),
"hours_ago": hours_ago,
"action": log.action,
"status": log.status,
"active": 1 if log.action == "start_scraper" and log.status == "success" else 0
})
# Clean up the response (remove internal fields)
result = []
for stat in stats:
result.append({
"success": stat["success"],
"error": stat["error"],
"pending": stat["pending"],
"hour": stat["hour"],
"scraper_active": stat["scraper_active"]
})
        return jsonify({
            "success": True,
            "hourly_stats": result,
            "scraper_timeline": scraper_timeline
        })
except Exception as e:
return jsonify({
"success": False,
"message": f"Error getting stats: {str(e)}"
}), 500
@bp.route("/process_single/<int:paper_id>", methods=["POST"])
def process_single_paper_endpoint(paper_id):
"""Process a single paper by ID."""
try:
        data = request.get_json(silent=True) or {}
scraper_name = data.get('scraper_module')
# Get the paper
paper = PaperMetadata.query.get(paper_id)
if not paper:
return jsonify({
"success": False,
"message": "Paper not found"
}), 404
# Get APScheduler instance
scheduler = current_app.config.get('SCHEDULER')
if not scheduler:
return jsonify({
"success": False,
"message": "APScheduler not available"
}), 500
# Schedule the paper for immediate manual processing via APScheduler
# Use UUID suffix to ensure unique job IDs
import uuid
job_id = f"manual_paper_{paper_id}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{uuid.uuid4().hex[:8]}"
try:
scheduler.schedule_manual_paper_processing(paper_id, scraper_name=scraper_name, delay_seconds=1, job_id=job_id)
ActivityLog.log_scraper_command(
action="manual_process_single",
status="success",
description=f"Scheduled manual processing for paper {paper.doi} via APScheduler" +
(f" using scraper '{scraper_name}'" if scraper_name else " using system default scraper")
)
return jsonify({
"success": True,
"message": f"Processing scheduled for paper {paper.doi}" +
(f" using {scraper_name} scraper" if scraper_name else " using system default scraper"),
"paper_id": paper_id
})
except Exception as e:
return jsonify({
"success": False,
"message": f"Failed to schedule processing: {str(e)}"
}), 500
except Exception as e:
ActivityLog.log_scraper_command(
action="manual_process_single",
status="error",
description=f"Failed to process paper {paper_id}: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error processing paper: {str(e)}"
}), 500
@bp.route("/update_config", methods=["POST"])
def update_scraper_config():
"""Update scraper configuration."""
try:
        data = request.get_json(silent=True) or {}
# Handle volume configuration updates for daily quota
if "volume" in data:
# Import the helper function from config module
from .config import _update_volume
new_volume = data["volume"]
success, message, volume_config = _update_volume(new_volume)
if success:
ActivityLog.log_scraper_command(
action="update_volume_config",
status="success",
description=f"Updated daily volume to {new_volume} papers per day"
)
return jsonify({
"success": True,
"message": message
})
else:
return jsonify({
"success": False,
"message": message
}), 400
# Handle scraper module configuration updates
if "scraper_module" in data:
from ..models import ScraperModuleConfig
new_module = data["scraper_module"]
# Validate that the module exists and is valid
available_modules = [m["name"] for m in get_available_scrapers()]
if new_module not in available_modules:
return jsonify({
"success": False,
"message": f"Invalid scraper module: {new_module}"
}), 400
# Update the database configuration
ScraperModuleConfig.set_module(new_module)
ActivityLog.log_scraper_command(
action="update_scraper_module",
status="success",
description=f"Updated scraper module to '{new_module}'"
)
return jsonify({
"success": True,
"message": f"Scraper module updated to '{new_module}' successfully"
})
# Handle other configuration updates here if needed in the future
return jsonify({
"success": True,
"message": "Configuration updated successfully"
})
except Exception as e:
ActivityLog.log_scraper_command(
action="update_scraper_config",
status="error",
description=f"Failed to update scraper config: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error updating scraper config: {str(e)}"
}), 500
@bp.route("/publishers")
def get_publishers():
"""Get publisher overview data for the scraper overview modal."""
try:
import os
import glob
# Get available parser modules
parsers_dir = os.path.join(current_app.root_path, 'parsers')
parser_files = glob.glob(os.path.join(parsers_dir, '*_parser.py'))
available_parsers = []
for parser_file in parser_files:
filename = os.path.basename(parser_file)
if filename != 'base_parser.py': # Skip the base parser
parser_name = filename.replace('_parser.py', '')
available_parsers.append(parser_name)
# Get publishers from database (papers that have publisher detected)
publisher_query = db.session.query(
PaperMetadata.publisher,
db.func.count(PaperMetadata.id).label('paper_count')
).filter(
PaperMetadata.publisher.isnot(None),
PaperMetadata.publisher != ''
).group_by(PaperMetadata.publisher).all()
publishers_data = []
for publisher, count in publisher_query:
# Check if a parser exists for this publisher
has_parser = publisher in available_parsers
publishers_data.append({
'name': publisher,
'paper_count': count,
'has_parser': has_parser,
'parser_status': 'available' if has_parser else 'missing'
})
# Sort by paper count descending
publishers_data.sort(key=lambda x: x['paper_count'], reverse=True)
# Get totals
total_papers_with_publisher = sum(p['paper_count'] for p in publishers_data)
total_papers_without_publisher = PaperMetadata.query.filter(
db.or_(PaperMetadata.publisher.is_(None), PaperMetadata.publisher == '')
).count()
return jsonify({
'success': True,
'data': {
'publishers': publishers_data,
'available_parsers': available_parsers,
'stats': {
'total_publishers': len(publishers_data),
'publishers_with_parsers': len([p for p in publishers_data if p['has_parser']]),
'publishers_without_parsers': len([p for p in publishers_data if not p['has_parser']]),
'total_papers_with_publisher': total_papers_with_publisher,
'total_papers_without_publisher': total_papers_without_publisher
}
}
})
except Exception as e:
return jsonify({
'success': False,
'message': f'Error getting publisher data: {str(e)}'
}), 500