
"""
Simplified scraper blueprint using the new ScraperManager and hourly scheduling system.
"""
from flask import Blueprint, jsonify, render_template, request, current_app
from ..models import ActivityLog, PaperMetadata, ScraperState, VolumeConfig
from ..scrapers.manager import ScraperManager
from ..scrapers.factory import get_available_scrapers
from ..db import db
from ..defaults import MAX_VOLUME
from datetime import datetime, timedelta
bp = Blueprint("scraper", __name__, url_prefix="/scraper")
# Initialize the scraper manager
scraper_manager = ScraperManager()
@bp.route("/")
def index():
"""Main scraper page."""
# Get current scraper state
scraper_state = ScraperState.get_current_state()
# Get available scrapers
available_scrapers = get_available_scrapers()
# Get recent activity logs
recent_logs = ActivityLog.query.order_by(ActivityLog.timestamp.desc()).limit(50).all()
# Get volume configuration
volume_config = VolumeConfig.get_current_volume()
# Get paper counts by status
paper_counts = {
'new': PaperMetadata.query.filter_by(status='New').count(),
'processing': PaperMetadata.query.filter_by(status='Processing').count(),
'done': PaperMetadata.query.filter_by(status='Done').count(),
'failed': PaperMetadata.query.filter_by(status='Failed').count(),
'pending': PaperMetadata.query.filter_by(status='Pending').count(),
'retrying': PaperMetadata.query.filter_by(status='Retrying').count(),
}
return render_template(
"scraper.html.jinja",
scraper_state=scraper_state,
available_scrapers=available_scrapers,
recent_logs=recent_logs,
paper_counts=paper_counts,
volume_config=volume_config,
max_volume=MAX_VOLUME
)
@bp.route("/start", methods=["POST"])
def start_scraper():
"""Start the hourly scraper scheduling."""
try:
# Handle both JSON and form data
if request.is_json:
data = request.get_json() or {}
else:
data = request.form.to_dict()
        scraper_name = data.get('scraper_name', 'dummy')

        # NOTE: scraper_name is accepted for forward compatibility but is not
        # passed on yet; ScraperManager.start_scraper() takes no arguments here.
        result = scraper_manager.start_scraper()
if result["status"] == "success":
ActivityLog.log_scraper_command(
action="start_scraper",
status="success",
description="Started scraper with hourly scheduling"
)
return jsonify({
"success": True,
"message": result["message"]
})
else:
return jsonify({
"success": False,
"message": result["message"]
}), 400
except Exception as e:
ActivityLog.log_scraper_command(
action="start_scraper",
status="error",
description=f"Failed to start scraper: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error starting scraper: {str(e)}"
}), 500
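
# Example (hedged): a minimal sketch of calling /scraper/start from a test,
# assuming an application factory importable as `create_app` (that name and
# module path are assumptions, not taken from this file). The endpoint
# accepts either JSON or form data, so both variants are shown.
#
#   from myapp import create_app  # hypothetical import path
#
#   app = create_app()
#   with app.test_client() as client:
#       resp = client.post("/scraper/start", json={"scraper_name": "dummy"})
#       # or, as form data:
#       resp = client.post("/scraper/start", data={"scraper_name": "dummy"})
#       print(resp.get_json())  # e.g. {"success": True, "message": "..."}
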
@bp.route("/pause", methods=["POST"])
def pause_scraper():
"""Pause the scraper."""
try:
result = scraper_manager.pause_scraper()
if result["status"] == "success":
ActivityLog.log_scraper_command(
action="pause_scraper",
status="success",
description="Scraper paused successfully"
)
return jsonify({
"success": True,
"message": result["message"]
})
else:
return jsonify({
"success": False,
"message": result["message"]
}), 400
except Exception as e:
ActivityLog.log_scraper_command(
action="pause_scraper",
status="error",
description=f"Failed to pause scraper: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error pausing scraper: {str(e)}"
}), 500
@bp.route("/stop", methods=["POST"])
def stop_scraper():
"""Stop the scraper and revert processing papers."""
try:
result = scraper_manager.stop_scraper()
        # Log the stop attempt regardless of the outcome; the raw result is
        # included in the log entry so failures can be diagnosed later.
ActivityLog.log_scraper_command(
action="stop_scraper_attempt",
status=result.get("status", "unknown"),
description=f"Stop scraper called - result: {result}"
)
if result["status"] == "success":
ActivityLog.log_scraper_command(
action="stop_scraper",
status="success",
description="Scraper stopped and papers reverted to original status"
)
return jsonify({
"success": True,
"message": result["message"]
})
else:
return jsonify({
"success": False,
"message": result["message"]
}), 400
except Exception as e:
ActivityLog.log_scraper_command(
action="stop_scraper",
status="error",
description=f"Failed to stop scraper: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error stopping scraper: {str(e)}"
}), 500
@bp.route("/reset", methods=["POST"])
def reset_scraper():
"""Reset the scraper state and revert all processing papers."""
try:
result = scraper_manager.reset_scraper()
if result["status"] == "success":
ActivityLog.log_scraper_command(
action="reset_scraper",
status="success",
description="Scraper reset and all processing papers reverted"
)
return jsonify({
"success": True,
"message": result["message"]
})
else:
return jsonify({
"success": False,
"message": result["message"]
}), 400
except Exception as e:
ActivityLog.log_scraper_command(
action="reset_scraper",
status="error",
description=f"Failed to reset scraper: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error resetting scraper: {str(e)}"
}), 500
@bp.route("/status")
def get_status():
"""Get current scraper status and statistics."""
try:
scraper_state = ScraperState.get_current_state()
# Get paper counts by status
paper_counts = {
'new': PaperMetadata.query.filter_by(status='New').count(),
'processing': PaperMetadata.query.filter_by(status='Processing').count(),
'done': PaperMetadata.query.filter_by(status='Done').count(),
'failed': PaperMetadata.query.filter_by(status='Failed').count(),
'pending': PaperMetadata.query.filter_by(status='Pending').count(),
'retrying': PaperMetadata.query.filter_by(status='Retrying').count(),
}
# Get current hour quota info
current_quota = scraper_manager.get_current_hour_quota()
return jsonify({
"success": True,
"scraper_state": {
"active": scraper_state.is_active,
"paused": scraper_state.is_paused,
"last_updated": scraper_state.last_updated.isoformat() if scraper_state.last_updated else None
},
"paper_counts": paper_counts,
"current_quota": current_quota
})
except Exception as e:
return jsonify({
"success": False,
"message": f"Error getting status: {str(e)}"
}), 500
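
# Example (hedged): the response shape a frontend poller can expect from
# /scraper/status, based on the dict built above. Values are illustrative;
# the type of "current_quota" depends on get_current_hour_quota(), and is
# shown here as an integer only for illustration.
#
#   {
#     "success": true,
#     "scraper_state": {
#       "active": true,
#       "paused": false,
#       "last_updated": "2025-01-01T12:00:00"
#     },
#     "paper_counts": {"new": 10, "processing": 2, "done": 5,
#                      "failed": 1, "pending": 0, "retrying": 0},
#     "current_quota": 8
#   }
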
@bp.route("/logs")
def get_logs():
"""Get recent activity logs."""
try:
limit = request.args.get('limit', 50, type=int)
logs = ActivityLog.query.order_by(ActivityLog.timestamp.desc()).limit(limit).all()
return jsonify({
"success": True,
"logs": [{
"id": log.id,
"timestamp": log.timestamp.isoformat(),
"action": log.action,
"status": log.status,
"description": log.description,
"category": log.category.name if log.category else None
} for log in logs]
})
except Exception as e:
return jsonify({
"success": False,
"message": f"Error getting logs: {str(e)}"
}), 500
@bp.route("/scrapers")
def get_scrapers():
"""Get available scrapers and their configurations."""
try:
available_scrapers = get_available_scrapers()
scraper_info = []
for scraper_dict in available_scrapers:
try:
scraper_class = scraper_dict["class"]
scraper_info.append({
"name": scraper_dict["name"],
"description": scraper_dict["description"],
"input_statuses": list(scraper_class.INPUT_STATUSES),
"output_status_success": scraper_class.OUTPUT_STATUS_SUCCESS,
"output_status_failure": scraper_class.OUTPUT_STATUS_FAILURE,
"output_status_processing": scraper_class.OUTPUT_STATUS_PROCESSING
})
except Exception as e:
scraper_info.append({
"name": scraper_dict.get("name", "unknown"),
"error": f"Failed to load scraper info: {str(e)}"
})
return jsonify({
"success": True,
"scrapers": scraper_info
})
except Exception as e:
return jsonify({
"success": False,
"message": f"Error getting scrapers: {str(e)}"
}), 500
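
# Sketch (hedged): the class attributes read above imply a scraper contract
# roughly like the one below. Only the four attributes are actually required
# by this endpoint; the class name and anything else are illustrative.
#
#   class DummyScraper:
#       INPUT_STATUSES = ["New", "Retrying"]
#       OUTPUT_STATUS_SUCCESS = "Done"
#       OUTPUT_STATUS_FAILURE = "Failed"
#       OUTPUT_STATUS_PROCESSING = "Processing"
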
@bp.route("/process-papers", methods=["POST"])
def process_papers_manually():
"""Manually trigger paper processing for current hour."""
try:
data = request.get_json() or {}
scraper_name = data.get('scraper_name')
if not scraper_name:
return jsonify({
"success": False,
"message": "Scraper name is required"
}), 400
        # Select the papers that fall into the current hour's quota; this only
        # selects candidates rather than processing them directly.
papers = scraper_manager.select_papers_for_processing()
processed_count = len(papers) if papers else 0
result_msg = f"Manual processing triggered - {processed_count} papers selected for processing"
ActivityLog.log_scraper_command(
action="manual_process",
status="success",
description=result_msg
)
return jsonify({
"success": True,
"message": result_msg,
"processed_count": processed_count
})
except Exception as e:
ActivityLog.log_scraper_command(
action="manual_process",
status="error",
description=f"Failed to manually process papers: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error processing papers: {str(e)}"
}), 500
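
# Example (hedged): triggering manual selection from the command line. The
# host and port are assumptions (Flask's development defaults); the endpoint
# requires a JSON body with "scraper_name" even though selection itself does
# not use it yet.
#
#   curl -X POST http://localhost:5000/scraper/process-papers \
#        -H "Content-Type: application/json" \
#        -d '{"scraper_name": "dummy"}'
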
@bp.route("/trigger-immediate", methods=["POST"])
def trigger_immediate_processing():
"""Trigger immediate processing of papers without waiting for hourly schedule."""
try:
        # Get papers that should be processed this hour (reuse the
        # module-level scraper manager instead of creating a new one)
        papers = scraper_manager.select_papers_for_processing()
if not papers:
return jsonify({
"success": True,
"message": "No papers available for immediate processing",
"papers_scheduled": 0
})
# Get APScheduler instance
scheduler = current_app.config.get('SCHEDULER')
if not scheduler:
return jsonify({
"success": False,
"message": "APScheduler not available"
}), 500
# Schedule papers for immediate processing via APScheduler
scheduled_count = 0
for paper in papers:
try:
job_id = f"immediate_paper_{paper.id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
scheduler.schedule_paper_processing(paper.id, delay_seconds=1, job_id=job_id)
scheduled_count += 1
except Exception as e:
ActivityLog.log_error(
error_message=f"Failed to schedule paper {paper.id}: {str(e)}",
source="trigger_immediate_processing"
)
ActivityLog.log_scraper_command(
action="trigger_immediate_processing",
status="success",
description=f"Triggered immediate processing of {scheduled_count} papers via APScheduler"
)
return jsonify({
"success": True,
"message": f"Immediate processing started for {scheduled_count} papers",
"papers_scheduled": scheduled_count
})
except Exception as e:
ActivityLog.log_scraper_command(
action="trigger_immediate_processing",
status="error",
description=f"Failed to trigger immediate processing: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error triggering immediate processing: {str(e)}"
}), 500
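
# Sketch (hedged): the SCHEDULER object pulled from app config above is
# expected to expose schedule_paper_processing(paper_id, delay_seconds,
# job_id). A minimal wrapper around APScheduler that would satisfy that
# contract could look like the following; `process_paper_job` is a
# hypothetical task function, not something defined in this module.
#
#   from datetime import datetime, timedelta
#   from apscheduler.schedulers.background import BackgroundScheduler
#
#   class ScraperScheduler:
#       def __init__(self):
#           self.scheduler = BackgroundScheduler()
#           self.scheduler.start()
#
#       def schedule_paper_processing(self, paper_id, delay_seconds=0, job_id=None):
#           run_date = datetime.now() + timedelta(seconds=delay_seconds)
#           self.scheduler.add_job(
#               process_paper_job,       # hypothetical: processes one paper
#               trigger="date",
#               run_date=run_date,
#               id=job_id,
#               args=[paper_id],
#           )
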
@bp.route("/available_scrapers")
def get_available_scrapers_endpoint():
"""Get available scrapers for the UI dropdown."""
try:
available_scrapers = get_available_scrapers()
return jsonify({
"success": True,
"scrapers": [{
"name": scraper["name"],
"description": scraper["description"],
"is_current": False # Could implement current scraper detection
} for scraper in available_scrapers]
})
except Exception as e:
return jsonify({
"success": False,
"message": f"Error getting scrapers: {str(e)}"
}), 500
@bp.route("/stats")
def get_stats():
"""Get scraper statistics for the dashboard."""
try:
hours = int(request.args.get('hours', 24))
current_time = datetime.utcnow()
cutoff_time = current_time.replace(minute=0, second=0, microsecond=0)
# Get activity logs for scraper actions in the last N hours
from ..models import ActivityCategory
start_time = cutoff_time - timedelta(hours=hours)
logs = ActivityLog.query.filter(
ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value,
ActivityLog.timestamp >= start_time
).all()
        # Group counts into hour-of-day buckets (0-23) by status; with the
        # default 24-hour window each bucket maps to exactly one hour.
        stats = {}
for hour_offset in range(hours):
target_hour = (current_time.hour - hour_offset) % 24
stats[target_hour] = {
"success": 0,
"error": 0,
"pending": 0,
"hour": target_hour,
}
for log in logs:
hour = log.timestamp.hour
if hour in stats:
if log.status == "success":
stats[hour]["success"] += 1
elif log.status == "error":
stats[hour]["error"] += 1
elif log.status in ("pending", "info"):
stats[hour]["pending"] += 1
# Convert to list for easier consumption by JavaScript
result = [stats[hour] for hour in sorted(stats.keys())]
return jsonify(result)
except Exception as e:
return jsonify({
"success": False,
"message": f"Error getting stats: {str(e)}"
}), 500
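
# Example (hedged): unlike the other endpoints in this blueprint, /scraper/stats
# returns a bare JSON list rather than a {"success": ...} wrapper, with one
# entry per hour-of-day bucket (counts below are illustrative):
#
#   [
#     {"hour": 0, "success": 3, "error": 0, "pending": 1},
#     {"hour": 1, "success": 5, "error": 2, "pending": 0},
#     ...
#   ]
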
@bp.route("/process_single/<int:paper_id>", methods=["POST"])
def process_single_paper_endpoint(paper_id):
"""Process a single paper by ID."""
try:
        data = request.get_json() or {}
        # 'scraper_module' is accepted for forward compatibility but is not
        # used by this endpoint yet; the scheduling below ignores it.
        scraper_name = data.get('scraper_module')
# Get the paper
paper = PaperMetadata.query.get(paper_id)
if not paper:
return jsonify({
"success": False,
"message": "Paper not found"
}), 404
# Get APScheduler instance
scheduler = current_app.config.get('SCHEDULER')
if not scheduler:
return jsonify({
"success": False,
"message": "APScheduler not available"
}), 500
# Schedule the paper for immediate processing via APScheduler
job_id = f"manual_paper_{paper_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
try:
scheduler.schedule_paper_processing(paper_id, delay_seconds=1, job_id=job_id)
ActivityLog.log_scraper_command(
action="manual_process_single",
status="success",
description=f"Scheduled manual processing for paper {paper.doi} via APScheduler"
)
return jsonify({
"success": True,
"message": f"Processing scheduled for paper {paper.doi}",
"paper_id": paper_id
})
except Exception as e:
return jsonify({
"success": False,
"message": f"Failed to schedule processing: {str(e)}"
}), 500
except Exception as e:
ActivityLog.log_scraper_command(
action="manual_process_single",
status="error",
description=f"Failed to process paper {paper_id}: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error processing paper: {str(e)}"
}), 500
@bp.route("/update_config", methods=["POST"])
def update_scraper_config():
"""Update scraper configuration."""
try:
data = request.get_json() or {}
# Handle volume configuration updates for daily quota
if "volume" in data:
# Import the helper function from config module
from .config import _update_volume
new_volume = data["volume"]
success, message, volume_config = _update_volume(new_volume)
if success:
ActivityLog.log_scraper_command(
action="update_volume_config",
status="success",
description=f"Updated daily volume to {new_volume} papers per day"
)
return jsonify({
"success": True,
"message": message
})
else:
return jsonify({
"success": False,
"message": message
}), 400
# Handle other configuration updates here if needed in the future
return jsonify({
"success": True,
"message": "Configuration updated successfully"
})
except Exception as e:
ActivityLog.log_scraper_command(
action="update_scraper_config",
status="error",
description=f"Failed to update scraper config: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error updating scraper config: {str(e)}"
}), 500
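
# Sketch (hedged): how this blueprint and the SCHEDULER config key used above
# are expected to be wired up in the application factory. The module path and
# the ScraperScheduler wrapper (sketched near trigger_immediate_processing)
# are assumptions for illustration only.
#
#   from flask import Flask
#
#   def create_app():
#       app = Flask(__name__)
#       from .blueprints import scraper          # hypothetical module path
#       app.register_blueprint(scraper.bp)
#       app.config['SCHEDULER'] = ScraperScheduler()  # hypothetical wrapper
#       return app
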