From b09c6f1b9b1b66c405bb4d01481640bec1d65104 Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Wed, 16 Apr 2025 12:28:39 +0200
Subject: [PATCH] adds initial scraper logic

---
 scipaperloader/__init__.py                  |   2 +-
 scipaperloader/blueprints/__init__.py       |   6 +-
 scipaperloader/blueprints/api.py            |  50 ++
 scipaperloader/blueprints/scraper.py        | 344 ++++
 scipaperloader/templates/nav.html.jinja     |   3 +
 scipaperloader/templates/papers.html.jinja  |  23 +-
 scipaperloader/templates/scraper.html.jinja | 581 ++++++++++++++++++++
 7 files changed, 1000 insertions(+), 9 deletions(-)
 create mode 100644 scipaperloader/blueprints/api.py
 create mode 100644 scipaperloader/blueprints/scraper.py
 create mode 100644 scipaperloader/templates/scraper.html.jinja

diff --git a/scipaperloader/__init__.py b/scipaperloader/__init__.py
index 2e7f21e..2e4151f 100644
--- a/scipaperloader/__init__.py
+++ b/scipaperloader/__init__.py
@@ -18,7 +18,7 @@ def create_app(test_config=None):
         app.config.update(test_config)
 
     db.init_app(app)
-    migrate = Migrate(app, db) # Add this line to initialize Flask-Migrate
+    migrate = Migrate(app, db)
 
     with app.app_context():
         db.create_all()
diff --git a/scipaperloader/blueprints/__init__.py b/scipaperloader/blueprints/__init__.py
index ad98b8b..c6cefe1 100644
--- a/scipaperloader/blueprints/__init__.py
+++ b/scipaperloader/blueprints/__init__.py
@@ -6,6 +6,8 @@ from .papers import bp as papers_bp
 from .upload import bp as upload_bp
 from .schedule import bp as schedule_bp
 from .logger import bp as logger_bp
+from .api import bp as api_bp
+from .scraper import bp as scraper_bp
 
 
 def register_blueprints(app: Flask):
@@ -14,4 +16,6 @@ def register_blueprints(app: Flask):
     app.register_blueprint(papers_bp, url_prefix='/papers')
     app.register_blueprint(upload_bp, url_prefix='/upload')
     app.register_blueprint(schedule_bp, url_prefix='/schedule')
-    app.register_blueprint(logger_bp, url_prefix='/logs')
\ No newline at end of file
+    app.register_blueprint(logger_bp, url_prefix='/logs')
+    app.register_blueprint(api_bp, url_prefix='/api')
+    app.register_blueprint(scraper_bp, url_prefix='/scraper')
\ No newline at end of file
diff --git a/scipaperloader/blueprints/api.py b/scipaperloader/blueprints/api.py
new file mode 100644
index 0000000..e0c3d8f
--- /dev/null
+++ b/scipaperloader/blueprints/api.py
@@ -0,0 +1,50 @@
+from datetime import datetime
+from flask import Blueprint, jsonify, request
+from ..models import ActivityLog, ActivityCategory
+
+bp = Blueprint("api", __name__, url_prefix="/api")
+
+@bp.route("/activity_logs")
+def get_activity_logs():
+    """Get activity logs with filtering options."""
+    # Get query parameters
+    category = request.args.get("category")
+    action = request.args.get("action")
+    after = request.args.get("after")
+    limit = request.args.get("limit", 20, type=int)
+
+    # Build query
+    query = ActivityLog.query
+
+    if category:
+        query = query.filter(ActivityLog.category == category)
+
+    if action:
+        query = query.filter(ActivityLog.action == action)
+
+    if after:
+        try:
+            after_date = datetime.fromisoformat(after.replace("Z", "+00:00"))
+            query = query.filter(ActivityLog.timestamp > after_date)
+        except (ValueError, TypeError):
+            pass
+
+    # Order by most recent first and limit results
+    logs = query.order_by(ActivityLog.timestamp.desc()).limit(limit).all()
+
+    # Format the results
+    result = []
+    for log in logs:
+        log_data = {
+            "id": log.id,
+            "timestamp": log.timestamp.isoformat(),
+            "category": log.category,
+            "action": log.action,
+            "description": log.description,
+            "status": log.status,
+            "paper_id": log.paper_id,
+            "extra_data": log.extra_data
+        }
+        result.append(log_data)
+
+    return jsonify(result)
\ No newline at end of file
diff --git a/scipaperloader/blueprints/scraper.py b/scipaperloader/blueprints/scraper.py
new file mode 100644
index 0000000..499174b
--- /dev/null
+++ b/scipaperloader/blueprints/scraper.py
@@ -0,0 +1,344 @@
+import random
+import json
+from datetime import datetime, timedelta
+from flask import Blueprint, jsonify, render_template, request, current_app
+from ..models import ScheduleConfig, VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory
+from ..db import db
+from ..celery import celery
+
+bp = Blueprint("scraper", __name__, url_prefix="/scraper")
+
+# Global variables to track scraper state
+SCRAPER_ACTIVE = False
+SCRAPER_PAUSED = False
+
+@bp.route("/")
+def index():
+    """Render the scraper control panel."""
+    volume_config = VolumeConfig.query.first()
+    schedule_config = {record.hour: record.weight for record in ScheduleConfig.query.all()}
+
+    return render_template(
+        "scraper.html.jinja",
+        volume_config=volume_config,
+        schedule_config=schedule_config,
+        scraper_active=SCRAPER_ACTIVE,
+        scraper_paused=SCRAPER_PAUSED
+    )
+
+@bp.route("/start", methods=["POST"])
+def start_scraper():
+    """Start the scraper."""
+    global SCRAPER_ACTIVE, SCRAPER_PAUSED
+
+    if not SCRAPER_ACTIVE:
+        SCRAPER_ACTIVE = True
+        SCRAPER_PAUSED = False
+
+        # Log the action
+        ActivityLog.log_scraper_command(
+            action="start_scraper",
+            status="success",
+            description="Scraper started manually"
+        )
+
+        # Start the scheduler task
+        task = dummy_scraper_scheduler.delay()
+
+        return jsonify({
+            "success": True,
+            "message": "Scraper started",
+            "task_id": task.id
+        })
+    else:
+        return jsonify({
+            "success": False,
+            "message": "Scraper is already running"
+        })
+
+@bp.route("/stop", methods=["POST"])
+def stop_scraper():
+    """Stop the scraper."""
+    global SCRAPER_ACTIVE, SCRAPER_PAUSED
+
+    if SCRAPER_ACTIVE:
+        SCRAPER_ACTIVE = False
+        SCRAPER_PAUSED = False
+
+        ActivityLog.log_scraper_command(
+            action="stop_scraper",
+            status="success",
+            description="Scraper stopped manually"
+        )
+
+        return jsonify({
+            "success": True,
+            "message": "Scraper stopped"
+        })
+    else:
+        return jsonify({
+            "success": False,
+            "message": "Scraper is not running"
+        })
+
+@bp.route("/pause", methods=["POST"])
+def pause_scraper():
+    """Pause the scraper."""
+    global SCRAPER_ACTIVE, SCRAPER_PAUSED
+
+    if SCRAPER_ACTIVE and not SCRAPER_PAUSED:
+        SCRAPER_PAUSED = True
+
+        ActivityLog.log_scraper_command(
+            action="pause_scraper",
+            status="success",
+            description="Scraper paused manually"
+        )
+
+        return jsonify({
+            "success": True,
+            "message": "Scraper paused"
+        })
+    elif SCRAPER_ACTIVE and SCRAPER_PAUSED:
+        SCRAPER_PAUSED = False
+
+        ActivityLog.log_scraper_command(
+            action="resume_scraper",
+            status="success",
+            description="Scraper resumed manually"
+        )
+
+        return jsonify({
+            "success": True,
+            "message": "Scraper resumed"
+        })
+    else:
+        return jsonify({
+            "success": False,
+            "message": "Scraper is not running"
+        })
+
+@bp.route("/status")
+def scraper_status():
+    """Get the current status of the scraper."""
+    return jsonify({
+        "active": SCRAPER_ACTIVE,
+        "paused": SCRAPER_PAUSED,
+        "current_hour": datetime.now().hour,
+    })
+
+@bp.route("/stats")
+def scraper_stats():
+    """Get scraper statistics for the dashboard."""
+    # Get the last 24 hours of activity
+    hours = 24
+    if request.args.get('hours'):
+        try:
+            hours = int(request.args.get('hours'))
+        except ValueError:
+            pass
+
+    cutoff_time = datetime.utcnow().replace(
+        minute=0, second=0, microsecond=0
+    )
+
+    # Get activity logs for scraper actions
+    logs = ActivityLog.query.filter(
+        ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value,
+        ActivityLog.timestamp >= cutoff_time - timedelta(hours=hours)
+    ).all()
+
+    # Group by hour and status
+    stats = {}
+    for hour in range(hours):
+        target_hour = (cutoff_time.hour - hour) % 24
+        stats[target_hour] = {
+            "success": 0,
+            "error": 0,
+            "pending": 0,
+            "hour": target_hour,
+        }
+
+    for log in logs:
+        hour = log.timestamp.hour
+        if hour in stats:
+            if log.status == "success":
+                stats[hour]["success"] += 1
+            elif log.status == "error":
+                stats[hour]["error"] += 1
+            elif log.status == "pending":
+                stats[hour]["pending"] += 1
+
+    # Convert to list for easier consumption by JavaScript
+    result = [stats[hour] for hour in sorted(stats.keys())]
+
+    return jsonify(result)
+
+@bp.route("/update_config", methods=["POST"])
+def update_config():
+    """Update scraper configuration."""
+    data = request.json
+
+    if "volume" in data:
+        try:
+            new_volume = float(data["volume"])
+            volume_config = VolumeConfig.query.first()
+            if not volume_config:
+                volume_config = VolumeConfig(volume=new_volume)
+                db.session.add(volume_config)
+            else:
+                old_value = volume_config.volume
+                volume_config.volume = new_volume
+                ActivityLog.log_config_change(
+                    config_key="scraper_volume",
+                    old_value=old_value,
+                    new_value=new_volume,
+                    description="Updated scraper volume"
+                )
+
+            db.session.commit()
+        except (ValueError, TypeError):
+            return jsonify({"success": False, "message": "Invalid volume value"})
+
+    if "schedule" in data:
+        try:
+            schedule = data["schedule"]
+
+            for hour_str, weight in schedule.items():
+                hour = int(hour_str)
+                weight = float(weight)
+
+                if 0 <= hour <= 23 and weight >= 0:
+                    schedule_config = ScheduleConfig.query.get(hour)
+                    if not schedule_config:
+                        schedule_config = ScheduleConfig(hour=hour, weight=weight)
+                        db.session.add(schedule_config)
+                    else:
+                        old_value = schedule_config.weight
+                        schedule_config.weight = weight
+                        ActivityLog.log_config_change(
+                            config_key=f"schedule_hour_{hour}",
+                            old_value=old_value,
+                            new_value=weight,
+                            description=f"Updated schedule weight for hour {hour}"
+                        )
+
+            db.session.commit()
+        except (ValueError, TypeError):
+            return jsonify({"success": False, "message": "Invalid schedule format"})
+
+    return jsonify({"success": True, "message": "Configuration updated"})
+
+# Define the Celery tasks
+@celery.task(bind=True)
+def dummy_scraper_scheduler(self):
+    """Main scheduler task for the dummy scraper."""
+    global SCRAPER_ACTIVE, SCRAPER_PAUSED
+
+    if not SCRAPER_ACTIVE:
+        return {"status": "Scraper not active"}
+
+    if SCRAPER_PAUSED:
+        return {"status": "Scraper paused"}
+
+    # Calculate how many papers to scrape based on current hour and configuration
+    current_hour = datetime.now().hour
+    hour_config = ScheduleConfig.query.get(current_hour)
+    volume_config = VolumeConfig.query.first()
+
+    if not hour_config or not volume_config:
+        return {"status": "Missing configuration"}
+
+    # Calculate papers to scrape this hour
+    hourly_rate = volume_config.volume / 24 # Base rate per hour
+    adjusted_rate = hourly_rate * (1 / hour_config.weight) # Adjust by weight
+    papers_to_scrape = int(adjusted_rate)
+
+    # Log the scheduling decision
+    ActivityLog.log_scraper_activity(
+        action="schedule_papers",
+        status="success",
+        description=f"Scheduled {papers_to_scrape} papers for scraping at hour {current_hour}",
+        hourly_rate=hourly_rate,
+        weight=hour_config.weight,
+        adjusted_rate=adjusted_rate,
+    )
+
+    # Launch individual scraping tasks
+    for _ in range(papers_to_scrape):
+        if not SCRAPER_ACTIVE or SCRAPER_PAUSED:
+            break
+
+        # Schedule a new paper to be scraped
+        dummy_scrape_paper.delay()
+
+    # Schedule the next run in 5 minutes if still active
+    if SCRAPER_ACTIVE:
+        dummy_scraper_scheduler.apply_async(countdown=300) # 5 minutes
+
+    return {"status": "success", "papers_scheduled": papers_to_scrape}
+
+@celery.task(bind=True)
+def dummy_scrape_paper(self):
+    """Simulate scraping a single paper."""
+    # Simulate success or failure
+    success = random.random() > 0.3 # 70% success rate
+
+    # Simulate processing time
+    import time
+    time.sleep(random.randint(2, 5)) # 2-5 seconds
+
+    if success:
+        # Create a dummy paper
+        new_paper = PaperMetadata(
+            title=f"Dummy Paper {random.randint(1000, 9999)}",
+            doi=f"10.1234/dummy.{random.randint(1000, 9999)}",
+            journal=random.choice([
+                "Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
+                "Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
+            ]),
+            type="article",
+            language="en",
+            published_online=datetime.now().date(),
+            status="Done",
+            file_path="/path/to/dummy/paper.pdf"
+        )
+
+        db.session.add(new_paper)
+        db.session.commit()
+
+        # Log the successful scrape
+        ActivityLog.log_scraper_activity(
+            action="scrape_paper",
+            paper_id=new_paper.id,
+            status="success",
+            description=f"Successfully scraped paper {new_paper.doi}"
+        )
+
+        return {
+            "success": True,
+            "paper_id": new_paper.id,
+            "title": new_paper.title,
+            "doi": new_paper.doi
+        }
+    else:
+        # Log the failed scrape
+        error_message = random.choice([
+            "Connection timeout",
+            "404 Not Found",
+            "Access denied",
+            "Invalid DOI format",
+            "PDF download failed",
+            "Rate limited by publisher"
+        ])
+
+        ActivityLog.log_scraper_activity(
+            action="scrape_paper",
+            status="error",
+            description=f"Failed to scrape paper: {error_message}"
+        )
+
+        return {
+            "success": False,
+            "error": error_message
+        }
\ No newline at end of file
diff --git a/scipaperloader/templates/nav.html.jinja b/scipaperloader/templates/nav.html.jinja
index 85e3809..a0dd9b4 100644
--- a/scipaperloader/templates/nav.html.jinja
+++ b/scipaperloader/templates/nav.html.jinja
@@ -7,6 +7,9 @@