added dummy scraper

Michael Beck 2025-04-16 15:57:35 +02:00
parent 1f0fb5e990
commit f36fc53b26
2 changed files with 224 additions and 5 deletions

scipaperloader/blueprints/scraper.py

@@ -1,8 +1,10 @@
 import random
 import json
+import time
+import math

 from datetime import datetime
 from flask import Blueprint, jsonify, render_template, request, current_app, flash
-from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory
+from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory, ScheduleConfig
 from ..db import db
 from ..celery import celery
@@ -43,13 +45,15 @@ def start_scraper():
         ActivityLog.log_scraper_command(
             action="start_scraper",
             status="success",
-            description="Scraper started manually"
+            description="Scheduled scraper started - will follow hourly configuration"
         )

-        # Trigger the schedule.py to start actual scheduling
+        # Immediately trigger a task to test the scheduler and provide feedback
+        dummy_scheduled_scraper.delay()
+
         return jsonify({
             "success": True,
-            "message": "Scraper started"
+            "message": "Scraper started - following hourly schedule configuration"
         })
     else:
         return jsonify({
@@ -282,4 +286,210 @@ def dummy_scrape_paper(self):
        return {
            "success": False,
            "error": error_message
        }


@celery.task
def calculate_papers_for_current_hour():
    """
    Calculate how many papers should be downloaded in the current hour
    based on schedule configuration.

    Returns:
        int: Number of papers to download this hour
    """
    current_hour = datetime.now().hour

    # Get volume configuration
    volume_config = VolumeConfig.query.first()
    if not volume_config:
        volume_config = VolumeConfig(volume=100)  # Default to 100 papers per day
        db.session.add(volume_config)
        db.session.commit()

    # Get all schedule configurations to calculate total weight
    schedule_configs = ScheduleConfig.query.all()
    if not schedule_configs:
        # If no schedule configs, create default with equal weights
        for hour in range(24):
            config = ScheduleConfig(hour=hour, weight=1.0)
            db.session.add(config)
        db.session.commit()
        schedule_configs = ScheduleConfig.query.all()

    # Calculate total weight across all hours
    total_weight = sum(config.weight for config in schedule_configs)

    # Find the weight for the current hour
    current_hour_config = ScheduleConfig.query.get(current_hour)
    if not current_hour_config:
        # Create config for current hour if it doesn't exist
        current_hour_config = ScheduleConfig(hour=current_hour, weight=1.0)
        db.session.add(current_hour_config)
        db.session.commit()

    # Calculate papers for current hour: (hour_weight / total_weight) * daily_volume
    if total_weight > 0:
        weight_ratio = current_hour_config.weight / total_weight
        papers_this_hour = math.floor(weight_ratio * volume_config.volume)
    else:
        papers_this_hour = 0

    return papers_this_hour
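
As a quick sanity check of the allocation formula above, here is a standalone sketch that uses the fallback values the task creates (a daily volume of 100 and an equal weight of 1.0 for all 24 hours); the numbers are illustrative only and do not come from a real database:

# Standalone illustration of the per-hour share; no Flask or SQLAlchemy involved.
import math

daily_volume = 100                            # mirrors the VolumeConfig fallback above
weights = {hour: 1.0 for hour in range(24)}   # mirrors the equal-weight ScheduleConfig fallback
total_weight = sum(weights.values())          # 24.0

papers_at_hour_9 = math.floor((weights[9] / total_weight) * daily_volume)
print(papers_at_hour_9)                       # 4, i.e. roughly 4 papers scheduled for that hour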

@celery.task
def dummy_scheduled_scraper():
    """
    The main scheduler task that runs every hour to process papers
    according to the configured schedule.
    """
    global SCRAPER_ACTIVE, SCRAPER_PAUSED

    if not SCRAPER_ACTIVE or SCRAPER_PAUSED:
        ActivityLog.log_scraper_activity(
            action="scheduled_scraping",
            status="info",
            description=f"Scheduled scraping skipped: active={SCRAPER_ACTIVE}, paused={SCRAPER_PAUSED}"
        )
        return False

    # Calculate how many papers to download this hour
    papers_to_download = calculate_papers_for_current_hour()

    if papers_to_download <= 0:
        ActivityLog.log_scraper_activity(
            action="scheduled_scraping",
            status="info",
            description="No papers scheduled for current hour"
        )
        return True

    # Get all pending papers
    pending_papers = PaperMetadata.query.filter_by(status="Pending").all()

    # If no pending papers available, create some dummy pending papers
    if not pending_papers:
        ActivityLog.log_scraper_activity(
            action="scheduled_scraping",
            status="info",
            description=f"No pending papers found - creating {papers_to_download} dummy pending papers"
        )

        # Create dummy pending papers
        for i in range(papers_to_download):
            new_paper = PaperMetadata(
                title=f"Dummy Pending Paper {random.randint(1000, 9999)}",
                doi=f"10.1234/dummy-pending.{random.randint(1000, 9999)}",
                journal=random.choice([
                    "Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
                    "Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
                ]),
                type="article",
                language="en",
                published_online=datetime.now().date(),
                status="Pending"
            )
            db.session.add(new_paper)

        db.session.commit()

        # Get the newly created papers
        pending_papers = PaperMetadata.query.filter_by(status="Pending").all()

    # Select papers_to_download random papers from pending_papers
    selected_papers = random.sample(
        pending_papers,
        min(papers_to_download, len(pending_papers))
    )

    ActivityLog.log_scraper_activity(
        action="scheduled_scraping",
        status="info",
        description=f"Starting scheduled scraping of {len(selected_papers)} papers for hour {datetime.now().hour}"
    )

    # For each paper, schedule it to run at a random time within the hour
    current_time = time.time()
    one_hour_in_seconds = 3600

    for paper in selected_papers:
        # Random delay within this hour (0 to 60 minutes)
        random_delay = random.randint(0, one_hour_in_seconds)

        # Schedule the dummy_process_paper task with the random delay
        dummy_process_paper.apply_async(
            args=[paper.id],
            countdown=random_delay
        )

    return True

@celery.task(bind=True)
def dummy_process_paper(self, paper_id):
    """
    Process a single paper for the dummy scraper.

    Args:
        paper_id (int): ID of the paper to process
    """
    # Get the paper from database
    paper = PaperMetadata.query.get(paper_id)
    if not paper:
        # Log error if paper not found
        ActivityLog.log_scraper_activity(
            action="process_paper",
            status="error",
            description=f"Paper with ID {paper_id} not found"
        )
        return False

    # Simulate random success/failure (70% success rate)
    success = random.random() < 0.7

    # Simulate processing time (1-5 seconds)
    process_time = random.uniform(1, 5)
    time.sleep(process_time)

    if success:
        # Update paper status to "Done"
        paper.status = "Done"
        paper.file_path = f"/path/to/dummy/papers/{paper.doi.replace('/', '_')}.pdf"

        # Log success
        ActivityLog.log_scraper_activity(
            action="process_paper",
            paper_id=paper.id,
            status="success",
            description=f"Successfully processed paper: {paper.title}"
        )
    else:
        # Update paper status to "Failed"
        paper.status = "Failed"

        # Generate random error message
        error_message = random.choice([
            "Publisher website unavailable",
            "No PDF download link found",
            "Access restricted",
            "Download timeout",
            "Invalid DOI",
            "Rate limited by publisher"
        ])
        paper.error_msg = error_message

        # Log failure
        ActivityLog.log_scraper_activity(
            action="process_paper",
            paper_id=paper.id,
            status="error",
            description=f"Failed to process paper: {error_message}"
        )

    # Update the timestamp
    paper.updated_at = datetime.utcnow()

    # Commit changes to database
    db.session.commit()

    return success
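
Both scheduler tasks above rely on a ScheduleConfig model that is imported from ..models but not included in this commit; the call ScheduleConfig.query.get(current_hour) suggests the hour itself is the primary key. A minimal sketch of what that model presumably looks like, with field names inferred from the usage above rather than from the actual models file:

# Hypothetical sketch of the ScheduleConfig model assumed by the scheduler tasks;
# the real definition lives in the package's models module and may differ.
from .db import db  # assumed Flask-SQLAlchemy instance, as used elsewhere in the package


class ScheduleConfig(db.Model):
    __tablename__ = "schedule_config"

    hour = db.Column(db.Integer, primary_key=True)              # 0-23, looked up via .query.get(hour)
    weight = db.Column(db.Float, nullable=False, default=1.0)   # relative share of the daily volume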

scipaperloader/celery.py

@@ -1,4 +1,5 @@
 from celery import Celery
+from celery.schedules import crontab

 # Create Celery instance without Flask app initially
 celery = Celery(
@@ -29,6 +30,14 @@ def configure_celery(app=None):
         worker_max_memory_per_child=1000000,  # 1GB memory limit
         task_acks_late=True,  # Acknowledge tasks after completion
         task_reject_on_worker_lost=True,  # Requeue tasks if worker dies
+        # Configure Beat schedule for periodic tasks
+        beat_schedule={
+            'scheduled-scraper-hourly': {
+                'task': 'scipaperloader.blueprints.scraper.dummy_scheduled_scraper',
+                'schedule': crontab(minute=0),  # Run at the start of every hour
+                'options': {'expires': 3600}
+            },
+        }
     )

     # Create a custom task class that pushes the Flask application context
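
The beat_schedule entry above only fires if a Celery beat process is running alongside the worker. Assuming the Celery app lives at scipaperloader.celery (inferred from the task path in the schedule, not shown in this diff), the two processes would be started roughly like this:

# module path is an assumption; adjust to wherever the celery instance is created
celery -A scipaperloader.celery worker --loglevel=info
celery -A scipaperloader.celery beat --loglevel=info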