fixes dummy paper processing

commit 0adaed0bfa
parent 4085b47460
Author: Michael Beck
Date:   2025-04-16 16:32:52 +02:00

4 changed files with 117 additions and 31 deletions

Makefile

@@ -143,11 +143,15 @@ celery-flower: venv
 	$(PIP) install flower
 	$(CELERY) -A celery_worker:celery flower --port=5555
 
+# Run Celery beat scheduler for periodic tasks
+celery-beat: venv
+	$(CELERY) -A celery_worker:celery beat --loglevel=info
+
 # Check if Redis is running, start if needed
 redis:
 	@redis-cli ping > /dev/null 2>&1 || (echo "Starting Redis server..." && redis-server --daemonize yes)
 
-# Run complete application stack (Flask app + Celery worker + Redis)
+# Run complete application stack (Flask app + Celery worker + Redis + Beat scheduler)
 run-all: redis
-	@echo "Starting Flask and Celery..."
-	@$(MAKE) -j2 run celery
+	@echo "Starting Flask, Celery worker and Beat scheduler..."
+	@$(MAKE) -j3 run celery celery-beat
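
Note: `make run-all` now starts Redis first, then runs the Flask app, the Celery worker, and the beat scheduler in parallel via `make -j3`. Beat only dispatches the periodic tasks defined in scraper.py below; a worker must be running alongside it to actually execute them.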

celery_worker.py

@@ -1,4 +1,6 @@
 from scipaperloader.celery import celery, configure_celery
 
+# Import all task modules to ensure they are registered with Celery
+import scipaperloader.blueprints.scraper  # Import the scraper module with our tasks
 # Configure celery with Flask app
 configure_celery()
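
Note: Celery only discovers tasks in modules that actually get imported, so without this import neither the worker nor the beat schedule would see the scraper tasks. A quick way to verify the registration, assuming the project dependencies are installed and this runs from the repository root (the snippet is illustrative, not part of the commit):

    # List the task names Celery has registered after importing the module
    from celery_worker import celery

    print([name for name in celery.tasks if name.startswith("scipaperloader")])
    # e.g. ['scipaperloader.blueprints.scraper.run_periodic_dummy_scraper', ...]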

scipaperloader/blueprints/scraper.py

@@ -4,16 +4,27 @@ import time
 import math
 from datetime import datetime, timedelta
 from flask import Blueprint, jsonify, render_template, request, current_app, flash
-from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory, ScheduleConfig
+from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory, ScheduleConfig, ScraperState
 from ..db import db
 from ..celery import celery
 from ..defaults import MAX_VOLUME
+from celery.schedules import crontab
 
 bp = Blueprint("scraper", __name__, url_prefix="/scraper")
 
-# Global variables to track scraper state
-SCRAPER_ACTIVE = False
-SCRAPER_PAUSED = False
+# Setup periodic task to run every minute for testing purposes
+@celery.on_after_configure.connect
+def setup_periodic_tasks(sender, **kwargs):
+    # Run the dummy scraper every minute for testing purposes
+    sender.add_periodic_task(60.0, run_periodic_dummy_scraper.s(), name='run dummy scraper every minute')
+
+@celery.task
+def run_periodic_dummy_scraper():
+    """Periodic task to run the dummy scraper if it's active and not paused"""
+    if ScraperState.is_scraper_active():
+        dummy_scheduled_scraper.delay()
+        return True
+    return False
 
 @bp.route("/")
 def index():
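
Note: the 60-second interval is explicitly a testing value, and the newly imported `crontab` is unused in this commit, which suggests a cron-style schedule is planned. A sketch of what that swap could look like (an assumption, not part of the diff):

    from celery.schedules import crontab

    @celery.on_after_configure.connect
    def setup_periodic_tasks(sender, **kwargs):
        # Dispatch at the top of every hour instead of every 60 seconds
        sender.add_periodic_task(
            crontab(minute=0),
            run_periodic_dummy_scraper.s(),
            name="run dummy scraper hourly",
        )

Either way, the schedule only fires if the new `make celery-beat` process is running alongside the worker.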
@@ -26,22 +37,26 @@ def index():
         db.session.add(volume_config)
         db.session.commit()
 
+    # Get scraper state
+    scraper_state = ScraperState.get_current_state()
+
     return render_template(
         "scraper.html.jinja",
         volume_config=volume_config,
-        scraper_active=SCRAPER_ACTIVE,
-        scraper_paused=SCRAPER_PAUSED,
+        scraper_active=scraper_state.is_active,
+        scraper_paused=scraper_state.is_paused,
         max_volume=MAX_VOLUME
     )
 
 
 @bp.route("/start", methods=["POST"])
 def start_scraper():
     """Start the scraper."""
-    global SCRAPER_ACTIVE, SCRAPER_PAUSED
-    if not SCRAPER_ACTIVE:
-        SCRAPER_ACTIVE = True
-        SCRAPER_PAUSED = False
+    scraper_state = ScraperState.get_current_state()
+    if not scraper_state.is_active:
+        # Update scraper state
+        ScraperState.set_active(True)
+        ScraperState.set_paused(False)
 
         # Log the action
         ActivityLog.log_scraper_command(
@@ -66,11 +81,12 @@ def start_scraper():
 
 @bp.route("/stop", methods=["POST"])
 def stop_scraper():
     """Stop the scraper."""
-    global SCRAPER_ACTIVE, SCRAPER_PAUSED
-    if SCRAPER_ACTIVE:
-        SCRAPER_ACTIVE = False
-        SCRAPER_PAUSED = False
+    scraper_state = ScraperState.get_current_state()
+    if scraper_state.is_active:
+        # Update scraper state
+        ScraperState.set_active(False)
+        ScraperState.set_paused(False)
 
         ActivityLog.log_scraper_command(
             action="stop_scraper",

@@ -91,10 +107,11 @@ def stop_scraper():
 
 @bp.route("/pause", methods=["POST"])
 def pause_scraper():
     """Pause the scraper."""
-    global SCRAPER_ACTIVE, SCRAPER_PAUSED
-    if SCRAPER_ACTIVE and not SCRAPER_PAUSED:
-        SCRAPER_PAUSED = True
+    scraper_state = ScraperState.get_current_state()
+    if scraper_state.is_active and not scraper_state.is_paused:
+        # Update scraper state
+        ScraperState.set_paused(True)
 
         ActivityLog.log_scraper_command(
             action="pause_scraper",

@@ -106,8 +123,9 @@ def pause_scraper():
             "success": True,
             "message": "Scraper paused"
         })
-    elif SCRAPER_ACTIVE and SCRAPER_PAUSED:
-        SCRAPER_PAUSED = False
+    elif scraper_state.is_active and scraper_state.is_paused:
+        # Update scraper state
+        ScraperState.set_paused(False)
 
         ActivityLog.log_scraper_command(
             action="resume_scraper",

@@ -128,9 +146,11 @@ def pause_scraper():
 
 @bp.route("/status")
 def scraper_status():
     """Get the current status of the scraper."""
+    scraper_state = ScraperState.get_current_state()
+
     return jsonify({
-        "active": SCRAPER_ACTIVE,
-        "paused": SCRAPER_PAUSED,
+        "active": scraper_state.is_active,
+        "paused": scraper_state.is_paused,
         "current_hour": datetime.now().hour,
     })
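
Note: because the state now lives in the database, `/scraper/status` reports the same values the Celery worker sees. A minimal polling sketch (assumes the Flask dev server on localhost:5000 and the `requests` package, neither of which the commit specifies):

    import requests

    status = requests.get("http://localhost:5000/scraper/status").json()
    print(status["active"], status["paused"], status["current_hour"])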
@@ -348,13 +368,12 @@ def dummy_scheduled_scraper():
     The main scheduler task that runs every hour to process papers
     according to the configured schedule.
     """
 
-    global SCRAPER_ACTIVE, SCRAPER_PAUSED
-
-    if not SCRAPER_ACTIVE or SCRAPER_PAUSED:
+    # Check if scraper is active using ScraperState
+    if not ScraperState.is_scraper_active():
         ActivityLog.log_scraper_activity(
             action="scheduled_scraping",
             status="info",
-            description=f"Scheduled scraping skipped: active={SCRAPER_ACTIVE}, paused={SCRAPER_PAUSED}"
+            description=f"Scheduled scraping skipped: inactive or paused"
         )
         return False
@@ -382,9 +401,19 @@ def dummy_scheduled_scraper():
 
     # Create dummy pending papers
     for i in range(papers_to_download):
+        # Generate a unique DOI by checking if it exists in the database
+        while True:
+            random_id = random.randint(1000, 9999)
+            doi = f"10.1234/dummy-pending.{random_id}"
+
+            # Check if the DOI already exists
+            existing = PaperMetadata.query.filter_by(doi=doi).first()
+            if not existing:
+                break
+
         new_paper = PaperMetadata(
-            title=f"Dummy Pending Paper {random.randint(1000, 9999)}",
-            doi=f"10.1234/dummy-pending.{random.randint(1000, 9999)}",
+            title=f"Dummy Pending Paper {random_id}",
+            doi=doi,
             journal=random.choice([
                 "Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
                 "Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
@@ -396,7 +425,18 @@ def dummy_scheduled_scraper():
             )
             db.session.add(new_paper)
-            db.session.commit()
+    # Commit all at once after creating all papers
+    try:
+        db.session.commit()
+    except Exception as e:
+        # Log the error and rollback
+        ActivityLog.log_error(
+            error_message="Failed to create dummy pending papers",
+            exception=e,
+            source="dummy_scheduled_scraper"
+        )
+        db.session.rollback()
+        return False
 
     # Get the newly created papers
     pending_papers = PaperMetadata.query.filter_by(status="Pending").all()
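
Note: moving `db.session.commit()` out of the loop makes the batch atomic: either all dummy papers are inserted, or the rollback discards them all and the failure is logged via `ActivityLog.log_error` instead of crashing the task.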

scipaperloader/models.py

@@ -211,6 +211,46 @@ class VolumeConfig(db.Model):
     volume = db.Column(db.Float)  # volume of papers to scrape per day
 
 
+class ScraperState(db.Model):
+    """Model to store the current state of the scraper."""
+    id = db.Column(db.Integer, primary_key=True)
+    is_active = db.Column(db.Boolean, default=False)
+    is_paused = db.Column(db.Boolean, default=False)
+    last_updated = db.Column(db.DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+
+    @classmethod
+    def get_current_state(cls):
+        """Get the current scraper state, creating it if it doesn't exist."""
+        state = cls.query.first()
+        if not state:
+            state = cls(is_active=False, is_paused=False)
+            db.session.add(state)
+            db.session.commit()
+        return state
+
+    @classmethod
+    def set_active(cls, active):
+        """Set the active state of the scraper."""
+        state = cls.get_current_state()
+        state.is_active = active
+        db.session.commit()
+        return state
+
+    @classmethod
+    def set_paused(cls, paused):
+        """Set the paused state of the scraper."""
+        state = cls.get_current_state()
+        state.is_paused = paused
+        db.session.commit()
+        return state
+
+    @classmethod
+    def is_scraper_active(cls):
+        """Check if the scraper is active."""
+        state = cls.get_current_state()
+        return state.is_active and not state.is_paused
+
+
 def init_schedule_config():
     """Initialize ScheduleConfig with default values if empty"""
     if ScheduleConfig.query.count() == 0:
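
Note: replacing the module-level `SCRAPER_ACTIVE`/`SCRAPER_PAUSED` globals with a database row is the core fix here: the Flask routes and the Celery worker run in separate processes, so a flag set in one was invisible to the other. A usage sketch (assumes an active application context, since every classmethod commits through `db.session`):

    from scipaperloader.models import ScraperState

    ScraperState.set_active(True)
    ScraperState.set_paused(False)
    assert ScraperState.is_scraper_active()

    ScraperState.set_paused(True)
    assert not ScraperState.is_scraper_active()  # paused counts as inactive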