fixes dummy paper processing
This commit is contained in:
parent
4085b47460
commit
0adaed0bfa
10
Makefile
10
Makefile
@ -143,11 +143,15 @@ celery-flower: venv
|
|||||||
$(PIP) install flower
|
$(PIP) install flower
|
||||||
$(CELERY) -A celery_worker:celery flower --port=5555
|
$(CELERY) -A celery_worker:celery flower --port=5555
|
||||||
|
|
||||||
|
# Run Celery beat scheduler for periodic tasks
|
||||||
|
celery-beat: venv
|
||||||
|
$(CELERY) -A celery_worker:celery beat --loglevel=info
|
||||||
|
|
||||||
# Check if Redis is running, start if needed
|
# Check if Redis is running, start if needed
|
||||||
redis:
|
redis:
|
||||||
@redis-cli ping > /dev/null 2>&1 || (echo "Starting Redis server..." && redis-server --daemonize yes)
|
@redis-cli ping > /dev/null 2>&1 || (echo "Starting Redis server..." && redis-server --daemonize yes)
|
||||||
|
|
||||||
# Run complete application stack (Flask app + Celery worker + Redis)
|
# Run complete application stack (Flask app + Celery worker + Redis + Beat scheduler)
|
||||||
run-all: redis
|
run-all: redis
|
||||||
@echo "Starting Flask and Celery..."
|
@echo "Starting Flask, Celery worker and Beat scheduler..."
|
||||||
@$(MAKE) -j2 run celery
|
@$(MAKE) -j3 run celery celery-beat
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
from scipaperloader.celery import celery, configure_celery
|
from scipaperloader.celery import celery, configure_celery
|
||||||
|
# Import all task modules to ensure they are registered with Celery
|
||||||
|
import scipaperloader.blueprints.scraper # Import the scraper module with our tasks
|
||||||
|
|
||||||
# Configure celery with Flask app
|
# Configure celery with Flask app
|
||||||
configure_celery()
|
configure_celery()
|
||||||
|
@ -4,16 +4,27 @@ import time
|
|||||||
import math
|
import math
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from flask import Blueprint, jsonify, render_template, request, current_app, flash
|
from flask import Blueprint, jsonify, render_template, request, current_app, flash
|
||||||
from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory, ScheduleConfig
|
from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory, ScheduleConfig, ScraperState
|
||||||
from ..db import db
|
from ..db import db
|
||||||
from ..celery import celery
|
from ..celery import celery
|
||||||
from ..defaults import MAX_VOLUME
|
from ..defaults import MAX_VOLUME
|
||||||
|
from celery.schedules import crontab
|
||||||
|
|
||||||
bp = Blueprint("scraper", __name__, url_prefix="/scraper")
|
bp = Blueprint("scraper", __name__, url_prefix="/scraper")
|
||||||
|
|
||||||
# Register the periodic dummy-scraper job with Celery beat.
# The handler is connected to ``on_after_finalize`` instead of
# ``on_after_configure``: this module only imports the Celery app (it is
# created in ``..celery``), and Celery documents that handlers living outside
# the module that declares the app must use the later signal, otherwise the
# schedule entry may never be registered.
@celery.on_after_finalize.connect
def setup_periodic_tasks(sender, **kwargs):
    """Add ``run_periodic_dummy_scraper`` to the beat schedule.

    Args:
        sender: The Celery application that emitted the signal; the periodic
            task is registered on it.
        **kwargs: Additional signal arguments (unused).
    """
    # Run the dummy scraper every minute for testing purposes
    sender.add_periodic_task(
        60.0,
        run_periodic_dummy_scraper.s(),
        name='run dummy scraper every minute',
    )
|
||||||
|
|
||||||
|
@celery.task
def run_periodic_dummy_scraper():
    """Dispatch the dummy scheduled scraper when the scraper may run.

    Returns:
        True if ``dummy_scheduled_scraper`` was queued, False when
        ``ScraperState.is_scraper_active()`` reported that the scraper
        should not run.
    """
    # Guard clause: nothing to queue unless the persisted state allows a run.
    if not ScraperState.is_scraper_active():
        return False

    dummy_scheduled_scraper.delay()
    return True
|
||||||
|
|
||||||
@bp.route("/")
|
@bp.route("/")
|
||||||
def index():
|
def index():
|
||||||
@ -26,22 +37,26 @@ def index():
|
|||||||
db.session.add(volume_config)
|
db.session.add(volume_config)
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
|
|
||||||
|
# Get scraper state
|
||||||
|
scraper_state = ScraperState.get_current_state()
|
||||||
|
|
||||||
return render_template(
|
return render_template(
|
||||||
"scraper.html.jinja",
|
"scraper.html.jinja",
|
||||||
volume_config=volume_config,
|
volume_config=volume_config,
|
||||||
scraper_active=SCRAPER_ACTIVE,
|
scraper_active=scraper_state.is_active,
|
||||||
scraper_paused=SCRAPER_PAUSED,
|
scraper_paused=scraper_state.is_paused,
|
||||||
max_volume=MAX_VOLUME
|
max_volume=MAX_VOLUME
|
||||||
)
|
)
|
||||||
|
|
||||||
@bp.route("/start", methods=["POST"])
|
@bp.route("/start", methods=["POST"])
|
||||||
def start_scraper():
|
def start_scraper():
|
||||||
"""Start the scraper."""
|
"""Start the scraper."""
|
||||||
global SCRAPER_ACTIVE, SCRAPER_PAUSED
|
scraper_state = ScraperState.get_current_state()
|
||||||
|
|
||||||
if not SCRAPER_ACTIVE:
|
if not scraper_state.is_active:
|
||||||
SCRAPER_ACTIVE = True
|
# Update scraper state
|
||||||
SCRAPER_PAUSED = False
|
ScraperState.set_active(True)
|
||||||
|
ScraperState.set_paused(False)
|
||||||
|
|
||||||
# Log the action
|
# Log the action
|
||||||
ActivityLog.log_scraper_command(
|
ActivityLog.log_scraper_command(
|
||||||
@ -66,11 +81,12 @@ def start_scraper():
|
|||||||
@bp.route("/stop", methods=["POST"])
|
@bp.route("/stop", methods=["POST"])
|
||||||
def stop_scraper():
|
def stop_scraper():
|
||||||
"""Stop the scraper."""
|
"""Stop the scraper."""
|
||||||
global SCRAPER_ACTIVE, SCRAPER_PAUSED
|
scraper_state = ScraperState.get_current_state()
|
||||||
|
|
||||||
if SCRAPER_ACTIVE:
|
if scraper_state.is_active:
|
||||||
SCRAPER_ACTIVE = False
|
# Update scraper state
|
||||||
SCRAPER_PAUSED = False
|
ScraperState.set_active(False)
|
||||||
|
ScraperState.set_paused(False)
|
||||||
|
|
||||||
ActivityLog.log_scraper_command(
|
ActivityLog.log_scraper_command(
|
||||||
action="stop_scraper",
|
action="stop_scraper",
|
||||||
@ -91,10 +107,11 @@ def stop_scraper():
|
|||||||
@bp.route("/pause", methods=["POST"])
|
@bp.route("/pause", methods=["POST"])
|
||||||
def pause_scraper():
|
def pause_scraper():
|
||||||
"""Pause the scraper."""
|
"""Pause the scraper."""
|
||||||
global SCRAPER_ACTIVE, SCRAPER_PAUSED
|
scraper_state = ScraperState.get_current_state()
|
||||||
|
|
||||||
if SCRAPER_ACTIVE and not SCRAPER_PAUSED:
|
if scraper_state.is_active and not scraper_state.is_paused:
|
||||||
SCRAPER_PAUSED = True
|
# Update scraper state
|
||||||
|
ScraperState.set_paused(True)
|
||||||
|
|
||||||
ActivityLog.log_scraper_command(
|
ActivityLog.log_scraper_command(
|
||||||
action="pause_scraper",
|
action="pause_scraper",
|
||||||
@ -106,8 +123,9 @@ def pause_scraper():
|
|||||||
"success": True,
|
"success": True,
|
||||||
"message": "Scraper paused"
|
"message": "Scraper paused"
|
||||||
})
|
})
|
||||||
elif SCRAPER_ACTIVE and SCRAPER_PAUSED:
|
elif scraper_state.is_active and scraper_state.is_paused:
|
||||||
SCRAPER_PAUSED = False
|
# Update scraper state
|
||||||
|
ScraperState.set_paused(False)
|
||||||
|
|
||||||
ActivityLog.log_scraper_command(
|
ActivityLog.log_scraper_command(
|
||||||
action="resume_scraper",
|
action="resume_scraper",
|
||||||
@ -128,9 +146,11 @@ def pause_scraper():
|
|||||||
@bp.route("/status")
def scraper_status():
    """Return the scraper's persisted flags and the current hour as JSON."""
    state = ScraperState.get_current_state()

    # Build the payload first so the response shape is visible at a glance.
    payload = {
        "active": state.is_active,
        "paused": state.is_paused,
        "current_hour": datetime.now().hour,
    }
    return jsonify(payload)
|
||||||
|
|
||||||
@ -348,13 +368,12 @@ def dummy_scheduled_scraper():
|
|||||||
The main scheduler task that runs every hour to process papers
|
The main scheduler task that runs every hour to process papers
|
||||||
according to the configured schedule.
|
according to the configured schedule.
|
||||||
"""
|
"""
|
||||||
global SCRAPER_ACTIVE, SCRAPER_PAUSED
|
# Check if scraper is active using ScraperState
|
||||||
|
if not ScraperState.is_scraper_active():
|
||||||
if not SCRAPER_ACTIVE or SCRAPER_PAUSED:
|
|
||||||
ActivityLog.log_scraper_activity(
|
ActivityLog.log_scraper_activity(
|
||||||
action="scheduled_scraping",
|
action="scheduled_scraping",
|
||||||
status="info",
|
status="info",
|
||||||
description=f"Scheduled scraping skipped: active={SCRAPER_ACTIVE}, paused={SCRAPER_PAUSED}"
|
description=f"Scheduled scraping skipped: inactive or paused"
|
||||||
)
|
)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -382,9 +401,19 @@ def dummy_scheduled_scraper():
|
|||||||
|
|
||||||
# Create dummy pending papers
|
# Create dummy pending papers
|
||||||
for i in range(papers_to_download):
|
for i in range(papers_to_download):
|
||||||
|
# Generate a unique DOI by checking if it exists in the database
|
||||||
|
while True:
|
||||||
|
random_id = random.randint(1000, 9999)
|
||||||
|
doi = f"10.1234/dummy-pending.{random_id}"
|
||||||
|
|
||||||
|
# Check if the DOI already exists
|
||||||
|
existing = PaperMetadata.query.filter_by(doi=doi).first()
|
||||||
|
if not existing:
|
||||||
|
break
|
||||||
|
|
||||||
new_paper = PaperMetadata(
|
new_paper = PaperMetadata(
|
||||||
title=f"Dummy Pending Paper {random.randint(1000, 9999)}",
|
title=f"Dummy Pending Paper {random_id}",
|
||||||
doi=f"10.1234/dummy-pending.{random.randint(1000, 9999)}",
|
doi=doi,
|
||||||
journal=random.choice([
|
journal=random.choice([
|
||||||
"Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
|
"Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
|
||||||
"Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
|
"Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
|
||||||
@ -396,7 +425,18 @@ def dummy_scheduled_scraper():
|
|||||||
)
|
)
|
||||||
db.session.add(new_paper)
|
db.session.add(new_paper)
|
||||||
|
|
||||||
db.session.commit()
|
# Commit all at once after creating all papers
|
||||||
|
try:
|
||||||
|
db.session.commit()
|
||||||
|
except Exception as e:
|
||||||
|
# Log the error and rollback
|
||||||
|
ActivityLog.log_error(
|
||||||
|
error_message="Failed to create dummy pending papers",
|
||||||
|
exception=e,
|
||||||
|
source="dummy_scheduled_scraper"
|
||||||
|
)
|
||||||
|
db.session.rollback()
|
||||||
|
return False
|
||||||
|
|
||||||
# Get the newly created papers
|
# Get the newly created papers
|
||||||
pending_papers = PaperMetadata.query.filter_by(status="Pending").all()
|
pending_papers = PaperMetadata.query.filter_by(status="Pending").all()
|
||||||
|
@ -211,6 +211,46 @@ class VolumeConfig(db.Model):
|
|||||||
volume = db.Column(db.Float) # volume of papers to scrape per day
|
volume = db.Column(db.Float) # volume of papers to scrape per day
|
||||||
|
|
||||||
|
|
||||||
|
class ScraperState(db.Model):
    """Database-backed record of whether the scraper is active and/or paused.

    All accessors go through ``get_current_state()``, which reads the first
    row of the table and creates one if none exists.
    """

    id = db.Column(db.Integer, primary_key=True)
    # Both flags default to False for newly inserted rows.
    is_active = db.Column(db.Boolean, default=False)
    is_paused = db.Column(db.Boolean, default=False)
    # Stamped on insert and refreshed on every update of the row.
    last_updated = db.Column(
        db.DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
    )

    @classmethod
    def get_current_state(cls):
        """Return the stored state row, inserting an inactive one if missing."""
        existing = cls.query.first()
        if existing:
            return existing

        # No row yet: persist a fresh, inactive and unpaused state.
        created = cls(is_active=False, is_paused=False)
        db.session.add(created)
        db.session.commit()
        return created

    @classmethod
    def set_active(cls, active):
        """Store ``active`` as the scraper's active flag and return the row."""
        row = cls.get_current_state()
        row.is_active = active
        db.session.commit()
        return row

    @classmethod
    def set_paused(cls, paused):
        """Store ``paused`` as the scraper's paused flag and return the row."""
        row = cls.get_current_state()
        row.is_paused = paused
        db.session.commit()
        return row

    @classmethod
    def is_scraper_active(cls):
        """Tell whether the scraper is active and not paused."""
        row = cls.get_current_state()
        if not row.is_active:
            # Hand back the stored falsy value itself, as `a and not b` would.
            return row.is_active
        return not row.is_paused
|
||||||
|
|
||||||
|
|
||||||
def init_schedule_config():
|
def init_schedule_config():
|
||||||
"""Initialize ScheduleConfig with default values if empty"""
|
"""Initialize ScheduleConfig with default values if empty"""
|
||||||
if ScheduleConfig.query.count() == 0:
|
if ScheduleConfig.query.count() == 0:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user