Compare commits

..

21 Commits

Author SHA1 Message Date
fe202b56d0 fixes logging and scraper start / stop task planning 2025-06-13 13:56:46 +02:00
24f9eb5766 makes logger much more beautiful 2025-06-13 12:57:54 +02:00
4a10052eae redesign of logger frontend to streamline and unify all logger views 2025-06-13 12:30:44 +02:00
8f064cda34 adds timezone config option 2025-06-13 11:47:41 +02:00
7fd403bd40 timezone fix 2025-06-13 11:14:06 +02:00
a7964a2f3d adds scraper modules and modular publisher parser system 2025-06-13 10:11:59 +02:00
ce6bc03b46 new landing page 2025-06-11 23:54:17 +02:00
70e2e2e900 again lol 2025-06-11 23:50:29 +02:00
793f6f9a7e fix 2025-06-11 23:48:46 +02:00
8f84774880 fixes flash messages for single paper scraping 2025-06-11 23:48:38 +02:00
98901ce38e also fixes flash messages 2025-06-11 23:45:19 +02:00
d730137d20 fixes flash messages 2025-06-11 23:44:01 +02:00
e2ae95cea0 fix api path in config js 2025-06-11 23:15:25 +02:00
676a3c96eb adds pagination to scraper and improves timestamp formatting 2025-06-11 23:11:49 +02:00
7a1ab3d7e6 fixes scraper activity chart 2025-06-11 22:25:35 +02:00
a4eb7648d5 fixes scraper 2025-06-11 21:32:01 +02:00
88e180bc94 creates timeline for scraper activity 2025-06-11 14:03:35 +02:00
5c5afefe40 modularizes the templates' js 2025-06-11 11:37:09 +02:00
8ffcf4d65c fix some ui stuff 2025-06-10 19:40:28 +02:00
ceeb6c375d refactor to apscheduler instead of redis and celery 2025-06-10 19:14:59 +02:00
3b42010fab fixes scheduling 2025-06-10 11:40:36 +02:00
67 changed files with 9439 additions and 2365 deletions

.gitignore vendored
View File

@ -17,4 +17,5 @@ dist/
migrations/
celerybeat-schedule*
# APScheduler job store files
jobs.sqlite

View File

@ -1,10 +1,9 @@
# List of phony targets (targets that don't represent files)
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev celery celery-flower redis run-all diagnostics
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev run-scheduler diagnostics clean-papers purge-db
# Define Python and pip executables inside virtual environment
PYTHON := venv/bin/python
PIP := venv/bin/pip
CELERY := venv/bin/celery
FLASK := venv/bin/flask
# Default target that runs the application
@ -15,7 +14,7 @@ clean:
rm -rf venv build dist .pytest_cache .mypy_cache *.egg-info
# Define database path
DB_PATH=scipaperloader/papers.db
DB_PATH=instance/papers.db
# Backup the database with timestamp
backup-db:
@ -91,6 +90,24 @@ reset-db: venv
$(PYTHON) -m flask --app scipaperloader db migrate -m "Initial migration"
$(PYTHON) -m flask --app scipaperloader db upgrade
# Clean all papers from the database (keep other tables intact)
clean-papers: venv
@echo "Cleaning all papers from the database..."
@$(PYTHON) -c "from scipaperloader.db import db; from scipaperloader.models import PaperMetadata; from scipaperloader import create_app; app = create_app(); app.app_context().push(); PaperMetadata.query.delete(); db.session.commit(); print('All papers have been removed from the database')"
# Completely purge all database contents (removes all tables and data)
purge-db: venv
@echo "WARNING: This will completely wipe all database contents!"
@read -p "Are you sure you want to continue? (y/N) " -n 1 -r; \
echo; \
if [[ $$REPLY =~ ^[Yy]$$ ]]; then \
echo "Purging database..."; \
rm -f $(DB_PATH); \
echo "Database completely purged"; \
else \
echo "Operation cancelled"; \
fi
# Create and set up virtual environment
venv:
python3 -m venv venv && \
@ -133,65 +150,12 @@ dist: format-check lint mypy test
# Set up complete development environment
dev: clean venv
# Start Celery worker - PURGE FIRST
celery: venv redis
@echo "Purging Celery task queue before starting worker..."
# Purge the queue forcefully. Ignore errors if queue is empty/unreachable initially.
@-$(CELERY) -A celery_worker:celery purge -f
@echo "Starting Celery worker..."
$(CELERY) -A celery_worker:celery worker --loglevel=info
# Monitor Celery tasks with flower web interface
celery-flower: venv
$(PIP) install flower
$(CELERY) -A celery_worker:celery flower --port=5555
# Run Celery beat scheduler for periodic tasks
celery-beat: venv redis
@echo "Starting Celery beat scheduler..."
# Ensure celerybeat-schedule file is removed for clean start if needed
@-rm -f celerybeat-schedule.db
# Use the default file-based scheduler (removed the --scheduler flag)
$(CELERY) -A celery_worker:celery beat --loglevel=info
# Check if Redis is running, start if needed
redis:
@if ! redis-cli ping > /dev/null 2>&1; then \
echo "Starting Redis server..."; \
redis-server --daemonize yes; \
sleep 1; \
else \
echo "Redis is already running."; \
fi
# Run complete application stack (Flask app + Celery worker + Redis + Beat scheduler)
run-all: redis
@echo "Starting Flask, Celery worker and Beat scheduler..."
# Run them in parallel. Ctrl+C will send SIGINT to make, which propagates.
# Use trap to attempt cleanup, but primary cleanup is purge on next start.
@trap '$(MAKE) stop-all;' INT TERM; \
$(MAKE) -j3 run celery celery-beat & wait
# Stop running Celery worker and beat gracefully
stop-celery:
@echo "Attempting graceful shutdown of Celery worker and beat..."
@-pkill -TERM -f "celery -A celery_worker:celery worker" || echo "Worker not found or already stopped."
@-pkill -TERM -f "celery -A celery_worker:celery beat" || echo "Beat not found or already stopped."
@sleep 1 # Give processes a moment to terminate
@echo "Purging remaining tasks from Celery queue..."
@-$(CELERY) -A celery_worker:celery purge -f || echo "Purge failed or queue empty."
# Stop Flask development server
stop-flask:
@echo "Attempting shutdown of Flask development server..."
@-pkill -TERM -f "flask --app scipaperloader --debug run" || echo "Flask server not found or already stopped."
# Stop all components potentially started by run-all
stop-all: stop-celery stop-flask
@echo "All components stopped."
# Start the APScheduler-enabled Flask application
run-scheduler: venv
@echo "Starting Flask app with APScheduler..."
$(PYTHON) -m flask --app scipaperloader --debug run
# Run diagnostic tools
# Run diagnostic tools - works with or without virtualenv
diagnostics:
$(PYTHON) tools/run_diagnostics.py

View File

@ -15,7 +15,6 @@ And open it in the browser at [http://localhost:5000/](http://localhost:5000/)
## Prerequisites
- Python >=3.8
- Redis (for Celery task queue)
## Development environment
@ -41,30 +40,39 @@ And open it in the browser at [http://localhost:5000/](http://localhost:5000/)
add development dependencies under `project.optional-dependencies.*`; run
`make clean && make venv` to reinstall the environment
## Asynchronous Task Processing with Celery
## Task Processing Architecture
SciPaperLoader uses Celery for processing large CSV uploads and other background tasks. This allows the application to handle large datasets reliably without blocking the web interface.
SciPaperLoader uses **APScheduler** for all task processing:
### Running Celery Components
- **Periodic Tasks**: Hourly scraper scheduling with randomized paper processing
- **Background Tasks**: CSV uploads, manual paper processing, and all async operations
- **Job Management**: Clean job scheduling, revocation, and status tracking
- `make redis`: ensures Redis server is running (required for Celery)
This unified architecture provides reliable task processing with simple, maintainable code.
- `make celery`: starts a Celery worker to process background tasks
### Running Components
- `make celery-flower`: starts Flower, a web interface for monitoring Celery tasks at http://localhost:5555
- `make run`: starts the Flask application with integrated APScheduler
- `make run-all`: runs the entire stack (Flask app + Celery worker + Redis) in development mode
For development monitoring:
- Access the Flask admin interface for APScheduler job monitoring
- View real-time logs in the application's activity log section
### How It Works
When you upload a CSV file through the web interface:
**For CSV Uploads:**
1. File is uploaded through the web interface
2. APScheduler creates a background job to process the file
3. Browser shows progress updates via AJAX polling
4. Results are displayed when processing completes
1. The file is sent to the server
2. A Celery task is created to process the file asynchronously
3. The browser shows a progress bar with real-time updates
4. The results are displayed when processing is complete
**For Scheduled Scraping:**
1. APScheduler runs hourly at the top of each hour
2. Papers are selected based on volume and schedule configuration
3. Individual paper processing jobs are scheduled at random times within the hour
4. All jobs are tracked in the database with complete visibility
This architecture allows SciPaperLoader to handle CSV files with thousands of papers without timing out or blocking the web interface.
This unified architecture provides reliable task processing without external dependencies.
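The scheduler module itself is not part of this diff, so the following is only a minimal sketch of the setup the paragraphs above describe — the SQLAlchemy job store, the `hourly_scraper_scheduler` callable, and the timezone are assumptions for illustration, not code from this changeset:

```python
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
from apscheduler.triggers.cron import CronTrigger


def hourly_scraper_scheduler():
    # Placeholder for the hourly job that selects and schedules papers.
    pass


def build_scheduler(database_uri: str) -> BackgroundScheduler:
    """Create a scheduler that persists jobs in the application's database."""
    scheduler = BackgroundScheduler(
        jobstores={"default": SQLAlchemyJobStore(url=database_uri)},
        timezone="UTC",
    )
    # Run at the top of every hour, matching the behaviour described above.
    scheduler.add_job(
        func=hourly_scraper_scheduler,
        trigger=CronTrigger(minute=0),
        id="hourly_scraper_scheduler",
        replace_existing=True,
    )
    scheduler.start()
    return scheduler
```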
## Configuration
@ -72,12 +80,12 @@ Default configuration is loaded from `scipaperloader.defaults` and can be
overridden by environment variables with a `FLASK_` prefix. See
[Configuring from Environment Variables](https://flask.palletsprojects.com/en/3.0.x/config/#configuring-from-environment-variables).
### Celery Configuration
### Task Processing Configuration
The following environment variables can be set to configure Celery:
APScheduler automatically uses your configured database for job persistence. No additional configuration required.
- `FLASK_CELERY_BROKER_URL`: Redis URL for the message broker (default: `redis://localhost:6379/0`)
- `FLASK_CELERY_RESULT_BACKEND`: Redis URL for storing task results (default: `redis://localhost:6379/0`)
For advanced configuration, you can set:
- `FLASK_SQLALCHEMY_DATABASE_URI`: Database URL (APScheduler uses the same database)
Consider using
[dotenv](https://flask.palletsprojects.com/en/3.0.x/cli/#environment-variables-from-dotenv).
@ -115,17 +123,18 @@ You must set a
[SECRET_KEY](https://flask.palletsprojects.com/en/3.0.x/tutorial/deploy/#configure-the-secret-key)
in production to a secret and stable value.
### Deploying with Celery
### Deploying with APScheduler
When deploying to production:
1. Configure a production-ready Redis instance or use a managed service
2. Run Celery workers as system services or in Docker containers
3. Consider setting up monitoring for your Celery tasks and workers
1. APScheduler jobs are automatically persistent in your database
2. The Flask application handles all background processing internally
3. No external message broker or workers required
4. Scale by running multiple Flask instances with shared database
## Troubleshooting and Diagnostics
SciPaperLoader includes a collection of diagnostic and emergency tools to help address issues with the application, particularly with the scraper and Celery task system.
SciPaperLoader includes a collection of diagnostic and emergency tools to help address issues with the application, particularly with the scraper and APScheduler task system.
### Quick Access
@ -151,7 +160,7 @@ All diagnostic tools are located in the `tools/diagnostics/` directory:
- **check_state.py**: Quickly check the current state of the scraper in the database
- **diagnose_scraper.py**: Comprehensive diagnostic tool that examines tasks, logs, and scraper state
- **inspect_tasks.py**: View currently running, scheduled, and reserved Celery tasks
- **inspect_tasks.py**: View currently running and scheduled APScheduler tasks
- **test_reversion.py**: Test the paper reversion functionality when stopping the scraper
### Emergency Recovery
@ -159,7 +168,7 @@ All diagnostic tools are located in the `tools/diagnostics/` directory:
For cases where the scraper is stuck or behaving unexpectedly:
- **emergency_stop.py**: Force stops all scraper activities, revokes all running tasks, and reverts papers from "Pending" state
- **quick_fix.py**: Simplified emergency stop that also restarts Celery workers to ensure code changes are applied
- **quick_fix.py**: Simplified emergency stop that also stops Flask processes to ensure code changes are applied
### Usage Example

View File

@ -1,11 +0,0 @@
from scipaperloader.celery import celery, configure_celery
# Import all task modules to ensure they are registered with Celery
import scipaperloader.scrapers.tasks # Import new scheduler tasks
import scipaperloader.blueprints.scraper # Import the scraper module with our tasks
# Configure celery with Flask app
configure_celery()
if __name__ == '__main__':
# Start the Celery worker
celery.start(['worker', '--loglevel=info', '--concurrency=2'])

BIN
dump.rdb

Binary file not shown.

View File

@ -13,10 +13,10 @@ dependencies = [
"flask-wtf>=1.2.2,<2",
"pyzotero>=1.6.11,<2",
"pandas>=2.2.3,<3",
"celery>=5.5.1,<6",
"redis>=5.2.1,<6",
"flower>=2.0.1,<3",
"APScheduler>=3.10.4,<4",
"flask-migrate>=4.1.0,<5",
"beautifulsoup4>=4.13.4,<5 ",
"requests>=2.32.4,<3"
]
[project.optional-dependencies]

View File

@ -5,14 +5,23 @@ from .db import db
from .models import init_schedule_config
from .models import ActivityLog, ActivityCategory
from .blueprints import register_blueprints
from .scheduler import ScraperScheduler
def create_app(test_config=None):
app = Flask(__name__)
app = Flask(__name__, instance_relative_config=True)
app.config.from_object(Config)
# Celery configuration
app.config['CELERY_BROKER_URL'] = app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0')
app.config['CELERY_RESULT_BACKEND'] = app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0')
# Ensure the instance folder exists
import os
try:
os.makedirs(app.instance_path)
except OSError:
pass
# Set the database URI to use absolute path if it's the default relative path
if app.config['SQLALCHEMY_DATABASE_URI'] == "sqlite:///instance/papers.db":
db_path = os.path.join(app.instance_path, 'papers.db')
app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{db_path}'
if test_config:
app.config.update(test_config)
@ -24,6 +33,12 @@ def create_app(test_config=None):
db.create_all()
init_schedule_config()
# Initialize APScheduler
scheduler = ScraperScheduler(app)
# Store scheduler in app config for access from other modules
app.config['SCHEDULER'] = scheduler
@app.context_processor
def inject_app_title():
return {"app_title": app.config["APP_TITLE"]}

View File

@ -2,7 +2,7 @@
from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify, current_app
from ..db import db
# Import the new model
from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig, PaperMetadata
from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig, PaperMetadata, TimezoneConfig
from ..defaults import MAX_VOLUME
import os # Import os for path validation
import sys
@ -129,6 +129,54 @@ def _update_download_path(new_path):
return False, f"Error updating download path: {str(e)}", None
def _update_timezone(new_timezone):
"""
Helper function to update timezone configuration.
Args:
new_timezone (str): The new timezone
Returns:
tuple: (success, message, timezone_config)
"""
try:
# Basic validation: check if it's a non-empty string
if not new_timezone or not isinstance(new_timezone, str):
return False, "Timezone cannot be empty.", None
# Validate timezone using pytz
try:
import pytz
pytz.timezone(new_timezone) # This will raise an exception if invalid
except ImportError:
# If pytz is not available, do basic validation
if '/' not in new_timezone:
return False, "Invalid timezone format. Use format like 'Europe/Berlin'.", None
except pytz.exceptions.UnknownTimeZoneError:
return False, f"Unknown timezone: {new_timezone}. Use format like 'Europe/Berlin'.", None
config = TimezoneConfig.query.first()
if not config:
config = TimezoneConfig(timezone=new_timezone)
db.session.add(config)
else:
old_value = config.timezone
config.timezone = new_timezone
ActivityLog.log_config_change(
config_key="scheduler_timezone",
old_value=old_value,
new_value=new_timezone,
description="Updated scheduler timezone"
)
db.session.commit()
return True, "Timezone updated successfully!", config
except Exception as e:
db.session.rollback()
return False, f"Error updating timezone: {str(e)}", None
def _update_schedule(schedule_data):
"""
Helper function to update schedule configuration.
@ -211,11 +259,19 @@ def general():
db.session.add(download_path_config)
db.session.commit()
# Fetch timezone config
timezone_config = TimezoneConfig.query.first()
if not timezone_config:
timezone_config = TimezoneConfig() # Use default from model
db.session.add(timezone_config)
db.session.commit()
return render_template(
"config/index.html.jinja",
active_tab="general",
volume_config=volume_config,
download_path_config=download_path_config, # Pass to template
timezone_config=timezone_config, # Pass to template
max_volume=MAX_VOLUME,
app_title="Configuration"
)
@ -369,9 +425,10 @@ def generate_test_papers():
@bp.route("/update/general", methods=["POST"])
def update_general():
"""Update general configuration (Volume and Download Path)."""
"""Update general configuration (Volume, Download Path, and Timezone)."""
volume_success, volume_message = True, ""
path_success, path_message = True, ""
timezone_success, timezone_message = True, ""
# Update Volume
new_volume = request.form.get("total_volume")
@ -391,6 +448,15 @@ def update_general():
else:
flash(path_message, "error")
# Update Timezone
new_timezone = request.form.get("timezone")
if new_timezone is not None:
timezone_success, timezone_message, _ = _update_timezone(new_timezone)
if timezone_success:
flash(timezone_message, "success")
else:
flash(timezone_message, "error")
return redirect(url_for("config.general"))

View File

@ -2,7 +2,7 @@
import csv
import io
import datetime
from flask import Blueprint, render_template, request, send_file
from flask import Blueprint, render_template, request, send_file, jsonify
from ..db import db
from ..models import ActivityLog, ActivityCategory
@ -11,11 +11,11 @@ bp = Blueprint("logger", __name__, url_prefix="/logs")
@bp.route("/")
def list_logs():
page = request.args.get("page", 1, type=int)
per_page = 50
# For the new modern view, we only need to provide initial filter values and categories
# The actual data loading will be handled by JavaScript via the API endpoint
# Filters
category = request.args.get("category")
# Get filter parameters for initial state
categories_param = request.args.getlist("category") # Get multiple categories
start_date = request.args.get("start_date")
end_date = request.args.get("end_date")
search_term = request.args.get("search_term")
@ -23,33 +23,12 @@ def list_logs():
if search_term == "None":
search_term = None
query = ActivityLog.query
if category:
query = query.filter(ActivityLog.category == category)
if start_date:
start_date_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d")
query = query.filter(ActivityLog.timestamp >= start_date_dt)
if end_date:
end_date_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=1)
query = query.filter(ActivityLog.timestamp <= end_date_dt)
if search_term:
query = query.filter(db.or_(
ActivityLog.action.contains(search_term),
ActivityLog.description.contains(search_term)
))
pagination = query.order_by(ActivityLog.timestamp.desc()).paginate(page=page, per_page=per_page, error_out=False)
categories = [e.value for e in ActivityCategory]
return render_template(
"logger.html.jinja",
logs=pagination.items,
pagination=pagination,
"logs.html.jinja",
categories=categories,
category=category,
selected_categories=categories_param, # Pass selected categories
start_date=start_date,
end_date=end_date,
search_term=search_term,
@ -60,15 +39,15 @@ def list_logs():
@bp.route("/download")
def download_logs():
# Filters - reuse logic from list_logs
category = request.args.get("category")
categories = request.args.getlist("category") # Get multiple categories
start_date = request.args.get("start_date")
end_date = request.args.get("end_date")
search_term = request.args.get("search_term")
query = ActivityLog.query
if category:
query = query.filter(ActivityLog.category == category)
if categories:
query = query.filter(ActivityLog.category.in_(categories))
if start_date:
start_date_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d")
query = query.filter(ActivityLog.timestamp >= start_date_dt)
@ -99,8 +78,12 @@ def download_logs():
# Create response
filename = f"logs_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
csv_data.seek(0)
output = io.BytesIO(csv_data.getvalue().encode('utf-8'))
output.seek(0)
return send_file(
io.StringIO(csv_data.getvalue()),
output,
mimetype="text/csv",
as_attachment=True,
download_name=filename
@ -110,3 +93,131 @@ def download_logs():
def log_detail(log_id):
log = ActivityLog.query.get_or_404(log_id)
return render_template("partials/log_detail_modal.html.jinja", log=log)
@bp.route("/api")
def get_logs_api():
"""Unified API endpoint for getting activity logs with filtering and pagination support."""
try:
# Pagination parameters
page = request.args.get('page', 1, type=int)
per_page = request.args.get('per_page', 50, type=int)
# Legacy limit parameter for backward compatibility
limit = request.args.get('limit', type=int)
if limit and not request.args.get('page'):
# Legacy mode: use limit without pagination
query = ActivityLog.query
# Apply filters
categories = request.args.getlist('category')
if categories:
query = query.filter(ActivityLog.category.in_(categories))
status = request.args.get('status')
if status:
query = query.filter(ActivityLog.status == status)
start_date = request.args.get('start_date')
if start_date:
start_date_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d")
query = query.filter(ActivityLog.timestamp >= start_date_dt)
end_date = request.args.get('end_date')
if end_date:
end_date_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=1)
query = query.filter(ActivityLog.timestamp <= end_date_dt)
search_term = request.args.get('search_term')
if search_term and search_term != "None":
query = query.filter(db.or_(
ActivityLog.action.contains(search_term),
ActivityLog.description.contains(search_term)
))
logs = query.order_by(ActivityLog.timestamp.desc()).limit(limit).all()
return jsonify({
"success": True,
"logs": [{
"id": log.id,
"timestamp": log.timestamp.isoformat(),
"action": log.action,
"status": log.status,
"description": log.description,
"category": log.category,
"paper_id": log.paper_id,
"extra_data": log.extra_data
} for log in logs]
})
# Ensure reasonable per_page limits
per_page = min(per_page, 100) # Cap at 100 items per page
# Build query with filtering
query = ActivityLog.query
# Filter by categories if specified
categories = request.args.getlist('category')
if categories:
query = query.filter(ActivityLog.category.in_(categories))
# Filter by status if specified
status = request.args.get('status')
if status:
query = query.filter(ActivityLog.status == status)
# Date filters
start_date = request.args.get('start_date')
if start_date:
start_date_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d")
query = query.filter(ActivityLog.timestamp >= start_date_dt)
end_date = request.args.get('end_date')
if end_date:
end_date_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=1)
query = query.filter(ActivityLog.timestamp <= end_date_dt)
# Search term filter
search_term = request.args.get('search_term')
if search_term and search_term != "None":
query = query.filter(db.or_(
ActivityLog.action.contains(search_term),
ActivityLog.description.contains(search_term)
))
# Order by most recent first and paginate
pagination = query.order_by(ActivityLog.timestamp.desc()).paginate(
page=page,
per_page=per_page,
error_out=False
)
return jsonify({
"success": True,
"logs": [{
"id": log.id,
"timestamp": log.timestamp.isoformat(),
"action": log.action,
"status": log.status,
"description": log.description,
"category": log.category,
"paper_id": log.paper_id,
"extra_data": log.extra_data
} for log in pagination.items],
"pagination": {
"page": pagination.page,
"pages": pagination.pages,
"per_page": pagination.per_page,
"total": pagination.total,
"has_next": pagination.has_next,
"has_prev": pagination.has_prev,
"next_num": pagination.next_num if pagination.has_next else None,
"prev_num": pagination.prev_num if pagination.has_prev else None
}
})
except Exception as e:
return jsonify({
"success": False,
"message": f"Error getting logs: {str(e)}"
}), 500
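For reference, the unified endpoint can be exercised with a small client script. The host, port, and category names below are illustrative placeholders; only the query parameters themselves come from the code above:

```python
import requests

BASE_URL = "http://localhost:5000"  # assumed development address

# Paginated mode: page/per_page plus the optional filters handled above
resp = requests.get(
    f"{BASE_URL}/logs/api",
    params={
        "page": 1,
        "per_page": 25,
        "category": ["scraper_activity", "scraper_command"],  # sent as repeated ?category=...
        "start_date": "2025-06-01",
        "search_term": "scraper",
    },
)
data = resp.json()
if data["success"]:
    for log in data["logs"]:
        print(log["timestamp"], log["action"], log["status"])
    print("total entries:", data["pagination"]["total"])

# Legacy mode: passing only 'limit' returns the most recent N entries without pagination
legacy = requests.get(f"{BASE_URL}/logs/api", params={"limit": 10}).json()
```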

View File

@ -1,7 +1,7 @@
"""
Simplified scraper blueprint using the new ScraperManager and hourly scheduling system.
"""
from flask import Blueprint, jsonify, render_template, request
from flask import Blueprint, jsonify, render_template, request, current_app
from ..models import ActivityLog, PaperMetadata, ScraperState, VolumeConfig
from ..scrapers.manager import ScraperManager
from ..scrapers.factory import get_available_scrapers
@ -29,6 +29,10 @@ def index():
# Get volume configuration
volume_config = VolumeConfig.get_current_volume()
# Get scraper module configuration
from ..models import ScraperModuleConfig
current_scraper_module = ScraperModuleConfig.get_current_module()
# Get paper counts by status
paper_counts = {
'new': PaperMetadata.query.filter_by(status='New').count(),
@ -46,7 +50,10 @@ def index():
recent_logs=recent_logs,
paper_counts=paper_counts,
volume_config=volume_config,
max_volume=MAX_VOLUME
max_volume=MAX_VOLUME,
current_scraper_module=current_scraper_module,
available_scraper_modules=[s["name"] for s in available_scrapers],
scraper_details={s["name"]: s for s in available_scrapers}
)
@bp.route("/start", methods=["POST"])
@ -55,11 +62,12 @@ def start_scraper():
try:
# Handle both JSON and form data
if request.is_json:
data = request.get_json() or {}
data = request.get_json()
# Allow empty JSON payload for start requests
if data is None:
data = {}
else:
data = request.form.to_dict()
scraper_name = data.get('scraper_name', 'dummy')
return jsonify({"success": False, "message": "Invalid payload format. Expected JSON."}), 400
# Start the scraper using manager
result = scraper_manager.start_scraper()
@ -68,18 +76,16 @@ def start_scraper():
ActivityLog.log_scraper_command(
action="start_scraper",
status="success",
description="Started scraper with hourly scheduling"
description="Scraper started successfully."
)
return jsonify({
"success": True,
"message": result["message"]
})
return jsonify({"success": True, "message": result["message"]})
else:
return jsonify({
"success": False,
"message": result["message"]
}), 400
ActivityLog.log_scraper_command(
action="start_scraper",
status="failure",
description=f"Failed to start scraper: {result['message']}"
)
return jsonify({"success": False, "message": result["message"]}), 400
except Exception as e:
ActivityLog.log_scraper_command(
@ -87,10 +93,7 @@ def start_scraper():
status="error",
description=f"Failed to start scraper: {str(e)}"
)
return jsonify({
"success": False,
"message": f"Error starting scraper: {str(e)}"
}), 500
return jsonify({"success": False, "message": f"An error occurred: {str(e)}"}), 500
@bp.route("/pause", methods=["POST"])
def pause_scraper():
@ -223,6 +226,13 @@ def get_status():
# Get current hour quota info
current_quota = scraper_manager.get_current_hour_quota()
# Get current scraper module configuration
from ..models import ScraperModuleConfig
current_scraper_module = ScraperModuleConfig.get_current_module()
# Get volume configuration
current_volume = VolumeConfig.get_current_volume()
return jsonify({
"success": True,
"scraper_state": {
@ -231,7 +241,9 @@ def get_status():
"last_updated": scraper_state.last_updated.isoformat() if scraper_state.last_updated else None
},
"paper_counts": paper_counts,
"current_quota": current_quota
"current_quota": current_quota,
"current_scraper_module": current_scraper_module,
"volume_config": current_volume
})
except Exception as e:
@ -242,28 +254,16 @@ def get_status():
@bp.route("/logs")
def get_logs():
"""Get recent activity logs."""
try:
limit = request.args.get('limit', 50, type=int)
logs = ActivityLog.query.order_by(ActivityLog.timestamp.desc()).limit(limit).all()
"""Get recent activity logs with pagination support."""
# Redirect to the unified logs API endpoint
from flask import redirect, url_for
return jsonify({
"success": True,
"logs": [{
"id": log.id,
"timestamp": log.timestamp.isoformat(),
"action": log.action,
"status": log.status,
"description": log.description,
"category": log.category.name if log.category else None
} for log in logs]
})
except Exception as e:
return jsonify({
"success": False,
"message": f"Error getting logs: {str(e)}"
}), 500
# Forward all query parameters to the unified endpoint
query_string = request.query_string.decode('utf-8')
if query_string:
return redirect(f"{url_for('logger.get_logs_api')}?{query_string}")
else:
return redirect(url_for('logger.get_logs_api'))
@bp.route("/scrapers")
def get_scrapers():
@ -346,8 +346,6 @@ def process_papers_manually():
def trigger_immediate_processing():
"""Trigger immediate processing of papers without waiting for hourly schedule."""
try:
from ..scrapers.tasks import process_papers_batch
# Get papers that should be processed this hour
manager = ScraperManager()
papers = manager.select_papers_for_processing()
@ -359,23 +357,38 @@ def trigger_immediate_processing():
"papers_scheduled": 0
})
# Get paper IDs for batch processing
paper_ids = [paper.id for paper in papers]
# Get APScheduler instance
scheduler = current_app.config.get('SCHEDULER')
if not scheduler:
return jsonify({
"success": False,
"message": "APScheduler not available"
}), 500
# Trigger immediate batch processing (no delay)
task = process_papers_batch.delay(paper_ids)
# Schedule papers for immediate processing via APScheduler
scheduled_count = 0
for paper in papers:
try:
import uuid
job_id = f"immediate_paper_{paper.id}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{uuid.uuid4().hex[:8]}"
scheduler.schedule_paper_processing(paper.id, delay_seconds=1, job_id=job_id)
scheduled_count += 1
except Exception as e:
ActivityLog.log_error(
error_message=f"Failed to schedule paper {paper.id}: {str(e)}",
source="trigger_immediate_processing"
)
ActivityLog.log_scraper_command(
action="trigger_immediate_processing",
status="success",
description=f"Triggered immediate processing of {len(paper_ids)} papers"
description=f"Triggered immediate processing of {scheduled_count} papers via APScheduler"
)
return jsonify({
"success": True,
"message": f"Immediate processing started for {len(paper_ids)} papers",
"papers_scheduled": len(paper_ids),
"task_id": task.id
"message": f"Immediate processing started for {scheduled_count} papers",
"papers_scheduled": scheduled_count
})
except Exception as e:
@ -416,40 +429,96 @@ def get_stats():
try:
hours = int(request.args.get('hours', 24))
current_time = datetime.utcnow()
cutoff_time = current_time.replace(minute=0, second=0, microsecond=0)
# Get activity logs for scraper actions in the last N hours
from ..models import ActivityCategory
start_time = cutoff_time - timedelta(hours=hours)
start_time = current_time - timedelta(hours=hours)
logs = ActivityLog.query.filter(
ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value,
ActivityLog.timestamp >= start_time
).all()
# Group by hour and status
stats = {}
# Get scraper command logs for state changes in the same time period
state_logs = ActivityLog.query.filter(
ActivityLog.category == ActivityCategory.SCRAPER_COMMAND.value,
ActivityLog.action.in_(['start_scraper', 'pause_scraper', 'stop_scraper', 'reset_scraper']),
ActivityLog.timestamp >= start_time
).order_by(ActivityLog.timestamp.asc()).all()
# Group by chronological hour buckets (not hour of day)
stats = []
for hour_offset in range(hours):
target_hour = (current_time.hour - hour_offset) % 24
stats[target_hour] = {
# Calculate the hour bucket (most recent hour first when hour_offset=0)
bucket_end_time = current_time - timedelta(hours=hour_offset)
bucket_start_time = bucket_end_time - timedelta(hours=1)
# Format hour label for display (e.g., "14:00-15:00" or "14:00" for simplicity)
hour_label = bucket_start_time.strftime("%H:%M")
# Initialize counters for this hour bucket
bucket_stats = {
"success": 0,
"error": 0,
"pending": 0,
"hour": target_hour,
"hour": hour_label,
"hour_offset": hour_offset, # For sorting
"bucket_start": bucket_start_time,
"bucket_end": bucket_end_time,
"scraper_active": 0 # Default to inactive
}
for log in logs:
hour = log.timestamp.hour
if hour in stats:
if log.status == "success":
stats[hour]["success"] += 1
elif log.status == "error":
stats[hour]["error"] += 1
elif log.status in ("pending", "info"):
stats[hour]["pending"] += 1
# Count logs that fall within this hour bucket
for log in logs:
if bucket_start_time <= log.timestamp < bucket_end_time:
if log.status == "success":
bucket_stats["success"] += 1
elif log.status == "error":
bucket_stats["error"] += 1
elif log.status in ("pending", "info"):
bucket_stats["pending"] += 1
# Convert to list for easier consumption by JavaScript
result = [stats[hour] for hour in sorted(stats.keys())]
return jsonify(result)
# Determine scraper status for this hour by checking if scraper was active
# For simplicity, check if there were any successful scrapes in this hour
# If there were scrapes, assume scraper was active
bucket_stats["scraper_active"] = 1 if bucket_stats["success"] > 0 else 0
stats.append(bucket_stats)
# Reverse so oldest hour comes first (better for chronological chart display)
stats.reverse()
# Prepare precise scraper state changes for timeline
scraper_timeline = []
for log in state_logs:
# Calculate hours ago from current time
time_diff = current_time - log.timestamp
hours_ago = time_diff.total_seconds() / 3600
# Only include logs within our time range
if hours_ago <= hours:
scraper_timeline.append({
"timestamp": log.timestamp.isoformat(),
"hours_ago": hours_ago,
"action": log.action,
"status": log.status,
"active": 1 if log.action == "start_scraper" and log.status == "success" else 0
})
# Clean up the response (remove internal fields)
result = []
for stat in stats:
result.append({
"success": stat["success"],
"error": stat["error"],
"pending": stat["pending"],
"hour": stat["hour"],
"scraper_active": stat["scraper_active"]
})
return jsonify({
"hourly_stats": result,
"scraper_timeline": scraper_timeline
})
except Exception as e:
return jsonify({
@ -472,20 +541,39 @@ def process_single_paper_endpoint(paper_id):
"message": "Paper not found"
}), 404
# Process the paper using the manager
result = scraper_manager.process_paper(paper)
# Get APScheduler instance
scheduler = current_app.config.get('SCHEDULER')
if not scheduler:
return jsonify({
"success": False,
"message": "APScheduler not available"
}), 500
ActivityLog.log_scraper_command(
action="manual_process_single",
status="success",
description=f"Manually processed paper {paper.doi}"
)
# Schedule the paper for immediate manual processing via APScheduler
# Use UUID suffix to ensure unique job IDs
import uuid
job_id = f"manual_paper_{paper_id}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{uuid.uuid4().hex[:8]}"
try:
scheduler.schedule_manual_paper_processing(paper_id, scraper_name=scraper_name, delay_seconds=1, job_id=job_id)
return jsonify({
"success": True,
"message": f"Processing started for paper {paper.doi}",
"paper_id": paper_id
})
ActivityLog.log_scraper_command(
action="manual_process_single",
status="success",
description=f"Scheduled manual processing for paper {paper.doi} via APScheduler" +
(f" using scraper '{scraper_name}'" if scraper_name else " using system default scraper")
)
return jsonify({
"success": True,
"message": f"Processing scheduled for paper {paper.doi}" +
(f" using {scraper_name} scraper" if scraper_name else " using system default scraper"),
"paper_id": paper_id
})
except Exception as e:
return jsonify({
"success": False,
"message": f"Failed to schedule processing: {str(e)}"
}), 500
except Exception as e:
ActivityLog.log_scraper_command(
@ -530,6 +618,35 @@ def update_scraper_config():
"message": message
}), 400
# Handle scraper module configuration updates
if "scraper_module" in data:
from ..models import ScraperModuleConfig
new_module = data["scraper_module"]
# Validate that the module exists and is valid
available_modules = [m["name"] for m in get_available_scrapers()]
if new_module not in available_modules:
return jsonify({
"success": False,
"message": f"Invalid scraper module: {new_module}"
}), 400
# Update the database configuration
ScraperModuleConfig.set_module(new_module)
ActivityLog.log_scraper_command(
action="update_scraper_module",
status="success",
description=f"Updated scraper module to '{new_module}'"
)
return jsonify({
"success": True,
"message": f"Scraper module updated to '{new_module}' successfully"
})
# Handle other configuration updates here if needed in the future
return jsonify({
@ -547,3 +664,72 @@ def update_scraper_config():
"success": False,
"message": f"Error updating scraper config: {str(e)}"
}), 500
@bp.route("/publishers")
def get_publishers():
"""Get publisher overview data for the scraper overview modal."""
try:
import os
import glob
# Get available parser modules
parsers_dir = os.path.join(current_app.root_path, 'parsers')
parser_files = glob.glob(os.path.join(parsers_dir, '*_parser.py'))
available_parsers = []
for parser_file in parser_files:
filename = os.path.basename(parser_file)
if filename != 'base_parser.py': # Skip the base parser
parser_name = filename.replace('_parser.py', '')
available_parsers.append(parser_name)
# Get publishers from database (papers that have publisher detected)
publisher_query = db.session.query(
PaperMetadata.publisher,
db.func.count(PaperMetadata.id).label('paper_count')
).filter(
PaperMetadata.publisher.isnot(None),
PaperMetadata.publisher != ''
).group_by(PaperMetadata.publisher).all()
publishers_data = []
for publisher, count in publisher_query:
# Check if a parser exists for this publisher
has_parser = publisher in available_parsers
publishers_data.append({
'name': publisher,
'paper_count': count,
'has_parser': has_parser,
'parser_status': 'available' if has_parser else 'missing'
})
# Sort by paper count descending
publishers_data.sort(key=lambda x: x['paper_count'], reverse=True)
# Get totals
total_papers_with_publisher = sum(p['paper_count'] for p in publishers_data)
total_papers_without_publisher = PaperMetadata.query.filter(
db.or_(PaperMetadata.publisher.is_(None), PaperMetadata.publisher == '')
).count()
return jsonify({
'success': True,
'data': {
'publishers': publishers_data,
'available_parsers': available_parsers,
'stats': {
'total_publishers': len(publishers_data),
'publishers_with_parsers': len([p for p in publishers_data if p['has_parser']]),
'publishers_without_parsers': len([p for p in publishers_data if not p['has_parser']]),
'total_papers_with_publisher': total_papers_with_publisher,
'total_papers_without_publisher': total_papers_without_publisher
}
}
})
except Exception as e:
return jsonify({
'success': False,
'message': f'Error getting publisher data: {str(e)}'
}), 500

View File

@ -2,8 +2,11 @@
import codecs
import csv
import datetime
from io import StringIO
import traceback
from io import StringIO, BytesIO
import json
import uuid
from typing import Dict, Any
import pandas as pd
from flask import (
@ -21,7 +24,6 @@ from flask import (
from ..db import db
from ..models import PaperMetadata, ActivityLog
from ..celery import celery # Import the celery instance directly
from ..defaults import DUPLICATE_STRATEGIES
bp = Blueprint("upload", __name__)
@ -29,6 +31,10 @@ bp = Blueprint("upload", __name__)
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
CHUNK_SIZE = 100 # Number of rows to process per batch
# Store task progress in memory (for simplicity)
# In production, you might want to use Redis or database
task_progress = {}
def parse_date(date_str):
"""Parse date string into datetime object."""
if not date_str or pd.isna(date_str):
@ -38,6 +44,76 @@ def parse_date(date_str):
except ValueError:
return None
def _process_csv_background(task_id: str, file_content: str, delimiter: str, duplicate_strategy: str):
"""Background function to process CSV file using APScheduler."""
print(f"DEBUG: _process_csv_background called with task_id: {task_id}")
# Get Flask app for context
from flask import current_app
# Get the Flask app from the scheduler context
from ..scheduler import _get_flask_app
app = _get_flask_app()
print(f"DEBUG: Flask app obtained: {app}")
if not app:
# Fallback: try to get current_app
try:
app = current_app
print(f"DEBUG: Using current_app: {app}")
except RuntimeError as e:
print(f"DEBUG: Failed to get current_app: {e}")
task_progress[task_id] = {
"state": "FAILURE",
"progress": 0,
"error": "Flask app context not available"
}
return
with app.app_context():
try:
print(f"DEBUG: Inside app context, starting CSV processing for task {task_id}")
# Initialize progress
task_progress[task_id] = {
"state": "PROGRESS",
"progress": 0,
"message": "Starting CSV processing..."
}
result = process_csv(file_content, delimiter, duplicate_strategy, task_id)
print(f"DEBUG: CSV processing completed for task {task_id}, result: {result}")
# Mark as completed
task_progress[task_id] = {
"state": "SUCCESS",
"progress": 100,
"result": result
}
except Exception as e:
print(f"DEBUG: Exception in _process_csv_background: {e}")
import traceback
traceback.print_exc()
# Mark as failed
task_progress[task_id] = {
"state": "FAILURE",
"progress": 0,
"error": str(e)
}
try:
ActivityLog.log_error(
error_message=f"Background CSV processing failed: {str(e)}",
source="upload._process_csv_background"
)
except Exception:
# If logging fails, just print the error
print(f"Background CSV processing failed: {str(e)}")
@bp.route("/", methods=["GET", "POST"])
def upload():
if request.method == "POST":
@ -51,23 +127,75 @@ def upload():
stream = codecs.iterdecode(file.stream, "utf-8")
content = "".join(stream)
# Trigger the Celery task
task = process_csv.delay(content, delimiter, duplicate_strategy)
# Generate task ID
task_id = str(uuid.uuid4())
return jsonify({"task_id": task.id})
# Get the APScheduler instance from the global variable
from ..scheduler import _scheduler
if not _scheduler:
return jsonify({"error": "APScheduler not initialized."})
if not _scheduler.running:
return jsonify({"error": "APScheduler not running."})
# Initialize task progress immediately
task_progress[task_id] = {
"state": "PENDING",
"progress": 0,
"message": "Task queued for processing..."
}
# Schedule background task
job_id = f"csv_upload_{task_id}"
# Use UTC time to match APScheduler's timezone configuration
run_time = datetime.datetime.utcnow() + datetime.timedelta(seconds=1) # Start in 1 second
try:
_scheduler.add_job(
func=_process_csv_background,
trigger='date',
run_date=run_time,
args=[task_id, content, delimiter, duplicate_strategy],
id=job_id,
name=f"CSV Upload {task_id}",
replace_existing=True
)
ActivityLog.log_import_activity(
action="schedule_csv_upload",
status="info",
description=f"Scheduled CSV upload task {task_id}",
task_id=task_id
)
except Exception as e:
task_progress[task_id] = {
"state": "FAILURE",
"progress": 0,
"error": f"Failed to schedule task: {str(e)}"
}
return jsonify({"error": f"Failed to schedule background task: {str(e)}"})
return jsonify({"task_id": task_id})
return render_template("upload.html.jinja", duplicate_strategies=DUPLICATE_STRATEGIES)
@celery.task(bind=True)
def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
def process_csv(file_content, delimiter, duplicate_strategy="skip", task_id=None):
"""Process CSV file and import paper metadata."""
# With the ContextTask in place, we're already inside an app context
added_count = skipped_count = updated_count = error_count = 0
errors = []
skipped_records = [] # Add this to track skipped records
try:
# Update task progress if provided
if task_id:
task_progress[task_id] = {
"state": "PROGRESS",
"progress": 10,
"message": "Starting CSV import..."
}
# Log the start of import using ActivityLog model
ActivityLog.log_import_activity(
action="start_csv_import",
@ -77,9 +205,6 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
delimiter=delimiter
)
# Set initial progress percentage
self.update_state(state='PROGRESS', meta={'progress': 10})
# Read CSV into chunks
csv_buffer = StringIO(file_content)
# Count total chunks
@ -116,16 +241,16 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
skipped_count += 1
continue
else:
metadata = PaperMetadata(
title=row["title"],
doi=doi,
alt_id=row.get("alternative_id"),
issn=row["issn"],
paper = PaperMetadata(
title=row.get("title"),
doi=row.get("doi"),
alt_id=row.get("alt_id") or row.get("alternative_id"), # Handle both column names
issn=row.get("issn"),
journal=row.get("journal"),
published_online=parse_date(row.get("published_online")),
status="New",
status="New"
)
db.session.add(metadata)
db.session.add(paper)
added_count += 1
except Exception as e:
error_count += 1
@ -134,6 +259,15 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
# Commit the chunk and roll session fresh
db.session.commit()
# Update progress
if task_id:
progress = min(90, 10 + int((chunk_idx + 1) * 80 / total_chunks))
task_progress[task_id] = {
"state": "PROGRESS",
"progress": progress,
"message": f"Processed {chunk_idx+1}/{total_chunks} chunks"
}
# Log periodic progress every 5 chunks
if (chunk_idx + 1) % 5 == 0:
ActivityLog.log_import_activity(
@ -148,11 +282,14 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
}
)
progress = min(90, 10 + int((chunk_idx + 1) * 80 / total_chunks))
self.update_state(state='PROGRESS', meta={'progress': progress})
# Final progress update and completion log
self.update_state(state='PROGRESS', meta={'progress': 100})
if task_id:
task_progress[task_id] = {
"state": "PROGRESS",
"progress": 100,
"message": "Finalizing import..."
}
ActivityLog.log_import_activity(
action="complete_csv_import",
status="success",
@ -167,6 +304,12 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
except Exception as e:
db.session.rollback()
if task_id:
task_progress[task_id] = {
"state": "FAILURE",
"progress": 0,
"error": str(e)
}
ActivityLog.log_error(
error_message="CSV import failed",
exception=e,
@ -189,7 +332,7 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
status="error",
description=f"Import completed with {error_count} errors",
error_csv=error_csv.getvalue(),
task_id=self.request.id,
task_id=task_id,
error_count=error_count
)
except Exception:
@ -204,41 +347,23 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
"skipped_records": skipped_records[:5], # Include up to 5 examples
"skipped_reason_summary": "Records were skipped because they already exist in the database. Use 'update' strategy to update them.",
"errors": errors[:5],
"error_count": error_count,
"task_id": self.request.id
"error_count": error_count
}
@bp.route("/task_status/<task_id>")
def task_status(task_id):
"""Get status of background task."""
task = celery.AsyncResult(task_id)
progress_data = task_progress.get(task_id)
if not progress_data:
return jsonify({"error": "Task not found."})
if task.state == "PENDING":
response = {"state": task.state, "progress": 0}
elif task.state == "PROGRESS":
response = {
"state": task.state,
"progress": task.info.get("progress", 0)
}
elif task.state == "SUCCESS":
response = {
"state": task.state,
"result": task.result
}
else: # FAILURE, REVOKED, etc.
response = {
"state": task.state,
"error": str(task.info) if task.info else "Unknown error"
}
return jsonify(response)
return jsonify(progress_data)
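Taken together with the upload route above, a client can submit a CSV and poll this endpoint until the job finishes. The sketch below assumes the blueprint is mounted under `/upload` and guesses the form field names, since neither is shown in this diff:

```python
import time
import requests

BASE_URL = "http://localhost:5000"   # assumed development address
UPLOAD_URL = f"{BASE_URL}/upload/"   # assumed mount point for the upload blueprint

# Submit the CSV; the route responds with {"task_id": ...} once the job is scheduled
with open("papers.csv", "rb") as fh:
    resp = requests.post(
        UPLOAD_URL,
        files={"file": fh},
        data={"delimiter": ",", "duplicate_strategy": "skip"},  # field names assumed
    )
task_id = resp.json()["task_id"]

# Poll the in-memory progress store exposed by /task_status/<task_id>
while True:
    status = requests.get(f"{BASE_URL}/upload/task_status/{task_id}").json()
    state = status.get("state")
    if state in ("PENDING", "PROGRESS"):
        print(f"{status.get('progress', 0)}% - {status.get('message', '')}")
        time.sleep(2)
    elif state == "SUCCESS":
        print("done:", status["result"])
        break
    else:  # FAILURE, or {"error": "Task not found."}
        print("failed:", status.get("error"))
        break
```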
@bp.route("/download_error_log/<task_id>")
def download_error_log(task_id):
# Find the most recent error log for this task
error_log = ActivityLog.query.filter(
ActivityLog.action == "import_errors",
ActivityLog.extra_data.like(f'%"{task_id}"%') # Search in JSON
ActivityLog.action == "import_errors"
).order_by(ActivityLog.timestamp.desc()).first()
if not error_log:
@ -255,7 +380,7 @@ def download_error_log(task_id):
buffer = StringIO(error_csv)
return send_file(
buffer,
BytesIO(buffer.getvalue().encode()), # Corrected to use BytesIO
mimetype="text/csv",
as_attachment=True,
download_name=f"upload_errors_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

View File

@ -1,52 +0,0 @@
from celery import Celery
from celery.schedules import crontab
# Create Celery instance without Flask app initially
celery = Celery(
'scipaperloader',
broker='redis://localhost:6379/0',
backend='redis://localhost:6379/0',
)
def configure_celery(app=None):
"""Configure Celery with the Flask app settings and ensure tasks run in the app context."""
if app is None:
# Import here to avoid circular import
from scipaperloader import create_app
app = create_app()
# Update Celery configuration using the app settings
celery.conf.update(
broker_url=app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0'),
result_backend=app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0'),
task_serializer='json',
accept_content=['json'],
result_serializer='json',
timezone='UTC',
enable_utc=True,
task_time_limit=3600, # 1 hour max runtime
task_soft_time_limit=3000, # 50 minutes soft limit
worker_max_tasks_per_child=10, # Restart workers after 10 tasks
worker_max_memory_per_child=1000000, # 1GB memory limit
task_acks_late=True, # Acknowledge tasks after completion
task_reject_on_worker_lost=True, # Requeue tasks if worker dies
# Configure Beat schedule for periodic tasks
beat_schedule={
'hourly-scraper-scheduler': {
'task': 'scipaperloader.scrapers.tasks.hourly_scraper_scheduler',
'schedule': crontab(minute=0), # Run at the start of every hour
'options': {'expires': 3600}
},
}
)
# Create a custom task class that pushes the Flask application context
class ContextTask(celery.Task):
abstract = True
def __call__(self, *args, **kwargs):
with app.app_context():
return self.run(*args, **kwargs)
celery.Task = ContextTask
return celery

View File

@ -3,7 +3,7 @@ import os
class Config:
SECRET_KEY = os.environ.get("SECRET_KEY", "dev")
SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///papers.db")
SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///instance/papers.db")
SQLALCHEMY_TRACK_MODIFICATIONS = False
APP_TITLE = os.environ.get("APP_TITLE", "SciPaperLoader")
SCRAPER_MODULE = os.environ.get("SCRAPER_MODULE", "dummy")

View File

@ -191,6 +191,7 @@ class PaperMetadata(db.Model):
type = db.Column(db.String(50))
language = db.Column(db.String(50))
published_online = db.Column(db.Date) # or DateTime/String
publisher = db.Column(db.String(100), nullable=True) # Detected publisher name
status = db.Column(db.String(10)) # 'Pending','Done','Failed'
previous_status = db.Column(db.String(10), nullable=True) # Store previous status for reversion
file_path = db.Column(db.Text)
@ -342,6 +343,41 @@ class ScraperModuleConfig(db.Model):
db.session.commit()
return config
class TimezoneConfig(db.Model):
"""Model to store the configured timezone for the scheduler."""
id = db.Column(db.Integer, primary_key=True)
timezone = db.Column(db.String(50), default="Europe/Berlin")
@classmethod
def get_current_timezone(cls):
"""Get the currently configured timezone."""
config = cls.query.first()
if not config:
config = cls(timezone="Europe/Berlin")
db.session.add(config)
db.session.commit()
return config.timezone
@classmethod
def set_timezone(cls, timezone_name):
"""Set the timezone configuration."""
config = cls.query.first()
if not config:
config = cls(timezone=timezone_name)
db.session.add(config)
else:
old_value = config.timezone
config.timezone = timezone_name
ActivityLog.log_config_change(
config_key="scheduler_timezone",
old_value=old_value,
new_value=timezone_name,
description="Updated scheduler timezone configuration"
)
db.session.commit()
return config
def init_schedule_config():
"""Initialize ScheduleConfig with default values if empty"""
if ScheduleConfig.query.count() == 0:
@ -379,3 +415,9 @@ def init_schedule_config():
default_path = DownloadPathConfig(path="/path/to/dummy/papers")
db.session.add(default_path)
db.session.commit()
# Initialize TimezoneConfig if it doesn't exist
if TimezoneConfig.query.count() == 0:
default_timezone = TimezoneConfig(timezone="Europe/Berlin")
db.session.add(default_timezone)
db.session.commit()
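As an illustration of how the stored timezone might be consumed elsewhere (only `get_current_timezone()` is defined in this diff; the pytz conversion and the helper below are assumptions):

```python
from datetime import datetime, timezone

import pytz

from scipaperloader.models import TimezoneConfig


def to_local(utc_dt: datetime) -> datetime:
    """Convert a naive UTC timestamp into the configured scheduler timezone."""
    tz_name = TimezoneConfig.get_current_timezone()  # "Europe/Berlin" by default
    return utc_dt.replace(tzinfo=timezone.utc).astimezone(pytz.timezone(tz_name))


# Requires an active application context, since the timezone is read from the database:
# with app.app_context():
#     print(to_local(datetime.utcnow()))
```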

View File

@ -0,0 +1,6 @@
# Parser modules for extracting full text from publisher-specific HTML content
from .base_parser import BaseParser, ParsedContent, ParseError
from .elsevier_parser import ElsevierParser
from .arxiv_parser import ArxivParser
__all__ = ['BaseParser', 'ParsedContent', 'ParseError', 'ElsevierParser', 'ArxivParser']
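A small usage sketch for the parser interface exported here; fetching the HTML and wiring this into the scraper pipeline is outside this diff, so the helper below is illustrative only:

```python
from typing import Optional

from scipaperloader.parsers import ArxivParser, ElsevierParser, ParsedContent, ParseError


def parse_publisher_html(html: str, doi: Optional[str] = None,
                         url: Optional[str] = None) -> Optional[ParsedContent]:
    """Return structured content from the first parser that recognises the page."""
    for parser in (ElsevierParser(), ArxivParser()):
        if parser.can_parse(html, url=url):
            try:
                return parser.parse(html, doi=doi)
            except ParseError:
                continue  # try the next candidate parser
    return None


# content = parse_publisher_html(downloaded_html, doi="10.1000/example")
# if content:
#     print(content.title, content.journal, len(content.full_text))
```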

View File

@ -0,0 +1,227 @@
import re
from bs4 import BeautifulSoup
from typing import Dict, Optional, List
from .base_parser import BaseParser, ParsedContent, ParseError
class ArxivParser(BaseParser):
"""Parser for arXiv papers."""
def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
"""Check if this is an arXiv page."""
html_lower = html_content.lower()
# Check for arXiv indicators
indicators = [
'arxiv.org',
'export.arxiv.org',
'arxiv:',
'meta name="citation_publisher" content="arxiv"',
]
return any(indicator in html_lower for indicator in indicators)
def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
"""Parse arXiv HTML content."""
try:
soup = BeautifulSoup(html_content, 'html.parser')
# Extract title
title = self._extract_title(soup)
# Extract abstract
abstract = self._extract_abstract(soup)
# Extract authors
authors = self._extract_authors(soup)
# Extract full text (arXiv usually just has abstract on the HTML page)
full_text = self._extract_full_text(soup, abstract)
# Extract keywords/subjects
keywords = self._extract_subjects(soup)
# Extract arxiv ID
arxiv_id = self._extract_arxiv_id(soup)
if not full_text or len(full_text.strip()) < 50:
raise ParseError("Could not extract meaningful content from arXiv page")
return ParsedContent(
full_text=full_text,
title=title,
abstract=abstract,
authors=authors,
keywords=keywords,
sections=None, # arXiv HTML pages don't usually have full sections
references=None, # References are typically in the PDF
doi=doi,
journal="arXiv",
publication_date=self._extract_submission_date(soup),
metadata={
'parser': 'arxiv',
'arxiv_id': arxiv_id,
'source': 'arxiv.org'
}
)
except Exception as e:
raise ParseError(f"Failed to parse arXiv content: {str(e)}")
def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract paper title."""
# Try multiple title selectors for arXiv
selectors = [
'h1.title',
'meta[name="citation_title"]',
'title'
]
for selector in selectors:
if 'meta' in selector:
element = soup.find('meta', attrs={'name': 'citation_title'})
if element:
return element.get('content', '').strip()
else:
element = soup.select_one(selector)
if element:
text = element.get_text(strip=True)
# Remove "Title:" prefix if present
text = re.sub(r'^Title:\s*', '', text)
return text
return None
def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract paper abstract."""
# arXiv abstract selectors
selectors = [
'blockquote.abstract',
'div.abstract',
'meta[name="citation_abstract"]'
]
for selector in selectors:
if 'meta' in selector:
element = soup.find('meta', attrs={'name': 'citation_abstract'})
if element:
return element.get('content', '').strip()
else:
element = soup.select_one(selector)
if element:
text = element.get_text(strip=True)
# Remove "Abstract:" prefix if present
text = re.sub(r'^Abstract:\s*', '', text)
return text
return None
def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
"""Extract author names."""
authors = []
# Try author meta tags
author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
if author_metas:
authors = [meta.get('content', '').strip() for meta in author_metas]
# Try arXiv author div
if not authors:
authors_div = soup.select_one('div.authors')
if authors_div:
# Extract author links or text
author_links = authors_div.find_all('a')
if author_links:
authors = [link.get_text(strip=True) for link in author_links]
else:
# Fallback to text parsing
text = authors_div.get_text()
# Remove "Authors:" prefix and split by commas
text = re.sub(r'^Authors?:\s*', '', text)
authors = [author.strip() for author in text.split(',')]
return authors if authors else None
def _extract_full_text(self, soup: BeautifulSoup, abstract: Optional[str] = None) -> str:
"""Extract main content (usually just abstract for arXiv HTML pages)."""
content_parts = []
# For arXiv, the HTML page typically only contains abstract and metadata
# The full text is in the PDF
if abstract:
content_parts.append(f"Abstract\n{abstract}")
# Look for any additional content sections
comments_section = soup.select_one('td.comments')
if comments_section:
comments = comments_section.get_text(strip=True)
if comments:
content_parts.append(f"Comments\n{comments}")
# Add note about PDF availability
content_parts.append(
"\nNote: This is the abstract and metadata from the arXiv HTML page. "
"The full text is available in the PDF version."
)
return '\n\n'.join(content_parts)
def _extract_subjects(self, soup: BeautifulSoup) -> Optional[List[str]]:
"""Extract subject classifications."""
subjects = []
# Look for subject classification
subjects_td = soup.select_one('td.subjects')
if subjects_td:
subjects_text = subjects_td.get_text(strip=True)
# Parse subjects (format: "Primary: subject1; Secondary: subject2")
subjects = [subj.strip() for subj in re.split(r'[;,]', subjects_text)]
# Clean up prefixes
subjects = [re.sub(r'^(Primary|Secondary):\s*', '', subj) for subj in subjects]
subjects = [subj for subj in subjects if subj] # Remove empty strings
return subjects if subjects else None
def _extract_arxiv_id(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract arXiv ID."""
# Look for arXiv ID in various places
arxiv_id_patterns = [
r'arXiv:(\d+\.\d+(?:v\d+)?)',
r'(\d{4}\.\d{4,5}(?:v\d+)?)',
]
# Search in page text
page_text = soup.get_text()
for pattern in arxiv_id_patterns:
match = re.search(pattern, page_text)
if match:
return match.group(1)
# Search in URL or meta tags
canonical_link = soup.find('link', attrs={'rel': 'canonical'})
if canonical_link:
href = canonical_link.get('href', '')
for pattern in arxiv_id_patterns:
match = re.search(pattern, href)
if match:
return match.group(1)
return None
def _extract_submission_date(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract submission date."""
# Look for submission date
submission_td = soup.select_one('td.submission-history')
if submission_td:
date_text = submission_td.get_text()
# Extract date (format varies)
date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', date_text)
if date_match:
return date_match.group(1)
# Try meta tag
date_meta = soup.find('meta', attrs={'name': 'citation_date'})
if date_meta:
return date_meta.get('content', '').strip()
return None
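
The arXiv parser above needs nothing beyond raw HTML and an optional DOI, so it can be exercised on its own. A minimal usage sketch, assuming the class shown here is exposed as ArxivParser and that the import path below matches where the parser modules live (both are assumptions, not confirmed by this diff):

# Usage sketch -- module path, class name and file location are assumptions.
from scipaperloader.parsers.arxiv_parser import ArxivParser
from scipaperloader.parsers.base_parser import ParseError

with open("downloads/2101.00001.html", encoding="utf-8") as fh:
    html = fh.read()

parser = ArxivParser()
if parser.can_parse(html, url="https://arxiv.org/abs/2101.00001"):
    try:
        content = parser.parse(html, doi="10.48550/arXiv.2101.00001")
        print(content.title, content.publication_date)
        print(content.metadata)  # e.g. {'parser': 'arxiv', 'arxiv_id': '2101.00001', 'source': 'arxiv.org'}
    except ParseError as exc:
        print(f"arXiv parsing failed: {exc}")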

View File

@ -0,0 +1,83 @@
from abc import ABC, abstractmethod
from typing import Dict, Optional, List
from dataclasses import dataclass
@dataclass
class ParsedContent:
"""Container for parsed content from a publisher's HTML."""
full_text: str
title: Optional[str] = None
abstract: Optional[str] = None
authors: Optional[List[str]] = None
keywords: Optional[List[str]] = None
sections: Optional[Dict[str, str]] = None # section_title -> section_content
references: Optional[List[str]] = None
doi: Optional[str] = None
journal: Optional[str] = None
publication_date: Optional[str] = None
metadata: Optional[Dict] = None # Additional metadata specific to publisher
class BaseParser(ABC):
"""Base class for all publisher-specific parsers."""
def __init__(self):
self.parser_name = self.__class__.__name__.lower().replace('parser', '')
@abstractmethod
def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
"""
Check if this parser can handle the given HTML content.
Args:
html_content: The HTML content to check
url: Optional URL of the content (for additional context)
Returns:
True if this parser can handle the content, False otherwise
"""
pass
@abstractmethod
def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
"""
Parse HTML content and extract structured information.
Args:
html_content: The HTML content to parse
doi: Optional DOI of the paper
Returns:
ParsedContent object with extracted information
Raises:
ParseError: If parsing fails
"""
pass
def get_name(self) -> str:
"""Return the name of this parser."""
return self.parser_name
def get_description(self) -> str:
"""Return a description of this parser."""
# A class always has a __doc__ attribute (it is None when no docstring is set),
# so getattr() would never fall back to the default; check the value instead.
return self.__class__.__doc__ or "No description available"
def validate_content(self, content: ParsedContent) -> bool:
"""
Validate the parsed content to ensure it meets minimum requirements.
Args:
content: The parsed content to validate
Returns:
True if content is valid, False otherwise
"""
# Basic validation - must have some full text
if not content.full_text or len(content.full_text.strip()) < 100:
return False
return True
class ParseError(Exception):
"""Exception raised when parsing fails."""
pass
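
Supporting a new publisher means subclassing BaseParser and implementing the two abstract methods. A minimal sketch under the contract above; the SpringerParser name and its CSS selectors are purely illustrative and not part of this change:

# Illustrative new parser -- class name and selectors are assumptions.
from typing import Optional

from bs4 import BeautifulSoup

from .base_parser import BaseParser, ParsedContent, ParseError


class SpringerParser(BaseParser):
    """Parser for Springer articles (example only)."""

    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        # Cheap substring checks keep detection fast before any real parsing.
        html_lower = html_content.lower()
        return 'springer.com' in html_lower or bool(url and 'springer' in url)

    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        soup = BeautifulSoup(html_content, 'html.parser')
        title_el = soup.select_one('h1.c-article-title')
        abstract_el = soup.select_one('div.c-article-section__content')
        full_text = soup.get_text(separator='\n', strip=True)
        if not full_text or len(full_text.strip()) < 100:
            raise ParseError("Could not extract meaningful content")
        return ParsedContent(
            full_text=full_text,
            title=title_el.get_text(strip=True) if title_el else None,
            abstract=abstract_el.get_text(strip=True) if abstract_el else None,
            doi=doi,
            metadata={'parser': 'springer'},
        )

validate_content() and get_description() are inherited from the base class, so a new parser stays small.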

View File

@ -0,0 +1,252 @@
import re
from bs4 import BeautifulSoup
from typing import Dict, Optional, List
from .base_parser import BaseParser, ParsedContent, ParseError
class ElsevierParser(BaseParser):
"""Parser for Elsevier/ScienceDirect articles."""
def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
"""Check if this is an Elsevier/ScienceDirect page."""
html_lower = html_content.lower()
# Check for Elsevier/ScienceDirect indicators (plain substring tests)
indicators = [
'sciencedirect.com',
'elsevier.com',
'meta name="citation_publisher" content="elsevier"',
'sciencedirect',
]
if any(indicator in html_lower for indicator in indicators):
return True
# A copyright line mentioning Elsevier has arbitrary text in between,
# so it needs a regex search rather than a literal substring check
return bool(re.search(r'copyright.*elsevier', html_lower))
def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
"""Parse Elsevier/ScienceDirect HTML content."""
try:
soup = BeautifulSoup(html_content, 'html.parser')
# Extract title
title = self._extract_title(soup)
# Extract abstract
abstract = self._extract_abstract(soup)
# Extract authors
authors = self._extract_authors(soup)
# Extract full text
full_text = self._extract_full_text(soup)
# Extract sections
sections = self._extract_sections(soup)
# Extract keywords
keywords = self._extract_keywords(soup)
# Extract references
references = self._extract_references(soup)
# Extract journal info
journal = self._extract_journal(soup)
# Extract publication date
publication_date = self._extract_publication_date(soup)
# Combine everything into full text if sections exist
if sections:
full_text = self._combine_sections(sections, abstract)
if not full_text or len(full_text.strip()) < 100:
raise ParseError("Could not extract meaningful full text content")
return ParsedContent(
full_text=full_text,
title=title,
abstract=abstract,
authors=authors,
keywords=keywords,
sections=sections,
references=references,
doi=doi,
journal=journal,
publication_date=publication_date,
metadata={
'parser': 'elsevier',
'source': 'sciencedirect'
}
)
except Exception as e:
raise ParseError(f"Failed to parse Elsevier content: {str(e)}")
def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract article title."""
# Try multiple title selectors
selectors = [
'h1.title-text',
'h1[data-testid="title"]',
'h1.article-title',
'meta[name="citation_title"]',
'title'
]
for selector in selectors:
if 'meta' in selector:
element = soup.find('meta', attrs={'name': 'citation_title'})
if element:
return element.get('content', '').strip()
else:
element = soup.select_one(selector)
if element:
return element.get_text(strip=True)
return None
def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract article abstract."""
selectors = [
'div.abstract-content',
'div[data-testid="abstract"]',
'div.abstract',
'section.abstract',
'div#abstract'
]
for selector in selectors:
element = soup.select_one(selector)
if element:
return element.get_text(strip=True)
return None
def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
"""Extract author names."""
authors = []
# Try author meta tags
author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
if author_metas:
authors = [meta.get('content', '').strip() for meta in author_metas]
# Try author div/span elements
if not authors:
author_elements = soup.select('div.author a, span.author, .author-name')
authors = [elem.get_text(strip=True) for elem in author_elements]
return authors if authors else None
def _extract_full_text(self, soup: BeautifulSoup) -> str:
"""Extract main article content."""
content_parts = []
# Try main content selectors
main_selectors = [
'div.article-content',
'div.body-content',
'main.article-body',
'div[data-testid="article-body"]',
'section.article-section'
]
for selector in main_selectors:
elements = soup.select(selector)
for element in elements:
# Remove script, style, and navigation elements
for unwanted in element.find_all(['script', 'style', 'nav', 'footer', 'header']):
unwanted.decompose()
text = element.get_text(separator='\n', strip=True)
if text and len(text) > 50: # Only add substantial content
content_parts.append(text)
return '\n\n'.join(content_parts)
def _extract_sections(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
"""Extract article sections with headings."""
sections = {}
# Look for section headings and content
section_elements = soup.find_all(['h2', 'h3', 'h4'], class_=re.compile(r'section|heading'))
for heading in section_elements:
section_title = heading.get_text(strip=True)
# Find content after this heading until next heading
content_parts = []
current = heading.next_sibling
while current and current.name not in ['h1', 'h2', 'h3', 'h4']:
if hasattr(current, 'get_text'):
text = current.get_text(strip=True)
if text:
content_parts.append(text)
current = current.next_sibling
if content_parts:
sections[section_title] = '\n'.join(content_parts)
return sections if sections else None
def _extract_keywords(self, soup: BeautifulSoup) -> Optional[List[str]]:
"""Extract article keywords."""
keywords = []
# Try keyword meta tags
keyword_metas = soup.find_all('meta', attrs={'name': 'citation_keywords'})
if keyword_metas:
for meta in keyword_metas:
content = meta.get('content', '')
if content:
keywords.extend([kw.strip() for kw in content.split(',')])
# Try keyword sections
if not keywords:
keyword_sections = soup.select('div.keywords, section.keywords')
for section in keyword_sections:
text = section.get_text()
keywords.extend([kw.strip() for kw in text.split(',') if kw.strip()])
return keywords if keywords else None
def _extract_references(self, soup: BeautifulSoup) -> Optional[List[str]]:
"""Extract references."""
references = []
ref_sections = soup.select('section.references, div.references, ol.references li')
for section in ref_sections:
if section.name == 'li':
references.append(section.get_text(strip=True))
else:
ref_items = section.find_all(['li', 'div'], class_=re.compile(r'reference'))
references.extend([item.get_text(strip=True) for item in ref_items])
return references if references else None
def _extract_journal(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract journal name."""
journal_meta = soup.find('meta', attrs={'name': 'citation_journal_title'})
if journal_meta:
return journal_meta.get('content', '').strip()
return None
def _extract_publication_date(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract publication date."""
date_meta = soup.find('meta', attrs={'name': 'citation_publication_date'})
if date_meta:
return date_meta.get('content', '').strip()
return None
def _combine_sections(self, sections: Dict[str, str], abstract: Optional[str] = None) -> str:
"""Combine all sections into full text."""
full_text_parts = []
if abstract:
full_text_parts.append(f"Abstract\n{abstract}")
for section_title, section_content in sections.items():
full_text_parts.append(f"{section_title}\n{section_content}")
return '\n\n'.join(full_text_parts)
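
With every publisher parser exposing the same can_parse/parse/validate_content contract, dispatch can be a simple first-match loop. How the project actually registers its parsers is not shown in this diff, so the following is only a sketch of that idea:

# Dispatch sketch -- the project's real registry/factory may differ.
from typing import List, Optional

from .base_parser import BaseParser, ParsedContent, ParseError


def parse_with_first_match(parsers: List[BaseParser], html: str,
                           doi: Optional[str] = None,
                           url: Optional[str] = None) -> ParsedContent:
    """Return parsed content from the first parser that claims the page."""
    for parser in parsers:
        if parser.can_parse(html, url=url):
            content = parser.parse(html, doi=doi)
            if parser.validate_content(content):
                return content
    raise ParseError("No registered parser could handle this page")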

593
scipaperloader/scheduler.py Normal file
View File

@ -0,0 +1,593 @@
"""
APScheduler-based scheduling system to replace complex Celery delayed task management.
This provides clean job scheduling and revocation without manual Redis manipulation.
"""
import random
import logging
from datetime import datetime, timedelta
from typing import Optional, List
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
from apscheduler.executors.pool import ThreadPoolExecutor
from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR, EVENT_JOB_MISSED
from apscheduler.jobstores.base import JobLookupError
# Configure APScheduler logging
logging.getLogger('apscheduler').setLevel(logging.WARNING)
# Global scheduler instance
_scheduler = None
_flask_app = None
def _get_flask_app():
"""Get the Flask app instance."""
global _flask_app
if _flask_app:
return _flask_app
try:
from flask import current_app
return current_app
except RuntimeError:
return None
def _hourly_scraper_scheduler():
"""Standalone function for hourly scheduling logic."""
app = _get_flask_app()
if not app:
return
with app.app_context():
try:
from .models import ScraperState, ActivityLog
# Check if scraper is active
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active:
ActivityLog.log_scraper_activity(
action="hourly_scheduler_apscheduler",
status="info",
description="Hourly scheduler skipped - scraper not active"
)
return {"status": "inactive", "papers_scheduled": 0}
if scraper_state.is_paused:
ActivityLog.log_scraper_activity(
action="hourly_scheduler_apscheduler",
status="info",
description="Hourly scheduler skipped - scraper paused"
)
return {"status": "paused", "papers_scheduled": 0}
# Get papers to process this hour
from .scrapers.manager import ScraperManager
manager = ScraperManager()
papers = manager.select_papers_for_processing()
if not papers:
ActivityLog.log_scraper_activity(
action="hourly_scheduler_apscheduler",
status="info",
description="No papers available for processing this hour"
)
return {"status": "empty", "papers_scheduled": 0}
# Schedule papers at random times within the hour
scheduled_count = 0
current_time = datetime.now()
scheduled_papers = []
for paper in papers:
# Random delay between 1 second and 58 minutes
delay_seconds = random.randint(1, 3480) # Up to 58 minutes
run_time = current_time + timedelta(seconds=delay_seconds)
# Schedule the individual paper processing job with unique ID
# Include microseconds and random suffix to prevent collisions
import uuid
job_id = f"process_paper_{paper.id}_{int(current_time.timestamp())}_{uuid.uuid4().hex[:8]}"
global _scheduler
if _scheduler:
_scheduler.add_job(
func=_process_single_paper,
trigger='date',
run_date=run_time,
args=[paper.id],
id=job_id,
replace_existing=True, # Changed to True to handle conflicts gracefully
name=f"Process Paper {paper.doi}"
)
scheduled_count += 1
# Collect paper info for single log entry
paper_info = {
"paper_id": paper.id,
"paper_doi": paper.doi,
"job_id": job_id,
"scheduled_time": run_time.isoformat(),
"delay_seconds": delay_seconds
}
scheduled_papers.append(paper_info)
# Create single comprehensive log entry with JSON data
try:
import json
from .models import ActivityLog
scheduling_data = {
"total_scheduled": scheduled_count,
"scheduled_papers": scheduled_papers,
"timestamp": datetime.now().isoformat(),
"hour_range": f"{current_time.strftime('%H:%M')} - {(current_time + timedelta(hours=1)).strftime('%H:%M')}"
}
ActivityLog.log_scraper_activity(
action="hourly_scheduler_apscheduler",
status="success",
description=f"Scheduled {scheduled_count} papers for random processing within this hour using APScheduler. See extra_data for details.",
**{"scheduling_details": json.dumps(scheduling_data)}
)
except Exception:
# Fallback to simple logging
ActivityLog.log_scraper_activity(
action="hourly_scheduler_apscheduler",
status="success",
description=f"Scheduled {scheduled_count} papers for random processing within this hour using APScheduler"
)
return {"status": "success", "papers_scheduled": scheduled_count}
except Exception as e:
from .models import ActivityLog
ActivityLog.log_error(
error_message=f"APScheduler hourly scheduler error: {str(e)}",
source="_hourly_scraper_scheduler"
)
return {"status": "error", "message": str(e)}
def _process_single_paper(paper_id: int):
"""Standalone function to process a single paper."""
app = _get_flask_app()
if not app:
return
with app.app_context():
try:
from .models import ScraperState, ActivityLog, PaperMetadata
# Enhanced race condition protection
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active:
ActivityLog.log_scraper_activity(
action="process_single_paper_apscheduler",
paper_id=paper_id,
status="skipped",
description="Task skipped - scraper not active (APScheduler)"
)
return {"status": "inactive", "paper_id": paper_id}
if scraper_state.is_paused:
ActivityLog.log_scraper_activity(
action="process_single_paper_apscheduler",
paper_id=paper_id,
status="skipped",
description="Task skipped - scraper paused (APScheduler)"
)
return {"status": "paused", "paper_id": paper_id}
# Get the paper
paper = PaperMetadata.query.get(paper_id)
if not paper:
return {"status": "error", "message": f"Paper {paper_id} not found"}
# Final check before processing
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active:
ActivityLog.log_scraper_activity(
action="process_single_paper_apscheduler",
paper_id=paper_id,
status="skipped",
description="Task skipped - scraper not active (pre-processing check)"
)
return {"status": "inactive", "paper_id": paper_id}
# Process the paper using scraper manager
from .scrapers.manager import ScraperManager
manager = ScraperManager()
result = manager.process_paper(paper)
return result
except Exception as e:
from .models import ActivityLog
ActivityLog.log_error(
error_message=f"Error processing paper {paper_id} in APScheduler: {str(e)}",
source="_process_single_paper"
)
return {"status": "error", "paper_id": paper_id, "message": str(e)}
def _process_single_paper_manual(paper_id: int, scraper_name: Optional[str] = None):
"""Standalone function to process a single paper manually (bypasses scraper state checks)."""
app = _get_flask_app()
if not app:
return
with app.app_context():
try:
from .models import ActivityLog, PaperMetadata
# Get the paper
paper = PaperMetadata.query.get(paper_id)
if not paper:
return {"status": "error", "message": f"Paper {paper_id} not found"}
# Process the paper using manual method (bypasses scraper state checks)
from .scrapers.manager import ScraperManager
manager = ScraperManager()
result = manager.process_paper_manual(paper, scraper_name=scraper_name)
return result
except Exception as e:
from .models import ActivityLog
ActivityLog.log_error(
error_message=f"Error manually processing paper {paper_id} in APScheduler: {str(e)}",
source="_process_single_paper_manual"
)
return {"status": "error", "paper_id": paper_id, "message": str(e)}
def _job_listener(event):
"""Listen to job execution events."""
app = _get_flask_app()
if not app:
return
with app.app_context():
try:
from .models import ActivityLog
job_id = event.job_id
if event.exception:
ActivityLog.log_error(
error_message=f"APScheduler job {job_id} failed: {str(event.exception)}",
source="ScraperScheduler.job_listener"
)
elif hasattr(event, 'retval') and event.retval:
# Job completed successfully
if job_id.startswith('process_paper_'):
ActivityLog.log_scraper_activity(
action="apscheduler_job_complete",
status="success",
description=f"Job {job_id} completed successfully"
)
except Exception as e:
# Don't let logging errors break the scheduler
print(f"Error in job listener: {str(e)}")
class ScraperScheduler:
"""APScheduler-based scraper task scheduler."""
def __init__(self, app=None):
self.app = app
if app:
self.init_app(app)
@property
def scheduler(self):
"""Expose the global _scheduler instance."""
global _scheduler
return _scheduler
def init_app(self, app):
"""Initialize the scheduler with Flask app context."""
global _scheduler, _flask_app
_flask_app = app
self.app = app
# Initialize scheduler within app context to access db.engine properly
with app.app_context():
# Use the existing Flask-SQLAlchemy database engine for APScheduler
from .db import db
# Configure job store to use the existing database engine
jobstores = {
'default': SQLAlchemyJobStore(engine=db.engine)
}
# Configure thread pool executor
executors = {
'default': ThreadPoolExecutor(max_workers=50) # Increased from 20 to 50
}
# Job defaults
job_defaults = {
'coalesce': False, # Don't combine multiple scheduled instances
'max_instances': 3, # Allow up to 3 instances of the same job
'misfire_grace_time': 30 # 30 seconds grace period for missed jobs
}
# Get timezone from database configuration
from .models import TimezoneConfig
configured_timezone = TimezoneConfig.get_current_timezone()
# Create the scheduler
_scheduler = BackgroundScheduler(
jobstores=jobstores,
executors=executors,
job_defaults=job_defaults,
timezone=configured_timezone # Use configurable timezone from database
)
# Add event listeners
_scheduler.add_listener(_job_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR | EVENT_JOB_MISSED)
# Start the scheduler FIRST, which will auto-create tables
_scheduler.start()
# THEN add the hourly scraper job
_scheduler.add_job(
func=_hourly_scraper_scheduler,
trigger='cron',
minute=0, # Run at the start of every hour
id='hourly_scraper_main',
replace_existing=True,
name='Hourly Scraper Scheduler'
)
try:
from .models import ActivityLog
ActivityLog.log_scraper_activity(
action="apscheduler_init",
status="success",
description="APScheduler initialized with database job store and hourly scheduling"
)
except Exception:
# Handle case where we're outside application context
print("✅ APScheduler initialized successfully")
def revoke_all_scraper_jobs(self) -> int:
"""Clean replacement for the complex _clear_delayed_tasks_from_redis method."""
global _scheduler
if not _scheduler:
try:
from .models import ActivityLog
ActivityLog.log_error(
error_message="Scheduler not initialized - cannot revoke jobs",
source="ScraperScheduler.revoke_all_scraper_jobs"
)
except Exception:
print("❌ Scheduler not initialized - cannot revoke jobs")
return 0
revoked_count = 0
revoked_jobs = []
already_gone_jobs = []
failed_jobs = []
try:
# Get all jobs
jobs = _scheduler.get_jobs()
for job in jobs:
# Remove any job that processes papers or uploads (but keep the main hourly scheduler)
if ('paper_process_' in job.id or 'test_paper_process_' in job.id or
'process_paper_' in job.id or 'csv_upload_' in job.id or 'manual_paper_' in job.id or
'startup_paper_' in job.id):
try:
_scheduler.remove_job(job.id)
revoked_count += 1
# Collect job info for single log entry
job_info = {
"job_id": job.id,
"job_name": job.name,
"next_run_time": job.next_run_time.isoformat() if job.next_run_time else None,
"args": job.args
}
revoked_jobs.append(job_info)
print(f"✅ Revoked APScheduler job: {job.id}")
except JobLookupError as e:
# Job already removed/completed - this is normal
already_gone_jobs.append({
"job_id": job.id,
"reason": str(e)
})
print(f" Job {job.id} was already completed or removed")
except Exception as e:
# Other error - log it but continue
failed_jobs.append({
"job_id": job.id,
"error": str(e)
})
print(f"❌ Error removing job {job.id}: {str(e)}")
# Create single comprehensive log entry with JSON data
if revoked_jobs or already_gone_jobs or failed_jobs:
try:
import json
from .models import ActivityLog
revocation_data = {
"total_revoked": revoked_count,
"revoked_jobs": revoked_jobs,
"already_gone_jobs": already_gone_jobs,
"failed_jobs": failed_jobs,
"timestamp": datetime.now().isoformat()
}
ActivityLog.log_scraper_activity(
action="revoke_all_scraper_jobs_apscheduler",
status="success",
description=f"Successfully revoked {revoked_count} APScheduler jobs. See extra_data for details.",
**{"revocation_details": json.dumps(revocation_data)}
)
except Exception:
print(f"✅ Successfully revoked {revoked_count} APScheduler jobs")
return revoked_count
except Exception as e:
try:
from .models import ActivityLog
ActivityLog.log_error(
error_message=f"Error revoking APScheduler jobs: {str(e)}",
source="ScraperScheduler.revoke_all_scraper_jobs"
)
except Exception:
print(f"❌ Error revoking APScheduler jobs: {str(e)}")
return 0
def get_job_count(self) -> int:
"""Get the number of scheduled jobs."""
global _scheduler
if not _scheduler:
return 0
return len(_scheduler.get_jobs())
def get_paper_jobs(self) -> List[dict]:
"""Get information about scheduled paper processing jobs."""
global _scheduler
if not _scheduler:
return []
jobs = []
all_jobs = _scheduler.get_jobs()
for job in all_jobs:
# Match jobs that contain paper processing patterns
if ('process_paper_' in job.id or 'paper_process_' in job.id or 'test_paper_process_' in job.id):
job_info = {
'id': job.id,
'name': job.name,
'next_run_time': job.next_run_time.isoformat() if job.next_run_time else None,
'args': job.args
}
jobs.append(job_info)
return jobs
def shutdown(self):
"""Gracefully shutdown the scheduler."""
global _scheduler
if _scheduler:
try:
from .models import ActivityLog
ActivityLog.log_scraper_activity(
action="apscheduler_shutdown",
status="info",
description="Shutting down APScheduler"
)
except Exception:
print("🔄 Shutting down APScheduler")
_scheduler.shutdown(wait=False)
_scheduler = None
def schedule_paper_processing(self, paper_id: int, delay_seconds: int = 0, job_id: Optional[str] = None) -> str:
"""Schedule a paper for processing with APScheduler.
Args:
paper_id: ID of the paper to process
delay_seconds: Delay in seconds before processing (default: 0 for immediate)
job_id: Optional custom job ID (will be generated if not provided)
Returns:
str: The job ID of the scheduled job
"""
global _scheduler
if not _scheduler:
raise RuntimeError("APScheduler not initialized")
# Generate job ID if not provided
if not job_id:
# Use microseconds and UUID suffix to prevent collisions
import uuid
job_id = f"process_paper_{paper_id}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{uuid.uuid4().hex[:8]}"
# Calculate run time
run_time = datetime.now() + timedelta(seconds=delay_seconds)
# Schedule the job
job = _scheduler.add_job(
func=_process_single_paper,
trigger='date',
run_date=run_time,
args=[paper_id],
id=job_id,
name=f"Process Paper {paper_id}",
replace_existing=True
)
# Log the scheduling
try:
from .models import ActivityLog
ActivityLog.log_scraper_activity(
action="schedule_paper_processing_apscheduler",
paper_id=paper_id,
status="info",
description=f"Scheduled paper {paper_id} for processing at {run_time.strftime('%H:%M:%S')} (Job ID: {job_id})"
)
except Exception:
print(f"✅ Scheduled paper {paper_id} for processing (Job ID: {job_id})")
return job_id
def schedule_manual_paper_processing(self, paper_id: int, scraper_name: Optional[str] = None, delay_seconds: int = 0, job_id: Optional[str] = None) -> str:
"""
Schedule manual paper processing that bypasses scraper state checks.
Args:
paper_id: ID of the paper to process
scraper_name: Optional specific scraper module to use (defaults to system scraper)
delay_seconds: Delay before processing starts (default: 0)
job_id: Optional custom job ID (auto-generated if not provided)
Returns:
Job ID of the scheduled task
"""
global _scheduler
if not _scheduler:
raise RuntimeError("APScheduler not initialized")
if job_id is None:
job_id = f"manual_paper_{paper_id}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}"
run_time = datetime.now() + timedelta(seconds=delay_seconds)
# Schedule the manual processing job
job = _scheduler.add_job(
func=_process_single_paper_manual,
trigger='date',
run_date=run_time,
args=[paper_id, scraper_name],
id=job_id,
name=f"Manual Process Paper {paper_id}",
replace_existing=True
)
# Log the scheduling
try:
from .models import ActivityLog
ActivityLog.log_scraper_activity(
action="schedule_manual_paper_processing",
paper_id=paper_id,
status="info",
description=f"Scheduled manual processing for paper {paper_id} at {run_time.strftime('%H:%M:%S')} (Job ID: {job_id})"
)
except Exception:
pass # Don't fail if logging fails
return job_id
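
The manager code further down retrieves this scheduler via current_app.config.get('SCHEDULER'), which implies the app factory builds a ScraperScheduler, runs init_app(), and stores it under that key. A wiring sketch; the create_app() shown here is an assumption, not the project's actual factory:

# App-factory wiring sketch -- the real create_app() may differ.
from flask import Flask

from scipaperloader.scheduler import ScraperScheduler


def create_app() -> Flask:
    app = Flask(__name__)
    # ... config, db.init_app(app), blueprints, etc. omitted ...

    scheduler = ScraperScheduler(app)    # init_app() builds the job store from db.engine
    app.config['SCHEDULER'] = scheduler  # where ScraperManager._get_scheduler() looks it up
    return app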

View File

@ -18,6 +18,43 @@ class BaseScraper(ABC):
OUTPUT_STATUS_FAILURE = "Failed" # Status to set on failed scraping
OUTPUT_STATUS_PROCESSING = "Pending" # Status to set while processing
def __init__(self):
"""Initialize the scraper."""
self.scraper_name = self.get_name().lower()
def log_scrape_start(self, doi: str, paper_id: Optional[int] = None):
"""Log the start of a scraping operation."""
from ..models import ActivityLog
ActivityLog.log_scraper_activity(
action=f"{self.scraper_name}_scrape_start",
status="info",
description=f"Starting {self.get_name()} for DOI: {doi}",
paper_id=paper_id
)
def log_scrape_success(self, doi: str, message: str, paper_id: Optional[int] = None):
"""Log successful completion of scraping."""
from ..models import ActivityLog
ActivityLog.log_scraper_activity(
action=f"{self.scraper_name}_scrape_success",
status="success",
description=f"{self.get_name()} completed successfully for DOI: {doi} - {message}",
paper_id=paper_id
)
def log_scrape_failure(self, doi: str, message: str, paper_id: Optional[int] = None):
"""Log failed scraping operation."""
from ..models import ActivityLog
ActivityLog.log_scraper_activity(
action=f"{self.scraper_name}_scrape_failure",
status="error",
description=f"{self.get_name()} failed for DOI: {doi} - {message}",
paper_id=paper_id
)
@abstractmethod
def scrape(self, doi: str) -> ScrapeResult:
"""

View File

@ -30,6 +30,9 @@ class Scraper(BaseScraper):
timestamp=datetime.utcnow()
)
# Log start of scraping
self.log_scrape_start(doi, paper.id)
# Simulate processing time (1-3 seconds)
processing_time = random.uniform(1, 3)
time.sleep(processing_time)
@ -145,12 +148,7 @@ class Scraper(BaseScraper):
)
# Log success
ActivityLog.log_scraper_activity(
action="dummy_scrape",
status="success",
description=f"Successfully scraped {doi}",
paper_id=paper.id
)
self.log_scrape_success(doi, f"Successfully scraped {doi}", paper.id)
result = ScrapeResult(
status="success",
@ -178,12 +176,7 @@ class Scraper(BaseScraper):
paper.error_msg = error_msg
# Log failure
ActivityLog.log_scraper_activity(
action="dummy_scrape",
status="error",
description=f"Failed to scrape {doi}: {error_msg}",
paper_id=paper.id
)
self.log_scrape_failure(doi, error_msg, paper.id)
result = ScrapeResult(
status="error",

View File

@ -30,13 +30,8 @@ class Scraper(BaseScraper):
timestamp=datetime.utcnow()
)
# Log retry attempt
ActivityLog.log_scraper_activity(
action="retry_failed_paper",
status="info",
description=f"Retrying failed paper: {paper.title}",
paper_id=paper.id
)
# Log start of retry
self.log_scrape_start(doi, paper.id)
# Simulate longer processing time for retry (2-5 seconds)
processing_time = random.uniform(2, 5)
@ -64,12 +59,7 @@ class Scraper(BaseScraper):
result_data = {"file_path": file_path}
# Log success
ActivityLog.log_scraper_activity(
action="retry_scrape_success",
status="success",
description=f"Successfully retried {doi} on second attempt",
paper_id=paper.id
)
self.log_scrape_success(doi, f"Successfully retried {doi} on second attempt", paper.id)
result = ScrapeResult(
status="success",
@ -81,12 +71,7 @@ class Scraper(BaseScraper):
except Exception as e:
error_msg = f"Failed to save retry file: {str(e)}"
ActivityLog.log_scraper_activity(
action="retry_scrape_file_error",
status="error",
description=error_msg,
paper_id=paper.id
)
self.log_scrape_failure(doi, error_msg, paper.id)
result = ScrapeResult(
status="error",
@ -105,12 +90,7 @@ class Scraper(BaseScraper):
]
error_msg = random.choice(error_messages)
ActivityLog.log_scraper_activity(
action="retry_scrape_failure",
status="error",
description=f"Retry failed for {doi}: {error_msg}",
paper_id=paper.id
)
self.log_scrape_failure(doi, error_msg, paper.id)
result = ScrapeResult(
status="error",

View File

@ -0,0 +1,172 @@
import time
import os
import requests
from datetime import datetime
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
class Scraper(BaseScraper):
"""Scraper that fetches HTML content from DOI and saves it for further processing."""
# This scraper processes "New" papers and outputs "HtmlDownloaded"/"Failed"
INPUT_STATUSES = ["New"]
OUTPUT_STATUS_SUCCESS = "HtmlDownloaded"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "FetchingHtml"
def scrape(self, doi: str) -> ScrapeResult:
"""Fetch HTML content from DOI and save to download path."""
start_time = time.time()
paper = PaperMetadata.query.filter_by(doi=doi).first()
if not paper:
return ScrapeResult(
status="error",
message=f"No paper found for DOI {doi}",
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Log start of scraping
self.log_scrape_start(doi, paper.id)
# Update status to processing
paper.status = self.OUTPUT_STATUS_PROCESSING
db.session.commit()
# Prepare file paths
download_path = DownloadPathConfig.get_path()
file_name = f"{doi.replace('/', '_')}.html"
file_path = os.path.join(download_path, file_name)
# Check/create download directory (same pattern as dummy)
if not os.path.exists(download_path):
try:
os.makedirs(download_path, exist_ok=True)
except OSError as e:
error_msg = f"Failed to create download directory: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "path_creation_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Check path permissions (same pattern as dummy)
if not os.access(download_path, os.W_OK):
error_msg = f"Download path '{download_path}' is not writable"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
ActivityLog.log_scraper_activity(
action="html_fetch_path_error",
status="error",
description=error_msg,
paper_id=paper.id
)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "path_write_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
try:
# Fetch HTML from DOI
doi_url = f"https://doi.org/{doi}"
headers = {'User-Agent': 'SciPaperLoader/1.0'}
response = requests.get(doi_url, headers=headers, timeout=30, allow_redirects=True)
# Check for invalid DOI (404) or other HTTP errors
if response.status_code == 404:
error_msg = f"Invalid DOI: {doi} not found"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "invalid_doi"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
response.raise_for_status() # Raise for other HTTP errors
# Save HTML content
with open(file_path, 'w', encoding='utf-8') as f:
f.write(response.text)
# Update paper status to success
paper.status = self.OUTPUT_STATUS_SUCCESS
paper.file_path = file_path
paper.error_msg = None
db.session.commit()
# Log success
self.log_scrape_success(doi, f"Successfully fetched HTML for {doi}", paper.id)
return ScrapeResult(
status="success",
message=f"Successfully fetched HTML for {doi}",
data={
"file_path": file_path,
"url": response.url, # Final URL after redirects
"title": paper.title
},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except requests.exceptions.RequestException as e:
error_msg = f"Failed to fetch HTML from DOI {doi}: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
ActivityLog.log_scraper_activity(
action="html_fetch",
status="error",
description=error_msg,
paper_id=paper.id
)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "network_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except Exception as e:
error_msg = f"Failed to save HTML file: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "file_creation_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
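
Because this scraper follows the same status contract as the others, a single paper can be pushed through it on demand via the manual scheduling path added in scheduler.py. A sketch; the value passed as scraper_name is a guess and should match whatever module filename this scraper ships under:

# One-off manual fetch sketch -- the scraper_name value is an assumption.
from flask import current_app

scheduler = current_app.config['SCHEDULER']
job_id = scheduler.schedule_manual_paper_processing(
    paper_id=42,                   # id of an existing PaperMetadata row
    scraper_name="html_fetcher",   # assumed module name for the scraper above
    delay_seconds=0,
)
print(f"Queued manual HTML fetch as job {job_id}")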

View File

@ -1,13 +1,14 @@
"""
Simplified scraper management system with hourly quota scheduling.
Uses APScheduler for all task processing - no Celery dependencies.
"""
import random
import math
import redis
from datetime import datetime, timedelta
from datetime import datetime, timedelta, UTC
from typing import List, Dict, Optional
from sqlalchemy import func
from flask import current_app
from ..models import (
PaperMetadata,
@ -20,7 +21,6 @@ from ..models import (
from ..db import db
from ..cache_utils import get_cached_hourly_quota
from .factory import get_scraper, get_available_scrapers
from ..celery import celery
class ScraperManager:
@ -29,237 +29,81 @@ class ScraperManager:
def __init__(self):
self.current_scraper = None
self.pending_papers = [] # Track papers being processed
# Initialize Redis client for delayed task management
self.redis_client = None
self._init_redis_client()
# No more Redis client initialization - using APScheduler now
def _init_redis_client(self):
"""Initialize Redis client for delayed task management."""
def _get_scheduler(self):
"""Get the ScraperScheduler instance from Flask app config."""
try:
# Use same Redis configuration as Celery
self.redis_client = redis.Redis(
host='localhost',
port=6379,
db=0,
decode_responses=True
)
# Test connection
self.redis_client.ping()
except Exception as e:
ActivityLog.log_error(
error_message=f"Failed to initialize Redis client: {str(e)}",
source="ScraperManager._init_redis_client"
)
self.redis_client = None
return current_app.config.get('SCHEDULER')
except RuntimeError:
# Outside application context
return None
def _clear_delayed_tasks_from_redis(self) -> int:
"""Clear delayed tasks from Redis structures used by Celery.
def _get_raw_scheduler(self):
"""Get the raw APScheduler instance for direct job scheduling."""
try:
scheduler_wrapper = current_app.config.get('SCHEDULER')
if scheduler_wrapper:
return scheduler_wrapper.scheduler
return None
except RuntimeError:
return None
Based on analysis, Celery stores delayed tasks in:
- 'unacked_index': Sorted set containing task IDs with execution timestamps
- 'unacked': Hash containing task data keyed by task ID
def _clear_delayed_tasks_from_apscheduler(self) -> int:
"""Clear delayed tasks from APScheduler - clean replacement for Redis manipulation.
Returns:
int: Number of delayed tasks cleared
"""
if not self.redis_client:
scheduler = self._get_scheduler()
if not scheduler:
try:
ActivityLog.log_error(
error_message="Redis client not available - cannot clear delayed tasks",
source="ScraperManager._clear_delayed_tasks_from_redis"
error_message="APScheduler not available - cannot clear delayed tasks",
source="ScraperManager._clear_delayed_tasks_from_apscheduler"
)
except RuntimeError:
# Working outside application context - just print instead
print("❌ Redis client not available - cannot clear delayed tasks")
print("❌ APScheduler not available - cannot clear delayed tasks")
return 0
cleared_count = 0
try:
# Define scraper task patterns to identify our tasks
scraper_patterns = [
'process_single_paper',
'process_papers_batch',
'hourly_scraper_scheduler'
]
cleared_count = scheduler.revoke_all_scraper_jobs()
try:
ActivityLog.log_scraper_activity(
action="check_delayed_tasks",
status="info",
description="Checking Celery delayed task structures (unacked_index, unacked)"
)
except RuntimeError:
print("🔍 Checking Celery delayed task structures (unacked_index, unacked)")
# Check 'unacked_index' (sorted set with task IDs and timestamps)
unacked_index_cleared = 0
if self.redis_client.exists('unacked_index'):
try:
# Get all task IDs from the sorted set
task_ids = self.redis_client.zrange('unacked_index', 0, -1)
if task_ids:
try:
ActivityLog.log_scraper_activity(
action="scan_unacked_index",
status="info",
description=f"Found {len(task_ids)} tasks in 'unacked_index'"
)
except RuntimeError:
print(f"📋 Found {len(task_ids)} tasks in 'unacked_index'")
# Check each task ID against the 'unacked' hash to get task details
scraper_task_ids = []
for task_id in task_ids:
try:
# Get task data from 'unacked' hash
task_data = self.redis_client.hget('unacked', task_id)
if task_data:
# Check if this task contains any of our scraper patterns
if any(pattern in str(task_data) for pattern in scraper_patterns):
scraper_task_ids.append(task_id)
except Exception:
# Skip individual task errors
continue
# Remove scraper task IDs from both structures
for task_id in scraper_task_ids:
try:
# Remove from unacked_index (sorted set)
removed_from_index = self.redis_client.zrem('unacked_index', task_id)
# Remove from unacked (hash)
removed_from_hash = self.redis_client.hdel('unacked', task_id)
if removed_from_index or removed_from_hash:
unacked_index_cleared += 1
except Exception as e:
try:
ActivityLog.log_error(
error_message=f"Error removing delayed task {task_id}: {str(e)}",
source="ScraperManager._clear_delayed_tasks_from_redis"
)
except RuntimeError:
print(f"❌ Error removing delayed task {task_id}: {str(e)}")
continue
cleared_count += unacked_index_cleared
if unacked_index_cleared > 0:
try:
ActivityLog.log_scraper_activity(
action="clear_unacked_tasks",
status="success",
description=f"Cleared {unacked_index_cleared} scraper tasks from unacked structures"
)
except RuntimeError:
print(f"✅ Cleared {unacked_index_cleared} scraper tasks from unacked structures")
else:
try:
ActivityLog.log_scraper_activity(
action="check_unacked_index",
status="info",
description="No tasks found in 'unacked_index'"
)
except RuntimeError:
print(" No tasks found in 'unacked_index'")
except Exception as e:
try:
ActivityLog.log_error(
error_message=f"Error accessing 'unacked_index': {str(e)}",
source="ScraperManager._clear_delayed_tasks_from_redis"
)
except RuntimeError:
print(f"❌ Error accessing 'unacked_index': {str(e)}")
else:
try:
ActivityLog.log_scraper_activity(
action="check_unacked_index",
status="info",
description="'unacked_index' key does not exist - no delayed tasks"
)
except RuntimeError:
print(" 'unacked_index' key does not exist - no delayed tasks")
# Also check the 'celery' queue for immediate tasks (backup check)
celery_cleared = 0
try:
queue_length = self.redis_client.llen('celery')
if queue_length and queue_length > 0:
# Scan for any scraper tasks in the immediate queue
scraper_tasks = []
for i in range(queue_length):
try:
task_data = self.redis_client.lindex('celery', i)
if task_data and any(pattern in str(task_data) for pattern in scraper_patterns):
scraper_tasks.append(task_data)
except Exception:
continue
# Remove scraper tasks from celery queue
for task_data in scraper_tasks:
try:
removed_count = self.redis_client.lrem('celery', 0, task_data)
celery_cleared += removed_count
except Exception:
continue
cleared_count += celery_cleared
if celery_cleared > 0:
try:
ActivityLog.log_scraper_activity(
action="clear_celery_tasks",
status="success",
description=f"Cleared {celery_cleared} scraper tasks from 'celery' queue"
)
except RuntimeError:
print(f"✅ Cleared {celery_cleared} scraper tasks from 'celery' queue")
except Exception as e:
try:
ActivityLog.log_error(
error_message=f"Error checking 'celery' queue: {str(e)}",
source="ScraperManager._clear_delayed_tasks_from_redis"
)
except RuntimeError:
print(f"❌ Error checking 'celery' queue: {str(e)}")
# Summary
# Summary logging
if cleared_count > 0:
try:
ActivityLog.log_scraper_activity(
action="clear_delayed_tasks_complete",
action="clear_delayed_tasks_complete_apscheduler",
status="success",
description=f"Total delayed scraper tasks cleared from Redis: {cleared_count} (unacked: {unacked_index_cleared}, celery: {celery_cleared})"
description=f"Total delayed scraper tasks cleared from APScheduler: {cleared_count}"
)
except RuntimeError:
print(f"✅ Total delayed scraper tasks cleared from Redis: {cleared_count} (unacked: {unacked_index_cleared}, celery: {celery_cleared})")
print(f"✅ Total delayed scraper tasks cleared from APScheduler: {cleared_count}")
else:
try:
ActivityLog.log_scraper_activity(
action="clear_delayed_tasks_complete",
action="clear_delayed_tasks_complete_apscheduler",
status="info",
description="No delayed scraper tasks found to clear in Redis"
description="No delayed scraper tasks found to clear in APScheduler"
)
except RuntimeError:
print(" No delayed scraper tasks found to clear in Redis")
print(" No delayed scraper tasks found to clear in APScheduler")
return cleared_count
except Exception as e:
try:
ActivityLog.log_error(
error_message=f"Failed to clear delayed tasks from Redis: {str(e)}",
source="ScraperManager._clear_delayed_tasks_from_redis"
error_message=f"Failed to clear delayed tasks from APScheduler: {str(e)}",
source="ScraperManager._clear_delayed_tasks_from_apscheduler"
)
except RuntimeError:
print(f"❌ Failed to clear delayed tasks from Redis: {str(e)}")
print(f"❌ Failed to clear delayed tasks from APScheduler: {str(e)}")
return 0
def start_scraper(self) -> Dict[str, str]:
"""Start the scraper system."""
"""Start the scraper system and immediately schedule papers for the current hour."""
try:
# Get current scraper
self.current_scraper = get_scraper()
@ -270,13 +114,25 @@ class ScraperManager:
scraper_name = self.current_scraper.get_name()
ActivityLog.log_scraper_command(
action="start_scraper",
status="success",
description=f"Started scraper: {scraper_name}. Use /trigger-immediate endpoint to immediately schedule papers instead of waiting for the next hourly boundary."
)
# Immediately schedule papers for the remaining time in the current hour
immediate_scheduled_count = self._schedule_papers_for_current_hour()
return {"status": "success", "message": "Scraper started successfully. Papers will be scheduled at the next hourly boundary, or use /trigger-immediate to schedule immediately."}
if immediate_scheduled_count > 0:
ActivityLog.log_scraper_command(
action="start_scraper",
status="success",
description=f"Started scraper: {scraper_name}. Immediately scheduled {immediate_scheduled_count} papers for the remaining time in this hour."
)
return {"status": "success", "message": f"Scraper started successfully. Immediately scheduled {immediate_scheduled_count} papers for processing in the remaining time this hour."}
else:
ActivityLog.log_scraper_command(
action="start_scraper",
status="success",
description=f"Started scraper: {scraper_name}. No papers available for immediate scheduling in the current hour."
)
return {"status": "success", "message": "Scraper started successfully. No papers available for immediate scheduling this hour."}
except Exception as e:
ActivityLog.log_error(
@ -318,123 +174,29 @@ class ScraperManager:
return {"status": "error", "message": str(e)}
def stop_scraper(self) -> Dict[str, str]:
"""Stop the scraper, revoke all running tasks, and revert pending papers."""
"""Stop the scraper, revoke all APScheduler jobs, and revert pending papers."""
try:
# First, revoke all running tasks
revoked_count = 0
delayed_cleared_count = 0
# STEP 1: Immediately set scraper as inactive - this is critical for race condition prevention
ScraperState.set_active(False)
ScraperState.set_paused(False)
ActivityLog.log_scraper_command(
action="stop_scraper_start",
status="info",
description="Beginning scraper stop process with task revocation and delayed task clearing"
description="Scraper stop initiated - marked as inactive. Beginning APScheduler job revocation."
)
try:
# Get Celery inspector to check for running tasks
i = celery.control.inspect()
active = i.active() or {}
scheduled = i.scheduled() or {}
reserved = i.reserved() or {}
# STEP 2: Brief pause to allow running jobs to see the inactive state
import time
time.sleep(0.2)
# Revoke active tasks
for worker, tasks in active.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
ActivityLog.log_scraper_activity(
action="revoke_task",
status="success",
description=f"Revoked active task: {task.get('name', 'unknown')} (ID: {task['id']})"
)
# STEP 3: Revoke all APScheduler jobs
delayed_cleared_count = self._clear_delayed_tasks_from_apscheduler()
# Revoke scheduled tasks
for worker, tasks in scheduled.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
ActivityLog.log_scraper_activity(
action="revoke_task",
status="success",
description=f"Revoked scheduled task: {task.get('name', 'unknown')} (ID: {task['id']})"
)
# STEP 4: Wait a bit for any remaining jobs to finish their checks and exit
time.sleep(1.0)
# Revoke reserved tasks
for worker, tasks in reserved.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
ActivityLog.log_scraper_activity(
action="revoke_task",
status="success",
description=f"Revoked reserved task: {task.get('name', 'unknown')} (ID: {task['id']})"
)
# Purge all task queues
celery.control.purge()
ActivityLog.log_scraper_activity(
action="purge_queues",
status="success",
description="Purged all task queues"
)
# **NEW: Clear delayed tasks from Redis sorted sets**
delayed_cleared_count = self._clear_delayed_tasks_from_redis()
# Additional cleanup: revoke any remaining scraper-related tasks by name pattern
try:
# Use broadcast to revoke tasks that match scraper patterns
scraper_task_patterns = [
'process_single_paper',
'process_papers_batch',
'hourly_scraper_scheduler'
]
# Get a fresh inspection of tasks after purge
fresh_inspect = celery.control.inspect()
all_tasks = {}
all_tasks.update(fresh_inspect.active() or {})
all_tasks.update(fresh_inspect.scheduled() or {})
all_tasks.update(fresh_inspect.reserved() or {})
additional_revoked = 0
for worker, tasks in all_tasks.items():
for task in tasks:
task_name = task.get('name', '')
task_id = task.get('id', '')
if any(pattern in task_name for pattern in scraper_task_patterns) and task_id:
celery.control.revoke(task_id, terminate=True)
additional_revoked += 1
ActivityLog.log_scraper_activity(
action="revoke_scraper_task",
status="success",
description=f"Revoked lingering scraper task: {task_name} (ID: {task_id})"
)
if additional_revoked > 0:
ActivityLog.log_scraper_activity(
action="cleanup_scraper_tasks",
status="success",
description=f"Additional cleanup: revoked {additional_revoked} lingering scraper tasks"
)
except Exception as e:
ActivityLog.log_error(
error_message=f"Error during additional scraper task cleanup: {str(e)}",
source="ScraperManager.stop_scraper.cleanup"
)
except Exception as e:
ActivityLog.log_error(
error_message=f"Error revoking tasks: {str(e)}",
source="ScraperManager.stop_scraper"
)
# Continue with paper reversion even if task revocation fails
# Get current scraper to know what status to revert to
# STEP 5: Revert papers from processing status
scraper = get_scraper()
input_statuses = scraper.get_input_statuses()
@ -453,7 +215,7 @@ class ScraperManager:
paper.status = paper.previous_status
else:
paper.status = revert_status
paper.updated_at = datetime.utcnow()
paper.updated_at = datetime.now(UTC)
reverted_count += 1
db.session.commit()
@ -464,19 +226,15 @@ class ScraperManager:
description=f"Reverted {reverted_count} papers from '{processing_status}' to previous status"
)
# Deactivate scraper
ScraperState.set_active(False)
ScraperState.set_paused(False)
ActivityLog.log_scraper_command(
action="stop_scraper",
status="success",
description=f"Scraper stopped. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers."
description=f"Scraper stopped completely. Cleared {delayed_cleared_count} APScheduler jobs and reverted {reverted_count} papers."
)
return {
"status": "success",
"message": f"Scraper stopped. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers to previous status."
"message": f"Scraper stopped. Cleared {delayed_cleared_count} APScheduler jobs and reverted {reverted_count} papers to previous status."
}
except Exception as e:
@ -487,51 +245,16 @@ class ScraperManager:
return {"status": "error", "message": str(e)}
def reset_scraper(self) -> Dict[str, str]:
"""Reset scraper state, revoke all running tasks, and clear all processing statuses."""
"""Reset scraper state, revoke all APScheduler jobs, and clear all processing statuses."""
try:
# First, revoke all running tasks (similar to stop_scraper)
revoked_count = 0
ActivityLog.log_scraper_command(
action="reset_scraper_start",
status="info",
description="Beginning scraper reset process with task revocation"
description="Beginning scraper reset process with APScheduler job revocation"
)
try:
# Get Celery inspector to check for running tasks
i = celery.control.inspect()
active = i.active() or {}
scheduled = i.scheduled() or {}
reserved = i.reserved() or {}
# Revoke all tasks (active, scheduled, reserved)
for queue_name, queue_tasks in [("active", active), ("scheduled", scheduled), ("reserved", reserved)]:
for worker, tasks in queue_tasks.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
ActivityLog.log_scraper_activity(
action="revoke_task",
status="success",
description=f"Revoked {queue_name} task: {task.get('name', 'unknown')} (ID: {task['id']})"
)
# Purge all task queues
celery.control.purge()
ActivityLog.log_scraper_activity(
action="purge_queues",
status="success",
description="Purged all task queues during reset"
)
except Exception as e:
ActivityLog.log_error(
error_message=f"Error revoking tasks during reset: {str(e)}",
source="ScraperManager.reset_scraper"
)
# Continue with paper reversion even if task revocation fails
# Clear all APScheduler jobs
delayed_cleared_count = self._clear_delayed_tasks_from_apscheduler()
# Get current scraper configuration
scraper = get_scraper()
@ -551,7 +274,7 @@ class ScraperManager:
paper.status = paper.previous_status
else:
paper.status = revert_status
paper.updated_at = datetime.utcnow()
paper.updated_at = datetime.now(UTC)
paper.error_msg = None # Clear any error messages
reverted_count += 1
@ -564,12 +287,12 @@ class ScraperManager:
ActivityLog.log_scraper_command(
action="reset_scraper",
status="success",
description=f"Scraper reset. Revoked {revoked_count} tasks and reverted {reverted_count} papers."
description=f"Scraper reset. Cleared {delayed_cleared_count} APScheduler jobs and reverted {reverted_count} papers."
)
return {
"status": "success",
"message": f"Scraper reset. Revoked {revoked_count} tasks and reverted {reverted_count} papers to original status."
"message": f"Scraper reset. Cleared {delayed_cleared_count} APScheduler jobs and reverted {reverted_count} papers to original status."
}
except Exception as e:
@ -638,24 +361,52 @@ class ScraperManager:
.limit(papers_needed)
.all())
ActivityLog.log_scraper_activity(
action="select_papers",
status="info",
description=f"Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})"
)
try:
ActivityLog.log_scraper_activity(
action="select_papers",
status="info",
description=f"Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})"
)
except RuntimeError:
# Outside application context - use print fallback
print(f"📋 Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})")
return papers
except Exception as e:
ActivityLog.log_error(
error_message=f"Error selecting papers: {str(e)}",
source="ScraperManager.select_papers_for_processing"
)
try:
ActivityLog.log_error(
error_message=f"Error selecting papers: {str(e)}",
source="ScraperManager.select_papers_for_processing"
)
except RuntimeError:
# Outside application context - use print fallback
print(f"❌ Error selecting papers: {str(e)}")
return []
def process_paper(self, paper: PaperMetadata) -> Dict:
"""Process a single paper using the current scraper."""
try:
# **RACE CONDITION FIX**: Double-check scraper state before proceeding
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active:
ActivityLog.log_scraper_activity(
action="process_paper",
paper_id=paper.id,
status="skipped",
description="Skipped processing - scraper deactivated during task execution"
)
return {"paper_id": paper.id, "status": "skipped", "message": "Scraper not active"}
if scraper_state.is_paused:
ActivityLog.log_scraper_activity(
action="process_paper",
paper_id=paper.id,
status="skipped",
description="Skipped processing - scraper paused during task execution"
)
return {"paper_id": paper.id, "status": "skipped", "message": "Scraper paused"}
scraper = get_scraper()
output_statuses = scraper.get_output_statuses()
@ -665,9 +416,25 @@ class ScraperManager:
# Update paper status to processing
paper.previous_status = previous_status
paper.status = output_statuses["processing"]
paper.updated_at = datetime.utcnow()
paper.updated_at = datetime.now(UTC)
db.session.commit()
# **ADDITIONAL RACE CONDITION CHECK**: Verify scraper is still active before expensive scraping operation
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active:
# Scraper was deactivated after we marked paper as processing - revert and exit
paper.status = previous_status
paper.updated_at = datetime.now(UTC)
db.session.commit()
ActivityLog.log_scraper_activity(
action="process_paper",
paper_id=paper.id,
status="cancelled",
description="Cancelled processing - scraper deactivated after paper marked as processing"
)
return {"paper_id": paper.id, "status": "cancelled", "message": "Scraper deactivated during processing"}
# Perform scraping
result = scraper.scrape(paper.doi)
@ -681,7 +448,7 @@ class ScraperManager:
paper.status = output_statuses["failure"]
paper.error_msg = result.message
paper.updated_at = datetime.utcnow()
paper.updated_at = datetime.now(UTC)
db.session.commit()
# Log result
@ -706,7 +473,7 @@ class ScraperManager:
if input_statuses:
paper.status = input_statuses[0]
paper.error_msg = f"Processing error: {str(e)}"
paper.updated_at = datetime.utcnow()
paper.updated_at = datetime.now(UTC)
db.session.commit()
except:
pass # Don't fail if reversion fails
@ -718,6 +485,91 @@ class ScraperManager:
return {"paper_id": paper.id, "status": "error", "message": str(e)}
def process_paper_manual(self, paper: PaperMetadata, scraper_name: Optional[str] = None) -> Dict:
"""Process a single paper manually, bypassing scraper state checks."""
try:
# Get scraper configuration but skip state validation for manual processing
if scraper_name:
# Use the specified scraper
import importlib
from .base import BaseScraper
try:
module = importlib.import_module(f"scipaperloader.scrapers.{scraper_name}")
scraper_cls = getattr(module, "Scraper")
if not issubclass(scraper_cls, BaseScraper):
raise TypeError(f"Scraper class in module '{scraper_name}' does not inherit from BaseScraper")
scraper = scraper_cls()
except (ImportError, AttributeError, TypeError) as e:
ActivityLog.log_error(
error_message=f"Failed to load specified scraper '{scraper_name}': {str(e)}. Falling back to system default.",
source="ScraperManager.process_paper_manual"
)
scraper = get_scraper()
else:
# Use system default scraper
scraper = get_scraper()
output_statuses = scraper.get_output_statuses()
# Store the previous status before changing it
previous_status = paper.status
# Update paper status to processing
paper.previous_status = previous_status
paper.status = output_statuses["processing"]
paper.updated_at = datetime.now(UTC)
db.session.commit()
# Perform scraping (no state checks for manual processing)
result = scraper.scrape(paper.doi)
# Update paper status based on result
if result.status == "success":
paper.status = output_statuses["success"]
paper.error_msg = None
if result.data and "file_path" in result.data:
paper.file_path = result.data["file_path"]
else:
paper.status = output_statuses["failure"]
paper.error_msg = result.message
paper.updated_at = datetime.now(UTC)
db.session.commit()
# Log result
ActivityLog.log_scraper_activity(
action="process_paper_manual",
paper_id=paper.id,
status=result.status,
description=f"Manually processed {paper.doi}: {result.message}"
)
return {
"paper_id": paper.id,
"status": result.status,
"message": result.message,
"duration": result.duration
}
except Exception as e:
# Revert paper status on error
try:
input_statuses = get_scraper().get_input_statuses()
if input_statuses:
paper.status = input_statuses[0]
paper.error_msg = f"Manual processing error: {str(e)}"
paper.updated_at = datetime.now(UTC)
db.session.commit()
except:
pass # Don't fail if reversion fails
ActivityLog.log_error(
error_message=f"Error manually processing paper {paper.id}: {str(e)}",
source="ScraperManager.process_paper_manual"
)
return {"paper_id": paper.id, "status": "error", "message": str(e)}
def get_status(self) -> Dict:
"""Get current scraper status."""
scraper_state = ScraperState.get_current_state()
@ -745,3 +597,119 @@ class ScraperManager:
"processing_papers": processing_count,
"current_hour_quota": self.get_current_hour_quota()
}
def _schedule_papers_for_current_hour(self) -> int:
"""Schedule papers for processing in the remaining time of the current hour.
Returns:
int: Number of papers scheduled
"""
try:
# Get papers that should be processed this hour
papers = self.select_papers_for_processing()
if not papers:
return 0
# Get raw APScheduler instance for direct job scheduling
scheduler = self._get_raw_scheduler()
if not scheduler:
ActivityLog.log_error(
error_message="Raw APScheduler not available for immediate paper scheduling",
source="ScraperManager._schedule_papers_for_current_hour"
)
return 0
# Calculate remaining time in current hour
current_time = datetime.now()
next_hour = current_time.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)
remaining_seconds = int((next_hour - current_time).total_seconds())
# Don't schedule if less than 2 minutes remaining
if remaining_seconds < 120:
ActivityLog.log_scraper_activity(
action="start_scraper_immediate_scheduling",
status="info",
description=f"Skipping immediate scheduling - only {remaining_seconds} seconds remaining in current hour"
)
return 0
# Schedule papers at random times within the remaining time
scheduled_count = 0
scheduled_papers = []
for paper in papers:
try:
# Random delay between 1 second and remaining time minus 60 seconds buffer
max_delay = max(1, remaining_seconds - 60)
delay_seconds = random.randint(1, max_delay)
run_time = current_time + timedelta(seconds=delay_seconds)
# Generate unique job ID
import uuid
job_id = f"startup_paper_{paper.id}_{int(current_time.timestamp())}_{uuid.uuid4().hex[:8]}"
# Schedule the job
from ..scheduler import _process_single_paper
scheduler.add_job(
func=_process_single_paper,
trigger='date',
run_date=run_time,
args=[paper.id],
id=job_id,
name=f"Startup Process Paper {paper.id}",
replace_existing=True
)
scheduled_count += 1
# Collect paper info for logging
paper_info = {
"paper_id": paper.id,
"paper_doi": paper.doi,
"job_id": job_id,
"scheduled_time": run_time.isoformat(),
"delay_seconds": delay_seconds
}
scheduled_papers.append(paper_info)
except Exception as e:
ActivityLog.log_error(
error_message=f"Failed to schedule paper {paper.id} during startup: {str(e)}",
source="ScraperManager._schedule_papers_for_current_hour"
)
# Create single comprehensive log entry
if scheduled_papers:
try:
import json
scheduling_data = {
"total_scheduled": scheduled_count,
"scheduled_papers": scheduled_papers,
"timestamp": current_time.isoformat(),
"remaining_time_seconds": remaining_seconds,
"trigger": "startup_immediate_scheduling"
}
ActivityLog.log_scraper_activity(
action="startup_immediate_scheduling",
status="success",
description=f"Scheduled {scheduled_count} papers for immediate processing during startup for remaining {remaining_seconds}s in current hour. See extra_data for details.",
**{"scheduling_details": json.dumps(scheduling_data)}
)
except Exception:
# Fallback to simple logging
ActivityLog.log_scraper_activity(
action="startup_immediate_scheduling",
status="success",
description=f"Scheduled {scheduled_count} papers for immediate processing during startup"
)
return scheduled_count
except Exception as e:
ActivityLog.log_error(
error_message=f"Error in startup immediate scheduling: {str(e)}",
source="ScraperManager._schedule_papers_for_current_hour"
)
return 0

View File

@ -0,0 +1,282 @@
import time
import requests
import re
from urllib.parse import urlparse
from datetime import datetime
from typing import Optional
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
class Scraper(BaseScraper):
"""Publisher detection scraper that identifies the publisher from the final URL after DOI redirect."""
# This scraper processes "New" papers and outputs "PublisherDetected"/"Failed"
INPUT_STATUSES = ["New"]
OUTPUT_STATUS_SUCCESS = "PublisherDetected"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "DetectingPublisher"
# Publisher detection patterns based on URL domains and paths
PUBLISHER_URL_PATTERNS = {
'elsevier': [
r'sciencedirect\.com',
r'elsevier\.com',
r'.*\.elsevier\.com'
],
'springer': [
r'link\.springer\.com',
r'springer\.com',
r'.*\.springer\.com'
],
'wiley': [
r'onlinelibrary\.wiley\.com',
r'wiley\.com',
r'.*\.wiley\.com'
],
'ieee': [
r'ieeexplore\.ieee\.org',
r'ieee\.org',
r'.*\.ieee\.org'
],
'plos': [
r'journals\.plos\.org',
r'plos\.org',
r'.*\.plos\.org'
],
'nature': [
r'nature\.com',
r'.*\.nature\.com'
],
'sage': [
r'journals\.sagepub\.com',
r'sagepub\.com',
r'.*\.sagepub\.com'
],
'taylor_francis': [
r'tandfonline\.com',
r'.*\.tandfonline\.com'
],
'acs': [
r'pubs\.acs\.org',
r'acs\.org',
r'.*\.acs\.org'
],
'arxiv': [
r'arxiv\.org',
r'export\.arxiv\.org'
],
'pubmed': [
r'pubmed\.ncbi\.nlm\.nih\.gov',
r'ncbi\.nlm\.nih\.gov'
],
'oxford': [
r'academic\.oup\.com',
r'oup\.com',
r'.*\.oup\.com'
],
'cambridge': [
r'cambridge\.org',
r'.*\.cambridge\.org'
],
'biorxiv': [
r'biorxiv\.org',
r'.*\.biorxiv\.org'
],
'researchgate': [
r'researchgate\.net',
r'.*\.researchgate\.net'
]
}
def scrape(self, doi: str) -> ScrapeResult:
"""Detect publisher from the final URL after DOI redirect."""
start_time = time.time()
paper = PaperMetadata.query.filter_by(doi=doi).first()
if not paper:
return ScrapeResult(
status="error",
message=f"No paper found for DOI {doi}",
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Log start of scraping
self.log_scrape_start(doi, paper.id)
# Update status to processing
paper.status = self.OUTPUT_STATUS_PROCESSING
db.session.commit()
try:
# Get the final URL by following the DOI redirect
final_url = self._get_final_url(doi)
if not final_url:
error_msg = f"Could not resolve DOI {doi} to a URL"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "doi_resolution_failed"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Detect publisher from URL
detected_publisher = self._detect_publisher_from_url(final_url)
if detected_publisher:
# Update paper with detected publisher
paper.publisher = detected_publisher
paper.status = self.OUTPUT_STATUS_SUCCESS
paper.error_msg = None
db.session.commit()
success_msg = f"Publisher '{detected_publisher}' detected from URL: {final_url}"
self.log_scrape_success(doi, success_msg, paper.id)
return ScrapeResult(
status="success",
message=success_msg,
data={
"publisher": detected_publisher,
"final_url": final_url
},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
else:
error_msg = f"Could not detect publisher from URL: {final_url}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={
"final_url": final_url,
"error_code": "publisher_not_detected"
},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except Exception as e:
error_msg = f"Error detecting publisher for DOI {doi}: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "publisher_detection_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
def _get_final_url(self, doi: str) -> Optional[str]:
"""
Get the final URL after following DOI redirects.
Args:
doi: The DOI to resolve
Returns:
Final URL after redirects, or None if resolution fails
"""
try:
doi_url = f"https://doi.org/{doi}"
headers = {
'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}
# Make a HEAD request to get the final URL without downloading content
response = requests.head(
doi_url,
headers=headers,
timeout=15,
allow_redirects=True
)
# If HEAD is not allowed, try GET but with minimal content
if response.status_code == 405: # Method Not Allowed
response = requests.get(
doi_url,
headers=headers,
timeout=15,
allow_redirects=True,
stream=True # Don't download the full content
)
response.close() # Close connection after getting headers
if response.status_code in [200, 302, 301]:
return response.url
else:
return None
except Exception:
# Resolution failed - swallow the error so the caller can treat a missing URL as a failed lookup
return None
def _detect_publisher_from_url(self, url: str) -> Optional[str]:
"""
Detect publisher from URL using domain patterns.
Args:
url: The URL to analyze
Returns:
Publisher name if detected, None otherwise
"""
if not url:
return None
# Parse the URL to get the domain
parsed_url = urlparse(url)
domain = parsed_url.netloc.lower()
# Remove 'www.' prefix if present
if domain.startswith('www.'):
domain = domain[4:]
# Score each publisher based on URL pattern matches
publisher_scores = {}
for publisher, patterns in self.PUBLISHER_URL_PATTERNS.items():
score = 0
for pattern in patterns:
if re.search(pattern, domain, re.IGNORECASE):
score += 10 # Strong match for domain patterns
# Also check the full URL for path-based patterns
if re.search(pattern, url.lower(), re.IGNORECASE):
score += 5
if score > 0:
publisher_scores[publisher] = score
# Return the publisher with the highest score
if publisher_scores:
best_publisher = max(publisher_scores.keys(), key=lambda x: publisher_scores[x])
# Only return if we have a reasonable confidence (score > 5)
if publisher_scores[best_publisher] > 5:
return best_publisher
return None

View File

@ -1,18 +1,17 @@
"""
Hourly scheduler task that processes papers at random times within each hour.
APScheduler-based task functions that replace Celery tasks for paper processing.
"""
import random
from datetime import datetime, timedelta
from typing import Optional
from celery import shared_task
from flask import current_app
from ..models import ScraperState, ActivityLog
from ..models import ScraperState, ActivityLog, PaperMetadata
from .manager import ScraperManager
@shared_task(bind=True)
def hourly_scraper_scheduler(self):
def hourly_scraper_scheduler():
"""
Hourly task that schedules paper processing at random times within the hour.
@ -29,8 +28,6 @@ def hourly_scraper_scheduler(self):
status="info",
description="Hourly scheduler skipped - scraper not active"
)
# Disable retries for inactive scheduler
self.retry = False
return {"status": "inactive", "papers_scheduled": 0}
if scraper_state.is_paused:
@ -39,8 +36,6 @@ def hourly_scraper_scheduler(self):
status="info",
description="Hourly scheduler skipped - scraper paused"
)
# Disable retries for paused scheduler
self.retry = False
return {"status": "paused", "papers_scheduled": 0}
# Initialize scraper manager
@ -57,6 +52,15 @@ def hourly_scraper_scheduler(self):
)
return {"status": "empty", "papers_scheduled": 0}
# Get scheduler from Flask app config
scheduler = current_app.config.get('SCHEDULER')
if not scheduler:
ActivityLog.log_error(
error_message="APScheduler not available for paper scheduling",
source="hourly_scraper_scheduler"
)
return {"status": "error", "message": "APScheduler not available"}
# Schedule papers at random times within the hour (0-3600 seconds)
scheduled_count = 0
current_time = datetime.now()
@ -64,24 +68,27 @@ def hourly_scraper_scheduler(self):
for paper in papers:
# Random delay between 1 second and 58 minutes
delay_seconds = random.randint(1, 3480) # Up to 58 minutes
run_date = current_time + timedelta(seconds=delay_seconds)
# Schedule the task using Celery's task registry to avoid circular import issues
from ..celery import celery
celery.send_task(
'scipaperloader.scrapers.tasks.process_single_paper',
# Schedule the task using APScheduler
job_id = f"paper_process_{paper.id}_{int(current_time.timestamp())}"
scheduler.add_job(
func=process_single_paper,
trigger='date',
run_date=run_date,
args=[paper.id],
countdown=delay_seconds
id=job_id,
replace_existing=True
)
scheduled_count += 1
# Log each scheduled paper
schedule_time = current_time + timedelta(seconds=delay_seconds)
ActivityLog.log_scraper_activity(
action="schedule_paper",
paper_id=paper.id,
status="info",
description=f"Scheduled paper {paper.doi} for processing at {schedule_time.strftime('%H:%M:%S')}"
description=f"Scheduled paper {paper.doi} for processing at {run_date.strftime('%H:%M:%S')}"
)
ActivityLog.log_scraper_activity(
@ -100,8 +107,7 @@ def hourly_scraper_scheduler(self):
return {"status": "error", "message": str(e)}
@shared_task(bind=True)
def process_single_paper(self, paper_id: int):
def process_single_paper(paper_id: int):
"""
Process a single paper. This task is scheduled at random times within each hour.
@ -109,17 +115,17 @@ def process_single_paper(self, paper_id: int):
paper_id: ID of the paper to process
"""
try:
# Double-check scraper state before processing
# ENHANCED RACE CONDITION PROTECTION: Check scraper state multiple times
# Initial check before any processing
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active:
ActivityLog.log_scraper_activity(
action="process_single_paper",
paper_id=paper_id,
status="skipped",
description="Skipped processing - scraper not active"
description="Task skipped - scraper not active (initial check)"
)
# Use Celery's ignore to mark this task as completed without error
self.retry = False
return {"status": "inactive", "paper_id": paper_id}
if scraper_state.is_paused:
@ -127,18 +133,50 @@ def process_single_paper(self, paper_id: int):
action="process_single_paper",
paper_id=paper_id,
status="skipped",
description="Skipped processing - scraper paused"
description="Task skipped - scraper paused (initial check)"
)
return {"status": "paused", "paper_id": paper_id}
# Brief pause to allow stop commands to take effect
import time
time.sleep(0.1)
# Second check after brief delay
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active:
ActivityLog.log_scraper_activity(
action="process_single_paper",
paper_id=paper_id,
status="skipped",
description="Task skipped - scraper not active (secondary check)"
)
return {"status": "inactive", "paper_id": paper_id}
if scraper_state.is_paused:
ActivityLog.log_scraper_activity(
action="process_single_paper",
paper_id=paper_id,
status="skipped",
description="Task skipped - scraper paused (secondary check)"
)
# Use Celery's ignore for paused state too
self.retry = False
return {"status": "paused", "paper_id": paper_id}
# Get the paper
from ..models import PaperMetadata
paper = PaperMetadata.query.get(paper_id)
if not paper:
return {"status": "error", "message": f"Paper {paper_id} not found"}
# Third check before starting actual processing
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active:
ActivityLog.log_scraper_activity(
action="process_single_paper",
paper_id=paper_id,
status="skipped",
description="Task skipped - scraper not active (pre-processing check)"
)
return {"status": "inactive", "paper_id": paper_id}
# Process the paper using scraper manager
manager = ScraperManager()
result = manager.process_paper(paper)
@ -153,8 +191,48 @@ def process_single_paper(self, paper_id: int):
return {"status": "error", "paper_id": paper_id, "message": str(e)}
@shared_task(bind=True)
def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] = None):
def process_single_paper_manual(paper_id: int, scraper_name: Optional[str] = None):
"""
Process a single paper manually, bypassing scraper state checks.
Used for manual paper processing from the UI.
Args:
paper_id: ID of the paper to process
scraper_name: Optional specific scraper module to use
"""
try:
# Get the paper without checking scraper state
paper = PaperMetadata.query.get(paper_id)
if not paper:
ActivityLog.log_error(
error_message=f"Paper {paper_id} not found for manual processing",
source="process_single_paper_manual"
)
return {"status": "error", "message": f"Paper {paper_id} not found"}
# Process the paper using the manual processing method (bypasses state checks)
manager = ScraperManager()
result = manager.process_paper_manual(paper, scraper_name=scraper_name)
ActivityLog.log_scraper_activity(
action="manual_process_complete",
paper_id=paper_id,
status=result["status"],
description=f"Manual processing completed for paper {paper.doi}" +
(f" using scraper '{scraper_name}'" if scraper_name else " using system default scraper")
)
return result
except Exception as e:
ActivityLog.log_error(
error_message=f"Error manually processing paper {paper_id}: {str(e)}",
source="process_single_paper_manual"
)
return {"status": "error", "paper_id": paper_id, "message": str(e)}
def process_papers_batch(paper_ids: list, scraper_module: Optional[str] = None):
"""
Process multiple papers in a batch for immediate processing.
@ -167,7 +245,6 @@ def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] =
manager = ScraperManager()
for paper_id in paper_ids:
from ..models import PaperMetadata
paper = PaperMetadata.query.get(paper_id)
if paper:
result = manager.process_paper(paper)

View File

@ -0,0 +1,237 @@
import time
import os
from datetime import datetime
from typing import Optional
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
from ..parsers.base_parser import BaseParser, ParseError
from ..parsers.elsevier_parser import ElsevierParser
from ..parsers.arxiv_parser import ArxivParser
class Scraper(BaseScraper):
"""Full text extraction scraper that uses publisher-specific parsers."""
# This scraper processes papers with HTML content and outputs "TextExtracted"/"Failed"
INPUT_STATUSES = ["WebContentDownloaded", "PublisherDetected"]
OUTPUT_STATUS_SUCCESS = "TextExtracted"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "ExtractingText"
def __init__(self):
super().__init__()
# Registry of available parsers
self.parsers = [
ElsevierParser(),
ArxivParser(),
# Add more parsers here as you create them
# SpringerParser(),
# WileyParser(),
# IEEEParser(),
]
def scrape(self, doi: str) -> ScrapeResult:
"""Extract full text using appropriate publisher parser."""
start_time = time.time()
paper = PaperMetadata.query.filter_by(doi=doi).first()
if not paper:
return ScrapeResult(
status="error",
message=f"No paper found for DOI {doi}",
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Log start of scraping
self.log_scrape_start(doi, paper.id)
# Update status to processing
paper.status = self.OUTPUT_STATUS_PROCESSING
db.session.commit()
# Check if HTML file exists
if not paper.file_path or not os.path.exists(paper.file_path):
error_msg = f"HTML file not found for DOI {doi}. Expected at: {paper.file_path}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "html_file_not_found"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
try:
# Read HTML content
with open(paper.file_path, 'r', encoding='utf-8') as f:
html_content = f.read()
# Find appropriate parser
parser = self._select_parser(html_content)
if not parser:
error_msg = f"No suitable parser found for DOI {doi}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "no_parser_available"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Parse content
parsed_content = parser.parse(html_content, doi)
# Validate parsed content
if not parser.validate_content(parsed_content):
error_msg = f"Parsed content validation failed for DOI {doi}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "content_validation_failed"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Save extracted text to file
text_file_path = self._save_extracted_text(parsed_content, doi)
# Update paper status to success
paper.status = self.OUTPUT_STATUS_SUCCESS
paper.error_msg = None
# You might want to add a text_file_path field to store the text file location
# paper.text_file_path = text_file_path
db.session.commit()
success_msg = f"Successfully extracted text using {parser.get_name()} parser"
self.log_scrape_success(doi, success_msg, paper.id)
return ScrapeResult(
status="success",
message=f"Successfully extracted full text for {doi}",
data={
"text_file_path": text_file_path,
"parser_used": parser.get_name(),
"title": parsed_content.title,
"word_count": len(parsed_content.full_text.split()),
"has_abstract": bool(parsed_content.abstract),
"has_sections": bool(parsed_content.sections),
"author_count": len(parsed_content.authors) if parsed_content.authors else 0,
"keyword_count": len(parsed_content.keywords) if parsed_content.keywords else 0,
"reference_count": len(parsed_content.references) if parsed_content.references else 0
},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except ParseError as e:
error_msg = f"Parser error for DOI {doi}: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "parser_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except Exception as e:
error_msg = f"Unexpected error extracting text for DOI {doi}: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "extraction_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
def _select_parser(self, html_content: str) -> Optional[BaseParser]:
"""
Select the most appropriate parser for the HTML content.
Args:
html_content: The HTML content to analyze
Returns:
The best parser for this content, or None if no parser can handle it
"""
for parser in self.parsers:
if parser.can_parse(html_content):
return parser
return None
def _save_extracted_text(self, parsed_content, doi: str) -> str:
"""
Save extracted text to a file.
Args:
parsed_content: The parsed content object
doi: The DOI of the paper
Returns:
Path to the saved text file
"""
download_path = DownloadPathConfig.get_path()
text_file_name = f"{doi.replace('/', '_')}_fulltext.txt"
text_file_path = os.path.join(download_path, text_file_name)
with open(text_file_path, 'w', encoding='utf-8') as f:
# Write structured content
f.write(f"DOI: {parsed_content.doi or doi}\n")
f.write(f"Title: {parsed_content.title or 'Unknown'}\n")
f.write(f"Journal: {parsed_content.journal or 'Unknown'}\n")
f.write(f"Publication Date: {parsed_content.publication_date or 'Unknown'}\n")
if parsed_content.authors:
f.write(f"Authors: {', '.join(parsed_content.authors)}\n")
if parsed_content.keywords:
f.write(f"Keywords: {', '.join(parsed_content.keywords)}\n")
f.write(f"Extracted: {datetime.utcnow().isoformat()}\n")
f.write("=" * 80 + "\n\n")
# Write full text
f.write(parsed_content.full_text)
# Optionally write references at the end
if parsed_content.references:
f.write("\n\n" + "=" * 80 + "\n")
f.write("REFERENCES\n")
f.write("=" * 80 + "\n")
for i, ref in enumerate(parsed_content.references, 1):
f.write(f"{i}. {ref}\n")
return text_file_path

View File

@ -0,0 +1,201 @@
import time
import os
import requests
from urllib.parse import urlparse
from datetime import datetime
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
class Scraper(BaseScraper):
"""Web fetcher scraper that downloads HTML content from DOI URLs."""
# This scraper processes "New" papers and outputs "WebContentDownloaded"/"Failed"
INPUT_STATUSES = ["New"]
OUTPUT_STATUS_SUCCESS = "WebContentDownloaded"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "FetchingWebContent"
def scrape(self, doi: str) -> ScrapeResult:
"""Fetch HTML content from DOI and save to download path."""
start_time = time.time()
paper = PaperMetadata.query.filter_by(doi=doi).first()
if not paper:
return ScrapeResult(
status="error",
message=f"No paper found for DOI {doi}",
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Log start of scraping
self.log_scrape_start(doi, paper.id)
# Update status to processing
paper.status = self.OUTPUT_STATUS_PROCESSING
db.session.commit()
# Prepare file paths
download_path = DownloadPathConfig.get_path()
file_name = f"{doi.replace('/', '_')}.html"
file_path = os.path.join(download_path, file_name)
# Check/create download directory
if not os.path.exists(download_path):
try:
os.makedirs(download_path, exist_ok=True)
except OSError as e:
error_msg = f"Failed to create download directory: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "path_creation_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Check path permissions
if not os.access(download_path, os.W_OK):
error_msg = f"Download path '{download_path}' is not writable"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "path_write_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
try:
# Fetch HTML from DOI
doi_url = f"https://doi.org/{doi}"
headers = {
'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
response = requests.get(
doi_url,
headers=headers,
timeout=30,
allow_redirects=True,
verify=True
)
# Check for invalid DOI (404) or other HTTP errors
if response.status_code == 404:
error_msg = f"Invalid DOI: {doi} not found (404)"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "invalid_doi"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Check for other HTTP errors
response.raise_for_status()
# Save HTML content
with open(file_path, 'w', encoding='utf-8') as f:
f.write(response.text)
# Extract final URL after redirects (for publisher detection)
final_url = response.url
# Update paper status to success
paper.status = self.OUTPUT_STATUS_SUCCESS
paper.file_path = file_path
paper.error_msg = None
db.session.commit()
# Log success
success_msg = f"Successfully fetched HTML content for {doi} from {final_url}"
self.log_scrape_success(doi, success_msg, paper.id)
return ScrapeResult(
status="success",
message=f"Successfully fetched HTML for {doi}",
data={
"file_path": file_path,
"final_url": final_url,
"content_length": len(response.text),
"content_type": response.headers.get('content-type', 'unknown'),
"title": paper.title,
"domain": urlparse(final_url).netloc if final_url else None
},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except requests.exceptions.HTTPError as e:
error_msg = f"HTTP error fetching {doi_url}: {e.response.status_code} - {e}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "http_error", "status_code": e.response.status_code},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except requests.exceptions.RequestException as e:
error_msg = f"Network error fetching {doi_url}: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "network_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except Exception as e:
error_msg = f"Failed to save HTML file: {str(e)}"
paper.status = self.OUTPUT_STATUS_FAILURE
paper.error_msg = error_msg
db.session.commit()
self.log_scrape_failure(doi, error_msg, paper.id)
return ScrapeResult(
status="error",
message=error_msg,
data={"error_code": "file_creation_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)

View File

@ -0,0 +1,384 @@
# JavaScript Modularization Documentation
## Overview
The JavaScript code in the SciPaperLoader application has been modularized into reusable components to improve maintainability, reduce code duplication, and enable easier testing and updates.
## Modularization Task Completed
### Problem Statement
The original codebase had roughly 800 lines of inline JavaScript scattered across multiple Jinja templates, which caused several critical issues:
- **Code Duplication**: Similar functionality replicated across templates
- **Maintenance Difficulty**: Changes required editing multiple template files
- **Linter Issues**: Jinja template syntax mixed with JavaScript caused linting errors
- **Testing Challenges**: Inline code was difficult to unit test
- **Poor Separation of Concerns**: Template logic mixed with application logic
### Solution Implemented
Successfully transformed the codebase by:
1. **Extracted 10 Modular JavaScript Files** (roughly 800 lines of code moved from templates)
2. **Eliminated Code Duplication** by creating reusable components
3. **Fixed Linter Compatibility** by separating template syntax from JavaScript logic
4. **Implemented Clean Variable Passing** using JSON script tags instead of direct Jinja embedding
5. **Created Class-Based Architecture** with proper inheritance and composition patterns
6. **Established Inter-Component Communication** through callback systems
7. **Added Comprehensive Error Handling** and loading states throughout
### Key Achievements
- ✅ **5 templates modularized**: `scraper.html.jinja`, `papers.html.jinja`, `upload.html.jinja`, `logger.html.jinja`, `config/schedule.html.jinja`
- ✅ **10 JavaScript modules created**: Covering all functionality from utilities to dashboard coordination
- ✅ **Zero functionality loss**: All existing features preserved during modularization
- ✅ **Improved maintainability**: Changes now require editing single module files
- ✅ **Enhanced testability**: Individual modules can be unit tested
- ✅ **Clean variable handling**: Jinja variables passed as JSON configuration instead of inline embedding
### Before vs After Example
**Before (inline in template)**:
```html
<script>
var maxVolume = {{ max_volume }}; // Linter error
$('#start-scraper').click(function() {
// 50+ lines of mixed template/JS code
});
</script>
```
**After (modular)**:
```html
<script type="application/json" id="config-data">
{"maxVolume": {{ max_volume|tojson }}}
</script>
<script src="{{ url_for('static', filename='js/scraper-control.js') }}"></script>
<script>
const config = JSON.parse(document.getElementById('config-data').textContent);
new ScraperControl(config).init();
</script>
```
## Modular JavaScript Files
### 1. `/static/js/common.js`
**Purpose**: Common utilities used across the application
**Key Functions**:
- `showFlashMessage(message, type)` - Display flash messages to users
- `createStatusBadge(status)` - Generate status badge HTML
- `formatTimestamp(timestamp)` - Format timestamps for display
- `truncateText(text, maxLength)` - Truncate text with ellipsis
- `toggleButtonLoading(button, loading, loadingText)` - Handle button loading states
- `apiRequest(url, options)` - Generic API request wrapper
**Used by**: All templates that need basic utilities
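A short usage sketch combining these helpers; the element IDs (`refreshBtn`, `latestStatus`) are illustrative placeholders, and the `/logs/api` endpoint mirrors the one already used by `activity-monitor.js`:
```javascript
// Refresh a small status line using the shared utilities.
async function refreshSummary() {
  const button = document.getElementById("refreshBtn");
  toggleButtonLoading(button, true, "Refreshing...");
  try {
    const data = await apiRequest("/logs/api?page=1&per_page=1");
    const latest = data.logs && data.logs[0];
    if (latest) {
      document.getElementById("latestStatus").innerHTML =
        `${formatTimestamp(latest.timestamp)} ${createStatusBadge(latest.status)}`;
    }
    showFlashMessage("Activity summary refreshed", "success");
  } catch (error) {
    showFlashMessage(`Refresh failed: ${error.message}`, "error");
  } finally {
    toggleButtonLoading(button, false);
  }
}
```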
### 2. `/static/js/modal-handler.js`
**Purpose**: Handle modal dialogs with dynamic content loading
**Key Features**:
- AJAX content loading
- Error handling
- Automatic click handler setup
- Bootstrap modal integration
**Used by**:
- `papers.html.jinja` (paper details modal)
- `logger.html.jinja` (log details modal)
### 3. `/static/js/form-handler.js`
**Purpose**: Handle form submissions with progress tracking
**Key Features**:
- Progress modal display
- Task status polling
- Error handling
- Customizable callbacks
**Used by**:
- `upload.html.jinja` (CSV upload form)
### 4. `/static/js/chart.js`
**Purpose**: Handle Chart.js activity visualization
**Key Features**:
- Chart initialization and rendering
- Data loading from API
- Error handling for missing Chart.js
**Used by**:
- `scraper.html.jinja` (activity charts)
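A usage sketch based on the `ActivityChart` class included later in this comparison; the canvas ID and the `/scraper/activity` endpoint are assumptions for illustration:
```javascript
// Create the chart against the canvas rendered by scraper.html.jinja,
// then feed it hourly stats plus the scraper state timeline.
const activityChart = new ActivityChart("activityChart");

async function refreshChart(hours) {
  // render() expects { hourly_stats: [...], scraper_timeline: [...] }.
  const data = await apiRequest(`/scraper/activity?hours=${hours}`);
  activityChart.render(data);
}

refreshChart(24);
```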
### 5. `/static/js/scraper-control.js`
**Purpose**: Handle scraper control operations (start/stop/pause/reset)
**Key Features**:
- Status polling
- Volume configuration
- Callback system for refreshing other components
**Used by**:
- `scraper.html.jinja`
### 6. `/static/js/paper-processor.js`
**Purpose**: Handle paper search and processing functionality
**Key Features**:
- Paper search
- Single paper processing
- Status polling
- Scraper selection
**Used by**:
- `scraper.html.jinja`
### 7. `/static/js/activity-monitor.js`
**Purpose**: Handle activity log display and real-time notifications
**Key Features**:
- Activity log loading
- Real-time updates
- Notification management
**Used by**:
- `scraper.html.jinja`
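Typical wiring, based on the `ActivityMonitor` class included further down in this comparison:
```javascript
// ActivityMonitor binds to #activityLog and the pagination controls in
// scraper.html.jinja and starts polling for new papers on construction.
const activityMonitor = new ActivityMonitor();

// Redraw the activity chart whenever new scraper events arrive
// (pairs with the ActivityChart sketch above).
activityMonitor.setChartRefreshCallback((hours) => refreshChart(hours || 24));

// Initial load of the paginated activity table.
activityMonitor.loadRecentActivity();
```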
### 8. `/static/js/scraper-dashboard.js`
**Purpose**: Coordinate all scraper dashboard components
**Key Features**:
- Component initialization
- Inter-component communication
- Configuration management
**Used by**:
- `scraper.html.jinja`
### 9. `/static/js/config-handler.js`
**Purpose**: Handle configuration forms and Alpine.js integration
**Key Features**:
- Configuration API calls
- Alpine.js data objects
- Schedule management
- Volume updates
**Used by**:
- `config/schedule.html.jinja`
## Template Updates
### Templates Using Modular JavaScript
1. **scraper.html.jinja**
- Uses all scraper-related modules
- Passes Jinja variables as configuration parameters
- Initializes dashboard with `initScraperDashboard(config)`
2. **papers.html.jinja**
- Uses `modal-handler.js` for paper detail modals
- Simplified from custom modal code to single line initialization
3. **upload.html.jinja**
- Uses `form-handler.js` for upload progress tracking
- Custom result display function
- Automatic task status polling
4. **logger.html.jinja**
- Uses `modal-handler.js` for log detail modals
- Custom URL construction for log endpoints
5. **config/schedule.html.jinja**
- Uses `config-handler.js` for Alpine.js integration
- Modular schedule management functions
## Benefits of Modularization
### 1. **Reusability**
- Modal functionality shared between papers and logger templates
- Common utilities used across all templates
- Form handling can be reused for other forms
### 2. **Maintainability**
- Single place to update common functionality
- Clear separation of concerns
- Easier debugging and testing
### 3. **Parameter Passing**
- Jinja variables passed as configuration objects
- No more hardcoded values in JavaScript
- Environment-specific settings easily configurable
### 4. **Extensibility**
- Easy to add new functionality to existing modules
- New templates can easily use existing modules
- Plugin-like architecture for components
## Usage Examples
### Basic Modal Usage
```javascript
const modal = new ModalHandler('modalId', 'contentElementId');
modal.setupClickHandlers('.clickable-items');
```
### Form with Progress Tracking
```javascript
const formHandler = new FormHandler('formId', {
onSuccess: (result) => console.log('Success:', result),
onError: (error) => console.log('Error:', error)
});
```
### Configuration Management
```javascript
// In Alpine.js template
x-data="configHandler.createScheduleManager(initialData, volume)"
```
## Migration Notes
### Old vs New Approach
**Before**: Inline JavaScript in each template
```html
<script>
document.addEventListener('DOMContentLoaded', function() {
// Lots of inline JavaScript code
});
</script>
```
**After**: Modular imports with configuration
```html
<script src="{{ url_for('static', filename='js/common.js') }}"></script>
<script src="{{ url_for('static', filename='js/modal-handler.js') }}"></script>
<script>
const modal = new ModalHandler('modalId', 'contentId');
modal.setupClickHandlers('.links');
</script>
```
### Jinja Variable Handling
To properly separate Jinja template variables from JavaScript code and avoid linting issues, we use a clean JSON configuration approach:
**Before**: Variables embedded directly in JavaScript (causes linting issues)
```javascript
if (volume > {{ max_volume }}) {
// Error handling - JSLint will complain about {{ }}
}
```
**After**: Clean separation using JSON script tags
```html
<!-- Jinja variables in JSON format -->
<script type="application/json" id="config-data">
{
"maxVolume": {{ max_volume|tojson }},
"currentVolume": {{ volume|tojson }},
"apiUrl": {{ url_for('api.endpoint')|tojson }},
"csrfToken": {{ csrf_token()|tojson }}
}
</script>
<!-- Clean JavaScript that reads the configuration -->
<script>
document.addEventListener('DOMContentLoaded', function() {
const config = JSON.parse(document.getElementById('config-data').textContent);
const handler = new VolumeHandler(config);
});
</script>
```
**Benefits of this approach**:
- **Linter-friendly**: No template syntax in JavaScript files
- **Type-safe**: JSON ensures proper data types
- **Maintainable**: Clear separation of concerns
- **Secure**: Automatic escaping with `|tojson` filter
- **Debuggable**: Easy to inspect configuration in DevTools
**Real-world example from scraper.html.jinja**:
```html
<script type="application/json" id="scraper-config">
{
"statusUrl": {{ url_for('api.scraper_status')|tojson }},
"startUrl": {{ url_for('api.start_scraper')|tojson }},
"volume": {{ volume|tojson }},
"scraperType": {{ scraper_type|tojson }},
"csrfToken": {{ csrf_token()|tojson }}
}
</script>
<script>
const config = JSON.parse(document.getElementById('scraper-config').textContent);
initScraperDashboard(config);
</script>
```
## Future Improvements
### Potential Enhancements
1. **Bundle Management**: Consider using webpack or similar for production builds
2. **Unit Testing**: Add comprehensive test suite for individual modules
3. **JSDoc Comments**: Add detailed documentation for better IDE support
4. **Centralized Error Reporting**: Implement global error handling system
5. **Performance Optimization**: Implement lazy loading for non-critical modules
6. **TypeScript Migration**: Consider migrating to TypeScript for better type safety
### Adding New Modules
When creating new JavaScript modules:
1. Follow the established class-based pattern
2. Include proper error handling
3. Use the configuration pattern for Jinja variables
4. Add documentation to this README
5. Update templates to use the new module
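A minimal skeleton following these conventions; the module name, element ID, endpoint, and response field are hypothetical:
```javascript
/**
 * Example skeleton for a new module (hypothetical ExportHandler).
 */
class ExportHandler {
  constructor(config) {
    // Configuration comes from a JSON script tag, never from inline Jinja.
    this.exportUrl = config.exportUrl;
    this.button = document.getElementById("exportBtn");
  }

  init() {
    if (!this.button) return;
    this.button.addEventListener("click", () => this.exportPapers());
  }

  async exportPapers() {
    toggleButtonLoading(this.button, true, "Exporting...");
    try {
      const result = await apiRequest(this.exportUrl, { method: "POST" });
      showFlashMessage(`Exported ${result.count} papers`, "success");
    } catch (error) {
      showFlashMessage(`Export failed: ${error.message}`, "error");
    } finally {
      toggleButtonLoading(this.button, false);
    }
  }
}

// Export for use in other modules (same pattern as the existing files).
if (typeof window !== "undefined") {
  window.ExportHandler = ExportHandler;
}
```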
## Testing
A test file `test_js_modularization.py` has been created to verify the modularization. To run comprehensive testing:
```bash
python test_js_modularization.py
```
This will verify:
- All JavaScript files exist and are properly formatted
- Templates correctly reference the modular files
- Configuration patterns are properly implemented
- No inline JavaScript remains in templates
## Maintenance
### When Making Changes
1. **Update Single Module**: Changes to functionality only require editing one file
2. **Test Affected Templates**: Ensure all templates using the module still work
3. **Update Documentation**: Keep this README current with any changes
4. **Consider Dependencies**: Check if changes affect other modules
### File Organization
```
/static/js/
├── README.md # This documentation
├── common.js # Shared utilities
├── modal-handler.js # Modal functionality
├── form-handler.js # Form processing
├── chart.js # Chart visualization
├── scraper-control.js # Scraper operations
├── paper-processor.js # Paper management
├── activity-monitor.js # Activity tracking
├── scraper-dashboard.js # Dashboard coordination
├── config-handler.js # Configuration management
└── table-handler.js # Table utilities
```
## Migration Summary
The modularization successfully transformed **roughly 800 lines of inline JavaScript** from templates into a maintainable, reusable module system. This improvement provides:
- **Enhanced maintainability** through single-responsibility modules
- **Reduced code duplication** via shared utility functions
- **Improved linter compatibility** by separating template and JavaScript concerns
- **Better testability** with isolated, unit-testable modules
- **Cleaner templates** with minimal, configuration-only JavaScript
- **Easier debugging** with clearly separated concerns and proper error handling
All existing functionality has been preserved while significantly improving the codebase architecture and developer experience.

View File

@ -0,0 +1,328 @@
/**
* Activity monitoring and display functionality
*/
class ActivityMonitor {
constructor() {
this.activityLog = document.getElementById("activityLog");
this.notificationsToggle = document.getElementById("notificationsToggle");
this.notificationsEnabled = true;
this.lastPaperTimestamp = new Date().toISOString();
// Pagination state
this.currentPage = 1;
this.perPage = 20;
this.statusFilter = "";
this.totalPages = 1;
this.totalEntries = 0;
// Pagination elements
this.paginationContainer = document.getElementById("activityPagination");
this.paginationInfo = document.getElementById("activityPaginationInfo");
this.prevPageBtn = document.getElementById("activityPrevPage");
this.nextPageBtn = document.getElementById("activityNextPage");
this.currentPageSpan = document.getElementById("activityCurrentPage");
this.pageSizeSelect = document.getElementById("activityPageSize");
this.statusFilterSelect = document.getElementById("activityStatusFilter");
this.initEventListeners();
this.setupWebSocket();
}
/**
* Initialize event listeners
*/
initEventListeners() {
if (this.notificationsToggle) {
this.notificationsToggle.addEventListener("click", () => {
this.notificationsEnabled = this.notificationsToggle.checked;
});
}
// Time range buttons
document.querySelectorAll(".time-range-btn").forEach((btn) => {
btn.addEventListener("click", () => {
document
.querySelectorAll(".time-range-btn")
.forEach((b) => b.classList.remove("active"));
btn.classList.add("active");
const currentTimeRange = parseInt(btn.dataset.hours);
// Trigger chart refresh if callback is provided
if (this.onChartRefresh) {
this.onChartRefresh(currentTimeRange);
}
});
});
// Pagination event listeners
if (this.prevPageBtn) {
this.prevPageBtn.addEventListener("click", (e) => {
e.preventDefault();
if (this.currentPage > 1) {
this.currentPage--;
this.loadRecentActivity();
}
});
}
if (this.nextPageBtn) {
this.nextPageBtn.addEventListener("click", (e) => {
e.preventDefault();
if (this.currentPage < this.totalPages) {
this.currentPage++;
this.loadRecentActivity();
}
});
}
// Page size change
if (this.pageSizeSelect) {
this.pageSizeSelect.addEventListener("change", () => {
this.perPage = parseInt(this.pageSizeSelect.value);
this.currentPage = 1; // Reset to first page
this.loadRecentActivity();
});
}
// Status filter change
if (this.statusFilterSelect) {
this.statusFilterSelect.addEventListener("change", () => {
this.statusFilter = this.statusFilterSelect.value;
this.currentPage = 1; // Reset to first page
this.loadRecentActivity();
});
}
}
/**
* Load and render recent activity
*/
async loadRecentActivity() {
if (!this.activityLog) return;
try {
// Build query parameters for pagination
const params = new URLSearchParams({
page: this.currentPage,
per_page: this.perPage,
});
// Add multiple category parameters
params.append("category", "scraper_activity");
params.append("category", "scraper_command");
if (this.statusFilter) {
params.append("status", this.statusFilter);
}
const data = await apiRequest(`/logs/api?${params.toString()}`);
if (data.success) {
this.renderActivityLog(data.logs);
this.updatePagination(data.pagination);
console.log("Activity log refreshed with latest data");
} else {
throw new Error(data.message || "Failed to load logs");
}
} catch (error) {
console.error("Failed to load activity logs:", error);
// If the API endpoint doesn't exist, just show a message
this.activityLog.innerHTML =
'<tr><td colspan="4" class="text-center">Activity log API not available</td></tr>';
this.hidePagination();
}
}
/**
* Render activity log data
* @param {Array} logs - Array of log entries
*/
renderActivityLog(logs) {
if (!this.activityLog) return;
this.activityLog.innerHTML = "";
if (!logs || logs.length === 0) {
this.activityLog.innerHTML =
'<tr><td colspan="4" class="text-center">No recent activity</td></tr>';
return;
}
logs.forEach((log) => {
const row = document.createElement("tr");
// Format timestamp
const timeStr = formatTimestamp(log.timestamp);
// Create status badge
const statusBadge = createStatusBadge(log.status);
row.innerHTML = `
<td>${timeStr}</td>
<td>${log.action}</td>
<td>${statusBadge}</td>
<td>${log.description || ""}</td>
`;
this.activityLog.appendChild(row);
});
}
/**
* Update pagination controls based on API response
* @param {Object} pagination - Pagination data from API
*/
updatePagination(pagination) {
if (!pagination || !this.paginationContainer) return;
this.currentPage = pagination.page;
this.totalPages = pagination.pages;
this.totalEntries = pagination.total;
// Show pagination container
this.paginationContainer.classList.remove("d-none");
// Update pagination info
const startEntry = (pagination.page - 1) * pagination.per_page + 1;
const endEntry = Math.min(
pagination.page * pagination.per_page,
pagination.total
);
if (this.paginationInfo) {
this.paginationInfo.textContent = `Showing ${startEntry} - ${endEntry} of ${pagination.total} entries`;
}
// Update current page display
if (this.currentPageSpan) {
this.currentPageSpan.textContent = `${pagination.page} of ${pagination.pages}`;
}
// Update previous button
if (this.prevPageBtn) {
if (pagination.has_prev) {
this.prevPageBtn.classList.remove("disabled");
this.prevPageBtn.querySelector("a").removeAttribute("tabindex");
this.prevPageBtn
.querySelector("a")
.setAttribute("aria-disabled", "false");
} else {
this.prevPageBtn.classList.add("disabled");
this.prevPageBtn.querySelector("a").setAttribute("tabindex", "-1");
this.prevPageBtn
.querySelector("a")
.setAttribute("aria-disabled", "true");
}
}
// Update next button
if (this.nextPageBtn) {
if (pagination.has_next) {
this.nextPageBtn.classList.remove("disabled");
this.nextPageBtn.querySelector("a").removeAttribute("tabindex");
this.nextPageBtn
.querySelector("a")
.setAttribute("aria-disabled", "false");
} else {
this.nextPageBtn.classList.add("disabled");
this.nextPageBtn.querySelector("a").setAttribute("tabindex", "-1");
this.nextPageBtn
.querySelector("a")
.setAttribute("aria-disabled", "true");
}
}
}
/**
* Hide pagination controls when not needed
*/
hidePagination() {
if (this.paginationContainer) {
this.paginationContainer.classList.add("d-none");
}
}
/**
* Setup WebSocket for real-time notifications
*/
setupWebSocket() {
// If WebSocket is available, implement it here
// For now we'll poll the server periodically for new papers
setInterval(() => this.checkForNewPapers(), 10000); // Check every 10 seconds
}
/**
* Check for new papers and show notifications
*/
async checkForNewPapers() {
if (!this.notificationsEnabled) return;
try {
// Use the API endpoint for checking new papers, with limit for efficiency
const data = await apiRequest(
`/logs/api?category=scraper_activity&category=scraper_command&action=scrape_paper&after=${this.lastPaperTimestamp}&limit=5`
);
if (data && data.length > 0) {
// Update the timestamp
this.lastPaperTimestamp = new Date().toISOString();
// Show notifications for new papers
data.forEach((log) => {
const extraData = log.extra_data ? JSON.parse(log.extra_data) : {};
if (log.status === "success") {
showFlashMessage(
`New paper scraped: ${extraData.title || "Unknown title"}`,
"success"
);
} else if (log.status === "error") {
showFlashMessage(
`Failed to scrape paper: ${log.description}`,
"error"
);
}
});
// Refresh the activity chart and log
if (this.onChartRefresh) {
this.onChartRefresh();
}
// Only reload if we're on page 1 to avoid disrupting user navigation
if (this.currentPage === 1) {
this.loadRecentActivity();
}
}
} catch (error) {
// If the API endpoint doesn't exist, do nothing
console.debug("Activity polling failed (this may be expected):", error);
}
}
/**
* Set callback for chart refresh
*/
setChartRefreshCallback(callback) {
this.onChartRefresh = callback;
}
/**
* Refresh activity log manually (useful for external triggers)
*/
refresh() {
this.loadRecentActivity();
}
/**
* Reset pagination to first page
*/
resetToFirstPage() {
this.currentPage = 1;
this.loadRecentActivity();
}
}
// Export for use in other modules
if (typeof window !== "undefined") {
window.ActivityMonitor = ActivityMonitor;
}

View File

@ -0,0 +1,436 @@
/**
* Chart utilities for activity visualization
*/
class ActivityChart {
constructor(canvasId) {
this.canvasId = canvasId;
this.chart = null;
this.scraperChart = null;
this.initChart();
}
initChart() {
// Check if Chart.js is available
if (typeof Chart === "undefined") {
console.error("Chart.js is not loaded");
return;
}
const chartElement = document.getElementById(this.canvasId);
if (!chartElement) {
console.error(
`Chart canvas element with id "${this.canvasId}" not found`
);
return;
}
// Set canvas height directly
chartElement.style.height = "300px";
chartElement.height = 300;
this.ctx = chartElement.getContext("2d");
// Initialize scraper activity chart
this.initScraperChart();
}
initScraperChart() {
const scraperChartElement = document.getElementById("scraperActivityChart");
if (!scraperChartElement) {
console.warn("Scraper activity chart element not found");
return;
}
this.scraperCtx = scraperChartElement.getContext("2d");
}
/**
* Render the activity chart with provided data
* @param {Object} data - Chart data object with hourly_stats and scraper_timeline
*/
render(data) {
if (!this.ctx) {
console.error("Chart context not available");
return;
}
console.log("Render received data:", data);
// Handle both old and new data formats for compatibility
const hourlyStats = data.hourly_stats || data;
const scraperTimeline = data.scraper_timeline || [];
console.log("Extracted hourlyStats:", hourlyStats);
console.log("Extracted scraperTimeline:", scraperTimeline);
// Extract the data for the main chart (papers only)
const labels = hourlyStats.map((item) => item.hour);
const successData = hourlyStats.map((item) => item.success);
const errorData = hourlyStats.map((item) => item.error);
const pendingData = hourlyStats.map((item) => item.pending);
// Destroy existing charts if they exist
if (this.chart) {
this.chart.destroy();
}
if (this.scraperChart) {
this.scraperChart.destroy();
}
// Render main chart (papers only)
this.chart = new Chart(this.ctx, {
type: "bar",
data: {
labels: labels,
datasets: [
{
label: "Success",
data: successData,
backgroundColor: "#28a745",
stack: "Papers",
},
{
label: "Error",
data: errorData,
backgroundColor: "#dc3545",
stack: "Papers",
},
{
label: "Pending",
data: pendingData,
backgroundColor: "#ffc107",
stack: "Papers",
},
],
},
options: {
responsive: true,
maintainAspectRatio: true,
aspectRatio: 2.5,
layout: {
padding: {
top: 20,
bottom: 20,
},
},
plugins: {
legend: {
position: "top",
},
tooltip: {
mode: "index",
intersect: false,
},
},
scales: {
x: {
stacked: true,
title: {
display: true,
text: "Time (Last Hours)",
},
},
y: {
type: "linear",
display: true,
stacked: true,
beginAtZero: true,
title: {
display: true,
text: "Papers Scraped",
},
},
},
},
});
// Render scraper activity timeline chart with precise timing
this.renderScraperChart(labels, scraperTimeline, hourlyStats.length);
// Show simple legend for scraper activity
this.showScraperStateLegend();
}
/**
* Render the separate scraper activity timeline chart with precise timestamps
* @param {Array} hourLabels - Hour labels for main chart
* @param {Array} scraperTimeline - Timeline of scraper state changes
* @param {number} totalHours - Total hours range being displayed
*/
renderScraperChart(hourLabels, scraperTimeline, totalHours) {
if (!this.scraperCtx) {
console.warn("Scraper chart context not available");
return;
}
let timelineData = [];
if (scraperTimeline && scraperTimeline.length > 0) {
console.log("Original scraper timeline:", scraperTimeline);
// Filter out duplicate events with the same action, status, and hours_ago
const uniqueTimeline = scraperTimeline.filter((event, index, self) => {
return (
index ===
self.findIndex(
(e) =>
e.action === event.action &&
e.status === event.status &&
e.hours_ago === event.hours_ago
)
);
});
console.log("Filtered unique timeline:", uniqueTimeline);
// Sort timeline by hours_ago (oldest first = highest hours_ago first)
const sortedTimeline = [...uniqueTimeline].sort(
(a, b) => b.hours_ago - a.hours_ago
);
console.log("Sorted scraper timeline:", sortedTimeline);
// Create simple timeline with relative positions
let currentState = 0;
// Use hours_ago directly as x-coordinates (inverted so recent is on right)
for (let i = 0; i < sortedTimeline.length; i++) {
const event = sortedTimeline[i];
console.log(`Processing event ${i}:`, event);
// Set the new state based on the action
if (event.action === "start_scraper" && event.status === "success") {
currentState = 1;
} else if (
event.action === "stop_scraper" &&
event.status === "success"
) {
currentState = 0;
} else if (
event.action === "reset_scraper" &&
event.status === "success"
) {
currentState = 0;
} else if (
event.action === "pause_scraper" &&
event.status === "success"
) {
currentState = 0; // Treat pause as inactive
}
console.log(
`New state for ${event.action}: ${currentState} at ${event.hours_ago}h ago`
);
// Use negative hours_ago so recent events are on the right
timelineData.push({
x: -event.hours_ago,
y: currentState,
});
}
// Add current time point
timelineData.push({
x: 0, // Current time
y: currentState,
});
console.log("Final timeline data:", timelineData);
} else {
// No timeline data, show as inactive for the full time range
timelineData = [
{ x: -totalHours, y: 0 }, // Start of time range
{ x: 0, y: 0 }, // Current time
];
}
// Ensure we always have data points at the boundaries for proper scaling
const hasStartPoint = timelineData.some(
(point) => point.x <= -totalHours + 1
);
const hasEndPoint = timelineData.some((point) => point.x >= -1);
if (!hasStartPoint) {
      // Extend the earliest known state back to the start of the time range
      const earliestState =
        timelineData.length > 0 ? timelineData[0].y : 0;
      timelineData.unshift({ x: -totalHours, y: earliestState });
}
if (!hasEndPoint) {
// Add a point near the current time with current state
const currentState =
timelineData.length > 0 ? timelineData[timelineData.length - 1].y : 0;
timelineData.push({ x: 0, y: currentState });
}
this.scraperChart = new Chart(this.scraperCtx, {
type: "line",
data: {
datasets: [
{
label: "Scraper Active",
data: timelineData,
borderColor: "#28a745",
backgroundColor: "rgba(40, 167, 69, 0.1)",
borderWidth: 3,
fill: true,
stepped: "before", // Creates step transitions
pointRadius: 5,
pointHoverRadius: 7,
pointBackgroundColor: "#28a745",
pointBorderColor: "#ffffff",
pointBorderWidth: 2,
tension: 0,
},
],
},
options: {
responsive: true,
maintainAspectRatio: true,
aspectRatio: 10,
layout: {
padding: {
top: 10,
bottom: 10,
},
},
plugins: {
legend: {
display: false,
},
tooltip: {
callbacks: {
label: function (context) {
const status =
context.parsed.y === 1 ? "Activated" : "Deactivated";
const timestamp = new Date();
timestamp.setHours(
timestamp.getHours() - Math.abs(context.parsed.x)
);
const formattedTime = timestamp.toLocaleString("en-GB", {
hour: "2-digit",
minute: "2-digit",
second: "2-digit",
day: "2-digit",
month: "2-digit",
year: "numeric",
});
return `Scraper: ${status} at ${formattedTime}`;
},
},
},
},
scales: {
x: {
type: "linear",
min: -totalHours,
max: 0,
title: {
display: true,
text: "Timeline (Hours Ago → Now)",
},
ticks: {
callback: function (value) {
if (value === 0) return "Now";
return `-${Math.abs(value)}h`;
},
stepSize: Math.max(1, Math.floor(totalHours / 8)), // Show reasonable number of ticks
},
grid: {
display: true,
},
},
y: {
type: "linear",
display: true,
beginAtZero: true,
max: 1.2,
min: -0.2,
title: {
display: true,
text: "Active Status",
},
ticks: {
stepSize: 1,
callback: function (value) {
return value === 1 ? "Active" : value === 0 ? "Inactive" : "";
},
},
grid: {
color: function (context) {
return context.tick.value === 0.5
? "rgba(0,0,0,0.1)"
: "rgba(0,0,0,0.05)";
},
},
},
},
},
});
}
/**
* Show a simple legend for scraper states
*/
showScraperStateLegend() {
let legendContainer = document.getElementById("scraper-state-legend");
if (!legendContainer) {
return;
}
legendContainer.classList.remove("d-none");
legendContainer.innerHTML = `
<small class="text-muted">
<i class="fas fa-info-circle"></i>
        The timeline below marks the exact times the scraper was started or stopped, plotted on a continuous time axis.
</small>
`;
}
/**
* Load and render chart data for specified time range
* @param {number} hours - Number of hours to show data for
*/
async loadData(hours) {
try {
const response = await fetch(`/scraper/stats?hours=${hours}`);
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
const data = await response.json();
console.log("Stats data loaded:", data);
this.render(data);
} catch (error) {
console.error("Failed to load activity stats:", error);
// Hide the chart or show an error message
const chartContainer = document.getElementById(
this.canvasId
).parentElement;
if (chartContainer) {
chartContainer.innerHTML =
'<p class="text-muted">Chart data unavailable</p>';
}
}
}
/**
* Destroy the chart instance
*/
destroy() {
if (this.chart) {
this.chart.destroy();
this.chart = null;
}
if (this.scraperChart) {
this.scraperChart.destroy();
this.scraperChart = null;
}
}
}
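For orientation, a minimal usage sketch of the chart class above, mirroring how the dashboard module later in this changeset wires it up; the canvas id and the 24-hour range are illustrative.
// Sketch: bind the chart to an existing <canvas id="activityChart"> element,
// load the last 24 hours of stats, and tear it down when leaving the page.
const activityChart = new ActivityChart("activityChart");
activityChart.loadData(24);
// ...later, when the view is discarded:
activityChart.destroy();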

View File

@@ -0,0 +1,175 @@
/**
* Common utilities for the SciPaperLoader application
*/
/**
* Display a flash message to the user as an overlay
* @param {string} message - The message to display
* @param {string} type - The type of message (success, error, warning, info)
* @param {number} duration - Duration in milliseconds (default: 5000)
*/
function showFlashMessage(message, type = "success", duration = 5000) {
const flashMsg = document.createElement("div");
const normalizedType = type === "error" ? "danger" : type;
flashMsg.className = `flash-overlay flash-${normalizedType}`;
// Get the appropriate icon based on type
const getIcon = (messageType) => {
switch (messageType) {
case "success":
return '<svg class="flash-icon" role="img" aria-label="Success:"><use xlink:href="#check-circle-fill"/></svg>';
case "danger":
return '<svg class="flash-icon" role="img" aria-label="Error:"><use xlink:href="#x-circle-fill"/></svg>';
case "warning":
return '<svg class="flash-icon" role="img" aria-label="Warning:"><use xlink:href="#exclamation-triangle-fill"/></svg>';
case "info":
return '<svg class="flash-icon" role="img" aria-label="Info:"><use xlink:href="#info-fill"/></svg>';
default:
return '<svg class="flash-icon" role="img" aria-label="Info:"><use xlink:href="#info-fill"/></svg>';
}
};
flashMsg.innerHTML = `
<div class="flash-content">
${getIcon(normalizedType)}
<div class="flash-message">${message}</div>
<button type="button" class="flash-close" onclick="removeFlashMessage(this.parentElement.parentElement)">×</button>
</div>
`;
// Add to page first
document.body.appendChild(flashMsg);
// Position all messages in stack
updateFlashMessagePositions();
// Auto dismiss
setTimeout(() => {
removeFlashMessage(flashMsg);
}, duration);
return flashMsg;
}
/**
* Remove a flash message and update positions
* @param {HTMLElement} flashMsg - The flash message element to remove
*/
function removeFlashMessage(flashMsg) {
if (!flashMsg || !flashMsg.parentNode) return;
flashMsg.classList.add("fade-out");
setTimeout(() => {
if (flashMsg.parentNode) {
flashMsg.remove();
updateFlashMessagePositions();
}
}, 300);
}
/**
* Update positions of all flash messages to create a proper stack
*/
function updateFlashMessagePositions() {
const messages = document.querySelectorAll(".flash-overlay:not(.fade-out)");
messages.forEach((msg, index) => {
const topPosition = 20 + index * 90; // 90px spacing between messages
msg.style.top = `${topPosition}px`;
msg.style.zIndex = 9999 - index; // Higher z-index for newer messages
});
}
/**
* Create a status badge HTML element
* @param {string} status - The status to create a badge for
* @returns {string} HTML string for the status badge
*/
function createStatusBadge(status) {
switch (status) {
case "New":
return '<span class="badge bg-info">New</span>';
case "Pending":
return '<span class="badge bg-warning text-dark">Pending</span>';
case "Done":
return '<span class="badge bg-success">Done</span>';
case "Failed":
return '<span class="badge bg-danger">Failed</span>';
case "success":
return '<span class="badge bg-success">Success</span>';
case "error":
return '<span class="badge bg-danger">Error</span>';
case "pending":
return '<span class="badge bg-warning text-dark">Pending</span>';
default:
return `<span class="badge bg-secondary">${status}</span>`;
}
}
/**
 * Format an ISO timestamp into a localized date/time string
 * @param {string} timestamp - ISO timestamp string
 * @returns {string} Formatted date/time string
*/
function formatTimestamp(timestamp) {
const date = new Date(timestamp);
return date.toLocaleTimeString("de-DE", {
year: "2-digit",
month: "numeric",
day: "numeric",
hour: "2-digit",
minute: "2-digit",
second: "2-digit",
});
}
/**
* Truncate text to a specified length
* @param {string} text - The text to truncate
* @param {number} maxLength - Maximum length before truncation
* @returns {string} Truncated text with ellipsis if needed
*/
function truncateText(text, maxLength) {
return text.length > maxLength ? text.substring(0, maxLength) + "..." : text;
}
/**
* Toggle button loading state
* @param {HTMLElement} button - The button element
* @param {boolean} loading - Whether to show loading state
* @param {string} loadingText - Text to show when loading
*/
function toggleButtonLoading(button, loading, loadingText = "Loading...") {
if (loading) {
button.disabled = true;
button.dataset.originalText = button.innerHTML;
button.innerHTML = `<i class="fas fa-spinner fa-spin"></i> ${loadingText}`;
} else {
button.disabled = false;
button.innerHTML = button.dataset.originalText || button.innerHTML;
}
}
/**
* Generic fetch wrapper with error handling
* @param {string} url - The URL to fetch
* @param {object} options - Fetch options
* @returns {Promise} Fetch promise
*/
async function apiRequest(url, options = {}) {
const defaultOptions = {
headers: {
"Content-Type": "application/json",
},
};
const mergedOptions = { ...defaultOptions, ...options };
try {
const response = await fetch(url, mergedOptions);
const data = await response.json();
return data;
} catch (error) {
console.error(`API request failed for ${url}:`, error);
throw error;
}
}
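A brief sketch of how these helpers compose, using the /scraper/status endpoint referenced elsewhere in this changeset; the function name and message wording are illustrative.
// Sketch: fetch JSON through the wrapper and surface the outcome as a flash overlay.
async function refreshScraperBadge() {
  try {
    const data = await apiRequest("/scraper/status");
    const active = data.scraper_state ? data.scraper_state.active : data.active;
    showFlashMessage(`Scraper active: ${active}`, "info", 3000);
  } catch (error) {
    showFlashMessage("Could not load scraper status", "error");
  }
}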

View File

@@ -0,0 +1,195 @@
/**
* Configuration utilities for handling settings and form submissions
*/
class ConfigHandler {
constructor(options = {}) {
this.options = {
apiEndpoint: options.apiEndpoint || "/config/api/update_config",
...options,
};
}
/**
* Update configuration via API
* @param {object} configData - Configuration data to send
* @returns {Promise} API response promise
*/
async updateConfig(configData) {
try {
const response = await fetch(this.options.apiEndpoint, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify(configData),
});
const data = await response.json();
if (data.success) {
showFlashMessage(
data.message || "Configuration updated successfully!",
"success"
);
} else {
const errorMessage =
data.updates?.[0]?.message ||
data.message ||
"Error updating configuration";
showFlashMessage(errorMessage, "error");
}
return data;
} catch (error) {
console.error("Error updating configuration:", error);
showFlashMessage("Network error occurred", "error");
throw error;
}
}
/**
* Update volume configuration
* @param {number} volume - New volume value
*/
async updateVolume(volume) {
return this.updateConfig({ volume: volume });
}
/**
* Update schedule configuration
* @param {object} schedule - Schedule configuration object
*/
async updateSchedule(schedule) {
return this.updateConfig({ schedule: schedule });
}
/**
* Create an Alpine.js data object for schedule management
* Reads configuration from JSON script tag in the template
* @returns {object} Alpine.js data object
*/
createScheduleManager() {
const self = this;
// Read configuration from JSON script tag
const configElement = document.getElementById("schedule-config");
const config = configElement ? JSON.parse(configElement.textContent) : {};
const initialSchedule = config.initialSchedule || {};
const volume = config.totalVolume || 0;
return {
schedule: { ...initialSchedule },
volume: volume,
selectedHours: [],
newWeight: 1.0,
volumeValue: volume,
isDragging: false,
dragOperation: null,
formatHour(h) {
return String(h).padStart(2, "0") + ":00";
},
async updateVolume() {
try {
const data = await self.updateVolume(this.volumeValue);
if (data.success) {
this.volume = parseFloat(this.volumeValue);
}
} catch (error) {
// Error handling is done in updateConfig
}
},
getBackgroundStyle(hour) {
const weight = parseFloat(this.schedule[hour]);
const maxWeight = 2.5;
// Normalize weight (0.0 to 1.0)
const t = Math.min(weight / maxWeight, 1.0);
// Interpolate HSL lightness: 95% (light) to 30% (dark)
const lightness = 95 - t * 65;
const backgroundColor = `hsl(210, 10%, ${lightness}%)`;
const textColor = t > 0.65 ? "white" : "black";
return {
backgroundColor,
color: textColor,
};
},
startDrag(event, hour) {
event.preventDefault();
this.isDragging = true;
this.dragOperation = this.isSelected(hour) ? "remove" : "add";
this.toggleSelect(hour);
},
dragSelect(hour) {
if (!this.isDragging) return;
const selected = this.isSelected(hour);
if (this.dragOperation === "add" && !selected) {
this.selectedHours.push(hour);
} else if (this.dragOperation === "remove" && selected) {
this.selectedHours = this.selectedHours.filter((h) => h !== hour);
}
},
endDrag() {
this.isDragging = false;
},
toggleSelect(hour) {
if (this.isSelected(hour)) {
this.selectedHours = this.selectedHours.filter((h) => h !== hour);
} else {
this.selectedHours.push(hour);
}
},
isSelected(hour) {
return this.selectedHours.includes(hour);
},
applyWeight() {
this.selectedHours.forEach((hour) => {
this.schedule[hour] = parseFloat(this.newWeight).toFixed(1);
});
this.selectedHours = [];
},
getTotalWeight() {
return Object.values(this.schedule).reduce(
(sum, w) => sum + parseFloat(w),
0
);
},
getPapersPerHour(hour) {
const total = this.getTotalWeight();
if (total === 0) return 0;
return (
(parseFloat(this.schedule[hour]) / total) *
this.volume
).toFixed(1);
},
async saveSchedule() {
try {
await self.updateSchedule(this.schedule);
} catch (error) {
// Error handling is done in updateConfig
}
},
};
}
}
/**
* Global instance for easy access
*/
window.configHandler = new ConfigHandler();
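A sketch of how the schedule manager is intended to be consumed from an Alpine.js v3 component (Alpine v3 and the component name are assumptions); the JSON config is read from the #schedule-config script tag as shown above.
// Sketch: register the schedule manager as an Alpine component once Alpine initializes.
document.addEventListener("alpine:init", () => {
  Alpine.data("scheduleManager", () =>
    window.configHandler.createScheduleManager()
  );
});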

View File

@@ -0,0 +1,231 @@
/**
* Form utilities for handling form submissions with progress tracking
*/
class FormHandler {
constructor(formId, options = {}) {
this.form = document.getElementById(formId);
this.options = {
progressModalId: "progressModal",
progressBarId: "progressBar",
progressStatusId: "progressStatus",
statusCheckInterval: 1000,
onSuccess: null,
onError: null,
onProgress: null,
...options,
};
this.progressModal = null;
this.progressBar = null;
this.progressStatus = null;
this.submitButton = null;
this.initElements();
this.initEventListeners();
}
/**
* Initialize DOM elements
*/
initElements() {
if (this.options.progressModalId) {
const modalElement = document.getElementById(
this.options.progressModalId
);
if (modalElement && typeof bootstrap !== "undefined") {
this.progressModal = new bootstrap.Modal(modalElement);
}
}
this.progressBar = document.getElementById(this.options.progressBarId);
this.progressStatus = document.getElementById(
this.options.progressStatusId
);
this.submitButton = this.form?.querySelector('button[type="submit"]');
}
/**
* Initialize event listeners
*/
initEventListeners() {
if (this.form) {
this.form.addEventListener("submit", (e) => this.handleSubmit(e));
}
}
/**
* Handle form submission
* @param {Event} e - Form submit event
*/
async handleSubmit(e) {
e.preventDefault();
// Show progress modal
this.showProgress();
this.updateProgress(5, "Starting...");
// Disable submit button
if (this.submitButton) {
this.submitButton.disabled = true;
}
const formData = new FormData(this.form);
try {
const response = await fetch(this.form.action, {
method: "POST",
body: formData,
});
const data = await response.json();
if (data.error) {
this.handleError(data.error);
return;
}
// Start polling for task status if task_id is provided
if (data.task_id) {
this.pollTaskStatus(data.task_id);
} else {
// Handle immediate response
this.handleSuccess(data);
}
} catch (error) {
console.error("Form submission failed:", error);
this.handleError("Form submission failed. Please try again.");
}
}
/**
* Poll task status for long-running operations
* @param {string} taskId - Task ID to poll
*/
async pollTaskStatus(taskId) {
const checkStatus = async () => {
try {
// Construct status URL - this should be customizable
const statusUrl = this.options.statusUrlTemplate
? this.options.statusUrlTemplate.replace("{taskId}", taskId)
: `/upload/task_status/${taskId}`;
const response = await fetch(statusUrl);
const status = await response.json();
console.log("Task status:", status);
if (status.state === "SUCCESS") {
this.updateProgress(100, "Completed!");
setTimeout(() => {
this.hideProgress();
this.handleSuccess(status.result);
}, 1000);
} else if (status.state === "FAILURE") {
this.updateProgress(100, "Failed!", true);
setTimeout(() => {
this.hideProgress();
this.handleError(status.error || "Unknown error occurred");
}, 1000);
} else {
// Update progress
const progress = status.progress || 0;
this.updateProgress(progress, `Processing... (${status.state})`);
// Continue polling
setTimeout(checkStatus, this.options.statusCheckInterval);
}
} catch (error) {
console.error("Failed to check task status:", error);
// Continue polling on error
setTimeout(checkStatus, this.options.statusCheckInterval);
}
};
checkStatus();
}
/**
* Show progress modal
*/
showProgress() {
if (this.progressModal) {
this.progressModal.show();
}
}
/**
* Hide progress modal
*/
hideProgress() {
if (this.progressModal) {
this.progressModal.hide();
}
}
/**
* Update progress display
* @param {number} percentage - Progress percentage (0-100)
* @param {string} message - Status message
* @param {boolean} isError - Whether this is an error state
*/
updateProgress(percentage, message, isError = false) {
if (this.progressBar) {
this.progressBar.style.width = `${percentage}%`;
this.progressBar.textContent = `${percentage}%`;
if (isError) {
this.progressBar.classList.add("bg-danger");
}
}
if (this.progressStatus) {
this.progressStatus.textContent = message;
}
// Call custom progress callback
if (this.options.onProgress) {
this.options.onProgress(percentage, message, isError);
}
}
/**
* Handle successful form submission
* @param {object} result - Success result data
*/
handleSuccess(result) {
// Re-enable submit button
if (this.submitButton) {
this.submitButton.disabled = false;
}
// Call custom success callback
if (this.options.onSuccess) {
this.options.onSuccess(result);
} else {
// Default success handling
showFlashMessage("Operation completed successfully!", "success");
}
}
/**
* Handle form submission error
* @param {string} error - Error message
*/
handleError(error) {
this.hideProgress();
// Re-enable submit button
if (this.submitButton) {
this.submitButton.disabled = false;
}
// Call custom error callback
if (this.options.onError) {
this.options.onError(error);
} else {
// Default error handling
showFlashMessage(`Error: ${error}`, "error");
}
}
}
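A minimal sketch of attaching the handler to an upload form; the form id and the success-callback field name are illustrative, while the status URL template matches the default used above.
// Sketch: progress-tracked submission for an upload form (ids are illustrative).
const uploadFormHandler = new FormHandler("uploadForm", {
  statusUrlTemplate: "/upload/task_status/{taskId}",
  onSuccess: (result) => {
    // "added_count" is a hypothetical field name for illustration only.
    showFlashMessage(`Upload finished: ${result.added_count || 0} papers added`, "success");
  },
});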

View File

@@ -0,0 +1,485 @@
/**
* Logger Manager - Modern activity log management for the unified logger view
*/
class LoggerManager {
constructor(options = {}) {
this.categories = options.categories || [];
this.initialFilters = options.initialFilters || {};
// Pagination state
this.currentPage = 1;
this.perPage = 50;
this.totalPages = 1;
this.totalEntries = 0;
// Current filter state
this.filters = { ...this.initialFilters };
// DOM elements
this.initElements();
this.initEventListeners();
// Apply initial filters and load data
this.applyInitialFilters();
this.loadLogs();
}
initElements() {
// Form elements
this.filtersForm = document.getElementById("filterForm");
this.categoryCheckboxes = document.querySelectorAll(".category-checkbox");
this.selectAllCategories = document.getElementById("selectAllCategories");
this.statusSelect = document.getElementById("statusFilter");
this.startDateInput = document.getElementById("startDate");
this.endDateInput = document.getElementById("endDate");
this.searchTermInput = document.getElementById("searchTerm");
this.clearFiltersBtn = document.getElementById("clearFilters");
this.downloadLogsBtn = document.getElementById("downloadLogs");
this.refreshLogsBtn = document.getElementById("refreshLogs");
// Logs display elements
this.logsTableBody = document.getElementById("logsTableBody");
this.pageSizeSelect = document.getElementById("pageSize");
// Pagination elements
this.paginationContainer = document.getElementById("logsPagination");
this.paginationInfo = document.getElementById("paginationDetails");
this.prevPageBtn = document.getElementById("prevPage");
this.nextPageBtn = document.getElementById("nextPage");
this.currentPageSpan = document.getElementById("currentPageSpan");
// Modal
this.logModal = new ModalHandler("logDetailModal", "log-detail-content");
}
initEventListeners() {
// Filter form submission
if (this.filtersForm) {
this.filtersForm.addEventListener("submit", (e) => {
e.preventDefault();
this.applyFilters();
});
}
// Handle "Select All" checkbox for categories
if (this.selectAllCategories) {
this.selectAllCategories.addEventListener("change", () => {
const isChecked = this.selectAllCategories.checked;
this.categoryCheckboxes.forEach((checkbox) => {
checkbox.checked = isChecked;
});
this.applyFilters();
});
}
// Handle individual category checkboxes
this.categoryCheckboxes.forEach((checkbox) => {
checkbox.addEventListener("change", () => {
// Update "Select All" checkbox state
this.updateSelectAllState();
this.applyFilters();
});
});
// Individual filter changes for immediate application
[this.statusSelect, this.startDateInput, this.endDateInput].forEach(
(element) => {
if (element) {
element.addEventListener("change", () => {
this.applyFilters();
});
}
}
);
// Search term with debounce
if (this.searchTermInput) {
let searchTimeout;
this.searchTermInput.addEventListener("input", () => {
clearTimeout(searchTimeout);
searchTimeout = setTimeout(() => {
this.applyFilters();
}, 500);
});
}
// Clear filters
if (this.clearFiltersBtn) {
this.clearFiltersBtn.addEventListener("click", () => {
this.clearAllFilters();
});
}
// Download logs
if (this.downloadLogsBtn) {
this.downloadLogsBtn.addEventListener("click", (e) => {
e.preventDefault();
this.downloadLogs();
});
}
// Refresh logs
if (this.refreshLogsBtn) {
this.refreshLogsBtn.addEventListener("click", () => {
this.loadLogs();
});
}
// Page size change
if (this.pageSizeSelect) {
this.pageSizeSelect.addEventListener("change", () => {
this.perPage = parseInt(this.pageSizeSelect.value);
this.currentPage = 1; // Reset to first page
this.loadLogs();
});
}
// Pagination buttons
if (this.prevPageBtn) {
this.prevPageBtn.addEventListener("click", (e) => {
e.preventDefault();
if (this.currentPage > 1) {
this.currentPage--;
this.loadLogs();
}
});
}
if (this.nextPageBtn) {
this.nextPageBtn.addEventListener("click", (e) => {
e.preventDefault();
if (this.currentPage < this.totalPages) {
this.currentPage++;
this.loadLogs();
}
});
}
}
updateSelectAllState() {
const checkedCount = Array.from(this.categoryCheckboxes).filter(
(cb) => cb.checked
).length;
const totalCount = this.categoryCheckboxes.length;
if (checkedCount === 0) {
this.selectAllCategories.checked = false;
this.selectAllCategories.indeterminate = false;
} else if (checkedCount === totalCount) {
this.selectAllCategories.checked = true;
this.selectAllCategories.indeterminate = false;
} else {
this.selectAllCategories.checked = false;
this.selectAllCategories.indeterminate = true;
}
}
getSelectedCategories() {
return Array.from(this.categoryCheckboxes)
.filter((checkbox) => checkbox.checked)
.map((checkbox) => checkbox.value);
}
applyInitialFilters() {
// Set category checkboxes from initial filters
if (this.initialFilters.category) {
const selectedCategories = Array.isArray(this.initialFilters.category)
? this.initialFilters.category
: [this.initialFilters.category];
this.categoryCheckboxes.forEach((checkbox) => {
checkbox.checked = selectedCategories.includes(checkbox.value);
});
this.updateSelectAllState();
}
if (this.startDateInput && this.initialFilters.start_date) {
this.startDateInput.value = this.initialFilters.start_date;
}
if (this.endDateInput && this.initialFilters.end_date) {
this.endDateInput.value = this.initialFilters.end_date;
}
if (this.searchTermInput && this.initialFilters.search_term) {
this.searchTermInput.value = this.initialFilters.search_term;
}
}
applyFilters() {
// Collect current filter values
const selectedCategories = this.getSelectedCategories();
this.filters = {
category: selectedCategories, // Now an array
status: this.statusSelect?.value || "",
start_date: this.startDateInput?.value || "",
end_date: this.endDateInput?.value || "",
search_term: this.searchTermInput?.value || "",
};
// Reset to first page when filters change
this.currentPage = 1;
// Load logs with new filters
this.loadLogs();
// Update URL to reflect current filters (for bookmarking/sharing)
this.updateUrl();
}
clearAllFilters() {
// Clear all category checkboxes and select all
this.categoryCheckboxes.forEach((checkbox) => {
checkbox.checked = true; // Default to all selected
});
if (this.selectAllCategories) {
this.selectAllCategories.checked = true;
this.selectAllCategories.indeterminate = false;
}
if (this.statusSelect) this.statusSelect.value = "";
if (this.startDateInput) this.startDateInput.value = "";
if (this.endDateInput) this.endDateInput.value = "";
if (this.searchTermInput) this.searchTermInput.value = "";
// Apply empty filters
this.applyFilters();
}
async loadLogs() {
if (!this.logsTableBody) return;
try {
// Show loading state
this.logsTableBody.innerHTML =
'<tr><td colspan="5" class="text-center"><div class="spinner-border spinner-border-sm text-primary" role="status"><span class="visually-hidden">Loading...</span></div> Loading logs...</td></tr>';
// Build query parameters
const params = new URLSearchParams({
page: this.currentPage,
per_page: this.perPage,
});
// Add filters to query
Object.entries(this.filters).forEach(([key, value]) => {
if (value) {
if (key === "category" && Array.isArray(value)) {
// Handle multiple categories
value.forEach((cat) => {
if (cat) params.append("category", cat);
});
} else if (value) {
params.append(key, value);
}
}
});
// Fetch logs from unified API
const data = await apiRequest(`/logs/api?${params.toString()}`);
if (data.success) {
this.renderLogs(data.logs);
this.updatePagination(data.pagination);
console.log("Logs loaded successfully");
} else {
throw new Error(data.message || "Failed to load logs");
}
} catch (error) {
console.error("Failed to load logs:", error);
this.logsTableBody.innerHTML =
'<tr><td colspan="5" class="text-center text-danger">Error loading logs. Please try again.</td></tr>';
this.hidePagination();
}
}
renderLogs(logs) {
if (!this.logsTableBody) return;
this.logsTableBody.innerHTML = "";
if (!logs || logs.length === 0) {
this.logsTableBody.innerHTML =
'<tr><td colspan="5" class="text-center">No logs found matching the current filters.</td></tr>';
return;
}
logs.forEach((log) => {
const row = document.createElement("tr");
row.className = "log-entry";
row.setAttribute("data-log-id", log.id);
// Format timestamp
const timeStr = formatTimestamp(log.timestamp);
// Create status badge
const statusBadge = createStatusBadge(log.status);
// Create category badge
const categoryBadge = this.createCategoryBadge(log.category);
row.innerHTML = `
<td>${timeStr}</td>
<td>${categoryBadge}</td>
<td>${log.action}</td>
<td>${statusBadge}</td>
<td>${log.description || ""}</td>
`;
// Add click handler for details modal - whole row is clickable
row.addEventListener("click", () => {
const url = `/logs/${log.id}/detail`;
this.logModal.loadAndShow(url, "Error loading log details.");
});
this.logsTableBody.appendChild(row);
});
}
createCategoryBadge(category) {
const categoryColors = {
gui_interaction: "bg-primary",
config_change: "bg-warning",
scraper_command: "bg-info",
scraper_activity: "bg-success",
system: "bg-danger",
data_import: "bg-secondary",
};
const colorClass = categoryColors[category] || "bg-secondary";
const displayName = category
.replace(/_/g, " ")
.replace(/\b\w/g, (l) => l.toUpperCase());
return `<span class="badge ${colorClass}">${displayName}</span>`;
}
updatePagination(pagination) {
if (!pagination || !this.paginationContainer) return;
this.currentPage = pagination.page;
this.totalPages = pagination.pages;
this.totalEntries = pagination.total;
// Show pagination container
this.paginationContainer.classList.remove("d-none");
// Update pagination info
    const startEntry =
      pagination.total === 0
        ? 0
        : (pagination.page - 1) * pagination.per_page + 1;
const endEntry = Math.min(
pagination.page * pagination.per_page,
pagination.total
);
if (this.paginationInfo) {
this.paginationInfo.textContent = `Showing ${startEntry} - ${endEntry} of ${pagination.total} entries`;
}
// Update current page display
if (this.currentPageSpan) {
this.currentPageSpan.innerHTML = `<span class="page-link">${pagination.page} of ${pagination.pages}</span>`;
}
// Update previous button
if (this.prevPageBtn) {
if (pagination.has_prev) {
this.prevPageBtn.classList.remove("disabled");
this.prevPageBtn.querySelector("a").removeAttribute("tabindex");
this.prevPageBtn
.querySelector("a")
.setAttribute("aria-disabled", "false");
} else {
this.prevPageBtn.classList.add("disabled");
this.prevPageBtn.querySelector("a").setAttribute("tabindex", "-1");
this.prevPageBtn
.querySelector("a")
.setAttribute("aria-disabled", "true");
}
}
// Update next button
if (this.nextPageBtn) {
if (pagination.has_next) {
this.nextPageBtn.classList.remove("disabled");
this.nextPageBtn.querySelector("a").removeAttribute("tabindex");
this.nextPageBtn
.querySelector("a")
.setAttribute("aria-disabled", "false");
} else {
this.nextPageBtn.classList.add("disabled");
this.nextPageBtn.querySelector("a").setAttribute("tabindex", "-1");
this.nextPageBtn
.querySelector("a")
.setAttribute("aria-disabled", "true");
}
}
}
hidePagination() {
if (this.paginationContainer) {
this.paginationContainer.classList.add("d-none");
}
}
updateUrl() {
// Update URL with current filters for bookmarking
const params = new URLSearchParams();
Object.entries(this.filters).forEach(([key, value]) => {
if (value) {
if (key === "category" && Array.isArray(value)) {
// Handle multiple categories
value.forEach((cat) => {
if (cat) params.append("category", cat);
});
} else if (value) {
params.append(key, value);
}
}
});
const newUrl = `${window.location.pathname}${
params.toString() ? "?" + params.toString() : ""
}`;
window.history.replaceState({}, "", newUrl);
}
downloadLogs() {
// Build download URL with current filters
const params = new URLSearchParams();
Object.entries(this.filters).forEach(([key, value]) => {
if (value) {
if (key === "category" && Array.isArray(value)) {
// Handle multiple categories
value.forEach((cat) => {
if (cat) params.append("category", cat);
});
} else if (value) {
params.append(key, value);
}
}
});
const downloadUrl = `/logs/download${
params.toString() ? "?" + params.toString() : ""
}`;
window.location.href = downloadUrl;
}
refresh() {
this.loadLogs();
}
/**
* Set modal handler for log details
* @param {ModalHandler} modalHandler - Modal handler instance
*/
setModalHandler(modalHandler) {
this.logModal = modalHandler;
}
}
// Export for use in other modules
if (typeof window !== "undefined") {
window.LoggerManager = LoggerManager;
}
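A sketch of how a logger template might instantiate the manager; the category values match those used in createCategoryBadge above, the initial filter is illustrative.
// Sketch: create the manager once the DOM is ready (element ids must exist in the template).
document.addEventListener("DOMContentLoaded", () => {
  window.loggerManager = new LoggerManager({
    categories: ["gui_interaction", "scraper_command", "scraper_activity", "system"],
    initialFilters: { category: ["scraper_activity", "system"] },
  });
});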

View File

@@ -0,0 +1,221 @@
/**
* Modal utilities for handling dynamic content loading
*/
class ModalHandler {
constructor(modalId, contentElementId) {
this.modalElement = document.getElementById(modalId);
this.contentElement = document.getElementById(contentElementId);
this.modal = null;
if (this.modalElement && typeof bootstrap !== "undefined") {
this.modal = new bootstrap.Modal(this.modalElement);
// Set up global event delegation for modal close buttons
this.setupGlobalCloseHandlers();
}
}
/**
* Load content into modal via AJAX and show it
* @param {string} url - URL to fetch content from
* @param {string} errorMessage - Message to show on error
*/
async loadAndShow(url, errorMessage = "Error loading content.") {
if (!this.modal || !this.contentElement) {
console.error("Modal or content element not found");
return;
}
try {
const response = await fetch(url);
const html = await response.text();
this.contentElement.innerHTML = html;
// Set up close button handlers after content is loaded
this.setupCloseHandlers();
// Format any JSON content in the modal
this.formatJsonContent();
this.modal.show();
} catch (error) {
console.error("Error loading modal content:", error);
this.contentElement.innerHTML = `<div class="modal-body text-danger">${errorMessage}</div>`;
this.modal.show();
}
}
/**
* Set up click handlers for elements that should open the modal
* @param {string} selector - CSS selector for clickable elements
* @param {string} urlAttribute - Attribute name containing the URL (default: 'data-url')
*/
setupClickHandlers(selector, urlAttribute = "data-url") {
    const bindHandlers = () => {
      document.querySelectorAll(selector).forEach((element) => {
        element.addEventListener("click", (e) => {
          e.preventDefault();
          const url = element.getAttribute(urlAttribute);
          if (url) {
            this.loadAndShow(url);
          }
        });
      });
    };
    // Bind now if the DOM is already parsed; a DOMContentLoaded listener added
    // after the event has fired would never run.
    if (document.readyState === "loading") {
      document.addEventListener("DOMContentLoaded", bindHandlers);
    } else {
      bindHandlers();
    }
}
/**
* Show the modal with custom content
* @param {string} content - HTML content to display
*/
showWithContent(content) {
if (!this.modal || !this.contentElement) return;
this.contentElement.innerHTML = content;
// Set up close button handlers after content is loaded
this.setupCloseHandlers();
this.modal.show();
}
/**
* Set up global event delegation for modal close buttons
*/
setupGlobalCloseHandlers() {
// Use event delegation to handle dynamically loaded close buttons
this.modalElement.addEventListener("click", (e) => {
if (
e.target.matches('[data-bs-dismiss="modal"]') ||
e.target.closest('[data-bs-dismiss="modal"]') ||
e.target.matches(".btn-close") ||
e.target.closest(".btn-close")
) {
e.preventDefault();
this.hide();
}
});
// Handle ESC key press
document.addEventListener("keydown", (e) => {
if (
e.key === "Escape" &&
this.modal &&
this.modalElement.classList.contains("show")
) {
this.hide();
}
});
}
/**
* Set up close button event handlers for dynamically loaded content
*/
setupCloseHandlers() {
// This method is now mostly redundant due to global event delegation
// but we'll keep it for backward compatibility
// Handle close buttons with data-bs-dismiss="modal"
const closeButtons = this.contentElement.querySelectorAll(
'[data-bs-dismiss="modal"]'
);
closeButtons.forEach((button) => {
button.addEventListener("click", (e) => {
e.preventDefault();
this.hide();
});
});
// Handle close buttons with .btn-close class
const closeButtonsClass =
this.contentElement.querySelectorAll(".btn-close");
closeButtonsClass.forEach((button) => {
button.addEventListener("click", (e) => {
e.preventDefault();
this.hide();
});
});
    // ESC handling is registered once in setupGlobalCloseHandlers(); re-adding a
    // document-level keydown listener here on every content load would stack
    // duplicate handlers, so it is intentionally omitted.
}
/**
* Format JSON content in the modal after it's loaded
*/
formatJsonContent() {
// Format JSON in extra data if present
const extraDataElement = this.contentElement.querySelector(
"#extra-data-content"
);
if (extraDataElement && extraDataElement.textContent.trim()) {
try {
const jsonData = JSON.parse(extraDataElement.textContent);
// Pretty-format the JSON with proper indentation
const formattedJson = JSON.stringify(jsonData, null, 2);
extraDataElement.textContent = formattedJson;
// Add syntax highlighting classes if the JSON is complex
if (typeof jsonData === "object" && jsonData !== null) {
extraDataElement.parentElement.classList.add("json-formatted");
}
} catch (e) {
// If it's not valid JSON, leave it as is but still format if it looks like JSON
const text = extraDataElement.textContent.trim();
if (text.startsWith("{") || text.startsWith("[")) {
// Try to fix common JSON issues and reformat
try {
const fixedJson = text
.replace(/'/g, '"')
.replace(/None/g, "null")
.replace(/True/g, "true")
.replace(/False/g, "false");
const parsed = JSON.parse(fixedJson);
extraDataElement.textContent = JSON.stringify(parsed, null, 2);
} catch (fixError) {
// If still can't parse, just leave as is
console.debug("Extra data is not valid JSON:", e);
}
}
}
}
// Also format old_value and new_value if they contain JSON
const preElements = this.contentElement.querySelectorAll("pre code");
preElements.forEach(function (codeElement) {
if (codeElement && codeElement.textContent.trim()) {
const text = codeElement.textContent.trim();
if (
(text.startsWith("{") && text.endsWith("}")) ||
(text.startsWith("[") && text.endsWith("]"))
) {
try {
const jsonData = JSON.parse(text);
codeElement.textContent = JSON.stringify(jsonData, null, 2);
} catch (e) {
// Not JSON, leave as is
}
}
}
});
}
/**
* Hide the modal
*/
hide() {
if (this.modal) {
this.modal.hide();
}
}
}
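A short sketch of the modal helper in use; the element ids and the detail URL pattern mirror the ones used by LoggerManager above, and the log id is illustrative.
// Sketch: load a log-detail fragment into the shared modal and display it.
const logModal = new ModalHandler("logDetailModal", "log-detail-content");
logModal.loadAndShow("/logs/42/detail", "Error loading log details.");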

View File

@@ -0,0 +1,315 @@
/**
* Paper search and processing functionality
*/
class PaperProcessor {
constructor() {
// DOM elements
this.searchForm = document.getElementById("searchPaperForm");
this.searchInput = document.getElementById("paperSearchInput");
this.searchResults = document.getElementById("searchResults");
this.paperSearchResults = document.getElementById("paperSearchResults");
this.scraperSelect = document.getElementById("scraperSelect");
this.initEventListeners();
this.loadAvailableScrapers();
}
/**
* Initialize event listeners
*/
initEventListeners() {
if (this.searchForm) {
this.searchForm.addEventListener("submit", (e) => {
e.preventDefault();
this.searchPapers();
});
}
}
/**
* Load available scraper modules
*/
async loadAvailableScrapers() {
if (!this.scraperSelect) return;
try {
const data = await apiRequest("/scraper/available_scrapers");
if (data.success && data.scrapers && data.scrapers.length > 0) {
// Clear previous options except the default one
while (this.scraperSelect.options.length > 1) {
this.scraperSelect.remove(1);
}
// Add each scraper as an option
data.scrapers.forEach((scraper) => {
const option = document.createElement("option");
option.value = scraper.name;
option.textContent = `${
scraper.name
} - ${scraper.description.substring(0, 50)}${
scraper.description.length > 50 ? "..." : ""
}`;
if (scraper.is_current) {
option.textContent += " (system default)";
}
this.scraperSelect.appendChild(option);
});
} else {
// If no scrapers or error, add a note
const option = document.createElement("option");
option.disabled = true;
option.textContent = "No scrapers available";
this.scraperSelect.appendChild(option);
}
} catch (error) {
console.error("Error loading scrapers:", error);
const option = document.createElement("option");
option.disabled = true;
option.textContent = "Error loading scrapers";
this.scraperSelect.appendChild(option);
}
}
/**
* Search for papers
*/
async searchPapers() {
if (!this.searchInput || !this.paperSearchResults || !this.searchResults)
return;
const query = this.searchInput.value.trim();
if (!query) {
showFlashMessage("Please enter a search term", "warning");
return;
}
// Show loading message
this.paperSearchResults.innerHTML =
'<tr><td colspan="5" class="text-center">Searching papers...</td></tr>';
this.searchResults.classList.remove("d-none");
try {
const data = await apiRequest(
`/api/papers?query=${encodeURIComponent(query)}`
);
if (!data.papers || data.papers.length === 0) {
this.paperSearchResults.innerHTML =
'<tr><td colspan="5" class="text-center">No papers found matching your search</td></tr>';
return;
}
this.paperSearchResults.innerHTML = "";
data.papers.forEach((paper) => {
const row = document.createElement("tr");
// Create status badge
const statusBadge = createStatusBadge(paper.status);
// Create process button (enabled only for papers not in 'Pending' status)
const processButtonDisabled =
paper.status === "Pending" ? "disabled" : "";
// Truncate title if too long
const truncatedTitle = truncateText(paper.title, 70);
row.innerHTML = `
<td>${paper.id}</td>
<td title="${paper.title}">${truncatedTitle}</td>
<td>${paper.doi || "N/A"}</td>
<td>${statusBadge}</td>
<td>
<button class="btn btn-sm btn-primary process-paper-btn"
data-paper-id="${paper.id}"
${processButtonDisabled}>
Process Now
</button>
</td>
`;
this.paperSearchResults.appendChild(row);
});
// Add event listeners to the process buttons
document.querySelectorAll(".process-paper-btn").forEach((btn) => {
btn.addEventListener("click", () => {
this.processSinglePaper(btn.getAttribute("data-paper-id"));
});
});
} catch (error) {
console.error("Error searching papers:", error);
this.paperSearchResults.innerHTML =
'<tr><td colspan="5" class="text-center">Error searching papers</td></tr>';
}
}
/**
* Process a single paper
* @param {string} paperId - The ID of the paper to process
*/
async processSinglePaper(paperId) {
if (!this.scraperSelect) return;
// Disable all process buttons to prevent multiple clicks
document.querySelectorAll(".process-paper-btn").forEach((btn) => {
btn.disabled = true;
});
// Show processing status via flash message
showFlashMessage("Processing paper...", "info");
// Get selected scraper
const selectedScraper = this.scraperSelect.value;
try {
const data = await apiRequest(`/scraper/process_single/${paperId}`, {
method: "POST",
body: JSON.stringify({
scraper_module: selectedScraper,
}),
});
if (data.success) {
// Update status in the search results
const row = document
.querySelector(`.process-paper-btn[data-paper-id="${paperId}"]`)
?.closest("tr");
if (row) {
const statusCell = row.querySelector("td:nth-child(4)");
if (statusCell) {
statusCell.innerHTML = createStatusBadge("Pending");
}
}
// Show success notification
showFlashMessage(data.message, "success");
// Set up polling to check paper status and refresh activity
this.pollPaperStatus(paperId, 3000, 20);
} else {
showFlashMessage(data.message, "error");
}
} catch (error) {
console.error("Error processing paper:", error);
showFlashMessage("Error processing paper", "error");
} finally {
// Re-enable the process buttons after a short delay
setTimeout(() => {
document.querySelectorAll(".process-paper-btn").forEach((btn) => {
if (btn.getAttribute("data-paper-id") !== paperId) {
btn.disabled = false;
}
});
}, 1000);
}
}
/**
* Poll paper status until it changes from Pending
* @param {string} paperId - The paper ID to poll
* @param {number} interval - Polling interval in milliseconds
* @param {number} maxAttempts - Maximum number of polling attempts
*/
pollPaperStatus(paperId, interval = 3000, maxAttempts = 20) {
let attempts = 0;
// Immediately refresh activity log to show the initial pending status
if (this.onActivityRefresh) {
this.onActivityRefresh();
}
const checkStatus = async () => {
attempts++;
console.log(
`Checking status of paper ${paperId}, attempt ${attempts}/${maxAttempts}`
);
try {
const data = await apiRequest(`/api/papers/${paperId}`);
if (data && data.paper) {
const paper = data.paper;
console.log(`Paper status: ${paper.status}`);
// Update the UI with the current status
const row = document
.querySelector(`.process-paper-btn[data-paper-id="${paperId}"]`)
?.closest("tr");
if (row) {
const statusCell = row.querySelector("td:nth-child(4)");
if (statusCell) {
statusCell.innerHTML = createStatusBadge(paper.status);
}
// Update processing status message if status changed
if (paper.status !== "Pending") {
if (paper.status === "Done") {
showFlashMessage(
`Paper processed successfully: ${paper.title}`,
"success"
);
} else if (paper.status === "Failed") {
showFlashMessage(
`Paper processing failed: ${
paper.error_msg || "Unknown error"
}`,
"error"
);
}
}
}
// Always refresh activity log
if (this.onActivityRefresh) {
this.onActivityRefresh();
}
// If status is still pending and we haven't reached max attempts, check again
if (paper.status === "Pending" && attempts < maxAttempts) {
setTimeout(checkStatus, interval);
} else {
// If status changed or we reached max attempts, refresh chart data too
if (this.onChartRefresh) {
this.onChartRefresh();
}
// If we hit max attempts but status is still pending, show a message
if (paper.status === "Pending" && attempts >= maxAttempts) {
showFlashMessage(
"Paper is still being processed. Check the activity log for updates.",
"info"
);
}
}
}
} catch (error) {
console.error(`Error polling paper status: ${error}`);
// If there's an error, we can still try again if under max attempts
if (attempts < maxAttempts) {
setTimeout(checkStatus, interval);
}
}
};
// Start checking
setTimeout(checkStatus, interval);
}
/**
* Set callback for activity refresh
*/
setActivityRefreshCallback(callback) {
this.onActivityRefresh = callback;
}
/**
* Set callback for chart refresh
*/
setChartRefreshCallback(callback) {
this.onChartRefresh = callback;
}
}

View File

@@ -0,0 +1,335 @@
/**
* Scraper control functionality
*/
class ScraperController {
constructor(options = {}) {
this.maxVolume = options.maxVolume || 1000;
this.volumeConfig = options.volumeConfig || 100;
// DOM elements
this.statusIndicator = document.getElementById("statusIndicator");
this.statusText = document.getElementById("statusText");
this.startButton = document.getElementById("startButton");
this.pauseButton = document.getElementById("pauseButton");
this.stopButton = document.getElementById("stopButton");
this.resetButton = document.getElementById("resetButton");
this.initEventListeners();
this.initStatusPolling();
}
/**
* Initialize event listeners for scraper controls
*/
initEventListeners() {
if (this.startButton) {
this.startButton.addEventListener("click", () => this.startScraper());
}
if (this.pauseButton) {
this.pauseButton.addEventListener("click", () =>
this.togglePauseScraper()
);
}
if (this.stopButton) {
this.stopButton.addEventListener("click", () => this.stopScraper());
}
if (this.resetButton) {
this.resetButton.addEventListener("click", () => this.resetScraper());
}
// Configuration form (handles both volume and scraper module)
const configForm = document.getElementById("volumeForm");
if (configForm) {
configForm.addEventListener("submit", (e) => {
e.preventDefault();
this.updateConfiguration();
});
}
}
/**
* Initialize status polling
*/
initStatusPolling() {
this.updateStatus();
setInterval(() => this.updateStatus(), 5000); // Poll every 5 seconds
}
/**
* Update scraper status display
*/
async updateStatus() {
try {
const data = await apiRequest("/scraper/status");
console.log("Status data received:", data);
// Remove all status classes first
if (this.statusIndicator) {
this.statusIndicator.classList.remove(
"status-active",
"status-paused",
"status-inactive"
);
}
// Handle the new JSON structure with scraper_state
const scraperState = data.scraper_state || data; // Fallback for old structure
if (scraperState.active) {
if (scraperState.paused) {
this.statusIndicator?.classList.add("status-paused");
if (this.statusText) this.statusText.textContent = "Paused";
if (this.pauseButton) this.pauseButton.textContent = "Resume";
} else {
this.statusIndicator?.classList.add("status-active");
if (this.statusText) this.statusText.textContent = "Active";
if (this.pauseButton) this.pauseButton.textContent = "Pause";
}
if (this.startButton) this.startButton.disabled = true;
if (this.pauseButton) this.pauseButton.disabled = false;
if (this.stopButton) this.stopButton.disabled = false;
if (this.resetButton) this.resetButton.disabled = false;
} else {
this.statusIndicator?.classList.add("status-inactive");
if (this.statusText) this.statusText.textContent = "Inactive";
if (this.startButton) this.startButton.disabled = false;
if (this.pauseButton) this.pauseButton.disabled = true;
if (this.stopButton) this.stopButton.disabled = true;
if (this.resetButton) this.resetButton.disabled = false;
}
} catch (error) {
console.error("Error fetching status:", error);
// On error, show inactive state
if (this.statusIndicator) {
this.statusIndicator.classList.remove(
"status-active",
"status-paused",
"status-inactive"
);
this.statusIndicator.classList.add("status-inactive");
}
if (this.statusText) this.statusText.textContent = "Error";
}
}
/**
* Start the scraper
*/
async startScraper() {
console.log("Start button clicked - sending request to /scraper/start");
try {
const data = await apiRequest("/scraper/start", {
method: "POST",
body: JSON.stringify({}),
});
console.log("Data received:", data);
if (data.success) {
showFlashMessage("Scraper started successfully", "success");
this.updateStatus();
// Trigger activity refresh if callback is provided
if (this.onActivityRefresh) {
setTimeout(() => this.onActivityRefresh(), 1000);
}
} else {
showFlashMessage(data.message, "error");
}
} catch (error) {
console.error("Error starting scraper:", error);
showFlashMessage("Error starting scraper: " + error.message, "error");
}
}
/**
* Toggle pause/resume scraper
*/
async togglePauseScraper() {
try {
const data = await apiRequest("/scraper/pause", {
method: "POST",
body: JSON.stringify({}),
});
if (data.success) {
showFlashMessage(data.message, "info");
this.updateStatus();
if (this.onActivityRefresh) {
setTimeout(() => this.onActivityRefresh(), 1000);
}
} else {
showFlashMessage(data.message, "error");
}
} catch (error) {
console.error("Error toggling pause:", error);
showFlashMessage("Error controlling scraper: " + error.message, "error");
}
}
/**
* Stop the scraper
*/
async stopScraper() {
try {
const data = await apiRequest("/scraper/stop", {
method: "POST",
body: JSON.stringify({}),
});
if (data.success) {
showFlashMessage("Scraper stopped successfully", "warning");
this.updateStatus();
if (this.onActivityRefresh) {
setTimeout(() => this.onActivityRefresh(), 1000);
}
} else {
showFlashMessage(data.message, "error");
}
} catch (error) {
console.error("Error stopping scraper:", error);
showFlashMessage("Error stopping scraper: " + error.message, "error");
}
}
/**
* Reset the scraper
*/
async resetScraper() {
if (
!confirm(
"Are you sure you want to reset the scraper? This will stop all current tasks, optionally clear non-pending papers, and restart the scraper."
)
) {
return;
}
// Disable button to prevent multiple clicks
if (this.resetButton) this.resetButton.disabled = true;
// Show a loading message
showFlashMessage("Resetting scraper, please wait...", "info");
try {
const data = await apiRequest("/scraper/reset", {
method: "POST",
body: JSON.stringify({
clear_papers: true, // You could make this configurable with a checkbox
}),
});
if (data.success) {
showFlashMessage(
"Scraper has been completely reset and restarted",
"success"
);
// Update everything
this.updateStatus();
if (this.onActivityRefresh) {
this.onActivityRefresh();
setTimeout(() => this.onActivityRefresh(), 1000);
}
if (this.onChartRefresh) {
this.onChartRefresh();
}
} else {
showFlashMessage(data.message || "Error resetting scraper", "error");
}
} catch (error) {
console.error("Error resetting scraper:", error);
showFlashMessage("Error resetting scraper: " + error.message, "error");
} finally {
// Re-enable button
if (this.resetButton) this.resetButton.disabled = false;
}
}
/**
* Update configuration (volume and/or scraper module)
*/
async updateConfiguration() {
const volumeInput = document.getElementById("volumeInput");
const scraperSelect = document.getElementById("mainScraperSelect");
const submitButton = document.querySelector(
'#volumeForm button[type="submit"]'
);
if (!submitButton) return;
const updates = {};
let hasChanges = false;
// Check volume changes
if (volumeInput) {
      const volume = parseInt(volumeInput.value, 10);
      // Basic validation (parse first so non-numeric input is rejected)
      if (isNaN(volume) || volume < 1 || volume > this.maxVolume) {
showFlashMessage(
`Please enter a valid volume between 1 and ${this.maxVolume}`,
"warning"
);
volumeInput.focus();
return;
}
updates.volume = volume;
hasChanges = true;
}
// Check scraper module changes
if (scraperSelect && scraperSelect.value) {
updates.scraper_module = scraperSelect.value;
hasChanges = true;
}
if (!hasChanges) {
showFlashMessage("No changes to save", "info");
return;
}
// Toggle loading state
toggleButtonLoading(submitButton, true, "Updating...");
try {
const data = await apiRequest("/scraper/update_config", {
method: "POST",
body: JSON.stringify(updates),
});
if (data.success) {
showFlashMessage(
data.message || "Configuration updated successfully",
"success"
);
} else {
showFlashMessage(
data.message || "Failed to update configuration",
"error"
);
}
} catch (error) {
console.error("Error updating configuration:", error);
showFlashMessage(
"Network error while updating configuration. Please try again.",
"error"
);
} finally {
toggleButtonLoading(submitButton, false);
}
}
/**
* Set callback for activity refresh
*/
setActivityRefreshCallback(callback) {
this.onActivityRefresh = callback;
}
/**
* Set callback for chart refresh
*/
setChartRefreshCallback(callback) {
this.onChartRefresh = callback;
}
}

View File

@@ -0,0 +1,87 @@
/**
* Main scraper dashboard initialization and coordination
*/
class ScraperDashboard {
constructor(config = {}) {
this.config = {
maxVolume: config.maxVolume || 1000,
volumeConfig: config.volumeConfig || 100,
currentTimeRange: 24,
};
this.initComponents();
this.setupCallbacks();
this.initializeData();
}
/**
* Initialize all dashboard components
*/
initComponents() {
// Initialize chart
this.activityChart = new ActivityChart("activityChart");
// Initialize scraper controller
this.scraperController = new ScraperController({
maxVolume: this.config.maxVolume,
volumeConfig: this.config.volumeConfig,
});
// Initialize paper processor
this.paperProcessor = new PaperProcessor();
// Initialize activity monitor
this.activityMonitor = new ActivityMonitor();
}
/**
* Setup callbacks between components
*/
setupCallbacks() {
// Set up activity refresh callbacks
const activityRefreshCallback = () =>
this.activityMonitor.loadRecentActivity();
this.scraperController.setActivityRefreshCallback(activityRefreshCallback);
this.paperProcessor.setActivityRefreshCallback(activityRefreshCallback);
// Set up chart refresh callbacks
const chartRefreshCallback = (timeRange = this.config.currentTimeRange) => {
this.config.currentTimeRange = timeRange;
this.activityChart.loadData(timeRange);
};
this.scraperController.setChartRefreshCallback(chartRefreshCallback);
this.paperProcessor.setChartRefreshCallback(chartRefreshCallback);
this.activityMonitor.setChartRefreshCallback(chartRefreshCallback);
}
/**
* Initialize data on page load
*/
initializeData() {
// Load recent activity
this.activityMonitor.loadRecentActivity();
// Load chart data after a short delay to ensure Chart.js is loaded
setTimeout(() => {
this.activityChart.loadData(this.config.currentTimeRange);
}, 100);
}
/**
* Refresh all dashboard data
*/
refreshAll() {
this.activityMonitor.loadRecentActivity();
this.activityChart.loadData(this.config.currentTimeRange);
this.scraperController.updateStatus();
}
}
/**
* Initialize the scraper dashboard
* @param {Object} config - Configuration object with Jinja variables
*/
function initScraperDashboard(config = {}) {
return new ScraperDashboard(config);
}
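A sketch of the expected template-side call; the numeric values stand in for what the template would inject (e.g. via Jinja).
// Sketch: initialize the dashboard after the DOM and Chart.js are available.
document.addEventListener("DOMContentLoaded", () => {
  window.scraperDashboard = initScraperDashboard({
    maxVolume: 1000, // placeholder for a template-injected value
    volumeConfig: 100, // placeholder for a template-injected value
  });
});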

View File

@@ -0,0 +1,500 @@
/**
* Scraper Overview functionality
*/
class ScraperOverview {
constructor() {
this.modal = null;
this.scrapers = [];
this.systemConfig = {};
this.init();
}
init() {
// Initialize modal reference
this.modal = document.getElementById("scraperOverviewModal");
// Load data when modal is shown
if (this.modal) {
this.modal.addEventListener("show.bs.modal", () => {
this.loadScraperOverview();
});
}
}
async loadScraperOverview() {
const loadingEl = document.getElementById("scraperOverviewLoading");
const errorEl = document.getElementById("scraperOverviewError");
const contentEl = document.getElementById("scraperOverviewContent");
// Show loading state
loadingEl?.classList.remove("d-none");
errorEl?.classList.add("d-none");
contentEl?.classList.add("d-none");
try {
// Load scrapers, system config, and publishers in parallel
const [scrapersResponse, statusResponse, publishersResponse] =
await Promise.all([
fetch("/scraper/scrapers"),
fetch("/scraper/status"),
fetch("/scraper/publishers"),
]);
if (
!scrapersResponse.ok ||
!statusResponse.ok ||
!publishersResponse.ok
) {
throw new Error("Failed to load scraper information");
}
const scrapersData = await scrapersResponse.json();
const statusData = await statusResponse.json();
const publishersData = await publishersResponse.json();
if (
!scrapersData.success ||
!statusData.success ||
!publishersData.success
) {
throw new Error(
scrapersData.message ||
statusData.message ||
publishersData.message ||
"Unknown error"
);
}
this.scrapers = scrapersData.scrapers;
this.systemConfig = statusData;
this.publishersData = publishersData.data;
// Update UI
this.updateSystemConfig();
this.updateScrapersTable();
this.updatePublishersSection();
this.updateStatusFlowDiagram();
// Show content
loadingEl?.classList.add("d-none");
contentEl?.classList.remove("d-none");
} catch (error) {
console.error("Error loading scraper overview:", error);
// Show error state
loadingEl?.classList.add("d-none");
const errorMessage = document.getElementById(
"scraperOverviewErrorMessage"
);
if (errorMessage) {
errorMessage.textContent =
error.message || "Failed to load scraper information";
}
errorEl?.classList.remove("d-none");
}
}
updateSystemConfig() {
// Current scraper module
const currentModuleEl = document.getElementById("currentScraperModule");
if (currentModuleEl) {
const currentModule =
this.systemConfig.current_scraper_module || "System Default";
currentModuleEl.textContent = currentModule;
currentModuleEl.className = "badge bg-primary";
}
// Volume limit
const volumeLimitEl = document.getElementById("currentVolumeLimit");
if (volumeLimitEl) {
const volumeLimit = this.systemConfig.volume_config || "Unknown";
volumeLimitEl.textContent = volumeLimit;
}
// Total modules
const totalModulesEl = document.getElementById("totalScraperModules");
if (totalModulesEl) {
totalModulesEl.textContent = this.scrapers.length;
}
// Paper counts summary
const paperCountsEl = document.getElementById("paperCountsSummary");
if (paperCountsEl && this.systemConfig.paper_counts) {
const counts = this.systemConfig.paper_counts;
paperCountsEl.innerHTML = `
<div class="d-flex flex-wrap gap-2">
<span class="badge bg-primary">${counts.new || 0} New</span>
<span class="badge bg-warning">${
counts.processing || 0
} Processing</span>
<span class="badge bg-success">${
counts.done || 0
} Done</span>
<span class="badge bg-danger">${
counts.failed || 0
} Failed</span>
<span class="badge bg-info">${
counts.pending || 0
} Pending</span>
<span class="badge bg-secondary">${
counts.retrying || 0
} Retrying</span>
</div>
`;
}
}
updateScrapersTable() {
const tbody = document.getElementById("scrapersTableBody");
if (!tbody) return;
tbody.innerHTML = "";
this.scrapers.forEach((scraper) => {
const row = document.createElement("tr");
// Check if this is the current active scraper
const isCurrentScraper =
scraper.name === this.systemConfig.current_scraper_module;
if (scraper.error) {
row.innerHTML = `
<td>${scraper.name}</td>
<td colspan="5" class="text-danger">
<i class="fas fa-exclamation-triangle"></i> ${scraper.error}
</td>
`;
} else {
row.innerHTML = `
<td>
<strong>${scraper.name}</strong>
${
scraper.name === "dummy"
? '<span class="badge bg-info ms-2">Test Module</span>'
: ""
}
${
isCurrentScraper
? '<span class="badge bg-success ms-2"><i class="fas fa-check"></i> Active</span>'
: ""
}
</td>
<td class="scraper-description">
${this.truncateDescription(scraper.description)}
</td>
<td class="input-status-list">
${this.renderStatusBadges(
scraper.input_statuses,
"bg-info"
)}
</td>
<td class="status-output">
<span class="badge bg-success">${
scraper.output_status_success
}</span>
</td>
<td class="status-output">
<span class="badge bg-danger">${
scraper.output_status_failure
}</span>
</td>
<td class="status-output">
<span class="badge bg-warning">${
scraper.output_status_processing
}</span>
</td>
`;
}
// Highlight the current scraper row
if (isCurrentScraper) {
row.classList.add("table-success");
}
tbody.appendChild(row);
});
}
updateStatusFlowDiagram() {
const diagramEl = document.getElementById("statusFlowDiagram");
if (!diagramEl) return;
// Analyze actual scrapers to build real flow
const statusFlow = this.analyzeScraperFlow();
let diagramHTML = '<div class="status-flow-container">';
// Create visual flow based on actual scrapers
statusFlow.forEach((stage, index) => {
if (index > 0) {
diagramHTML +=
'<div class="status-flow-arrow text-center my-2"><i class="fas fa-arrow-down fa-2x text-muted"></i></div>';
}
diagramHTML += '<div class="status-flow-stage mb-4 p-3 border rounded">';
diagramHTML += `<div class="fw-bold mb-2 text-primary">${stage.title}</div>`;
if (stage.scrapers && stage.scrapers.length > 0) {
diagramHTML +=
'<div class="mb-2"><small class="text-muted">Handled by: ' +
stage.scrapers.map((s) => `<strong>${s}</strong>`).join(", ") +
"</small></div>";
}
diagramHTML += '<div class="status-badges">';
stage.statuses.forEach((status, statusIndex) => {
if (statusIndex > 0) {
diagramHTML += '<i class="fas fa-arrow-right status-flow-arrow"></i>';
}
const badgeClass = this.getStatusBadgeClass(status);
diagramHTML += `<span class="status-flow-node badge ${badgeClass}">${status}</span>`;
});
diagramHTML += "</div>";
if (stage.description) {
diagramHTML += `<div class="small text-muted mt-2">${stage.description}</div>`;
}
diagramHTML += "</div>";
});
diagramHTML += "</div>";
// Add explanation
diagramHTML += `
<div class="mt-4 p-3 bg-light rounded">
<h6><i class="fas fa-info-circle"></i> Flow Explanation:</h6>
<ul class="small mb-0">
<li><strong>Modular Processing:</strong> Each scraper handles specific input statuses</li>
<li><strong>Status Transitions:</strong> Papers move through statuses as they are processed</li>
<li><strong>Pipeline Architecture:</strong> Output from one scraper can become input to another</li>
<li><strong>Error Handling:</strong> Failed papers can be retried by specialized scrapers</li>
<li><strong>Parallel Processing:</strong> Multiple scrapers can work on different papers simultaneously</li>
</ul>
</div>
`;
diagramEl.innerHTML = diagramHTML;
}
analyzeScraperFlow() {
// Build actual flow based on available scrapers
const stages = [];
const allInputStatuses = new Set();
const allOutputStatuses = new Set();
const scrapersByInput = {};
// Analyze scrapers to understand the flow
this.scrapers.forEach((scraper) => {
if (scraper.input_statuses) {
scraper.input_statuses.forEach((status) => {
allInputStatuses.add(status);
if (!scrapersByInput[status]) {
scrapersByInput[status] = [];
}
scrapersByInput[status].push(scraper.name);
});
}
if (scraper.output_status_success)
allOutputStatuses.add(scraper.output_status_success);
if (scraper.output_status_failure)
allOutputStatuses.add(scraper.output_status_failure);
});
// Entry point
if (allInputStatuses.has("New")) {
stages.push({
title: "Entry Point",
statuses: ["New"],
scrapers: scrapersByInput["New"] || [],
description: "Newly uploaded papers enter the processing pipeline",
});
}
// Processing stages
const processingStatuses = Array.from(allInputStatuses).filter(
(status) => !["New", "Done", "Failed"].includes(status)
);
if (processingStatuses.length > 0) {
stages.push({
title: "Processing Stages",
statuses: processingStatuses,
scrapers: [],
description: "Papers move through various processing stages",
});
}
// Final outputs
const finalStatuses = ["Done", "Failed"];
stages.push({
title: "Final States",
statuses: finalStatuses.filter((status) => allOutputStatuses.has(status)),
scrapers: [],
description: "Papers end up in final success or failure states",
});
// Retry handling
if (allInputStatuses.has("Failed")) {
stages.push({
title: "Retry Processing",
statuses: ["Failed", "Retrying"],
scrapers: scrapersByInput["Failed"] || [],
description: "Failed papers can be retried with specialized scrapers",
});
}
return stages;
}
getStatusBadgeClass(status) {
const statusClasses = {
New: "bg-primary",
Pending: "bg-warning",
Processing: "bg-warning",
Retrying: "bg-warning",
Done: "bg-success",
Failed: "bg-danger",
HtmlDownloaded: "bg-info",
PublisherDetected: "bg-info",
TextExtracted: "bg-info",
};
return statusClasses[status] || "bg-secondary";
}
renderStatusBadges(statuses, defaultClass = "bg-secondary") {
if (!Array.isArray(statuses)) return "";
return statuses
.map(
(status) =>
`<span class="badge ${this.getStatusBadgeClass(
status
)} status-badge">${status}</span>`
)
.join("");
}
truncateDescription(description, maxLength = 100) {
if (!description) return "No description available";
if (description.length <= maxLength) return description;
return description.substring(0, maxLength).trim() + "...";
}
updatePublishersSection() {
// Update publisher statistics
const publisherStatsEl = document.getElementById("publisherStats");
if (publisherStatsEl && this.publishersData && this.publishersData.stats) {
const stats = this.publishersData.stats;
publisherStatsEl.innerHTML = `
<div class="col-md-3">
<div class="text-center">
<div class="h4 text-primary mb-1">${stats.total_publishers}</div>
<div class="text-muted small">Total Publishers</div>
</div>
</div>
<div class="col-md-3">
<div class="text-center">
<div class="h4 text-success mb-1">${stats.publishers_with_parsers}</div>
<div class="text-muted small">With Parsers</div>
</div>
</div>
<div class="col-md-3">
<div class="text-center">
<div class="h4 text-warning mb-1">${stats.publishers_without_parsers}</div>
<div class="text-muted small">Missing Parsers</div>
</div>
</div>
<div class="col-md-3">
<div class="text-center">
<div class="h4 text-info mb-1">${stats.total_papers_with_publisher}</div>
<div class="text-muted small">Papers with Publisher</div>
</div>
</div>
`;
}
// Update publishers table
const publishersTableBody = document.getElementById("publishersTableBody");
if (
publishersTableBody &&
this.publishersData &&
this.publishersData.publishers
) {
publishersTableBody.innerHTML = "";
if (this.publishersData.publishers.length === 0) {
publishersTableBody.innerHTML = `
<tr>
<td colspan="4" class="text-center text-muted py-4">
<i class="fas fa-info-circle"></i> No publishers detected yet.<br>
<small>Run the publisher_detector scraper to identify publishers from paper URLs.</small>
</td>
</tr>
`;
return;
}
this.publishersData.publishers.forEach((publisher) => {
const row = document.createElement("tr");
// Publisher status badge
const statusBadge = publisher.has_parser
? '<span class="badge bg-success"><i class="fas fa-check"></i> Available</span>'
: '<span class="badge bg-warning"><i class="fas fa-exclamation-triangle"></i> Missing</span>';
// Parser availability indicator
const parserIndicator = publisher.has_parser
? '<i class="fas fa-check-circle text-success" title="Parser available"></i>'
: '<i class="fas fa-times-circle text-warning" title="Parser not available"></i>';
row.innerHTML = `
<td>
<strong>${publisher.name}</strong>
</td>
<td>
<span class="badge bg-info">${publisher.paper_count}</span>
</td>
<td>${statusBadge}</td>
<td class="text-center">${parserIndicator}</td>
`;
publishersTableBody.appendChild(row);
});
}
}
// Public method to show the modal
show() {
if (this.modal) {
const bootstrapModal = new bootstrap.Modal(this.modal);
bootstrapModal.show();
}
}
}
// Global function to load scraper overview (used by retry button)
function loadScraperOverview() {
if (window.scraperOverview) {
window.scraperOverview.loadScraperOverview();
}
}
// Global function to show scraper overview modal
function showScraperOverview() {
if (!window.scraperOverview) {
window.scraperOverview = new ScraperOverview();
}
window.scraperOverview.show();
}
// Initialize when DOM is ready
document.addEventListener("DOMContentLoaded", function () {
window.scraperOverview = new ScraperOverview();
});
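
For reference, a sketch of the JSON shapes this class expects from /scraper/scrapers, /scraper/status, and /scraper/publishers, inferred from the fields read in loadScraperOverview, updateSystemConfig, updateScrapersTable, and updatePublishersSection; the field names come from the code, while the example values and the publisher name are illustrative.

// Illustrative response shapes (assumed example values, not actual server output).
const exampleScrapersResponse = {
  success: true,
  scrapers: [
    {
      name: "dummy",
      description: "Test module that simulates paper processing.",
      input_statuses: ["New"],
      output_status_success: "Done",
      output_status_failure: "Failed",
      output_status_processing: "Processing",
    },
  ],
};

const exampleStatusResponse = {
  success: true,
  current_scraper_module: "dummy",
  volume_config: 100,
  paper_counts: { new: 3, processing: 1, done: 10, failed: 0, pending: 2, retrying: 0 },
};

const examplePublishersResponse = {
  success: true,
  data: {
    stats: {
      total_publishers: 5,
      publishers_with_parsers: 3,
      publishers_without_parsers: 2,
      total_papers_with_publisher: 42,
    },
    publishers: [{ name: "elsevier", paper_count: 12, has_parser: true }],
  },
};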

View File

@ -0,0 +1,337 @@
/**
* Table utilities for handling data tables with pagination, sorting, and filtering
*/
class TableHandler {
constructor(tableId, options = {}) {
this.table = document.getElementById(tableId);
this.options = {
enableSorting: true,
enableFiltering: true,
enablePagination: true,
loadingText: "Loading...",
noDataText: "No data available",
...options,
};
this.currentPage = 1;
this.itemsPerPage = options.itemsPerPage || 20;
this.sortColumn = null;
this.sortDirection = "asc";
this.filters = {};
this.initializeTable();
}
/**
* Initialize table features
*/
initializeTable() {
if (!this.table) return;
if (this.options.enableSorting) {
this.setupSortingHandlers();
}
if (this.options.enableFiltering) {
this.setupFilteringHandlers();
}
}
/**
* Set up sorting handlers for table headers
*/
setupSortingHandlers() {
const headers = this.table.querySelectorAll("th[data-sortable]");
headers.forEach((header) => {
header.style.cursor = "pointer";
header.addEventListener("click", () => {
const column = header.dataset.sortable;
this.sortByColumn(column);
});
});
}
/**
* Sort table by column
* @param {string} column - Column to sort by
*/
sortByColumn(column) {
if (this.sortColumn === column) {
this.sortDirection = this.sortDirection === "asc" ? "desc" : "asc";
} else {
this.sortColumn = column;
this.sortDirection = "asc";
}
this.updateSortIndicators();
this.refreshData();
}
/**
* Update sort direction indicators in table headers
*/
updateSortIndicators() {
// Remove existing sort indicators
this.table.querySelectorAll("th .sort-indicator").forEach((indicator) => {
indicator.remove();
});
// Add indicator to current sort column
if (this.sortColumn) {
const header = this.table.querySelector(
`th[data-sortable="${this.sortColumn}"]`
);
if (header) {
const indicator = document.createElement("span");
indicator.className = "sort-indicator";
indicator.innerHTML = this.sortDirection === "asc" ? " ↑" : " ↓";
header.appendChild(indicator);
}
}
}
/**
* Set up filtering handlers
*/
setupFilteringHandlers() {
const filterInputs = document.querySelectorAll("[data-table-filter]");
filterInputs.forEach((input) => {
input.addEventListener("input", (e) => {
const filterKey = e.target.dataset.tableFilter;
this.setFilter(filterKey, e.target.value);
});
});
}
/**
* Set a filter value
* @param {string} key - Filter key
* @param {string} value - Filter value
*/
setFilter(key, value) {
if (value && value.trim() !== "") {
this.filters[key] = value.trim();
} else {
delete this.filters[key];
}
this.currentPage = 1; // Reset to first page when filtering
this.refreshData();
}
/**
* Show loading state
*/
showLoading() {
const tbody = this.table.querySelector("tbody");
if (tbody) {
const colCount = this.table.querySelectorAll("th").length;
tbody.innerHTML = `
<tr>
<td colspan="${colCount}" class="text-center">${this.options.loadingText}</td>
</tr>
`;
}
}
/**
* Show no data message
*/
showNoData() {
const tbody = this.table.querySelector("tbody");
if (tbody) {
const colCount = this.table.querySelectorAll("th").length;
tbody.innerHTML = `
<tr>
<td colspan="${colCount}" class="text-center">${this.options.noDataText}</td>
</tr>
`;
}
}
/**
* Render table data
* @param {Array} data - Array of data objects
* @param {Function} rowRenderer - Function to render each row
*/
renderData(data, rowRenderer) {
const tbody = this.table.querySelector("tbody");
if (!tbody) return;
if (!data || data.length === 0) {
this.showNoData();
return;
}
tbody.innerHTML = data.map(rowRenderer).join("");
}
/**
* Build query parameters for API requests
* @returns {object} Query parameters object
*/
buildQueryParams() {
const params = {
page: this.currentPage,
per_page: this.itemsPerPage,
...this.filters,
};
if (this.sortColumn) {
params.sort_by = this.sortColumn;
params.sort_dir = this.sortDirection;
}
return params;
}
/**
* Refresh table data (to be implemented by subclasses or passed as callback)
*/
refreshData() {
if (this.options.onRefresh) {
this.options.onRefresh(this.buildQueryParams());
}
}
/**
* Update pagination controls
* @param {object} paginationInfo - Pagination information
*/
updatePagination(paginationInfo) {
const paginationContainer = document.querySelector(".pagination-container");
if (!paginationContainer || !paginationInfo) return;
// This is a basic implementation - you might want to enhance this
const { current_page, total_pages, has_prev, has_next } = paginationInfo;
let paginationHTML = '<nav><ul class="pagination justify-content-center">';
// Previous button
if (has_prev) {
paginationHTML += `<li class="page-item"><a class="page-link" href="#" data-page="${
current_page - 1
}">Previous</a></li>`;
} else {
paginationHTML +=
'<li class="page-item disabled"><span class="page-link">Previous</span></li>';
}
// Page numbers (simplified - show current and adjacent pages)
const startPage = Math.max(1, current_page - 2);
const endPage = Math.min(total_pages, current_page + 2);
for (let i = startPage; i <= endPage; i++) {
if (i === current_page) {
paginationHTML += `<li class="page-item active"><span class="page-link">${i}</span></li>`;
} else {
paginationHTML += `<li class="page-item"><a class="page-link" href="#" data-page="${i}">${i}</a></li>`;
}
}
// Next button
if (has_next) {
paginationHTML += `<li class="page-item"><a class="page-link" href="#" data-page="${
current_page + 1
}">Next</a></li>`;
} else {
paginationHTML +=
'<li class="page-item disabled"><span class="page-link">Next</span></li>';
}
paginationHTML += "</ul></nav>";
paginationContainer.innerHTML = paginationHTML;
// Add click handlers for pagination links
paginationContainer.querySelectorAll("a[data-page]").forEach((link) => {
link.addEventListener("click", (e) => {
e.preventDefault();
this.currentPage = parseInt(e.target.dataset.page);
this.refreshData();
});
});
}
}
/**
* Specialized table handler for papers
*/
class PapersTableHandler extends TableHandler {
constructor(tableId, options = {}) {
super(tableId, {
apiEndpoint: "/api/papers",
...options,
});
}
/**
* Render a paper row
* @param {object} paper - Paper data object
* @returns {string} HTML string for table row
*/
renderPaperRow(paper) {
const statusBadge = createStatusBadge(paper.status);
const truncatedTitle = truncateText(paper.title, 70);
return `
<tr>
<td>
<a href="#" class="paper-link" data-url="/papers/${
paper.id
}/detail">
${truncatedTitle}
</a>
</td>
<td>
${paper.doi
? `<a href="https://doi.org/${paper.doi}" target="_blank">${paper.doi}</a>`
: "N/A"}
</td>
<td>${paper.journal || "N/A"}</td>
<td>${paper.issn || "N/A"}</td>
<td>${statusBadge}</td>
<td>${formatTimestamp(paper.created_at)}</td>
<td>${formatTimestamp(paper.updated_at)}</td>
</tr>
`;
}
/**
* Load and display papers data
* @param {object} params - Query parameters
*/
async loadPapers(params = {}) {
this.showLoading();
try {
const queryString = new URLSearchParams(params).toString();
const url = `${this.options.apiEndpoint}?${queryString}`;
const response = await fetch(url);
const data = await response.json();
if (data.papers) {
this.renderData(data.papers, (paper) => this.renderPaperRow(paper));
if (data.pagination) {
this.updatePagination(data.pagination);
}
} else {
this.showNoData();
}
} catch (error) {
console.error("Error loading papers:", error);
this.showNoData();
}
}
/**
* Refresh data implementation
*/
refreshData() {
this.loadPapers(this.buildQueryParams());
}
}
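
A usage sketch for the papers table: the option names, the data-sortable and data-table-filter attributes, and the pagination-container class come from the handlers above, while the element ids and markup are assumptions.

// Sketch: assumed markup.
//   <table id="papersTable"> ... <th data-sortable="title">Title</th> ... </table>
//   <input type="text" data-table-filter="search" placeholder="Search papers...">
//   <div class="pagination-container"></div>
const papersTable = new PapersTableHandler("papersTable", {
  itemsPerPage: 50,
});

// Initial load; sorting, filtering, and pagination later funnel through
// refreshData(), which re-queries the endpoint with buildQueryParams().
papersTable.loadPapers(papersTable.buildQueryParams());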

View File

@ -7,3 +7,34 @@
.progress-bar {
width: 0%;
}
/* JSON formatting styles */
.json-formatted {
background-color: #f8f9fa;
border: 1px solid #dee2e6;
border-radius: 0.375rem;
font-family: "Monaco", "Menlo", "Ubuntu Mono", monospace;
font-size: 0.875rem;
line-height: 1.4;
}
.json-formatted code {
color: #212529;
background-color: transparent;
padding: 0;
}
/* Improve readability of JSON in modals */
#extra-data-content {
white-space: pre-wrap;
word-break: break-word;
font-family: "Monaco", "Menlo", "Ubuntu Mono", monospace;
font-size: 0.875rem;
line-height: 1.4;
}
/* Style for old/new value code blocks */
pre code {
white-space: pre-wrap;
word-break: break-word;
}

View File

@ -1,4 +1,8 @@
{% extends "base.html.jinja" %} {% block content %}
{% extends "base.html.jinja" %}
{% block title %}About{% endblock title %}
{% block content %}
<h1 class="mb-4">📘 About This App</h1>
<p class="lead">

View File

@ -7,6 +7,7 @@
<meta name="keywords" content="science, papers, research, management" />
<title>{{ app_title }}</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" />
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.0/font/bootstrap-icons.css">
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
<!-- Optional Alpine.js -->
<script defer src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"></script>
@ -17,6 +18,8 @@
<main class="container my-5">{% block content %}{% endblock content %}</main>
{% include "footer.html.jinja" %}
<!-- Include common utilities globally -->
<script src="{{ url_for('static', filename='js/common.js') }}"></script>
{% block scripts %}{% endblock scripts %}
</body>

View File

@ -38,6 +38,43 @@
</div>
</div>
<div class="form-section">
<h6>Scheduler Timezone</h6>
<p class="text-muted">Configure the timezone for the APScheduler to use for job
scheduling.</p>
<div class="mb-3">
<label for="timezone" class="form-label">Timezone:</label>
<select class="form-control" id="timezone" name="timezone" required>
<option value="UTC" {% if timezone_config.timezone=='UTC' %}selected{% endif %}>
UTC</option>
<option value="Europe/Berlin" {% if timezone_config.timezone=='Europe/Berlin'
%}selected{% endif %}>Europe/Berlin (CET/CEST)</option>
<option value="Europe/London" {% if timezone_config.timezone=='Europe/London'
%}selected{% endif %}>Europe/London (GMT/BST)</option>
<option value="Europe/Paris" {% if timezone_config.timezone=='Europe/Paris'
%}selected{% endif %}>Europe/Paris (CET/CEST)</option>
<option value="Europe/Rome" {% if timezone_config.timezone=='Europe/Rome'
%}selected{% endif %}>Europe/Rome (CET/CEST)</option>
<option value="US/Eastern" {% if timezone_config.timezone=='US/Eastern'
%}selected{% endif %}>US/Eastern (EST/EDT)</option>
<option value="US/Central" {% if timezone_config.timezone=='US/Central'
%}selected{% endif %}>US/Central (CST/CDT)</option>
<option value="US/Mountain" {% if timezone_config.timezone=='US/Mountain'
%}selected{% endif %}>US/Mountain (MST/MDT)</option>
<option value="US/Pacific" {% if timezone_config.timezone=='US/Pacific'
%}selected{% endif %}>US/Pacific (PST/PDT)</option>
<option value="Asia/Tokyo" {% if timezone_config.timezone=='Asia/Tokyo'
%}selected{% endif %}>Asia/Tokyo (JST)</option>
<option value="Asia/Shanghai" {% if timezone_config.timezone=='Asia/Shanghai'
%}selected{% endif %}>Asia/Shanghai (CST)</option>
<option value="Australia/Sydney" {% if
timezone_config.timezone=='Australia/Sydney' %}selected{% endif %}>
Australia/Sydney (AEST/AEDT)</option>
</select>
<div class="form-text">Current: {{ timezone_config.timezone }}</div>
</div>
</div>
<div class="form-section">
<h6>System Settings</h6>
<p class="text-muted">Configure general system behavior.</p>
@ -65,15 +102,21 @@
<div class="col-md-6">
<form method="post" action="{{ url_for('config.update_scraper_module') }}">
<div class="form-section">
<h6>Scraper Module</h6>
<div class="d-flex justify-content-between align-items-center mb-2">
<h6>Scraper Module</h6>
<button type="button" class="btn btn-outline-info btn-sm"
onclick="showScraperOverview()" title="View scraper modules overview">
<i class="fas fa-info-circle"></i> How Scrapers Work
</button>
</div>
<p class="text-muted">Select which scraper module to use for processing papers.</p>
<div class="mb-3">
<label for="scraper_module" class="form-label">Active Scraper Module:</label>
<select class="form-control" id="scraper_module" name="scraper_module">
{% for module in available_scraper_modules %}
<option value="{{ module }}" {% if module==current_scraper_module %} selected
{%endif %}>
<option value="{{ module }}" {% if module==current_scraper_module %} selected {%
endif %}>
{{ module }}
{% if scraper_details[module] %}
- {{ scraper_details[module].description[:50] }}...

View File

@ -53,4 +53,13 @@
{% endif %}
</div>
</div>
<!-- Include the scraper overview modal -->
{% include "partials/scraper_overview_modal.html.jinja" %}
{% endblock content %}
{% block scripts %}
{{ super() }}
<script src="{{ url_for('static', filename='js/scraper-overview.js') }}"></script>
{% endblock scripts %}

View File

@ -39,12 +39,19 @@
}
</style>
<script>
const initialSchedule = {{ schedule | tojson }};
const totalVolume = {{ volume }};
<!-- Configuration data in JSON format for clean separation -->
<script type="application/json" id="schedule-config">
{
"initialSchedule": {{ schedule | tojson }},
"totalVolume": {{ volume | tojson }},
"maxVolume": {{ max_volume | tojson }}
}
</script>
<div x-data="scheduleManager(initialSchedule, totalVolume)" class="tab-pane active">
<!-- Load config handler for modular functionality -->
<script src="{{ url_for('static', filename='js/config-handler.js') }}"></script>
<div x-data="configHandler.createScheduleManager()" class="tab-pane active">
<div class="card">
<div class="card-header d-flex justify-content-between">
<h5>Scheduling Configuration</h5>
@ -211,164 +218,3 @@
</div>
</div>
</div>
<script>
function scheduleManager(initial, volume) {
return {
schedule: { ...initial },
volume: volume,
selectedHours: [],
newWeight: 1.0,
volumeValue: volume,
isDragging: false,
dragOperation: null,
formatHour(h) {
return String(h).padStart(2, "0") + ":00";
},
updateVolume() {
fetch('{{ url_for('config.api_update_config') }}', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
volume: this.volumeValue
})
})
.then(response => response.json())
.then(data => {
if (data.success) {
this.volume = parseFloat(this.volumeValue);
showFlashMessage('Volume updated successfully!', 'success');
} else {
showFlashMessage(data.updates?.[0]?.message || 'Error updating volume', 'error');
}
})
.catch(error => {
console.error('Error:', error);
showFlashMessage('Network error occurred', 'error');
});
},
getBackgroundStyle(hour) {
const weight = parseFloat(this.schedule[hour]);
const maxWeight = 2.5; // You can adjust this
// Normalize weight (0.0 to 1.0)
const t = Math.min(weight / maxWeight, 1.0);
// Interpolate HSL lightness: 95% (light) to 30% (dark)
const lightness = 95 - t * 65; // 95 → 30
const backgroundColor = `hsl(210, 10%, ${lightness}%)`; // soft gray-blue palette
const textColor = t > 0.65 ? "white" : "black"; // adaptive text color
return {
backgroundColor,
color: textColor,
};
},
getBackgroundStyleFromValue(value) {
const weight = parseFloat(value);
const maxWeight = 2.5; // You can adjust this
// Normalize weight (0.0 to 1.0)
const t = Math.min(weight / maxWeight, 1.0);
// Interpolate HSL lightness: 95% (light) to 30% (dark)
const lightness = 95 - t * 65; // 95 → 30
const backgroundColor = `hsl(210, 10%, ${lightness}%)`; // soft gray-blue palette
const textColor = t > 0.65 ? "white" : "black"; // adaptive text color
return {
backgroundColor,
color: textColor,
};
},
startDrag(event, hour) {
event.preventDefault();
this.isDragging = true;
this.dragOperation = this.isSelected(hour) ? "remove" : "add";
this.toggleSelect(hour);
},
dragSelect(hour) {
if (!this.isDragging) return;
const selected = this.isSelected(hour);
if (this.dragOperation === "add" && !selected) {
this.selectedHours.push(hour);
} else if (this.dragOperation === "remove" && selected) {
this.selectedHours = this.selectedHours.filter((h) => h !== hour);
}
},
endDrag() {
this.isDragging = false;
},
toggleSelect(hour) {
if (this.isSelected(hour)) {
this.selectedHours = this.selectedHours.filter((h) => h !== hour);
} else {
this.selectedHours.push(hour);
}
},
isSelected(hour) {
return this.selectedHours.includes(hour);
},
applyWeight() {
this.selectedHours.forEach((hour) => {
this.schedule[hour] = parseFloat(this.newWeight).toFixed(1);
});
this.selectedHours = [];
},
getTotalWeight() {
return Object.values(this.schedule).reduce(
(sum, w) => sum + parseFloat(w),
0
);
},
getPapersPerHour(hour) {
const total = this.getTotalWeight();
if (total === 0) return 0;
return (
(parseFloat(this.schedule[hour]) / total) *
this.volume
).toFixed(1);
},
saveSchedule() {
fetch('{{ url_for('config.api_update_config') }}', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
schedule: this.schedule
})
})
.then(response => response.json())
.then(data => {
if (data.success) {
showFlashMessage('Schedule updated successfully!', 'success');
} else {
showFlashMessage(data.updates?.[0]?.message || 'Error updating schedule', 'error');
}
})
.catch(error => {
console.error('Error:', error);
showFlashMessage('Network error occurred', 'error');
});
}
};
}
</script>
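
The removed inline scheduleManager is superseded by config-handler.js, which is not shown in this diff. A rough sketch of how its createScheduleManager() factory could consume the embedded JSON block: the element id schedule-config, the key names, and the factory name come from the template, everything else is an assumption.

// Sketch only; not the actual config-handler.js implementation.
const configEl = document.getElementById("schedule-config");
const { initialSchedule, totalVolume, maxVolume } = JSON.parse(configEl.textContent);

window.configHandler = {
  createScheduleManager() {
    // Returns the Alpine.js component state, analogous to the removed inline version.
    return {
      schedule: { ...initialSchedule },
      volume: totalVolume,
      maxVolume: maxVolume,
      // ...remaining methods (updateVolume, saveSchedule, drag selection, etc.)
    };
  },
};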

View File

@ -1,66 +1,146 @@
{% extends "base.html.jinja" %}
<!-- Include flash messages template -->
{% include "partials/flash_messages.html.jinja" %}
{% block title %}Home - SciPaperLoader{% endblock title %}
{% block content %}
<div class="container text-center">
<div class="container text-center mb-5">
<h1 class="display-4">Welcome to SciPaperLoader</h1>
<p class="lead">Your paper scraping tool is ready.</p>
<p class="text-muted">A simple tool to scrape papers from Zotero API.</p>
<p class="lead">Your comprehensive paper management and scraping platform</p>
<p class="text-muted">Automate paper collection, manage metadata, and monitor download progress with intelligent
scheduling</p>
</div>
<div class="row g-4">
<div class="col-md-6">
<div class="card shadow-sm">
<!-- Main Features Section -->
<div class="row g-4 mb-5">
<div class="col-12">
<h2 class="text-center mb-4">🚀 Core Features</h2>
</div>
<div class="col-lg-4 col-md-6">
<div class="card shadow-sm h-100">
<div class="card-body">
<h5 class="card-title">📄 CSV Import</h5>
<h5 class="card-title">🎛️ Scraper Control Panel</h5>
<p class="card-text">
Upload a 37-column CSV to import paper metadata. Only relevant fields
(title, DOI, ISSN, etc.) are stored. Errors are reported without
aborting the batch.
Start, pause, and monitor the automated paper scraping process. View real-time statistics
and activity charts, and process individual papers on demand.
</p>
<a href="{{ url_for('upload.upload') }}" class="btn btn-sm btn-outline-primary">Upload Now</a>
<a href="{{ url_for('scraper.index') }}" class="btn btn-primary">Open Control Panel</a>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card shadow-sm">
<div class="card-body">
<h5 class="card-title">🧠 Background Scraper</h5>
<p class="card-text">
A daemon process runs hourly to fetch papers using Zotero API.
Downloads are randomized to mimic human behavior and avoid detection.
</p>
<a href="{{ url_for('logger.list_logs') }}" class="btn btn-sm btn-outline-secondary">View Logs</a>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card shadow-sm">
<div class="col-lg-4 col-md-6">
<div class="card shadow-sm h-100">
<div class="card-body">
<h5 class="card-title">📚 Paper Management</h5>
<p class="card-text">
Monitor paper status (Pending, Done, Failed), download PDFs, and
inspect errors. Files are stored on disk in structured folders per
DOI.
Browse, search, and manage your paper collection. View download status,
inspect metadata, export data, and handle failed downloads.
</p>
<a href="{{ url_for('papers.list_papers') }}" class="btn btn-sm btn-outline-success">Browse Papers</a>
<a href="{{ url_for('papers.list_papers') }}" class="btn btn-success">Browse Papers</a>
</div>
</div>
</div>
<div class="col-md-6">
<div class="card shadow-sm">
<div class="col-lg-4 col-md-6">
<div class="card shadow-sm h-100">
<div class="card-body">
<h5 class="card-title">🕒 Download Schedule</h5>
<h5 class="card-title">📄 CSV Data Import</h5>
<p class="card-text">
Control how many papers are downloaded per hour. Configure hourly
volume (e.g. 2/hour at daytime, 0 at night) to match your bandwidth or
usage pattern.
Bulk import paper metadata from CSV files. Supports the 37-column format with
intelligent duplicate detection and comprehensive error reporting.
</p>
<a href="{{ url_for('config.schedule') }}" class="btn btn-sm btn-outline-warning">Adjust Schedule</a>
<a href="{{ url_for('upload.upload') }}" class="btn btn-outline-primary">Import Data</a>
</div>
</div>
</div>
</div>
<!-- Configuration & Monitoring Section -->
<div class="row g-4 mb-5">
<div class="col-12">
<h2 class="text-center mb-4">⚙️ Configuration & Monitoring</h2>
</div>
<div class="col-lg-4 col-md-6">
<div class="card shadow-sm h-100">
<div class="card-body">
<h5 class="card-title">🕒 Download Scheduling</h5>
<p class="card-text">
Configure hourly download quotas and timing patterns. Set different rates for
day/night hours to optimize bandwidth usage and avoid detection.
</p>
<a href="{{ url_for('config.schedule') }}" class="btn btn-warning">Manage Schedule</a>
</div>
</div>
</div>
<div class="col-lg-4 col-md-6">
<div class="card shadow-sm h-100">
<div class="card-body">
<h5 class="card-title">🔧 System Configuration</h5>
<p class="card-text">
Adjust global settings including daily volume limits, download paths,
and scraper module selection for optimal performance.
</p>
<a href="{{ url_for('config.general') }}" class="btn btn-outline-secondary">System Settings</a>
</div>
</div>
</div>
<div class="col-lg-4 col-md-6">
<div class="card shadow-sm h-100">
<div class="card-body">
<h5 class="card-title">📊 Activity Logs</h5>
<p class="card-text">
Monitor system activity, track scraping progress, and troubleshoot issues
with comprehensive logging and activity timeline views.
</p>
<a href="{{ url_for('logger.list_logs') }}" class="btn btn-info">View Logs</a>
</div>
</div>
</div>
</div>
<!-- Advanced Features Section -->
<div class="row g-4 mb-5">
<div class="col-12">
<h2 class="text-center mb-4">🔬 Advanced Features</h2>
</div>
<div class="col-lg-6 col-md-6">
<div class="card shadow-sm h-100">
<div class="card-body">
<h5 class="card-title">🗄️ Database Management</h5>
<p class="card-text">
Manage your paper database with tools for generating test data,
cleaning up records, and performing maintenance operations.
</p>
<a href="{{ url_for('config.database') }}" class="btn btn-outline-danger">Database Tools</a>
</div>
</div>
</div>
<div class="col-lg-6 col-md-6">
<div class="card shadow-sm h-100">
<div class="card-body">
<h5 class="card-title">🧠 Intelligent Processing</h5>
<p class="card-text">
Background daemon with randomized timing, human-like behavior patterns,
and automatic retry mechanisms for robust paper collection.
</p>
<div class="mt-3">
<span class="badge bg-success me-2">Auto-Retry</span>
<span class="badge bg-info me-2">Smart Timing</span>
<span class="badge bg-warning">Rate Limiting</span>
</div>
</div>
</div>
</div>
</div>
{% endblock content %}

View File

@ -1,117 +0,0 @@
{% extends "base.html.jinja" %}
{% block content %}
<h1>Activity Logs</h1>
<form method="get" class="mb-3">
<div class="row g-2">
<div class="col-md-3">
<label for="category" class="form-label">Category:</label>
<select name="category" id="category" class="form-select">
<option value="">All</option>
{% for cat in categories %}
<option value="{{ cat }}" {% if category==cat %}selected{% endif %}>{{ cat }}</option>
{% endfor %}
</select>
</div>
<div class="col-md-3">
<label for="start_date" class="form-label">Start Date:</label>
<input type="date" name="start_date" id="start_date" value="{{ start_date }}" class="form-control">
</div>
<div class="col-md-3">
<label for="end_date" class="form-label">End Date:</label>
<input type="date" name="end_date" id="end_date" value="{{ end_date }}" class="form-control">
</div>
<div class="col-md-3">
<label for="search_term" class="form-label">Search:</label>
<input type="text" name="search_term" id="search_term" value="{{ search_term }}" class="form-control">
</div>
</div>
<div class="mt-3">
<button type="submit" class="btn btn-primary">Filter</button>
<a href="{{ url_for('logger.download_logs', category=category, start_date=start_date, end_date=end_date, search_term=search_term) }}"
class="btn btn-secondary">Download CSV</a>
</div>
</form>
<ul class="list-group">
{% for log in logs %}
<li class="list-group-item log-item" data-log-id="{{ log.id }}">
<div class="d-flex justify-content-between align-items-center">
<div class="ms-2 me-auto">
<div class="fw-bold">{{ log.timestamp }}</div>
{{ log.action }} - {{ log.description }}
</div>
<span class="badge bg-primary rounded-pill">{{ log.category }}</span>
</div>
</li>
{% endfor %}
</ul>
{% if pagination %}
<nav aria-label="Page navigation" class="mt-4">
<ul class="pagination justify-content-center">
{% if pagination.has_prev %}
<li class="page-item">
<a class="page-link"
href="{{ url_for('logger.list_logs', page=pagination.prev_num, category=category, start_date=start_date, end_date=end_date, search_term=search_term) }}">Previous</a>
</li>
{% else %}
<li class="page-item disabled">
<span class="page-link">Previous</span>
</li>
{% endif %}
<li class="page-item disabled">
<span class="page-link">Page {{ pagination.page }} of {{ pagination.pages }}</span>
</li>
{% if pagination.has_next %}
<li class="page-item">
<a class="page-link"
href="{{ url_for('logger.list_logs', page=pagination.next_num, category=category, start_date=start_date, end_date=end_date, search_term=search_term) }}">Next</a>
</li>
{% else %}
<li class="page-item disabled">
<span class="page-link">Next</span>
</li>
{% endif %}
</ul>
</nav>
{% endif %}
<!-- Modal for log details -->
<div class="modal fade" id="logDetailModal" tabindex="-1" aria-hidden="true">
<div class="modal-dialog modal-lg modal-dialog-scrollable">
<div class="modal-content" id="log-detail-content">
<!-- Log details will be loaded here via AJAX -->
</div>
</div>
</div>
<script>
document.addEventListener("DOMContentLoaded", function () {
const modal = new bootstrap.Modal(document.getElementById('logDetailModal'));
const content = document.getElementById('log-detail-content');
document.querySelectorAll('.log-item').forEach(item => {
item.addEventListener('click', function () {
const logId = this.getAttribute('data-log-id');
fetch(`/logs/${logId}/detail`)
.then(response => response.text())
.then(html => {
content.innerHTML = html;
modal.show();
})
.catch(err => {
content.innerHTML = '<div class="modal-body text-danger">Error loading log details.</div>';
modal.show();
});
});
});
});
</script>
{% endblock content %}

View File

@ -0,0 +1,263 @@
{% extends "base.html.jinja" %}
{% block title %}Activity Logs{% endblock title %}
{% block styles %}
{{ super() }}
<style>
.logs-container {
background: white;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
.filter-panel {
background: #f8f9fa;
border-bottom: 1px solid #dee2e6;
padding: 1rem;
}
.log-entry {
cursor: pointer;
transition: background-color 0.2s ease;
}
.log-entry:hover {
background-color: #f8f9fa;
}
.category-badge {
font-size: 0.75rem;
padding: 0.25rem 0.5rem;
}
.activity-controls {
width: auto;
display: inline-block;
}
.logs-table th {
background-color: #f8f9fa;
font-weight: 600;
}
.pagination-info {
font-size: 0.875rem;
color: #6c757d;
}
.search-results-container {
max-height: 600px;
overflow-y: auto;
}
/* JSON formatting styles */
.json-formatted {
background-color: #f8f9fa;
border: 1px solid #e9ecef;
border-radius: 0.375rem;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 0.875rem;
line-height: 1.4;
}
.json-formatted code {
color: #495057;
background: transparent;
}
</style>
{% endblock styles %}
{% block content %}
<div class="container-fluid mt-4">
<h1><i class="bi bi-list-ul"></i> Activity Logs</h1>
<!-- Include standardized flash messages -->
{% include "partials/flash_messages.html.jinja" %}
<div class="logs-container">
<!-- Filter Panel -->
<div class="filter-panel">
<form id="filterForm" class="row g-3">
<div class="col-md-3">
<label class="form-label">Categories:</label>
<div class="category-checkbox-container p-2"
style="max-height: 200px; overflow-y: auto; background-color: white; border: 1px solid #ced4da; border-radius: 0.375rem;">
<div class="form-check">
<input class="form-check-input" type="checkbox" id="selectAllCategories" {% if not
selected_categories or selected_categories|length==categories|length %}checked{% endif
%}>
<label class="form-check-label fw-bold" for="selectAllCategories">
All Categories
</label>
</div>
<hr class="my-2">
{% for cat in categories %}
<div class="form-check">
<input class="form-check-input category-checkbox" type="checkbox" id="category_{{ cat }}"
value="{{ cat }}" {% if not selected_categories or cat in selected_categories
%}checked{% endif %}>
<label class="form-check-label" for="category_{{ cat }}">
{{ cat.replace('_', ' ').title() }}
</label>
</div>
{% endfor %}
</div>
</div>
<div class="col-md-3">
<div class="row">
<label for="statusFilter" class="form-label">Status:</label>
<select id="statusFilter" class="form-select form-select-sm">
<option value="">All Statuses</option>
<option value="success">Success</option>
<option value="error">Error</option>
<option value="warning">Warning</option>
<option value="info">Info</option>
<option value="pending">Pending</option>
</select>
</div>
</div>
<div class="col-md-3">
<label for="startDate" class="form-label">Start Date:</label>
<input type="date" id="startDate" class="form-control form-control-sm"
value="{{ start_date or '' }}">
<label for="endDate" class="form-label mt-2">End Date:</label>
<input type="date" id="endDate" class="form-control form-control-sm" value="{{ end_date or '' }}">
</div>
<div class="col-md-3">
<label for="searchTerm" class="form-label">Search:</label>
<input type="text" id="searchTerm" class="form-control form-control-sm"
placeholder="Search in actions and descriptions" value="{{ search_term or '' }}">
</div>
<div class="col-12 d-flex justify-content-end mt-3">
<button type="button" id="clearFilters" class="btn btn-outline-secondary btn-sm">
<i class="bi bi-x"></i> Clear Filters
</button>
</div>
</form>
</div>
<!-- Controls Panel -->
<div class="d-flex justify-content-between align-items-center p-3 border-bottom">
<div class="d-flex align-items-center gap-3">
<div class="form-group mb-0">
<label for="pageSize" class="form-label mb-0 me-2">Show:</label>
<select id="pageSize" class="form-select form-select-sm activity-controls">
<option value="20">20</option>
<option value="50" selected>50</option>
<option value="100">100</option>
</select>
</div>
<span id="paginationInfo" class="pagination-info">Loading...</span>
</div>
<div class="d-flex gap-2">
<button type="button" id="refreshLogs" class="btn btn-outline-primary btn-sm">
<i class="bi bi-arrow-clockwise"></i> Refresh
</button>
<button type="button" id="downloadLogs" class="btn btn-outline-success btn-sm">
<i class="bi bi-download"></i> Download CSV
</button>
</div>
</div>
<!-- Logs Table -->
<div class="search-results-container">
<table class="table table-hover logs-table mb-0">
<thead class="sticky-top">
<tr>
<th style="width: 150px;">Timestamp</th>
<th style="width: 120px;">Category</th>
<th style="width: 180px;">Action</th>
<th style="width: 100px;">Status</th>
<th>Description</th>
</tr>
</thead>
<tbody id="logsTableBody">
<tr>
<td colspan="5" class="text-center py-4">
<div class="spinner-border spinner-border-sm text-primary" role="status">
<span class="visually-hidden">Loading...</span>
</div>
Loading logs...
</td>
</tr>
</tbody>
</table>
</div>
<!-- Pagination Controls -->
<nav id="logsPagination" aria-label="Logs pagination" class="p-3 border-top d-none">
<div class="d-flex justify-content-between align-items-center">
<div class="pagination-info">
<span id="paginationDetails">Showing 0 - 0 of 0 entries</span>
</div>
<ul class="pagination pagination-sm mb-0">
<li class="page-item" id="prevPage">
<a class="page-link" href="#" aria-label="Previous">
<span aria-hidden="true">«</span>
</a>
</li>
<li class="page-item active" id="currentPageItem">
<span class="page-link" id="currentPageSpan">1</span>
</li>
<li class="page-item" id="nextPage">
<a class="page-link" href="#" aria-label="Next">
<span aria-hidden="true">»</span>
</a>
</li>
</ul>
</div>
</nav>
</div>
</div>
<!-- Modal for log details -->
<div class="modal fade" id="logDetailModal" tabindex="-1" aria-hidden="true" data-bs-backdrop="true"
data-bs-keyboard="true">
<div class="modal-dialog modal-lg modal-dialog-scrollable">
<div class="modal-content" id="log-detail-content">
<!-- Log details will be loaded here via AJAX -->
</div>
</div>
</div>
{% endblock content %}
{% block scripts %}
{{ super() }}
<script src="{{ url_for('static', filename='js/modal-handler.js') }}"></script>
<script src="{{ url_for('static', filename='js/logger-manager.js') }}"></script>
<script>
document.addEventListener('DOMContentLoaded', function () {
// Initialize the logger manager
window.loggerManager = new LoggerManager({
initialFilters: {
category: {{ selected_categories | tojson }},
start_date: "{{ start_date or '' }}",
end_date: "{{ end_date or '' }}",
search_term: "{{ search_term or '' }}"
}
});
// Set up modal handler for log details
const logModal = new ModalHandler('logDetailModal', 'log-detail-content');
window.loggerManager.setModalHandler(logModal);
});
</script>
{% endblock scripts %}

View File

@ -8,7 +8,7 @@
<div class="collapse navbar-collapse" id="navbarSupportedContent">
<ul class="navbar-nav me-auto mb-2 mb-lg-0">
<li class="nav-item">
<a class="nav-link" href="{{ url_for('scraper.index') }}">Scraper</a>
<a class="nav-link" href="{{ url_for('scraper.index') }}">Control Panel</a>
</li>
<li class="nav-item">
<a class="nav-link" href="{{ url_for('upload.upload') }}">Import CSV</a>

View File

@ -1,7 +1,12 @@
{% extends "base.html.jinja" %}
{% block title %}Papers{% endblock title %}
{% block content %}
<!-- Include flash messages template -->
{% include "partials/flash_messages.html.jinja" %}
{# --- Sort direction logic for each column --- #}
{% set title_sort = 'asc' if sort_by != 'title' or sort_dir == 'desc' else 'desc' %}
{% set journal_sort = 'asc' if sort_by != 'journal' or sort_dir == 'desc' else 'desc' %}
@ -275,28 +280,14 @@
</ul>
</nav>
<script>
document.addEventListener("DOMContentLoaded", function () {
const modal = new bootstrap.Modal(document.getElementById('paperDetailModal'));
const content = document.getElementById('paper-detail-content');
document.querySelectorAll('.paper-link').forEach(link => {
link.addEventListener('click', function (e) {
e.preventDefault();
const url = this.getAttribute('data-url');
fetch(url)
.then(response => response.text())
.then(html => {
content.innerHTML = html;
modal.show();
})
.catch(err => {
content.innerHTML = '<div class="modal-body text-danger">Error loading details.</div>';
modal.show();
});
});
});
});
</script>
{% endblock content %}
{% block scripts %}
{{ super() }}
<script src="{{ url_for('static', filename='js/modal-handler.js') }}"></script>
<script>
// Use the reusable ModalHandler for paper details
const paperModal = new ModalHandler('paperDetailModal', 'paper-detail-content');
paperModal.setupClickHandlers('.paper-link');
</script>
{% endblock scripts %}
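
modal-handler.js itself is not part of this diff. Judging from the call sites here and in the logs template above, a compatible implementation could look roughly like the following sketch: the constructor arguments and setupClickHandlers come from the call sites, while the fetch-and-show body is assumed and mirrors the removed inline script.

// Sketch of a ModalHandler compatible with the call sites above (assumed implementation).
class ModalHandler {
  constructor(modalId, contentId) {
    this.modalEl = document.getElementById(modalId);
    this.contentEl = document.getElementById(contentId);
    this.modal = new bootstrap.Modal(this.modalEl);
  }

  // Attach click handlers to links carrying a data-url attribute,
  // load the HTML fragment, and show it in the modal.
  setupClickHandlers(selector) {
    document.querySelectorAll(selector).forEach((link) => {
      link.addEventListener("click", (e) => {
        e.preventDefault();
        this.loadAndShow(link.getAttribute("data-url"));
      });
    });
  }

  async loadAndShow(url) {
    try {
      const response = await fetch(url);
      this.contentEl.innerHTML = await response.text();
    } catch (err) {
      this.contentEl.innerHTML =
        '<div class="modal-body text-danger">Error loading details.</div>';
    }
    this.modal.show();
  }
}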

View File

@ -1,93 +1,145 @@
<!-- Server-side flash messages from Flask -->
{% with messages = get_flashed_messages(with_categories=true) %}
{% if messages %}
<div class="server-flash-messages">
{% for category, message in messages %}
<div class="alert alert-{{ category }} alert-dismissible fade show" role="alert">
{{ message }}
<button type="button" class="btn-close" data-bs-dismiss="alert" aria-label="Close"></button>
</div>
{% endfor %}
</div>
{% endif %}
{% endwith %}
<!-- JavaScript flash message container for client-side messages -->
<div id="clientFlashContainer"></div>
<!-- SVG Icons for Flash Messages -->
<svg xmlns="http://www.w3.org/2000/svg" class="d-none">
<symbol id="check-circle-fill" viewBox="0 0 16 16">
<path
d="M16 8A8 8 0 1 1 0 8a8 8 0 0 1 16 0zm-3.97-3.03a.75.75 0 0 0-1.08.022L7.477 9.417 5.384 7.323a.75.75 0 0 0-1.06 1.06L6.97 11.03a.75.75 0 0 0 1.079-.02l3.992-4.99a.75.75 0 0 0-.01-1.05z" />
</symbol>
<symbol id="info-fill" viewBox="0 0 16 16">
<path
d="M8 16A8 8 0 1 0 8 0a8 8 0 0 0 0 16zm.93-9.412-1 4.705c-.07.34.029.533.304.533.194 0 .487-.07.686-.246l-.088.416c-.287.346-.92.598-1.465.598-.703 0-1.002-.422-.808-1.319l.738-3.468c.064-.293.006-.399-.287-.47l-.451-.081.082-.381 2.29-.287zM8 5.5a1 1 0 1 1 0-2 1 1 0 0 1 0 2z" />
</symbol>
<symbol id="exclamation-triangle-fill" viewBox="0 0 16 16">
<path
d="M8.982 1.566a1.13 1.13 0 0 0-1.96 0L.165 13.233c-.457.778.091 1.767.98 1.767h13.713c.889 0 1.438-.99.98-1.767L8.982 1.566zM8 5c.535 0 .954.462.9.995l-.35 3.507a.552.552 0 0 1-1.1 0L7.1 5.995A.905.905 0 0 1 8 5zm.002 6a1 1 0 1 1 0 2 1 1 0 0 1 0-2z" />
</symbol>
<symbol id="x-circle-fill" viewBox="0 0 16 16">
<path
d="M16 8A8 8 0 1 1 0 8a8 8 0 0 1 16 0zM5.354 4.646a.5.5 0 1 0-.708.708L7.293 8l-2.647 2.646a.5.5 0 0 0 .708.708L8 8.707l2.646 2.647a.5.5 0 0 0 .708-.708L8.707 8l2.647-2.646a.5.5 0 0 0-.708-.708L8 7.293 5.354 4.646z" />
</symbol>
</svg>
<!-- CSS styles for flash overlay messages -->
<style>
.client-flash-message {
.flash-overlay {
position: fixed;
top: 30%;
left: 50%;
transform: translate(-50%, -50%);
z-index: 1000;
width: 300px;
text-align: center;
font-weight: bold;
padding: 12px;
margin-bottom: 20px;
border-radius: 6px;
top: 20px;
right: 20px;
z-index: 9999;
max-width: 420px;
opacity: 1;
transition: opacity 5s ease-in-out;
transition: all 0.3s ease-in-out;
transform: translateX(0);
margin-bottom: 10px;
}
.client-flash-message.success {
background-color: #d4edda;
border-color: #c3e6cb;
color: #155724;
.flash-content {
padding: 16px 20px;
border-radius: 8px;
box-shadow: 0 6px 20px rgba(0, 0, 0, 0.15);
display: flex;
align-items: flex-start;
font-weight: 500;
border-left: 4px solid;
position: relative;
}
.client-flash-message.error {
.flash-icon {
width: 20px;
height: 20px;
margin-right: 12px;
margin-top: 1px;
flex-shrink: 0;
}
.flash-message {
flex: 1;
line-height: 1.4;
}
.flash-close {
background: none;
border: none;
font-size: 20px;
cursor: pointer;
padding: 0;
margin-left: 12px;
opacity: 0.6;
line-height: 1;
font-weight: bold;
flex-shrink: 0;
margin-top: -2px;
}
.flash-close:hover {
opacity: 1;
}
.flash-success .flash-content {
background-color: #d1e7dd;
border-left-color: #198754;
color: #0f5132;
}
.flash-danger .flash-content {
background-color: #f8d7da;
border-color: #f5c6cb;
border-left-color: #dc3545;
color: #721c24;
}
.client-flash-message.info {
background-color: #d1ecf1;
border-color: #bee5eb;
color: #0c5460;
}
.client-flash-message.warning {
.flash-warning .flash-content {
background-color: #fff3cd;
border-color: #ffeeba;
color: #856404;
border-left-color: #ffc107;
color: #664d03;
}
.client-flash-message.fade {
.flash-info .flash-content {
background-color: #cff4fc;
border-left-color: #0dcaf0;
color: #055160;
}
.flash-overlay.fade-out {
opacity: 0;
transform: translateX(100%);
}
/* Stack multiple flash messages with smooth transitions */
.flash-overlay {
/* Dynamic positioning will be set by JavaScript */
}
/* Ensure proper z-index stacking */
.flash-overlay:nth-child(1) {
z-index: 9999;
}
.flash-overlay:nth-child(2) {
z-index: 9998;
}
.flash-overlay:nth-child(3) {
z-index: 9997;
}
.flash-overlay:nth-child(4) {
z-index: 9996;
}
.flash-overlay:nth-child(5) {
z-index: 9995;
}
</style>
<!-- Server-side flash messages from Flask -->
{% with messages = get_flashed_messages(with_categories=true) %}
{% if messages %}
<script>
// Global flash message function that can be used from anywhere
function showFlashMessage(message, type = 'success', duration = 5000) {
const flashMsg = document.createElement('div');
flashMsg.className = `client-flash-message ${type}`;
flashMsg.textContent = message;
const container = document.getElementById('clientFlashContainer');
container.appendChild(flashMsg);
// Apply fade effect after some time
setTimeout(() => flashMsg.classList.add('fade'), duration - 3000);
// Remove element after duration
setTimeout(() => flashMsg.remove(), duration);
return flashMsg;
}
// Initialize toast messages if Bootstrap is used
// Convert server-side flash messages to overlay messages
document.addEventListener('DOMContentLoaded', function () {
// Initialize any Bootstrap toasts if they exist
if (typeof bootstrap !== 'undefined' && bootstrap.Toast) {
const toastElList = [].slice.call(document.querySelectorAll('.toast'));
toastElList.map(function (toastEl) {
return new bootstrap.Toast(toastEl);
});
}
});
{% for category, message in messages %}
showFlashMessage({{ message| tojson }}, {{ (category if category != 'error' else 'danger')| tojson }});
{% endfor %}
});
</script>
{% endif %}
{% endwith %}
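
The overlay-style showFlashMessage used above is defined elsewhere (presumably in common.js, which the base template now loads). A sketch consistent with the CSS classes in this partial: only the class names and the error-to-danger mapping are taken from this file, the function body is an assumption.

// Sketch of an overlay-style showFlashMessage matching the CSS above (assumed implementation).
function showFlashMessage(message, type = "success", duration = 5000) {
  const overlay = document.createElement("div");
  overlay.className = `flash-overlay flash-${type}`;

  const content = document.createElement("div");
  content.className = "flash-content";

  const messageEl = document.createElement("div");
  messageEl.className = "flash-message";
  messageEl.textContent = message;

  const closeBtn = document.createElement("button");
  closeBtn.type = "button";
  closeBtn.className = "flash-close";
  closeBtn.setAttribute("aria-label", "Close");
  closeBtn.innerHTML = "&times;";
  closeBtn.addEventListener("click", () => overlay.remove());

  content.append(messageEl, closeBtn);
  overlay.appendChild(content);
  document.body.appendChild(overlay);

  // Fade out via the .fade-out rule, then remove from the DOM.
  setTimeout(() => {
    overlay.classList.add("fade-out");
    setTimeout(() => overlay.remove(), 300);
  }, duration);

  return overlay;
}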

Image file changed (binary content not shown): 2.7 KiB before, 4.1 KiB after.

View File

@ -1,18 +1,82 @@
<div class="modal-header">
<h5 class="modal-title">Log Details</h5>
<button type="button" class="btn-close" data-bs-dismiss="modal"></button>
<h5 class="modal-title"><i class="fas fa-info-circle"></i> Log Details</h5>
<button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
</div>
<div class="modal-body">
<p><strong>Timestamp:</strong> {{ log.timestamp }}</p>
<p><strong>Category:</strong> {{ log.category }}</p>
<p><strong>Action:</strong> {{ log.action }}</p>
<p><strong>Description:</strong> {{ log.description }}</p>
<div class="row">
<div class="col-md-6">
<p><strong>Timestamp:</strong> <span class="text-muted">{{ log.timestamp }}</span></p>
<p><strong>Category:</strong>
<span class="badge bg-secondary">{{ log.category.replace('_', ' ').title() }}</span>
</p>
<p><strong>Action:</strong> <code>{{ log.action }}</code></p>
{% if log.status %}
<p><strong>Status:</strong>
{% if log.status == 'success' %}
<span class="badge bg-success">{{ log.status.title() }}</span>
{% elif log.status == 'error' %}
<span class="badge bg-danger">{{ log.status.title() }}</span>
{% elif log.status == 'warning' %}
<span class="badge bg-warning">{{ log.status.title() }}</span>
{% else %}
<span class="badge bg-info">{{ log.status.title() }}</span>
{% endif %}
</p>
{% endif %}
</div>
<div class="col-md-6">
{% if log.paper_id %}
<p><strong>Paper ID:</strong> <a href="/papers/{{ log.paper_id }}" target="_blank">{{ log.paper_id }}</a></p>
{% endif %}
{% if log.user_id %}
<p><strong>User ID:</strong> {{ log.user_id }}</p>
{% endif %}
{% if log.config_key %}
<p><strong>Config Key:</strong> <code>{{ log.config_key }}</code></p>
{% endif %}
{% if log.source_ip %}
<p><strong>Source IP:</strong> {{ log.source_ip }}</p>
{% endif %}
</div>
</div>
{% if log.description %}
<div class="mt-3">
<p><strong>Description:</strong></p>
<div class="alert alert-light">{{ log.description }}</div>
</div>
{% endif %}
{% if log.old_value or log.new_value %}
<div class="mt-3">
<p><strong>Configuration Changes:</strong></p>
<div class="row">
{% if log.old_value %}
<div class="col-md-6">
<label class="form-label"><strong>Old Value:</strong></label>
<pre class="bg-light p-2"><code>{{ log.old_value }}</code></pre>
</div>
{% endif %}
{% if log.new_value %}
<div class="col-md-6">
<label class="form-label"><strong>New Value:</strong></label>
<pre class="bg-light p-2"><code>{{ log.new_value }}</code></pre>
</div>
{% endif %}
</div>
</div>
{% endif %}
{% if log.extra_data %}
<p><strong>Extra Data:</strong>
<pre><code>{{ log.extra_data }}</code></pre>
</p>
<div class="mt-3">
<p><strong>Additional Data:</strong></p>
<pre class="bg-light p-3"
style="max-height: 300px; overflow-y: auto;"><code id="extra-data-content">{{ log.extra_data }}</code></pre>
</div>
{% endif %}
</div>
<div class="modal-footer">
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">
<i class="fas fa-times"></i> Close
</button>
</div>
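
The #extra-data-content block is styled for pre-wrapped JSON by the stylesheet changes above; a small, purely illustrative sketch of how the modal loader could pretty-print it after injecting this fragment:

// Sketch: pretty-print the raw extra_data JSON once the fragment is in the DOM (assumed helper).
const extraDataEl = document.getElementById("extra-data-content");
if (extraDataEl) {
  try {
    const parsed = JSON.parse(extraDataEl.textContent);
    extraDataEl.textContent = JSON.stringify(parsed, null, 2);
    extraDataEl.parentElement.classList.add("json-formatted");
  } catch (e) {
    // Leave non-JSON extra data untouched.
  }
}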

View File

@ -0,0 +1,249 @@
<!-- Scraper Overview Modal -->
<div class="modal fade" id="scraperOverviewModal" tabindex="-1" role="dialog"
aria-labelledby="scraperOverviewModalLabel" aria-hidden="true">
<div class="modal-dialog modal-xl" role="document">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title" id="scraperOverviewModalLabel">
<i class="fas fa-cogs"></i> Scraper Modules Overview
</h5>
<button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
</div>
<div class="modal-body">
<!-- Loading state -->
<div id="scraperOverviewLoading" class="text-center py-4">
<div class="spinner-border text-primary" role="status">
<span class="visually-hidden">Loading...</span>
</div>
<p class="mt-2 text-muted">Loading scraper information...</p>
</div>
<!-- Error state -->
<div id="scraperOverviewError" class="alert alert-danger d-none" role="alert">
<h6 class="alert-heading">Error Loading Scrapers</h6>
<p id="scraperOverviewErrorMessage"></p>
<button class="btn btn-outline-danger btn-sm" onclick="loadScraperOverview()">
<i class="fas fa-redo"></i> Retry
</button>
</div>
<!-- Content -->
<div id="scraperOverviewContent" class="d-none">
<!-- Scraper Architecture Overview -->
<div class="card mb-4">
<div class="card-header">
<h6 class="mb-0">
<i class="fas fa-info-circle"></i> How Scraper Modules Work
</h6>
</div>
<div class="card-body">
<p class="mb-3">
SciPaperLoader uses a modular scraper architecture where each scraper module handles
specific paper processing stages. Papers flow through different statuses as they are
processed by various scrapers.
</p>
<div class="row">
<div class="col-md-6">
<h6>Key Concepts:</h6>
<ul class="small">
<li><strong>Input Statuses:</strong> Paper statuses this scraper can process
</li>
<li><strong>Output Statuses:</strong> Statuses papers get after processing</li>
<li><strong>Processing Status:</strong> Temporary status while a scraper is working</li>
<li><strong>Pipeline:</strong> Scrapers can be chained together</li>
</ul>
</div>
<div class="col-md-6">
<h6>Status Flow Example:</h6>
<div class="d-flex align-items-center small">
<span class="badge bg-info">New</span>
<i class="fas fa-arrow-right mx-2"></i>
<span class="badge bg-warning">Processing</span>
<i class="fas fa-arrow-right mx-2"></i>
<span class="badge bg-success">Done</span>
</div>
<div class="text-muted mt-1">Papers transition through these statuses</div>
</div>
</div>
</div>
</div>
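The key concepts above map onto a small, uniform interface. Below is a minimal sketch of that contract using a hypothetical `DummyScraper`: the method names `get_input_statuses()` / `get_output_statuses()` and the `"processing"` key match how the diagnostic scripts in this changeset read the active scraper module, while the `"success"` / `"failure"` keys and the concrete status values are illustrative assumptions.

```python
# Illustrative sketch only -- DummyScraper and its status values are assumptions,
# not the project's actual base class.
class DummyScraper:
    """Declares which paper statuses this module consumes and which it produces."""

    def get_input_statuses(self) -> list:
        # Papers in any of these statuses are eligible for this module
        return ["New"]

    def get_output_statuses(self) -> dict:
        # Temporary status while working, plus terminal success/failure statuses
        return {"processing": "Processing", "success": "Done", "failure": "Failed"}


if __name__ == "__main__":
    scraper = DummyScraper()
    print(scraper.get_input_statuses(), "->", scraper.get_output_statuses())
```

Chaining two modules then simply means that one module's success status appears in the next module's input status list.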
<!-- Current System Configuration -->
<div class="card mb-4">
<div class="card-header">
<h6 class="mb-0">
<i class="fas fa-server"></i> System Configuration
</h6>
</div>
<div class="card-body">
<div class="row">
<div class="col-md-4">
<p><strong>Active Scraper Module:</strong> <span id="currentScraperModule"
class="badge bg-primary">Loading...</span></p>
<p><strong>Daily Volume Limit:</strong> <span
id="currentVolumeLimit">Loading...</span> papers</p>
</div>
<div class="col-md-4">
<p><strong>Total Available Modules:</strong> <span
id="totalScraperModules">Loading...</span></p>
<p><strong>Processing Pipeline:</strong> <span
id="processingPipeline">Multi-stage</span></p>
</div>
<div class="col-md-4">
<p><strong>Current Paper Counts:</strong></p>
<div id="paperCountsSummary" class="small">
<!-- Will be populated by JavaScript -->
</div>
</div>
</div>
</div>
</div>
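A rough sketch of how the values shown in this card could be read on the server side. `create_app`, `VolumeConfig`, and `get_scraper()` are the ones used elsewhere in this changeset; the plain division by 24 is an assumption, since the real `ScraperManager.get_current_hour_quota()` may distribute the daily volume differently.

```python
# Sketch under assumptions: the even daily/24 split is a simplification of
# ScraperManager.get_current_hour_quota(), whose implementation is not shown here.
from scipaperloader import create_app
from scipaperloader.models import VolumeConfig
from scipaperloader.scrapers.factory import get_scraper

app = create_app()
with app.app_context():
    volume_config = VolumeConfig.query.first()
    daily_limit = volume_config.volume if volume_config else 0
    scraper = get_scraper()  # the currently configured scraper module

    print("Active scraper module:", type(scraper).__name__)
    print("Daily volume limit:", daily_limit)
    print("Approximate hourly quota:", daily_limit // 24 if daily_limit else 0)
```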
<!-- Available Scrapers Table -->
<div class="card">
<div class="card-header">
<h6 class="mb-0">
<i class="fas fa-list"></i> Available Scraper Modules
</h6>
</div>
<div class="card-body">
<div class="table-responsive">
<table class="table table-hover">
<thead>
<tr>
<th>Module Name</th>
<th>Description</th>
<th>Input Statuses</th>
<th>Success Output</th>
<th>Failure Output</th>
<th>Processing Status</th>
</tr>
</thead>
<tbody id="scrapersTableBody">
<!-- Table content will be populated by JavaScript -->
</tbody>
</table>
</div>
</div>
</div>
<!-- Publisher Parser Overview -->
<div class="card mt-4">
<div class="card-header">
<h6 class="mb-0">
<i class="fas fa-building"></i> Publisher Parser Overview
</h6>
</div>
<div class="card-body">
<div class="row mb-3">
<div class="col-md-12">
<p class="text-muted mb-2">
<i class="fas fa-info-circle"></i>
Publishers are detected from paper URLs and mapped to specific parser modules
for content extraction.
</p>
</div>
</div>
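The detection step described above could look roughly like the following. The hostnames and the registry dict are made-up examples; the actual parser modules added in this changeset are not shown in this excerpt.

```python
# Hypothetical sketch: PARSERS and the example hostnames are illustrative only.
from urllib.parse import urlparse

PARSERS = {
    "www.nature.com": "nature",
    "journals.plos.org": "plos",
}

def detect_parser(paper_url: str):
    """Map a paper URL's hostname to the name of a registered parser module."""
    host = urlparse(paper_url).netloc.lower()
    return PARSERS.get(host)  # None -> shown as "no parser available" in the table below

print(detect_parser("https://www.nature.com/articles/example"))  # -> "nature"
```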
<!-- Publisher Statistics -->
<div class="row mb-4" id="publisherStats">
<!-- Will be populated by JavaScript -->
</div>
<!-- Publishers Table -->
<div class="table-responsive">
<table class="table table-hover table-sm">
<thead>
<tr>
<th>Publisher</th>
<th>Papers</th>
<th>Parser Status</th>
<th>Parser Available</th>
</tr>
</thead>
<tbody id="publishersTableBody">
<!-- Table content will be populated by JavaScript -->
</tbody>
</table>
</div>
</div>
</div>
<!-- Status Flow Diagram -->
<div class="card mt-4">
<div class="card-header">
<h6 class="mb-0">
<i class="fas fa-project-diagram"></i> Paper Status Flow Diagram
</h6>
</div>
<div class="card-body">
<div id="statusFlowDiagram" class="text-center py-4">
<!-- This will be populated by JavaScript -->
</div>
</div>
</div>
</div>
</div>
<div class="modal-footer">
<div class="d-flex justify-content-between w-100">
<small class="text-muted">
<i class="fas fa-lightbulb"></i>
Tip: Scrapers can be chained to create complex processing pipelines
</small>
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<style>
/* Custom styles for the scraper overview modal */
#scraperOverviewModal .modal-xl {
max-width: 1200px;
}
#scraperOverviewModal .table th {
font-size: 0.9rem;
background-color: #f8f9fa;
}
#scraperOverviewModal .badge {
font-size: 0.75rem;
}
#scraperOverviewModal .status-badge {
margin: 2px;
display: inline-block;
}
.status-flow-node {
display: inline-block;
padding: 8px 16px;
margin: 4px;
border-radius: 20px;
font-size: 0.9rem;
font-weight: 500;
}
.status-flow-arrow {
color: #6c757d;
margin: 0 8px;
}
.scraper-description {
max-width: 300px;
word-break: break-word;
}
.input-status-list {
max-width: 150px;
}
.status-output {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 0.8rem;
}
</style>

File diff suppressed because it is too large

View File

@ -1,34 +1,14 @@
{% extends "base.html.jinja" %} {% block content %}
{% extends "base.html.jinja" %}
{% block title %}Import CSV{% endblock title %}
{% block content %}
<h1>Welcome to SciPaperLoader</h1>
<div id="results-container"></div>
<!-- Include flash messages template -->
{% include "partials/flash_messages.html.jinja" %}
{% if success %}
<div class="alert alert-success mt-3">{{ success }}</div>
{% endif %} {% if error_message %}
<div class="alert alert-warning mt-3">
<h4>{{ error_message }}</h4>
<table class="table table-sm table-bordered">
<thead>
<tr>
<th>Row</th>
<th>DOI</th>
<th>Error</th>
</tr>
</thead>
<tbody>
{% for error in error_samples %}
<tr>
<td>{{ error.row }}</td>
<td>{{ error.doi }}</td>
<td>{{ error.error }}</td>
</tr>
{% endfor %}
</tbody>
</table>
<a href="{{ url_for('upload.download_error_log') }}" class="btn btn-outline-secondary">Download Full Error Log</a>
</div>
{% endif %}
<div id="results-container"></div>
<div class="alert alert-info">
<p>
@ -88,93 +68,42 @@
</div>
</div>
{% endblock content %}
{% block scripts %}
{{ super() }}
<!-- Configuration data in JSON format for clean separation -->
<script type="application/json" id="upload-config">
{
"statusUrlTemplate": {{ (url_for('upload.task_status', task_id='') ~ '{taskId}')|tojson }}
}
</script>
<script src="{{ url_for('static', filename='js/form-handler.js') }}"></script>
<script>
const form = document.getElementById("upload-form");
form.addEventListener("submit", function (e) {
e.preventDefault();
document.addEventListener('DOMContentLoaded', function () {
// Read configuration from JSON
const config = JSON.parse(document.getElementById('upload-config').textContent);
// Display loading state immediately
const progressModal = new bootstrap.Modal(document.getElementById("progressModal"));
progressModal.show();
const progressBar = document.getElementById("progressBar");
progressBar.style.width = "5%";
progressBar.textContent = "Starting...";
const formData = new FormData(form);
// Disable the form while processing
const submitButton = form.querySelector("button[type='submit']");
submitButton.disabled = true;
fetch(form.action, {
method: "POST",
body: formData,
})
.then((response) => response.json())
.then((data) => {
if (data.error) {
// Handle error
progressModal.hide();
alert(`Error: ${data.error}`);
submitButton.disabled = false;
return;
}
const taskId = data.task_id;
const interval = setInterval(() => {
fetch("{{ url_for('upload.task_status', task_id='') }}" + taskId)
.then((response) => response.json())
.then((status) => {
console.log("Task status:", status);
if (status.state === "SUCCESS") {
clearInterval(interval);
progressBar.style.width = "100%";
progressBar.textContent = "Completed!";
setTimeout(() => {
progressModal.hide();
showResults(status.result);
submitButton.disabled = false;
}, 1000);
} else if (status.state === "FAILURE") {
clearInterval(interval);
progressBar.style.width = "100%";
progressBar.classList.add("bg-danger");
progressBar.textContent = "Failed!";
setTimeout(() => {
progressModal.hide();
alert(`Task failed: ${status.error || "Unknown error"}`);
submitButton.disabled = false;
}, 1000);
} else {
// Update progress bar with more information
const progress = status.progress || 0;
progressBar.style.width = `${progress}%`;
progressBar.textContent = `${progress}% complete`;
document.getElementById("progressStatus").innerText = `Processing... (${status.state})`;
}
})
.catch((err) => {
console.error("Failed to check task status:", err);
});
}, 1000);
})
.catch((err) => {
console.error("Upload failed:", err);
progressModal.hide();
alert("Upload failed. Please try again.");
submitButton.disabled = false;
});
// Initialize form handler with custom callbacks
const uploadFormHandler = new FormHandler('upload-form', {
statusUrlTemplate: config.statusUrlTemplate,
onSuccess: showResults,
onError: (error) => showFlashMessage(`Upload failed: ${error}`, 'error')
});
});
const showResults = (result) => {
// Show main success message as overlay
const message = `Upload completed! Added: ${result.added}, Updated: ${result.updated}, Skipped: ${result.skipped}, Errors: ${result.error_count}`;
showFlashMessage(message, 'success');
let resultHTML = `<div class="alert alert-success">${message}</div>`;
// Build detailed results HTML for the results container
let resultHTML = '';
// Add skipped records information
if (result.skipped > 0) {
showFlashMessage(`${result.skipped} records were skipped`, 'info');
resultHTML += `
<div class="alert alert-info">
<h4>${result.skipped} records were skipped</h4>
@ -205,6 +134,7 @@
// Existing error display code
if (result.error_count > 0) {
showFlashMessage(`${result.error_count} errors occurred during upload`, 'warning');
resultHTML += `
<div class="alert alert-warning">
<h4>Some errors occurred (${result.error_count} total)</h4>
@ -238,7 +168,8 @@
</div>`;
}
// Display detailed results in container
document.getElementById("results-container").innerHTML = resultHTML;
};
</script>
{% endblock content %}
{% endblock scripts %}
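For reference, the task-status payloads that the polling code above and `tests/test_csv_upload.py` below both rely on, written out as plain Python data. The field names come from that code; the exact string used for the intermediate state is not visible in this diff and is only a placeholder.

```python
# "IN_PROGRESS" is a placeholder -- only SUCCESS/FAILURE and the listed fields
# appear in the code shown in this changeset.
in_progress = {"state": "IN_PROGRESS", "progress": 40}
succeeded = {
    "state": "SUCCESS",
    "result": {"added": 3, "updated": 0, "skipped": 1, "error_count": 0},
}
failed = {"state": "FAILURE", "error": "Could not parse the uploaded CSV"}
```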

131
tests/test_csv_upload.py Normal file
View File

@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
Test script to verify CSV upload functionality works with APScheduler.
"""
import time
import io
from scipaperloader import create_app
def create_test_csv():
"""Create a simple test CSV file."""
csv_content = """title,doi,issn,journal,alternative_id,published_online
Test Paper 1,10.1000/test_upload_001,1234-5678,Test Journal,ALT001,2024-01-01
Test Paper 2,10.1000/test_upload_002,1234-5678,Test Journal,ALT002,2024-01-02
Test Paper 3,10.1000/test_upload_003,1234-5678,Test Journal,ALT003,2024-01-03
"""
return csv_content
def test_csv_upload():
"""Test the CSV upload functionality."""
print("🧪 Testing CSV Upload Functionality")
print("=" * 50)
# Create Flask app
app = create_app()
with app.test_client() as client:
# Create test CSV
csv_content = create_test_csv()
# Prepare file data
csv_file = io.BytesIO(csv_content.encode('utf-8'))
csv_file.name = 'test_upload.csv'
print("📤 Uploading CSV file...")
# Make upload request
response = client.post('/upload/', data={
'file': (csv_file, 'test_upload.csv'),
'delimiter': ',',
'duplicate_strategy': 'skip'
}, content_type='multipart/form-data')
print(f"Response Status: {response.status_code}")
print(f"Response Data: {response.get_json()}")
if response.status_code == 200:
response_data = response.get_json()
if 'task_id' in response_data:
task_id = response_data['task_id']
print(f"✅ Task scheduled successfully: {task_id}")
# Monitor task progress
print("\n📊 Monitoring task progress...")
for i in range(30): # Wait up to 30 seconds
progress_response = client.get(f'/upload/task_status/{task_id}')
if progress_response.status_code == 200:
progress_data = progress_response.get_json()
print(f"Progress: {progress_data}")
if progress_data.get('state') == 'SUCCESS':
print("✅ CSV upload completed successfully!")
result = progress_data.get('result', {})
print(f" Added: {result.get('added', 0)}")
print(f" Skipped: {result.get('skipped', 0)}")
print(f" Errors: {result.get('error_count', 0)}")
return True
elif progress_data.get('state') == 'FAILURE':
print(f"❌ CSV upload failed: {progress_data.get('error')}")
return False
else:
print(f"❌ Failed to get task status: {progress_response.status_code}")
return False
time.sleep(1)
print("⏰ Task did not complete within 30 seconds")
return False
else:
print(f"❌ No task_id in response: {response_data}")
return False
else:
print(f"❌ Upload request failed: {response.status_code}")
print(f"Response: {response.get_data(as_text=True)}")
return False
def check_scheduler_status():
"""Check APScheduler status."""
print("\n🔍 Checking APScheduler Status")
print("=" * 50)
app = create_app()
with app.app_context():
from scipaperloader.scheduler import _scheduler
if not _scheduler:
print("❌ APScheduler not initialized")
return False
if not _scheduler.running:
print("❌ APScheduler not running")
return False
jobs = _scheduler.get_jobs()
print(f"✅ APScheduler running with {len(jobs)} jobs")
# Show current jobs
for job in jobs:
print(f" - {job.id}: {job.name}")
return True
if __name__ == "__main__":
print("🚀 CSV Upload Test Suite")
print("=" * 50)
# First check scheduler status
if not check_scheduler_status():
print("❌ APScheduler issues detected, cannot proceed with test")
exit(1)
# Run the upload test
success = test_csv_upload()
if success:
print("\n🎉 All tests passed! CSV upload is working correctly.")
exit(0)
else:
print("\n❌ Test failed! CSV upload needs debugging.")
exit(1)

View File

@ -0,0 +1,397 @@
#!/usr/bin/env python3
"""
Comprehensive test for APScheduler functionality in SciPaperLoader.
Tests job scheduling, execution, revocation, and hourly scheduler functionality.
"""
import sys
import os
import time
import threading
from datetime import datetime, timedelta
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from scipaperloader import create_app
from scipaperloader.models import PaperMetadata, ScraperState, ActivityLog, ScheduleConfig, VolumeConfig
from scipaperloader.scrapers.manager import ScraperManager
from scipaperloader.db import db
def test_scheduler_functionality():
"""Comprehensive test of APScheduler functionality."""
print("🧪 Testing APScheduler Functionality")
print("=" * 50)
# Create test app with in-memory database
app = create_app({
'TESTING': True,
'SQLALCHEMY_DATABASE_URI': 'sqlite:///:memory:',
})
with app.app_context():
# Test 1: Basic scheduler availability
print("\n📋 Test 1: Scheduler Initialization")
scheduler = app.config.get('SCHEDULER')
if not scheduler:
print("❌ APScheduler not found in app config")
return False
print("✅ APScheduler available and initialized")
print(f"📊 Initial job count: {scheduler.get_job_count()}")
# Test 2: Database table creation
print("\n📋 Test 2: APScheduler Database Tables")
try:
# Check if we can query jobs (which requires tables to exist)
jobs = scheduler.get_paper_jobs()
print("✅ APScheduler database tables exist and accessible")
print(f"📋 Current paper jobs: {len(jobs)}")
except Exception as e:
print(f"❌ APScheduler database tables not accessible: {e}")
return False
# Test 3: Job scheduling functionality
print("\n📋 Test 3: Job Scheduling")
# Create test paper
test_paper = PaperMetadata(
title="Test Paper for Scheduler",
doi="10.1000/test_scheduler_001",
issn="1234-5678",
journal="Test Journal",
status="New"
)
db.session.add(test_paper)
db.session.commit()
# Schedule a paper for processing in 30 seconds (longer delay)
try:
job_id = scheduler.schedule_paper_processing(
paper_id=test_paper.id,
delay_seconds=30 # Increased delay to 30 seconds
# Removed explicit job_id to allow default "paper_job_" prefix
)
print(f"✅ Paper scheduling works: Job ID {job_id}")
except Exception as e:
print(f"❌ Paper scheduling failed: {e}")
return False
# Verify job was scheduled
jobs_after = scheduler.get_paper_jobs()
if len(jobs_after) == 0:
print("❌ No jobs found after scheduling")
return False
print(f"✅ Job successfully scheduled: {len(jobs_after)} paper job(s) found")
# Test 4: Job information retrieval
print("\n📋 Test 4: Job Information Retrieval")
scheduled_job = jobs_after[0]
print(f"✅ Job details accessible:")
print(f" 📝 Job ID: {scheduled_job['id']}")
print(f" 📝 Job Name: {scheduled_job['name']}")
print(f" 📝 Next Run Time: {scheduled_job['next_run_time']}")
print(f" 📝 Args: {scheduled_job['args']}")
# Test 5: Job revocation
print("\n📋 Test 5: Job Revocation")
initial_count = len(jobs_after)
revoked_count = scheduler.revoke_all_scraper_jobs()
if revoked_count != initial_count:
print(f"⚠️ Warning: Expected to revoke {initial_count} jobs, but revoked {revoked_count}")
else:
print(f"✅ Job revocation works: {revoked_count} job(s) revoked")
# Verify jobs were revoked
jobs_after_revocation = scheduler.get_paper_jobs()
if len(jobs_after_revocation) > 0:
print(f"❌ Jobs still exist after revocation: {len(jobs_after_revocation)}")
return False
print("✅ All paper jobs successfully revoked")
# Test 6: Multiple job scheduling
print("\n📋 Test 6: Multiple Job Scheduling")
# Create more test papers
test_papers = []
for i in range(3):
paper = PaperMetadata(
title=f"Test Paper {i+1}",
doi=f"10.1000/test_scheduler_{i+2:03d}",
issn="1234-5678",
journal="Test Journal",
status="New"
)
db.session.add(paper)
test_papers.append(paper)
db.session.commit()
# Schedule multiple papers
scheduled_jobs = []
for i, paper in enumerate(test_papers):
job_id = scheduler.schedule_paper_processing(
paper_id=paper.id,
delay_seconds=10 + i # Stagger the scheduling
# Removed explicit job_id to allow default "paper_job_" prefix
)
scheduled_jobs.append(job_id)
print(f"✅ Multiple job scheduling works: {len(scheduled_jobs)} jobs scheduled")
# Verify all jobs are scheduled
all_jobs = scheduler.get_paper_jobs()
if len(all_jobs) != len(test_papers):
print(f"❌ Expected {len(test_papers)} jobs, found {len(all_jobs)}")
return False
print(f"✅ All jobs properly scheduled: {len(all_jobs)} total jobs")
# Test 7: ScraperManager integration
print("\n📋 Test 7: ScraperManager Integration")
manager = ScraperManager()
# Test paper selection
papers = manager.select_papers_for_processing(limit=2)
print(f"✅ ScraperManager paper selection: {len(papers)} papers selected")
# Test scraper state management with APScheduler
start_result = manager.start_scraper()
if start_result["status"] != "success":
print(f"❌ Failed to start scraper: {start_result['message']}")
return False
print("✅ Scraper started successfully")
# Test job clearing through manager
cleared_count = manager._clear_delayed_tasks_from_apscheduler()
print(f"✅ ScraperManager job clearing: {cleared_count} jobs cleared")
# Verify jobs were cleared
remaining_jobs = scheduler.get_paper_jobs()
if len(remaining_jobs) > 0:
print(f"❌ Jobs still exist after manager clearing: {len(remaining_jobs)}")
return False
print("✅ ScraperManager successfully clears APScheduler jobs")
# Test 8: Hourly scheduler configuration
print("\n📋 Test 8: Hourly Scheduler Configuration")
# Ensure the hourly job is scheduled correctly
all_scheduler_jobs = scheduler._scheduler.get_jobs() if hasattr(scheduler, '_scheduler') and scheduler._scheduler else []
hourly_jobs = [job for job in all_scheduler_jobs if job.id == 'hourly_scraper_main']
if not hourly_jobs:
print("❌ Hourly scheduler job not found")
return False
hourly_job = hourly_jobs[0]
print("✅ Hourly scheduler job found:")
print(f" 📝 Job ID: {hourly_job.id}")
print(f" 📝 Job Name: {hourly_job.name}")
print(f" 📝 Trigger: {hourly_job.trigger}")
print(f" 📝 Next Run: {hourly_job.next_run_time}")
# Test 9: Configuration-based scheduling
print("\n📋 Test 9: Configuration-based Scheduling")
# Set up volume configuration
volume_config = VolumeConfig.query.first()
if not volume_config:
volume_config = VolumeConfig(volume=10) # 10 papers per day
db.session.add(volume_config)
db.session.commit()
# Test quota calculation
quota = manager.get_current_hour_quota()
print(f"✅ Hourly quota calculation: {quota} papers per hour")
if quota < 0:
print("❌ Invalid quota calculation")
return False
# Test 10: Activity logging integration
print("\n📋 Test 10: Activity Logging Integration")
# Check recent APScheduler-related logs
recent_logs = ActivityLog.query.filter(
ActivityLog.action.like('%apscheduler%')
).order_by(ActivityLog.timestamp.desc()).limit(5).all()
print(f"✅ APScheduler activity logging: {len(recent_logs)} related log entries")
if recent_logs:
for log in recent_logs[:3]:
print(f" 📝 {log.action}: {log.description}")
# Test 11: Error handling
print("\n📋 Test 11: Error Handling")
# Test scheduling with invalid paper ID
try:
scheduler.schedule_paper_processing(
paper_id=99999, # Non-existent paper
delay_seconds=1,
job_id="test_error_job"
)
print("✅ Scheduling with invalid paper ID handled gracefully")
except Exception as e:
print(f"✅ Scheduling with invalid paper ID properly raises exception: {e}")
# Test 12: Cleanup and shutdown
print("\n📋 Test 12: Cleanup and Shutdown")
# Stop scraper
stop_result = manager.stop_scraper()
if stop_result["status"] != "success":
print(f"❌ Failed to stop scraper: {stop_result['message']}")
return False
print("✅ Scraper stopped successfully")
# Final job count should be minimal (only hourly scheduler)
final_job_count = scheduler.get_job_count()
final_paper_jobs = len(scheduler.get_paper_jobs())
print(f"📊 Final state:")
print(f" 📝 Total jobs: {final_job_count}")
print(f" 📝 Paper jobs: {final_paper_jobs}")
if final_paper_jobs > 0:
print("❌ Paper jobs still exist after cleanup")
return False
print("✅ Cleanup completed successfully")
print("\n🎉 ALL SCHEDULER TESTS PASSED!")
print("\n📋 Test Summary:")
print(" ✅ APScheduler initialization works")
print(" ✅ Database tables created and accessible")
print(" ✅ Job scheduling functionality works")
print(" ✅ Job information retrieval works")
print(" ✅ Job revocation works")
print(" ✅ Multiple job scheduling works")
print(" ✅ ScraperManager integration works")
print(" ✅ Hourly scheduler configured correctly")
print(" ✅ Configuration-based scheduling works")
print(" ✅ Activity logging integration works")
print(" ✅ Error handling works")
print(" ✅ Cleanup and shutdown works")
return True
def test_job_execution():
"""Test that jobs actually execute (requires waiting)."""
print("\n🔄 Testing Job Execution (5-second test)")
print("-" * 40)
app = create_app({
'TESTING': True,
'SQLALCHEMY_DATABASE_URI': 'sqlite:///:memory:',
})
with app.app_context():
# Initialize database and scheduler
db.create_all()
scheduler = app.config.get('SCHEDULER')
if not scheduler:
print("❌ Scheduler not initialized")
return False
# Create test paper
test_paper = PaperMetadata(
title="Test Paper for Execution",
doi="10.1000/test_execution",
issn="1234-5678",
journal="Test Journal",
status="Pending"
)
db.session.add(test_paper)
db.session.commit()
# Verify paper is added to the database
test_paper_id = test_paper.id
if not test_paper_id:
print("❌ Test paper not added to the database")
return False
# Schedule paper for processing in 2 seconds
job_id = scheduler.schedule_paper_processing(
paper_id=test_paper_id,
delay_seconds=2
)
print(f"📅 Scheduled job {job_id} for execution in 2 seconds")
# Wait and check for execution
print("⏳ Waiting for job execution...")
time.sleep(3)
# Check if job completed (should be removed from scheduler)
remaining_jobs = scheduler.get_paper_jobs()
if remaining_jobs:
print(f"⚠️ Job still in scheduler: {len(remaining_jobs)} remaining")
for job in remaining_jobs:
print(f" 📝 Job ID: {job['id']}, Next Run Time: {job['next_run_time']}")
else:
print("✅ Job executed and removed from scheduler")
# Check activity logs for execution evidence
execution_logs = ActivityLog.query.filter(
ActivityLog.action.like('%process_single_paper%')
).order_by(ActivityLog.timestamp.desc()).limit(3).all()
if execution_logs:
print("✅ Job execution logged in activity:")
for log in execution_logs:
print(f" 📝 {log.action}: {log.description}")
else:
print("⚠️ No execution logs found")
# Validate job execution status in the database
updated_paper = PaperMetadata.query.get(test_paper_id)
if updated_paper:
print(f"🔍 Retrieved paper: {updated_paper.title}, Status: {updated_paper.status}")
if updated_paper.status == "Done":
print("✅ Paper status updated to 'Done'")
else:
print(f"❌ Paper status not updated: {updated_paper.status}")
else:
print("❌ Paper not found in the database")
return True
if __name__ == "__main__":
print(f"📅 Starting scheduler tests at {datetime.now()}")
try:
# Run main functionality tests
success = test_scheduler_functionality()
if success:
print("\n" + "="*50)
# Run execution test if main tests pass
test_job_execution()
print(f"\n📅 Tests completed at {datetime.now()}")
sys.exit(0 if success else 1)
except KeyboardInterrupt:
print("\n⏹️ Tests interrupted by user")
sys.exit(1)
except Exception as e:
print(f"\n❌ Test error: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
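Test 8 above only verifies that a job with the id `hourly_scraper_main` exists; how that job gets registered is not part of this diff. A minimal sketch with plain APScheduler, where the trigger settings and the job body are assumptions:

```python
# Sketch only: the project registers this job inside its own scheduler wrapper
# (app.config['SCHEDULER']), which is not shown in this diff.
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger


def hourly_scraper_main():
    # In the real application this would select papers for the current hour
    # and schedule per-paper jobs (cf. schedule_paper_processing in the tests).
    print("hourly scraper tick")


scheduler = BackgroundScheduler()
scheduler.add_job(
    hourly_scraper_main,
    trigger=CronTrigger(minute=0),  # top of every hour
    id="hourly_scraper_main",
    replace_existing=True,
)
scheduler.start()
```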

View File

@ -18,4 +18,5 @@ def client(app):
def test_index(client):
response = client.get("/")
assert b"It works!" in response.data
# Updated assertion to check for actual content in the index page
assert b"Welcome to SciPaperLoader" in response.data

View File

@ -10,7 +10,7 @@ especially for addressing issues with the scraper module.
**Symptoms:**
- Web interface shows scraper as stopped but papers are still being processed
- `/scraper/stop` endpoint returns success but processing continues
- Active tasks show up in Celery inspector
- Active tasks show up in APScheduler inspector
**Solutions:**
@ -24,7 +24,7 @@ python tools/diagnostics/emergency_stop.py
The emergency stop performs these actions (sketched in code after this list):
- Sets scraper state to inactive in the database
- Revokes all running, reserved, and scheduled Celery tasks
- Revokes all running and scheduled APScheduler tasks
- Purges all task queues
- Reverts papers with "Pending" status to their previous state
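Condensed into code, the sequence looks roughly like the sketch below. Only `ScraperManager.stop_scraper()`, `revoke_all_scraper_jobs()`, and `ScraperState.get_current_state()` are taken from this changeset; the full `emergency_stop.py` script additionally reverts papers stuck in the processing status.

```python
# Rough sketch of the emergency-stop sequence; error handling and paper
# reversion are omitted here (see tools/diagnostics/emergency_stop.py).
from scipaperloader import create_app
from scipaperloader.models import ScraperState
from scipaperloader.scrapers.manager import ScraperManager

app = create_app()
with app.app_context():
    result = ScraperManager().stop_scraper()   # sets state inactive, revokes jobs
    print(result["status"], result.get("message", ""))

    scheduler = app.config.get("SCHEDULER")
    if scheduler:                              # belt and braces: drop leftover paper jobs
        print("revoked:", scheduler.revoke_all_scraper_jobs())

    state = ScraperState.get_current_state()
    print("active:", state.is_active, "paused:", state.is_paused)
```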
@ -33,12 +33,12 @@ The emergency stop performs these actions:
**Symptoms:**
- Code changes don't seem to have any effect
- Bug fixes don't work even though the code is updated
- Workers might be using cached versions of modified code
- APScheduler might be using cached versions of modified code
**Solution:**
```bash
# Use the quick fix to stop tasks and restart workers
# Use the quick fix to stop tasks and restart the application
make diagnostics # Then select option 6 (Quick fix)
# Or directly:
@ -57,7 +57,7 @@ python tools/diagnostics/diagnose_scraper.py
This tool will:
- Show current scraper state
- List all active, scheduled, and reserved tasks
- List all active and scheduled APScheduler tasks
- Display recent activity and error logs
## Preventative Measures
@ -67,11 +67,10 @@ This tool will:
- Deploying code changes
- Modifying the database
2. **Monitor task queue size** using Flower web interface:
2. **Monitor APScheduler jobs** through the diagnostic tools:
```bash
make celery-flower
make diagnostics # Then select option 2 (Inspect tasks)
```
Then visit http://localhost:5555
3. **Check logs for failed tasks** regularly in the Logger tab of the application (a scripted alternative is sketched below)
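The same check can be scripted. Here is a small sketch using the `ActivityLog` model from this changeset; because the diff does not show which status value marks a failed task, it simply prints the most recent entries instead of filtering.

```python
# Minimal sketch: print the latest activity log entries so failed tasks can be
# spotted from a shell as well as from the Logger tab.
from scipaperloader import create_app
from scipaperloader.models import ActivityLog

app = create_app()
with app.app_context():
    recent = (ActivityLog.query
              .order_by(ActivityLog.timestamp.desc())
              .limit(20)
              .all())
    for log in recent:
        print(f"{log.timestamp}  {log.action}: {log.description}")
```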

View File

@ -7,14 +7,14 @@ This directory contains various scripts for diagnosing issues, debugging, and ha
### Scraper Management
- **emergency_stop.py**: Force stops all scraper activities, revokes running tasks, and reverts papers from "Pending" state
- **quick_fix.py**: A simplified emergency stop that also restarts Celery workers to ensure code changes are applied
- **quick_fix.py**: A simplified emergency stop that also stops Flask processes to ensure code changes are applied
- **test_reversion.py**: Tests the paper reversion functionality when stopping the scraper
### Monitoring and Diagnostics
- **check_state.py**: Checks the current state of the scraper in the database
- **diagnose_scraper.py**: Comprehensive diagnostic tool that examines tasks, logs, and scraper state
- **inspect_tasks.py**: Displays currently running, scheduled, and reserved Celery tasks
- **inspect_tasks.py**: Displays currently running and scheduled APScheduler tasks
## Usage
@ -59,5 +59,5 @@ python tools/diagnostics/quick_fix.py
## Notes
- Always run these scripts from the project root directory
- Some scripts may require a running Redis server
- Some scripts may require a running Flask application with APScheduler
- After using emergency tools, the application may need to be restarted completely

View File

@ -3,7 +3,6 @@ Diagnose and fix scraper stopping issues.
"""
from scipaperloader import create_app
from scipaperloader.celery import celery
from scipaperloader.models import ScraperState, ActivityLog
from scipaperloader.scrapers.factory import get_scraper
@ -18,21 +17,15 @@ def check_scraper_status():
else:
print("No scraper state found in database")
def check_celery_tasks():
"""Check currently running Celery tasks."""
i = celery.control.inspect()
print("\n=== ACTIVE TASKS ===")
active_tasks = i.active() or {}
for worker, tasks in active_tasks.items():
for task in tasks:
print(f"Worker: {worker}, Task: {task.get('name', 'Unknown')}, ID: {task.get('id', 'Unknown')}")
print("\n=== SCHEDULED TASKS ===")
scheduled_tasks = i.scheduled() or {}
for worker, tasks in scheduled_tasks.items():
for task in tasks:
print(f"Worker: {worker}, Task: {task.get('name', 'Unknown')}, ID: {task.get('id', 'Unknown')}")
def check_scheduler_jobs():
"""Check the current jobs in APScheduler."""
with app.app_context():
scheduler = app.config.get('SCHEDULER')
if not scheduler:
print("❌ APScheduler not found in app config")
else:
jobs = scheduler.get_paper_jobs()
print("Scheduled jobs:", jobs)
def check_recent_logs():
"""Check recent activity logs for clues."""
@ -60,41 +53,26 @@ def force_stop_scraper():
print("Set scraper state to inactive")
# Revoke all tasks
i = celery.control.inspect()
revoked_ids = []
# Check all queues
for queue_name, queue_func in [
("scheduled", i.scheduled),
("active", i.active),
("reserved", i.reserved)
]:
queue = queue_func() or {}
for worker, tasks in queue.items():
for task in tasks:
task_id = task.get('id')
if task_id and task_id not in revoked_ids:
celery.control.revoke(task_id, terminate=True)
revoked_ids.append(task_id)
print(f"Revoked task: {task_id}")
# Purge all queues
celery.control.purge()
print("Purged all task queues")
scheduler = app.config.get('SCHEDULER')
if not scheduler:
print("❌ APScheduler not found in app config")
else:
revoked_count = scheduler.revoke_all_scraper_jobs()
print(f"✅ Revoked {revoked_count} jobs from APScheduler")
# Log the action
ActivityLog.log_scraper_command(
action="force_stop_scraper",
status="success",
description=f"Force stopped scraper, revoked {len(revoked_ids)} tasks"
description=f"Force stopped scraper, revoked {revoked_count} tasks"
)
print(f"\nRevoked {len(revoked_ids)} tasks in total")
print(f"\nRevoked {revoked_count} tasks in total")
if __name__ == "__main__":
print("=== SCRAPER STATUS DIAGNOSTIC TOOL ===")
check_scraper_status()
check_celery_tasks()
check_scheduler_jobs()
check_recent_logs()
stop_confirmation = input("\nDo you want to force stop the scraper? (y/n): ")

View File

@ -23,7 +23,6 @@ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../.
from scipaperloader import create_app
from scipaperloader.db import db
from scipaperloader.models import PaperMetadata, ActivityLog, ScraperState
from scipaperloader.celery import celery
app = create_app()
@ -38,46 +37,18 @@ def emergency_stop():
ScraperState.set_paused(False)
print("✓ Set scraper state to inactive")
# 2. Revoke all tasks
print("\nRevoking running tasks...")
try:
i = celery.control.inspect()
active = i.active() or {}
scheduled = i.scheduled() or {}
reserved = i.reserved() or {}
# 2. Revoke all jobs in APScheduler
scheduler = app.config.get('SCHEDULER')
if scheduler:
revoked_count = scheduler.revoke_all_scraper_jobs()
print(f"✅ Revoked {revoked_count} jobs from APScheduler")
else:
print("❌ APScheduler not found in app config")
revoked_count = 0
# Revoke active tasks
for worker, tasks in active.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
print(f" Revoked active task: {task.get('name', 'unknown')}")
# Revoke scheduled tasks
for worker, tasks in scheduled.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
# Revoke reserved tasks
for worker, tasks in reserved.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
print(f"✓ Revoked {revoked_count} tasks")
# 3. Purge queues
celery.control.purge()
print("✓ Purged all task queues")
except Exception as e:
print(f"⚠ Error revoking tasks: {str(e)}")
# 3. Revert all papers to 'Pending' state
PaperMetadata.query.filter_by(status="Processing").update({"status": "Pending"})
db.session.commit()
print("✅ Reverted all 'Processing' papers to 'Pending' state")
# 4. Revert papers in "Pending" status
try:

View File

@ -1,11 +1,78 @@
#!/usr/bin/env python3
"""
Inspect current Celery tasks (active, reserved, and scheduled)
Inspect current APScheduler jobs (active and scheduled).
"""
from scipaperloader.celery import celery
import sys
import os
from datetime import datetime
i = celery.control.inspect()
print("Active tasks:", i.active())
print("Reserved tasks:", i.reserved())
print("Scheduled tasks:", i.scheduled())
# Add project root to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from scipaperloader import create_app
from scipaperloader.models import ScraperState
def main():
print("=== APScheduler Task Inspector ===")
print(f"Time: {datetime.now()}\n")
app = create_app()
with app.app_context():
# Check scraper state
scraper_state = ScraperState.get_current_state()
print(f"🔄 Scraper State:")
print(f" Active: {'' if scraper_state.is_active else ''} {scraper_state.is_active}")
print(f" Paused: {'⏸️' if scraper_state.is_paused else '▶️'} {scraper_state.is_paused}")
print()
# Check APScheduler
scheduler = app.config.get('SCHEDULER')
if not scheduler:
print("❌ APScheduler not found in app config")
return
print("📋 APScheduler Status:")
# Access the underlying scheduler
if hasattr(scheduler, 'scheduler') and scheduler.scheduler:
print(f" Running: {'' if scheduler.scheduler.running else ''} {scheduler.scheduler.running}")
else:
print("❌ APScheduler instance not accessible")
print()
# Get all jobs
if hasattr(scheduler, 'scheduler') and scheduler.scheduler:
all_jobs = scheduler.scheduler.get_jobs()
else:
all_jobs = []
paper_jobs = scheduler.get_paper_jobs()
print(f"📊 Job Statistics:")
print(f" Total jobs: {len(all_jobs)}")
print(f" Paper processing jobs: {len(paper_jobs)}")
print()
if paper_jobs:
print("📝 Active Paper Processing Jobs:")
for job in paper_jobs:
next_run = job.get('next_run_time', 'Not scheduled')
print(f"{job['id']}")
print(f" Next run: {next_run}")
print(f" Name: {job.get('name', 'N/A')}")
if job.get('args'):
print(f" Paper ID: {job['args'][0] if job['args'] else 'N/A'}")
print()
else:
print("✅ No active paper processing jobs")
# Show other jobs if any
other_jobs = [job for job in all_jobs if not any(pattern in job.id for pattern in ['paper_process_', 'test_paper_process_', 'process_paper_'])]
if other_jobs:
print(f"🔧 Other Scheduled Jobs ({len(other_jobs)}):")
for job in other_jobs:
next_run = job.next_run_time.strftime('%Y-%m-%d %H:%M:%S') if job.next_run_time else 'Not scheduled'
print(f"{job.id} - Next run: {next_run}")
if __name__ == "__main__":
main()

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
Quick fix script to stop all running scraper tasks and restart Celery workers.
This ensures the updated code is loaded and tasks are properly terminated.
Quick fix script to stop all running scraper tasks using APScheduler.
This ensures all scheduled tasks are properly terminated.
"""
import os
@ -9,45 +9,55 @@ import sys
import signal
import subprocess
import time
from datetime import datetime
from datetime import datetime, UTC
# Add project root to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
def kill_celery_processes():
"""Kill all running Celery processes"""
print("Killing Celery processes...")
def stop_apscheduler_jobs():
"""Stop all APScheduler jobs through the Flask app"""
print("Stopping APScheduler jobs...")
try:
# Get all celery processes
result = subprocess.run(['pgrep', '-f', 'celery'], capture_output=True, text=True)
from scipaperloader import create_app
app = create_app()
with app.app_context():
scheduler = app.config.get('SCHEDULER')
if scheduler:
revoked_count = scheduler.revoke_all_scraper_jobs()
print(f"✓ Revoked {revoked_count} APScheduler jobs")
else:
print("❌ APScheduler not found in app config")
except Exception as e:
print(f"⚠ Error stopping APScheduler jobs: {e}")
def kill_python_processes():
"""Kill any running Python processes that might be Flask/APScheduler workers"""
print("Checking for running Flask/APScheduler processes...")
try:
# Look for Flask processes
result = subprocess.run(['pgrep', '-f', 'flask'], capture_output=True, text=True)
if result.returncode == 0:
pids = result.stdout.strip().split('\n')
for pid in pids:
if pid:
try:
os.kill(int(pid), signal.SIGTERM)
print(f" Killed process {pid}")
except ProcessLookupError:
pass # Process already dead
# Check if this is our process before killing
cmdline_result = subprocess.run(['ps', '-p', pid, '-o', 'cmd='], capture_output=True, text=True)
if 'scipaperloader' in cmdline_result.stdout:
os.kill(int(pid), signal.SIGTERM)
print(f" Killed Flask process {pid}")
except (ProcessLookupError, ValueError):
pass # Process already dead or invalid PID
# Wait a moment for graceful shutdown
time.sleep(2)
else:
print("✓ No Flask processes found")
# Force kill any remaining processes
result = subprocess.run(['pgrep', '-f', 'celery'], capture_output=True, text=True)
if result.returncode == 0:
pids = result.stdout.strip().split('\n')
for pid in pids:
if pid:
try:
os.kill(int(pid), signal.SIGKILL)
print(f" Force killed process {pid}")
except ProcessLookupError:
pass
print("✓ All Celery processes terminated")
except Exception as e:
print(f"⚠ Error killing processes: {e}")
print(f"⚠ Error checking processes: {e}")
def stop_scraper_state():
"""Set scraper state to inactive using Flask app context"""
@ -55,6 +65,7 @@ def stop_scraper_state():
from scipaperloader import create_app
from scipaperloader.models import ScraperState, PaperMetadata
from scipaperloader.db import db
from scipaperloader.scrapers.factory import get_scraper
app = create_app()
with app.app_context():
@ -63,41 +74,57 @@ def stop_scraper_state():
ScraperState.set_paused(False)
print("✓ Set scraper state to inactive")
# Revert any pending papers to "New" status (simple approach since we don't have previous_status data yet)
pending_papers = PaperMetadata.query.filter_by(status="Pending").all()
# Get scraper configuration for proper status reversion
scraper = get_scraper()
input_statuses = scraper.get_input_statuses()
output_statuses = scraper.get_output_statuses()
processing_status = output_statuses.get("processing", "Processing")
# Revert any papers in processing status
processing_papers = PaperMetadata.query.filter_by(status=processing_status).all()
reverted_count = 0
for paper in pending_papers:
paper.status = "New" # Simple fallback - revert all to "New"
reverted_count += 1
if processing_papers and input_statuses:
revert_status = input_statuses[0] # Use first input status as default
for paper in processing_papers:
# Try to use previous_status if available, otherwise use first input status
if hasattr(paper, 'previous_status') and paper.previous_status:
paper.status = paper.previous_status
else:
paper.status = revert_status
paper.updated_at = datetime.now(UTC)
reverted_count += 1
if reverted_count > 0:
db.session.commit()
print(f"✓ Reverted {reverted_count} papers from 'Pending' to 'New'")
print(f"✓ Reverted {reverted_count} papers from '{processing_status}' to previous status")
else:
print("✓ No pending papers to revert")
print("✓ No papers in processing status to revert")
except Exception as e:
print(f"⚠ Error setting scraper state: {e}")
def main():
print("=== QUICK SCRAPER FIX ===")
print("=== QUICK SCRAPER FIX (APScheduler) ===")
print(f"Time: {datetime.now()}")
print()
# Step 1: Stop scraper state
# Step 1: Stop scraper state and revert papers
stop_scraper_state()
# Step 2: Kill all Celery processes
kill_celery_processes()
# Step 2: Stop all APScheduler jobs
stop_apscheduler_jobs()
# Step 3: Kill any running Flask processes
kill_python_processes()
print()
print("=== FIX COMPLETE ===")
print("The scraper has been stopped and all tasks terminated.")
print("You can now restart the Celery workers with:")
print(" make celery")
print("or")
print("You can now restart the application with:")
print(" make run")
print("or")
print(" python -m flask --app scipaperloader run")
if __name__ == "__main__":
main()

View File

@ -1,16 +1,17 @@
#!/usr/bin/env python3
"""
Test script for verifying the paper reversion fix.
Test script for verifying the paper reversion fix with APScheduler.
This script:
1. Simulates stopping the scraper
2. Checks that all pending papers were reverted to their previous status
3. Ensures all running tasks were terminated
1. Creates test papers and simulates processing
2. Tests the stop_scraper functionality
3. Checks that all pending papers were reverted to their previous status
4. Ensures all running tasks were terminated
"""
import os
import sys
import time
from datetime import datetime
from datetime import datetime, UTC, timedelta
from sqlalchemy import func
from flask import Flask
@ -21,81 +22,136 @@ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../.
from scipaperloader import create_app
from scipaperloader.db import db
from scipaperloader.models import PaperMetadata, ActivityLog, ScraperState
from scipaperloader.celery import celery
from scipaperloader.scrapers.factory import get_scraper
from scipaperloader.scrapers.manager import ScraperManager
print("[DEBUG] Initializing Flask app...")
app = create_app()
print("[DEBUG] Flask app initialized.")
def test_stop_scraper():
"""Test the stop_scraper functionality"""
"""Test the stop_scraper functionality with proper APScheduler integration"""
print("[DEBUG] Entering app context...")
with app.app_context():
# First check current scraper state
print("[DEBUG] App context entered.")
# Clear existing test data
print("[DEBUG] Clearing existing test data...")
PaperMetadata.query.filter(PaperMetadata.doi.like('10.1234/test%')).delete()
db.session.commit()
print("[DEBUG] Existing test data cleared.")
# Get scraper configuration
scraper = get_scraper()
input_statuses = scraper.get_input_statuses()
output_statuses = scraper.get_output_statuses()
if not input_statuses:
print("❌ No input statuses found for current scraper")
return
input_status = input_statuses[0] # Use first input status
processing_status = output_statuses.get("processing", "Processing")
print(f"[DEBUG] Using input status: {input_status}")
print(f"[DEBUG] Using processing status: {processing_status}")
# Create test papers in input status
test_papers = []
print("[DEBUG] Creating test papers...")
for i in range(3):
test_paper = PaperMetadata()
test_paper.title = f"Test Paper {i+1}"
test_paper.doi = f"10.1234/test{i+1}"
test_paper.status = input_status
test_paper.created_at = datetime.now(UTC)
test_paper.updated_at = datetime.now(UTC)
db.session.add(test_paper)
test_papers.append(test_paper)
db.session.commit()
print(f"[DEBUG] Created {len(test_papers)} test papers in '{input_status}' status.")
# Simulate some papers being moved to processing status
print("[DEBUG] Simulating papers in processing...")
for i, paper in enumerate(test_papers[:2]): # Move first 2 papers to processing
paper.previous_status = paper.status # Store previous status
paper.status = processing_status
paper.updated_at = datetime.now(UTC)
db.session.commit()
print(f"[DEBUG] Moved 2 papers to '{processing_status}' status.")
# Check current scraper state
scraper_state = ScraperState.get_current_state()
print(f"Current scraper state: active={scraper_state.is_active}, paused={scraper_state.is_paused}")
print(f"[DEBUG] Current scraper state: active={scraper_state.is_active}, paused={scraper_state.is_paused}")
# Check if there are any papers in "Pending" state
pending_count = PaperMetadata.query.filter_by(status="Pending").count()
print(f"Papers in 'Pending' state before stopping: {pending_count}")
# Check paper counts before stopping
input_count = PaperMetadata.query.filter_by(status=input_status).count()
processing_count = PaperMetadata.query.filter_by(status=processing_status).count()
print(f"[DEBUG] Papers before stopping: {input_count} in '{input_status}', {processing_count} in '{processing_status}'")
if pending_count == 0:
print("No papers in 'Pending' state to test with.")
print("Would you like to create a test paper in Pending state? (y/n)")
choice = input().lower()
if choice == 'y':
# Create a test paper
paper = PaperMetadata(
title="Test Paper for Reversion",
doi="10.1234/test.123",
status="Pending",
previous_status="New", # Test value we expect to be reverted to
created_at=datetime.utcnow(),
updated_at=datetime.utcnow()
# Test APScheduler job management
scheduler = app.config.get('SCHEDULER')
if scheduler:
print("[DEBUG] Testing APScheduler job management...")
# Create some test jobs using the correct API
for paper in test_papers:
job_id = scheduler.schedule_paper_processing(
paper_id=paper.id,
delay_seconds=60, # 1 minute from now
job_id=f"test_paper_process_{paper.id}"
)
db.session.add(paper)
db.session.commit()
print(f"Created test paper with ID {paper.id}, status='Pending', previous_status='New'")
pending_count = 1
print(f"[DEBUG] Scheduled job {job_id} for paper {paper.id}")
# Simulate the stop_scraper API call
from scipaperloader.blueprints.scraper import revert_pending_papers
print("Reverting pending papers...")
reverted = revert_pending_papers()
print(f"Reverted {reverted} papers from 'Pending' state")
jobs_before = len(scheduler.get_paper_jobs())
print(f"[DEBUG] Created {jobs_before} test jobs in APScheduler")
# Check if any papers are still in "Pending" state
still_pending = PaperMetadata.query.filter_by(status="Pending").count()
print(f"Papers still in 'Pending' state after stopping: {still_pending}")
# Test the manager's stop_scraper method
print("[DEBUG] Testing ScraperManager.stop_scraper()...")
manager = ScraperManager()
result = manager.stop_scraper()
# List any that were reverted and their current status
if reverted > 0:
print("\nPapers that were reverted:")
recent_logs = ActivityLog.query.filter_by(action="revert_pending").order_by(
ActivityLog.timestamp.desc()).limit(10).all()
print(f"[DEBUG] stop_scraper result: {result}")
for log in recent_logs:
paper = PaperMetadata.query.get(log.paper_id)
if paper:
print(f"Paper ID {paper.id}: '{paper.title}' - Now status='{paper.status}'")
# Check jobs after stopping
jobs_after = len(scheduler.get_paper_jobs())
print(f"[DEBUG] Jobs after stopping: {jobs_after} (should be 0)")
# Check active celery tasks
i = celery.control.inspect()
active = i.active() or {}
reserved = i.reserved() or {}
scheduled = i.scheduled() or {}
if jobs_after == 0:
print("✅ All APScheduler jobs successfully revoked")
else:
print(f"{jobs_after} jobs still exist after revocation")
else:
print("❌ APScheduler not found in app config")
active_count = sum(len(tasks) for worker, tasks in active.items())
reserved_count = sum(len(tasks) for worker, tasks in reserved.items())
scheduled_count = sum(len(tasks) for worker, tasks in scheduled.items())
# Check paper counts after stopping
input_count_after = PaperMetadata.query.filter_by(status=input_status).count()
processing_count_after = PaperMetadata.query.filter_by(status=processing_status).count()
print(f"[DEBUG] Papers after stopping: {input_count_after} in '{input_status}', {processing_count_after} in '{processing_status}'")
print(f"\nCurrently {active_count} active, {reserved_count} reserved, and {scheduled_count} scheduled tasks")
# Verify that processing papers were reverted
if processing_count_after == 0 and input_count_after >= processing_count:
print("✅ Papers successfully reverted from processing to previous status")
else:
print(f"❌ Paper reversion failed: expected 0 processing papers, got {processing_count_after}")
# Print conclusion
if still_pending == 0 and reverted > 0:
print("\nSUCCESS: All pending papers were properly reverted!")
elif still_pending > 0:
print(f"\nWARNING: {still_pending} papers are still in 'Pending' state!")
elif pending_count == 0 and reverted == 0:
print("\nNo papers to revert. Can't fully test.")
# Check scraper state after stopping
scraper_state_after = ScraperState.get_current_state()
print(f"[DEBUG] Scraper state after stopping: active={scraper_state_after.is_active}, paused={scraper_state_after.is_paused}")
if __name__ == "__main__":
test_stop_scraper()
if not scraper_state_after.is_active and not scraper_state_after.is_paused:
print("✅ Scraper state correctly set to inactive")
else:
print("❌ Scraper state not properly updated")
# Clean up test data
print("[DEBUG] Cleaning up test data...")
PaperMetadata.query.filter(PaperMetadata.doi.like('10.1234/test%')).delete()
db.session.commit()
print("[DEBUG] Test data cleaned up.")
print("[DEBUG] Starting test_stop_scraper...")
test_stop_scraper()
print("[DEBUG] test_stop_scraper completed.")