Compare commits: 1e97a9cc7b...fe202b56d0 (21 commits)

fe202b56d0, 24f9eb5766, 4a10052eae, 8f064cda34, 7fd403bd40, a7964a2f3d, ce6bc03b46,
70e2e2e900, 793f6f9a7e, 8f84774880, 98901ce38e, d730137d20, e2ae95cea0, 676a3c96eb,
7a1ab3d7e6, a4eb7648d5, 88e180bc94, 5c5afefe40, 8ffcf4d65c, ceeb6c375d, 3b42010fab
.gitignore (vendored, 3 changes)

@@ -17,4 +17,5 @@ dist/
migrations/

celerybeat-schedule*
# APScheduler job store files
jobs.sqlite
Makefile (84 changes)

@@ -1,10 +1,9 @@
# List of phony targets (targets that don't represent files)
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev celery celery-flower redis run-all diagnostics
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev run-scheduler diagnostics clean-papers purge-db

# Define Python and pip executables inside virtual environment
PYTHON := venv/bin/python
PIP := venv/bin/pip
CELERY := venv/bin/celery
FLASK := venv/bin/flask

# Default target that runs the application

@@ -15,7 +14,7 @@ clean:
rm -rf venv build dist .pytest_cache .mypy_cache *.egg-info

# Define database path
DB_PATH=scipaperloader/papers.db
DB_PATH=instance/papers.db

# Backup the database with timestamp
backup-db:

@@ -91,6 +90,24 @@ reset-db: venv
$(PYTHON) -m flask --app scipaperloader db migrate -m "Initial migration"
$(PYTHON) -m flask --app scipaperloader db upgrade

# Clean all papers from the database (keep other tables intact)
clean-papers: venv
@echo "Cleaning all papers from the database..."
@$(PYTHON) -c "from scipaperloader.db import db; from scipaperloader.models import PaperMetadata; from scipaperloader import create_app; app = create_app(); app.app_context().push(); PaperMetadata.query.delete(); db.session.commit(); print('All papers have been removed from the database')"

# Completely purge all database contents (removes all tables and data)
purge-db: venv
@echo "WARNING: This will completely wipe all database contents!"
@read -p "Are you sure you want to continue? (y/N) " -n 1 -r; \
echo; \
if [[ $$REPLY =~ ^[Yy]$$ ]]; then \
echo "Purging database..."; \
rm -f $(DB_PATH); \
echo "Database completely purged"; \
else \
echo "Operation cancelled"; \
fi

# Create and set up virtual environment
venv:
python3 -m venv venv && \

@@ -133,65 +150,12 @@ dist: format-check lint mypy test
# Set up complete development environment
dev: clean venv

# Start Celery worker - PURGE FIRST
celery: venv redis
@echo "Purging Celery task queue before starting worker..."
# Purge the queue forcefully. Ignore errors if queue is empty/unreachable initially.
@-$(CELERY) -A celery_worker:celery purge -f
@echo "Starting Celery worker..."
$(CELERY) -A celery_worker:celery worker --loglevel=info

# Monitor Celery tasks with flower web interface
celery-flower: venv
$(PIP) install flower
$(CELERY) -A celery_worker:celery flower --port=5555

# Run Celery beat scheduler for periodic tasks
celery-beat: venv redis
@echo "Starting Celery beat scheduler..."
# Ensure celerybeat-schedule file is removed for clean start if needed
@-rm -f celerybeat-schedule.db
# Use the default file-based scheduler (removed the --scheduler flag)
$(CELERY) -A celery_worker:celery beat --loglevel=info

# Check if Redis is running, start if needed
redis:
@if ! redis-cli ping > /dev/null 2>&1; then \
echo "Starting Redis server..."; \
redis-server --daemonize yes; \
sleep 1; \
else \
echo "Redis is already running."; \
fi

# Run complete application stack (Flask app + Celery worker + Redis + Beat scheduler)
run-all: redis
@echo "Starting Flask, Celery worker and Beat scheduler..."
# Run them in parallel. Ctrl+C will send SIGINT to make, which propagates.
# Use trap to attempt cleanup, but primary cleanup is purge on next start.
@trap '$(MAKE) stop-all;' INT TERM; \
$(MAKE) -j3 run celery celery-beat & wait

# Stop running Celery worker and beat gracefully
stop-celery:
@echo "Attempting graceful shutdown of Celery worker and beat..."
@-pkill -TERM -f "celery -A celery_worker:celery worker" || echo "Worker not found or already stopped."
@-pkill -TERM -f "celery -A celery_worker:celery beat" || echo "Beat not found or already stopped."
@sleep 1 # Give processes a moment to terminate
@echo "Purging remaining tasks from Celery queue..."
@-$(CELERY) -A celery_worker:celery purge -f || echo "Purge failed or queue empty."

# Stop Flask development server
stop-flask:
@echo "Attempting shutdown of Flask development server..."
@-pkill -TERM -f "flask --app scipaperloader --debug run" || echo "Flask server not found or already stopped."

# Stop all components potentially started by run-all
stop-all: stop-celery stop-flask
@echo "All components stopped."
# Start the APScheduler-enabled Flask application
run-scheduler: venv
@echo "Starting Flask app with APScheduler..."
$(PYTHON) -m flask --app scipaperloader --debug run

# Run diagnostic tools
# Run diagnostic tools - works with or without virtualenv
diagnostics:
$(PYTHON) tools/run_diagnostics.py
README.md (59 changes)

@@ -15,7 +15,6 @@ And open it in the browser at [http://localhost:5000/](http://localhost:5000/)
## Prerequisites

- Python >=3.8
- Redis (for Celery task queue)

## Development environment

@@ -41,30 +40,39 @@ And open it in the browser at [http://localhost:5000/](http://localhost:5000/)
add development dependencies under `project.optional-dependencies.*`; run
`make clean && make venv` to reinstall the environment

## Asynchronous Task Processing with Celery
## Task Processing Architecture

SciPaperLoader uses Celery for processing large CSV uploads and other background tasks. This allows the application to handle large datasets reliably without blocking the web interface.
SciPaperLoader uses **APScheduler** for all task processing:

### Running Celery Components
- **Periodic Tasks**: Hourly scraper scheduling with randomized paper processing
- **Background Tasks**: CSV uploads, manual paper processing, and all async operations
- **Job Management**: Clean job scheduling, revocation, and status tracking

- `make redis`: ensures Redis server is running (required for Celery)
This unified architecture provides reliable task processing with simple, maintainable code.
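The application wires this up through the `ScraperScheduler` helper that `create_app()` instantiates (see `scipaperloader/__init__.py` below). As a rough sketch of the same idea, assuming a plain `BackgroundScheduler` with a SQLAlchemy job store pointed at the instance database; the options `ScraperScheduler` actually passes may differ:

```python
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore

def build_scheduler(db_url: str = "sqlite:///instance/papers.db") -> BackgroundScheduler:
    """Illustrative only: a persistent scheduler backed by the application's database."""
    scheduler = BackgroundScheduler(
        jobstores={"default": SQLAlchemyJobStore(url=db_url)},  # jobs survive restarts
        timezone="UTC",
    )
    scheduler.start()  # runs jobs in a background thread of the Flask process
    return scheduler
```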
- `make celery`: starts a Celery worker to process background tasks
### Running Components

- `make celery-flower`: starts Flower, a web interface for monitoring Celery tasks at http://localhost:5555
- `make run`: starts the Flask application with integrated APScheduler

- `make run-all`: runs the entire stack (Flask app + Celery worker + Redis) in development mode
For development monitoring:
- Access the Flask admin interface for APScheduler job monitoring
- View real-time logs in the application's activity log section
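For ad-hoc inspection during development you can also query the scheduler directly from a Flask shell. A small sketch, assuming the module-level `_scheduler` in `scipaperloader.scheduler` holds the running APScheduler instance (as the upload blueprint assumes further down in this diff):

```python
# Sketch: list pending APScheduler jobs from `flask --app scipaperloader shell`.
from scipaperloader.scheduler import _scheduler

if _scheduler and _scheduler.running:
    for job in _scheduler.get_jobs():
        print(job.id, job.next_run_time)  # standard APScheduler Job attributes
```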
### How It Works

When you upload a CSV file through the web interface:
**For CSV Uploads:**
1. File is uploaded through the web interface
2. APScheduler creates a background job to process the file
3. Browser shows progress updates via AJAX polling
4. Results are displayed when processing completes

1. The file is sent to the server
2. A Celery task is created to process the file asynchronously
3. The browser shows a progress bar with real-time updates
4. The results are displayed when processing is complete
**For Scheduled Scraping:**
1. APScheduler runs hourly at the top of each hour
2. Papers are selected based on volume and schedule configuration
3. Individual paper processing jobs are scheduled at random times within the hour
4. All jobs are tracked in the database with complete visibility

This architecture allows SciPaperLoader to handle CSV files with thousands of papers without timing out or blocking the web interface.
This unified architecture provides reliable task processing without external dependencies.
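The randomized per-paper scheduling described above boils down to one-off `date` jobs with offsets inside the current hour. A rough sketch under those assumptions (`process_paper` is a hypothetical stand-in for the real task; the actual logic lives in `ScraperManager` and the scheduler module):

```python
import random
from datetime import datetime, timedelta

def process_paper(paper_id: int) -> None:
    """Hypothetical stand-in for the real per-paper scraping task."""
    print(f"processing paper {paper_id}")

def schedule_current_hour(scheduler, paper_ids):
    """Illustrative only: spread this hour's papers over random minutes of the hour."""
    now = datetime.utcnow()
    for paper_id in paper_ids:
        run_at = now + timedelta(minutes=random.randint(1, 59))
        scheduler.add_job(
            func=process_paper,
            trigger="date",                    # one-off job
            run_date=run_at,
            args=[paper_id],
            id=f"paper_{paper_id}_{run_at:%H%M%S}",
            replace_existing=True,
        )
```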
## Configuration

@@ -72,12 +80,12 @@ Default configuration is loaded from `scipaperloader.defaults` and can be
overriden by environment variables with a `FLASK_` prefix. See
[Configuring from Environment Variables](https://flask.palletsprojects.com/en/3.0.x/config/#configuring-from-environment-variables).

### Celery Configuration
### Task Processing Configuration

The following environment variables can be set to configure Celery:
APScheduler automatically uses your configured database for job persistence. No additional configuration required.

- `FLASK_CELERY_BROKER_URL`: Redis URL for the message broker (default: `redis://localhost:6379/0`)
- `FLASK_CELERY_RESULT_BACKEND`: Redis URL for storing task results (default: `redis://localhost:6379/0`)
For advanced configuration, you can set:
- `FLASK_SQLALCHEMY_DATABASE_URI`: Database URL (APScheduler uses the same database)

Consider using
[dotenv](https://flask.palletsprojects.com/en/3.0.x/cli/#environment-variables-from-dotenv).

@@ -115,17 +123,18 @@ You must set a
[SECRET_KEY](https://flask.palletsprojects.com/en/3.0.x/tutorial/deploy/#configure-the-secret-key)
in production to a secret and stable value.

### Deploying with Celery
### Deploying with APScheduler

When deploying to production:

1. Configure a production-ready Redis instance or use a managed service
2. Run Celery workers as system services or in Docker containers
3. Consider setting up monitoring for your Celery tasks and workers
1. APScheduler jobs are automatically persistent in your database
2. The Flask application handles all background processing internally
3. No external message broker or workers required
4. Scale by running multiple Flask instances with shared database

## Troubleshooting and Diagnostics

SciPaperLoader includes a collection of diagnostic and emergency tools to help address issues with the application, particularly with the scraper and Celery task system.
SciPaperLoader includes a collection of diagnostic and emergency tools to help address issues with the application, particularly with the scraper and APScheduler task system.

### Quick Access

@@ -151,7 +160,7 @@ All diagnostic tools are located in the `tools/diagnostics/` directory:

- **check_state.py**: Quickly check the current state of the scraper in the database
- **diagnose_scraper.py**: Comprehensive diagnostic tool that examines tasks, logs, and scraper state
- **inspect_tasks.py**: View currently running, scheduled, and reserved Celery tasks
- **inspect_tasks.py**: View currently running and scheduled APScheduler tasks
- **test_reversion.py**: Test the paper reversion functionality when stopping the scraper

### Emergency Recovery

@@ -159,7 +168,7 @@ All diagnostic tools are located in the `tools/diagnostics/` directory:
For cases where the scraper is stuck or behaving unexpectedly:

- **emergency_stop.py**: Force stops all scraper activities, revokes all running tasks, and reverts papers from "Pending" state
- **quick_fix.py**: Simplified emergency stop that also restarts Celery workers to ensure code changes are applied
- **quick_fix.py**: Simplified emergency stop that also stops Flask processes to ensure code changes are applied

### Usage Example
celery_worker.py (deleted)

@@ -1,11 +0,0 @@
from scipaperloader.celery import celery, configure_celery
# Import all task modules to ensure they are registered with Celery
import scipaperloader.scrapers.tasks  # Import new scheduler tasks
import scipaperloader.blueprints.scraper  # Import the scraper module with our tasks

# Configure celery with Flask app
configure_celery()

if __name__ == '__main__':
    # Start the Celery worker
    celery.start(['worker', '--loglevel=info', '--concurrency=2'])
pyproject.toml

@@ -13,10 +13,10 @@ dependencies = [
    "flask-wtf>=1.2.2,<2",
    "pyzotero>=1.6.11,<2",
    "pandas>=2.2.3,<3",
    "celery>=5.5.1,<6",
    "redis>=5.2.1,<6",
    "flower>=2.0.1,<3",
    "APScheduler>=3.10.4,<4",
    "flask-migrate>=4.1.0,<5",
    "beautifulsoup4>=4.13.4,<5 ",
    "requests>=2.32.4,<3"
]

[project.optional-dependencies]
scipaperloader/__init__.py

@@ -5,14 +5,23 @@ from .db import db
from .models import init_schedule_config
from .models import ActivityLog, ActivityCategory
from .blueprints import register_blueprints
from .scheduler import ScraperScheduler

def create_app(test_config=None):
    app = Flask(__name__)
    app = Flask(__name__, instance_relative_config=True)
    app.config.from_object(Config)

    # Celery configuration
    app.config['CELERY_BROKER_URL'] = app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0')
    app.config['CELERY_RESULT_BACKEND'] = app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0')
    # Ensure the instance folder exists
    import os
    try:
        os.makedirs(app.instance_path)
    except OSError:
        pass

    # Set the database URI to use absolute path if it's the default relative path
    if app.config['SQLALCHEMY_DATABASE_URI'] == "sqlite:///instance/papers.db":
        db_path = os.path.join(app.instance_path, 'papers.db')
        app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{db_path}'

    if test_config:
        app.config.update(test_config)

@@ -24,6 +33,12 @@ def create_app(test_config=None):
    db.create_all()
    init_schedule_config()

    # Initialize APScheduler
    scheduler = ScraperScheduler(app)

    # Store scheduler in app config for access from other modules
    app.config['SCHEDULER'] = scheduler

    @app.context_processor
    def inject_app_title():
        return {"app_title": app.config["APP_TITLE"]}
@ -2,7 +2,7 @@
|
||||
from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify, current_app
|
||||
from ..db import db
|
||||
# Import the new model
|
||||
from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig, PaperMetadata
|
||||
from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig, PaperMetadata, TimezoneConfig
|
||||
from ..defaults import MAX_VOLUME
|
||||
import os # Import os for path validation
|
||||
import sys
|
||||
@ -129,6 +129,54 @@ def _update_download_path(new_path):
|
||||
return False, f"Error updating download path: {str(e)}", None
|
||||
|
||||
|
||||
def _update_timezone(new_timezone):
|
||||
"""
|
||||
Helper function to update timezone configuration.
|
||||
|
||||
Args:
|
||||
new_timezone (str): The new timezone
|
||||
|
||||
Returns:
|
||||
tuple: (success, message, timezone_config)
|
||||
"""
|
||||
try:
|
||||
# Basic validation: check if it's a non-empty string
|
||||
if not new_timezone or not isinstance(new_timezone, str):
|
||||
return False, "Timezone cannot be empty.", None
|
||||
|
||||
# Validate timezone using pytz
|
||||
try:
|
||||
import pytz
|
||||
pytz.timezone(new_timezone) # This will raise an exception if invalid
|
||||
except ImportError:
|
||||
# If pytz is not available, do basic validation
|
||||
if '/' not in new_timezone:
|
||||
return False, "Invalid timezone format. Use format like 'Europe/Berlin'.", None
|
||||
except pytz.exceptions.UnknownTimeZoneError:
|
||||
return False, f"Unknown timezone: {new_timezone}. Use format like 'Europe/Berlin'.", None
|
||||
|
||||
config = TimezoneConfig.query.first()
|
||||
if not config:
|
||||
config = TimezoneConfig(timezone=new_timezone)
|
||||
db.session.add(config)
|
||||
else:
|
||||
old_value = config.timezone
|
||||
config.timezone = new_timezone
|
||||
ActivityLog.log_config_change(
|
||||
config_key="scheduler_timezone",
|
||||
old_value=old_value,
|
||||
new_value=new_timezone,
|
||||
description="Updated scheduler timezone"
|
||||
)
|
||||
|
||||
db.session.commit()
|
||||
return True, "Timezone updated successfully!", config
|
||||
|
||||
except Exception as e:
|
||||
db.session.rollback()
|
||||
return False, f"Error updating timezone: {str(e)}", None
|
||||
|
||||
|
||||
def _update_schedule(schedule_data):
|
||||
"""
|
||||
Helper function to update schedule configuration.
|
||||
@ -211,11 +259,19 @@ def general():
|
||||
db.session.add(download_path_config)
|
||||
db.session.commit()
|
||||
|
||||
# Fetch timezone config
|
||||
timezone_config = TimezoneConfig.query.first()
|
||||
if not timezone_config:
|
||||
timezone_config = TimezoneConfig() # Use default from model
|
||||
db.session.add(timezone_config)
|
||||
db.session.commit()
|
||||
|
||||
return render_template(
|
||||
"config/index.html.jinja",
|
||||
active_tab="general",
|
||||
volume_config=volume_config,
|
||||
download_path_config=download_path_config, # Pass to template
|
||||
timezone_config=timezone_config, # Pass to template
|
||||
max_volume=MAX_VOLUME,
|
||||
app_title="Configuration"
|
||||
)
|
||||
@ -369,9 +425,10 @@ def generate_test_papers():
|
||||
|
||||
@bp.route("/update/general", methods=["POST"])
|
||||
def update_general():
|
||||
"""Update general configuration (Volume and Download Path)."""
|
||||
"""Update general configuration (Volume, Download Path, and Timezone)."""
|
||||
volume_success, volume_message = True, ""
|
||||
path_success, path_message = True, ""
|
||||
timezone_success, timezone_message = True, ""
|
||||
|
||||
# Update Volume
|
||||
new_volume = request.form.get("total_volume")
|
||||
@ -391,6 +448,15 @@ def update_general():
|
||||
else:
|
||||
flash(path_message, "error")
|
||||
|
||||
# Update Timezone
|
||||
new_timezone = request.form.get("timezone")
|
||||
if new_timezone is not None:
|
||||
timezone_success, timezone_message, _ = _update_timezone(new_timezone)
|
||||
if timezone_success:
|
||||
flash(timezone_message, "success")
|
||||
else:
|
||||
flash(timezone_message, "error")
|
||||
|
||||
return redirect(url_for("config.general"))
|
||||
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
import csv
|
||||
import io
|
||||
import datetime
|
||||
from flask import Blueprint, render_template, request, send_file
|
||||
from flask import Blueprint, render_template, request, send_file, jsonify
|
||||
from ..db import db
|
||||
from ..models import ActivityLog, ActivityCategory
|
||||
|
||||
@ -11,11 +11,11 @@ bp = Blueprint("logger", __name__, url_prefix="/logs")
|
||||
|
||||
@bp.route("/")
|
||||
def list_logs():
|
||||
page = request.args.get("page", 1, type=int)
|
||||
per_page = 50
|
||||
# For the new modern view, we only need to provide initial filter values and categories
|
||||
# The actual data loading will be handled by JavaScript via the API endpoint
|
||||
|
||||
# Filters
|
||||
category = request.args.get("category")
|
||||
# Get filter parameters for initial state
|
||||
categories_param = request.args.getlist("category") # Get multiple categories
|
||||
start_date = request.args.get("start_date")
|
||||
end_date = request.args.get("end_date")
|
||||
search_term = request.args.get("search_term")
|
||||
@ -23,33 +23,12 @@ def list_logs():
|
||||
if search_term == "None":
|
||||
search_term = None
|
||||
|
||||
|
||||
query = ActivityLog.query
|
||||
|
||||
if category:
|
||||
query = query.filter(ActivityLog.category == category)
|
||||
if start_date:
|
||||
start_date_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d")
|
||||
query = query.filter(ActivityLog.timestamp >= start_date_dt)
|
||||
if end_date:
|
||||
end_date_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=1)
|
||||
query = query.filter(ActivityLog.timestamp <= end_date_dt)
|
||||
if search_term:
|
||||
query = query.filter(db.or_(
|
||||
ActivityLog.action.contains(search_term),
|
||||
ActivityLog.description.contains(search_term)
|
||||
))
|
||||
|
||||
pagination = query.order_by(ActivityLog.timestamp.desc()).paginate(page=page, per_page=per_page, error_out=False)
|
||||
|
||||
categories = [e.value for e in ActivityCategory]
|
||||
|
||||
return render_template(
|
||||
"logger.html.jinja",
|
||||
logs=pagination.items,
|
||||
pagination=pagination,
|
||||
"logs.html.jinja",
|
||||
categories=categories,
|
||||
category=category,
|
||||
selected_categories=categories_param, # Pass selected categories
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
search_term=search_term,
|
||||
@ -60,15 +39,15 @@ def list_logs():
|
||||
@bp.route("/download")
|
||||
def download_logs():
|
||||
# Filters - reuse logic from list_logs
|
||||
category = request.args.get("category")
|
||||
categories = request.args.getlist("category") # Get multiple categories
|
||||
start_date = request.args.get("start_date")
|
||||
end_date = request.args.get("end_date")
|
||||
search_term = request.args.get("search_term")
|
||||
|
||||
query = ActivityLog.query
|
||||
|
||||
if category:
|
||||
query = query.filter(ActivityLog.category == category)
|
||||
if categories:
|
||||
query = query.filter(ActivityLog.category.in_(categories))
|
||||
if start_date:
|
||||
start_date_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d")
|
||||
query = query.filter(ActivityLog.timestamp >= start_date_dt)
|
||||
@ -99,8 +78,12 @@ def download_logs():
|
||||
|
||||
# Create response
|
||||
filename = f"logs_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
|
||||
csv_data.seek(0)
|
||||
output = io.BytesIO(csv_data.getvalue().encode('utf-8'))
|
||||
output.seek(0)
|
||||
|
||||
return send_file(
|
||||
io.StringIO(csv_data.getvalue()),
|
||||
output,
|
||||
mimetype="text/csv",
|
||||
as_attachment=True,
|
||||
download_name=filename
|
||||
@ -110,3 +93,131 @@ def download_logs():
|
||||
def log_detail(log_id):
|
||||
log = ActivityLog.query.get_or_404(log_id)
|
||||
return render_template("partials/log_detail_modal.html.jinja", log=log)
|
||||
|
||||
|
||||
@bp.route("/api")
|
||||
def get_logs_api():
|
||||
"""Unified API endpoint for getting activity logs with filtering and pagination support."""
|
||||
try:
|
||||
# Pagination parameters
|
||||
page = request.args.get('page', 1, type=int)
|
||||
per_page = request.args.get('per_page', 50, type=int)
|
||||
|
||||
# Legacy limit parameter for backward compatibility
|
||||
limit = request.args.get('limit', type=int)
|
||||
if limit and not request.args.get('page'):
|
||||
# Legacy mode: use limit without pagination
|
||||
query = ActivityLog.query
|
||||
|
||||
# Apply filters
|
||||
categories = request.args.getlist('category')
|
||||
if categories:
|
||||
query = query.filter(ActivityLog.category.in_(categories))
|
||||
|
||||
status = request.args.get('status')
|
||||
if status:
|
||||
query = query.filter(ActivityLog.status == status)
|
||||
|
||||
start_date = request.args.get('start_date')
|
||||
if start_date:
|
||||
start_date_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d")
|
||||
query = query.filter(ActivityLog.timestamp >= start_date_dt)
|
||||
|
||||
end_date = request.args.get('end_date')
|
||||
if end_date:
|
||||
end_date_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=1)
|
||||
query = query.filter(ActivityLog.timestamp <= end_date_dt)
|
||||
|
||||
search_term = request.args.get('search_term')
|
||||
if search_term and search_term != "None":
|
||||
query = query.filter(db.or_(
|
||||
ActivityLog.action.contains(search_term),
|
||||
ActivityLog.description.contains(search_term)
|
||||
))
|
||||
|
||||
logs = query.order_by(ActivityLog.timestamp.desc()).limit(limit).all()
|
||||
return jsonify({
|
||||
"success": True,
|
||||
"logs": [{
|
||||
"id": log.id,
|
||||
"timestamp": log.timestamp.isoformat(),
|
||||
"action": log.action,
|
||||
"status": log.status,
|
||||
"description": log.description,
|
||||
"category": log.category,
|
||||
"paper_id": log.paper_id,
|
||||
"extra_data": log.extra_data
|
||||
} for log in logs]
|
||||
})
|
||||
|
||||
# Ensure reasonable per_page limits
|
||||
per_page = min(per_page, 100) # Cap at 100 items per page
|
||||
|
||||
# Build query with filtering
|
||||
query = ActivityLog.query
|
||||
|
||||
# Filter by categories if specified
|
||||
categories = request.args.getlist('category')
|
||||
if categories:
|
||||
query = query.filter(ActivityLog.category.in_(categories))
|
||||
|
||||
# Filter by status if specified
|
||||
status = request.args.get('status')
|
||||
if status:
|
||||
query = query.filter(ActivityLog.status == status)
|
||||
|
||||
# Date filters
|
||||
start_date = request.args.get('start_date')
|
||||
if start_date:
|
||||
start_date_dt = datetime.datetime.strptime(start_date, "%Y-%m-%d")
|
||||
query = query.filter(ActivityLog.timestamp >= start_date_dt)
|
||||
|
||||
end_date = request.args.get('end_date')
|
||||
if end_date:
|
||||
end_date_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d") + datetime.timedelta(days=1)
|
||||
query = query.filter(ActivityLog.timestamp <= end_date_dt)
|
||||
|
||||
# Search term filter
|
||||
search_term = request.args.get('search_term')
|
||||
if search_term and search_term != "None":
|
||||
query = query.filter(db.or_(
|
||||
ActivityLog.action.contains(search_term),
|
||||
ActivityLog.description.contains(search_term)
|
||||
))
|
||||
|
||||
# Order by most recent first and paginate
|
||||
pagination = query.order_by(ActivityLog.timestamp.desc()).paginate(
|
||||
page=page,
|
||||
per_page=per_page,
|
||||
error_out=False
|
||||
)
|
||||
|
||||
return jsonify({
|
||||
"success": True,
|
||||
"logs": [{
|
||||
"id": log.id,
|
||||
"timestamp": log.timestamp.isoformat(),
|
||||
"action": log.action,
|
||||
"status": log.status,
|
||||
"description": log.description,
|
||||
"category": log.category,
|
||||
"paper_id": log.paper_id,
|
||||
"extra_data": log.extra_data
|
||||
} for log in pagination.items],
|
||||
"pagination": {
|
||||
"page": pagination.page,
|
||||
"pages": pagination.pages,
|
||||
"per_page": pagination.per_page,
|
||||
"total": pagination.total,
|
||||
"has_next": pagination.has_next,
|
||||
"has_prev": pagination.has_prev,
|
||||
"next_num": pagination.next_num if pagination.has_next else None,
|
||||
"prev_num": pagination.prev_num if pagination.has_prev else None
|
||||
}
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
"success": False,
|
||||
"message": f"Error getting logs: {str(e)}"
|
||||
}), 500
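A quick way to exercise the unified endpoint during development, assuming the app is running locally on the default port (host, port, and the example category value are assumptions):

```python
import requests

# Sketch: query /logs/api with pagination and filters (the logger blueprint's url_prefix is /logs).
resp = requests.get(
    "http://localhost:5000/logs/api",
    params={
        "page": 1,
        "per_page": 25,
        "category": ["scraper_activity"],  # repeatable; real values come from ActivityCategory
        "search_term": "csv",
    },
    timeout=10,
)
data = resp.json()
print(data["pagination"]["total"], "matching log entries")
```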
|
@ -1,7 +1,7 @@
|
||||
"""
|
||||
Simplified scraper blueprint using the new ScraperManager and hourly scheduling system.
|
||||
"""
|
||||
from flask import Blueprint, jsonify, render_template, request
|
||||
from flask import Blueprint, jsonify, render_template, request, current_app
|
||||
from ..models import ActivityLog, PaperMetadata, ScraperState, VolumeConfig
|
||||
from ..scrapers.manager import ScraperManager
|
||||
from ..scrapers.factory import get_available_scrapers
|
||||
@ -29,6 +29,10 @@ def index():
|
||||
# Get volume configuration
|
||||
volume_config = VolumeConfig.get_current_volume()
|
||||
|
||||
# Get scraper module configuration
|
||||
from ..models import ScraperModuleConfig
|
||||
current_scraper_module = ScraperModuleConfig.get_current_module()
|
||||
|
||||
# Get paper counts by status
|
||||
paper_counts = {
|
||||
'new': PaperMetadata.query.filter_by(status='New').count(),
|
||||
@ -46,7 +50,10 @@ def index():
|
||||
recent_logs=recent_logs,
|
||||
paper_counts=paper_counts,
|
||||
volume_config=volume_config,
|
||||
max_volume=MAX_VOLUME
|
||||
max_volume=MAX_VOLUME,
|
||||
current_scraper_module=current_scraper_module,
|
||||
available_scraper_modules=[s["name"] for s in available_scrapers],
|
||||
scraper_details={s["name"]: s for s in available_scrapers}
|
||||
)
|
||||
|
||||
@bp.route("/start", methods=["POST"])
|
||||
@ -55,11 +62,12 @@ def start_scraper():
|
||||
try:
|
||||
# Handle both JSON and form data
|
||||
if request.is_json:
|
||||
data = request.get_json() or {}
|
||||
data = request.get_json()
|
||||
# Allow empty JSON payload for start requests
|
||||
if data is None:
|
||||
data = {}
|
||||
else:
|
||||
data = request.form.to_dict()
|
||||
|
||||
scraper_name = data.get('scraper_name', 'dummy')
|
||||
return jsonify({"success": False, "message": "Invalid payload format. Expected JSON."}), 400
|
||||
|
||||
# Start the scraper using manager
|
||||
result = scraper_manager.start_scraper()
|
||||
@ -68,18 +76,16 @@ def start_scraper():
|
||||
ActivityLog.log_scraper_command(
|
||||
action="start_scraper",
|
||||
status="success",
|
||||
description="Started scraper with hourly scheduling"
|
||||
description="Scraper started successfully."
|
||||
)
|
||||
|
||||
return jsonify({
|
||||
"success": True,
|
||||
"message": result["message"]
|
||||
})
|
||||
return jsonify({"success": True, "message": result["message"]})
|
||||
else:
|
||||
return jsonify({
|
||||
"success": False,
|
||||
"message": result["message"]
|
||||
}), 400
|
||||
ActivityLog.log_scraper_command(
|
||||
action="start_scraper",
|
||||
status="failure",
|
||||
description=f"Failed to start scraper: {result['message']}"
|
||||
)
|
||||
return jsonify({"success": False, "message": result["message"]}), 400
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_scraper_command(
|
||||
@ -87,10 +93,7 @@ def start_scraper():
|
||||
status="error",
|
||||
description=f"Failed to start scraper: {str(e)}"
|
||||
)
|
||||
return jsonify({
|
||||
"success": False,
|
||||
"message": f"Error starting scraper: {str(e)}"
|
||||
}), 500
|
||||
return jsonify({"success": False, "message": f"An error occurred: {str(e)}"}), 500
|
||||
|
||||
@bp.route("/pause", methods=["POST"])
|
||||
def pause_scraper():
|
||||
@ -223,6 +226,13 @@ def get_status():
|
||||
# Get current hour quota info
|
||||
current_quota = scraper_manager.get_current_hour_quota()
|
||||
|
||||
# Get current scraper module configuration
|
||||
from ..models import ScraperModuleConfig
|
||||
current_scraper_module = ScraperModuleConfig.get_current_module()
|
||||
|
||||
# Get volume configuration
|
||||
current_volume = VolumeConfig.get_current_volume()
|
||||
|
||||
return jsonify({
|
||||
"success": True,
|
||||
"scraper_state": {
|
||||
@ -231,7 +241,9 @@ def get_status():
|
||||
"last_updated": scraper_state.last_updated.isoformat() if scraper_state.last_updated else None
|
||||
},
|
||||
"paper_counts": paper_counts,
|
||||
"current_quota": current_quota
|
||||
"current_quota": current_quota,
|
||||
"current_scraper_module": current_scraper_module,
|
||||
"volume_config": current_volume
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
@ -242,28 +254,16 @@ def get_status():
|
||||
|
||||
@bp.route("/logs")
|
||||
def get_logs():
|
||||
"""Get recent activity logs."""
|
||||
try:
|
||||
limit = request.args.get('limit', 50, type=int)
|
||||
logs = ActivityLog.query.order_by(ActivityLog.timestamp.desc()).limit(limit).all()
|
||||
"""Get recent activity logs with pagination support."""
|
||||
# Redirect to the unified logs API endpoint
|
||||
from flask import redirect, url_for
|
||||
|
||||
return jsonify({
|
||||
"success": True,
|
||||
"logs": [{
|
||||
"id": log.id,
|
||||
"timestamp": log.timestamp.isoformat(),
|
||||
"action": log.action,
|
||||
"status": log.status,
|
||||
"description": log.description,
|
||||
"category": log.category.name if log.category else None
|
||||
} for log in logs]
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
"success": False,
|
||||
"message": f"Error getting logs: {str(e)}"
|
||||
}), 500
|
||||
# Forward all query parameters to the unified endpoint
|
||||
query_string = request.query_string.decode('utf-8')
|
||||
if query_string:
|
||||
return redirect(f"{url_for('logger.get_logs_api')}?{query_string}")
|
||||
else:
|
||||
return redirect(url_for('logger.get_logs_api'))
|
||||
|
||||
@bp.route("/scrapers")
|
||||
def get_scrapers():
|
||||
@ -346,8 +346,6 @@ def process_papers_manually():
|
||||
def trigger_immediate_processing():
|
||||
"""Trigger immediate processing of papers without waiting for hourly schedule."""
|
||||
try:
|
||||
from ..scrapers.tasks import process_papers_batch
|
||||
|
||||
# Get papers that should be processed this hour
|
||||
manager = ScraperManager()
|
||||
papers = manager.select_papers_for_processing()
|
||||
@ -359,23 +357,38 @@ def trigger_immediate_processing():
|
||||
"papers_scheduled": 0
|
||||
})
|
||||
|
||||
# Get paper IDs for batch processing
|
||||
paper_ids = [paper.id for paper in papers]
|
||||
# Get APScheduler instance
|
||||
scheduler = current_app.config.get('SCHEDULER')
|
||||
if not scheduler:
|
||||
return jsonify({
|
||||
"success": False,
|
||||
"message": "APScheduler not available"
|
||||
}), 500
|
||||
|
||||
# Trigger immediate batch processing (no delay)
|
||||
task = process_papers_batch.delay(paper_ids)
|
||||
# Schedule papers for immediate processing via APScheduler
|
||||
scheduled_count = 0
|
||||
for paper in papers:
|
||||
try:
|
||||
import uuid
|
||||
job_id = f"immediate_paper_{paper.id}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{uuid.uuid4().hex[:8]}"
|
||||
scheduler.schedule_paper_processing(paper.id, delay_seconds=1, job_id=job_id)
|
||||
scheduled_count += 1
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Failed to schedule paper {paper.id}: {str(e)}",
|
||||
source="trigger_immediate_processing"
|
||||
)
|
||||
|
||||
ActivityLog.log_scraper_command(
|
||||
action="trigger_immediate_processing",
|
||||
status="success",
|
||||
description=f"Triggered immediate processing of {len(paper_ids)} papers"
|
||||
description=f"Triggered immediate processing of {scheduled_count} papers via APScheduler"
|
||||
)
|
||||
|
||||
return jsonify({
|
||||
"success": True,
|
||||
"message": f"Immediate processing started for {len(paper_ids)} papers",
|
||||
"papers_scheduled": len(paper_ids),
|
||||
"task_id": task.id
|
||||
"message": f"Immediate processing started for {scheduled_count} papers",
|
||||
"papers_scheduled": scheduled_count
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
@ -416,40 +429,96 @@ def get_stats():
|
||||
try:
|
||||
hours = int(request.args.get('hours', 24))
|
||||
current_time = datetime.utcnow()
|
||||
cutoff_time = current_time.replace(minute=0, second=0, microsecond=0)
|
||||
|
||||
# Get activity logs for scraper actions in the last N hours
|
||||
from ..models import ActivityCategory
|
||||
start_time = cutoff_time - timedelta(hours=hours)
|
||||
start_time = current_time - timedelta(hours=hours)
|
||||
logs = ActivityLog.query.filter(
|
||||
ActivityLog.category == ActivityCategory.SCRAPER_ACTIVITY.value,
|
||||
ActivityLog.timestamp >= start_time
|
||||
).all()
|
||||
|
||||
# Group by hour and status
|
||||
stats = {}
|
||||
# Get scraper command logs for state changes in the same time period
|
||||
state_logs = ActivityLog.query.filter(
|
||||
ActivityLog.category == ActivityCategory.SCRAPER_COMMAND.value,
|
||||
ActivityLog.action.in_(['start_scraper', 'pause_scraper', 'stop_scraper', 'reset_scraper']),
|
||||
ActivityLog.timestamp >= start_time
|
||||
).order_by(ActivityLog.timestamp.asc()).all()
|
||||
|
||||
# Group by chronological hour buckets (not hour of day)
|
||||
stats = []
|
||||
for hour_offset in range(hours):
|
||||
target_hour = (current_time.hour - hour_offset) % 24
|
||||
stats[target_hour] = {
|
||||
# Calculate the hour bucket (most recent hour first when hour_offset=0)
|
||||
bucket_end_time = current_time - timedelta(hours=hour_offset)
|
||||
bucket_start_time = bucket_end_time - timedelta(hours=1)
|
||||
|
||||
# Format hour label for display (e.g., "14:00-15:00" or "14:00" for simplicity)
|
||||
hour_label = bucket_start_time.strftime("%H:%M")
|
||||
|
||||
# Initialize counters for this hour bucket
|
||||
bucket_stats = {
|
||||
"success": 0,
|
||||
"error": 0,
|
||||
"pending": 0,
|
||||
"hour": target_hour,
|
||||
"hour": hour_label,
|
||||
"hour_offset": hour_offset, # For sorting
|
||||
"bucket_start": bucket_start_time,
|
||||
"bucket_end": bucket_end_time,
|
||||
"scraper_active": 0 # Default to inactive
|
||||
}
|
||||
|
||||
# Count logs that fall within this hour bucket
|
||||
for log in logs:
|
||||
hour = log.timestamp.hour
|
||||
if hour in stats:
|
||||
if bucket_start_time <= log.timestamp < bucket_end_time:
|
||||
if log.status == "success":
|
||||
stats[hour]["success"] += 1
|
||||
bucket_stats["success"] += 1
|
||||
elif log.status == "error":
|
||||
stats[hour]["error"] += 1
|
||||
bucket_stats["error"] += 1
|
||||
elif log.status in ("pending", "info"):
|
||||
stats[hour]["pending"] += 1
|
||||
bucket_stats["pending"] += 1
|
||||
|
||||
# Convert to list for easier consumption by JavaScript
|
||||
result = [stats[hour] for hour in sorted(stats.keys())]
|
||||
return jsonify(result)
|
||||
# Determine scraper status for this hour by checking if scraper was active
|
||||
# For simplicity, check if there were any successful scrapes in this hour
|
||||
# If there were scrapes, assume scraper was active
|
||||
bucket_stats["scraper_active"] = 1 if bucket_stats["success"] > 0 else 0
|
||||
|
||||
stats.append(bucket_stats)
|
||||
|
||||
# Reverse so oldest hour comes first (better for chronological chart display)
|
||||
stats.reverse()
|
||||
|
||||
# Prepare precise scraper state changes for timeline
|
||||
scraper_timeline = []
|
||||
for log in state_logs:
|
||||
# Calculate hours ago from current time
|
||||
time_diff = current_time - log.timestamp
|
||||
hours_ago = time_diff.total_seconds() / 3600
|
||||
|
||||
# Only include logs within our time range
|
||||
if hours_ago <= hours:
|
||||
scraper_timeline.append({
|
||||
"timestamp": log.timestamp.isoformat(),
|
||||
"hours_ago": hours_ago,
|
||||
"action": log.action,
|
||||
"status": log.status,
|
||||
"active": 1 if log.action == "start_scraper" and log.status == "success" else 0
|
||||
})
|
||||
|
||||
# Clean up the response (remove internal fields)
|
||||
result = []
|
||||
for stat in stats:
|
||||
result.append({
|
||||
"success": stat["success"],
|
||||
"error": stat["error"],
|
||||
"pending": stat["pending"],
|
||||
"hour": stat["hour"],
|
||||
"scraper_active": stat["scraper_active"]
|
||||
})
|
||||
|
||||
return jsonify({
|
||||
"hourly_stats": result,
|
||||
"scraper_timeline": scraper_timeline
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
@ -472,20 +541,39 @@ def process_single_paper_endpoint(paper_id):
|
||||
"message": "Paper not found"
|
||||
}), 404
|
||||
|
||||
# Process the paper using the manager
|
||||
result = scraper_manager.process_paper(paper)
|
||||
# Get APScheduler instance
|
||||
scheduler = current_app.config.get('SCHEDULER')
|
||||
if not scheduler:
|
||||
return jsonify({
|
||||
"success": False,
|
||||
"message": "APScheduler not available"
|
||||
}), 500
|
||||
|
||||
# Schedule the paper for immediate manual processing via APScheduler
|
||||
# Use UUID suffix to ensure unique job IDs
|
||||
import uuid
|
||||
job_id = f"manual_paper_{paper_id}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{uuid.uuid4().hex[:8]}"
|
||||
try:
|
||||
scheduler.schedule_manual_paper_processing(paper_id, scraper_name=scraper_name, delay_seconds=1, job_id=job_id)
|
||||
|
||||
ActivityLog.log_scraper_command(
|
||||
action="manual_process_single",
|
||||
status="success",
|
||||
description=f"Manually processed paper {paper.doi}"
|
||||
description=f"Scheduled manual processing for paper {paper.doi} via APScheduler" +
|
||||
(f" using scraper '{scraper_name}'" if scraper_name else " using system default scraper")
|
||||
)
|
||||
|
||||
return jsonify({
|
||||
"success": True,
|
||||
"message": f"Processing started for paper {paper.doi}",
|
||||
"message": f"Processing scheduled for paper {paper.doi}" +
|
||||
(f" using {scraper_name} scraper" if scraper_name else " using system default scraper"),
|
||||
"paper_id": paper_id
|
||||
})
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
"success": False,
|
||||
"message": f"Failed to schedule processing: {str(e)}"
|
||||
}), 500
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_scraper_command(
|
||||
@ -530,6 +618,35 @@ def update_scraper_config():
|
||||
"message": message
|
||||
}), 400
|
||||
|
||||
# Handle scraper module configuration updates
|
||||
if "scraper_module" in data:
|
||||
from ..models import ScraperModuleConfig
|
||||
|
||||
new_module = data["scraper_module"]
|
||||
|
||||
# Validate that the module exists and is valid
|
||||
available_modules = [m["name"] for m in get_available_scrapers()]
|
||||
|
||||
if new_module not in available_modules:
|
||||
return jsonify({
|
||||
"success": False,
|
||||
"message": f"Invalid scraper module: {new_module}"
|
||||
}), 400
|
||||
|
||||
# Update the database configuration
|
||||
ScraperModuleConfig.set_module(new_module)
|
||||
|
||||
ActivityLog.log_scraper_command(
|
||||
action="update_scraper_module",
|
||||
status="success",
|
||||
description=f"Updated scraper module to '{new_module}'"
|
||||
)
|
||||
|
||||
return jsonify({
|
||||
"success": True,
|
||||
"message": f"Scraper module updated to '{new_module}' successfully"
|
||||
})
|
||||
|
||||
# Handle other configuration updates here if needed in the future
|
||||
|
||||
return jsonify({
|
||||
@ -547,3 +664,72 @@ def update_scraper_config():
|
||||
"success": False,
|
||||
"message": f"Error updating scraper config: {str(e)}"
|
||||
}), 500
|
||||
|
||||
@bp.route("/publishers")
|
||||
def get_publishers():
|
||||
"""Get publisher overview data for the scraper overview modal."""
|
||||
try:
|
||||
import os
|
||||
import glob
|
||||
|
||||
# Get available parser modules
|
||||
parsers_dir = os.path.join(current_app.root_path, 'parsers')
|
||||
parser_files = glob.glob(os.path.join(parsers_dir, '*_parser.py'))
|
||||
available_parsers = []
|
||||
|
||||
for parser_file in parser_files:
|
||||
filename = os.path.basename(parser_file)
|
||||
if filename != 'base_parser.py': # Skip the base parser
|
||||
parser_name = filename.replace('_parser.py', '')
|
||||
available_parsers.append(parser_name)
|
||||
|
||||
# Get publishers from database (papers that have publisher detected)
|
||||
publisher_query = db.session.query(
|
||||
PaperMetadata.publisher,
|
||||
db.func.count(PaperMetadata.id).label('paper_count')
|
||||
).filter(
|
||||
PaperMetadata.publisher.isnot(None),
|
||||
PaperMetadata.publisher != ''
|
||||
).group_by(PaperMetadata.publisher).all()
|
||||
|
||||
publishers_data = []
|
||||
for publisher, count in publisher_query:
|
||||
# Check if a parser exists for this publisher
|
||||
has_parser = publisher in available_parsers
|
||||
|
||||
publishers_data.append({
|
||||
'name': publisher,
|
||||
'paper_count': count,
|
||||
'has_parser': has_parser,
|
||||
'parser_status': 'available' if has_parser else 'missing'
|
||||
})
|
||||
|
||||
# Sort by paper count descending
|
||||
publishers_data.sort(key=lambda x: x['paper_count'], reverse=True)
|
||||
|
||||
# Get totals
|
||||
total_papers_with_publisher = sum(p['paper_count'] for p in publishers_data)
|
||||
total_papers_without_publisher = PaperMetadata.query.filter(
|
||||
db.or_(PaperMetadata.publisher.is_(None), PaperMetadata.publisher == '')
|
||||
).count()
|
||||
|
||||
return jsonify({
|
||||
'success': True,
|
||||
'data': {
|
||||
'publishers': publishers_data,
|
||||
'available_parsers': available_parsers,
|
||||
'stats': {
|
||||
'total_publishers': len(publishers_data),
|
||||
'publishers_with_parsers': len([p for p in publishers_data if p['has_parser']]),
|
||||
'publishers_without_parsers': len([p for p in publishers_data if not p['has_parser']]),
|
||||
'total_papers_with_publisher': total_papers_with_publisher,
|
||||
'total_papers_without_publisher': total_papers_without_publisher
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
'success': False,
|
||||
'message': f'Error getting publisher data: {str(e)}'
|
||||
}), 500
|
@ -2,8 +2,11 @@
|
||||
import codecs
|
||||
import csv
|
||||
import datetime
|
||||
from io import StringIO
|
||||
import traceback
|
||||
from io import StringIO, BytesIO
|
||||
import json
|
||||
import uuid
|
||||
from typing import Dict, Any
|
||||
|
||||
import pandas as pd
|
||||
from flask import (
|
||||
@ -21,7 +24,6 @@ from flask import (
|
||||
|
||||
from ..db import db
|
||||
from ..models import PaperMetadata, ActivityLog
|
||||
from ..celery import celery # Import the celery instance directly
|
||||
from ..defaults import DUPLICATE_STRATEGIES
|
||||
|
||||
bp = Blueprint("upload", __name__)
|
||||
@ -29,6 +31,10 @@ bp = Blueprint("upload", __name__)
|
||||
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
|
||||
CHUNK_SIZE = 100 # Number of rows to process per batch
|
||||
|
||||
# Store task progress in memory (for simplicity)
|
||||
# In production, you might want to use Redis or database
|
||||
task_progress = {}
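The comment above leaves the production-grade store open. One possible shape for a Redis-backed replacement, purely as a sketch (key naming and TTL are assumptions, not part of this change):

```python
import json
import redis

class RedisTaskProgress:
    """Sketch of a Redis-backed alternative to the in-memory task_progress dict."""

    def __init__(self, url: str = "redis://localhost:6379/0", ttl_seconds: int = 86400):
        self._client = redis.Redis.from_url(url)
        self._ttl = ttl_seconds

    def __setitem__(self, task_id: str, progress: dict) -> None:
        # Store the progress payload as JSON with an expiry so stale entries clean themselves up.
        self._client.set(f"task_progress:{task_id}", json.dumps(progress), ex=self._ttl)

    def get(self, task_id: str, default=None):
        raw = self._client.get(f"task_progress:{task_id}")
        return json.loads(raw) if raw is not None else default
```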
|
||||
|
||||
def parse_date(date_str):
|
||||
"""Parse date string into datetime object."""
|
||||
if not date_str or pd.isna(date_str):
|
||||
@ -38,6 +44,76 @@ def parse_date(date_str):
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
def _process_csv_background(task_id: str, file_content: str, delimiter: str, duplicate_strategy: str):
|
||||
"""Background function to process CSV file using APScheduler."""
|
||||
print(f"DEBUG: _process_csv_background called with task_id: {task_id}")
|
||||
|
||||
# Get Flask app for context
|
||||
from flask import current_app
|
||||
|
||||
# Get the Flask app from the scheduler context
|
||||
from ..scheduler import _get_flask_app
|
||||
app = _get_flask_app()
|
||||
|
||||
print(f"DEBUG: Flask app obtained: {app}")
|
||||
|
||||
if not app:
|
||||
# Fallback: try to get current_app
|
||||
try:
|
||||
app = current_app
|
||||
print(f"DEBUG: Using current_app: {app}")
|
||||
except RuntimeError as e:
|
||||
print(f"DEBUG: Failed to get current_app: {e}")
|
||||
task_progress[task_id] = {
|
||||
"state": "FAILURE",
|
||||
"progress": 0,
|
||||
"error": "Flask app context not available"
|
||||
}
|
||||
return
|
||||
|
||||
with app.app_context():
|
||||
try:
|
||||
print(f"DEBUG: Inside app context, starting CSV processing for task {task_id}")
|
||||
|
||||
# Initialize progress
|
||||
task_progress[task_id] = {
|
||||
"state": "PROGRESS",
|
||||
"progress": 0,
|
||||
"message": "Starting CSV processing..."
|
||||
}
|
||||
|
||||
result = process_csv(file_content, delimiter, duplicate_strategy, task_id)
|
||||
|
||||
print(f"DEBUG: CSV processing completed for task {task_id}, result: {result}")
|
||||
|
||||
# Mark as completed
|
||||
task_progress[task_id] = {
|
||||
"state": "SUCCESS",
|
||||
"progress": 100,
|
||||
"result": result
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"DEBUG: Exception in _process_csv_background: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
# Mark as failed
|
||||
task_progress[task_id] = {
|
||||
"state": "FAILURE",
|
||||
"progress": 0,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
try:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Background CSV processing failed: {str(e)}",
|
||||
source="upload._process_csv_background"
|
||||
)
|
||||
except Exception:
|
||||
# If logging fails, just print the error
|
||||
print(f"Background CSV processing failed: {str(e)}")
|
||||
|
||||
@bp.route("/", methods=["GET", "POST"])
|
||||
def upload():
|
||||
if request.method == "POST":
|
||||
@ -51,23 +127,75 @@ def upload():
|
||||
stream = codecs.iterdecode(file.stream, "utf-8")
|
||||
content = "".join(stream)
|
||||
|
||||
# Trigger the Celery task
|
||||
task = process_csv.delay(content, delimiter, duplicate_strategy)
|
||||
# Generate task ID
|
||||
task_id = str(uuid.uuid4())
|
||||
|
||||
return jsonify({"task_id": task.id})
|
||||
# Get the APScheduler instance from the global variable
|
||||
from ..scheduler import _scheduler
|
||||
if not _scheduler:
|
||||
return jsonify({"error": "APScheduler not initialized."})
|
||||
|
||||
if not _scheduler.running:
|
||||
return jsonify({"error": "APScheduler not running."})
|
||||
|
||||
# Initialize task progress immediately
|
||||
task_progress[task_id] = {
|
||||
"state": "PENDING",
|
||||
"progress": 0,
|
||||
"message": "Task queued for processing..."
|
||||
}
|
||||
|
||||
# Schedule background task
|
||||
job_id = f"csv_upload_{task_id}"
|
||||
# Use UTC time to match APScheduler's timezone configuration
|
||||
run_time = datetime.datetime.utcnow() + datetime.timedelta(seconds=1) # Start in 1 second
|
||||
|
||||
try:
|
||||
_scheduler.add_job(
|
||||
func=_process_csv_background,
|
||||
trigger='date',
|
||||
run_date=run_time,
|
||||
args=[task_id, content, delimiter, duplicate_strategy],
|
||||
id=job_id,
|
||||
name=f"CSV Upload {task_id}",
|
||||
replace_existing=True
|
||||
)
|
||||
|
||||
ActivityLog.log_import_activity(
|
||||
action="schedule_csv_upload",
|
||||
status="info",
|
||||
description=f"Scheduled CSV upload task {task_id}",
|
||||
task_id=task_id
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
task_progress[task_id] = {
|
||||
"state": "FAILURE",
|
||||
"progress": 0,
|
||||
"error": f"Failed to schedule task: {str(e)}"
|
||||
}
|
||||
return jsonify({"error": f"Failed to schedule background task: {str(e)}"})
|
||||
|
||||
return jsonify({"task_id": task_id})
|
||||
|
||||
return render_template("upload.html.jinja", duplicate_strategies=DUPLICATE_STRATEGIES)
|
||||
|
||||
@celery.task(bind=True)
|
||||
def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
|
||||
def process_csv(file_content, delimiter, duplicate_strategy="skip", task_id=None):
|
||||
"""Process CSV file and import paper metadata."""
|
||||
|
||||
# With the ContextTask in place, we're already inside an app context
|
||||
added_count = skipped_count = updated_count = error_count = 0
|
||||
errors = []
|
||||
skipped_records = [] # Add this to track skipped records
|
||||
|
||||
try:
|
||||
# Update task progress if provided
|
||||
if task_id:
|
||||
task_progress[task_id] = {
|
||||
"state": "PROGRESS",
|
||||
"progress": 10,
|
||||
"message": "Starting CSV import..."
|
||||
}
|
||||
|
||||
# Log the start of import using ActivityLog model
|
||||
ActivityLog.log_import_activity(
|
||||
action="start_csv_import",
|
||||
@ -77,9 +205,6 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
|
||||
delimiter=delimiter
|
||||
)
|
||||
|
||||
# Set initial progress percentage
|
||||
self.update_state(state='PROGRESS', meta={'progress': 10})
|
||||
|
||||
# Read CSV into chunks
|
||||
csv_buffer = StringIO(file_content)
|
||||
# Count total chunks
|
||||
@ -116,16 +241,16 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
|
||||
skipped_count += 1
|
||||
continue
|
||||
else:
|
||||
metadata = PaperMetadata(
|
||||
title=row["title"],
|
||||
doi=doi,
|
||||
alt_id=row.get("alternative_id"),
|
||||
issn=row["issn"],
|
||||
paper = PaperMetadata(
|
||||
title=row.get("title"),
|
||||
doi=row.get("doi"),
|
||||
alt_id=row.get("alt_id") or row.get("alternative_id"), # Handle both column names
|
||||
issn=row.get("issn"),
|
||||
journal=row.get("journal"),
|
||||
published_online=parse_date(row.get("published_online")),
|
||||
status="New",
|
||||
status="New"
|
||||
)
|
||||
db.session.add(metadata)
|
||||
db.session.add(paper)
|
||||
added_count += 1
|
||||
except Exception as e:
|
||||
error_count += 1
|
||||
@ -134,6 +259,15 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
|
||||
# Commit the chunk and roll session fresh
|
||||
db.session.commit()
|
||||
|
||||
# Update progress
|
||||
if task_id:
|
||||
progress = min(90, 10 + int((chunk_idx + 1) * 80 / total_chunks))
|
||||
task_progress[task_id] = {
|
||||
"state": "PROGRESS",
|
||||
"progress": progress,
|
||||
"message": f"Processed {chunk_idx+1}/{total_chunks} chunks"
|
}

# Log periodic progress every 5 chunks
if (chunk_idx + 1) % 5 == 0:
ActivityLog.log_import_activity(
@ -148,11 +282,14 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
}
)

progress = min(90, 10 + int((chunk_idx + 1) * 80 / total_chunks))
self.update_state(state='PROGRESS', meta={'progress': progress})

# Final progress update and completion log
self.update_state(state='PROGRESS', meta={'progress': 100})
if task_id:
task_progress[task_id] = {
"state": "PROGRESS",
"progress": 100,
"message": "Finalizing import..."
}

ActivityLog.log_import_activity(
action="complete_csv_import",
status="success",
@ -167,6 +304,12 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):

except Exception as e:
db.session.rollback()
if task_id:
task_progress[task_id] = {
"state": "FAILURE",
"progress": 0,
"error": str(e)
}
ActivityLog.log_error(
error_message="CSV import failed",
exception=e,
@ -189,7 +332,7 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
status="error",
description=f"Import completed with {error_count} errors",
error_csv=error_csv.getvalue(),
task_id=self.request.id,
task_id=task_id,
error_count=error_count
)
except Exception:
@ -204,41 +347,23 @@ def process_csv(self, file_content, delimiter, duplicate_strategy="skip"):
"skipped_records": skipped_records[:5],  # Include up to 5 examples
"skipped_reason_summary": "Records were skipped because they already exist in the database. Use 'update' strategy to update them.",
"errors": errors[:5],
"error_count": error_count,
"task_id": self.request.id
"error_count": error_count
}

@bp.route("/task_status/<task_id>")
def task_status(task_id):
    """Get status of background task."""
    task = celery.AsyncResult(task_id)
    progress_data = task_progress.get(task_id)
    if not progress_data:
        return jsonify({"error": "Task not found."})

    if task.state == "PENDING":
        response = {"state": task.state, "progress": 0}
    elif task.state == "PROGRESS":
        response = {
            "state": task.state,
            "progress": task.info.get("progress", 0)
        }
    elif task.state == "SUCCESS":
        response = {
            "state": task.state,
            "result": task.result
        }
    else:  # FAILURE, REVOKED, etc.
        response = {
            "state": task.state,
            "error": str(task.info) if task.info else "Unknown error"
        }

    return jsonify(response)
    return jsonify(progress_data)

@bp.route("/download_error_log/<task_id>")
def download_error_log(task_id):
    # Find the most recent error log for this task
    error_log = ActivityLog.query.filter(
        ActivityLog.action == "import_errors",
        ActivityLog.extra_data.like(f'%"{task_id}"%')  # Search in JSON
        ActivityLog.action == "import_errors"
    ).order_by(ActivityLog.timestamp.desc()).first()

    if not error_log:
@ -255,7 +380,7 @@ def download_error_log(task_id):

    buffer = StringIO(error_csv)
    return send_file(
        buffer,
        BytesIO(buffer.getvalue().encode()),  # Corrected to use BytesIO
        mimetype="text/csv",
        as_attachment=True,
        download_name=f"upload_errors_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
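With the Celery result backend gone, /task_status now simply returns the in-memory task_progress entry for the given id. A minimal polling client could look like the sketch below; the base URL prefix and the terminal state names are assumptions, not part of this diff.

# Hypothetical polling loop against the task_status endpoint shown above.
import time
import requests

def wait_for_import(base_url: str, task_id: str, interval: float = 2.0) -> dict:
    """Poll until the import reports a terminal state or an error."""
    while True:
        data = requests.get(f"{base_url}/task_status/{task_id}", timeout=10).json()
        if data.get("state") in ("SUCCESS", "FAILURE") or "error" in data:
            return data
        print(f"Import progress: {data.get('progress', 0)}%")
        time.sleep(interval)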
@ -1,52 +0,0 @@
from celery import Celery
from celery.schedules import crontab

# Create Celery instance without Flask app initially
celery = Celery(
    'scipaperloader',
    broker='redis://localhost:6379/0',
    backend='redis://localhost:6379/0',
)

def configure_celery(app=None):
    """Configure Celery with the Flask app settings and ensure tasks run in the app context."""
    if app is None:
        # Import here to avoid circular import
        from scipaperloader import create_app
        app = create_app()

    # Update Celery configuration using the app settings
    celery.conf.update(
        broker_url=app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0'),
        result_backend=app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0'),
        task_serializer='json',
        accept_content=['json'],
        result_serializer='json',
        timezone='UTC',
        enable_utc=True,
        task_time_limit=3600,  # 1 hour max runtime
        task_soft_time_limit=3000,  # 50 minutes soft limit
        worker_max_tasks_per_child=10,  # Restart workers after 10 tasks
        worker_max_memory_per_child=1000000,  # 1GB memory limit
        task_acks_late=True,  # Acknowledge tasks after completion
        task_reject_on_worker_lost=True,  # Requeue tasks if worker dies
        # Configure Beat schedule for periodic tasks
        beat_schedule={
            'hourly-scraper-scheduler': {
                'task': 'scipaperloader.scrapers.tasks.hourly_scraper_scheduler',
                'schedule': crontab(minute=0),  # Run at the start of every hour
                'options': {'expires': 3600}
            },
        }
    )

    # Create a custom task class that pushes the Flask application context
    class ContextTask(celery.Task):
        abstract = True

        def __call__(self, *args, **kwargs):
            with app.app_context():
                return self.run(*args, **kwargs)

    celery.Task = ContextTask
    return celery
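The beat_schedule entry removed here is replaced by a plain APScheduler cron job (see scipaperloader/scheduler.py below). A rough sketch of the equivalent registration, with the job function passed in rather than imported:

# Rough APScheduler equivalent of the removed crontab(minute=0) beat entry.
from apscheduler.schedulers.background import BackgroundScheduler

def register_hourly_job(scheduler: BackgroundScheduler, hourly_job) -> None:
    scheduler.add_job(
        func=hourly_job,          # e.g. the hourly scraper scheduler function
        trigger='cron',
        minute=0,                 # run at the start of every hour
        id='hourly_scraper_main',
        replace_existing=True,
    )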
@ -3,7 +3,7 @@ import os

class Config:
    SECRET_KEY = os.environ.get("SECRET_KEY", "dev")
    SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///papers.db")
    SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///instance/papers.db")
    SQLALCHEMY_TRACK_MODIFICATIONS = False
    APP_TITLE = os.environ.get("APP_TITLE", "SciPaperLoader")
    SCRAPER_MODULE = os.environ.get("SCRAPER_MODULE", "dummy")
@ -191,6 +191,7 @@ class PaperMetadata(db.Model):
    type = db.Column(db.String(50))
    language = db.Column(db.String(50))
    published_online = db.Column(db.Date)  # or DateTime/String
    publisher = db.Column(db.String(100), nullable=True)  # Detected publisher name
    status = db.Column(db.String(10))  # 'Pending','Done','Failed'
    previous_status = db.Column(db.String(10), nullable=True)  # Store previous status for reversion
    file_path = db.Column(db.Text)
@ -342,6 +343,41 @@ class ScraperModuleConfig(db.Model):
        db.session.commit()
        return config


class TimezoneConfig(db.Model):
    """Model to store the configured timezone for the scheduler."""
    id = db.Column(db.Integer, primary_key=True)
    timezone = db.Column(db.String(50), default="Europe/Berlin")

    @classmethod
    def get_current_timezone(cls):
        """Get the currently configured timezone."""
        config = cls.query.first()
        if not config:
            config = cls(timezone="Europe/Berlin")
            db.session.add(config)
            db.session.commit()
        return config.timezone

    @classmethod
    def set_timezone(cls, timezone_name):
        """Set the timezone configuration."""
        config = cls.query.first()
        if not config:
            config = cls(timezone=timezone_name)
            db.session.add(config)
        else:
            old_value = config.timezone
            config.timezone = timezone_name
            ActivityLog.log_config_change(
                config_key="scheduler_timezone",
                old_value=old_value,
                new_value=timezone_name,
                description="Updated scheduler timezone configuration"
            )
        db.session.commit()
        return config


def init_schedule_config():
    """Initialize ScheduleConfig with default values if empty"""
    if ScheduleConfig.query.count() == 0:
@ -379,3 +415,9 @@ def init_schedule_config():
        default_path = DownloadPathConfig(path="/path/to/dummy/papers")
        db.session.add(default_path)
        db.session.commit()

    # Initialize TimezoneConfig if it doesn't exist
    if TimezoneConfig.query.count() == 0:
        default_timezone = TimezoneConfig(timezone="Europe/Berlin")
        db.session.add(default_timezone)
        db.session.commit()
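A short sketch of how the new TimezoneConfig helpers can be used from a management script; the application context setup mirrors the pattern already used in the Makefile, and the chosen timezone value is only an example.

# Hypothetical usage of TimezoneConfig inside an app context.
from scipaperloader import create_app
from scipaperloader.models import TimezoneConfig

app = create_app()
with app.app_context():
    TimezoneConfig.set_timezone("UTC")            # updates the stored row (and logs the change when one already exists)
    print(TimezoneConfig.get_current_timezone())  # -> "UTC"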
6
scipaperloader/parsers/__init__.py
Normal file
@ -0,0 +1,6 @@
# Parser modules for extracting full text from publisher-specific HTML content
from .base_parser import BaseParser, ParsedContent, ParseError
from .elsevier_parser import ElsevierParser
from .arxiv_parser import ArxivParser

__all__ = ['BaseParser', 'ParsedContent', 'ParseError', 'ElsevierParser', 'ArxivParser']
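The package exposes both concrete parsers plus the shared base types, so a caller is presumably expected to probe them with can_parse(). A possible selection helper (the function name and the probe order are assumptions, not part of this diff):

from typing import Optional
from scipaperloader.parsers import ArxivParser, BaseParser, ElsevierParser

def select_parser(html: str, url: Optional[str] = None) -> Optional[BaseParser]:
    """Return the first parser that claims it can handle the given HTML."""
    for parser_cls in (ElsevierParser, ArxivParser):
        parser = parser_cls()
        if parser.can_parse(html, url=url):
            return parser
    return None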
227
scipaperloader/parsers/arxiv_parser.py
Normal file
@ -0,0 +1,227 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Dict, Optional, List
|
||||
from .base_parser import BaseParser, ParsedContent, ParseError
|
||||
|
||||
class ArxivParser(BaseParser):
|
||||
"""Parser for arXiv papers."""
|
||||
|
||||
def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
|
||||
"""Check if this is an arXiv page."""
|
||||
html_lower = html_content.lower()
|
||||
|
||||
# Check for arXiv indicators
|
||||
indicators = [
|
||||
'arxiv.org',
|
||||
'export.arxiv.org',
|
||||
'arxiv:',
|
||||
'meta name="citation_publisher" content="arxiv"',
|
||||
]
|
||||
|
||||
return any(indicator in html_lower for indicator in indicators)
|
||||
|
||||
def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
|
||||
"""Parse arXiv HTML content."""
|
||||
try:
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# Extract title
|
||||
title = self._extract_title(soup)
|
||||
|
||||
# Extract abstract
|
||||
abstract = self._extract_abstract(soup)
|
||||
|
||||
# Extract authors
|
||||
authors = self._extract_authors(soup)
|
||||
|
||||
# Extract full text (arXiv usually just has abstract on the HTML page)
|
||||
full_text = self._extract_full_text(soup, abstract)
|
||||
|
||||
# Extract keywords/subjects
|
||||
keywords = self._extract_subjects(soup)
|
||||
|
||||
# Extract arxiv ID
|
||||
arxiv_id = self._extract_arxiv_id(soup)
|
||||
|
||||
if not full_text or len(full_text.strip()) < 50:
|
||||
raise ParseError("Could not extract meaningful content from arXiv page")
|
||||
|
||||
return ParsedContent(
|
||||
full_text=full_text,
|
||||
title=title,
|
||||
abstract=abstract,
|
||||
authors=authors,
|
||||
keywords=keywords,
|
||||
sections=None, # arXiv HTML pages don't usually have full sections
|
||||
references=None, # References are typically in the PDF
|
||||
doi=doi,
|
||||
journal="arXiv",
|
||||
publication_date=self._extract_submission_date(soup),
|
||||
metadata={
|
||||
'parser': 'arxiv',
|
||||
'arxiv_id': arxiv_id,
|
||||
'source': 'arxiv.org'
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise ParseError(f"Failed to parse arXiv content: {str(e)}")
|
||||
|
||||
def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract paper title."""
|
||||
# Try multiple title selectors for arXiv
|
||||
selectors = [
|
||||
'h1.title',
|
||||
'meta[name="citation_title"]',
|
||||
'title'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
if 'meta' in selector:
|
||||
element = soup.find('meta', attrs={'name': 'citation_title'})
|
||||
if element:
|
||||
return element.get('content', '').strip()
|
||||
else:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
text = element.get_text(strip=True)
|
||||
# Remove "Title:" prefix if present
|
||||
text = re.sub(r'^Title:\s*', '', text)
|
||||
return text
|
||||
|
||||
return None
|
||||
|
||||
def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract paper abstract."""
|
||||
# arXiv abstract selectors
|
||||
selectors = [
|
||||
'blockquote.abstract',
|
||||
'div.abstract',
|
||||
'meta[name="citation_abstract"]'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
if 'meta' in selector:
|
||||
element = soup.find('meta', attrs={'name': 'citation_abstract'})
|
||||
if element:
|
||||
return element.get('content', '').strip()
|
||||
else:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
text = element.get_text(strip=True)
|
||||
# Remove "Abstract:" prefix if present
|
||||
text = re.sub(r'^Abstract:\s*', '', text)
|
||||
return text
|
||||
|
||||
return None
|
||||
|
||||
def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract author names."""
|
||||
authors = []
|
||||
|
||||
# Try author meta tags
|
||||
author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
|
||||
if author_metas:
|
||||
authors = [meta.get('content', '').strip() for meta in author_metas]
|
||||
|
||||
# Try arXiv author div
|
||||
if not authors:
|
||||
authors_div = soup.select_one('div.authors')
|
||||
if authors_div:
|
||||
# Extract author links or text
|
||||
author_links = authors_div.find_all('a')
|
||||
if author_links:
|
||||
authors = [link.get_text(strip=True) for link in author_links]
|
||||
else:
|
||||
# Fallback to text parsing
|
||||
text = authors_div.get_text()
|
||||
# Remove "Authors:" prefix and split by commas
|
||||
text = re.sub(r'^Authors?:\s*', '', text)
|
||||
authors = [author.strip() for author in text.split(',')]
|
||||
|
||||
return authors if authors else None
|
||||
|
||||
def _extract_full_text(self, soup: BeautifulSoup, abstract: Optional[str] = None) -> str:
|
||||
"""Extract main content (usually just abstract for arXiv HTML pages)."""
|
||||
content_parts = []
|
||||
|
||||
# For arXiv, the HTML page typically only contains abstract and metadata
|
||||
# The full text is in the PDF
|
||||
|
||||
if abstract:
|
||||
content_parts.append(f"Abstract\n{abstract}")
|
||||
|
||||
# Look for any additional content sections
|
||||
comments_section = soup.select_one('td.comments')
|
||||
if comments_section:
|
||||
comments = comments_section.get_text(strip=True)
|
||||
if comments:
|
||||
content_parts.append(f"Comments\n{comments}")
|
||||
|
||||
# Add note about PDF availability
|
||||
content_parts.append(
|
||||
"\nNote: This is the abstract and metadata from the arXiv HTML page. "
|
||||
"The full text is available in the PDF version."
|
||||
)
|
||||
|
||||
return '\n\n'.join(content_parts)
|
||||
|
||||
def _extract_subjects(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract subject classifications."""
|
||||
subjects = []
|
||||
|
||||
# Look for subject classification
|
||||
subjects_td = soup.select_one('td.subjects')
|
||||
if subjects_td:
|
||||
subjects_text = subjects_td.get_text(strip=True)
|
||||
# Parse subjects (format: "Primary: subject1; Secondary: subject2")
|
||||
subjects = [subj.strip() for subj in re.split(r'[;,]', subjects_text)]
|
||||
# Clean up prefixes
|
||||
subjects = [re.sub(r'^(Primary|Secondary):\s*', '', subj) for subj in subjects]
|
||||
subjects = [subj for subj in subjects if subj] # Remove empty strings
|
||||
|
||||
return subjects if subjects else None
|
||||
|
||||
def _extract_arxiv_id(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract arXiv ID."""
|
||||
# Look for arXiv ID in various places
|
||||
arxiv_id_patterns = [
|
||||
r'arXiv:(\d+\.\d+(?:v\d+)?)',
|
||||
r'(\d{4}\.\d{4,5}(?:v\d+)?)',
|
||||
]
|
||||
|
||||
# Search in page text
|
||||
page_text = soup.get_text()
|
||||
for pattern in arxiv_id_patterns:
|
||||
match = re.search(pattern, page_text)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
# Search in URL or meta tags
|
||||
canonical_link = soup.find('link', attrs={'rel': 'canonical'})
|
||||
if canonical_link:
|
||||
href = canonical_link.get('href', '')
|
||||
for pattern in arxiv_id_patterns:
|
||||
match = re.search(pattern, href)
|
||||
if match:
|
||||
return match.group(1)
|
||||
|
||||
return None
|
||||
|
||||
def _extract_submission_date(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract submission date."""
|
||||
# Look for submission date
|
||||
submission_td = soup.select_one('td.submission-history')
|
||||
if submission_td:
|
||||
date_text = submission_td.get_text()
|
||||
# Extract date (format varies)
|
||||
date_match = re.search(r'(\d{1,2}\s+\w+\s+\d{4})', date_text)
|
||||
if date_match:
|
||||
return date_match.group(1)
|
||||
|
||||
# Try meta tag
|
||||
date_meta = soup.find('meta', attrs={'name': 'citation_date'})
|
||||
if date_meta:
|
||||
return date_meta.get('content', '').strip()
|
||||
|
||||
return None
|
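A hedged example of exercising the new ArxivParser on a saved HTML page; the file path and DOI below are placeholders, not values from this diff.

# Hypothetical round trip with the arXiv parser.
from scipaperloader.parsers import ArxivParser, ParseError

with open("/path/to/dummy/papers/10.48550_arXiv.2101.00001.html", encoding="utf-8") as fh:
    html = fh.read()

parser = ArxivParser()
if parser.can_parse(html):
    try:
        content = parser.parse(html, doi="10.48550/arXiv.2101.00001")
        print(content.title, (content.metadata or {}).get("arxiv_id"))
    except ParseError as exc:
        print(f"arXiv parse failed: {exc}")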
83
scipaperloader/parsers/base_parser.py
Normal file
@ -0,0 +1,83 @@
from abc import ABC, abstractmethod
from typing import Dict, Optional, List
from dataclasses import dataclass

@dataclass
class ParsedContent:
    """Container for parsed content from a publisher's HTML."""
    full_text: str
    title: Optional[str] = None
    abstract: Optional[str] = None
    authors: Optional[List[str]] = None
    keywords: Optional[List[str]] = None
    sections: Optional[Dict[str, str]] = None  # section_title -> section_content
    references: Optional[List[str]] = None
    doi: Optional[str] = None
    journal: Optional[str] = None
    publication_date: Optional[str] = None
    metadata: Optional[Dict] = None  # Additional metadata specific to publisher

class BaseParser(ABC):
    """Base class for all publisher-specific parsers."""

    def __init__(self):
        self.parser_name = self.__class__.__name__.lower().replace('parser', '')

    @abstractmethod
    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        """
        Check if this parser can handle the given HTML content.

        Args:
            html_content: The HTML content to check
            url: Optional URL of the content (for additional context)

        Returns:
            True if this parser can handle the content, False otherwise
        """
        pass

    @abstractmethod
    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        """
        Parse HTML content and extract structured information.

        Args:
            html_content: The HTML content to parse
            doi: Optional DOI of the paper

        Returns:
            ParsedContent object with extracted information

        Raises:
            ParseError: If parsing fails
        """
        pass

    def get_name(self) -> str:
        """Return the name of this parser."""
        return self.parser_name

    def get_description(self) -> str:
        """Return a description of this parser."""
        return getattr(self.__class__, "__doc__", "No description available")

    def validate_content(self, content: ParsedContent) -> bool:
        """
        Validate the parsed content to ensure it meets minimum requirements.

        Args:
            content: The parsed content to validate

        Returns:
            True if content is valid, False otherwise
        """
        # Basic validation - must have some full text
        if not content.full_text or len(content.full_text.strip()) < 100:
            return False

        return True

class ParseError(Exception):
    """Exception raised when parsing fails."""
    pass
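Adding support for another publisher means subclassing BaseParser and implementing the two abstract methods. A minimal illustrative subclass follows; the SpringerParser name and its single detection string are assumptions, not code from this changeset.

from typing import Optional
from bs4 import BeautifulSoup
from scipaperloader.parsers.base_parser import BaseParser, ParsedContent, ParseError

class SpringerParser(BaseParser):
    """Parser for Springer articles (illustrative only)."""

    def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
        return 'springer.com' in html_content.lower()

    def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator='\n', strip=True)
        if len(text) < 100:
            raise ParseError("No usable text found")
        return ParsedContent(full_text=text, doi=doi, metadata={'parser': 'springer'})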
252
scipaperloader/parsers/elsevier_parser.py
Normal file
@ -0,0 +1,252 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Dict, Optional, List
|
||||
from .base_parser import BaseParser, ParsedContent, ParseError
|
||||
|
||||
class ElsevierParser(BaseParser):
|
||||
"""Parser for Elsevier/ScienceDirect articles."""
|
||||
|
||||
def can_parse(self, html_content: str, url: Optional[str] = None) -> bool:
|
||||
"""Check if this is an Elsevier/ScienceDirect page."""
|
||||
html_lower = html_content.lower()
|
||||
|
||||
# Check for Elsevier/ScienceDirect indicators
|
||||
indicators = [
|
||||
'sciencedirect.com',
|
||||
'elsevier.com',
|
||||
'meta name="citation_publisher" content="elsevier"',
|
||||
'copyright.*elsevier',
|
||||
'sciencedirect',
|
||||
]
|
||||
|
||||
return any(indicator in html_lower for indicator in indicators)
|
||||
|
||||
def parse(self, html_content: str, doi: Optional[str] = None) -> ParsedContent:
|
||||
"""Parse Elsevier/ScienceDirect HTML content."""
|
||||
try:
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
|
||||
# Extract title
|
||||
title = self._extract_title(soup)
|
||||
|
||||
# Extract abstract
|
||||
abstract = self._extract_abstract(soup)
|
||||
|
||||
# Extract authors
|
||||
authors = self._extract_authors(soup)
|
||||
|
||||
# Extract full text
|
||||
full_text = self._extract_full_text(soup)
|
||||
|
||||
# Extract sections
|
||||
sections = self._extract_sections(soup)
|
||||
|
||||
# Extract keywords
|
||||
keywords = self._extract_keywords(soup)
|
||||
|
||||
# Extract references
|
||||
references = self._extract_references(soup)
|
||||
|
||||
# Extract journal info
|
||||
journal = self._extract_journal(soup)
|
||||
|
||||
# Extract publication date
|
||||
publication_date = self._extract_publication_date(soup)
|
||||
|
||||
# Combine everything into full text if sections exist
|
||||
if sections:
|
||||
full_text = self._combine_sections(sections, abstract)
|
||||
|
||||
if not full_text or len(full_text.strip()) < 100:
|
||||
raise ParseError("Could not extract meaningful full text content")
|
||||
|
||||
return ParsedContent(
|
||||
full_text=full_text,
|
||||
title=title,
|
||||
abstract=abstract,
|
||||
authors=authors,
|
||||
keywords=keywords,
|
||||
sections=sections,
|
||||
references=references,
|
||||
doi=doi,
|
||||
journal=journal,
|
||||
publication_date=publication_date,
|
||||
metadata={
|
||||
'parser': 'elsevier',
|
||||
'source': 'sciencedirect'
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
raise ParseError(f"Failed to parse Elsevier content: {str(e)}")
|
||||
|
||||
def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract article title."""
|
||||
# Try multiple title selectors
|
||||
selectors = [
|
||||
'h1.title-text',
|
||||
'h1[data-testid="title"]',
|
||||
'h1.article-title',
|
||||
'meta[name="citation_title"]',
|
||||
'title'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
if 'meta' in selector:
|
||||
element = soup.find('meta', attrs={'name': 'citation_title'})
|
||||
if element:
|
||||
return element.get('content', '').strip()
|
||||
else:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
return element.get_text(strip=True)
|
||||
|
||||
return None
|
||||
|
||||
def _extract_abstract(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract article abstract."""
|
||||
selectors = [
|
||||
'div.abstract-content',
|
||||
'div[data-testid="abstract"]',
|
||||
'div.abstract',
|
||||
'section.abstract',
|
||||
'div#abstract'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
return element.get_text(strip=True)
|
||||
|
||||
return None
|
||||
|
||||
def _extract_authors(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract author names."""
|
||||
authors = []
|
||||
|
||||
# Try author meta tags
|
||||
author_metas = soup.find_all('meta', attrs={'name': 'citation_author'})
|
||||
if author_metas:
|
||||
authors = [meta.get('content', '').strip() for meta in author_metas]
|
||||
|
||||
# Try author div/span elements
|
||||
if not authors:
|
||||
author_elements = soup.select('div.author a, span.author, .author-name')
|
||||
authors = [elem.get_text(strip=True) for elem in author_elements]
|
||||
|
||||
return authors if authors else None
|
||||
|
||||
def _extract_full_text(self, soup: BeautifulSoup) -> str:
|
||||
"""Extract main article content."""
|
||||
content_parts = []
|
||||
|
||||
# Try main content selectors
|
||||
main_selectors = [
|
||||
'div.article-content',
|
||||
'div.body-content',
|
||||
'main.article-body',
|
||||
'div[data-testid="article-body"]',
|
||||
'section.article-section'
|
||||
]
|
||||
|
||||
for selector in main_selectors:
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
# Remove script, style, and navigation elements
|
||||
for unwanted in element.find_all(['script', 'style', 'nav', 'footer', 'header']):
|
||||
unwanted.decompose()
|
||||
|
||||
text = element.get_text(separator='\n', strip=True)
|
||||
if text and len(text) > 50: # Only add substantial content
|
||||
content_parts.append(text)
|
||||
|
||||
return '\n\n'.join(content_parts)
|
||||
|
||||
def _extract_sections(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]:
|
||||
"""Extract article sections with headings."""
|
||||
sections = {}
|
||||
|
||||
# Look for section headings and content
|
||||
section_elements = soup.find_all(['h2', 'h3', 'h4'], class_=re.compile(r'section|heading'))
|
||||
|
||||
for heading in section_elements:
|
||||
section_title = heading.get_text(strip=True)
|
||||
|
||||
# Find content after this heading until next heading
|
||||
content_parts = []
|
||||
current = heading.next_sibling
|
||||
|
||||
while current and current.name not in ['h1', 'h2', 'h3', 'h4']:
|
||||
if hasattr(current, 'get_text'):
|
||||
text = current.get_text(strip=True)
|
||||
if text:
|
||||
content_parts.append(text)
|
||||
current = current.next_sibling
|
||||
|
||||
if content_parts:
|
||||
sections[section_title] = '\n'.join(content_parts)
|
||||
|
||||
return sections if sections else None
|
||||
|
||||
def _extract_keywords(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract article keywords."""
|
||||
keywords = []
|
||||
|
||||
# Try keyword meta tags
|
||||
keyword_metas = soup.find_all('meta', attrs={'name': 'citation_keywords'})
|
||||
if keyword_metas:
|
||||
for meta in keyword_metas:
|
||||
content = meta.get('content', '')
|
||||
if content:
|
||||
keywords.extend([kw.strip() for kw in content.split(',')])
|
||||
|
||||
# Try keyword sections
|
||||
if not keywords:
|
||||
keyword_sections = soup.select('div.keywords, section.keywords')
|
||||
for section in keyword_sections:
|
||||
text = section.get_text()
|
||||
keywords.extend([kw.strip() for kw in text.split(',') if kw.strip()])
|
||||
|
||||
return keywords if keywords else None
|
||||
|
||||
def _extract_references(self, soup: BeautifulSoup) -> Optional[List[str]]:
|
||||
"""Extract references."""
|
||||
references = []
|
||||
|
||||
ref_sections = soup.select('section.references, div.references, ol.references li')
|
||||
for section in ref_sections:
|
||||
if section.name == 'li':
|
||||
references.append(section.get_text(strip=True))
|
||||
else:
|
||||
ref_items = section.find_all(['li', 'div'], class_=re.compile(r'reference'))
|
||||
references.extend([item.get_text(strip=True) for item in ref_items])
|
||||
|
||||
return references if references else None
|
||||
|
||||
def _extract_journal(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract journal name."""
|
||||
journal_meta = soup.find('meta', attrs={'name': 'citation_journal_title'})
|
||||
if journal_meta:
|
||||
return journal_meta.get('content', '').strip()
|
||||
|
||||
return None
|
||||
|
||||
def _extract_publication_date(self, soup: BeautifulSoup) -> Optional[str]:
|
||||
"""Extract publication date."""
|
||||
date_meta = soup.find('meta', attrs={'name': 'citation_publication_date'})
|
||||
if date_meta:
|
||||
return date_meta.get('content', '').strip()
|
||||
|
||||
return None
|
||||
|
||||
def _combine_sections(self, sections: Dict[str, str], abstract: Optional[str] = None) -> str:
|
||||
"""Combine all sections into full text."""
|
||||
full_text_parts = []
|
||||
|
||||
if abstract:
|
||||
full_text_parts.append(f"Abstract\n{abstract}")
|
||||
|
||||
for section_title, section_content in sections.items():
|
||||
full_text_parts.append(f"{section_title}\n{section_content}")
|
||||
|
||||
return '\n\n'.join(full_text_parts)
|
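A possible consumer of the Elsevier parser, reusing the shared validate_content() check from BaseParser before anything is stored; the helper name is an assumption.

from typing import Optional
from scipaperloader.parsers import ElsevierParser, ParseError

def extract_elsevier_fulltext(html: str, doi: str) -> Optional[str]:
    parser = ElsevierParser()
    if not parser.can_parse(html):
        return None
    try:
        content = parser.parse(html, doi=doi)
    except ParseError:
        return None
    return content.full_text if parser.validate_content(content) else None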
593
scipaperloader/scheduler.py
Normal file
@ -0,0 +1,593 @@
|
||||
"""
|
||||
APScheduler-based scheduling system to replace complex Celery delayed task management.
|
||||
This provides clean job scheduling and revocation without manual Redis manipulation.
|
||||
"""
|
||||
|
||||
import random
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional, List
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore
|
||||
from apscheduler.executors.pool import ThreadPoolExecutor
|
||||
from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR, EVENT_JOB_MISSED
|
||||
from apscheduler.jobstores.base import JobLookupError
|
||||
|
||||
# Configure APScheduler logging
|
||||
logging.getLogger('apscheduler').setLevel(logging.WARNING)
|
||||
|
||||
# Global scheduler instance
|
||||
_scheduler = None
|
||||
_flask_app = None
|
||||
|
||||
|
||||
def _get_flask_app():
|
||||
"""Get the Flask app instance."""
|
||||
global _flask_app
|
||||
if _flask_app:
|
||||
return _flask_app
|
||||
|
||||
try:
|
||||
from flask import current_app
|
||||
return current_app
|
||||
except RuntimeError:
|
||||
return None
|
||||
|
||||
|
||||
def _hourly_scraper_scheduler():
|
||||
"""Standalone function for hourly scheduling logic."""
|
||||
app = _get_flask_app()
|
||||
if not app:
|
||||
return
|
||||
|
||||
with app.app_context():
|
||||
try:
|
||||
from .models import ScraperState, ActivityLog
|
||||
|
||||
# Check if scraper is active
|
||||
scraper_state = ScraperState.get_current_state()
|
||||
if not scraper_state.is_active:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="hourly_scheduler_apscheduler",
|
||||
status="info",
|
||||
description="Hourly scheduler skipped - scraper not active"
|
||||
)
|
||||
return {"status": "inactive", "papers_scheduled": 0}
|
||||
|
||||
if scraper_state.is_paused:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="hourly_scheduler_apscheduler",
|
||||
status="info",
|
||||
description="Hourly scheduler skipped - scraper paused"
|
||||
)
|
||||
return {"status": "paused", "papers_scheduled": 0}
|
||||
|
||||
# Get papers to process this hour
|
||||
from .scrapers.manager import ScraperManager
|
||||
manager = ScraperManager()
|
||||
papers = manager.select_papers_for_processing()
|
||||
|
||||
if not papers:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="hourly_scheduler_apscheduler",
|
||||
status="info",
|
||||
description="No papers available for processing this hour"
|
||||
)
|
||||
return {"status": "empty", "papers_scheduled": 0}
|
||||
|
||||
# Schedule papers at random times within the hour
|
||||
scheduled_count = 0
|
||||
current_time = datetime.now()
|
||||
scheduled_papers = []
|
||||
|
||||
for paper in papers:
|
||||
# Random delay between 1 second and 58 minutes
|
||||
delay_seconds = random.randint(1, 3480) # Up to 58 minutes
|
||||
run_time = current_time + timedelta(seconds=delay_seconds)
|
||||
|
||||
# Schedule the individual paper processing job with unique ID
|
||||
# Include microseconds and random suffix to prevent collisions
|
||||
import uuid
|
||||
job_id = f"process_paper_{paper.id}_{int(current_time.timestamp())}_{uuid.uuid4().hex[:8]}"
|
||||
|
||||
global _scheduler
|
||||
if _scheduler:
|
||||
_scheduler.add_job(
|
||||
func=_process_single_paper,
|
||||
trigger='date',
|
||||
run_date=run_time,
|
||||
args=[paper.id],
|
||||
id=job_id,
|
||||
replace_existing=True, # Changed to True to handle conflicts gracefully
|
||||
name=f"Process Paper {paper.doi}"
|
||||
)
|
||||
|
||||
scheduled_count += 1
|
||||
|
||||
# Collect paper info for single log entry
|
||||
paper_info = {
|
||||
"paper_id": paper.id,
|
||||
"paper_doi": paper.doi,
|
||||
"job_id": job_id,
|
||||
"scheduled_time": run_time.isoformat(),
|
||||
"delay_seconds": delay_seconds
|
||||
}
|
||||
scheduled_papers.append(paper_info)
|
||||
|
||||
# Create single comprehensive log entry with JSON data
|
||||
try:
|
||||
import json
|
||||
from .models import ActivityLog
|
||||
|
||||
scheduling_data = {
|
||||
"total_scheduled": scheduled_count,
|
||||
"scheduled_papers": scheduled_papers,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"hour_range": f"{current_time.strftime('%H:%M')} - {(current_time + timedelta(hours=1)).strftime('%H:%M')}"
|
||||
}
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="hourly_scheduler_apscheduler",
|
||||
status="success",
|
||||
description=f"Scheduled {scheduled_count} papers for random processing within this hour using APScheduler. See extra_data for details.",
|
||||
**{"scheduling_details": json.dumps(scheduling_data)}
|
||||
)
|
||||
except Exception:
|
||||
# Fallback to simple logging
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="hourly_scheduler_apscheduler",
|
||||
status="success",
|
||||
description=f"Scheduled {scheduled_count} papers for random processing within this hour using APScheduler"
|
||||
)
|
||||
|
||||
return {"status": "success", "papers_scheduled": scheduled_count}
|
||||
|
||||
except Exception as e:
|
||||
from .models import ActivityLog
|
||||
ActivityLog.log_error(
|
||||
error_message=f"APScheduler hourly scheduler error: {str(e)}",
|
||||
source="_hourly_scraper_scheduler"
|
||||
)
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
|
||||
def _process_single_paper(paper_id: int):
|
||||
"""Standalone function to process a single paper."""
|
||||
app = _get_flask_app()
|
||||
if not app:
|
||||
return
|
||||
|
||||
with app.app_context():
|
||||
try:
|
||||
from .models import ScraperState, ActivityLog, PaperMetadata
|
||||
|
||||
# Enhanced race condition protection
|
||||
scraper_state = ScraperState.get_current_state()
|
||||
if not scraper_state.is_active:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_single_paper_apscheduler",
|
||||
paper_id=paper_id,
|
||||
status="skipped",
|
||||
description="Task skipped - scraper not active (APScheduler)"
|
||||
)
|
||||
return {"status": "inactive", "paper_id": paper_id}
|
||||
|
||||
if scraper_state.is_paused:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_single_paper_apscheduler",
|
||||
paper_id=paper_id,
|
||||
status="skipped",
|
||||
description="Task skipped - scraper paused (APScheduler)"
|
||||
)
|
||||
return {"status": "paused", "paper_id": paper_id}
|
||||
|
||||
# Get the paper
|
||||
paper = PaperMetadata.query.get(paper_id)
|
||||
if not paper:
|
||||
return {"status": "error", "message": f"Paper {paper_id} not found"}
|
||||
|
||||
# Final check before processing
|
||||
scraper_state = ScraperState.get_current_state()
|
||||
if not scraper_state.is_active:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_single_paper_apscheduler",
|
||||
paper_id=paper_id,
|
||||
status="skipped",
|
||||
description="Task skipped - scraper not active (pre-processing check)"
|
||||
)
|
||||
return {"status": "inactive", "paper_id": paper_id}
|
||||
|
||||
# Process the paper using scraper manager
|
||||
from .scrapers.manager import ScraperManager
|
||||
manager = ScraperManager()
|
||||
result = manager.process_paper(paper)
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
from .models import ActivityLog
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error processing paper {paper_id} in APScheduler: {str(e)}",
|
||||
source="_process_single_paper"
|
||||
)
|
||||
return {"status": "error", "paper_id": paper_id, "message": str(e)}
|
||||
|
||||
|
||||
def _process_single_paper_manual(paper_id: int, scraper_name: Optional[str] = None):
|
||||
"""Standalone function to process a single paper manually (bypasses scraper state checks)."""
|
||||
app = _get_flask_app()
|
||||
if not app:
|
||||
return
|
||||
|
||||
with app.app_context():
|
||||
try:
|
||||
from .models import ActivityLog, PaperMetadata
|
||||
|
||||
# Get the paper
|
||||
paper = PaperMetadata.query.get(paper_id)
|
||||
if not paper:
|
||||
return {"status": "error", "message": f"Paper {paper_id} not found"}
|
||||
|
||||
# Process the paper using manual method (bypasses scraper state checks)
|
||||
from .scrapers.manager import ScraperManager
|
||||
manager = ScraperManager()
|
||||
result = manager.process_paper_manual(paper, scraper_name=scraper_name)
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
from .models import ActivityLog
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error manually processing paper {paper_id} in APScheduler: {str(e)}",
|
||||
source="_process_single_paper_manual"
|
||||
)
|
||||
return {"status": "error", "paper_id": paper_id, "message": str(e)}
|
||||
|
||||
|
||||
def _job_listener(event):
|
||||
"""Listen to job execution events."""
|
||||
app = _get_flask_app()
|
||||
if not app:
|
||||
return
|
||||
|
||||
with app.app_context():
|
||||
try:
|
||||
from .models import ActivityLog
|
||||
|
||||
job_id = event.job_id
|
||||
|
||||
if event.exception:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"APScheduler job {job_id} failed: {str(event.exception)}",
|
||||
source="ScraperScheduler.job_listener"
|
||||
)
|
||||
elif hasattr(event, 'retval') and event.retval:
|
||||
# Job completed successfully
|
||||
if job_id.startswith('process_paper_'):
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="apscheduler_job_complete",
|
||||
status="success",
|
||||
description=f"Job {job_id} completed successfully"
|
||||
)
|
||||
except Exception as e:
|
||||
# Don't let logging errors break the scheduler
|
||||
print(f"Error in job listener: {str(e)}")
|
||||
|
||||
|
||||
class ScraperScheduler:
|
||||
"""APScheduler-based scraper task scheduler."""
|
||||
|
||||
def __init__(self, app=None):
|
||||
self.app = app
|
||||
if app:
|
||||
self.init_app(app)
|
||||
|
||||
@property
|
||||
def scheduler(self):
|
||||
"""Expose the global _scheduler instance."""
|
||||
global _scheduler
|
||||
return _scheduler
|
||||
|
||||
def init_app(self, app):
|
||||
"""Initialize the scheduler with Flask app context."""
|
||||
global _scheduler, _flask_app
|
||||
_flask_app = app
|
||||
self.app = app
|
||||
|
||||
# Initialize scheduler within app context to access db.engine properly
|
||||
with app.app_context():
|
||||
# Use the existing Flask-SQLAlchemy database engine for APScheduler
|
||||
from .db import db
|
||||
|
||||
# Configure job store to use the existing database engine
|
||||
jobstores = {
|
||||
'default': SQLAlchemyJobStore(engine=db.engine)
|
||||
}
|
||||
|
||||
# Configure thread pool executor
|
||||
executors = {
|
||||
'default': ThreadPoolExecutor(max_workers=50) # Increased from 20 to 50
|
||||
}
|
||||
|
||||
# Job defaults
|
||||
job_defaults = {
|
||||
'coalesce': False, # Don't combine multiple scheduled instances
|
||||
'max_instances': 3, # Allow up to 3 instances of the same job
|
||||
'misfire_grace_time': 30 # 30 seconds grace period for missed jobs
|
||||
}
|
||||
|
||||
# Get timezone from database configuration
|
||||
from .models import TimezoneConfig
|
||||
configured_timezone = TimezoneConfig.get_current_timezone()
|
||||
|
||||
# Create the scheduler
|
||||
_scheduler = BackgroundScheduler(
|
||||
jobstores=jobstores,
|
||||
executors=executors,
|
||||
job_defaults=job_defaults,
|
||||
timezone=configured_timezone # Use configurable timezone from database
|
||||
)
|
||||
|
||||
# Add event listeners
|
||||
_scheduler.add_listener(_job_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR | EVENT_JOB_MISSED)
|
||||
|
||||
# Start the scheduler FIRST, which will auto-create tables
|
||||
_scheduler.start()
|
||||
|
||||
# THEN add the hourly scraper job
|
||||
_scheduler.add_job(
|
||||
func=_hourly_scraper_scheduler,
|
||||
trigger='cron',
|
||||
minute=0, # Run at the start of every hour
|
||||
id='hourly_scraper_main',
|
||||
replace_existing=True,
|
||||
name='Hourly Scraper Scheduler'
|
||||
)
|
||||
|
||||
try:
|
||||
from .models import ActivityLog
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="apscheduler_init",
|
||||
status="success",
|
||||
description="APScheduler initialized with database job store and hourly scheduling"
|
||||
)
|
||||
except Exception:
|
||||
# Handle case where we're outside application context
|
||||
print("✅ APScheduler initialized successfully")
|
||||
|
||||
def revoke_all_scraper_jobs(self) -> int:
|
||||
"""Clean replacement for the complex _clear_delayed_tasks_from_redis method."""
|
||||
global _scheduler
|
||||
if not _scheduler:
|
||||
try:
|
||||
from .models import ActivityLog
|
||||
ActivityLog.log_error(
|
||||
error_message="Scheduler not initialized - cannot revoke jobs",
|
||||
source="ScraperScheduler.revoke_all_scraper_jobs"
|
||||
)
|
||||
except Exception:
|
||||
print("❌ Scheduler not initialized - cannot revoke jobs")
|
||||
return 0
|
||||
|
||||
revoked_count = 0
|
||||
revoked_jobs = []
|
||||
already_gone_jobs = []
|
||||
failed_jobs = []
|
||||
|
||||
try:
|
||||
# Get all jobs
|
||||
jobs = _scheduler.get_jobs()
|
||||
|
||||
for job in jobs:
|
||||
# Remove any job that processes papers or uploads (but keep the main hourly scheduler)
|
||||
if ('paper_process_' in job.id or 'test_paper_process_' in job.id or
|
||||
'process_paper_' in job.id or 'csv_upload_' in job.id or 'manual_paper_' in job.id or
|
||||
'startup_paper_' in job.id):
|
||||
try:
|
||||
_scheduler.remove_job(job.id)
|
||||
revoked_count += 1
|
||||
|
||||
# Collect job info for single log entry
|
||||
job_info = {
|
||||
"job_id": job.id,
|
||||
"job_name": job.name,
|
||||
"next_run_time": job.next_run_time.isoformat() if job.next_run_time else None,
|
||||
"args": job.args
|
||||
}
|
||||
revoked_jobs.append(job_info)
|
||||
|
||||
print(f"✅ Revoked APScheduler job: {job.id}")
|
||||
|
||||
except JobLookupError as e:
|
||||
# Job already removed/completed - this is normal
|
||||
already_gone_jobs.append({
|
||||
"job_id": job.id,
|
||||
"reason": str(e)
|
||||
})
|
||||
print(f"ℹ️ Job {job.id} was already completed or removed")
|
||||
|
||||
except Exception as e:
|
||||
# Other error - log it but continue
|
||||
failed_jobs.append({
|
||||
"job_id": job.id,
|
||||
"error": str(e)
|
||||
})
|
||||
print(f"❌ Error removing job {job.id}: {str(e)}")
|
||||
|
||||
# Create single comprehensive log entry with JSON data
|
||||
if revoked_jobs or already_gone_jobs or failed_jobs:
|
||||
try:
|
||||
import json
|
||||
from .models import ActivityLog
|
||||
|
||||
revocation_data = {
|
||||
"total_revoked": revoked_count,
|
||||
"revoked_jobs": revoked_jobs,
|
||||
"already_gone_jobs": already_gone_jobs,
|
||||
"failed_jobs": failed_jobs,
|
||||
"timestamp": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="revoke_all_scraper_jobs_apscheduler",
|
||||
status="success",
|
||||
description=f"Successfully revoked {revoked_count} APScheduler jobs. See extra_data for details.",
|
||||
**{"revocation_details": json.dumps(revocation_data)}
|
||||
)
|
||||
except Exception:
|
||||
print(f"✅ Successfully revoked {revoked_count} APScheduler jobs")
|
||||
|
||||
return revoked_count
|
||||
|
||||
except Exception as e:
|
||||
try:
|
||||
from .models import ActivityLog
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error revoking APScheduler jobs: {str(e)}",
|
||||
source="ScraperScheduler.revoke_all_scraper_jobs"
|
||||
)
|
||||
except Exception:
|
||||
print(f"❌ Error revoking APScheduler jobs: {str(e)}")
|
||||
return 0
|
||||
|
||||
def get_job_count(self) -> int:
|
||||
"""Get the number of scheduled jobs."""
|
||||
global _scheduler
|
||||
if not _scheduler:
|
||||
return 0
|
||||
return len(_scheduler.get_jobs())
|
||||
|
||||
def get_paper_jobs(self) -> List[dict]:
|
||||
"""Get information about scheduled paper processing jobs."""
|
||||
global _scheduler
|
||||
if not _scheduler:
|
||||
return []
|
||||
|
||||
jobs = []
|
||||
all_jobs = _scheduler.get_jobs()
|
||||
|
||||
for job in all_jobs:
|
||||
# Match jobs that contain paper processing patterns
|
||||
if ('process_paper_' in job.id or 'paper_process_' in job.id or 'test_paper_process_' in job.id):
|
||||
job_info = {
|
||||
'id': job.id,
|
||||
'name': job.name,
|
||||
'next_run_time': job.next_run_time.isoformat() if job.next_run_time else None,
|
||||
'args': job.args
|
||||
}
|
||||
jobs.append(job_info)
|
||||
|
||||
return jobs
|
||||
|
||||
def shutdown(self):
|
||||
"""Gracefully shutdown the scheduler."""
|
||||
global _scheduler
|
||||
if _scheduler:
|
||||
try:
|
||||
from .models import ActivityLog
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="apscheduler_shutdown",
|
||||
status="info",
|
||||
description="Shutting down APScheduler"
|
||||
)
|
||||
except Exception:
|
||||
print("🔄 Shutting down APScheduler")
|
||||
|
||||
_scheduler.shutdown(wait=False)
|
||||
_scheduler = None
|
||||
|
||||
def schedule_paper_processing(self, paper_id: int, delay_seconds: int = 0, job_id: Optional[str] = None) -> str:
|
||||
"""Schedule a paper for processing with APScheduler.
|
||||
|
||||
Args:
|
||||
paper_id: ID of the paper to process
|
||||
delay_seconds: Delay in seconds before processing (default: 0 for immediate)
|
||||
job_id: Optional custom job ID (will be generated if not provided)
|
||||
|
||||
Returns:
|
||||
str: The job ID of the scheduled job
|
||||
"""
|
||||
global _scheduler
|
||||
if not _scheduler:
|
||||
raise RuntimeError("APScheduler not initialized")
|
||||
|
||||
# Generate job ID if not provided
|
||||
if not job_id:
|
||||
# Use microseconds and UUID suffix to prevent collisions
|
||||
import uuid
|
||||
job_id = f"process_paper_{paper_id}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{uuid.uuid4().hex[:8]}"
|
||||
|
||||
# Calculate run time
|
||||
run_time = datetime.now() + timedelta(seconds=delay_seconds)
|
||||
|
||||
# Schedule the job
|
||||
job = _scheduler.add_job(
|
||||
func=_process_single_paper,
|
||||
trigger='date',
|
||||
run_date=run_time,
|
||||
args=[paper_id],
|
||||
id=job_id,
|
||||
name=f"Process Paper {paper_id}",
|
||||
replace_existing=True
|
||||
)
|
||||
|
||||
# Log the scheduling
|
||||
try:
|
||||
from .models import ActivityLog
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="schedule_paper_processing_apscheduler",
|
||||
paper_id=paper_id,
|
||||
status="info",
|
||||
description=f"Scheduled paper {paper_id} for processing at {run_time.strftime('%H:%M:%S')} (Job ID: {job_id})"
|
||||
)
|
||||
except Exception:
|
||||
print(f"✅ Scheduled paper {paper_id} for processing (Job ID: {job_id})")
|
||||
|
||||
return job_id
|
||||
|
||||
def schedule_manual_paper_processing(self, paper_id: int, scraper_name: Optional[str] = None, delay_seconds: int = 0, job_id: Optional[str] = None) -> str:
|
||||
"""
|
||||
Schedule manual paper processing that bypasses scraper state checks.
|
||||
|
||||
Args:
|
||||
paper_id: ID of the paper to process
|
||||
scraper_name: Optional specific scraper module to use (defaults to system scraper)
|
||||
delay_seconds: Delay before processing starts (default: 0)
|
||||
job_id: Optional custom job ID (auto-generated if not provided)
|
||||
|
||||
Returns:
|
||||
Job ID of the scheduled task
|
||||
"""
|
||||
global _scheduler
|
||||
if not _scheduler:
|
||||
raise RuntimeError("APScheduler not initialized")
|
||||
|
||||
if job_id is None:
|
||||
job_id = f"manual_paper_{paper_id}_{datetime.now().strftime('%Y%m%d_%H%M%S_%f')}"
|
||||
|
||||
run_time = datetime.now() + timedelta(seconds=delay_seconds)
|
||||
|
||||
# Schedule the manual processing job
|
||||
job = _scheduler.add_job(
|
||||
func=_process_single_paper_manual,
|
||||
trigger='date',
|
||||
run_date=run_time,
|
||||
args=[paper_id, scraper_name],
|
||||
id=job_id,
|
||||
name=f"Manual Process Paper {paper_id}",
|
||||
replace_existing=True
|
||||
)
|
||||
|
||||
# Log the scheduling
|
||||
try:
|
||||
from .models import ActivityLog
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="schedule_manual_paper_processing",
|
||||
paper_id=paper_id,
|
||||
status="info",
|
||||
description=f"Scheduled manual processing for paper {paper_id} at {run_time.strftime('%H:%M:%S')} (Job ID: {job_id})"
|
||||
)
|
||||
except Exception:
|
||||
pass # Don't fail if logging fails
|
||||
|
||||
return job_id
|
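ScraperManager (below) looks the scheduler up via current_app.config.get('SCHEDULER'), so the application factory presumably creates and stores it along these lines; the exact factory code is not part of this diff.

# Assumed wiring in the application factory.
from flask import Flask
from scipaperloader.scheduler import ScraperScheduler

def init_scheduler(app: Flask) -> ScraperScheduler:
    scheduler = ScraperScheduler(app)    # starts APScheduler with the SQLAlchemy job store
    app.config['SCHEDULER'] = scheduler  # read back by ScraperManager._get_scheduler()
    return scheduler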
@ -18,6 +18,43 @@ class BaseScraper(ABC):
    OUTPUT_STATUS_FAILURE = "Failed"  # Status to set on failed scraping
    OUTPUT_STATUS_PROCESSING = "Pending"  # Status to set while processing

    def __init__(self):
        """Initialize the scraper."""
        self.scraper_name = self.get_name().lower()

    def log_scrape_start(self, doi: str, paper_id: Optional[int] = None):
        """Log the start of a scraping operation."""
        from ..models import ActivityLog

        ActivityLog.log_scraper_activity(
            action=f"{self.scraper_name}_scrape_start",
            status="info",
            description=f"Starting {self.get_name()} for DOI: {doi}",
            paper_id=paper_id
        )

    def log_scrape_success(self, doi: str, message: str, paper_id: Optional[int] = None):
        """Log successful completion of scraping."""
        from ..models import ActivityLog

        ActivityLog.log_scraper_activity(
            action=f"{self.scraper_name}_scrape_success",
            status="success",
            description=f"{self.get_name()} completed successfully for DOI: {doi} - {message}",
            paper_id=paper_id
        )

    def log_scrape_failure(self, doi: str, message: str, paper_id: Optional[int] = None):
        """Log failed scraping operation."""
        from ..models import ActivityLog

        ActivityLog.log_scraper_activity(
            action=f"{self.scraper_name}_scrape_failure",
            status="error",
            description=f"{self.get_name()} failed for DOI: {doi} - {message}",
            paper_id=paper_id
        )

    @abstractmethod
    def scrape(self, doi: str) -> ScrapeResult:
        """
@ -30,6 +30,9 @@ class Scraper(BaseScraper):
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Simulate processing time (1-3 seconds)
|
||||
processing_time = random.uniform(1, 3)
|
||||
time.sleep(processing_time)
|
||||
@ -145,12 +148,7 @@ class Scraper(BaseScraper):
|
||||
)
|
||||
|
||||
# Log success
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="dummy_scrape",
|
||||
status="success",
|
||||
description=f"Successfully scraped {doi}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_success(doi, f"Successfully scraped {doi}", paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="success",
|
||||
@ -178,12 +176,7 @@ class Scraper(BaseScraper):
|
||||
paper.error_msg = error_msg
|
||||
|
||||
# Log failure
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="dummy_scrape",
|
||||
status="error",
|
||||
description=f"Failed to scrape {doi}: {error_msg}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="error",
|
||||
|
@ -30,13 +30,8 @@ class Scraper(BaseScraper):
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log retry attempt
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_failed_paper",
|
||||
status="info",
|
||||
description=f"Retrying failed paper: {paper.title}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
# Log start of retry
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Simulate longer processing time for retry (2-5 seconds)
|
||||
processing_time = random.uniform(2, 5)
|
||||
@ -64,12 +59,7 @@ class Scraper(BaseScraper):
|
||||
result_data = {"file_path": file_path}
|
||||
|
||||
# Log success
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_scrape_success",
|
||||
status="success",
|
||||
description=f"Successfully retried {doi} on second attempt",
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_success(doi, f"Successfully retried {doi} on second attempt", paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="success",
|
||||
@ -81,12 +71,7 @@ class Scraper(BaseScraper):
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Failed to save retry file: {str(e)}"
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_scrape_file_error",
|
||||
status="error",
|
||||
description=error_msg,
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="error",
|
||||
@ -105,12 +90,7 @@ class Scraper(BaseScraper):
|
||||
]
|
||||
error_msg = random.choice(error_messages)
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_scrape_failure",
|
||||
status="error",
|
||||
description=f"Retry failed for {doi}: {error_msg}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="error",
|
||||
|
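Taken together, these hunks move all per-scrape logging into the BaseScraper helpers, so a scraper's scrape() reduces to the pattern sketched below; the class name, statuses, and the omitted work are placeholders.

import time
from datetime import datetime
from scipaperloader.scrapers.base import BaseScraper, ScrapeResult

class ExampleScraper(BaseScraper):
    INPUT_STATUSES = ["New"]
    OUTPUT_STATUS_SUCCESS = "Done"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "Pending"

    def scrape(self, doi: str) -> ScrapeResult:
        start = time.time()
        self.log_scrape_start(doi)
        try:
            # ... fetch, save, and update the paper here ...
            self.log_scrape_success(doi, "scrape finished")
            return ScrapeResult(status="success", message="scrape finished", data=None,
                                duration=time.time() - start, timestamp=datetime.utcnow())
        except Exception as exc:
            self.log_scrape_failure(doi, str(exc))
            return ScrapeResult(status="error", message=str(exc), data=None,
                                duration=time.time() - start, timestamp=datetime.utcnow())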
172
scipaperloader/scrapers/html_fetcher.py
Normal file
@ -0,0 +1,172 @@
|
||||
import time
|
||||
import os
|
||||
import requests
|
||||
from datetime import datetime
|
||||
from .base import BaseScraper, ScrapeResult
|
||||
from flask import current_app
|
||||
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
|
||||
from ..db import db
|
||||
|
||||
class Scraper(BaseScraper):
|
||||
"""Scraper that fetches HTML content from DOI and saves it for further processing."""
|
||||
|
||||
# This scraper processes "New" papers and outputs "HtmlDownloaded"/"Failed"
|
||||
INPUT_STATUSES = ["New"]
|
||||
OUTPUT_STATUS_SUCCESS = "HtmlDownloaded"
|
||||
OUTPUT_STATUS_FAILURE = "Failed"
|
||||
OUTPUT_STATUS_PROCESSING = "FetchingHtml"
|
||||
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""Fetch HTML content from DOI and save to download path."""
|
||||
start_time = time.time()
|
||||
|
||||
paper = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
if not paper:
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=f"No paper found for DOI {doi}",
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Update status to processing
|
||||
paper.status = self.OUTPUT_STATUS_PROCESSING
|
||||
db.session.commit()
|
||||
|
||||
# Prepare file paths
|
||||
download_path = DownloadPathConfig.get_path()
|
||||
file_name = f"{doi.replace('/', '_')}.html"
|
||||
file_path = os.path.join(download_path, file_name)
|
||||
|
||||
# Check/create download directory (same pattern as dummy)
|
||||
if not os.path.exists(download_path):
|
||||
try:
|
||||
os.makedirs(download_path, exist_ok=True)
|
||||
except OSError as e:
|
||||
error_msg = f"Failed to create download directory: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "path_creation_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Check path permissions (same pattern as dummy)
|
||||
if not os.access(download_path, os.W_OK):
|
||||
error_msg = f"Download path '{download_path}' is not writable"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="html_fetch_path_error",
|
||||
status="error",
|
||||
description=error_msg,
|
||||
paper_id=paper.id
|
||||
)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "path_write_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
try:
|
||||
# Fetch HTML from DOI
|
||||
doi_url = f"https://doi.org/{doi}"
|
||||
headers = {'User-Agent': 'SciPaperLoader/1.0'}
|
||||
response = requests.get(doi_url, headers=headers, timeout=30, allow_redirects=True)
|
||||
|
||||
# Check for invalid DOI (404) or other HTTP errors
|
||||
if response.status_code == 404:
|
||||
error_msg = f"Invalid DOI: {doi} not found"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "invalid_doi"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
response.raise_for_status() # Raise for other HTTP errors
|
||||
|
||||
# Save HTML content
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
f.write(response.text)
|
||||
|
||||
# Update paper status to success
|
||||
paper.status = self.OUTPUT_STATUS_SUCCESS
|
||||
paper.file_path = file_path
|
||||
paper.error_msg = None
|
||||
db.session.commit()
|
||||
|
||||
# Log success
|
||||
self.log_scrape_success(doi, f"Successfully fetched HTML for {doi}", paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="success",
|
||||
message=f"Successfully fetched HTML for {doi}",
|
||||
data={
|
||||
"file_path": file_path,
|
||||
"url": response.url, # Final URL after redirects
|
||||
"title": paper.title
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
error_msg = f"Failed to fetch HTML from DOI {doi}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="html_fetch",
|
||||
status="error",
|
||||
description=error_msg,
|
||||
paper_id=paper.id
|
||||
)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "network_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Failed to save HTML file: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "file_creation_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
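Downstream of this scraper, a paper sits in status "HtmlDownloaded" with file_path pointing at the saved HTML, which a later step can hand to the parsers added earlier. A rough sketch, reusing the hypothetical select_parser() helper from the parsers/__init__.py note above:

from typing import Optional
from scipaperloader.models import PaperMetadata

def parse_downloaded_paper(paper: PaperMetadata) -> Optional[str]:
    if paper.status != "HtmlDownloaded" or not paper.file_path:
        return None
    with open(paper.file_path, encoding="utf-8") as fh:
        html = fh.read()
    parser = select_parser(html)  # hypothetical helper sketched after parsers/__init__.py
    return parser.parse(html, doi=paper.doi).full_text if parser else None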
@ -1,13 +1,14 @@
"""
Simplified scraper management system with hourly quota scheduling.
Uses APScheduler for all task processing - no Celery dependencies.
"""

import random
import math
import redis
from datetime import datetime, timedelta
from datetime import datetime, timedelta, UTC
from typing import List, Dict, Optional
from sqlalchemy import func
from flask import current_app

from ..models import (
    PaperMetadata,
@ -20,7 +21,6 @@ from ..models import (
from ..db import db
from ..cache_utils import get_cached_hourly_quota
from .factory import get_scraper, get_available_scrapers
from ..celery import celery


class ScraperManager:
@ -29,237 +29,81 @@ class ScraperManager:
|
||||
def __init__(self):
|
||||
self.current_scraper = None
|
||||
self.pending_papers = [] # Track papers being processed
|
||||
# Initialize Redis client for delayed task management
|
||||
self.redis_client = None
|
||||
self._init_redis_client()
|
||||
# No more Redis client initialization - using APScheduler now
|
||||
|
||||
def _init_redis_client(self):
|
||||
"""Initialize Redis client for delayed task management."""
|
||||
def _get_scheduler(self):
|
||||
"""Get the ScraperScheduler instance from Flask app config."""
|
||||
try:
|
||||
# Use same Redis configuration as Celery
|
||||
self.redis_client = redis.Redis(
|
||||
host='localhost',
|
||||
port=6379,
|
||||
db=0,
|
||||
decode_responses=True
|
||||
)
|
||||
# Test connection
|
||||
self.redis_client.ping()
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Failed to initialize Redis client: {str(e)}",
|
||||
source="ScraperManager._init_redis_client"
|
||||
)
|
||||
self.redis_client = None
|
||||
return current_app.config.get('SCHEDULER')
|
||||
except RuntimeError:
|
||||
# Outside application context
|
||||
return None
|
||||
|
||||
def _clear_delayed_tasks_from_redis(self) -> int:
|
||||
"""Clear delayed tasks from Redis structures used by Celery.
|
||||
def _get_raw_scheduler(self):
|
||||
"""Get the raw APScheduler instance for direct job scheduling."""
|
||||
try:
|
||||
scheduler_wrapper = current_app.config.get('SCHEDULER')
|
||||
if scheduler_wrapper:
|
||||
return scheduler_wrapper.scheduler
|
||||
return None
|
||||
except RuntimeError:
|
||||
return None
|
||||
|
||||
Based on analysis, Celery stores delayed tasks in:
|
||||
- 'unacked_index': Sorted set containing task IDs with execution timestamps
|
||||
- 'unacked': Hash containing task data keyed by task ID
|
||||
|
||||
def _clear_delayed_tasks_from_apscheduler(self) -> int:
|
||||
"""Clear delayed tasks from APScheduler - clean replacement for Redis manipulation.
|
||||
|
||||
Returns:
|
||||
int: Number of delayed tasks cleared
|
||||
"""
|
||||
if not self.redis_client:
|
||||
scheduler = self._get_scheduler()
|
||||
if not scheduler:
|
||||
try:
|
||||
ActivityLog.log_error(
|
||||
error_message="Redis client not available - cannot clear delayed tasks",
|
||||
source="ScraperManager._clear_delayed_tasks_from_redis"
|
||||
error_message="APScheduler not available - cannot clear delayed tasks",
|
||||
source="ScraperManager._clear_delayed_tasks_from_apscheduler"
|
||||
)
|
||||
except RuntimeError:
|
||||
# Working outside application context - just print instead
|
||||
print("❌ Redis client not available - cannot clear delayed tasks")
|
||||
print("❌ APScheduler not available - cannot clear delayed tasks")
|
||||
return 0
|
||||
|
||||
cleared_count = 0
|
||||
try:
|
||||
# Define scraper task patterns to identify our tasks
|
||||
scraper_patterns = [
|
||||
'process_single_paper',
|
||||
'process_papers_batch',
|
||||
'hourly_scraper_scheduler'
|
||||
]
|
||||
cleared_count = scheduler.revoke_all_scraper_jobs()
|
||||
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="check_delayed_tasks",
|
||||
status="info",
|
||||
description="Checking Celery delayed task structures (unacked_index, unacked)"
|
||||
)
|
||||
except RuntimeError:
|
||||
print("🔍 Checking Celery delayed task structures (unacked_index, unacked)")
|
||||
|
||||
# Check 'unacked_index' (sorted set with task IDs and timestamps)
|
||||
unacked_index_cleared = 0
|
||||
if self.redis_client.exists('unacked_index'):
|
||||
try:
|
||||
# Get all task IDs from the sorted set
|
||||
task_ids = self.redis_client.zrange('unacked_index', 0, -1)
|
||||
|
||||
if task_ids:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="scan_unacked_index",
|
||||
status="info",
|
||||
description=f"Found {len(task_ids)} tasks in 'unacked_index'"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"📋 Found {len(task_ids)} tasks in 'unacked_index'")
|
||||
|
||||
# Check each task ID against the 'unacked' hash to get task details
|
||||
scraper_task_ids = []
|
||||
for task_id in task_ids:
|
||||
try:
|
||||
# Get task data from 'unacked' hash
|
||||
task_data = self.redis_client.hget('unacked', task_id)
|
||||
if task_data:
|
||||
# Check if this task contains any of our scraper patterns
|
||||
if any(pattern in str(task_data) for pattern in scraper_patterns):
|
||||
scraper_task_ids.append(task_id)
|
||||
except Exception:
|
||||
# Skip individual task errors
|
||||
continue
|
||||
|
||||
# Remove scraper task IDs from both structures
|
||||
for task_id in scraper_task_ids:
|
||||
try:
|
||||
# Remove from unacked_index (sorted set)
|
||||
removed_from_index = self.redis_client.zrem('unacked_index', task_id)
|
||||
# Remove from unacked (hash)
|
||||
removed_from_hash = self.redis_client.hdel('unacked', task_id)
|
||||
|
||||
if removed_from_index or removed_from_hash:
|
||||
unacked_index_cleared += 1
|
||||
|
||||
except Exception as e:
|
||||
try:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error removing delayed task {task_id}: {str(e)}",
|
||||
source="ScraperManager._clear_delayed_tasks_from_redis"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"❌ Error removing delayed task {task_id}: {str(e)}")
|
||||
continue
|
||||
|
||||
cleared_count += unacked_index_cleared
|
||||
|
||||
if unacked_index_cleared > 0:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="clear_unacked_tasks",
|
||||
status="success",
|
||||
description=f"Cleared {unacked_index_cleared} scraper tasks from unacked structures"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"✅ Cleared {unacked_index_cleared} scraper tasks from unacked structures")
|
||||
else:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="check_unacked_index",
|
||||
status="info",
|
||||
description="No tasks found in 'unacked_index'"
|
||||
)
|
||||
except RuntimeError:
|
||||
print("ℹ️ No tasks found in 'unacked_index'")
|
||||
|
||||
except Exception as e:
|
||||
try:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error accessing 'unacked_index': {str(e)}",
|
||||
source="ScraperManager._clear_delayed_tasks_from_redis"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"❌ Error accessing 'unacked_index': {str(e)}")
|
||||
else:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="check_unacked_index",
|
||||
status="info",
|
||||
description="'unacked_index' key does not exist - no delayed tasks"
|
||||
)
|
||||
except RuntimeError:
|
||||
print("ℹ️ 'unacked_index' key does not exist - no delayed tasks")
|
||||
|
||||
# Also check the 'celery' queue for immediate tasks (backup check)
|
||||
celery_cleared = 0
|
||||
try:
|
||||
queue_length = self.redis_client.llen('celery')
|
||||
if queue_length and queue_length > 0:
|
||||
# Scan for any scraper tasks in the immediate queue
|
||||
scraper_tasks = []
|
||||
for i in range(queue_length):
|
||||
try:
|
||||
task_data = self.redis_client.lindex('celery', i)
|
||||
if task_data and any(pattern in str(task_data) for pattern in scraper_patterns):
|
||||
scraper_tasks.append(task_data)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Remove scraper tasks from celery queue
|
||||
for task_data in scraper_tasks:
|
||||
try:
|
||||
removed_count = self.redis_client.lrem('celery', 0, task_data)
|
||||
celery_cleared += removed_count
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
cleared_count += celery_cleared
|
||||
|
||||
if celery_cleared > 0:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="clear_celery_tasks",
|
||||
status="success",
|
||||
description=f"Cleared {celery_cleared} scraper tasks from 'celery' queue"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"✅ Cleared {celery_cleared} scraper tasks from 'celery' queue")
|
||||
|
||||
except Exception as e:
|
||||
try:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error checking 'celery' queue: {str(e)}",
|
||||
source="ScraperManager._clear_delayed_tasks_from_redis"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"❌ Error checking 'celery' queue: {str(e)}")
|
||||
|
||||
# Summary
|
||||
# Summary logging
|
||||
if cleared_count > 0:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="clear_delayed_tasks_complete",
|
||||
action="clear_delayed_tasks_complete_apscheduler",
|
||||
status="success",
|
||||
description=f"Total delayed scraper tasks cleared from Redis: {cleared_count} (unacked: {unacked_index_cleared}, celery: {celery_cleared})"
|
||||
description=f"Total delayed scraper tasks cleared from APScheduler: {cleared_count}"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"✅ Total delayed scraper tasks cleared from Redis: {cleared_count} (unacked: {unacked_index_cleared}, celery: {celery_cleared})")
|
||||
print(f"✅ Total delayed scraper tasks cleared from APScheduler: {cleared_count}")
|
||||
else:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="clear_delayed_tasks_complete",
|
||||
action="clear_delayed_tasks_complete_apscheduler",
|
||||
status="info",
|
||||
description="No delayed scraper tasks found to clear in Redis"
|
||||
description="No delayed scraper tasks found to clear in APScheduler"
|
||||
)
|
||||
except RuntimeError:
|
||||
print("ℹ️ No delayed scraper tasks found to clear in Redis")
|
||||
print("ℹ️ No delayed scraper tasks found to clear in APScheduler")
|
||||
|
||||
return cleared_count
|
||||
|
||||
except Exception as e:
|
||||
try:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Failed to clear delayed tasks from Redis: {str(e)}",
|
||||
source="ScraperManager._clear_delayed_tasks_from_redis"
|
||||
error_message=f"Failed to clear delayed tasks from APScheduler: {str(e)}",
|
||||
source="ScraperManager._clear_delayed_tasks_from_apscheduler"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"❌ Failed to clear delayed tasks from Redis: {str(e)}")
|
||||
print(f"❌ Failed to clear delayed tasks from APScheduler: {str(e)}")
|
||||
return 0
|
||||
|
||||
def start_scraper(self) -> Dict[str, str]:
|
||||
"""Start the scraper system."""
|
||||
"""Start the scraper system and immediately schedule papers for the current hour."""
|
||||
try:
|
||||
# Get current scraper
|
||||
self.current_scraper = get_scraper()
|
||||
@@ -270,13 +114,25 @@ class ScraperManager:
|
||||
|
||||
scraper_name = self.current_scraper.get_name()
|
||||
|
||||
# Immediately schedule papers for the remaining time in the current hour
|
||||
immediate_scheduled_count = self._schedule_papers_for_current_hour()
|
||||
|
||||
if immediate_scheduled_count > 0:
|
||||
ActivityLog.log_scraper_command(
|
||||
action="start_scraper",
|
||||
status="success",
|
||||
description=f"Started scraper: {scraper_name}. Use /trigger-immediate endpoint to immediately schedule papers instead of waiting for the next hourly boundary."
|
||||
description=f"Started scraper: {scraper_name}. Immediately scheduled {immediate_scheduled_count} papers for the remaining time in this hour."
|
||||
)
|
||||
|
||||
return {"status": "success", "message": "Scraper started successfully. Papers will be scheduled at the next hourly boundary, or use /trigger-immediate to schedule immediately."}
|
||||
return {"status": "success", "message": f"Scraper started successfully. Immediately scheduled {immediate_scheduled_count} papers for processing in the remaining time this hour."}
|
||||
else:
|
||||
ActivityLog.log_scraper_command(
|
||||
action="start_scraper",
|
||||
status="success",
|
||||
description=f"Started scraper: {scraper_name}. No papers available for immediate scheduling in the current hour."
|
||||
)
|
||||
|
||||
return {"status": "success", "message": "Scraper started successfully. No papers available for immediate scheduling this hour."}
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
@@ -318,123 +174,29 @@ class ScraperManager:
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
def stop_scraper(self) -> Dict[str, str]:
|
||||
"""Stop the scraper, revoke all running tasks, and revert pending papers."""
|
||||
"""Stop the scraper, revoke all APScheduler jobs, and revert pending papers."""
|
||||
try:
|
||||
# First, revoke all running tasks
|
||||
revoked_count = 0
|
||||
delayed_cleared_count = 0
|
||||
# STEP 1: Immediately set scraper as inactive - this is critical for race condition prevention
|
||||
ScraperState.set_active(False)
|
||||
ScraperState.set_paused(False)
|
||||
|
||||
ActivityLog.log_scraper_command(
|
||||
action="stop_scraper_start",
|
||||
status="info",
|
||||
description="Beginning scraper stop process with task revocation and delayed task clearing"
|
||||
description="Scraper stop initiated - marked as inactive. Beginning APScheduler job revocation."
|
||||
)
|
||||
|
||||
try:
|
||||
# Get Celery inspector to check for running tasks
|
||||
i = celery.control.inspect()
|
||||
active = i.active() or {}
|
||||
scheduled = i.scheduled() or {}
|
||||
reserved = i.reserved() or {}
|
||||
# STEP 2: Brief pause to allow running jobs to see the inactive state
|
||||
import time
|
||||
time.sleep(0.2)
|
||||
|
||||
# Revoke active tasks
|
||||
for worker, tasks in active.items():
|
||||
for task in tasks:
|
||||
if 'id' in task:
|
||||
celery.control.revoke(task['id'], terminate=True)
|
||||
revoked_count += 1
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="revoke_task",
|
||||
status="success",
|
||||
description=f"Revoked active task: {task.get('name', 'unknown')} (ID: {task['id']})"
|
||||
)
|
||||
# STEP 3: Revoke all APScheduler jobs
|
||||
delayed_cleared_count = self._clear_delayed_tasks_from_apscheduler()
|
||||
|
||||
# Revoke scheduled tasks
|
||||
for worker, tasks in scheduled.items():
|
||||
for task in tasks:
|
||||
if 'id' in task:
|
||||
celery.control.revoke(task['id'], terminate=True)
|
||||
revoked_count += 1
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="revoke_task",
|
||||
status="success",
|
||||
description=f"Revoked scheduled task: {task.get('name', 'unknown')} (ID: {task['id']})"
|
||||
)
|
||||
# STEP 4: Wait a bit for any remaining jobs to finish their checks and exit
|
||||
time.sleep(1.0)
|
||||
|
||||
# Revoke reserved tasks
|
||||
for worker, tasks in reserved.items():
|
||||
for task in tasks:
|
||||
if 'id' in task:
|
||||
celery.control.revoke(task['id'], terminate=True)
|
||||
revoked_count += 1
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="revoke_task",
|
||||
status="success",
|
||||
description=f"Revoked reserved task: {task.get('name', 'unknown')} (ID: {task['id']})"
|
||||
)
|
||||
|
||||
# Purge all task queues
|
||||
celery.control.purge()
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="purge_queues",
|
||||
status="success",
|
||||
description="Purged all task queues"
|
||||
)
|
||||
|
||||
# **NEW: Clear delayed tasks from Redis sorted sets**
|
||||
delayed_cleared_count = self._clear_delayed_tasks_from_redis()
|
||||
|
||||
# Additional cleanup: revoke any remaining scraper-related tasks by name pattern
|
||||
try:
|
||||
# Use broadcast to revoke tasks that match scraper patterns
|
||||
scraper_task_patterns = [
|
||||
'process_single_paper',
|
||||
'process_papers_batch',
|
||||
'hourly_scraper_scheduler'
|
||||
]
|
||||
|
||||
# Get a fresh inspection of tasks after purge
|
||||
fresh_inspect = celery.control.inspect()
|
||||
all_tasks = {}
|
||||
all_tasks.update(fresh_inspect.active() or {})
|
||||
all_tasks.update(fresh_inspect.scheduled() or {})
|
||||
all_tasks.update(fresh_inspect.reserved() or {})
|
||||
|
||||
additional_revoked = 0
|
||||
for worker, tasks in all_tasks.items():
|
||||
for task in tasks:
|
||||
task_name = task.get('name', '')
|
||||
task_id = task.get('id', '')
|
||||
if any(pattern in task_name for pattern in scraper_task_patterns) and task_id:
|
||||
celery.control.revoke(task_id, terminate=True)
|
||||
additional_revoked += 1
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="revoke_scraper_task",
|
||||
status="success",
|
||||
description=f"Revoked lingering scraper task: {task_name} (ID: {task_id})"
|
||||
)
|
||||
|
||||
if additional_revoked > 0:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="cleanup_scraper_tasks",
|
||||
status="success",
|
||||
description=f"Additional cleanup: revoked {additional_revoked} lingering scraper tasks"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error during additional scraper task cleanup: {str(e)}",
|
||||
source="ScraperManager.stop_scraper.cleanup"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error revoking tasks: {str(e)}",
|
||||
source="ScraperManager.stop_scraper"
|
||||
)
|
||||
# Continue with paper reversion even if task revocation fails
|
||||
|
||||
# Get current scraper to know what status to revert to
|
||||
# STEP 5: Revert papers from processing status
|
||||
scraper = get_scraper()
|
||||
input_statuses = scraper.get_input_statuses()
|
||||
|
||||
@@ -453,7 +215,7 @@ class ScraperManager:
|
||||
paper.status = paper.previous_status
|
||||
else:
|
||||
paper.status = revert_status
|
||||
paper.updated_at = datetime.utcnow()
|
||||
paper.updated_at = datetime.now(UTC)
|
||||
reverted_count += 1
|
||||
|
||||
db.session.commit()
|
||||
@@ -464,19 +226,15 @@ class ScraperManager:
|
||||
description=f"Reverted {reverted_count} papers from '{processing_status}' to previous status"
|
||||
)
|
||||
|
||||
# Deactivate scraper
|
||||
ScraperState.set_active(False)
|
||||
ScraperState.set_paused(False)
|
||||
|
||||
ActivityLog.log_scraper_command(
|
||||
action="stop_scraper",
|
||||
status="success",
|
||||
description=f"Scraper stopped. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers."
|
||||
description=f"Scraper stopped completely. Cleared {delayed_cleared_count} APScheduler jobs and reverted {reverted_count} papers."
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "success",
|
||||
"message": f"Scraper stopped. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers to previous status."
|
||||
"message": f"Scraper stopped. Cleared {delayed_cleared_count} APScheduler jobs and reverted {reverted_count} papers to previous status."
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
@@ -487,51 +245,16 @@ class ScraperManager:
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
def reset_scraper(self) -> Dict[str, str]:
|
||||
"""Reset scraper state, revoke all running tasks, and clear all processing statuses."""
|
||||
"""Reset scraper state, revoke all APScheduler jobs, and clear all processing statuses."""
|
||||
try:
|
||||
# First, revoke all running tasks (similar to stop_scraper)
|
||||
revoked_count = 0
|
||||
|
||||
ActivityLog.log_scraper_command(
|
||||
action="reset_scraper_start",
|
||||
status="info",
|
||||
description="Beginning scraper reset process with task revocation"
|
||||
description="Beginning scraper reset process with APScheduler job revocation"
|
||||
)
|
||||
|
||||
try:
|
||||
# Get Celery inspector to check for running tasks
|
||||
i = celery.control.inspect()
|
||||
active = i.active() or {}
|
||||
scheduled = i.scheduled() or {}
|
||||
reserved = i.reserved() or {}
|
||||
|
||||
# Revoke all tasks (active, scheduled, reserved)
|
||||
for queue_name, queue_tasks in [("active", active), ("scheduled", scheduled), ("reserved", reserved)]:
|
||||
for worker, tasks in queue_tasks.items():
|
||||
for task in tasks:
|
||||
if 'id' in task:
|
||||
celery.control.revoke(task['id'], terminate=True)
|
||||
revoked_count += 1
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="revoke_task",
|
||||
status="success",
|
||||
description=f"Revoked {queue_name} task: {task.get('name', 'unknown')} (ID: {task['id']})"
|
||||
)
|
||||
|
||||
# Purge all task queues
|
||||
celery.control.purge()
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="purge_queues",
|
||||
status="success",
|
||||
description="Purged all task queues during reset"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error revoking tasks during reset: {str(e)}",
|
||||
source="ScraperManager.reset_scraper"
|
||||
)
|
||||
# Continue with paper reversion even if task revocation fails
|
||||
# Clear all APScheduler jobs
|
||||
delayed_cleared_count = self._clear_delayed_tasks_from_apscheduler()
|
||||
|
||||
# Get current scraper configuration
|
||||
scraper = get_scraper()
|
||||
@@ -551,7 +274,7 @@ class ScraperManager:
|
||||
paper.status = paper.previous_status
|
||||
else:
|
||||
paper.status = revert_status
|
||||
paper.updated_at = datetime.utcnow()
|
||||
paper.updated_at = datetime.now(UTC)
|
||||
paper.error_msg = None # Clear any error messages
|
||||
reverted_count += 1
|
||||
|
||||
@@ -564,12 +287,12 @@ class ScraperManager:
|
||||
ActivityLog.log_scraper_command(
|
||||
action="reset_scraper",
|
||||
status="success",
|
||||
description=f"Scraper reset. Revoked {revoked_count} tasks and reverted {reverted_count} papers."
|
||||
description=f"Scraper reset. Cleared {delayed_cleared_count} APScheduler jobs and reverted {reverted_count} papers."
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "success",
|
||||
"message": f"Scraper reset. Revoked {revoked_count} tasks and reverted {reverted_count} papers to original status."
|
||||
"message": f"Scraper reset. Cleared {delayed_cleared_count} APScheduler jobs and reverted {reverted_count} papers to original status."
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
@@ -638,24 +361,52 @@ class ScraperManager:
|
||||
.limit(papers_needed)
|
||||
.all())
|
||||
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="select_papers",
|
||||
status="info",
|
||||
description=f"Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})"
|
||||
)
|
||||
except RuntimeError:
|
||||
# Outside application context - use print fallback
|
||||
print(f"📋 Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})")
|
||||
|
||||
return papers
|
||||
|
||||
except Exception as e:
|
||||
try:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error selecting papers: {str(e)}",
|
||||
source="ScraperManager.select_papers_for_processing"
|
||||
)
|
||||
except RuntimeError:
|
||||
# Outside application context - use print fallback
|
||||
print(f"❌ Error selecting papers: {str(e)}")
|
||||
return []
|
||||
|
||||
def process_paper(self, paper: PaperMetadata) -> Dict:
|
||||
"""Process a single paper using the current scraper."""
|
||||
try:
|
||||
# **RACE CONDITION FIX**: Double-check scraper state before proceeding
|
||||
scraper_state = ScraperState.get_current_state()
|
||||
if not scraper_state.is_active:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_paper",
|
||||
paper_id=paper.id,
|
||||
status="skipped",
|
||||
description="Skipped processing - scraper deactivated during task execution"
|
||||
)
|
||||
return {"paper_id": paper.id, "status": "skipped", "message": "Scraper not active"}
|
||||
|
||||
if scraper_state.is_paused:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_paper",
|
||||
paper_id=paper.id,
|
||||
status="skipped",
|
||||
description="Skipped processing - scraper paused during task execution"
|
||||
)
|
||||
return {"paper_id": paper.id, "status": "skipped", "message": "Scraper paused"}
|
||||
|
||||
scraper = get_scraper()
|
||||
output_statuses = scraper.get_output_statuses()
|
||||
|
||||
@@ -665,9 +416,25 @@ class ScraperManager:
|
||||
# Update paper status to processing
|
||||
paper.previous_status = previous_status
|
||||
paper.status = output_statuses["processing"]
|
||||
paper.updated_at = datetime.utcnow()
|
||||
paper.updated_at = datetime.now(UTC)
|
||||
db.session.commit()
|
||||
|
||||
# **ADDITIONAL RACE CONDITION CHECK**: Verify scraper is still active before expensive scraping operation
|
||||
scraper_state = ScraperState.get_current_state()
|
||||
if not scraper_state.is_active:
|
||||
# Scraper was deactivated after we marked paper as processing - revert and exit
|
||||
paper.status = previous_status
|
||||
paper.updated_at = datetime.now(UTC)
|
||||
db.session.commit()
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_paper",
|
||||
paper_id=paper.id,
|
||||
status="cancelled",
|
||||
description="Cancelled processing - scraper deactivated after paper marked as processing"
|
||||
)
|
||||
return {"paper_id": paper.id, "status": "cancelled", "message": "Scraper deactivated during processing"}
|
||||
|
||||
# Perform scraping
|
||||
result = scraper.scrape(paper.doi)
|
||||
|
||||
@@ -681,7 +448,7 @@ class ScraperManager:
|
||||
paper.status = output_statuses["failure"]
|
||||
paper.error_msg = result.message
|
||||
|
||||
paper.updated_at = datetime.utcnow()
|
||||
paper.updated_at = datetime.now(UTC)
|
||||
db.session.commit()
|
||||
|
||||
# Log result
|
||||
@@ -706,7 +473,7 @@ class ScraperManager:
|
||||
if input_statuses:
|
||||
paper.status = input_statuses[0]
|
||||
paper.error_msg = f"Processing error: {str(e)}"
|
||||
paper.updated_at = datetime.utcnow()
|
||||
paper.updated_at = datetime.now(UTC)
|
||||
db.session.commit()
|
||||
except:
|
||||
pass # Don't fail if reversion fails
|
||||
@@ -718,6 +485,91 @@ class ScraperManager:
|
||||
|
||||
return {"paper_id": paper.id, "status": "error", "message": str(e)}
|
||||
|
||||
def process_paper_manual(self, paper: PaperMetadata, scraper_name: Optional[str] = None) -> Dict:
|
||||
"""Process a single paper manually, bypassing scraper state checks."""
|
||||
try:
|
||||
# Get scraper configuration but skip state validation for manual processing
|
||||
if scraper_name:
|
||||
# Use the specified scraper
|
||||
import importlib
|
||||
from .base import BaseScraper
|
||||
try:
|
||||
module = importlib.import_module(f"scipaperloader.scrapers.{scraper_name}")
|
||||
scraper_cls = getattr(module, "Scraper")
|
||||
if not issubclass(scraper_cls, BaseScraper):
|
||||
raise TypeError(f"Scraper class in module '{scraper_name}' does not inherit from BaseScraper")
|
||||
scraper = scraper_cls()
|
||||
except (ImportError, AttributeError, TypeError) as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Failed to load specified scraper '{scraper_name}': {str(e)}. Falling back to system default.",
|
||||
source="ScraperManager.process_paper_manual"
|
||||
)
|
||||
scraper = get_scraper()
|
||||
else:
|
||||
# Use system default scraper
|
||||
scraper = get_scraper()
|
||||
|
||||
output_statuses = scraper.get_output_statuses()
|
||||
|
||||
# Store the previous status before changing it
|
||||
previous_status = paper.status
|
||||
|
||||
# Update paper status to processing
|
||||
paper.previous_status = previous_status
|
||||
paper.status = output_statuses["processing"]
|
||||
paper.updated_at = datetime.now(UTC)
|
||||
db.session.commit()
|
||||
|
||||
# Perform scraping (no state checks for manual processing)
|
||||
result = scraper.scrape(paper.doi)
|
||||
|
||||
# Update paper status based on result
|
||||
if result.status == "success":
|
||||
paper.status = output_statuses["success"]
|
||||
paper.error_msg = None
|
||||
if result.data and "file_path" in result.data:
|
||||
paper.file_path = result.data["file_path"]
|
||||
else:
|
||||
paper.status = output_statuses["failure"]
|
||||
paper.error_msg = result.message
|
||||
|
||||
paper.updated_at = datetime.now(UTC)
|
||||
db.session.commit()
|
||||
|
||||
# Log result
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_paper_manual",
|
||||
paper_id=paper.id,
|
||||
status=result.status,
|
||||
description=f"Manually processed {paper.doi}: {result.message}"
|
||||
)
|
||||
|
||||
return {
|
||||
"paper_id": paper.id,
|
||||
"status": result.status,
|
||||
"message": result.message,
|
||||
"duration": result.duration
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
# Revert paper status on error
|
||||
try:
|
||||
input_statuses = get_scraper().get_input_statuses()
|
||||
if input_statuses:
|
||||
paper.status = input_statuses[0]
|
||||
paper.error_msg = f"Manual processing error: {str(e)}"
|
||||
paper.updated_at = datetime.now(UTC)
|
||||
db.session.commit()
|
||||
except:
|
||||
pass # Don't fail if reversion fails
|
||||
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error manually processing paper {paper.id}: {str(e)}",
|
||||
source="ScraperManager.process_paper_manual"
|
||||
)
|
||||
|
||||
return {"paper_id": paper.id, "status": "error", "message": str(e)}
|
||||
|
||||
def get_status(self) -> Dict:
|
||||
"""Get current scraper status."""
|
||||
scraper_state = ScraperState.get_current_state()
|
||||
@@ -745,3 +597,119 @@ class ScraperManager:
|
||||
"processing_papers": processing_count,
|
||||
"current_hour_quota": self.get_current_hour_quota()
|
||||
}
|
||||
|
||||
def _schedule_papers_for_current_hour(self) -> int:
|
||||
"""Schedule papers for processing in the remaining time of the current hour.
|
||||
|
||||
Returns:
|
||||
int: Number of papers scheduled
|
||||
"""
|
||||
try:
|
||||
# Get papers that should be processed this hour
|
||||
papers = self.select_papers_for_processing()
|
||||
|
||||
if not papers:
|
||||
return 0
|
||||
|
||||
# Get raw APScheduler instance for direct job scheduling
|
||||
scheduler = self._get_raw_scheduler()
|
||||
if not scheduler:
|
||||
ActivityLog.log_error(
|
||||
error_message="Raw APScheduler not available for immediate paper scheduling",
|
||||
source="ScraperManager._schedule_papers_for_current_hour"
|
||||
)
|
||||
return 0
|
||||
|
||||
# Calculate remaining time in current hour
|
||||
current_time = datetime.now()
|
||||
next_hour = current_time.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)
|
||||
remaining_seconds = int((next_hour - current_time).total_seconds())
|
||||
|
||||
# Don't schedule if less than 2 minutes remaining
|
||||
if remaining_seconds < 120:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="start_scraper_immediate_scheduling",
|
||||
status="info",
|
||||
description=f"Skipping immediate scheduling - only {remaining_seconds} seconds remaining in current hour"
|
||||
)
|
||||
return 0
|
||||
|
||||
# Schedule papers at random times within the remaining time
|
||||
scheduled_count = 0
|
||||
scheduled_papers = []
|
||||
|
||||
for paper in papers:
|
||||
try:
|
||||
# Random delay between 1 second and remaining time minus 60 seconds buffer
|
||||
max_delay = max(1, remaining_seconds - 60)
|
||||
delay_seconds = random.randint(1, max_delay)
|
||||
run_time = current_time + timedelta(seconds=delay_seconds)
|
||||
|
||||
# Generate unique job ID
|
||||
import uuid
|
||||
job_id = f"startup_paper_{paper.id}_{int(current_time.timestamp())}_{uuid.uuid4().hex[:8]}"
|
||||
|
||||
# Schedule the job
|
||||
from ..scheduler import _process_single_paper
|
||||
scheduler.add_job(
|
||||
func=_process_single_paper,
|
||||
trigger='date',
|
||||
run_date=run_time,
|
||||
args=[paper.id],
|
||||
id=job_id,
|
||||
name=f"Startup Process Paper {paper.id}",
|
||||
replace_existing=True
|
||||
)
|
||||
|
||||
scheduled_count += 1
|
||||
|
||||
# Collect paper info for logging
|
||||
paper_info = {
|
||||
"paper_id": paper.id,
|
||||
"paper_doi": paper.doi,
|
||||
"job_id": job_id,
|
||||
"scheduled_time": run_time.isoformat(),
|
||||
"delay_seconds": delay_seconds
|
||||
}
|
||||
scheduled_papers.append(paper_info)
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Failed to schedule paper {paper.id} during startup: {str(e)}",
|
||||
source="ScraperManager._schedule_papers_for_current_hour"
|
||||
)
|
||||
|
||||
# Create single comprehensive log entry
|
||||
if scheduled_papers:
|
||||
try:
|
||||
import json
|
||||
scheduling_data = {
|
||||
"total_scheduled": scheduled_count,
|
||||
"scheduled_papers": scheduled_papers,
|
||||
"timestamp": current_time.isoformat(),
|
||||
"remaining_time_seconds": remaining_seconds,
|
||||
"trigger": "startup_immediate_scheduling"
|
||||
}
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="startup_immediate_scheduling",
|
||||
status="success",
|
||||
description=f"Scheduled {scheduled_count} papers for immediate processing during startup for remaining {remaining_seconds}s in current hour. See extra_data for details.",
|
||||
**{"scheduling_details": json.dumps(scheduling_data)}
|
||||
)
|
||||
except Exception:
|
||||
# Fallback to simple logging
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="startup_immediate_scheduling",
|
||||
status="success",
|
||||
description=f"Scheduled {scheduled_count} papers for immediate processing during startup"
|
||||
)
|
||||
|
||||
return scheduled_count
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error in startup immediate scheduling: {str(e)}",
|
||||
source="ScraperManager._schedule_papers_for_current_hour"
|
||||
)
|
||||
return 0
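The immediate scheduling above boils down to one-off APScheduler "date" jobs. A minimal standalone sketch of that call pattern, assuming only the apscheduler package (the job function, paper IDs, and delay bounds here are placeholders):

import random
from datetime import datetime, timedelta
from apscheduler.schedulers.background import BackgroundScheduler

def process_paper_stub(paper_id: int) -> None:
    # Placeholder for the real _process_single_paper job function.
    print(f"processing paper {paper_id}")

scheduler = BackgroundScheduler()
scheduler.start()

now = datetime.now()
for paper_id in (1, 2, 3):  # placeholder IDs
    delay = random.randint(1, 300)  # the real code caps this by the time left in the hour
    scheduler.add_job(
        func=process_paper_stub,
        trigger="date",                              # one-off execution
        run_date=now + timedelta(seconds=delay),
        args=[paper_id],
        id=f"paper_process_{paper_id}_{int(now.timestamp())}",
        replace_existing=True,
    )

scheduler.print_jobs()
# In the app the scheduler lives for the whole Flask process; a short script
# would have to block (e.g. time.sleep) before the jobs fire.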
282 scipaperloader/scrapers/publisher_detector.py (new file)
@@ -0,0 +1,282 @@
|
||||
import time
|
||||
import requests
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from .base import BaseScraper, ScrapeResult
|
||||
from flask import current_app
|
||||
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
|
||||
from ..db import db
|
||||
|
||||
class Scraper(BaseScraper):
|
||||
"""Publisher detection scraper that identifies the publisher from the final URL after DOI redirect."""
|
||||
|
||||
# This scraper processes "New" papers and outputs "PublisherDetected"/"Failed"
|
||||
INPUT_STATUSES = ["New"]
|
||||
OUTPUT_STATUS_SUCCESS = "PublisherDetected"
|
||||
OUTPUT_STATUS_FAILURE = "Failed"
|
||||
OUTPUT_STATUS_PROCESSING = "DetectingPublisher"
|
||||
|
||||
# Publisher detection patterns based on URL domains and paths
|
||||
PUBLISHER_URL_PATTERNS = {
|
||||
'elsevier': [
|
||||
r'sciencedirect\.com',
|
||||
r'elsevier\.com',
|
||||
r'.*\.elsevier\.com'
|
||||
],
|
||||
'springer': [
|
||||
r'link\.springer\.com',
|
||||
r'springer\.com',
|
||||
r'.*\.springer\.com'
|
||||
],
|
||||
'wiley': [
|
||||
r'onlinelibrary\.wiley\.com',
|
||||
r'wiley\.com',
|
||||
r'.*\.wiley\.com'
|
||||
],
|
||||
'ieee': [
|
||||
r'ieeexplore\.ieee\.org',
|
||||
r'ieee\.org',
|
||||
r'.*\.ieee\.org'
|
||||
],
|
||||
'plos': [
|
||||
r'journals\.plos\.org',
|
||||
r'plos\.org',
|
||||
r'.*\.plos\.org'
|
||||
],
|
||||
'nature': [
|
||||
r'nature\.com',
|
||||
r'.*\.nature\.com'
|
||||
],
|
||||
'sage': [
|
||||
r'journals\.sagepub\.com',
|
||||
r'sagepub\.com',
|
||||
r'.*\.sagepub\.com'
|
||||
],
|
||||
'taylor_francis': [
|
||||
r'tandfonline\.com',
|
||||
r'.*\.tandfonline\.com'
|
||||
],
|
||||
'acs': [
|
||||
r'pubs\.acs\.org',
|
||||
r'acs\.org',
|
||||
r'.*\.acs\.org'
|
||||
],
|
||||
'arxiv': [
|
||||
r'arxiv\.org',
|
||||
r'export\.arxiv\.org'
|
||||
],
|
||||
'pubmed': [
|
||||
r'pubmed\.ncbi\.nlm\.nih\.gov',
|
||||
r'ncbi\.nlm\.nih\.gov'
|
||||
],
|
||||
'oxford': [
|
||||
r'academic\.oup\.com',
|
||||
r'oup\.com',
|
||||
r'.*\.oup\.com'
|
||||
],
|
||||
'cambridge': [
|
||||
r'cambridge\.org',
|
||||
r'.*\.cambridge\.org'
|
||||
],
|
||||
'biorxiv': [
|
||||
r'biorxiv\.org',
|
||||
r'.*\.biorxiv\.org'
|
||||
],
|
||||
'researchgate': [
|
||||
r'researchgate\.net',
|
||||
r'.*\.researchgate\.net'
|
||||
]
|
||||
}
|
||||
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""Detect publisher from the final URL after DOI redirect."""
|
||||
start_time = time.time()
|
||||
|
||||
paper = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
if not paper:
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=f"No paper found for DOI {doi}",
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Update status to processing
|
||||
paper.status = self.OUTPUT_STATUS_PROCESSING
|
||||
db.session.commit()
|
||||
|
||||
try:
|
||||
# Get the final URL by following the DOI redirect
|
||||
final_url = self._get_final_url(doi)
|
||||
|
||||
if not final_url:
|
||||
error_msg = f"Could not resolve DOI {doi} to a URL"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "doi_resolution_failed"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Detect publisher from URL
|
||||
detected_publisher = self._detect_publisher_from_url(final_url)
|
||||
|
||||
if detected_publisher:
|
||||
# Update paper with detected publisher
|
||||
paper.publisher = detected_publisher
|
||||
paper.status = self.OUTPUT_STATUS_SUCCESS
|
||||
paper.error_msg = None
|
||||
db.session.commit()
|
||||
|
||||
success_msg = f"Publisher '{detected_publisher}' detected from URL: {final_url}"
|
||||
self.log_scrape_success(doi, success_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="success",
|
||||
message=success_msg,
|
||||
data={
|
||||
"publisher": detected_publisher,
|
||||
"final_url": final_url
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
else:
|
||||
error_msg = f"Could not detect publisher from URL: {final_url}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={
|
||||
"final_url": final_url,
|
||||
"error_code": "publisher_not_detected"
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Error detecting publisher for DOI {doi}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "publisher_detection_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
def _get_final_url(self, doi: str) -> Optional[str]:
|
||||
"""
|
||||
Get the final URL after following DOI redirects.
|
||||
|
||||
Args:
|
||||
doi: The DOI to resolve
|
||||
|
||||
Returns:
|
||||
Final URL after redirects, or None if resolution fails
|
||||
"""
|
||||
try:
|
||||
doi_url = f"https://doi.org/{doi}"
|
||||
headers = {
|
||||
'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
|
||||
}
|
||||
|
||||
# Make a HEAD request to get the final URL without downloading content
|
||||
response = requests.head(
|
||||
doi_url,
|
||||
headers=headers,
|
||||
timeout=15,
|
||||
allow_redirects=True
|
||||
)
|
||||
|
||||
# If HEAD is not allowed, try GET but with minimal content
|
||||
if response.status_code == 405: # Method Not Allowed
|
||||
response = requests.get(
|
||||
doi_url,
|
||||
headers=headers,
|
||||
timeout=15,
|
||||
allow_redirects=True,
|
||||
stream=True # Don't download the full content
|
||||
)
|
||||
response.close() # Close connection after getting headers
|
||||
|
||||
if response.status_code in [200, 302, 301]:
|
||||
return response.url
|
||||
else:
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
# Log error but don't raise - we'll handle this gracefully
|
||||
return None
|
||||
|
||||
def _detect_publisher_from_url(self, url: str) -> Optional[str]:
|
||||
"""
|
||||
Detect publisher from URL using domain patterns.
|
||||
|
||||
Args:
|
||||
url: The URL to analyze
|
||||
|
||||
Returns:
|
||||
Publisher name if detected, None otherwise
|
||||
"""
|
||||
if not url:
|
||||
return None
|
||||
|
||||
# Parse the URL to get the domain
|
||||
parsed_url = urlparse(url)
|
||||
domain = parsed_url.netloc.lower()
|
||||
|
||||
# Remove 'www.' prefix if present
|
||||
if domain.startswith('www.'):
|
||||
domain = domain[4:]
|
||||
|
||||
# Score each publisher based on URL pattern matches
|
||||
publisher_scores = {}
|
||||
|
||||
for publisher, patterns in self.PUBLISHER_URL_PATTERNS.items():
|
||||
score = 0
|
||||
for pattern in patterns:
|
||||
if re.search(pattern, domain, re.IGNORECASE):
|
||||
score += 10 # Strong match for domain patterns
|
||||
|
||||
# Also check the full URL for path-based patterns
|
||||
if re.search(pattern, url.lower(), re.IGNORECASE):
|
||||
score += 5
|
||||
|
||||
if score > 0:
|
||||
publisher_scores[publisher] = score
|
||||
|
||||
# Return the publisher with the highest score
|
||||
if publisher_scores:
|
||||
best_publisher = max(publisher_scores.keys(), key=lambda x: publisher_scores[x])
|
||||
|
||||
# Only return if we have a reasonable confidence (score > 5)
|
||||
if publisher_scores[best_publisher] > 5:
|
||||
return best_publisher
|
||||
|
||||
return None
|
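To illustrate how the URL-pattern scoring above resolves a publisher, here is a trimmed, standalone sketch (patterns copied from PUBLISHER_URL_PATTERNS; the article URL is made up):

import re
from urllib.parse import urlparse

patterns = {
    "elsevier": [r"sciencedirect\.com", r"elsevier\.com", r".*\.elsevier\.com"],
    "springer": [r"link\.springer\.com", r"springer\.com", r".*\.springer\.com"],
}

url = "https://www.sciencedirect.com/science/article/pii/S0000000000000000"  # example only
domain = urlparse(url).netloc.lower()
if domain.startswith("www."):
    domain = domain[4:]

scores = {}
for publisher, pats in patterns.items():
    score = sum(10 for p in pats if re.search(p, domain, re.IGNORECASE))     # strong domain match
    score += sum(5 for p in pats if re.search(p, url.lower(), re.IGNORECASE))  # weaker full-URL match
    if score:
        scores[publisher] = score

best = max(scores, key=lambda k: scores[k]) if scores else None
print(best, scores)  # -> elsevier {'elsevier': 15}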
@@ -1,18 +1,17 @@
|
||||
"""
|
||||
Hourly scheduler task that processes papers at random times within each hour.
|
||||
APScheduler-based task functions that replace Celery tasks for paper processing.
|
||||
"""
|
||||
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional
|
||||
from celery import shared_task
|
||||
from flask import current_app
|
||||
|
||||
from ..models import ScraperState, ActivityLog
|
||||
from ..models import ScraperState, ActivityLog, PaperMetadata
|
||||
from .manager import ScraperManager
|
||||
|
||||
|
||||
@shared_task(bind=True)
|
||||
def hourly_scraper_scheduler(self):
|
||||
def hourly_scraper_scheduler():
|
||||
"""
|
||||
Hourly task that schedules paper processing at random times within the hour.
|
||||
|
||||
@@ -29,8 +28,6 @@ def hourly_scraper_scheduler(self):
|
||||
status="info",
|
||||
description="Hourly scheduler skipped - scraper not active"
|
||||
)
|
||||
# Disable retries for inactive scheduler
|
||||
self.retry = False
|
||||
return {"status": "inactive", "papers_scheduled": 0}
|
||||
|
||||
if scraper_state.is_paused:
|
||||
@@ -39,8 +36,6 @@ def hourly_scraper_scheduler(self):
|
||||
status="info",
|
||||
description="Hourly scheduler skipped - scraper paused"
|
||||
)
|
||||
# Disable retries for paused scheduler
|
||||
self.retry = False
|
||||
return {"status": "paused", "papers_scheduled": 0}
|
||||
|
||||
# Initialize scraper manager
|
||||
@@ -57,6 +52,15 @@ def hourly_scraper_scheduler(self):
|
||||
)
|
||||
return {"status": "empty", "papers_scheduled": 0}
|
||||
|
||||
# Get scheduler from Flask app config
|
||||
scheduler = current_app.config.get('SCHEDULER')
|
||||
if not scheduler:
|
||||
ActivityLog.log_error(
|
||||
error_message="APScheduler not available for paper scheduling",
|
||||
source="hourly_scraper_scheduler"
|
||||
)
|
||||
return {"status": "error", "message": "APScheduler not available"}
|
||||
|
||||
# Schedule papers at random times within the hour (0-3600 seconds)
|
||||
scheduled_count = 0
|
||||
current_time = datetime.now()
|
||||
@@ -64,24 +68,27 @@ def hourly_scraper_scheduler(self):
|
||||
for paper in papers:
|
||||
# Random delay between 1 second and 58 minutes
|
||||
delay_seconds = random.randint(1, 3480) # Up to 58 minutes
|
||||
run_date = current_time + timedelta(seconds=delay_seconds)
|
||||
|
||||
# Schedule the task using Celery's task registry to avoid circular import issues
|
||||
from ..celery import celery
|
||||
celery.send_task(
|
||||
'scipaperloader.scrapers.tasks.process_single_paper',
|
||||
# Schedule the task using APScheduler
|
||||
job_id = f"paper_process_{paper.id}_{int(current_time.timestamp())}"
|
||||
scheduler.add_job(
|
||||
func=process_single_paper,
|
||||
trigger='date',
|
||||
run_date=run_date,
|
||||
args=[paper.id],
|
||||
countdown=delay_seconds
|
||||
id=job_id,
|
||||
replace_existing=True
|
||||
)
|
||||
|
||||
scheduled_count += 1
|
||||
|
||||
# Log each scheduled paper
|
||||
schedule_time = current_time + timedelta(seconds=delay_seconds)
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="schedule_paper",
|
||||
paper_id=paper.id,
|
||||
status="info",
|
||||
description=f"Scheduled paper {paper.doi} for processing at {schedule_time.strftime('%H:%M:%S')}"
|
||||
description=f"Scheduled paper {paper.doi} for processing at {run_date.strftime('%H:%M:%S')}"
|
||||
)
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
@@ -100,8 +107,7 @@ def hourly_scraper_scheduler(self):
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
|
||||
@shared_task(bind=True)
|
||||
def process_single_paper(self, paper_id: int):
|
||||
def process_single_paper(paper_id: int):
|
||||
"""
|
||||
Process a single paper. This task is scheduled at random times within each hour.
|
||||
|
||||
@@ -109,17 +115,17 @@ def process_single_paper(self, paper_id: int):
|
||||
paper_id: ID of the paper to process
|
||||
"""
|
||||
try:
|
||||
# Double-check scraper state before processing
|
||||
# ENHANCED RACE CONDITION PROTECTION: Check scraper state multiple times
|
||||
|
||||
# Initial check before any processing
|
||||
scraper_state = ScraperState.get_current_state()
|
||||
if not scraper_state.is_active:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_single_paper",
|
||||
paper_id=paper_id,
|
||||
status="skipped",
|
||||
description="Skipped processing - scraper not active"
|
||||
description="Task skipped - scraper not active (initial check)"
|
||||
)
|
||||
# Use Celery's ignore to mark this task as completed without error
|
||||
self.retry = False
|
||||
return {"status": "inactive", "paper_id": paper_id}
|
||||
|
||||
if scraper_state.is_paused:
|
||||
@@ -127,18 +133,50 @@ def process_single_paper(self, paper_id: int):
|
||||
action="process_single_paper",
|
||||
paper_id=paper_id,
|
||||
status="skipped",
|
||||
description="Skipped processing - scraper paused"
|
||||
description="Task skipped - scraper paused (initial check)"
|
||||
)
|
||||
return {"status": "paused", "paper_id": paper_id}
|
||||
|
||||
# Brief pause to allow stop commands to take effect
|
||||
import time
|
||||
time.sleep(0.1)
|
||||
|
||||
# Second check after brief delay
|
||||
scraper_state = ScraperState.get_current_state()
|
||||
if not scraper_state.is_active:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_single_paper",
|
||||
paper_id=paper_id,
|
||||
status="skipped",
|
||||
description="Task skipped - scraper not active (secondary check)"
|
||||
)
|
||||
return {"status": "inactive", "paper_id": paper_id}
|
||||
|
||||
if scraper_state.is_paused:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_single_paper",
|
||||
paper_id=paper_id,
|
||||
status="skipped",
|
||||
description="Task skipped - scraper paused (secondary check)"
|
||||
)
|
||||
# Use Celery's ignore for paused state too
|
||||
self.retry = False
|
||||
return {"status": "paused", "paper_id": paper_id}
|
||||
|
||||
# Get the paper
|
||||
from ..models import PaperMetadata
|
||||
paper = PaperMetadata.query.get(paper_id)
|
||||
if not paper:
|
||||
return {"status": "error", "message": f"Paper {paper_id} not found"}
|
||||
|
||||
# Third check before starting actual processing
|
||||
scraper_state = ScraperState.get_current_state()
|
||||
if not scraper_state.is_active:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_single_paper",
|
||||
paper_id=paper_id,
|
||||
status="skipped",
|
||||
description="Task skipped - scraper not active (pre-processing check)"
|
||||
)
|
||||
return {"status": "inactive", "paper_id": paper_id}
|
||||
|
||||
# Process the paper using scraper manager
|
||||
manager = ScraperManager()
|
||||
result = manager.process_paper(paper)
|
||||
@@ -153,8 +191,48 @@ def process_single_paper(self, paper_id: int):
|
||||
return {"status": "error", "paper_id": paper_id, "message": str(e)}
|
||||
|
||||
|
||||
@shared_task(bind=True)
|
||||
def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] = None):
|
||||
def process_single_paper_manual(paper_id: int, scraper_name: Optional[str] = None):
|
||||
"""
|
||||
Process a single paper manually, bypassing scraper state checks.
|
||||
Used for manual paper processing from the UI.
|
||||
|
||||
Args:
|
||||
paper_id: ID of the paper to process
|
||||
scraper_name: Optional specific scraper module to use
|
||||
"""
|
||||
try:
|
||||
# Get the paper without checking scraper state
|
||||
paper = PaperMetadata.query.get(paper_id)
|
||||
if not paper:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Paper {paper_id} not found for manual processing",
|
||||
source="process_single_paper_manual"
|
||||
)
|
||||
return {"status": "error", "message": f"Paper {paper_id} not found"}
|
||||
|
||||
# Process the paper using the manual processing method (bypasses state checks)
|
||||
manager = ScraperManager()
|
||||
result = manager.process_paper_manual(paper, scraper_name=scraper_name)
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="manual_process_complete",
|
||||
paper_id=paper_id,
|
||||
status=result["status"],
|
||||
description=f"Manual processing completed for paper {paper.doi}" +
|
||||
(f" using scraper '{scraper_name}'" if scraper_name else " using system default scraper")
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error manually processing paper {paper_id}: {str(e)}",
|
||||
source="process_single_paper_manual"
|
||||
)
|
||||
return {"status": "error", "paper_id": paper_id, "message": str(e)}
|
||||
|
||||
|
||||
def process_papers_batch(paper_ids: list, scraper_module: Optional[str] = None):
|
||||
"""
|
||||
Process multiple papers in a batch for immediate processing.
|
||||
|
||||
@@ -167,7 +245,6 @@ def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] =
|
||||
manager = ScraperManager()
|
||||
|
||||
for paper_id in paper_ids:
|
||||
from ..models import PaperMetadata
|
||||
paper = PaperMetadata.query.get(paper_id)
|
||||
if paper:
|
||||
result = manager.process_paper(paper)
|
||||
237 scipaperloader/scrapers/text_extractor.py (new file)
@@ -0,0 +1,237 @@
|
||||
import time
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from .base import BaseScraper, ScrapeResult
|
||||
from flask import current_app
|
||||
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
|
||||
from ..db import db
|
||||
from ..parsers.base_parser import BaseParser, ParseError
|
||||
from ..parsers.elsevier_parser import ElsevierParser
|
||||
from ..parsers.arxiv_parser import ArxivParser
|
||||
|
||||
class Scraper(BaseScraper):
|
||||
"""Full text extraction scraper that uses publisher-specific parsers."""
|
||||
|
||||
# This scraper processes papers with HTML content and outputs "TextExtracted"/"Failed"
|
||||
INPUT_STATUSES = ["WebContentDownloaded", "PublisherDetected"]
|
||||
OUTPUT_STATUS_SUCCESS = "TextExtracted"
|
||||
OUTPUT_STATUS_FAILURE = "Failed"
|
||||
OUTPUT_STATUS_PROCESSING = "ExtractingText"
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# Registry of available parsers
|
||||
self.parsers = [
|
||||
ElsevierParser(),
|
||||
ArxivParser(),
|
||||
# Add more parsers here as you create them
|
||||
# SpringerParser(),
|
||||
# WileyParser(),
|
||||
# IEEEParser(),
|
||||
]
|
||||
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""Extract full text using appropriate publisher parser."""
|
||||
start_time = time.time()
|
||||
|
||||
paper = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
if not paper:
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=f"No paper found for DOI {doi}",
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Update status to processing
|
||||
paper.status = self.OUTPUT_STATUS_PROCESSING
|
||||
db.session.commit()
|
||||
|
||||
# Check if HTML file exists
|
||||
if not paper.file_path or not os.path.exists(paper.file_path):
|
||||
error_msg = f"HTML file not found for DOI {doi}. Expected at: {paper.file_path}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "html_file_not_found"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
try:
|
||||
# Read HTML content
|
||||
with open(paper.file_path, 'r', encoding='utf-8') as f:
|
||||
html_content = f.read()
|
||||
|
||||
# Find appropriate parser
|
||||
parser = self._select_parser(html_content)
|
||||
|
||||
if not parser:
|
||||
error_msg = f"No suitable parser found for DOI {doi}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "no_parser_available"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Parse content
|
||||
parsed_content = parser.parse(html_content, doi)
|
||||
|
||||
# Validate parsed content
|
||||
if not parser.validate_content(parsed_content):
|
||||
error_msg = f"Parsed content validation failed for DOI {doi}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "content_validation_failed"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Save extracted text to file
|
||||
text_file_path = self._save_extracted_text(parsed_content, doi)
|
||||
|
||||
# Update paper status to success
|
||||
paper.status = self.OUTPUT_STATUS_SUCCESS
|
||||
paper.error_msg = None
|
||||
# You might want to add a text_file_path field to store the text file location
|
||||
# paper.text_file_path = text_file_path
|
||||
db.session.commit()
|
||||
|
||||
success_msg = f"Successfully extracted text using {parser.get_name()} parser"
|
||||
self.log_scrape_success(doi, success_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="success",
|
||||
message=f"Successfully extracted full text for {doi}",
|
||||
data={
|
||||
"text_file_path": text_file_path,
|
||||
"parser_used": parser.get_name(),
|
||||
"title": parsed_content.title,
|
||||
"word_count": len(parsed_content.full_text.split()),
|
||||
"has_abstract": bool(parsed_content.abstract),
|
||||
"has_sections": bool(parsed_content.sections),
|
||||
"author_count": len(parsed_content.authors) if parsed_content.authors else 0,
|
||||
"keyword_count": len(parsed_content.keywords) if parsed_content.keywords else 0,
|
||||
"reference_count": len(parsed_content.references) if parsed_content.references else 0
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except ParseError as e:
|
||||
error_msg = f"Parser error for DOI {doi}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "parser_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Unexpected error extracting text for DOI {doi}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "extraction_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
def _select_parser(self, html_content: str) -> Optional[BaseParser]:
|
||||
"""
|
||||
Select the most appropriate parser for the HTML content.
|
||||
|
||||
Args:
|
||||
html_content: The HTML content to analyze
|
||||
|
||||
Returns:
|
||||
The best parser for this content, or None if no parser can handle it
|
||||
"""
|
||||
for parser in self.parsers:
|
||||
if parser.can_parse(html_content):
|
||||
return parser
|
||||
|
||||
return None
|
||||
|
||||
def _save_extracted_text(self, parsed_content, doi: str) -> str:
|
||||
"""
|
||||
Save extracted text to a file.
|
||||
|
||||
Args:
|
||||
parsed_content: The parsed content object
|
||||
doi: The DOI of the paper
|
||||
|
||||
Returns:
|
||||
Path to the saved text file
|
||||
"""
|
||||
download_path = DownloadPathConfig.get_path()
|
||||
text_file_name = f"{doi.replace('/', '_')}_fulltext.txt"
|
||||
text_file_path = os.path.join(download_path, text_file_name)
|
||||
|
||||
with open(text_file_path, 'w', encoding='utf-8') as f:
|
||||
# Write structured content
|
||||
f.write(f"DOI: {parsed_content.doi or doi}\n")
|
||||
f.write(f"Title: {parsed_content.title or 'Unknown'}\n")
|
||||
f.write(f"Journal: {parsed_content.journal or 'Unknown'}\n")
|
||||
f.write(f"Publication Date: {parsed_content.publication_date or 'Unknown'}\n")
|
||||
|
||||
if parsed_content.authors:
|
||||
f.write(f"Authors: {', '.join(parsed_content.authors)}\n")
|
||||
|
||||
if parsed_content.keywords:
|
||||
f.write(f"Keywords: {', '.join(parsed_content.keywords)}\n")
|
||||
|
||||
f.write(f"Extracted: {datetime.utcnow().isoformat()}\n")
|
||||
f.write("=" * 80 + "\n\n")
|
||||
|
||||
# Write full text
|
||||
f.write(parsed_content.full_text)
|
||||
|
||||
# Optionally write references at the end
|
||||
if parsed_content.references:
|
||||
f.write("\n\n" + "=" * 80 + "\n")
|
||||
f.write("REFERENCES\n")
|
||||
f.write("=" * 80 + "\n")
|
||||
for i, ref in enumerate(parsed_content.references, 1):
|
||||
f.write(f"{i}. {ref}\n")
|
||||
|
||||
return text_file_path
|
201
scipaperloader/scrapers/web_fetcher.py
Normal file
@ -0,0 +1,201 @@
|
||||
import time
|
||||
import os
|
||||
import requests
|
||||
from urllib.parse import urlparse
|
||||
from datetime import datetime
|
||||
from .base import BaseScraper, ScrapeResult
|
||||
from flask import current_app
|
||||
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
|
||||
from ..db import db
|
||||
|
||||
class Scraper(BaseScraper):
|
||||
"""Web fetcher scraper that downloads HTML content from DOI URLs."""
|
||||
|
||||
# This scraper processes "New" papers and outputs "WebContentDownloaded"/"Failed"
|
||||
INPUT_STATUSES = ["New"]
|
||||
OUTPUT_STATUS_SUCCESS = "WebContentDownloaded"
|
||||
OUTPUT_STATUS_FAILURE = "Failed"
|
||||
OUTPUT_STATUS_PROCESSING = "FetchingWebContent"
|
||||
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""Fetch HTML content from DOI and save to download path."""
|
||||
start_time = time.time()
|
||||
|
||||
paper = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
if not paper:
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=f"No paper found for DOI {doi}",
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log start of scraping
|
||||
self.log_scrape_start(doi, paper.id)
|
||||
|
||||
# Update status to processing
|
||||
paper.status = self.OUTPUT_STATUS_PROCESSING
|
||||
db.session.commit()
|
||||
|
||||
# Prepare file paths
|
||||
download_path = DownloadPathConfig.get_path()
|
||||
file_name = f"{doi.replace('/', '_')}.html"
|
||||
file_path = os.path.join(download_path, file_name)
|
||||
|
||||
# Check/create download directory
|
||||
if not os.path.exists(download_path):
|
||||
try:
|
||||
os.makedirs(download_path, exist_ok=True)
|
||||
except OSError as e:
|
||||
error_msg = f"Failed to create download directory: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "path_creation_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Check path permissions
|
||||
if not os.access(download_path, os.W_OK):
|
||||
error_msg = f"Download path '{download_path}' is not writable"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "path_write_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
try:
|
||||
# Fetch HTML from DOI
|
||||
doi_url = f"https://doi.org/{doi}"
|
||||
headers = {
|
||||
'User-Agent': 'SciPaperLoader/1.0 (Academic Research Tool)',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'Accept-Encoding': 'gzip, deflate',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1'
|
||||
}
|
||||
|
||||
response = requests.get(
|
||||
doi_url,
|
||||
headers=headers,
|
||||
timeout=30,
|
||||
allow_redirects=True,
|
||||
verify=True
|
||||
)
|
||||
|
||||
# Check for invalid DOI (404) or other HTTP errors
|
||||
if response.status_code == 404:
|
||||
error_msg = f"Invalid DOI: {doi} not found (404)"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "invalid_doi"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Check for other HTTP errors
|
||||
response.raise_for_status()
|
||||
|
||||
# Save HTML content
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
f.write(response.text)
|
||||
|
||||
# Extract final URL after redirects (for publisher detection)
|
||||
final_url = response.url
|
||||
|
||||
# Update paper status to success
|
||||
paper.status = self.OUTPUT_STATUS_SUCCESS
|
||||
paper.file_path = file_path
|
||||
paper.error_msg = None
|
||||
db.session.commit()
|
||||
|
||||
# Log success
|
||||
success_msg = f"Successfully fetched HTML content for {doi} from {final_url}"
|
||||
self.log_scrape_success(doi, success_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="success",
|
||||
message=f"Successfully fetched HTML for {doi}",
|
||||
data={
|
||||
"file_path": file_path,
|
||||
"final_url": final_url,
|
||||
"content_length": len(response.text),
|
||||
"content_type": response.headers.get('content-type', 'unknown'),
|
||||
"title": paper.title,
|
||||
"domain": urlparse(final_url).netloc if final_url else None
|
||||
},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except requests.exceptions.HTTPError as e:
|
||||
error_msg = f"HTTP error fetching {doi_url}: {e.response.status_code} - {e}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "http_error", "status_code": e.response.status_code},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
error_msg = f"Network error fetching {doi_url}: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "network_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Failed to save HTML file: {str(e)}"
|
||||
paper.status = self.OUTPUT_STATUS_FAILURE
|
||||
paper.error_msg = error_msg
|
||||
db.session.commit()
|
||||
|
||||
self.log_scrape_failure(doi, error_msg, paper.id)
|
||||
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data={"error_code": "file_creation_error"},
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
384
scipaperloader/static/js/README.md
Normal file
@ -0,0 +1,384 @@
|
||||
# JavaScript Modularization Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
The JavaScript code in the SciPaperLoader application has been modularized into reusable components to improve maintainability, reduce code duplication, and enable easier testing and updates.
|
||||
|
||||
## Modularization Task Completed
|
||||
|
||||
### Problem Statement
|
||||
The original codebase had ~800+ lines of inline JavaScript scattered across multiple Jinja templates with several critical issues:
|
||||
- **Code Duplication**: Similar functionality replicated across templates
|
||||
- **Maintenance Difficulty**: Changes required editing multiple template files
|
||||
- **Linter Issues**: Jinja template syntax mixed with JavaScript caused linting errors
|
||||
- **Testing Challenges**: Inline code was difficult to unit test
|
||||
- **Poor Separation of Concerns**: Template logic mixed with application logic
|
||||
|
||||
### Solution Implemented
|
||||
Successfully transformed the codebase by:
|
||||
|
||||
1. **Extracted 10 Modular JavaScript Files** (~800+ lines of code moved from templates)
|
||||
2. **Eliminated Code Duplication** by creating reusable components
|
||||
3. **Fixed Linter Compatibility** by separating template syntax from JavaScript logic
|
||||
4. **Implemented Clean Variable Passing** using JSON script tags instead of direct Jinja embedding
|
||||
5. **Created Class-Based Architecture** with proper inheritance and composition patterns
|
||||
6. **Established Inter-Component Communication** through callback systems
|
||||
7. **Added Comprehensive Error Handling** and loading states throughout
|
||||
|
||||
### Key Achievements
|
||||
- ✅ **5 templates modularized**: `scraper.html.jinja`, `papers.html.jinja`, `upload.html.jinja`, `logger.html.jinja`, `config/schedule.html.jinja`
|
||||
- ✅ **10 JavaScript modules created**: Covering all functionality from utilities to dashboard coordination
|
||||
- ✅ **Zero functionality loss**: All existing features preserved during modularization
|
||||
- ✅ **Improved maintainability**: Changes now require editing single module files
|
||||
- ✅ **Enhanced testability**: Individual modules can be unit tested
|
||||
- ✅ **Clean variable handling**: Jinja variables passed as JSON configuration instead of inline embedding
|
||||
|
||||
### Before vs After Example
|
||||
**Before (inline in template)**:
|
||||
```html
|
||||
<script>
|
||||
var maxVolume = {{ max_volume }}; // Linter error
|
||||
$('#start-scraper').click(function() {
|
||||
// 50+ lines of mixed template/JS code
|
||||
});
|
||||
</script>
|
||||
```
|
||||
|
||||
**After (modular)**:
|
||||
```html
|
||||
<script type="application/json" id="config-data">
|
||||
{"maxVolume": {{ max_volume|tojson }}}
|
||||
</script>
|
||||
<script src="{{ url_for('static', filename='js/scraper-control.js') }}"></script>
|
||||
<script>
|
||||
const config = JSON.parse(document.getElementById('config-data').textContent);
|
||||
new ScraperControl(config).init();
|
||||
</script>
|
||||
```
|
||||
|
||||
## Modular JavaScript Files
|
||||
|
||||
### 1. `/static/js/common.js`
|
||||
**Purpose**: Common utilities used across the application
|
||||
|
||||
**Key Functions**:
|
||||
- `showFlashMessage(message, type)` - Display flash messages to users
|
||||
- `createStatusBadge(status)` - Generate status badge HTML
|
||||
- `formatTimestamp(timestamp)` - Format timestamps for display
|
||||
- `truncateText(text, maxLength)` - Truncate text with ellipsis
|
||||
- `toggleButtonLoading(button, loading, loadingText)` - Handle button loading states
|
||||
- `apiRequest(url, options)` - Generic API request wrapper
|
||||
|
||||
**Used by**: All templates that need basic utilities
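
A minimal sketch of how these utilities combine in a template script (the endpoint, response shape, and button are illustrative, not part of the module):

```javascript
// Sketch only: endpoint and response shape are placeholders
async function refreshPapers(button) {
  toggleButtonLoading(button, true, "Refreshing...");
  try {
    const data = await apiRequest("/papers/api/list"); // hypothetical endpoint
    showFlashMessage(`Loaded ${data.papers.length} papers`, "success"); // response shape assumed
  } catch (error) {
    showFlashMessage("Failed to load papers", "error");
  } finally {
    toggleButtonLoading(button, false);
  }
}
```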
|
||||
|
||||
### 2. `/static/js/modal-handler.js`
|
||||
**Purpose**: Handle modal dialogs with dynamic content loading
|
||||
|
||||
**Key Features**:
|
||||
- AJAX content loading
|
||||
- Error handling
|
||||
- Automatic click handler setup
|
||||
- Bootstrap modal integration
|
||||
|
||||
**Used by**:
|
||||
- `papers.html.jinja` (paper details modal)
|
||||
- `logger.html.jinja` (log details modal)
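
A typical setup mirrors the Usage Examples section further below (the element ids and selector are illustrative):

```javascript
// Load details into a Bootstrap modal when a row is clicked (ids/selector are illustrative)
const paperModal = new ModalHandler("paperDetailModal", "paperDetailContent");
paperModal.setupClickHandlers(".paper-link");
```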
|
||||
|
||||
### 3. `/static/js/form-handler.js`
|
||||
**Purpose**: Handle form submissions with progress tracking
|
||||
|
||||
**Key Features**:
|
||||
- Progress modal display
|
||||
- Task status polling
|
||||
- Error handling
|
||||
- Customizable callbacks
|
||||
|
||||
**Used by**:
|
||||
- `upload.html.jinja` (CSV upload form)
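
For example, the upload form can be wired up roughly as follows (element ids are illustrative; `onSuccess`/`onError` are the callbacks listed above):

```javascript
// Sketch: upload form with progress modal and result callbacks
const uploadHandler = new FormHandler("uploadForm", {
  progressModalId: "progressModal",
  onSuccess: (result) => showFlashMessage("Upload complete", "success"),
  onError: (error) => showFlashMessage(`Upload failed: ${error}`, "error"),
});
```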
|
||||
|
||||
### 4. `/static/js/chart.js`
|
||||
**Purpose**: Handle Chart.js activity visualization
|
||||
|
||||
**Key Features**:
|
||||
- Chart initialization and rendering
|
||||
- Data loading from API
|
||||
- Error handling for missing Chart.js
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja` (activity charts)
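
In practice the chart is created once and then refreshed for a given time range (the canvas id is illustrative):

```javascript
// Sketch: render activity for the last 24 hours
const activityChart = new ActivityChart("activityChart"); // canvas element id is illustrative
activityChart.loadData(24); // fetches /scraper/stats?hours=24 and renders both charts
```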
|
||||
|
||||
### 5. `/static/js/scraper-control.js`
|
||||
**Purpose**: Handle scraper control operations (start/stop/pause/reset)
|
||||
|
||||
**Key Features**:
|
||||
- Status polling
|
||||
- Volume configuration
|
||||
- Callback system for refreshing other components
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja`
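
Outside the dashboard it can also be used on its own, roughly as in the Before/After example above (what `init()` wires up internally is an assumption here):

```javascript
// Sketch: standalone use; config comes from a JSON script tag as shown earlier
const config = JSON.parse(document.getElementById("config-data").textContent);
const scraperControl = new ScraperControl(config);
scraperControl.init(); // assumed to bind the control buttons and start status polling
```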
|
||||
|
||||
### 6. `/static/js/paper-processor.js`
|
||||
**Purpose**: Handle paper search and processing functionality
|
||||
|
||||
**Key Features**:
|
||||
- Paper search
|
||||
- Single paper processing
|
||||
- Status polling
|
||||
- Scraper selection
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja`
|
||||
|
||||
### 7. `/static/js/activity-monitor.js`
|
||||
**Purpose**: Handle activity log display and real-time notifications
|
||||
|
||||
**Key Features**:
|
||||
- Activity log loading
|
||||
- Real-time updates
|
||||
- Notification management
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja`
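
A sketch of how the monitor is started and linked to the chart from the chart.js sketch above:

```javascript
// Sketch: start monitoring and refresh the chart whenever new activity arrives
const monitor = new ActivityMonitor();
monitor.setChartRefreshCallback((hours) => activityChart.loadData(hours || 24));
monitor.loadRecentActivity(); // initial load; polling then runs every 10 seconds
```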
|
||||
|
||||
### 8. `/static/js/scraper-dashboard.js`
|
||||
**Purpose**: Coordinate all scraper dashboard components
|
||||
|
||||
**Key Features**:
|
||||
- Component initialization
|
||||
- Inter-component communication
|
||||
- Configuration management
|
||||
|
||||
**Used by**:
|
||||
- `scraper.html.jinja`
|
||||
|
||||
### 9. `/static/js/config-handler.js`
|
||||
**Purpose**: Handle configuration forms and Alpine.js integration
|
||||
|
||||
**Key Features**:
|
||||
- Configuration API calls
|
||||
- Alpine.js data objects
|
||||
- Schedule management
|
||||
- Volume updates
|
||||
|
||||
**Used by**:
|
||||
- `config/schedule.html.jinja`
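
The shared instance can also be called directly, for example to change the daily volume (the value is illustrative):

```javascript
// Sketch: update the daily volume through the shared instance exported by config-handler.js
window.configHandler
  .updateVolume(150) // value is illustrative
  .then((data) => console.log("Volume update result:", data));
```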
|
||||
|
||||
## Template Updates
|
||||
|
||||
### Templates Using Modular JavaScript
|
||||
|
||||
1. **scraper.html.jinja**
|
||||
- Uses all scraper-related modules
|
||||
- Passes Jinja variables as configuration parameters
|
||||
- Initializes dashboard with `initScraperDashboard(config)`
|
||||
|
||||
2. **papers.html.jinja**
|
||||
- Uses `modal-handler.js` for paper detail modals
|
||||
- Simplified from custom modal code to single line initialization
|
||||
|
||||
3. **upload.html.jinja**
|
||||
- Uses `form-handler.js` for upload progress tracking
|
||||
- Custom result display function
|
||||
- Automatic task status polling
|
||||
|
||||
4. **logger.html.jinja**
|
||||
- Uses `modal-handler.js` for log detail modals
|
||||
- Custom URL construction for log endpoints
|
||||
|
||||
5. **config/schedule.html.jinja**
|
||||
- Uses `config-handler.js` for Alpine.js integration
|
||||
- Modular schedule management functions
|
||||
|
||||
## Benefits of Modularization
|
||||
|
||||
### 1. **Reusability**
|
||||
- Modal functionality shared between papers and logger templates
|
||||
- Common utilities used across all templates
|
||||
- Form handling can be reused for other forms
|
||||
|
||||
### 2. **Maintainability**
|
||||
- Single place to update common functionality
|
||||
- Clear separation of concerns
|
||||
- Easier debugging and testing
|
||||
|
||||
### 3. **Parameter Passing**
|
||||
- Jinja variables passed as configuration objects
|
||||
- No more hardcoded values in JavaScript
|
||||
- Environment-specific settings easily configurable
|
||||
|
||||
### 4. **Extensibility**
|
||||
- Easy to add new functionality to existing modules
|
||||
- New templates can easily use existing modules
|
||||
- Plugin-like architecture for components
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Basic Modal Usage
|
||||
```javascript
|
||||
const modal = new ModalHandler('modalId', 'contentElementId');
|
||||
modal.setupClickHandlers('.clickable-items');
|
||||
```
|
||||
|
||||
### Form with Progress Tracking
|
||||
```javascript
|
||||
const formHandler = new FormHandler('formId', {
|
||||
onSuccess: (result) => console.log('Success:', result),
|
||||
onError: (error) => console.log('Error:', error)
|
||||
});
|
||||
```
|
||||
|
||||
### Configuration Management
|
||||
```javascript
|
||||
// In Alpine.js template; createScheduleManager() reads its initial schedule and
// volume from the #schedule-config JSON script tag, so no arguments are passed
x-data="configHandler.createScheduleManager()"
|
||||
```
|
||||
|
||||
## Migration Notes
|
||||
|
||||
### Old vs New Approach
|
||||
|
||||
**Before**: Inline JavaScript in each template
|
||||
```html
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
// Lots of inline JavaScript code
|
||||
});
|
||||
</script>
|
||||
```
|
||||
|
||||
**After**: Modular imports with configuration
|
||||
```html
|
||||
<script src="{{ url_for('static', filename='js/common.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/modal-handler.js') }}"></script>
|
||||
<script>
|
||||
const modal = new ModalHandler('modalId', 'contentId');
|
||||
modal.setupClickHandlers('.links');
|
||||
</script>
|
||||
```
|
||||
|
||||
### Jinja Variable Handling
|
||||
|
||||
To properly separate Jinja template variables from JavaScript code and avoid linting issues, we use a clean JSON configuration approach:
|
||||
|
||||
**Before**: Variables embedded directly in JavaScript (causes linting issues)
|
||||
```javascript
|
||||
if (volume > {{ max_volume }}) {
|
||||
// Error handling - JSLint will complain about {{ }}
|
||||
}
|
||||
```
|
||||
|
||||
**After**: Clean separation using JSON script tags
|
||||
```html
|
||||
<!-- Jinja variables in JSON format -->
|
||||
<script type="application/json" id="config-data">
|
||||
{
|
||||
"maxVolume": {{ max_volume|tojson }},
|
||||
"currentVolume": {{ volume|tojson }},
|
||||
"apiUrl": {{ url_for('api.endpoint')|tojson }},
|
||||
"csrfToken": {{ csrf_token()|tojson }}
|
||||
}
|
||||
</script>
|
||||
|
||||
<!-- Clean JavaScript that reads the configuration -->
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
const config = JSON.parse(document.getElementById('config-data').textContent);
|
||||
const handler = new VolumeHandler(config);
|
||||
});
|
||||
</script>
|
||||
```
|
||||
|
||||
**Benefits of this approach**:
|
||||
- **Linter-friendly**: No template syntax in JavaScript files
|
||||
- **Type-safe**: JSON ensures proper data types
|
||||
- **Maintainable**: Clear separation of concerns
|
||||
- **Secure**: Automatic escaping with `|tojson` filter
|
||||
- **Debuggable**: Easy to inspect configuration in DevTools
|
||||
|
||||
**Real-world example from scraper.html.jinja**:
|
||||
```html
|
||||
<script type="application/json" id="scraper-config">
|
||||
{
|
||||
"statusUrl": {{ url_for('api.scraper_status')|tojson }},
|
||||
"startUrl": {{ url_for('api.start_scraper')|tojson }},
|
||||
"volume": {{ volume|tojson }},
|
||||
"scraperType": {{ scraper_type|tojson }},
|
||||
"csrfToken": {{ csrf_token()|tojson }}
|
||||
}
|
||||
</script>
|
||||
|
||||
<script>
|
||||
const config = JSON.parse(document.getElementById('scraper-config').textContent);
|
||||
initScraperDashboard(config);
|
||||
</script>
|
||||
```
|
||||
|
||||
## Future Improvements
|
||||
|
||||
### Potential Enhancements
|
||||
1. **Bundle Management**: Consider using webpack or similar for production builds
|
||||
2. **Unit Testing**: Add comprehensive test suite for individual modules
|
||||
3. **JSDoc Comments**: Add detailed documentation for better IDE support
|
||||
4. **Centralized Error Reporting**: Implement global error handling system
|
||||
5. **Performance Optimization**: Implement lazy loading for non-critical modules
|
||||
6. **TypeScript Migration**: Consider migrating to TypeScript for better type safety
|
||||
|
||||
### Adding New Modules
|
||||
When creating new JavaScript modules (a skeleton sketch follows this checklist):
|
||||
1. Follow the established class-based pattern
|
||||
2. Include proper error handling
|
||||
3. Use the configuration pattern for Jinja variables
|
||||
4. Add documentation to this README
|
||||
5. Update templates to use the new module
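
A minimal skeleton that follows this pattern might look like the following (the class name, config fields, and endpoint are illustrative):

```javascript
/**
 * Skeleton for a new module following the established class-based pattern.
 * Class name, config fields, and endpoint are illustrative.
 */
class ExampleFeature {
  constructor(config = {}) {
    this.apiUrl = config.apiUrl; // passed in via a JSON config script tag
    this.container = document.getElementById(config.containerId);
  }

  async init() {
    if (!this.container) return;
    try {
      const data = await apiRequest(this.apiUrl); // from common.js
      this.render(data);
    } catch (error) {
      showFlashMessage("Failed to load data", "error");
    }
  }

  render(data) {
    // Render the fetched data into this.container
    this.container.textContent = JSON.stringify(data);
  }
}

// Export for use in templates, matching the other modules
if (typeof window !== "undefined") {
  window.ExampleFeature = ExampleFeature;
}
```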
|
||||
|
||||
## Testing
|
||||
|
||||
A test file `test_js_modularization.py` has been created to verify the modularization. To run comprehensive testing:
|
||||
|
||||
```bash
|
||||
python test_js_modularization.py
|
||||
```
|
||||
|
||||
This will verify:
|
||||
- All JavaScript files exist and are properly formatted
|
||||
- Templates correctly reference the modular files
|
||||
- Configuration patterns are properly implemented
|
||||
- No inline JavaScript remains in templates
|
||||
|
||||
## Maintenance
|
||||
|
||||
### When Making Changes
|
||||
1. **Update Single Module**: Changes to functionality only require editing one file
|
||||
2. **Test Affected Templates**: Ensure all templates using the module still work
|
||||
3. **Update Documentation**: Keep this README current with any changes
|
||||
4. **Consider Dependencies**: Check if changes affect other modules
|
||||
|
||||
### File Organization
|
||||
```
|
||||
/static/js/
|
||||
├── README.md # This documentation
|
||||
├── common.js # Shared utilities
|
||||
├── modal-handler.js # Modal functionality
|
||||
├── form-handler.js # Form processing
|
||||
├── chart.js # Chart visualization
|
||||
├── scraper-control.js # Scraper operations
|
||||
├── paper-processor.js # Paper management
|
||||
├── activity-monitor.js # Activity tracking
|
||||
├── scraper-dashboard.js # Dashboard coordination
|
||||
├── config-handler.js # Configuration management
|
||||
└── table-handler.js # Table utilities
|
||||
```
|
||||
|
||||
## Migration Summary
|
||||
|
||||
The modularization successfully transformed **~800+ lines of inline JavaScript** from templates into a maintainable, reusable module system. This improvement provides:
|
||||
|
||||
- **Enhanced maintainability** through single-responsibility modules
|
||||
- **Reduced code duplication** via shared utility functions
|
||||
- **Improved linter compatibility** by separating template and JavaScript concerns
|
||||
- **Better testability** with isolated, unit-testable modules
|
||||
- **Cleaner templates** with minimal, configuration-only JavaScript
|
||||
- **Easier debugging** with clearly separated concerns and proper error handling
|
||||
|
||||
All existing functionality has been preserved while significantly improving the codebase architecture and developer experience.
|
328
scipaperloader/static/js/activity-monitor.js
Normal file
@ -0,0 +1,328 @@
|
||||
/**
|
||||
* Activity monitoring and display functionality
|
||||
*/
|
||||
|
||||
class ActivityMonitor {
|
||||
constructor() {
|
||||
this.activityLog = document.getElementById("activityLog");
|
||||
this.notificationsToggle = document.getElementById("notificationsToggle");
|
||||
this.notificationsEnabled = true;
|
||||
this.lastPaperTimestamp = new Date().toISOString();
|
||||
|
||||
// Pagination state
|
||||
this.currentPage = 1;
|
||||
this.perPage = 20;
|
||||
this.statusFilter = "";
|
||||
this.totalPages = 1;
|
||||
this.totalEntries = 0;
|
||||
|
||||
// Pagination elements
|
||||
this.paginationContainer = document.getElementById("activityPagination");
|
||||
this.paginationInfo = document.getElementById("activityPaginationInfo");
|
||||
this.prevPageBtn = document.getElementById("activityPrevPage");
|
||||
this.nextPageBtn = document.getElementById("activityNextPage");
|
||||
this.currentPageSpan = document.getElementById("activityCurrentPage");
|
||||
this.pageSizeSelect = document.getElementById("activityPageSize");
|
||||
this.statusFilterSelect = document.getElementById("activityStatusFilter");
|
||||
|
||||
this.initEventListeners();
|
||||
this.setupWebSocket();
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize event listeners
|
||||
*/
|
||||
initEventListeners() {
|
||||
if (this.notificationsToggle) {
|
||||
this.notificationsToggle.addEventListener("click", () => {
|
||||
this.notificationsEnabled = this.notificationsToggle.checked;
|
||||
});
|
||||
}
|
||||
|
||||
// Time range buttons
|
||||
document.querySelectorAll(".time-range-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
document
|
||||
.querySelectorAll(".time-range-btn")
|
||||
.forEach((b) => b.classList.remove("active"));
|
||||
btn.classList.add("active");
|
||||
const currentTimeRange = parseInt(btn.dataset.hours);
|
||||
|
||||
// Trigger chart refresh if callback is provided
|
||||
if (this.onChartRefresh) {
|
||||
this.onChartRefresh(currentTimeRange);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// Pagination event listeners
|
||||
if (this.prevPageBtn) {
|
||||
this.prevPageBtn.addEventListener("click", (e) => {
|
||||
e.preventDefault();
|
||||
if (this.currentPage > 1) {
|
||||
this.currentPage--;
|
||||
this.loadRecentActivity();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (this.nextPageBtn) {
|
||||
this.nextPageBtn.addEventListener("click", (e) => {
|
||||
e.preventDefault();
|
||||
if (this.currentPage < this.totalPages) {
|
||||
this.currentPage++;
|
||||
this.loadRecentActivity();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Page size change
|
||||
if (this.pageSizeSelect) {
|
||||
this.pageSizeSelect.addEventListener("change", () => {
|
||||
this.perPage = parseInt(this.pageSizeSelect.value);
|
||||
this.currentPage = 1; // Reset to first page
|
||||
this.loadRecentActivity();
|
||||
});
|
||||
}
|
||||
|
||||
// Status filter change
|
||||
if (this.statusFilterSelect) {
|
||||
this.statusFilterSelect.addEventListener("change", () => {
|
||||
this.statusFilter = this.statusFilterSelect.value;
|
||||
this.currentPage = 1; // Reset to first page
|
||||
this.loadRecentActivity();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Load and render recent activity
|
||||
*/
|
||||
async loadRecentActivity() {
|
||||
if (!this.activityLog) return;
|
||||
|
||||
try {
|
||||
// Build query parameters for pagination
|
||||
const params = new URLSearchParams({
|
||||
page: this.currentPage,
|
||||
per_page: this.perPage,
|
||||
});
|
||||
|
||||
// Add multiple category parameters
|
||||
params.append("category", "scraper_activity");
|
||||
params.append("category", "scraper_command");
|
||||
|
||||
if (this.statusFilter) {
|
||||
params.append("status", this.statusFilter);
|
||||
}
|
||||
|
||||
const data = await apiRequest(`/logs/api?${params.toString()}`);
|
||||
|
||||
if (data.success) {
|
||||
this.renderActivityLog(data.logs);
|
||||
this.updatePagination(data.pagination);
|
||||
console.log("Activity log refreshed with latest data");
|
||||
} else {
|
||||
throw new Error(data.message || "Failed to load logs");
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Failed to load activity logs:", error);
|
||||
// If the API endpoint doesn't exist, just show a message
|
||||
this.activityLog.innerHTML =
|
||||
'<tr><td colspan="4" class="text-center">Activity log API not available</td></tr>';
|
||||
this.hidePagination();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Render activity log data
|
||||
* @param {Array} logs - Array of log entries
|
||||
*/
|
||||
renderActivityLog(logs) {
|
||||
if (!this.activityLog) return;
|
||||
|
||||
this.activityLog.innerHTML = "";
|
||||
|
||||
if (!logs || logs.length === 0) {
|
||||
this.activityLog.innerHTML =
|
||||
'<tr><td colspan="4" class="text-center">No recent activity</td></tr>';
|
||||
return;
|
||||
}
|
||||
|
||||
logs.forEach((log) => {
|
||||
const row = document.createElement("tr");
|
||||
|
||||
// Format timestamp
|
||||
const timeStr = formatTimestamp(log.timestamp);
|
||||
|
||||
// Create status badge
|
||||
const statusBadge = createStatusBadge(log.status);
|
||||
|
||||
row.innerHTML = `
|
||||
<td>${timeStr}</td>
|
||||
<td>${log.action}</td>
|
||||
<td>${statusBadge}</td>
|
||||
<td>${log.description || ""}</td>
|
||||
`;
|
||||
|
||||
this.activityLog.appendChild(row);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Update pagination controls based on API response
|
||||
* @param {Object} pagination - Pagination data from API
|
||||
*/
|
||||
updatePagination(pagination) {
|
||||
if (!pagination || !this.paginationContainer) return;
|
||||
|
||||
this.currentPage = pagination.page;
|
||||
this.totalPages = pagination.pages;
|
||||
this.totalEntries = pagination.total;
|
||||
|
||||
// Show pagination container
|
||||
this.paginationContainer.classList.remove("d-none");
|
||||
|
||||
// Update pagination info
|
||||
const startEntry = (pagination.page - 1) * pagination.per_page + 1;
|
||||
const endEntry = Math.min(
|
||||
pagination.page * pagination.per_page,
|
||||
pagination.total
|
||||
);
|
||||
|
||||
if (this.paginationInfo) {
|
||||
this.paginationInfo.textContent = `Showing ${startEntry} - ${endEntry} of ${pagination.total} entries`;
|
||||
}
|
||||
|
||||
// Update current page display
|
||||
if (this.currentPageSpan) {
|
||||
this.currentPageSpan.textContent = `${pagination.page} of ${pagination.pages}`;
|
||||
}
|
||||
|
||||
// Update previous button
|
||||
if (this.prevPageBtn) {
|
||||
if (pagination.has_prev) {
|
||||
this.prevPageBtn.classList.remove("disabled");
|
||||
this.prevPageBtn.querySelector("a").removeAttribute("tabindex");
|
||||
this.prevPageBtn
|
||||
.querySelector("a")
|
||||
.setAttribute("aria-disabled", "false");
|
||||
} else {
|
||||
this.prevPageBtn.classList.add("disabled");
|
||||
this.prevPageBtn.querySelector("a").setAttribute("tabindex", "-1");
|
||||
this.prevPageBtn
|
||||
.querySelector("a")
|
||||
.setAttribute("aria-disabled", "true");
|
||||
}
|
||||
}
|
||||
|
||||
// Update next button
|
||||
if (this.nextPageBtn) {
|
||||
if (pagination.has_next) {
|
||||
this.nextPageBtn.classList.remove("disabled");
|
||||
this.nextPageBtn.querySelector("a").removeAttribute("tabindex");
|
||||
this.nextPageBtn
|
||||
.querySelector("a")
|
||||
.setAttribute("aria-disabled", "false");
|
||||
} else {
|
||||
this.nextPageBtn.classList.add("disabled");
|
||||
this.nextPageBtn.querySelector("a").setAttribute("tabindex", "-1");
|
||||
this.nextPageBtn
|
||||
.querySelector("a")
|
||||
.setAttribute("aria-disabled", "true");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Hide pagination controls when not needed
|
||||
*/
|
||||
hidePagination() {
|
||||
if (this.paginationContainer) {
|
||||
this.paginationContainer.classList.add("d-none");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Setup WebSocket for real-time notifications
|
||||
*/
|
||||
setupWebSocket() {
|
||||
// If WebSocket is available, implement it here
|
||||
// For now we'll poll the server periodically for new papers
|
||||
setInterval(() => this.checkForNewPapers(), 10000); // Check every 10 seconds
|
||||
}
|
||||
|
||||
/**
|
||||
* Check for new papers and show notifications
|
||||
*/
|
||||
async checkForNewPapers() {
|
||||
if (!this.notificationsEnabled) return;
|
||||
|
||||
try {
|
||||
// Use the API endpoint for checking new papers, with limit for efficiency
|
||||
const data = await apiRequest(
|
||||
`/logs/api?category=scraper_activity&category=scraper_command&action=scrape_paper&after=${this.lastPaperTimestamp}&limit=5`
|
||||
);
|
||||
|
||||
if (data && data.length > 0) {
|
||||
// Update the timestamp
|
||||
this.lastPaperTimestamp = new Date().toISOString();
|
||||
|
||||
// Show notifications for new papers
|
||||
data.forEach((log) => {
|
||||
const extraData = log.extra_data ? JSON.parse(log.extra_data) : {};
|
||||
if (log.status === "success") {
|
||||
showFlashMessage(
|
||||
`New paper scraped: ${extraData.title || "Unknown title"}`,
|
||||
"success"
|
||||
);
|
||||
} else if (log.status === "error") {
|
||||
showFlashMessage(
|
||||
`Failed to scrape paper: ${log.description}`,
|
||||
"error"
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
// Refresh the activity chart and log
|
||||
if (this.onChartRefresh) {
|
||||
this.onChartRefresh();
|
||||
}
|
||||
// Only reload if we're on page 1 to avoid disrupting user navigation
|
||||
if (this.currentPage === 1) {
|
||||
this.loadRecentActivity();
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
// If the API endpoint doesn't exist, do nothing
|
||||
console.debug("Activity polling failed (this may be expected):", error);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set callback for chart refresh
|
||||
*/
|
||||
setChartRefreshCallback(callback) {
|
||||
this.onChartRefresh = callback;
|
||||
}
|
||||
|
||||
/**
|
||||
* Refresh activity log manually (useful for external triggers)
|
||||
*/
|
||||
refresh() {
|
||||
this.loadRecentActivity();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset pagination to first page
|
||||
*/
|
||||
resetToFirstPage() {
|
||||
this.currentPage = 1;
|
||||
this.loadRecentActivity();
|
||||
}
|
||||
}
|
||||
|
||||
// Export for use in other modules
|
||||
if (typeof window !== "undefined") {
|
||||
window.ActivityMonitor = ActivityMonitor;
|
||||
}
|
436
scipaperloader/static/js/chart.js
Normal file
@ -0,0 +1,436 @@
|
||||
/**
 * Chart utilities for activity visualization
 */
|
||||
|
||||
class ActivityChart {
|
||||
constructor(canvasId) {
|
||||
this.canvasId = canvasId;
|
||||
this.chart = null;
|
||||
this.scraperChart = null;
|
||||
this.initChart();
|
||||
}
|
||||
|
||||
initChart() {
|
||||
// Check if Chart.js is available
|
||||
if (typeof Chart === "undefined") {
|
||||
console.error("Chart.js is not loaded");
|
||||
return;
|
||||
}
|
||||
|
||||
const chartElement = document.getElementById(this.canvasId);
|
||||
if (!chartElement) {
|
||||
console.error(
|
||||
`Chart canvas element with id "${this.canvasId}" not found`
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// Set canvas height directly
|
||||
chartElement.style.height = "300px";
|
||||
chartElement.height = 300;
|
||||
|
||||
this.ctx = chartElement.getContext("2d");
|
||||
|
||||
// Initialize scraper activity chart
|
||||
this.initScraperChart();
|
||||
}
|
||||
|
||||
initScraperChart() {
|
||||
const scraperChartElement = document.getElementById("scraperActivityChart");
|
||||
if (!scraperChartElement) {
|
||||
console.warn("Scraper activity chart element not found");
|
||||
return;
|
||||
}
|
||||
|
||||
this.scraperCtx = scraperChartElement.getContext("2d");
|
||||
}
|
||||
|
||||
/**
|
||||
* Render the activity chart with provided data
|
||||
* @param {Object} data - Chart data object with hourly_stats and scraper_timeline
|
||||
*/
|
||||
render(data) {
|
||||
if (!this.ctx) {
|
||||
console.error("Chart context not available");
|
||||
return;
|
||||
}
|
||||
|
||||
console.log("Render received data:", data);
|
||||
|
||||
// Handle both old and new data formats for compatibility
|
||||
const hourlyStats = data.hourly_stats || data;
|
||||
const scraperTimeline = data.scraper_timeline || [];
|
||||
|
||||
console.log("Extracted hourlyStats:", hourlyStats);
|
||||
console.log("Extracted scraperTimeline:", scraperTimeline);
|
||||
|
||||
// Extract the data for the main chart (papers only)
|
||||
const labels = hourlyStats.map((item) => item.hour);
|
||||
const successData = hourlyStats.map((item) => item.success);
|
||||
const errorData = hourlyStats.map((item) => item.error);
|
||||
const pendingData = hourlyStats.map((item) => item.pending);
|
||||
|
||||
// Destroy existing charts if they exist
|
||||
if (this.chart) {
|
||||
this.chart.destroy();
|
||||
}
|
||||
if (this.scraperChart) {
|
||||
this.scraperChart.destroy();
|
||||
}
|
||||
|
||||
// Render main chart (papers only)
|
||||
this.chart = new Chart(this.ctx, {
|
||||
type: "bar",
|
||||
data: {
|
||||
labels: labels,
|
||||
datasets: [
|
||||
{
|
||||
label: "Success",
|
||||
data: successData,
|
||||
backgroundColor: "#28a745",
|
||||
stack: "Papers",
|
||||
},
|
||||
{
|
||||
label: "Error",
|
||||
data: errorData,
|
||||
backgroundColor: "#dc3545",
|
||||
stack: "Papers",
|
||||
},
|
||||
{
|
||||
label: "Pending",
|
||||
data: pendingData,
|
||||
backgroundColor: "#ffc107",
|
||||
stack: "Papers",
|
||||
},
|
||||
],
|
||||
},
|
||||
options: {
|
||||
responsive: true,
|
||||
maintainAspectRatio: true,
|
||||
aspectRatio: 2.5,
|
||||
layout: {
|
||||
padding: {
|
||||
top: 20,
|
||||
bottom: 20,
|
||||
},
|
||||
},
|
||||
plugins: {
|
||||
legend: {
|
||||
position: "top",
|
||||
},
|
||||
tooltip: {
|
||||
mode: "index",
|
||||
intersect: false,
|
||||
},
|
||||
},
|
||||
scales: {
|
||||
x: {
|
||||
stacked: true,
|
||||
title: {
|
||||
display: true,
|
||||
text: "Time (Last Hours)",
|
||||
},
|
||||
},
|
||||
y: {
|
||||
type: "linear",
|
||||
display: true,
|
||||
stacked: true,
|
||||
beginAtZero: true,
|
||||
title: {
|
||||
display: true,
|
||||
text: "Papers Scraped",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
// Render scraper activity timeline chart with precise timing
|
||||
this.renderScraperChart(labels, scraperTimeline, hourlyStats.length);
|
||||
|
||||
// Show simple legend for scraper activity
|
||||
this.showScraperStateLegend();
|
||||
}
|
||||
|
||||
/**
|
||||
* Render the separate scraper activity timeline chart with precise timestamps
|
||||
* @param {Array} hourLabels - Hour labels for main chart
|
||||
* @param {Array} scraperTimeline - Timeline of scraper state changes
|
||||
* @param {number} totalHours - Total hours range being displayed
|
||||
*/
|
||||
renderScraperChart(hourLabels, scraperTimeline, totalHours) {
|
||||
if (!this.scraperCtx) {
|
||||
console.warn("Scraper chart context not available");
|
||||
return;
|
||||
}
|
||||
|
||||
let timelineData = [];
|
||||
|
||||
if (scraperTimeline && scraperTimeline.length > 0) {
|
||||
console.log("Original scraper timeline:", scraperTimeline);
|
||||
|
||||
// Filter out duplicate events with the same action, status, and hours_ago
|
||||
const uniqueTimeline = scraperTimeline.filter((event, index, self) => {
|
||||
return (
|
||||
index ===
|
||||
self.findIndex(
|
||||
(e) =>
|
||||
e.action === event.action &&
|
||||
e.status === event.status &&
|
||||
e.hours_ago === event.hours_ago
|
||||
)
|
||||
);
|
||||
});
|
||||
|
||||
console.log("Filtered unique timeline:", uniqueTimeline);
|
||||
|
||||
// Sort timeline by hours_ago (oldest first = highest hours_ago first)
|
||||
const sortedTimeline = [...uniqueTimeline].sort(
|
||||
(a, b) => b.hours_ago - a.hours_ago
|
||||
);
|
||||
|
||||
console.log("Sorted scraper timeline:", sortedTimeline);
|
||||
|
||||
// Create simple timeline with relative positions
|
||||
let currentState = 0;
|
||||
|
||||
// Use hours_ago directly as x-coordinates (inverted so recent is on right)
|
||||
for (let i = 0; i < sortedTimeline.length; i++) {
|
||||
const event = sortedTimeline[i];
|
||||
|
||||
console.log(`Processing event ${i}:`, event);
|
||||
|
||||
// Set the new state based on the action
|
||||
if (event.action === "start_scraper" && event.status === "success") {
|
||||
currentState = 1;
|
||||
} else if (
|
||||
event.action === "stop_scraper" &&
|
||||
event.status === "success"
|
||||
) {
|
||||
currentState = 0;
|
||||
} else if (
|
||||
event.action === "reset_scraper" &&
|
||||
event.status === "success"
|
||||
) {
|
||||
currentState = 0;
|
||||
} else if (
|
||||
event.action === "pause_scraper" &&
|
||||
event.status === "success"
|
||||
) {
|
||||
currentState = 0; // Treat pause as inactive
|
||||
}
|
||||
|
||||
console.log(
|
||||
`New state for ${event.action}: ${currentState} at ${event.hours_ago}h ago`
|
||||
);
|
||||
|
||||
// Use negative hours_ago so recent events are on the right
|
||||
timelineData.push({
|
||||
x: -event.hours_ago,
|
||||
y: currentState,
|
||||
});
|
||||
}
|
||||
|
||||
// Add current time point
|
||||
timelineData.push({
|
||||
x: 0, // Current time
|
||||
y: currentState,
|
||||
});
|
||||
|
||||
console.log("Final timeline data:", timelineData);
|
||||
} else {
|
||||
// No timeline data, show as inactive for the full time range
|
||||
timelineData = [
|
||||
{ x: -totalHours, y: 0 }, // Start of time range
|
||||
{ x: 0, y: 0 }, // Current time
|
||||
];
|
||||
}
|
||||
|
||||
// Ensure we always have data points at the boundaries for proper scaling
|
||||
const hasStartPoint = timelineData.some(
|
||||
(point) => point.x <= -totalHours + 1
|
||||
);
|
||||
const hasEndPoint = timelineData.some((point) => point.x >= -1);
|
||||
|
||||
if (!hasStartPoint) {
|
||||
// Add a point at the start of the time range with current state
|
||||
const currentState =
|
||||
timelineData.length > 0 ? timelineData[timelineData.length - 1].y : 0;
|
||||
timelineData.unshift({ x: -totalHours, y: currentState });
|
||||
}
|
||||
|
||||
if (!hasEndPoint) {
|
||||
// Add a point near the current time with current state
|
||||
const currentState =
|
||||
timelineData.length > 0 ? timelineData[timelineData.length - 1].y : 0;
|
||||
timelineData.push({ x: 0, y: currentState });
|
||||
}
|
||||
|
||||
this.scraperChart = new Chart(this.scraperCtx, {
|
||||
type: "line",
|
||||
data: {
|
||||
datasets: [
|
||||
{
|
||||
label: "Scraper Active",
|
||||
data: timelineData,
|
||||
borderColor: "#28a745",
|
||||
backgroundColor: "rgba(40, 167, 69, 0.1)",
|
||||
borderWidth: 3,
|
||||
fill: true,
|
||||
stepped: "before", // Creates step transitions
|
||||
pointRadius: 5,
|
||||
pointHoverRadius: 7,
|
||||
pointBackgroundColor: "#28a745",
|
||||
pointBorderColor: "#ffffff",
|
||||
pointBorderWidth: 2,
|
||||
tension: 0,
|
||||
},
|
||||
],
|
||||
},
|
||||
options: {
|
||||
responsive: true,
|
||||
maintainAspectRatio: true,
|
||||
aspectRatio: 10,
|
||||
layout: {
|
||||
padding: {
|
||||
top: 10,
|
||||
bottom: 10,
|
||||
},
|
||||
},
|
||||
plugins: {
|
||||
legend: {
|
||||
display: false,
|
||||
},
|
||||
tooltip: {
|
||||
callbacks: {
|
||||
label: function (context) {
|
||||
const status =
|
||||
context.parsed.y === 1 ? "Activated" : "Deactivated";
|
||||
const timestamp = new Date();
|
||||
timestamp.setHours(
|
||||
timestamp.getHours() - Math.abs(context.parsed.x)
|
||||
);
|
||||
const formattedTime = timestamp.toLocaleString("en-GB", {
|
||||
hour: "2-digit",
|
||||
minute: "2-digit",
|
||||
second: "2-digit",
|
||||
day: "2-digit",
|
||||
month: "2-digit",
|
||||
year: "numeric",
|
||||
});
|
||||
return `Scraper: ${status} at ${formattedTime}`;
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
scales: {
|
||||
x: {
|
||||
type: "linear",
|
||||
min: -totalHours,
|
||||
max: 0,
|
||||
title: {
|
||||
display: true,
|
||||
text: "Timeline (Hours Ago → Now)",
|
||||
},
|
||||
ticks: {
|
||||
callback: function (value) {
|
||||
if (value === 0) return "Now";
|
||||
return `-${Math.abs(value)}h`;
|
||||
},
|
||||
stepSize: Math.max(1, Math.floor(totalHours / 8)), // Show reasonable number of ticks
|
||||
},
|
||||
grid: {
|
||||
display: true,
|
||||
},
|
||||
},
|
||||
y: {
|
||||
type: "linear",
|
||||
display: true,
|
||||
beginAtZero: true,
|
||||
max: 1.2,
|
||||
min: -0.2,
|
||||
title: {
|
||||
display: true,
|
||||
text: "Active Status",
|
||||
},
|
||||
ticks: {
|
||||
stepSize: 1,
|
||||
callback: function (value) {
|
||||
return value === 1 ? "Active" : value === 0 ? "Inactive" : "";
|
||||
},
|
||||
},
|
||||
grid: {
|
||||
color: function (context) {
|
||||
return context.tick.value === 0.5
|
||||
? "rgba(0,0,0,0.1)"
|
||||
: "rgba(0,0,0,0.05)";
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Show a simple legend for scraper states
|
||||
*/
|
||||
showScraperStateLegend() {
|
||||
let legendContainer = document.getElementById("scraper-state-legend");
|
||||
if (!legendContainer) {
|
||||
return;
|
||||
}
|
||||
|
||||
legendContainer.classList.remove("d-none");
|
||||
legendContainer.innerHTML = `
|
||||
<small class="text-muted">
|
||||
<i class="fas fa-info-circle"></i>
|
||||
The line chart below shows the exact times at which the scraper was started or stopped.
|
||||
</small>
|
||||
`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Load and render chart data for specified time range
|
||||
* @param {number} hours - Number of hours to show data for
|
||||
*/
|
||||
async loadData(hours) {
|
||||
try {
|
||||
const response = await fetch(`/scraper/stats?hours=${hours}`);
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP error! status: ${response.status}`);
|
||||
}
|
||||
const data = await response.json();
|
||||
console.log("Stats data loaded:", data);
|
||||
this.render(data);
|
||||
} catch (error) {
|
||||
console.error("Failed to load activity stats:", error);
|
||||
// Hide the chart or show an error message
|
||||
const chartContainer = document.getElementById(
|
||||
this.canvasId
|
||||
).parentElement;
|
||||
if (chartContainer) {
|
||||
chartContainer.innerHTML =
|
||||
'<p class="text-muted">Chart data unavailable</p>';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Destroy the chart instance
|
||||
*/
|
||||
destroy() {
|
||||
if (this.chart) {
|
||||
this.chart.destroy();
|
||||
this.chart = null;
|
||||
}
|
||||
if (this.scraperChart) {
|
||||
this.scraperChart.destroy();
|
||||
this.scraperChart = null;
|
||||
}
|
||||
}
|
||||
}
|
175
scipaperloader/static/js/common.js
Normal file
@ -0,0 +1,175 @@
|
||||
/**
|
||||
* Common utilities for the SciPaperLoader application
|
||||
*/
|
||||
|
||||
/**
|
||||
* Display a flash message to the user as an overlay
|
||||
* @param {string} message - The message to display
|
||||
* @param {string} type - The type of message (success, error, warning, info)
|
||||
* @param {number} duration - Duration in milliseconds (default: 5000)
|
||||
*/
|
||||
function showFlashMessage(message, type = "success", duration = 5000) {
|
||||
const flashMsg = document.createElement("div");
|
||||
const normalizedType = type === "error" ? "danger" : type;
|
||||
flashMsg.className = `flash-overlay flash-${normalizedType}`;
|
||||
|
||||
// Get the appropriate icon based on type
|
||||
const getIcon = (messageType) => {
|
||||
switch (messageType) {
|
||||
case "success":
|
||||
return '<svg class="flash-icon" role="img" aria-label="Success:"><use xlink:href="#check-circle-fill"/></svg>';
|
||||
case "danger":
|
||||
return '<svg class="flash-icon" role="img" aria-label="Error:"><use xlink:href="#x-circle-fill"/></svg>';
|
||||
case "warning":
|
||||
return '<svg class="flash-icon" role="img" aria-label="Warning:"><use xlink:href="#exclamation-triangle-fill"/></svg>';
|
||||
case "info":
|
||||
return '<svg class="flash-icon" role="img" aria-label="Info:"><use xlink:href="#info-fill"/></svg>';
|
||||
default:
|
||||
return '<svg class="flash-icon" role="img" aria-label="Info:"><use xlink:href="#info-fill"/></svg>';
|
||||
}
|
||||
};
|
||||
|
||||
flashMsg.innerHTML = `
|
||||
<div class="flash-content">
|
||||
${getIcon(normalizedType)}
|
||||
<div class="flash-message">${message}</div>
|
||||
<button type="button" class="flash-close" onclick="removeFlashMessage(this.parentElement.parentElement)">×</button>
|
||||
</div>
|
||||
`;
|
||||
|
||||
// Add to page first
|
||||
document.body.appendChild(flashMsg);
|
||||
|
||||
// Position all messages in stack
|
||||
updateFlashMessagePositions();
|
||||
|
||||
// Auto dismiss
|
||||
setTimeout(() => {
|
||||
removeFlashMessage(flashMsg);
|
||||
}, duration);
|
||||
|
||||
return flashMsg;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove a flash message and update positions
|
||||
* @param {HTMLElement} flashMsg - The flash message element to remove
|
||||
*/
|
||||
function removeFlashMessage(flashMsg) {
|
||||
if (!flashMsg || !flashMsg.parentNode) return;
|
||||
|
||||
flashMsg.classList.add("fade-out");
|
||||
setTimeout(() => {
|
||||
if (flashMsg.parentNode) {
|
||||
flashMsg.remove();
|
||||
updateFlashMessagePositions();
|
||||
}
|
||||
}, 300);
|
||||
}
|
||||
|
||||
/**
|
||||
* Update positions of all flash messages to create a proper stack
|
||||
*/
|
||||
function updateFlashMessagePositions() {
|
||||
const messages = document.querySelectorAll(".flash-overlay:not(.fade-out)");
|
||||
messages.forEach((msg, index) => {
|
||||
const topPosition = 20 + index * 90; // 90px spacing between messages
|
||||
msg.style.top = `${topPosition}px`;
|
||||
msg.style.zIndex = 9999 - index; // Higher z-index for newer messages
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a status badge HTML element
|
||||
* @param {string} status - The status to create a badge for
|
||||
* @returns {string} HTML string for the status badge
|
||||
*/
|
||||
function createStatusBadge(status) {
|
||||
switch (status) {
|
||||
case "New":
|
||||
return '<span class="badge bg-info">New</span>';
|
||||
case "Pending":
|
||||
return '<span class="badge bg-warning text-dark">Pending</span>';
|
||||
case "Done":
|
||||
return '<span class="badge bg-success">Done</span>';
|
||||
case "Failed":
|
||||
return '<span class="badge bg-danger">Failed</span>';
|
||||
case "success":
|
||||
return '<span class="badge bg-success">Success</span>';
|
||||
case "error":
|
||||
return '<span class="badge bg-danger">Error</span>';
|
||||
case "pending":
|
||||
return '<span class="badge bg-warning text-dark">Pending</span>';
|
||||
default:
|
||||
return `<span class="badge bg-secondary">${status}</span>`;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Format a timestamp to a readable time string
|
||||
* @param {string} timestamp - ISO timestamp string
|
||||
* @returns {string} Formatted time string
|
||||
*/
|
||||
function formatTimestamp(timestamp) {
|
||||
const date = new Date(timestamp);
|
||||
return date.toLocaleTimeString("de-DE", {
|
||||
year: "2-digit",
|
||||
month: "numeric",
|
||||
day: "numeric",
|
||||
hour: "2-digit",
|
||||
minute: "2-digit",
|
||||
second: "2-digit",
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Truncate text to a specified length
|
||||
* @param {string} text - The text to truncate
|
||||
* @param {number} maxLength - Maximum length before truncation
|
||||
* @returns {string} Truncated text with ellipsis if needed
|
||||
*/
|
||||
function truncateText(text, maxLength) {
|
||||
return text.length > maxLength ? text.substring(0, maxLength) + "..." : text;
|
||||
}
|
||||
|
||||
/**
|
||||
* Toggle button loading state
|
||||
* @param {HTMLElement} button - The button element
|
||||
* @param {boolean} loading - Whether to show loading state
|
||||
* @param {string} loadingText - Text to show when loading
|
||||
*/
|
||||
function toggleButtonLoading(button, loading, loadingText = "Loading...") {
|
||||
if (loading) {
|
||||
button.disabled = true;
|
||||
button.dataset.originalText = button.innerHTML;
|
||||
button.innerHTML = `<i class="fas fa-spinner fa-spin"></i> ${loadingText}`;
|
||||
} else {
|
||||
button.disabled = false;
|
||||
button.innerHTML = button.dataset.originalText || button.innerHTML;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic fetch wrapper with error handling
|
||||
* @param {string} url - The URL to fetch
|
||||
* @param {object} options - Fetch options
|
||||
* @returns {Promise} Fetch promise
|
||||
*/
|
||||
async function apiRequest(url, options = {}) {
|
||||
const defaultOptions = {
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
};
|
||||
|
||||
const mergedOptions = { ...defaultOptions, ...options };
|
||||
|
||||
try {
|
||||
const response = await fetch(url, mergedOptions);
|
||||
const data = await response.json();
|
||||
return data;
|
||||
} catch (error) {
|
||||
console.error(`API request failed for ${url}:`, error);
|
||||
throw error;
|
||||
}
|
||||
}
|
195
scipaperloader/static/js/config-handler.js
Normal file
@ -0,0 +1,195 @@
|
||||
/**
|
||||
* Configuration utilities for handling settings and form submissions
|
||||
*/
|
||||
|
||||
class ConfigHandler {
|
||||
constructor(options = {}) {
|
||||
this.options = {
|
||||
apiEndpoint: options.apiEndpoint || "/config/api/update_config",
|
||||
...options,
|
||||
};
|
||||
}
|
||||
|
||||
/**
   * Update configuration via API
   * @param {object} configData - Configuration data to send
   * @returns {Promise} API response promise
   */
  async updateConfig(configData) {
    try {
      const response = await fetch(this.options.apiEndpoint, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
        },
        body: JSON.stringify(configData),
      });

      const data = await response.json();

      if (data.success) {
        showFlashMessage(
          data.message || "Configuration updated successfully!",
          "success"
        );
      } else {
        const errorMessage =
          data.updates?.[0]?.message ||
          data.message ||
          "Error updating configuration";
        showFlashMessage(errorMessage, "error");
      }

      return data;
    } catch (error) {
      console.error("Error updating configuration:", error);
      showFlashMessage("Network error occurred", "error");
      throw error;
    }
  }

  /**
   * Update volume configuration
   * @param {number} volume - New volume value
   */
  async updateVolume(volume) {
    return this.updateConfig({ volume: volume });
  }

  /**
   * Update schedule configuration
   * @param {object} schedule - Schedule configuration object
   */
  async updateSchedule(schedule) {
    return this.updateConfig({ schedule: schedule });
  }

  /**
   * Create an Alpine.js data object for schedule management
   * Reads configuration from JSON script tag in the template
   * @returns {object} Alpine.js data object
   */
  createScheduleManager() {
    const self = this;

    // Read configuration from JSON script tag
    const configElement = document.getElementById("schedule-config");
    const config = configElement ? JSON.parse(configElement.textContent) : {};
    const initialSchedule = config.initialSchedule || {};
    const volume = config.totalVolume || 0;

    return {
      schedule: { ...initialSchedule },
      volume: volume,
      selectedHours: [],
      newWeight: 1.0,
      volumeValue: volume,
      isDragging: false,
      dragOperation: null,

      formatHour(h) {
        return String(h).padStart(2, "0") + ":00";
      },

      async updateVolume() {
        try {
          const data = await self.updateVolume(this.volumeValue);
          if (data.success) {
            this.volume = parseFloat(this.volumeValue);
          }
        } catch (error) {
          // Error handling is done in updateConfig
        }
      },

      getBackgroundStyle(hour) {
        const weight = parseFloat(this.schedule[hour]);
        const maxWeight = 2.5;

        // Normalize weight (0.0 to 1.0)
        const t = Math.min(weight / maxWeight, 1.0);

        // Interpolate HSL lightness: 95% (light) to 30% (dark)
        const lightness = 95 - t * 65;
        const backgroundColor = `hsl(210, 10%, ${lightness}%)`;

        const textColor = t > 0.65 ? "white" : "black";

        return {
          backgroundColor,
          color: textColor,
        };
      },

      startDrag(event, hour) {
        event.preventDefault();
        this.isDragging = true;
        this.dragOperation = this.isSelected(hour) ? "remove" : "add";
        this.toggleSelect(hour);
      },

      dragSelect(hour) {
        if (!this.isDragging) return;
        const selected = this.isSelected(hour);

        if (this.dragOperation === "add" && !selected) {
          this.selectedHours.push(hour);
        } else if (this.dragOperation === "remove" && selected) {
          this.selectedHours = this.selectedHours.filter((h) => h !== hour);
        }
      },

      endDrag() {
        this.isDragging = false;
      },

      toggleSelect(hour) {
        if (this.isSelected(hour)) {
          this.selectedHours = this.selectedHours.filter((h) => h !== hour);
        } else {
          this.selectedHours.push(hour);
        }
      },

      isSelected(hour) {
        return this.selectedHours.includes(hour);
      },

      applyWeight() {
        this.selectedHours.forEach((hour) => {
          this.schedule[hour] = parseFloat(this.newWeight).toFixed(1);
        });
        this.selectedHours = [];
      },

      getTotalWeight() {
        return Object.values(this.schedule).reduce(
          (sum, w) => sum + parseFloat(w),
          0
        );
      },

      getPapersPerHour(hour) {
        const total = this.getTotalWeight();
        if (total === 0) return 0;
        return (
          (parseFloat(this.schedule[hour]) / total) *
          this.volume
        ).toFixed(1);
      },

      async saveSchedule() {
        try {
          await self.updateSchedule(this.schedule);
        } catch (error) {
          // Error handling is done in updateConfig
        }
      },
    };
  }
}

/**
 * Global instance for easy access
 */
window.configHandler = new ConfigHandler();
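Usage note (editor's sketch, not part of this changeset): assuming the templates render the #schedule-config JSON script tag read above, the global instance can back an Alpine.js component roughly as follows; the component name is an assumption.

// Hypothetical wiring — the Alpine component name "scheduleManager" is an assumption.
document.addEventListener("alpine:init", () => {
  // createScheduleManager() reads its initial state from the #schedule-config script tag.
  Alpine.data("scheduleManager", () => window.configHandler.createScheduleManager());
});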
231
scipaperloader/static/js/form-handler.js
Normal file
@ -0,0 +1,231 @@
/**
 * Form utilities for handling form submissions with progress tracking
 */

class FormHandler {
  constructor(formId, options = {}) {
    this.form = document.getElementById(formId);
    this.options = {
      progressModalId: "progressModal",
      progressBarId: "progressBar",
      progressStatusId: "progressStatus",
      statusCheckInterval: 1000,
      onSuccess: null,
      onError: null,
      onProgress: null,
      ...options,
    };

    this.progressModal = null;
    this.progressBar = null;
    this.progressStatus = null;
    this.submitButton = null;

    this.initElements();
    this.initEventListeners();
  }

  /**
   * Initialize DOM elements
   */
  initElements() {
    if (this.options.progressModalId) {
      const modalElement = document.getElementById(
        this.options.progressModalId
      );
      if (modalElement && typeof bootstrap !== "undefined") {
        this.progressModal = new bootstrap.Modal(modalElement);
      }
    }

    this.progressBar = document.getElementById(this.options.progressBarId);
    this.progressStatus = document.getElementById(
      this.options.progressStatusId
    );
    this.submitButton = this.form?.querySelector('button[type="submit"]');
  }

  /**
   * Initialize event listeners
   */
  initEventListeners() {
    if (this.form) {
      this.form.addEventListener("submit", (e) => this.handleSubmit(e));
    }
  }

  /**
   * Handle form submission
   * @param {Event} e - Form submit event
   */
  async handleSubmit(e) {
    e.preventDefault();

    // Show progress modal
    this.showProgress();
    this.updateProgress(5, "Starting...");

    // Disable submit button
    if (this.submitButton) {
      this.submitButton.disabled = true;
    }

    const formData = new FormData(this.form);

    try {
      const response = await fetch(this.form.action, {
        method: "POST",
        body: formData,
      });

      const data = await response.json();

      if (data.error) {
        this.handleError(data.error);
        return;
      }

      // Start polling for task status if task_id is provided
      if (data.task_id) {
        this.pollTaskStatus(data.task_id);
      } else {
        // Handle immediate response
        this.handleSuccess(data);
      }
    } catch (error) {
      console.error("Form submission failed:", error);
      this.handleError("Form submission failed. Please try again.");
    }
  }

  /**
   * Poll task status for long-running operations
   * @param {string} taskId - Task ID to poll
   */
  async pollTaskStatus(taskId) {
    const checkStatus = async () => {
      try {
        // Construct status URL - this should be customizable
        const statusUrl = this.options.statusUrlTemplate
          ? this.options.statusUrlTemplate.replace("{taskId}", taskId)
          : `/upload/task_status/${taskId}`;

        const response = await fetch(statusUrl);
        const status = await response.json();

        console.log("Task status:", status);

        if (status.state === "SUCCESS") {
          this.updateProgress(100, "Completed!");
          setTimeout(() => {
            this.hideProgress();
            this.handleSuccess(status.result);
          }, 1000);
        } else if (status.state === "FAILURE") {
          this.updateProgress(100, "Failed!", true);
          setTimeout(() => {
            this.hideProgress();
            this.handleError(status.error || "Unknown error occurred");
          }, 1000);
        } else {
          // Update progress
          const progress = status.progress || 0;
          this.updateProgress(progress, `Processing... (${status.state})`);

          // Continue polling
          setTimeout(checkStatus, this.options.statusCheckInterval);
        }
      } catch (error) {
        console.error("Failed to check task status:", error);
        // Continue polling on error
        setTimeout(checkStatus, this.options.statusCheckInterval);
      }
    };

    checkStatus();
  }

  /**
   * Show progress modal
   */
  showProgress() {
    if (this.progressModal) {
      this.progressModal.show();
    }
  }

  /**
   * Hide progress modal
   */
  hideProgress() {
    if (this.progressModal) {
      this.progressModal.hide();
    }
  }

  /**
   * Update progress display
   * @param {number} percentage - Progress percentage (0-100)
   * @param {string} message - Status message
   * @param {boolean} isError - Whether this is an error state
   */
  updateProgress(percentage, message, isError = false) {
    if (this.progressBar) {
      this.progressBar.style.width = `${percentage}%`;
      this.progressBar.textContent = `${percentage}%`;

      if (isError) {
        this.progressBar.classList.add("bg-danger");
      }
    }

    if (this.progressStatus) {
      this.progressStatus.textContent = message;
    }

    // Call custom progress callback
    if (this.options.onProgress) {
      this.options.onProgress(percentage, message, isError);
    }
  }

  /**
   * Handle successful form submission
   * @param {object} result - Success result data
   */
  handleSuccess(result) {
    // Re-enable submit button
    if (this.submitButton) {
      this.submitButton.disabled = false;
    }

    // Call custom success callback
    if (this.options.onSuccess) {
      this.options.onSuccess(result);
    } else {
      // Default success handling
      showFlashMessage("Operation completed successfully!", "success");
    }
  }

  /**
   * Handle form submission error
   * @param {string} error - Error message
   */
  handleError(error) {
    this.hideProgress();

    // Re-enable submit button
    if (this.submitButton) {
      this.submitButton.disabled = false;
    }

    // Call custom error callback
    if (this.options.onError) {
      this.options.onError(error);
    } else {
      // Default error handling
      showFlashMessage(`Error: ${error}`, "error");
    }
  }
}
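Usage note (editor's sketch, not part of this changeset): the form ID below is a placeholder; statusUrlTemplate simply mirrors the default polling route used in pollTaskStatus above.

// Hypothetical usage — "uploadForm" is an assumed form ID.
const uploadHandler = new FormHandler("uploadForm", {
  statusUrlTemplate: "/upload/task_status/{taskId}",
  onSuccess: (result) => console.log("Upload finished:", result),
});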
485
scipaperloader/static/js/logger-manager.js
Normal file
@ -0,0 +1,485 @@
|
||||
/**
|
||||
* Logger Manager - Modern activity log management for the unified logger view
|
||||
*/
|
||||
|
||||
class LoggerManager {
|
||||
constructor(options = {}) {
|
||||
this.categories = options.categories || [];
|
||||
this.initialFilters = options.initialFilters || {};
|
||||
|
||||
// Pagination state
|
||||
this.currentPage = 1;
|
||||
this.perPage = 50;
|
||||
this.totalPages = 1;
|
||||
this.totalEntries = 0;
|
||||
|
||||
// Current filter state
|
||||
this.filters = { ...this.initialFilters };
|
||||
|
||||
// DOM elements
|
||||
this.initElements();
|
||||
this.initEventListeners();
|
||||
|
||||
// Apply initial filters and load data
|
||||
this.applyInitialFilters();
|
||||
this.loadLogs();
|
||||
}
|
||||
|
||||
initElements() {
|
||||
// Form elements
|
||||
this.filtersForm = document.getElementById("filterForm");
|
||||
this.categoryCheckboxes = document.querySelectorAll(".category-checkbox");
|
||||
this.selectAllCategories = document.getElementById("selectAllCategories");
|
||||
this.statusSelect = document.getElementById("statusFilter");
|
||||
this.startDateInput = document.getElementById("startDate");
|
||||
this.endDateInput = document.getElementById("endDate");
|
||||
this.searchTermInput = document.getElementById("searchTerm");
|
||||
this.clearFiltersBtn = document.getElementById("clearFilters");
|
||||
this.downloadLogsBtn = document.getElementById("downloadLogs");
|
||||
this.refreshLogsBtn = document.getElementById("refreshLogs");
|
||||
|
||||
// Logs display elements
|
||||
this.logsTableBody = document.getElementById("logsTableBody");
|
||||
this.pageSizeSelect = document.getElementById("pageSize");
|
||||
|
||||
// Pagination elements
|
||||
this.paginationContainer = document.getElementById("logsPagination");
|
||||
this.paginationInfo = document.getElementById("paginationDetails");
|
||||
this.prevPageBtn = document.getElementById("prevPage");
|
||||
this.nextPageBtn = document.getElementById("nextPage");
|
||||
this.currentPageSpan = document.getElementById("currentPageSpan");
|
||||
|
||||
// Modal
|
||||
this.logModal = new ModalHandler("logDetailModal", "log-detail-content");
|
||||
}
|
||||
|
||||
initEventListeners() {
|
||||
// Filter form submission
|
||||
if (this.filtersForm) {
|
||||
this.filtersForm.addEventListener("submit", (e) => {
|
||||
e.preventDefault();
|
||||
this.applyFilters();
|
||||
});
|
||||
}
|
||||
|
||||
// Handle "Select All" checkbox for categories
|
||||
if (this.selectAllCategories) {
|
||||
this.selectAllCategories.addEventListener("change", () => {
|
||||
const isChecked = this.selectAllCategories.checked;
|
||||
this.categoryCheckboxes.forEach((checkbox) => {
|
||||
checkbox.checked = isChecked;
|
||||
});
|
||||
this.applyFilters();
|
||||
});
|
||||
}
|
||||
|
||||
// Handle individual category checkboxes
|
||||
this.categoryCheckboxes.forEach((checkbox) => {
|
||||
checkbox.addEventListener("change", () => {
|
||||
// Update "Select All" checkbox state
|
||||
this.updateSelectAllState();
|
||||
this.applyFilters();
|
||||
});
|
||||
});
|
||||
|
||||
// Individual filter changes for immediate application
|
||||
[this.statusSelect, this.startDateInput, this.endDateInput].forEach(
|
||||
(element) => {
|
||||
if (element) {
|
||||
element.addEventListener("change", () => {
|
||||
this.applyFilters();
|
||||
});
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
// Search term with debounce
|
||||
if (this.searchTermInput) {
|
||||
let searchTimeout;
|
||||
this.searchTermInput.addEventListener("input", () => {
|
||||
clearTimeout(searchTimeout);
|
||||
searchTimeout = setTimeout(() => {
|
||||
this.applyFilters();
|
||||
}, 500);
|
||||
});
|
||||
}
|
||||
|
||||
// Clear filters
|
||||
if (this.clearFiltersBtn) {
|
||||
this.clearFiltersBtn.addEventListener("click", () => {
|
||||
this.clearAllFilters();
|
||||
});
|
||||
}
|
||||
|
||||
// Download logs
|
||||
if (this.downloadLogsBtn) {
|
||||
this.downloadLogsBtn.addEventListener("click", (e) => {
|
||||
e.preventDefault();
|
||||
this.downloadLogs();
|
||||
});
|
||||
}
|
||||
|
||||
// Refresh logs
|
||||
if (this.refreshLogsBtn) {
|
||||
this.refreshLogsBtn.addEventListener("click", () => {
|
||||
this.loadLogs();
|
||||
});
|
||||
}
|
||||
|
||||
// Page size change
|
||||
if (this.pageSizeSelect) {
|
||||
this.pageSizeSelect.addEventListener("change", () => {
|
||||
this.perPage = parseInt(this.pageSizeSelect.value);
|
||||
this.currentPage = 1; // Reset to first page
|
||||
this.loadLogs();
|
||||
});
|
||||
}
|
||||
|
||||
// Pagination buttons
|
||||
if (this.prevPageBtn) {
|
||||
this.prevPageBtn.addEventListener("click", (e) => {
|
||||
e.preventDefault();
|
||||
if (this.currentPage > 1) {
|
||||
this.currentPage--;
|
||||
this.loadLogs();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (this.nextPageBtn) {
|
||||
this.nextPageBtn.addEventListener("click", (e) => {
|
||||
e.preventDefault();
|
||||
if (this.currentPage < this.totalPages) {
|
||||
this.currentPage++;
|
||||
this.loadLogs();
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
updateSelectAllState() {
|
||||
const checkedCount = Array.from(this.categoryCheckboxes).filter(
|
||||
(cb) => cb.checked
|
||||
).length;
|
||||
const totalCount = this.categoryCheckboxes.length;
|
||||
|
||||
if (checkedCount === 0) {
|
||||
this.selectAllCategories.checked = false;
|
||||
this.selectAllCategories.indeterminate = false;
|
||||
} else if (checkedCount === totalCount) {
|
||||
this.selectAllCategories.checked = true;
|
||||
this.selectAllCategories.indeterminate = false;
|
||||
} else {
|
||||
this.selectAllCategories.checked = false;
|
||||
this.selectAllCategories.indeterminate = true;
|
||||
}
|
||||
}
|
||||
|
||||
getSelectedCategories() {
|
||||
return Array.from(this.categoryCheckboxes)
|
||||
.filter((checkbox) => checkbox.checked)
|
||||
.map((checkbox) => checkbox.value);
|
||||
}
|
||||
|
||||
applyInitialFilters() {
|
||||
// Set category checkboxes from initial filters
|
||||
if (this.initialFilters.category) {
|
||||
const selectedCategories = Array.isArray(this.initialFilters.category)
|
||||
? this.initialFilters.category
|
||||
: [this.initialFilters.category];
|
||||
|
||||
this.categoryCheckboxes.forEach((checkbox) => {
|
||||
checkbox.checked = selectedCategories.includes(checkbox.value);
|
||||
});
|
||||
this.updateSelectAllState();
|
||||
}
|
||||
|
||||
if (this.startDateInput && this.initialFilters.start_date) {
|
||||
this.startDateInput.value = this.initialFilters.start_date;
|
||||
}
|
||||
if (this.endDateInput && this.initialFilters.end_date) {
|
||||
this.endDateInput.value = this.initialFilters.end_date;
|
||||
}
|
||||
if (this.searchTermInput && this.initialFilters.search_term) {
|
||||
this.searchTermInput.value = this.initialFilters.search_term;
|
||||
}
|
||||
}
|
||||
|
||||
applyFilters() {
|
||||
// Collect current filter values
|
||||
const selectedCategories = this.getSelectedCategories();
|
||||
|
||||
this.filters = {
|
||||
category: selectedCategories, // Now an array
|
||||
status: this.statusSelect?.value || "",
|
||||
start_date: this.startDateInput?.value || "",
|
||||
end_date: this.endDateInput?.value || "",
|
||||
search_term: this.searchTermInput?.value || "",
|
||||
};
|
||||
|
||||
// Reset to first page when filters change
|
||||
this.currentPage = 1;
|
||||
|
||||
// Load logs with new filters
|
||||
this.loadLogs();
|
||||
|
||||
// Update URL to reflect current filters (for bookmarking/sharing)
|
||||
this.updateUrl();
|
||||
}
|
||||
|
||||
clearAllFilters() {
|
||||
// Clear all category checkboxes and select all
|
||||
this.categoryCheckboxes.forEach((checkbox) => {
|
||||
checkbox.checked = true; // Default to all selected
|
||||
});
|
||||
if (this.selectAllCategories) {
|
||||
this.selectAllCategories.checked = true;
|
||||
this.selectAllCategories.indeterminate = false;
|
||||
}
|
||||
|
||||
if (this.statusSelect) this.statusSelect.value = "";
|
||||
if (this.startDateInput) this.startDateInput.value = "";
|
||||
if (this.endDateInput) this.endDateInput.value = "";
|
||||
if (this.searchTermInput) this.searchTermInput.value = "";
|
||||
|
||||
// Apply empty filters
|
||||
this.applyFilters();
|
||||
}
|
||||
|
||||
async loadLogs() {
|
||||
if (!this.logsTableBody) return;
|
||||
|
||||
try {
|
||||
// Show loading state
|
||||
this.logsTableBody.innerHTML =
|
||||
'<tr><td colspan="5" class="text-center"><div class="spinner-border spinner-border-sm text-primary" role="status"><span class="visually-hidden">Loading...</span></div> Loading logs...</td></tr>';
|
||||
|
||||
// Build query parameters
|
||||
const params = new URLSearchParams({
|
||||
page: this.currentPage,
|
||||
per_page: this.perPage,
|
||||
});
|
||||
|
||||
// Add filters to query
|
||||
Object.entries(this.filters).forEach(([key, value]) => {
|
||||
if (value) {
|
||||
if (key === "category" && Array.isArray(value)) {
|
||||
// Handle multiple categories
|
||||
value.forEach((cat) => {
|
||||
if (cat) params.append("category", cat);
|
||||
});
|
||||
} else if (value) {
|
||||
params.append(key, value);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Fetch logs from unified API
|
||||
const data = await apiRequest(`/logs/api?${params.toString()}`);
|
||||
|
||||
if (data.success) {
|
||||
this.renderLogs(data.logs);
|
||||
this.updatePagination(data.pagination);
|
||||
console.log("Logs loaded successfully");
|
||||
} else {
|
||||
throw new Error(data.message || "Failed to load logs");
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Failed to load logs:", error);
|
||||
this.logsTableBody.innerHTML =
|
||||
'<tr><td colspan="5" class="text-center text-danger">Error loading logs. Please try again.</td></tr>';
|
||||
this.hidePagination();
|
||||
}
|
||||
}
|
||||
|
||||
renderLogs(logs) {
|
||||
if (!this.logsTableBody) return;
|
||||
|
||||
this.logsTableBody.innerHTML = "";
|
||||
|
||||
if (!logs || logs.length === 0) {
|
||||
this.logsTableBody.innerHTML =
|
||||
'<tr><td colspan="5" class="text-center">No logs found matching the current filters.</td></tr>';
|
||||
return;
|
||||
}
|
||||
|
||||
logs.forEach((log) => {
|
||||
const row = document.createElement("tr");
|
||||
row.className = "log-entry";
|
||||
row.setAttribute("data-log-id", log.id);
|
||||
|
||||
// Format timestamp
|
||||
const timeStr = formatTimestamp(log.timestamp);
|
||||
|
||||
// Create status badge
|
||||
const statusBadge = createStatusBadge(log.status);
|
||||
|
||||
// Create category badge
|
||||
const categoryBadge = this.createCategoryBadge(log.category);
|
||||
|
||||
row.innerHTML = `
|
||||
<td>${timeStr}</td>
|
||||
<td>${categoryBadge}</td>
|
||||
<td>${log.action}</td>
|
||||
<td>${statusBadge}</td>
|
||||
<td>${log.description || ""}</td>
|
||||
`;
|
||||
|
||||
// Add click handler for details modal - whole row is clickable
|
||||
row.addEventListener("click", () => {
|
||||
const url = `/logs/${log.id}/detail`;
|
||||
this.logModal.loadAndShow(url, "Error loading log details.");
|
||||
});
|
||||
|
||||
this.logsTableBody.appendChild(row);
|
||||
});
|
||||
}
|
||||
|
||||
createCategoryBadge(category) {
|
||||
const categoryColors = {
|
||||
gui_interaction: "bg-primary",
|
||||
config_change: "bg-warning",
|
||||
scraper_command: "bg-info",
|
||||
scraper_activity: "bg-success",
|
||||
system: "bg-danger",
|
||||
data_import: "bg-secondary",
|
||||
};
|
||||
|
||||
const colorClass = categoryColors[category] || "bg-secondary";
|
||||
const displayName = category
|
||||
.replace(/_/g, " ")
|
||||
.replace(/\b\w/g, (l) => l.toUpperCase());
|
||||
|
||||
return `<span class="badge ${colorClass}">${displayName}</span>`;
|
||||
}
|
||||
|
||||
updatePagination(pagination) {
|
||||
if (!pagination || !this.paginationContainer) return;
|
||||
|
||||
this.currentPage = pagination.page;
|
||||
this.totalPages = pagination.pages;
|
||||
this.totalEntries = pagination.total;
|
||||
|
||||
// Show pagination container
|
||||
this.paginationContainer.classList.remove("d-none");
|
||||
|
||||
// Update pagination info
|
||||
const startEntry = (pagination.page - 1) * pagination.per_page + 1;
|
||||
const endEntry = Math.min(
|
||||
pagination.page * pagination.per_page,
|
||||
pagination.total
|
||||
);
|
||||
|
||||
if (this.paginationInfo) {
|
||||
this.paginationInfo.textContent = `Showing ${startEntry} - ${endEntry} of ${pagination.total} entries`;
|
||||
}
|
||||
|
||||
// Update current page display
|
||||
if (this.currentPageSpan) {
|
||||
this.currentPageSpan.innerHTML = `<span class="page-link">${pagination.page} of ${pagination.pages}</span>`;
|
||||
}
|
||||
|
||||
// Update previous button
|
||||
if (this.prevPageBtn) {
|
||||
if (pagination.has_prev) {
|
||||
this.prevPageBtn.classList.remove("disabled");
|
||||
this.prevPageBtn.querySelector("a").removeAttribute("tabindex");
|
||||
this.prevPageBtn
|
||||
.querySelector("a")
|
||||
.setAttribute("aria-disabled", "false");
|
||||
} else {
|
||||
this.prevPageBtn.classList.add("disabled");
|
||||
this.prevPageBtn.querySelector("a").setAttribute("tabindex", "-1");
|
||||
this.prevPageBtn
|
||||
.querySelector("a")
|
||||
.setAttribute("aria-disabled", "true");
|
||||
}
|
||||
}
|
||||
|
||||
// Update next button
|
||||
if (this.nextPageBtn) {
|
||||
if (pagination.has_next) {
|
||||
this.nextPageBtn.classList.remove("disabled");
|
||||
this.nextPageBtn.querySelector("a").removeAttribute("tabindex");
|
||||
this.nextPageBtn
|
||||
.querySelector("a")
|
||||
.setAttribute("aria-disabled", "false");
|
||||
} else {
|
||||
this.nextPageBtn.classList.add("disabled");
|
||||
this.nextPageBtn.querySelector("a").setAttribute("tabindex", "-1");
|
||||
this.nextPageBtn
|
||||
.querySelector("a")
|
||||
.setAttribute("aria-disabled", "true");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
hidePagination() {
|
||||
if (this.paginationContainer) {
|
||||
this.paginationContainer.classList.add("d-none");
|
||||
}
|
||||
}
|
||||
|
||||
updateUrl() {
|
||||
// Update URL with current filters for bookmarking
|
||||
const params = new URLSearchParams();
|
||||
|
||||
Object.entries(this.filters).forEach(([key, value]) => {
|
||||
if (value) {
|
||||
if (key === "category" && Array.isArray(value)) {
|
||||
// Handle multiple categories
|
||||
value.forEach((cat) => {
|
||||
if (cat) params.append("category", cat);
|
||||
});
|
||||
} else if (value) {
|
||||
params.append(key, value);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
const newUrl = `${window.location.pathname}${
|
||||
params.toString() ? "?" + params.toString() : ""
|
||||
}`;
|
||||
window.history.replaceState({}, "", newUrl);
|
||||
}
|
||||
|
||||
downloadLogs() {
|
||||
// Build download URL with current filters
|
||||
const params = new URLSearchParams();
|
||||
|
||||
Object.entries(this.filters).forEach(([key, value]) => {
|
||||
if (value) {
|
||||
if (key === "category" && Array.isArray(value)) {
|
||||
// Handle multiple categories
|
||||
value.forEach((cat) => {
|
||||
if (cat) params.append("category", cat);
|
||||
});
|
||||
} else if (value) {
|
||||
params.append(key, value);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
const downloadUrl = `/logs/download${
|
||||
params.toString() ? "?" + params.toString() : ""
|
||||
}`;
|
||||
window.location.href = downloadUrl;
|
||||
}
|
||||
|
||||
refresh() {
|
||||
this.loadLogs();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set modal handler for log details
|
||||
* @param {ModalHandler} modalHandler - Modal handler instance
|
||||
*/
|
||||
setModalHandler(modalHandler) {
|
||||
this.logModal = modalHandler;
|
||||
}
|
||||
}
|
||||
|
||||
// Export for use in other modules
|
||||
if (typeof window !== "undefined") {
|
||||
window.LoggerManager = LoggerManager;
|
||||
}
|
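Usage note (editor's sketch, not part of this changeset): a minimal initialization of LoggerManager, assuming the logger template renders the filter controls referenced in initElements; the category names come from createCategoryBadge above, and the window property name is an assumption.

// Hypothetical initialization — categories and initial filters are placeholders.
document.addEventListener("DOMContentLoaded", () => {
  window.loggerManager = new LoggerManager({
    categories: ["scraper_activity", "system"],
    initialFilters: { status: "success" },
  });
});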
221
scipaperloader/static/js/modal-handler.js
Normal file
@ -0,0 +1,221 @@
|
||||
/**
|
||||
* Modal utilities for handling dynamic content loading
|
||||
*/
|
||||
|
||||
class ModalHandler {
|
||||
constructor(modalId, contentElementId) {
|
||||
this.modalElement = document.getElementById(modalId);
|
||||
this.contentElement = document.getElementById(contentElementId);
|
||||
this.modal = null;
|
||||
|
||||
if (this.modalElement && typeof bootstrap !== "undefined") {
|
||||
this.modal = new bootstrap.Modal(this.modalElement);
|
||||
|
||||
// Set up global event delegation for modal close buttons
|
||||
this.setupGlobalCloseHandlers();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Load content into modal via AJAX and show it
|
||||
* @param {string} url - URL to fetch content from
|
||||
* @param {string} errorMessage - Message to show on error
|
||||
*/
|
||||
async loadAndShow(url, errorMessage = "Error loading content.") {
|
||||
if (!this.modal || !this.contentElement) {
|
||||
console.error("Modal or content element not found");
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const response = await fetch(url);
|
||||
const html = await response.text();
|
||||
this.contentElement.innerHTML = html;
|
||||
|
||||
// Set up close button handlers after content is loaded
|
||||
this.setupCloseHandlers();
|
||||
|
||||
// Format any JSON content in the modal
|
||||
this.formatJsonContent();
|
||||
|
||||
this.modal.show();
|
||||
} catch (error) {
|
||||
console.error("Error loading modal content:", error);
|
||||
this.contentElement.innerHTML = `<div class="modal-body text-danger">${errorMessage}</div>`;
|
||||
this.modal.show();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up click handlers for elements that should open the modal
|
||||
* @param {string} selector - CSS selector for clickable elements
|
||||
* @param {string} urlAttribute - Attribute name containing the URL (default: 'data-url')
|
||||
*/
|
||||
setupClickHandlers(selector, urlAttribute = "data-url") {
|
||||
document.addEventListener("DOMContentLoaded", () => {
|
||||
document.querySelectorAll(selector).forEach((element) => {
|
||||
element.addEventListener("click", (e) => {
|
||||
e.preventDefault();
|
||||
const url = element.getAttribute(urlAttribute);
|
||||
if (url) {
|
||||
this.loadAndShow(url);
|
||||
}
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Show the modal with custom content
|
||||
* @param {string} content - HTML content to display
|
||||
*/
|
||||
showWithContent(content) {
|
||||
if (!this.modal || !this.contentElement) return;
|
||||
|
||||
this.contentElement.innerHTML = content;
|
||||
|
||||
// Set up close button handlers after content is loaded
|
||||
this.setupCloseHandlers();
|
||||
|
||||
this.modal.show();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up global event delegation for modal close buttons
|
||||
*/
|
||||
setupGlobalCloseHandlers() {
|
||||
// Use event delegation to handle dynamically loaded close buttons
|
||||
this.modalElement.addEventListener("click", (e) => {
|
||||
if (
|
||||
e.target.matches('[data-bs-dismiss="modal"]') ||
|
||||
e.target.closest('[data-bs-dismiss="modal"]') ||
|
||||
e.target.matches(".btn-close") ||
|
||||
e.target.closest(".btn-close")
|
||||
) {
|
||||
e.preventDefault();
|
||||
this.hide();
|
||||
}
|
||||
});
|
||||
|
||||
// Handle ESC key press
|
||||
document.addEventListener("keydown", (e) => {
|
||||
if (
|
||||
e.key === "Escape" &&
|
||||
this.modal &&
|
||||
this.modalElement.classList.contains("show")
|
||||
) {
|
||||
this.hide();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up close button event handlers for dynamically loaded content
|
||||
*/
|
||||
setupCloseHandlers() {
|
||||
// This method is now mostly redundant due to global event delegation
|
||||
// but we'll keep it for backward compatibility
|
||||
|
||||
// Handle close buttons with data-bs-dismiss="modal"
|
||||
const closeButtons = this.contentElement.querySelectorAll(
|
||||
'[data-bs-dismiss="modal"]'
|
||||
);
|
||||
closeButtons.forEach((button) => {
|
||||
button.addEventListener("click", (e) => {
|
||||
e.preventDefault();
|
||||
this.hide();
|
||||
});
|
||||
});
|
||||
|
||||
// Handle close buttons with .btn-close class
|
||||
const closeButtonsClass =
|
||||
this.contentElement.querySelectorAll(".btn-close");
|
||||
closeButtonsClass.forEach((button) => {
|
||||
button.addEventListener("click", (e) => {
|
||||
e.preventDefault();
|
||||
this.hide();
|
||||
});
|
||||
});
|
||||
|
||||
// Also handle ESC key press
|
||||
document.addEventListener("keydown", (e) => {
|
||||
if (
|
||||
e.key === "Escape" &&
|
||||
this.modal &&
|
||||
this.modalElement.classList.contains("show")
|
||||
) {
|
||||
this.hide();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Format JSON content in the modal after it's loaded
|
||||
*/
|
||||
formatJsonContent() {
|
||||
// Format JSON in extra data if present
|
||||
const extraDataElement = this.contentElement.querySelector(
|
||||
"#extra-data-content"
|
||||
);
|
||||
if (extraDataElement && extraDataElement.textContent.trim()) {
|
||||
try {
|
||||
const jsonData = JSON.parse(extraDataElement.textContent);
|
||||
|
||||
// Pretty-format the JSON with proper indentation
|
||||
const formattedJson = JSON.stringify(jsonData, null, 2);
|
||||
extraDataElement.textContent = formattedJson;
|
||||
|
||||
// Add syntax highlighting classes if the JSON is complex
|
||||
if (typeof jsonData === "object" && jsonData !== null) {
|
||||
extraDataElement.parentElement.classList.add("json-formatted");
|
||||
}
|
||||
} catch (e) {
|
||||
// If it's not valid JSON, leave it as is but still format if it looks like JSON
|
||||
const text = extraDataElement.textContent.trim();
|
||||
if (text.startsWith("{") || text.startsWith("[")) {
|
||||
// Try to fix common JSON issues and reformat
|
||||
try {
|
||||
const fixedJson = text
|
||||
.replace(/'/g, '"')
|
||||
.replace(/None/g, "null")
|
||||
.replace(/True/g, "true")
|
||||
.replace(/False/g, "false");
|
||||
const parsed = JSON.parse(fixedJson);
|
||||
extraDataElement.textContent = JSON.stringify(parsed, null, 2);
|
||||
} catch (fixError) {
|
||||
// If still can't parse, just leave as is
|
||||
console.debug("Extra data is not valid JSON:", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Also format old_value and new_value if they contain JSON
|
||||
const preElements = this.contentElement.querySelectorAll("pre code");
|
||||
preElements.forEach(function (codeElement) {
|
||||
if (codeElement && codeElement.textContent.trim()) {
|
||||
const text = codeElement.textContent.trim();
|
||||
if (
|
||||
(text.startsWith("{") && text.endsWith("}")) ||
|
||||
(text.startsWith("[") && text.endsWith("]"))
|
||||
) {
|
||||
try {
|
||||
const jsonData = JSON.parse(text);
|
||||
codeElement.textContent = JSON.stringify(jsonData, null, 2);
|
||||
} catch (e) {
|
||||
// Not JSON, leave as is
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Hide the modal
|
||||
*/
|
||||
hide() {
|
||||
if (this.modal) {
|
||||
this.modal.hide();
|
||||
}
|
||||
}
|
||||
}
|
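Usage note (editor's sketch, not part of this changeset): the element IDs below match those used by LoggerManager above; the selector and the detail URL are illustrative assumptions.

// Hypothetical usage — ".log-entry-link" and "/logs/42/detail" are placeholders.
const detailModal = new ModalHandler("logDetailModal", "log-detail-content");
detailModal.setupClickHandlers(".log-entry-link"); // opens the modal for elements carrying a data-url attribute
detailModal.loadAndShow("/logs/42/detail", "Error loading log details.");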
315
scipaperloader/static/js/paper-processor.js
Normal file
@ -0,0 +1,315 @@
|
||||
/**
|
||||
* Paper search and processing functionality
|
||||
*/
|
||||
|
||||
class PaperProcessor {
|
||||
constructor() {
|
||||
// DOM elements
|
||||
this.searchForm = document.getElementById("searchPaperForm");
|
||||
this.searchInput = document.getElementById("paperSearchInput");
|
||||
this.searchResults = document.getElementById("searchResults");
|
||||
this.paperSearchResults = document.getElementById("paperSearchResults");
|
||||
this.scraperSelect = document.getElementById("scraperSelect");
|
||||
|
||||
this.initEventListeners();
|
||||
this.loadAvailableScrapers();
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize event listeners
|
||||
*/
|
||||
initEventListeners() {
|
||||
if (this.searchForm) {
|
||||
this.searchForm.addEventListener("submit", (e) => {
|
||||
e.preventDefault();
|
||||
this.searchPapers();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Load available scraper modules
|
||||
*/
|
||||
async loadAvailableScrapers() {
|
||||
if (!this.scraperSelect) return;
|
||||
|
||||
try {
|
||||
const data = await apiRequest("/scraper/available_scrapers");
|
||||
|
||||
if (data.success && data.scrapers && data.scrapers.length > 0) {
|
||||
// Clear previous options except the default one
|
||||
while (this.scraperSelect.options.length > 1) {
|
||||
this.scraperSelect.remove(1);
|
||||
}
|
||||
|
||||
// Add each scraper as an option
|
||||
data.scrapers.forEach((scraper) => {
|
||||
const option = document.createElement("option");
|
||||
option.value = scraper.name;
|
||||
option.textContent = `${
|
||||
scraper.name
|
||||
} - ${scraper.description.substring(0, 50)}${
|
||||
scraper.description.length > 50 ? "..." : ""
|
||||
}`;
|
||||
if (scraper.is_current) {
|
||||
option.textContent += " (system default)";
|
||||
}
|
||||
this.scraperSelect.appendChild(option);
|
||||
});
|
||||
} else {
|
||||
// If no scrapers or error, add a note
|
||||
const option = document.createElement("option");
|
||||
option.disabled = true;
|
||||
option.textContent = "No scrapers available";
|
||||
this.scraperSelect.appendChild(option);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error loading scrapers:", error);
|
||||
const option = document.createElement("option");
|
||||
option.disabled = true;
|
||||
option.textContent = "Error loading scrapers";
|
||||
this.scraperSelect.appendChild(option);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Search for papers
|
||||
*/
|
||||
async searchPapers() {
|
||||
if (!this.searchInput || !this.paperSearchResults || !this.searchResults)
|
||||
return;
|
||||
|
||||
const query = this.searchInput.value.trim();
|
||||
|
||||
if (!query) {
|
||||
showFlashMessage("Please enter a search term", "warning");
|
||||
return;
|
||||
}
|
||||
|
||||
// Show loading message
|
||||
this.paperSearchResults.innerHTML =
|
||||
'<tr><td colspan="5" class="text-center">Searching papers...</td></tr>';
|
||||
this.searchResults.classList.remove("d-none");
|
||||
|
||||
try {
|
||||
const data = await apiRequest(
|
||||
`/api/papers?query=${encodeURIComponent(query)}`
|
||||
);
|
||||
|
||||
if (!data.papers || data.papers.length === 0) {
|
||||
this.paperSearchResults.innerHTML =
|
||||
'<tr><td colspan="5" class="text-center">No papers found matching your search</td></tr>';
|
||||
return;
|
||||
}
|
||||
|
||||
this.paperSearchResults.innerHTML = "";
|
||||
|
||||
data.papers.forEach((paper) => {
|
||||
const row = document.createElement("tr");
|
||||
|
||||
// Create status badge
|
||||
const statusBadge = createStatusBadge(paper.status);
|
||||
|
||||
// Create process button (enabled only for papers not in 'Pending' status)
|
||||
const processButtonDisabled =
|
||||
paper.status === "Pending" ? "disabled" : "";
|
||||
|
||||
// Truncate title if too long
|
||||
const truncatedTitle = truncateText(paper.title, 70);
|
||||
|
||||
row.innerHTML = `
|
||||
<td>${paper.id}</td>
|
||||
<td title="${paper.title}">${truncatedTitle}</td>
|
||||
<td>${paper.doi || "N/A"}</td>
|
||||
<td>${statusBadge}</td>
|
||||
<td>
|
||||
<button class="btn btn-sm btn-primary process-paper-btn"
|
||||
data-paper-id="${paper.id}"
|
||||
${processButtonDisabled}>
|
||||
Process Now
|
||||
</button>
|
||||
</td>
|
||||
`;
|
||||
|
||||
this.paperSearchResults.appendChild(row);
|
||||
});
|
||||
|
||||
// Add event listeners to the process buttons
|
||||
document.querySelectorAll(".process-paper-btn").forEach((btn) => {
|
||||
btn.addEventListener("click", () => {
|
||||
this.processSinglePaper(btn.getAttribute("data-paper-id"));
|
||||
});
|
||||
});
|
||||
} catch (error) {
|
||||
console.error("Error searching papers:", error);
|
||||
this.paperSearchResults.innerHTML =
|
||||
'<tr><td colspan="5" class="text-center">Error searching papers</td></tr>';
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a single paper
|
||||
* @param {string} paperId - The ID of the paper to process
|
||||
*/
|
||||
async processSinglePaper(paperId) {
|
||||
if (!this.scraperSelect) return;
|
||||
|
||||
// Disable all process buttons to prevent multiple clicks
|
||||
document.querySelectorAll(".process-paper-btn").forEach((btn) => {
|
||||
btn.disabled = true;
|
||||
});
|
||||
|
||||
// Show processing status via flash message
|
||||
showFlashMessage("Processing paper...", "info");
|
||||
|
||||
// Get selected scraper
|
||||
const selectedScraper = this.scraperSelect.value;
|
||||
|
||||
try {
|
||||
const data = await apiRequest(`/scraper/process_single/${paperId}`, {
|
||||
method: "POST",
|
||||
body: JSON.stringify({
|
||||
scraper_module: selectedScraper,
|
||||
}),
|
||||
});
|
||||
|
||||
if (data.success) {
|
||||
// Update status in the search results
|
||||
const row = document
|
||||
.querySelector(`.process-paper-btn[data-paper-id="${paperId}"]`)
|
||||
?.closest("tr");
|
||||
if (row) {
|
||||
const statusCell = row.querySelector("td:nth-child(4)");
|
||||
if (statusCell) {
|
||||
statusCell.innerHTML = createStatusBadge("Pending");
|
||||
}
|
||||
}
|
||||
|
||||
// Show success notification
|
||||
showFlashMessage(data.message, "success");
|
||||
|
||||
// Set up polling to check paper status and refresh activity
|
||||
this.pollPaperStatus(paperId, 3000, 20);
|
||||
} else {
|
||||
showFlashMessage(data.message, "error");
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error processing paper:", error);
|
||||
showFlashMessage("Error processing paper", "error");
|
||||
} finally {
|
||||
// Re-enable the process buttons after a short delay
|
||||
setTimeout(() => {
|
||||
document.querySelectorAll(".process-paper-btn").forEach((btn) => {
|
||||
if (btn.getAttribute("data-paper-id") !== paperId) {
|
||||
btn.disabled = false;
|
||||
}
|
||||
});
|
||||
}, 1000);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Poll paper status until it changes from Pending
|
||||
* @param {string} paperId - The paper ID to poll
|
||||
* @param {number} interval - Polling interval in milliseconds
|
||||
* @param {number} maxAttempts - Maximum number of polling attempts
|
||||
*/
|
||||
pollPaperStatus(paperId, interval = 3000, maxAttempts = 20) {
|
||||
let attempts = 0;
|
||||
|
||||
// Immediately refresh activity log to show the initial pending status
|
||||
if (this.onActivityRefresh) {
|
||||
this.onActivityRefresh();
|
||||
}
|
||||
|
||||
const checkStatus = async () => {
|
||||
attempts++;
|
||||
console.log(
|
||||
`Checking status of paper ${paperId}, attempt ${attempts}/${maxAttempts}`
|
||||
);
|
||||
|
||||
try {
|
||||
const data = await apiRequest(`/api/papers/${paperId}`);
|
||||
|
||||
if (data && data.paper) {
|
||||
const paper = data.paper;
|
||||
console.log(`Paper status: ${paper.status}`);
|
||||
|
||||
// Update the UI with the current status
|
||||
const row = document
|
||||
.querySelector(`.process-paper-btn[data-paper-id="${paperId}"]`)
|
||||
?.closest("tr");
|
||||
if (row) {
|
||||
const statusCell = row.querySelector("td:nth-child(4)");
|
||||
if (statusCell) {
|
||||
statusCell.innerHTML = createStatusBadge(paper.status);
|
||||
}
|
||||
|
||||
// Update processing status message if status changed
|
||||
if (paper.status !== "Pending") {
|
||||
if (paper.status === "Done") {
|
||||
showFlashMessage(
|
||||
`Paper processed successfully: ${paper.title}`,
|
||||
"success"
|
||||
);
|
||||
} else if (paper.status === "Failed") {
|
||||
showFlashMessage(
|
||||
`Paper processing failed: ${
|
||||
paper.error_msg || "Unknown error"
|
||||
}`,
|
||||
"error"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Always refresh activity log
|
||||
if (this.onActivityRefresh) {
|
||||
this.onActivityRefresh();
|
||||
}
|
||||
|
||||
// If status is still pending and we haven't reached max attempts, check again
|
||||
if (paper.status === "Pending" && attempts < maxAttempts) {
|
||||
setTimeout(checkStatus, interval);
|
||||
} else {
|
||||
// If status changed or we reached max attempts, refresh chart data too
|
||||
if (this.onChartRefresh) {
|
||||
this.onChartRefresh();
|
||||
}
|
||||
|
||||
// If we hit max attempts but status is still pending, show a message
|
||||
if (paper.status === "Pending" && attempts >= maxAttempts) {
|
||||
showFlashMessage(
|
||||
"Paper is still being processed. Check the activity log for updates.",
|
||||
"info"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error polling paper status: ${error}`);
|
||||
// If there's an error, we can still try again if under max attempts
|
||||
if (attempts < maxAttempts) {
|
||||
setTimeout(checkStatus, interval);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Start checking
|
||||
setTimeout(checkStatus, interval);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set callback for activity refresh
|
||||
*/
|
||||
setActivityRefreshCallback(callback) {
|
||||
this.onActivityRefresh = callback;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set callback for chart refresh
|
||||
*/
|
||||
setChartRefreshCallback(callback) {
|
||||
this.onChartRefresh = callback;
|
||||
}
|
||||
}
|
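Usage note (editor's sketch, not part of this changeset): PaperProcessor is normally wired up by ScraperDashboard further down; standalone use would look roughly like this, with the callback targets assumed to exist.

// Hypothetical standalone wiring — activityMonitor and activityChart are assumed instances.
const paperProcessor = new PaperProcessor();
paperProcessor.setActivityRefreshCallback(() => activityMonitor.loadRecentActivity());
paperProcessor.setChartRefreshCallback(() => activityChart.loadData(24));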
335
scipaperloader/static/js/scraper-control.js
Normal file
@ -0,0 +1,335 @@
|
||||
/**
|
||||
* Scraper control functionality
|
||||
*/
|
||||
|
||||
class ScraperController {
|
||||
constructor(options = {}) {
|
||||
this.maxVolume = options.maxVolume || 1000;
|
||||
this.volumeConfig = options.volumeConfig || 100;
|
||||
|
||||
// DOM elements
|
||||
this.statusIndicator = document.getElementById("statusIndicator");
|
||||
this.statusText = document.getElementById("statusText");
|
||||
this.startButton = document.getElementById("startButton");
|
||||
this.pauseButton = document.getElementById("pauseButton");
|
||||
this.stopButton = document.getElementById("stopButton");
|
||||
this.resetButton = document.getElementById("resetButton");
|
||||
|
||||
this.initEventListeners();
|
||||
this.initStatusPolling();
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize event listeners for scraper controls
|
||||
*/
|
||||
initEventListeners() {
|
||||
if (this.startButton) {
|
||||
this.startButton.addEventListener("click", () => this.startScraper());
|
||||
}
|
||||
if (this.pauseButton) {
|
||||
this.pauseButton.addEventListener("click", () =>
|
||||
this.togglePauseScraper()
|
||||
);
|
||||
}
|
||||
if (this.stopButton) {
|
||||
this.stopButton.addEventListener("click", () => this.stopScraper());
|
||||
}
|
||||
if (this.resetButton) {
|
||||
this.resetButton.addEventListener("click", () => this.resetScraper());
|
||||
}
|
||||
|
||||
// Configuration form (handles both volume and scraper module)
|
||||
const configForm = document.getElementById("volumeForm");
|
||||
if (configForm) {
|
||||
configForm.addEventListener("submit", (e) => {
|
||||
e.preventDefault();
|
||||
this.updateConfiguration();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize status polling
|
||||
*/
|
||||
initStatusPolling() {
|
||||
this.updateStatus();
|
||||
setInterval(() => this.updateStatus(), 5000); // Poll every 5 seconds
|
||||
}
|
||||
|
||||
/**
|
||||
* Update scraper status display
|
||||
*/
|
||||
async updateStatus() {
|
||||
try {
|
||||
const data = await apiRequest("/scraper/status");
|
||||
console.log("Status data received:", data);
|
||||
|
||||
// Remove all status classes first
|
||||
if (this.statusIndicator) {
|
||||
this.statusIndicator.classList.remove(
|
||||
"status-active",
|
||||
"status-paused",
|
||||
"status-inactive"
|
||||
);
|
||||
}
|
||||
|
||||
// Handle the new JSON structure with scraper_state
|
||||
const scraperState = data.scraper_state || data; // Fallback for old structure
|
||||
|
||||
if (scraperState.active) {
|
||||
if (scraperState.paused) {
|
||||
this.statusIndicator?.classList.add("status-paused");
|
||||
if (this.statusText) this.statusText.textContent = "Paused";
|
||||
if (this.pauseButton) this.pauseButton.textContent = "Resume";
|
||||
} else {
|
||||
this.statusIndicator?.classList.add("status-active");
|
||||
if (this.statusText) this.statusText.textContent = "Active";
|
||||
if (this.pauseButton) this.pauseButton.textContent = "Pause";
|
||||
}
|
||||
if (this.startButton) this.startButton.disabled = true;
|
||||
if (this.pauseButton) this.pauseButton.disabled = false;
|
||||
if (this.stopButton) this.stopButton.disabled = false;
|
||||
if (this.resetButton) this.resetButton.disabled = false;
|
||||
} else {
|
||||
this.statusIndicator?.classList.add("status-inactive");
|
||||
if (this.statusText) this.statusText.textContent = "Inactive";
|
||||
if (this.startButton) this.startButton.disabled = false;
|
||||
if (this.pauseButton) this.pauseButton.disabled = true;
|
||||
if (this.stopButton) this.stopButton.disabled = true;
|
||||
if (this.resetButton) this.resetButton.disabled = false;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error fetching status:", error);
|
||||
// On error, show inactive state
|
||||
if (this.statusIndicator) {
|
||||
this.statusIndicator.classList.remove(
|
||||
"status-active",
|
||||
"status-paused",
|
||||
"status-inactive"
|
||||
);
|
||||
this.statusIndicator.classList.add("status-inactive");
|
||||
}
|
||||
if (this.statusText) this.statusText.textContent = "Error";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Start the scraper
|
||||
*/
|
||||
async startScraper() {
|
||||
console.log("Start button clicked - sending request to /scraper/start");
|
||||
|
||||
try {
|
||||
const data = await apiRequest("/scraper/start", {
|
||||
method: "POST",
|
||||
body: JSON.stringify({}),
|
||||
});
|
||||
console.log("Data received:", data);
|
||||
|
||||
if (data.success) {
|
||||
showFlashMessage("Scraper started successfully", "success");
|
||||
this.updateStatus();
|
||||
// Trigger activity refresh if callback is provided
|
||||
if (this.onActivityRefresh) {
|
||||
setTimeout(() => this.onActivityRefresh(), 1000);
|
||||
}
|
||||
} else {
|
||||
showFlashMessage(data.message, "error");
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error starting scraper:", error);
|
||||
showFlashMessage("Error starting scraper: " + error.message, "error");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Toggle pause/resume scraper
|
||||
*/
|
||||
async togglePauseScraper() {
|
||||
try {
|
||||
const data = await apiRequest("/scraper/pause", {
|
||||
method: "POST",
|
||||
body: JSON.stringify({}),
|
||||
});
|
||||
|
||||
if (data.success) {
|
||||
showFlashMessage(data.message, "info");
|
||||
this.updateStatus();
|
||||
if (this.onActivityRefresh) {
|
||||
setTimeout(() => this.onActivityRefresh(), 1000);
|
||||
}
|
||||
} else {
|
||||
showFlashMessage(data.message, "error");
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error toggling pause:", error);
|
||||
showFlashMessage("Error controlling scraper: " + error.message, "error");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Stop the scraper
|
||||
*/
|
||||
async stopScraper() {
|
||||
try {
|
||||
const data = await apiRequest("/scraper/stop", {
|
||||
method: "POST",
|
||||
body: JSON.stringify({}),
|
||||
});
|
||||
|
||||
if (data.success) {
|
||||
showFlashMessage("Scraper stopped successfully", "warning");
|
||||
this.updateStatus();
|
||||
if (this.onActivityRefresh) {
|
||||
setTimeout(() => this.onActivityRefresh(), 1000);
|
||||
}
|
||||
} else {
|
||||
showFlashMessage(data.message, "error");
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error stopping scraper:", error);
|
||||
showFlashMessage("Error stopping scraper: " + error.message, "error");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset the scraper
|
||||
*/
|
||||
async resetScraper() {
|
||||
if (
|
||||
!confirm(
|
||||
"Are you sure you want to reset the scraper? This will stop all current tasks, optionally clear non-pending papers, and restart the scraper."
|
||||
)
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Disable button to prevent multiple clicks
|
||||
if (this.resetButton) this.resetButton.disabled = true;
|
||||
|
||||
// Show a loading message
|
||||
showFlashMessage("Resetting scraper, please wait...", "info");
|
||||
|
||||
try {
|
||||
const data = await apiRequest("/scraper/reset", {
|
||||
method: "POST",
|
||||
body: JSON.stringify({
|
||||
clear_papers: true, // You could make this configurable with a checkbox
|
||||
}),
|
||||
});
|
||||
|
||||
if (data.success) {
|
||||
showFlashMessage(
|
||||
"Scraper has been completely reset and restarted",
|
||||
"success"
|
||||
);
|
||||
// Update everything
|
||||
this.updateStatus();
|
||||
if (this.onActivityRefresh) {
|
||||
this.onActivityRefresh();
|
||||
setTimeout(() => this.onActivityRefresh(), 1000);
|
||||
}
|
||||
if (this.onChartRefresh) {
|
||||
this.onChartRefresh();
|
||||
}
|
||||
} else {
|
||||
showFlashMessage(data.message || "Error resetting scraper", "error");
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error resetting scraper:", error);
|
||||
showFlashMessage("Error resetting scraper: " + error.message, "error");
|
||||
} finally {
|
||||
// Re-enable button
|
||||
if (this.resetButton) this.resetButton.disabled = false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update configuration (volume and/or scraper module)
|
||||
*/
|
||||
async updateConfiguration() {
|
||||
const volumeInput = document.getElementById("volumeInput");
|
||||
const scraperSelect = document.getElementById("mainScraperSelect");
|
||||
const submitButton = document.querySelector(
|
||||
'#volumeForm button[type="submit"]'
|
||||
);
|
||||
|
||||
if (!submitButton) return;
|
||||
|
||||
const updates = {};
|
||||
let hasChanges = false;
|
||||
|
||||
// Check volume changes
|
||||
if (volumeInput) {
|
||||
const volume = volumeInput.value;
|
||||
|
||||
// Basic validation
|
||||
if (!volume || volume < 1 || volume > this.maxVolume) {
|
||||
showFlashMessage(
|
||||
`Please enter a valid volume between 1 and ${this.maxVolume}`,
|
||||
"warning"
|
||||
);
|
||||
volumeInput.focus();
|
||||
return;
|
||||
}
|
||||
|
||||
updates.volume = volume;
|
||||
hasChanges = true;
|
||||
}
|
||||
|
||||
// Check scraper module changes
|
||||
if (scraperSelect && scraperSelect.value) {
|
||||
updates.scraper_module = scraperSelect.value;
|
||||
hasChanges = true;
|
||||
}
|
||||
|
||||
if (!hasChanges) {
|
||||
showFlashMessage("No changes to save", "info");
|
||||
return;
|
||||
}
|
||||
|
||||
// Toggle loading state
|
||||
toggleButtonLoading(submitButton, true, "Updating...");
|
||||
|
||||
try {
|
||||
const data = await apiRequest("/scraper/update_config", {
|
||||
method: "POST",
|
||||
body: JSON.stringify(updates),
|
||||
});
|
||||
|
||||
if (data.success) {
|
||||
showFlashMessage(
|
||||
data.message || "Configuration updated successfully",
|
||||
"success"
|
||||
);
|
||||
} else {
|
||||
showFlashMessage(
|
||||
data.message || "Failed to update configuration",
|
||||
"error"
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error updating configuration:", error);
|
||||
showFlashMessage(
|
||||
"Network error while updating configuration. Please try again.",
|
||||
"error"
|
||||
);
|
||||
} finally {
|
||||
toggleButtonLoading(submitButton, false);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set callback for activity refresh
|
||||
*/
|
||||
setActivityRefreshCallback(callback) {
|
||||
this.onActivityRefresh = callback;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set callback for chart refresh
|
||||
*/
|
||||
setChartRefreshCallback(callback) {
|
||||
this.onChartRefresh = callback;
|
||||
}
|
||||
}
|
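Usage note (editor's sketch, not part of this changeset): the limits below mirror the dashboard defaults; standalone construction is shown only for illustration, and the refresh target is assumed.

// Hypothetical standalone setup — in practice ScraperDashboard constructs this with template values.
const scraperController = new ScraperController({ maxVolume: 1000, volumeConfig: 100 });
scraperController.setActivityRefreshCallback(() => activityMonitor.loadRecentActivity());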
87
scipaperloader/static/js/scraper-dashboard.js
Normal file
@ -0,0 +1,87 @@
/**
 * Main scraper dashboard initialization and coordination
 */

class ScraperDashboard {
  constructor(config = {}) {
    this.config = {
      maxVolume: config.maxVolume || 1000,
      volumeConfig: config.volumeConfig || 100,
      currentTimeRange: 24,
    };

    this.initComponents();
    this.setupCallbacks();
    this.initializeData();
  }

  /**
   * Initialize all dashboard components
   */
  initComponents() {
    // Initialize chart
    this.activityChart = new ActivityChart("activityChart");

    // Initialize scraper controller
    this.scraperController = new ScraperController({
      maxVolume: this.config.maxVolume,
      volumeConfig: this.config.volumeConfig,
    });

    // Initialize paper processor
    this.paperProcessor = new PaperProcessor();

    // Initialize activity monitor
    this.activityMonitor = new ActivityMonitor();
  }

  /**
   * Setup callbacks between components
   */
  setupCallbacks() {
    // Set up activity refresh callbacks
    const activityRefreshCallback = () =>
      this.activityMonitor.loadRecentActivity();
    this.scraperController.setActivityRefreshCallback(activityRefreshCallback);
    this.paperProcessor.setActivityRefreshCallback(activityRefreshCallback);

    // Set up chart refresh callbacks
    const chartRefreshCallback = (timeRange = this.config.currentTimeRange) => {
      this.config.currentTimeRange = timeRange;
      this.activityChart.loadData(timeRange);
    };
    this.scraperController.setChartRefreshCallback(chartRefreshCallback);
    this.paperProcessor.setChartRefreshCallback(chartRefreshCallback);
    this.activityMonitor.setChartRefreshCallback(chartRefreshCallback);
  }

  /**
   * Initialize data on page load
   */
  initializeData() {
    // Load recent activity
    this.activityMonitor.loadRecentActivity();

    // Load chart data after a short delay to ensure Chart.js is loaded
    setTimeout(() => {
      this.activityChart.loadData(this.config.currentTimeRange);
    }, 100);
  }

  /**
   * Refresh all dashboard data
   */
  refreshAll() {
    this.activityMonitor.loadRecentActivity();
    this.activityChart.loadData(this.config.currentTimeRange);
    this.scraperController.updateStatus();
  }
}

/**
 * Initialize the scraper dashboard
 * @param {Object} config - Configuration object with Jinja variables
 */
function initScraperDashboard(config = {}) {
  return new ScraperDashboard(config);
}
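Usage note (editor's sketch, not part of this changeset): a template would typically call initScraperDashboard on DOMContentLoaded; the Jinja variable names are assumptions.

// Hypothetical template usage — {{ max_volume }} / {{ volume_config }} are assumed Jinja variables.
document.addEventListener("DOMContentLoaded", () => {
  window.dashboard = initScraperDashboard({
    maxVolume: 1000,   // e.g. {{ max_volume }}
    volumeConfig: 100, // e.g. {{ volume_config }}
  });
});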
500
scipaperloader/static/js/scraper-overview.js
Normal file
@ -0,0 +1,500 @@
|
||||
/**
|
||||
* Scraper Overview functionality
|
||||
*/
|
||||
|
||||
class ScraperOverview {
|
||||
constructor() {
|
||||
this.modal = null;
|
||||
this.scrapers = [];
|
||||
this.systemConfig = {};
|
||||
this.init();
|
||||
}
|
||||
|
||||
init() {
|
||||
// Initialize modal reference
|
||||
this.modal = document.getElementById("scraperOverviewModal");
|
||||
|
||||
// Load data when modal is shown
|
||||
if (this.modal) {
|
||||
this.modal.addEventListener("show.bs.modal", () => {
|
||||
this.loadScraperOverview();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
async loadScraperOverview() {
|
||||
const loadingEl = document.getElementById("scraperOverviewLoading");
|
||||
const errorEl = document.getElementById("scraperOverviewError");
|
||||
const contentEl = document.getElementById("scraperOverviewContent");
|
||||
|
||||
// Show loading state
|
||||
loadingEl?.classList.remove("d-none");
|
||||
errorEl?.classList.add("d-none");
|
||||
contentEl?.classList.add("d-none");
|
||||
|
||||
try {
|
||||
// Load scrapers, system config, and publishers in parallel
|
||||
const [scrapersResponse, statusResponse, publishersResponse] =
|
||||
await Promise.all([
|
||||
fetch("/scraper/scrapers"),
|
||||
fetch("/scraper/status"),
|
||||
fetch("/scraper/publishers"),
|
||||
]);
|
||||
|
||||
if (
|
||||
!scrapersResponse.ok ||
|
||||
!statusResponse.ok ||
|
||||
!publishersResponse.ok
|
||||
) {
|
||||
throw new Error("Failed to load scraper information");
|
||||
}
|
||||
|
||||
const scrapersData = await scrapersResponse.json();
|
||||
const statusData = await statusResponse.json();
|
||||
const publishersData = await publishersResponse.json();
|
||||
|
||||
if (
|
||||
!scrapersData.success ||
|
||||
!statusData.success ||
|
||||
!publishersData.success
|
||||
) {
|
||||
throw new Error(
|
||||
scrapersData.message ||
|
||||
statusData.message ||
|
||||
publishersData.message ||
|
||||
"Unknown error"
|
||||
);
|
||||
}
|
||||
|
||||
this.scrapers = scrapersData.scrapers;
|
||||
this.systemConfig = statusData;
|
||||
this.publishersData = publishersData.data;
|
||||
|
||||
// Update UI
|
||||
this.updateSystemConfig();
|
||||
this.updateScrapersTable();
|
||||
this.updatePublishersSection();
|
||||
this.updateStatusFlowDiagram();
|
||||
|
||||
// Show content
|
||||
loadingEl?.classList.add("d-none");
|
||||
contentEl?.classList.remove("d-none");
|
||||
} catch (error) {
|
||||
console.error("Error loading scraper overview:", error);
|
||||
|
||||
// Show error state
|
||||
loadingEl?.classList.add("d-none");
|
||||
const errorMessage = document.getElementById(
|
||||
"scraperOverviewErrorMessage"
|
||||
);
|
||||
if (errorMessage) {
|
||||
errorMessage.textContent =
|
||||
error.message || "Failed to load scraper information";
|
||||
}
|
||||
errorEl?.classList.remove("d-none");
|
||||
}
|
||||
}
|
||||
|
||||
updateSystemConfig() {
|
||||
// Current scraper module
|
||||
const currentModuleEl = document.getElementById("currentScraperModule");
|
||||
if (currentModuleEl) {
|
||||
const currentModule =
|
||||
this.systemConfig.current_scraper_module || "System Default";
|
||||
currentModuleEl.textContent = currentModule;
|
||||
currentModuleEl.className = "badge bg-primary";
|
||||
}
|
||||
|
||||
// Volume limit
|
||||
const volumeLimitEl = document.getElementById("currentVolumeLimit");
|
||||
if (volumeLimitEl) {
|
||||
const volumeLimit = this.systemConfig.volume_config || "Unknown";
|
||||
volumeLimitEl.textContent = volumeLimit;
|
||||
}
|
||||
|
||||
// Total modules
|
||||
const totalModulesEl = document.getElementById("totalScraperModules");
|
||||
if (totalModulesEl) {
|
||||
totalModulesEl.textContent = this.scrapers.length;
|
||||
}
|
||||
|
||||
// Paper counts summary
|
||||
const paperCountsEl = document.getElementById("paperCountsSummary");
|
||||
if (paperCountsEl && this.systemConfig.paper_counts) {
|
||||
const counts = this.systemConfig.paper_counts;
|
||||
paperCountsEl.innerHTML = `
|
||||
<div class="d-flex flex-wrap gap-2">
|
||||
<span class="badge bg-primary">${counts.new || 0} New</span>
|
||||
<span class="badge bg-warning">${
|
||||
counts.processing || 0
|
||||
} Processing</span>
|
||||
<span class="badge bg-success">${
|
||||
counts.done || 0
|
||||
} Done</span>
|
||||
<span class="badge bg-danger">${
|
||||
counts.failed || 0
|
||||
} Failed</span>
|
||||
<span class="badge bg-info">${
|
||||
counts.pending || 0
|
||||
} Pending</span>
|
||||
<span class="badge bg-secondary">${
|
||||
counts.retrying || 0
|
||||
} Retrying</span>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
}
|
||||
|
||||
updateScrapersTable() {
|
||||
const tbody = document.getElementById("scrapersTableBody");
|
||||
if (!tbody) return;
|
||||
|
||||
tbody.innerHTML = "";
|
||||
|
||||
this.scrapers.forEach((scraper) => {
|
||||
const row = document.createElement("tr");
|
||||
|
||||
// Check if this is the current active scraper
|
||||
const isCurrentScraper =
|
||||
scraper.name === this.systemConfig.current_scraper_module;
|
||||
|
||||
if (scraper.error) {
|
||||
row.innerHTML = `
|
||||
<td>${scraper.name}</td>
|
||||
<td colspan="5" class="text-danger">
|
||||
<i class="fas fa-exclamation-triangle"></i> ${scraper.error}
|
||||
</td>
|
||||
`;
|
||||
} else {
|
||||
row.innerHTML = `
|
||||
<td>
|
||||
<strong>${scraper.name}</strong>
|
||||
${
|
||||
scraper.name === "dummy"
|
||||
? '<span class="badge bg-info ms-2">Test Module</span>'
|
||||
: ""
|
||||
}
|
||||
${
|
||||
isCurrentScraper
|
||||
? '<span class="badge bg-success ms-2"><i class="fas fa-check"></i> Active</span>'
|
||||
: ""
|
||||
}
|
||||
</td>
|
||||
<td class="scraper-description">
|
||||
${this.truncateDescription(scraper.description)}
|
||||
</td>
|
||||
<td class="input-status-list">
|
||||
${this.renderStatusBadges(
|
||||
scraper.input_statuses,
|
||||
"bg-info"
|
||||
)}
|
||||
</td>
|
||||
<td class="status-output">
|
||||
<span class="badge bg-success">${
|
||||
scraper.output_status_success
|
||||
}</span>
|
||||
</td>
|
||||
<td class="status-output">
|
||||
<span class="badge bg-danger">${
|
||||
scraper.output_status_failure
|
||||
}</span>
|
||||
</td>
|
||||
<td class="status-output">
|
||||
<span class="badge bg-warning">${
|
||||
scraper.output_status_processing
|
||||
}</span>
|
||||
</td>
|
||||
`;
|
||||
}
|
||||
|
||||
// Highlight the current scraper row
|
||||
if (isCurrentScraper) {
|
||||
row.classList.add("table-success");
|
||||
}
|
||||
|
||||
tbody.appendChild(row);
|
||||
});
|
||||
}
|
||||
|
||||
updateStatusFlowDiagram() {
|
||||
const diagramEl = document.getElementById("statusFlowDiagram");
|
||||
if (!diagramEl) return;
|
||||
|
||||
// Analyze actual scrapers to build real flow
|
||||
const statusFlow = this.analyzeScraperFlow();
|
||||
|
||||
let diagramHTML = '<div class="status-flow-container">';
|
||||
|
||||
// Create visual flow based on actual scrapers
|
||||
statusFlow.forEach((stage, index) => {
|
||||
if (index > 0) {
|
||||
diagramHTML +=
|
||||
'<div class="status-flow-arrow text-center my-2"><i class="fas fa-arrow-down fa-2x text-muted"></i></div>';
|
||||
}
|
||||
|
||||
diagramHTML += '<div class="status-flow-stage mb-4 p-3 border rounded">';
|
||||
diagramHTML += `<div class="fw-bold mb-2 text-primary">${stage.title}</div>`;
|
||||
|
||||
if (stage.scrapers && stage.scrapers.length > 0) {
|
||||
diagramHTML +=
|
||||
'<div class="mb-2"><small class="text-muted">Handled by: ' +
|
||||
stage.scrapers.map((s) => `<strong>${s}</strong>`).join(", ") +
|
||||
"</small></div>";
|
||||
}
|
||||
|
||||
diagramHTML += '<div class="status-badges">';
|
||||
stage.statuses.forEach((status, statusIndex) => {
|
||||
if (statusIndex > 0) {
|
||||
diagramHTML += '<i class="fas fa-arrow-right status-flow-arrow"></i>';
|
||||
}
|
||||
|
||||
const badgeClass = this.getStatusBadgeClass(status);
|
||||
diagramHTML += `<span class="status-flow-node badge ${badgeClass}">${status}</span>`;
|
||||
});
|
||||
diagramHTML += "</div>";
|
||||
|
||||
if (stage.description) {
|
||||
diagramHTML += `<div class="small text-muted mt-2">${stage.description}</div>`;
|
||||
}
|
||||
|
||||
diagramHTML += "</div>";
|
||||
});
|
||||
|
||||
diagramHTML += "</div>";
|
||||
|
||||
// Add explanation
|
||||
diagramHTML += `
|
||||
<div class="mt-4 p-3 bg-light rounded">
|
||||
<h6><i class="fas fa-info-circle"></i> Flow Explanation:</h6>
|
||||
<ul class="small mb-0">
|
||||
<li><strong>Modular Processing:</strong> Each scraper handles specific input statuses</li>
|
||||
<li><strong>Status Transitions:</strong> Papers move through statuses as they are processed</li>
|
||||
<li><strong>Pipeline Architecture:</strong> Output from one scraper can become input to another</li>
|
||||
<li><strong>Error Handling:</strong> Failed papers can be retried by specialized scrapers</li>
|
||||
<li><strong>Parallel Processing:</strong> Multiple scrapers can work on different papers simultaneously</li>
|
||||
</ul>
|
||||
</div>
|
||||
`;
|
||||
|
||||
diagramEl.innerHTML = diagramHTML;
|
||||
}
|
||||
|
||||
analyzeScraperFlow() {
|
||||
// Build actual flow based on available scrapers
|
||||
const stages = [];
|
||||
const allInputStatuses = new Set();
|
||||
const allOutputStatuses = new Set();
|
||||
const scrapersByInput = {};
|
||||
|
||||
// Analyze scrapers to understand the flow
|
||||
this.scrapers.forEach((scraper) => {
|
||||
if (scraper.input_statuses) {
|
||||
scraper.input_statuses.forEach((status) => {
|
||||
allInputStatuses.add(status);
|
||||
if (!scrapersByInput[status]) {
|
||||
scrapersByInput[status] = [];
|
||||
}
|
||||
scrapersByInput[status].push(scraper.name);
|
||||
});
|
||||
}
|
||||
|
||||
if (scraper.output_status_success)
|
||||
allOutputStatuses.add(scraper.output_status_success);
|
||||
if (scraper.output_status_failure)
|
||||
allOutputStatuses.add(scraper.output_status_failure);
|
||||
});
|
||||
|
||||
// Entry point
|
||||
if (allInputStatuses.has("New")) {
|
||||
stages.push({
|
||||
title: "Entry Point",
|
||||
statuses: ["New"],
|
||||
scrapers: scrapersByInput["New"] || [],
|
||||
description: "Newly uploaded papers enter the processing pipeline",
|
||||
});
|
||||
}
|
||||
|
||||
// Processing stages
|
||||
const processingStatuses = Array.from(allInputStatuses).filter(
|
||||
(status) => !["New", "Done", "Failed"].includes(status)
|
||||
);
|
||||
|
||||
if (processingStatuses.length > 0) {
|
||||
stages.push({
|
||||
title: "Processing Stages",
|
||||
statuses: processingStatuses,
|
||||
scrapers: [],
|
||||
description: "Papers move through various processing stages",
|
||||
});
|
||||
}
|
||||
|
||||
// Final outputs
|
||||
const finalStatuses = ["Done", "Failed"];
|
||||
stages.push({
|
||||
title: "Final States",
|
||||
statuses: finalStatuses.filter((status) => allOutputStatuses.has(status)),
|
||||
scrapers: [],
|
||||
description: "Papers end up in final success or failure states",
|
||||
});
|
||||
|
||||
// Retry handling
|
||||
if (allInputStatuses.has("Failed")) {
|
||||
stages.push({
|
||||
title: "Retry Processing",
|
||||
statuses: ["Failed", "Retrying"],
|
||||
scrapers: scrapersByInput["Failed"] || [],
|
||||
description: "Failed papers can be retried with specialized scrapers",
|
||||
});
|
||||
}
|
||||
|
||||
return stages;
|
||||
}
|
||||
|
||||
getStatusBadgeClass(status) {
|
||||
const statusClasses = {
|
||||
New: "bg-primary",
|
||||
Pending: "bg-warning",
|
||||
Processing: "bg-warning",
|
||||
Retrying: "bg-warning",
|
||||
Done: "bg-success",
|
||||
Failed: "bg-danger",
|
||||
HtmlDownloaded: "bg-info",
|
||||
PublisherDetected: "bg-info",
|
||||
TextExtracted: "bg-info",
|
||||
};
|
||||
|
||||
return statusClasses[status] || "bg-secondary";
|
||||
}
|
||||
|
||||
renderStatusBadges(statuses, defaultClass = "bg-secondary") {
|
||||
if (!Array.isArray(statuses)) return "";
|
||||
|
||||
return statuses
|
||||
.map(
|
||||
(status) =>
|
||||
`<span class="badge ${this.getStatusBadgeClass(
|
||||
status
|
||||
)} status-badge">${status}</span>`
|
||||
)
|
||||
.join("");
|
||||
}
|
||||
|
||||
truncateDescription(description, maxLength = 100) {
|
||||
if (!description) return "No description available";
|
||||
|
||||
if (description.length <= maxLength) return description;
|
||||
|
||||
return description.substring(0, maxLength).trim() + "...";
|
||||
}
|
||||
|
||||
updatePublishersSection() {
|
||||
// Update publisher statistics
|
||||
const publisherStatsEl = document.getElementById("publisherStats");
|
||||
if (publisherStatsEl && this.publishersData && this.publishersData.stats) {
|
||||
const stats = this.publishersData.stats;
|
||||
publisherStatsEl.innerHTML = `
|
||||
<div class="col-md-3">
|
||||
<div class="text-center">
|
||||
<div class="h4 text-primary mb-1">${stats.total_publishers}</div>
|
||||
<div class="text-muted small">Total Publishers</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-md-3">
|
||||
<div class="text-center">
|
||||
<div class="h4 text-success mb-1">${stats.publishers_with_parsers}</div>
|
||||
<div class="text-muted small">With Parsers</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-md-3">
|
||||
<div class="text-center">
|
||||
<div class="h4 text-warning mb-1">${stats.publishers_without_parsers}</div>
|
||||
<div class="text-muted small">Missing Parsers</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-md-3">
|
||||
<div class="text-center">
|
||||
<div class="h4 text-info mb-1">${stats.total_papers_with_publisher}</div>
|
||||
<div class="text-muted small">Papers with Publisher</div>
|
||||
</div>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
// Update publishers table
|
||||
const publishersTableBody = document.getElementById("publishersTableBody");
|
||||
if (
|
||||
publishersTableBody &&
|
||||
this.publishersData &&
|
||||
this.publishersData.publishers
|
||||
) {
|
||||
publishersTableBody.innerHTML = "";
|
||||
|
||||
if (this.publishersData.publishers.length === 0) {
|
||||
publishersTableBody.innerHTML = `
|
||||
<tr>
|
||||
<td colspan="4" class="text-center text-muted py-4">
|
||||
<i class="fas fa-info-circle"></i> No publishers detected yet.<br>
|
||||
<small>Run the publisher_detector scraper to identify publishers from paper URLs.</small>
|
||||
</td>
|
||||
</tr>
|
||||
`;
|
||||
return;
|
||||
}
|
||||
|
||||
this.publishersData.publishers.forEach((publisher) => {
|
||||
const row = document.createElement("tr");
|
||||
|
||||
// Publisher status badge
|
||||
const statusBadge = publisher.has_parser
|
||||
? '<span class="badge bg-success"><i class="fas fa-check"></i> Available</span>'
|
||||
: '<span class="badge bg-warning"><i class="fas fa-exclamation-triangle"></i> Missing</span>';
|
||||
|
||||
// Parser availability indicator
|
||||
const parserIndicator = publisher.has_parser
|
||||
? '<i class="fas fa-check-circle text-success" title="Parser available"></i>'
|
||||
: '<i class="fas fa-times-circle text-warning" title="Parser not available"></i>';
|
||||
|
||||
row.innerHTML = `
|
||||
<td>
|
||||
<strong>${publisher.name}</strong>
|
||||
</td>
|
||||
<td>
|
||||
<span class="badge bg-info">${publisher.paper_count}</span>
|
||||
</td>
|
||||
<td>${statusBadge}</td>
|
||||
<td class="text-center">${parserIndicator}</td>
|
||||
`;
|
||||
|
||||
publishersTableBody.appendChild(row);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Public method to show the modal
|
||||
show() {
|
||||
if (this.modal) {
|
||||
const bootstrapModal = new bootstrap.Modal(this.modal);
|
||||
bootstrapModal.show();
|
||||
}
|
||||
}
|
||||
}
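
To make the flow analysis above concrete, a small worked example with two hypothetical scraper descriptors; the names and statuses below are invented and only use the fields analyzeScraperFlow() actually reads (input_statuses, output_status_success, output_status_failure).

// Illustration only: hypothetical scrapers fed through analyzeScraperFlow().
const demo = new ScraperOverview();
demo.scrapers = [
  { name: "html_downloader", input_statuses: ["New"],
    output_status_success: "HtmlDownloaded", output_status_failure: "Failed" },
  { name: "retry_handler", input_statuses: ["Failed"],
    output_status_success: "Done", output_status_failure: "Failed" },
];
// "New" is an input status -> "Entry Point"; "Done"/"Failed" appear as outputs -> "Final States";
// "Failed" is also an input -> "Retry Processing". No other input statuses exist here,
// so no "Processing Stages" entry is produced.
console.log(demo.analyzeScraperFlow().map((stage) => stage.title));
// ["Entry Point", "Final States", "Retry Processing"]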
|
||||
|
||||
// Global function to load scraper overview (used by retry button)
|
||||
function loadScraperOverview() {
|
||||
if (window.scraperOverview) {
|
||||
window.scraperOverview.loadScraperOverview();
|
||||
}
|
||||
}
|
||||
|
||||
// Global function to show scraper overview modal
|
||||
function showScraperOverview() {
|
||||
if (!window.scraperOverview) {
|
||||
window.scraperOverview = new ScraperOverview();
|
||||
}
|
||||
window.scraperOverview.show();
|
||||
}
|
||||
|
||||
// Initialize when DOM is ready
|
||||
document.addEventListener("DOMContentLoaded", function () {
|
||||
window.scraperOverview = new ScraperOverview();
|
||||
});
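
The loader above expects three JSON endpoints (/scraper/scrapers, /scraper/status, /scraper/publishers). The shapes below are inferred purely from the fields read in loadScraperOverview(), updateSystemConfig(), updateScrapersTable() and updatePublishersSection(); they are a sketch for mocking or manual testing, not the server's documented contract.

// Inferred response shapes (assumption: only the keys consumed above are listed).
const mockScrapersResponse = {
  success: true,
  scrapers: [{
    name: "dummy",
    description: "Test module that simulates downloads",
    input_statuses: ["New"],
    output_status_success: "Done",
    output_status_failure: "Failed",
    output_status_processing: "Processing",
  }],
};

const mockStatusResponse = {
  success: true,
  current_scraper_module: "dummy",
  volume_config: 100,
  paper_counts: { new: 5, processing: 1, done: 10, failed: 2, pending: 0, retrying: 0 },
};

const mockPublishersResponse = {
  success: true,
  data: {
    stats: {
      total_publishers: 3,
      publishers_with_parsers: 2,
      publishers_without_parsers: 1,
      total_papers_with_publisher: 42,
    },
    publishers: [{ name: "example-publisher", paper_count: 7, has_parser: true }],
  },
};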
|
337 scipaperloader/static/js/table-handler.js Normal file
@ -0,0 +1,337 @@
|
||||
/**
|
||||
* Table utilities for handling data tables with pagination, sorting, and filtering
|
||||
*/
|
||||
|
||||
class TableHandler {
|
||||
constructor(tableId, options = {}) {
|
||||
this.table = document.getElementById(tableId);
|
||||
this.options = {
|
||||
enableSorting: true,
|
||||
enableFiltering: true,
|
||||
enablePagination: true,
|
||||
loadingText: "Loading...",
|
||||
noDataText: "No data available",
|
||||
...options,
|
||||
};
|
||||
|
||||
this.currentPage = 1;
|
||||
this.itemsPerPage = options.itemsPerPage || 20;
|
||||
this.sortColumn = null;
|
||||
this.sortDirection = "asc";
|
||||
this.filters = {};
|
||||
|
||||
this.initializeTable();
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize table features
|
||||
*/
|
||||
initializeTable() {
|
||||
if (!this.table) return;
|
||||
|
||||
if (this.options.enableSorting) {
|
||||
this.setupSortingHandlers();
|
||||
}
|
||||
|
||||
if (this.options.enableFiltering) {
|
||||
this.setupFilteringHandlers();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up sorting handlers for table headers
|
||||
*/
|
||||
setupSortingHandlers() {
|
||||
const headers = this.table.querySelectorAll("th[data-sortable]");
|
||||
|
||||
headers.forEach((header) => {
|
||||
header.style.cursor = "pointer";
|
||||
header.addEventListener("click", () => {
|
||||
const column = header.dataset.sortable;
|
||||
this.sortByColumn(column);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Sort table by column
|
||||
* @param {string} column - Column to sort by
|
||||
*/
|
||||
sortByColumn(column) {
|
||||
if (this.sortColumn === column) {
|
||||
this.sortDirection = this.sortDirection === "asc" ? "desc" : "asc";
|
||||
} else {
|
||||
this.sortColumn = column;
|
||||
this.sortDirection = "asc";
|
||||
}
|
||||
|
||||
this.updateSortIndicators();
|
||||
this.refreshData();
|
||||
}
|
||||
|
||||
/**
|
||||
* Update sort direction indicators in table headers
|
||||
*/
|
||||
updateSortIndicators() {
|
||||
// Remove existing sort indicators
|
||||
this.table.querySelectorAll("th .sort-indicator").forEach((indicator) => {
|
||||
indicator.remove();
|
||||
});
|
||||
|
||||
// Add indicator to current sort column
|
||||
if (this.sortColumn) {
|
||||
const header = this.table.querySelector(
|
||||
`th[data-sortable="${this.sortColumn}"]`
|
||||
);
|
||||
if (header) {
|
||||
const indicator = document.createElement("span");
|
||||
indicator.className = "sort-indicator";
|
||||
indicator.innerHTML = this.sortDirection === "asc" ? " ↑" : " ↓";
|
||||
header.appendChild(indicator);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up filtering handlers
|
||||
*/
|
||||
setupFilteringHandlers() {
|
||||
const filterInputs = document.querySelectorAll("[data-table-filter]");
|
||||
|
||||
filterInputs.forEach((input) => {
|
||||
input.addEventListener("input", (e) => {
|
||||
const filterKey = e.target.dataset.tableFilter;
|
||||
this.setFilter(filterKey, e.target.value);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a filter value
|
||||
* @param {string} key - Filter key
|
||||
* @param {string} value - Filter value
|
||||
*/
|
||||
setFilter(key, value) {
|
||||
if (value && value.trim() !== "") {
|
||||
this.filters[key] = value.trim();
|
||||
} else {
|
||||
delete this.filters[key];
|
||||
}
|
||||
|
||||
this.currentPage = 1; // Reset to first page when filtering
|
||||
this.refreshData();
|
||||
}
|
||||
|
||||
/**
|
||||
* Show loading state
|
||||
*/
|
||||
showLoading() {
|
||||
const tbody = this.table.querySelector("tbody");
|
||||
if (tbody) {
|
||||
const colCount = this.table.querySelectorAll("th").length;
|
||||
tbody.innerHTML = `
|
||||
<tr>
|
||||
<td colspan="${colCount}" class="text-center">${this.options.loadingText}</td>
|
||||
</tr>
|
||||
`;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Show no data message
|
||||
*/
|
||||
showNoData() {
|
||||
const tbody = this.table.querySelector("tbody");
|
||||
if (tbody) {
|
||||
const colCount = this.table.querySelectorAll("th").length;
|
||||
tbody.innerHTML = `
|
||||
<tr>
|
||||
<td colspan="${colCount}" class="text-center">${this.options.noDataText}</td>
|
||||
</tr>
|
||||
`;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Render table data
|
||||
* @param {Array} data - Array of data objects
|
||||
* @param {Function} rowRenderer - Function to render each row
|
||||
*/
|
||||
renderData(data, rowRenderer) {
|
||||
const tbody = this.table.querySelector("tbody");
|
||||
if (!tbody) return;
|
||||
|
||||
if (!data || data.length === 0) {
|
||||
this.showNoData();
|
||||
return;
|
||||
}
|
||||
|
||||
tbody.innerHTML = data.map(rowRenderer).join("");
|
||||
}
|
||||
|
||||
/**
|
||||
* Build query parameters for API requests
|
||||
* @returns {object} Query parameters object
|
||||
*/
|
||||
buildQueryParams() {
|
||||
const params = {
|
||||
page: this.currentPage,
|
||||
per_page: this.itemsPerPage,
|
||||
...this.filters,
|
||||
};
|
||||
|
||||
if (this.sortColumn) {
|
||||
params.sort_by = this.sortColumn;
|
||||
params.sort_dir = this.sortDirection;
|
||||
}
|
||||
|
||||
return params;
|
||||
}
|
||||
|
||||
/**
|
||||
* Refresh table data (to be implemented by subclasses or passed as callback)
|
||||
*/
|
||||
refreshData() {
|
||||
if (this.options.onRefresh) {
|
||||
this.options.onRefresh(this.buildQueryParams());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update pagination controls
|
||||
* @param {object} paginationInfo - Pagination information
|
||||
*/
|
||||
updatePagination(paginationInfo) {
|
||||
const paginationContainer = document.querySelector(".pagination-container");
|
||||
if (!paginationContainer || !paginationInfo) return;
|
||||
|
||||
// This is a basic implementation - you might want to enhance this
|
||||
const { current_page, total_pages, has_prev, has_next } = paginationInfo;
|
||||
|
||||
let paginationHTML = '<nav><ul class="pagination justify-content-center">';
|
||||
|
||||
// Previous button
|
||||
if (has_prev) {
|
||||
paginationHTML += `<li class="page-item"><a class="page-link" href="#" data-page="${
|
||||
current_page - 1
|
||||
}">Previous</a></li>`;
|
||||
} else {
|
||||
paginationHTML +=
|
||||
'<li class="page-item disabled"><span class="page-link">Previous</span></li>';
|
||||
}
|
||||
|
||||
// Page numbers (simplified - show current and adjacent pages)
|
||||
const startPage = Math.max(1, current_page - 2);
|
||||
const endPage = Math.min(total_pages, current_page + 2);
|
||||
|
||||
for (let i = startPage; i <= endPage; i++) {
|
||||
if (i === current_page) {
|
||||
paginationHTML += `<li class="page-item active"><span class="page-link">${i}</span></li>`;
|
||||
} else {
|
||||
paginationHTML += `<li class="page-item"><a class="page-link" href="#" data-page="${i}">${i}</a></li>`;
|
||||
}
|
||||
}
|
||||
|
||||
// Next button
|
||||
if (has_next) {
|
||||
paginationHTML += `<li class="page-item"><a class="page-link" href="#" data-page="${
|
||||
current_page + 1
|
||||
}">Next</a></li>`;
|
||||
} else {
|
||||
paginationHTML +=
|
||||
'<li class="page-item disabled"><span class="page-link">Next</span></li>';
|
||||
}
|
||||
|
||||
paginationHTML += "</ul></nav>";
|
||||
paginationContainer.innerHTML = paginationHTML;
|
||||
|
||||
// Add click handlers for pagination links
|
||||
paginationContainer.querySelectorAll("a[data-page]").forEach((link) => {
|
||||
link.addEventListener("click", (e) => {
|
||||
e.preventDefault();
|
||||
this.currentPage = parseInt(e.target.dataset.page, 10);
|
||||
this.refreshData();
|
||||
});
|
||||
});
|
||||
}
|
||||
}
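
A brief usage sketch for the base class above. It assumes the markup conventions the handler looks for (th[data-sortable] headers and inputs with data-table-filter); the table id, the /api/items endpoint and the response keys are hypothetical and exist only to show how onRefresh, renderData and updatePagination fit together.

// Usage sketch with a hypothetical endpoint; the handler builds query params,
// the callback does the fetching and rendering.
const itemsTable = new TableHandler("itemsTable", {
  itemsPerPage: 50,
  onRefresh: async (params) => {
    const qs = new URLSearchParams(params).toString();
    const response = await fetch(`/api/items?${qs}`); // hypothetical endpoint
    const data = await response.json();
    itemsTable.renderData(data.items, (item) => `<tr><td>${item.name}</td></tr>`);
    if (data.pagination) itemsTable.updatePagination(data.pagination);
  },
});
itemsTable.refreshData(); // initial load via the callback above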
|
||||
|
||||
/**
|
||||
* Specialized table handler for papers
|
||||
*/
|
||||
class PapersTableHandler extends TableHandler {
|
||||
constructor(tableId, options = {}) {
|
||||
super(tableId, {
|
||||
apiEndpoint: "/api/papers",
|
||||
...options,
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Render a paper row
|
||||
* @param {object} paper - Paper data object
|
||||
* @returns {string} HTML string for table row
|
||||
*/
|
||||
renderPaperRow(paper) {
|
||||
const statusBadge = createStatusBadge(paper.status);
|
||||
const truncatedTitle = truncateText(paper.title, 70);
|
||||
|
||||
return `
|
||||
<tr>
|
||||
<td>
|
||||
<a href="#" class="paper-link" data-url="/papers/${
|
||||
paper.id
|
||||
}/detail">
|
||||
${truncatedTitle}
|
||||
</a>
|
||||
</td>
|
||||
        <td>
          ${paper.doi
            ? `<a href="https://doi.org/${paper.doi}" target="_blank">${paper.doi}</a>`
            : "N/A"}
        </td>
|
||||
<td>${paper.journal || "N/A"}</td>
|
||||
<td>${paper.issn || "N/A"}</td>
|
||||
<td>${statusBadge}</td>
|
||||
<td>${formatTimestamp(paper.created_at)}</td>
|
||||
<td>${formatTimestamp(paper.updated_at)}</td>
|
||||
</tr>
|
||||
`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Load and display papers data
|
||||
* @param {object} params - Query parameters
|
||||
*/
|
||||
async loadPapers(params = {}) {
|
||||
this.showLoading();
|
||||
|
||||
try {
|
||||
const queryString = new URLSearchParams(params).toString();
|
||||
const url = `${this.options.apiEndpoint}?${queryString}`;
|
||||
|
||||
const response = await fetch(url);
|
||||
const data = await response.json();
|
||||
|
||||
if (data.papers) {
|
||||
this.renderData(data.papers, (paper) => this.renderPaperRow(paper));
|
||||
|
||||
if (data.pagination) {
|
||||
this.updatePagination(data.pagination);
|
||||
}
|
||||
} else {
|
||||
this.showNoData();
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error loading papers:", error);
|
||||
this.showNoData();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Refresh data implementation
|
||||
*/
|
||||
refreshData() {
|
||||
this.loadPapers(this.buildQueryParams());
|
||||
}
|
||||
}
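
And the specialized subclass in use. /api/papers is the default endpoint set in the constructor above, and the papers/pagination keys match what loadPapers() reads; the table id "papersTable" is illustrative, and the helpers createStatusBadge, truncateText and formatTimestamp are assumed to come from the globally included common.js.

// Papers table against the default /api/papers endpoint.
const papersTable = new PapersTableHandler("papersTable", { itemsPerPage: 20 });
papersTable.refreshData(); // calls loadPapers(buildQueryParams()) internally

// Programmatic filtering, equivalent to typing into an input that carries
// data-table-filter="status":
papersTable.setFilter("status", "Failed");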
|
@ -7,3 +7,34 @@
|
||||
.progress-bar {
|
||||
width: 0%;
|
||||
}
|
||||
|
||||
/* JSON formatting styles */
|
||||
.json-formatted {
|
||||
background-color: #f8f9fa;
|
||||
border: 1px solid #dee2e6;
|
||||
border-radius: 0.375rem;
|
||||
font-family: "Monaco", "Menlo", "Ubuntu Mono", monospace;
|
||||
font-size: 0.875rem;
|
||||
line-height: 1.4;
|
||||
}
|
||||
|
||||
.json-formatted code {
|
||||
color: #212529;
|
||||
background-color: transparent;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
/* Improve readability of JSON in modals */
|
||||
#extra-data-content {
|
||||
white-space: pre-wrap;
|
||||
word-break: break-word;
|
||||
font-family: "Monaco", "Menlo", "Ubuntu Mono", monospace;
|
||||
font-size: 0.875rem;
|
||||
line-height: 1.4;
|
||||
}
|
||||
|
||||
/* Style for old/new value code blocks */
|
||||
pre code {
|
||||
white-space: pre-wrap;
|
||||
word-break: break-word;
|
||||
}
|
||||
|
@ -1,4 +1,8 @@
|
||||
{% extends "base.html.jinja" %} {% block content %}
|
||||
{% extends "base.html.jinja" %}
|
||||
|
||||
{% block title %}About{% endblock title %}
|
||||
|
||||
{% block content %}
|
||||
<h1 class="mb-4">📘 About This App</h1>
|
||||
|
||||
<p class="lead">
|
||||
|
@ -7,6 +7,7 @@
|
||||
<meta name="keywords" content="science, papers, research, management" />
|
||||
<title>{{ app_title }}</title>
|
||||
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" />
|
||||
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.11.0/font/bootstrap-icons.css">
|
||||
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
|
||||
<!-- Optional Alpine.js -->
|
||||
<script defer src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"></script>
|
||||
@ -17,6 +18,8 @@
|
||||
<main class="container my-5">{% block content %}{% endblock content %}</main>
|
||||
{% include "footer.html.jinja" %}
|
||||
|
||||
<!-- Include common utilities globally -->
|
||||
<script src="{{ url_for('static', filename='js/common.js') }}"></script>
|
||||
{% block scripts %}{% endblock scripts %}
|
||||
</body>
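
common.js itself is not part of this excerpt, but the scripts elsewhere in this diff call createStatusBadge, truncateText, formatTimestamp and showFlashMessage from it. A minimal sketch of what those helpers could look like, with signatures inferred from their call sites; the real implementations may differ.

// Assumed helpers (sketch only; inferred from how they are called elsewhere in this diff).
function truncateText(text, maxLength = 70) {
  if (!text) return "N/A";
  return text.length <= maxLength ? text : text.substring(0, maxLength).trim() + "...";
}

function formatTimestamp(value) {
  return value ? new Date(value).toLocaleString() : "N/A";
}

function createStatusBadge(status) {
  const classes = { Done: "bg-success", Failed: "bg-danger", Pending: "bg-warning" };
  return `<span class="badge ${classes[status] || "bg-secondary"}">${status || "Unknown"}</span>`;
}

function showFlashMessage(message, type = "info") {
  const container = document.getElementById("clientFlashContainer"); // id from the flash partial
  if (!container) return;
  const cls = type === "error" ? "danger" : type; // map "error" to Bootstrap's alert-danger
  container.innerHTML = `
    <div class="alert alert-${cls} alert-dismissible fade show" role="alert">
      ${message}
      <button type="button" class="btn-close" data-bs-dismiss="alert" aria-label="Close"></button>
    </div>`;
}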
|
||||
|
||||
|
@ -38,6 +38,43 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="form-section">
|
||||
<h6>Scheduler Timezone</h6>
|
||||
<p class="text-muted">Configure the timezone for the APScheduler to use for job
|
||||
scheduling.</p>
|
||||
<div class="mb-3">
|
||||
<label for="timezone" class="form-label">Timezone:</label>
|
||||
<select class="form-control" id="timezone" name="timezone" required>
|
||||
<option value="UTC" {% if timezone_config.timezone=='UTC' %}selected{% endif %}>
|
||||
UTC</option>
|
||||
<option value="Europe/Berlin" {% if timezone_config.timezone=='Europe/Berlin'
|
||||
%}selected{% endif %}>Europe/Berlin (CET/CEST)</option>
|
||||
<option value="Europe/London" {% if timezone_config.timezone=='Europe/London'
|
||||
%}selected{% endif %}>Europe/London (GMT/BST)</option>
|
||||
<option value="Europe/Paris" {% if timezone_config.timezone=='Europe/Paris'
|
||||
%}selected{% endif %}>Europe/Paris (CET/CEST)</option>
|
||||
<option value="Europe/Rome" {% if timezone_config.timezone=='Europe/Rome'
|
||||
%}selected{% endif %}>Europe/Rome (CET/CEST)</option>
|
||||
<option value="US/Eastern" {% if timezone_config.timezone=='US/Eastern'
|
||||
%}selected{% endif %}>US/Eastern (EST/EDT)</option>
|
||||
<option value="US/Central" {% if timezone_config.timezone=='US/Central'
|
||||
%}selected{% endif %}>US/Central (CST/CDT)</option>
|
||||
<option value="US/Mountain" {% if timezone_config.timezone=='US/Mountain'
|
||||
%}selected{% endif %}>US/Mountain (MST/MDT)</option>
|
||||
<option value="US/Pacific" {% if timezone_config.timezone=='US/Pacific'
|
||||
%}selected{% endif %}>US/Pacific (PST/PDT)</option>
|
||||
<option value="Asia/Tokyo" {% if timezone_config.timezone=='Asia/Tokyo'
|
||||
%}selected{% endif %}>Asia/Tokyo (JST)</option>
|
||||
<option value="Asia/Shanghai" {% if timezone_config.timezone=='Asia/Shanghai'
|
||||
%}selected{% endif %}>Asia/Shanghai (CST)</option>
|
||||
<option value="Australia/Sydney" {% if
|
||||
timezone_config.timezone=='Australia/Sydney' %}selected{% endif %}>
|
||||
Australia/Sydney (AEST/AEDT)</option>
|
||||
</select>
|
||||
<div class="form-text">Current: {{ timezone_config.timezone }}</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="form-section">
|
||||
<h6>System Settings</h6>
|
||||
<p class="text-muted">Configure general system behavior.</p>
|
||||
@ -65,15 +102,21 @@
|
||||
<div class="col-md-6">
|
||||
<form method="post" action="{{ url_for('config.update_scraper_module') }}">
|
||||
<div class="form-section">
|
||||
<div class="d-flex justify-content-between align-items-center mb-2">
|
||||
<h6>Scraper Module</h6>
|
||||
<button type="button" class="btn btn-outline-info btn-sm"
|
||||
onclick="showScraperOverview()" title="View scraper modules overview">
|
||||
<i class="fas fa-info-circle"></i> How Scrapers Work
|
||||
</button>
|
||||
</div>
|
||||
<p class="text-muted">Select which scraper module to use for processing papers.</p>
|
||||
|
||||
<div class="mb-3">
|
||||
<label for="scraper_module" class="form-label">Active Scraper Module:</label>
|
||||
<select class="form-control" id="scraper_module" name="scraper_module">
|
||||
{% for module in available_scraper_modules %}
|
||||
<option value="{{ module }}" {% if module==current_scraper_module %} selected
|
||||
{%endif %}>
|
||||
<option value="{{ module }}" {% if module==current_scraper_module %} selected {%
|
||||
endif %}>
|
||||
{{ module }}
|
||||
{% if scraper_details[module] %}
|
||||
- {{ scraper_details[module].description[:50] }}...
|
||||
|
@ -53,4 +53,13 @@
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Include the scraper overview modal -->
|
||||
{% include "partials/scraper_overview_modal.html.jinja" %}
|
||||
|
||||
{% endblock content %}
|
||||
|
||||
{% block scripts %}
|
||||
{{ super() }}
|
||||
<script src="{{ url_for('static', filename='js/scraper-overview.js') }}"></script>
|
||||
{% endblock scripts %}
|
@ -39,12 +39,19 @@
|
||||
}
|
||||
</style>
|
||||
|
||||
<script>
|
||||
const initialSchedule = {{ schedule | tojson }};
|
||||
const totalVolume = {{ volume }};
|
||||
<!-- Configuration data in JSON format for clean separation -->
|
||||
<script type="application/json" id="schedule-config">
|
||||
{
|
||||
"initialSchedule": {{ schedule | tojson }},
|
||||
"totalVolume": {{ volume | tojson }},
|
||||
"maxVolume": {{ max_volume | tojson }}
|
||||
}
|
||||
</script>
|
||||
|
||||
<div x-data="scheduleManager(initialSchedule, totalVolume)" class="tab-pane active">
|
||||
<!-- Load config handler for modular functionality -->
|
||||
<script src="{{ url_for('static', filename='js/config-handler.js') }}"></script>
|
||||
|
||||
<div x-data="configHandler.createScheduleManager()" class="tab-pane active">
|
||||
<div class="card">
|
||||
<div class="card-header d-flex justify-content-between">
|
||||
<h5>Scheduling Configuration</h5>
|
||||
@ -211,164 +218,3 @@
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
function scheduleManager(initial, volume) {
|
||||
return {
|
||||
schedule: { ...initial },
|
||||
volume: volume,
|
||||
selectedHours: [],
|
||||
newWeight: 1.0,
|
||||
volumeValue: volume,
|
||||
isDragging: false,
|
||||
dragOperation: null,
|
||||
|
||||
formatHour(h) {
|
||||
return String(h).padStart(2, "0") + ":00";
|
||||
},
|
||||
|
||||
updateVolume() {
|
||||
fetch('{{ url_for('config.api_update_config') }}', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
volume: this.volumeValue
|
||||
})
|
||||
})
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
if (data.success) {
|
||||
this.volume = parseFloat(this.volumeValue);
|
||||
showFlashMessage('Volume updated successfully!', 'success');
|
||||
} else {
|
||||
showFlashMessage(data.updates?.[0]?.message || 'Error updating volume', 'error');
|
||||
}
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Error:', error);
|
||||
showFlashMessage('Network error occurred', 'error');
|
||||
});
|
||||
},
|
||||
|
||||
getBackgroundStyle(hour) {
|
||||
const weight = parseFloat(this.schedule[hour]);
|
||||
const maxWeight = 2.5; // You can adjust this
|
||||
|
||||
// Normalize weight (0.0 to 1.0)
|
||||
const t = Math.min(weight / maxWeight, 1.0);
|
||||
|
||||
// Interpolate HSL lightness: 95% (light) to 30% (dark)
|
||||
const lightness = 95 - t * 65; // 95 → 30
|
||||
const backgroundColor = `hsl(210, 10%, ${lightness}%)`; // soft gray-blue palette
|
||||
|
||||
const textColor = t > 0.65 ? "white" : "black"; // adaptive text color
|
||||
|
||||
return {
|
||||
backgroundColor,
|
||||
color: textColor,
|
||||
};
|
||||
},
|
||||
|
||||
getBackgroundStyleFromValue(value) {
|
||||
const weight = parseFloat(value);
|
||||
const maxWeight = 2.5; // You can adjust this
|
||||
|
||||
// Normalize weight (0.0 to 1.0)
|
||||
const t = Math.min(weight / maxWeight, 1.0);
|
||||
|
||||
// Interpolate HSL lightness: 95% (light) to 30% (dark)
|
||||
const lightness = 95 - t * 65; // 95 → 30
|
||||
const backgroundColor = `hsl(210, 10%, ${lightness}%)`; // soft gray-blue palette
|
||||
|
||||
const textColor = t > 0.65 ? "white" : "black"; // adaptive text color
|
||||
|
||||
return {
|
||||
backgroundColor,
|
||||
color: textColor,
|
||||
};
|
||||
},
|
||||
|
||||
startDrag(event, hour) {
|
||||
event.preventDefault();
|
||||
this.isDragging = true;
|
||||
this.dragOperation = this.isSelected(hour) ? "remove" : "add";
|
||||
this.toggleSelect(hour);
|
||||
},
|
||||
|
||||
dragSelect(hour) {
|
||||
if (!this.isDragging) return;
|
||||
const selected = this.isSelected(hour);
|
||||
if (this.dragOperation === "add" && !selected) {
|
||||
this.selectedHours.push(hour);
|
||||
} else if (this.dragOperation === "remove" && selected) {
|
||||
this.selectedHours = this.selectedHours.filter((h) => h !== hour);
|
||||
}
|
||||
},
|
||||
|
||||
endDrag() {
|
||||
this.isDragging = false;
|
||||
},
|
||||
|
||||
toggleSelect(hour) {
|
||||
if (this.isSelected(hour)) {
|
||||
this.selectedHours = this.selectedHours.filter((h) => h !== hour);
|
||||
} else {
|
||||
this.selectedHours.push(hour);
|
||||
}
|
||||
},
|
||||
|
||||
isSelected(hour) {
|
||||
return this.selectedHours.includes(hour);
|
||||
},
|
||||
|
||||
applyWeight() {
|
||||
this.selectedHours.forEach((hour) => {
|
||||
this.schedule[hour] = parseFloat(this.newWeight).toFixed(1);
|
||||
});
|
||||
this.selectedHours = [];
|
||||
},
|
||||
|
||||
getTotalWeight() {
|
||||
return Object.values(this.schedule).reduce(
|
||||
(sum, w) => sum + parseFloat(w),
|
||||
0
|
||||
);
|
||||
},
|
||||
|
||||
getPapersPerHour(hour) {
|
||||
const total = this.getTotalWeight();
|
||||
if (total === 0) return 0;
|
||||
return (
|
||||
(parseFloat(this.schedule[hour]) / total) *
|
||||
this.volume
|
||||
).toFixed(1);
|
||||
},
|
||||
|
||||
saveSchedule() {
|
||||
fetch('{{ url_for('config.api_update_config') }}', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
schedule: this.schedule
|
||||
})
|
||||
})
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
if (data.success) {
|
||||
showFlashMessage('Schedule updated successfully!', 'success');
|
||||
} else {
|
||||
showFlashMessage(data.updates?.[0]?.message || 'Error updating schedule', 'error');
|
||||
}
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Error:', error);
|
||||
showFlashMessage('Network error occurred', 'error');
|
||||
});
|
||||
}
|
||||
};
|
||||
}
|
||||
</script>
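
The inline Alpine component removed above is replaced by the external config-handler.js together with the application/json block with id schedule-config. That file is not shown in this excerpt; the sketch below is one plausible shape for it, assuming it parses the JSON block and returns the same component object for x-data="configHandler.createScheduleManager()". The update endpoint can no longer be injected via url_for(), so passing it through the JSON config (or a data attribute) is an assumption.

// Sketch of config-handler.js (assumption; the real file may differ).
window.configHandler = {
  readConfig() {
    const el = document.getElementById("schedule-config");
    return el
      ? JSON.parse(el.textContent)
      : { initialSchedule: {}, totalVolume: 0, maxVolume: 0 };
  },

  createScheduleManager() {
    const { initialSchedule, totalVolume } = this.readConfig();
    return {
      schedule: { ...initialSchedule },
      volume: totalVolume,
      volumeValue: totalVolume,
      selectedHours: [],
      newWeight: 1.0,
      isDragging: false,
      dragOperation: null,
      // ...the same formatHour/toggleSelect/applyWeight/saveSchedule logic as the
      // removed inline scheduleManager(), with the API URL supplied via the JSON
      // config instead of a Jinja url_for() call.
    };
  },
};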
|
@ -1,66 +1,146 @@
|
||||
{% extends "base.html.jinja" %}
|
||||
|
||||
<!-- Include flash messages template -->
|
||||
{% include "partials/flash_messages.html.jinja" %}
|
||||
|
||||
{% block title %}Home - SciPaperLoader{% endblock title %}
|
||||
|
||||
{% block content %}
|
||||
|
||||
<div class="container text-center">
|
||||
<div class="container text-center mb-5">
|
||||
<h1 class="display-4">Welcome to SciPaperLoader</h1>
|
||||
<p class="lead">Your paper scraping tool is ready.</p>
|
||||
<p class="text-muted">A simple tool to scrape papers from Zotero API.</p>
|
||||
<p class="lead">Your comprehensive paper management and scraping platform</p>
|
||||
<p class="text-muted">Automate paper collection, manage metadata, and monitor download progress with intelligent
|
||||
scheduling</p>
|
||||
</div>
|
||||
|
||||
<div class="row g-4">
|
||||
<div class="col-md-6">
|
||||
<div class="card shadow-sm">
|
||||
<!-- Main Features Section -->
|
||||
<div class="row g-4 mb-5">
|
||||
<div class="col-12">
|
||||
<h2 class="text-center mb-4">🚀 Core Features</h2>
|
||||
</div>
|
||||
|
||||
<div class="col-lg-4 col-md-6">
|
||||
<div class="card shadow-sm h-100">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">📄 CSV Import</h5>
|
||||
<h5 class="card-title">🎛️ Scraper Control Panel</h5>
|
||||
<p class="card-text">
|
||||
Upload a 37-column CSV to import paper metadata. Only relevant fields
|
||||
(title, DOI, ISSN, etc.) are stored. Errors are reported without
|
||||
aborting the batch.
|
||||
Start, pause, and monitor the automated paper scraping process. View real-time statistics,
|
||||
activity charts, and process individual papers on demand.
|
||||
</p>
|
||||
<a href="{{ url_for('upload.upload') }}" class="btn btn-sm btn-outline-primary">Upload Now</a>
|
||||
<a href="{{ url_for('scraper.index') }}" class="btn btn-primary">Open Control Panel</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col-md-6">
|
||||
<div class="card shadow-sm">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">🧠 Background Scraper</h5>
|
||||
<p class="card-text">
|
||||
A daemon process runs hourly to fetch papers using Zotero API.
|
||||
Downloads are randomized to mimic human behavior and avoid detection.
|
||||
</p>
|
||||
<a href="{{ url_for('logger.list_logs') }}" class="btn btn-sm btn-outline-secondary">View Logs</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col-md-6">
|
||||
<div class="card shadow-sm">
|
||||
<div class="col-lg-4 col-md-6">
|
||||
<div class="card shadow-sm h-100">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">📚 Paper Management</h5>
|
||||
<p class="card-text">
|
||||
Monitor paper status (Pending, Done, Failed), download PDFs, and
|
||||
inspect errors. Files are stored on disk in structured folders per
|
||||
DOI.
|
||||
Browse, search, and manage your paper collection. View download status,
|
||||
inspect metadata, export data, and handle failed downloads.
|
||||
</p>
|
||||
<a href="{{ url_for('papers.list_papers') }}" class="btn btn-sm btn-outline-success">Browse Papers</a>
|
||||
<a href="{{ url_for('papers.list_papers') }}" class="btn btn-success">Browse Papers</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col-md-6">
|
||||
<div class="card shadow-sm">
|
||||
<div class="col-lg-4 col-md-6">
|
||||
<div class="card shadow-sm h-100">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">🕒 Download Schedule</h5>
|
||||
<h5 class="card-title">📄 CSV Data Import</h5>
|
||||
<p class="card-text">
|
||||
Control how many papers are downloaded per hour. Configure hourly
|
||||
volume (e.g. 2/hour at daytime, 0 at night) to match your bandwidth or
|
||||
usage pattern.
|
||||
Bulk import paper metadata from CSV files. Supports the 37-column format with
intelligent duplicate detection and comprehensive error reporting.
|
||||
</p>
|
||||
<a href="{{ url_for('config.schedule') }}" class="btn btn-sm btn-outline-warning">Adjust Schedule</a>
|
||||
<a href="{{ url_for('upload.upload') }}" class="btn btn-outline-primary">Import Data</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Configuration & Monitoring Section -->
|
||||
<div class="row g-4 mb-5">
|
||||
<div class="col-12">
|
||||
<h2 class="text-center mb-4">⚙️ Configuration & Monitoring</h2>
|
||||
</div>
|
||||
|
||||
<div class="col-lg-4 col-md-6">
|
||||
<div class="card shadow-sm h-100">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">🕒 Download Scheduling</h5>
|
||||
<p class="card-text">
|
||||
Configure hourly download quotas and timing patterns. Set different rates for
|
||||
day/night hours to optimize bandwidth usage and avoid detection.
|
||||
</p>
|
||||
<a href="{{ url_for('config.schedule') }}" class="btn btn-warning">Manage Schedule</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col-lg-4 col-md-6">
|
||||
<div class="card shadow-sm h-100">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">🔧 System Configuration</h5>
|
||||
<p class="card-text">
|
||||
Adjust global settings including daily volume limits, download paths,
|
||||
and scraper module selection for optimal performance.
|
||||
</p>
|
||||
<a href="{{ url_for('config.general') }}" class="btn btn-outline-secondary">System Settings</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col-lg-4 col-md-6">
|
||||
<div class="card shadow-sm h-100">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">📊 Activity Logs</h5>
|
||||
<p class="card-text">
|
||||
Monitor system activity, track scraping progress, and troubleshoot issues
|
||||
with comprehensive logging and activity timeline views.
|
||||
</p>
|
||||
<a href="{{ url_for('logger.list_logs') }}" class="btn btn-info">View Logs</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Advanced Features Section -->
|
||||
<div class="row g-4 mb-5">
|
||||
<div class="col-12">
|
||||
<h2 class="text-center mb-4">🔬 Advanced Features</h2>
|
||||
</div>
|
||||
|
||||
<div class="col-lg-6 col-md-6">
|
||||
<div class="card shadow-sm h-100">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">🗄️ Database Management</h5>
|
||||
<p class="card-text">
|
||||
Manage your paper database with tools for generating test data,
|
||||
cleaning up records, and database maintenance operations.
|
||||
</p>
|
||||
<a href="{{ url_for('config.database') }}" class="btn btn-outline-danger">Database Tools</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col-lg-6 col-md-6">
|
||||
<div class="card shadow-sm h-100">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">🧠 Intelligent Processing</h5>
|
||||
<p class="card-text">
|
||||
Background daemon with randomized timing, human-like behavior patterns,
|
||||
and automatic retry mechanisms for robust paper collection.
|
||||
</p>
|
||||
<div class="mt-3">
|
||||
<span class="badge bg-success me-2">Auto-Retry</span>
|
||||
<span class="badge bg-info me-2">Smart Timing</span>
|
||||
<span class="badge bg-warning">Rate Limiting</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% endblock content %}
|
@ -1,117 +0,0 @@
|
||||
{% extends "base.html.jinja" %}
|
||||
{% block content %}
|
||||
<h1>Activity Logs</h1>
|
||||
|
||||
<form method="get" class="mb-3">
|
||||
<div class="row g-2">
|
||||
<div class="col-md-3">
|
||||
<label for="category" class="form-label">Category:</label>
|
||||
<select name="category" id="category" class="form-select">
|
||||
<option value="">All</option>
|
||||
{% for cat in categories %}
|
||||
<option value="{{ cat }}" {% if category==cat %}selected{% endif %}>{{ cat }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</div>
|
||||
|
||||
<div class="col-md-3">
|
||||
<label for="start_date" class="form-label">Start Date:</label>
|
||||
<input type="date" name="start_date" id="start_date" value="{{ start_date }}" class="form-control">
|
||||
</div>
|
||||
|
||||
<div class="col-md-3">
|
||||
<label for="end_date" class="form-label">End Date:</label>
|
||||
<input type="date" name="end_date" id="end_date" value="{{ end_date }}" class="form-control">
|
||||
</div>
|
||||
|
||||
<div class="col-md-3">
|
||||
<label for="search_term" class="form-label">Search:</label>
|
||||
<input type="text" name="search_term" id="search_term" value="{{ search_term }}" class="form-control">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="mt-3">
|
||||
<button type="submit" class="btn btn-primary">Filter</button>
|
||||
<a href="{{ url_for('logger.download_logs', category=category, start_date=start_date, end_date=end_date, search_term=search_term) }}"
|
||||
class="btn btn-secondary">Download CSV</a>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
<ul class="list-group">
|
||||
{% for log in logs %}
|
||||
<li class="list-group-item log-item" data-log-id="{{ log.id }}">
|
||||
<div class="d-flex justify-content-between align-items-center">
|
||||
<div class="ms-2 me-auto">
|
||||
<div class="fw-bold">{{ log.timestamp }}</div>
|
||||
{{ log.action }} - {{ log.description }}
|
||||
</div>
|
||||
<span class="badge bg-primary rounded-pill">{{ log.category }}</span>
|
||||
</div>
|
||||
</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
|
||||
{% if pagination %}
|
||||
<nav aria-label="Page navigation" class="mt-4">
|
||||
<ul class="pagination justify-content-center">
|
||||
{% if pagination.has_prev %}
|
||||
<li class="page-item">
|
||||
<a class="page-link"
|
||||
href="{{ url_for('logger.list_logs', page=pagination.prev_num, category=category, start_date=start_date, end_date=end_date, search_term=search_term) }}">Previous</a>
|
||||
</li>
|
||||
{% else %}
|
||||
<li class="page-item disabled">
|
||||
<span class="page-link">Previous</span>
|
||||
</li>
|
||||
{% endif %}
|
||||
|
||||
<li class="page-item disabled">
|
||||
<span class="page-link">Page {{ pagination.page }} of {{ pagination.pages }}</span>
|
||||
</li>
|
||||
|
||||
{% if pagination.has_next %}
|
||||
<li class="page-item">
|
||||
<a class="page-link"
|
||||
href="{{ url_for('logger.list_logs', page=pagination.next_num, category=category, start_date=start_date, end_date=end_date, search_term=search_term) }}">Next</a>
|
||||
</li>
|
||||
{% else %}
|
||||
<li class="page-item disabled">
|
||||
<span class="page-link">Next</span>
|
||||
</li>
|
||||
{% endif %}
|
||||
</ul>
|
||||
</nav>
|
||||
{% endif %}
|
||||
|
||||
<!-- Modal for log details -->
|
||||
<div class="modal fade" id="logDetailModal" tabindex="-1" aria-hidden="true">
|
||||
<div class="modal-dialog modal-lg modal-dialog-scrollable">
|
||||
<div class="modal-content" id="log-detail-content">
|
||||
<!-- Log details will be loaded here via AJAX -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
document.addEventListener("DOMContentLoaded", function () {
|
||||
const modal = new bootstrap.Modal(document.getElementById('logDetailModal'));
|
||||
const content = document.getElementById('log-detail-content');
|
||||
|
||||
document.querySelectorAll('.log-item').forEach(item => {
|
||||
item.addEventListener('click', function () {
|
||||
const logId = this.getAttribute('data-log-id');
|
||||
fetch(`/logs/${logId}/detail`)
|
||||
.then(response => response.text())
|
||||
.then(html => {
|
||||
content.innerHTML = html;
|
||||
modal.show();
|
||||
})
|
||||
.catch(err => {
|
||||
content.innerHTML = '<div class="modal-body text-danger">Error loading log details.</div>';
|
||||
modal.show();
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
</script>
|
||||
{% endblock content %}
|
263 scipaperloader/templates/logs.html.jinja Normal file
@ -0,0 +1,263 @@
|
||||
{% extends "base.html.jinja" %}
|
||||
|
||||
{% block title %}Activity Logs{% endblock title %}
|
||||
|
||||
{% block styles %}
|
||||
{{ super() }}
|
||||
<style>
|
||||
.logs-container {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
|
||||
.filter-panel {
|
||||
background: #f8f9fa;
|
||||
border-bottom: 1px solid #dee2e6;
|
||||
padding: 1rem;
|
||||
}
|
||||
|
||||
.log-entry {
|
||||
cursor: pointer;
|
||||
transition: background-color 0.2s ease;
|
||||
}
|
||||
|
||||
.log-entry:hover {
|
||||
background-color: #f8f9fa;
|
||||
}
|
||||
|
||||
.category-badge {
|
||||
font-size: 0.75rem;
|
||||
padding: 0.25rem 0.5rem;
|
||||
}
|
||||
|
||||
.activity-controls {
|
||||
width: auto;
|
||||
display: inline-block;
|
||||
}
|
||||
|
||||
.logs-table th {
|
||||
background-color: #f8f9fa;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.log-entry {
|
||||
cursor: pointer;
|
||||
transition: background-color 0.2s ease;
|
||||
}
|
||||
|
||||
.log-entry:hover {
|
||||
background-color: #f8f9fa;
|
||||
}
|
||||
|
||||
.pagination-info {
|
||||
font-size: 0.875rem;
|
||||
color: #6c757d;
|
||||
}
|
||||
|
||||
.search-results-container {
|
||||
max-height: 600px;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
/* JSON formatting styles */
|
||||
.json-formatted {
|
||||
background-color: #f8f9fa;
|
||||
border: 1px solid #e9ecef;
|
||||
border-radius: 0.375rem;
|
||||
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
||||
font-size: 0.875rem;
|
||||
line-height: 1.4;
|
||||
}
|
||||
|
||||
.json-formatted code {
|
||||
color: #495057;
|
||||
background: transparent;
|
||||
}
|
||||
</style>
|
||||
{% endblock styles %}
|
||||
|
||||
{% block content %}
|
||||
<div class="container-fluid mt-4">
|
||||
<h1><i class="bi bi-list-ul"></i> Activity Logs</h1>
|
||||
|
||||
<!-- Include standardized flash messages -->
|
||||
{% include "partials/flash_messages.html.jinja" %}
|
||||
|
||||
<div class="logs-container">
|
||||
<!-- Filter Panel -->
|
||||
<div class="filter-panel">
|
||||
<form id="filterForm" class="row g-3">
|
||||
<div class="col-md-3">
|
||||
<label class="form-label">Categories:</label>
|
||||
<div class="category-checkbox-container p-2"
|
||||
style="max-height: 200px; overflow-y: auto; background-color: white; border: 1px solid #ced4da; border-radius: 0.375rem;">
|
||||
<div class="form-check">
|
||||
<input class="form-check-input" type="checkbox" id="selectAllCategories" {% if not
|
||||
selected_categories or selected_categories|length==categories|length %}checked{% endif
|
||||
%}>
|
||||
<label class="form-check-label fw-bold" for="selectAllCategories">
|
||||
All Categories
|
||||
</label>
|
||||
</div>
|
||||
<hr class="my-2">
|
||||
{% for cat in categories %}
|
||||
<div class="form-check">
|
||||
<input class="form-check-input category-checkbox" type="checkbox" id="category_{{ cat }}"
|
||||
value="{{ cat }}" {% if not selected_categories or cat in selected_categories
|
||||
%}checked{% endif %}>
|
||||
<label class="form-check-label" for="category_{{ cat }}">
|
||||
{{ cat.replace('_', ' ').title() }}
|
||||
</label>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col-md-3">
|
||||
<div class="row">
|
||||
<label for="statusFilter" class="form-label">Status:</label>
|
||||
<select id="statusFilter" class="form-select form-select-sm">
|
||||
<option value="">All Statuses</option>
|
||||
<option value="success">Success</option>
|
||||
<option value="error">Error</option>
|
||||
<option value="warning">Warning</option>
|
||||
<option value="info">Info</option>
|
||||
<option value="pending">Pending</option>
|
||||
</select>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="col-md-3">
|
||||
<label for="startDate" class="form-label">Start Date:</label>
|
||||
<input type="date" id="startDate" class="form-control form-control-sm"
|
||||
value="{{ start_date or '' }}">
|
||||
|
||||
<label for="endDate" class="form-label mt-2">End Date:</label>
|
||||
<input type="date" id="endDate" class="form-control form-control-sm" value="{{ end_date or '' }}">
|
||||
</div>
|
||||
|
||||
<div class="col-md-3">
|
||||
<label for="searchTerm" class="form-label">Search:</label>
|
||||
<input type="text" id="searchTerm" class="form-control form-control-sm"
|
||||
placeholder="Search in actions and descriptions" value="{{ search_term or '' }}">
|
||||
</div>
|
||||
|
||||
<div class="col-12 d-flex justify-content-end mt-3">
|
||||
<button type="button" id="clearFilters" class="btn btn-outline-secondary btn-sm">
|
||||
<i class="bi bi-x"></i> Clear Filters
|
||||
</button>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
|
||||
<!-- Controls Panel -->
|
||||
<div class="d-flex justify-content-between align-items-center p-3 border-bottom">
|
||||
<div class="d-flex align-items-center gap-3">
|
||||
<div class="form-group mb-0">
|
||||
<label for="pageSize" class="form-label mb-0 me-2">Show:</label>
|
||||
<select id="pageSize" class="form-select form-select-sm activity-controls">
|
||||
<option value="20">20</option>
|
||||
<option value="50" selected>50</option>
|
||||
<option value="100">100</option>
|
||||
</select>
|
||||
</div>
|
||||
<span id="paginationInfo" class="pagination-info">Loading...</span>
|
||||
</div>
|
||||
|
||||
<div class="d-flex gap-2">
|
||||
<button type="button" id="refreshLogs" class="btn btn-outline-primary btn-sm">
|
||||
<i class="bi bi-arrow-clockwise"></i> Refresh
|
||||
</button>
|
||||
<button type="button" id="downloadLogs" class="btn btn-outline-success btn-sm">
|
||||
<i class="bi bi-download"></i> Download CSV
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Logs Table -->
|
||||
<div class="search-results-container">
|
||||
<table class="table table-hover logs-table mb-0">
|
||||
<thead class="sticky-top">
|
||||
<tr>
|
||||
<th style="width: 150px;">Timestamp</th>
|
||||
<th style="width: 120px;">Category</th>
|
||||
<th style="width: 180px;">Action</th>
|
||||
<th style="width: 100px;">Status</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="logsTableBody">
|
||||
<tr>
|
||||
<td colspan="5" class="text-center py-4">
|
||||
<div class="spinner-border spinner-border-sm text-primary" role="status">
|
||||
<span class="visually-hidden">Loading...</span>
|
||||
</div>
|
||||
Loading logs...
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<!-- Pagination Controls -->
|
||||
<nav id="logsPagination" aria-label="Logs pagination" class="p-3 border-top d-none">
|
||||
<div class="d-flex justify-content-between align-items-center">
|
||||
<div class="pagination-info">
|
||||
<span id="paginationDetails">Showing 0 - 0 of 0 entries</span>
|
||||
</div>
|
||||
<ul class="pagination pagination-sm mb-0">
|
||||
<li class="page-item" id="prevPage">
|
||||
<a class="page-link" href="#" aria-label="Previous">
|
||||
<span aria-hidden="true">«</span>
|
||||
</a>
|
||||
</li>
|
||||
<li class="page-item active" id="currentPageItem">
|
||||
<span class="page-link" id="currentPageSpan">1</span>
|
||||
</li>
|
||||
<li class="page-item" id="nextPage">
|
||||
<a class="page-link" href="#" aria-label="Next">
|
||||
<span aria-hidden="true">»</span>
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Modal for log details -->
|
||||
<div class="modal fade" id="logDetailModal" tabindex="-1" aria-hidden="true" data-bs-backdrop="true"
|
||||
data-bs-keyboard="true">
|
||||
<div class="modal-dialog modal-lg modal-dialog-scrollable">
|
||||
<div class="modal-content" id="log-detail-content">
|
||||
<!-- Log details will be loaded here via AJAX -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endblock content %}
|
||||
|
||||
{% block scripts %}
|
||||
{{ super() }}
|
||||
<script src="{{ url_for('static', filename='js/modal-handler.js') }}"></script>
|
||||
<script src="{{ url_for('static', filename='js/logger-manager.js') }}"></script>
|
||||
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
// Initialize the logger manager
|
||||
window.loggerManager = new LoggerManager({
|
||||
initialFilters: {
|
||||
category: {{ selected_categories | tojson }},
|
||||
start_date: "{{ start_date or '' }}",
|
||||
end_date: "{{ end_date or '' }}",
|
||||
search_term: "{{ search_term or '' }}"
|
||||
}
|
||||
});
|
||||
|
||||
// Set up modal handler for log details
|
||||
const logModal = new ModalHandler('logDetailModal', 'log-detail-content');
|
||||
window.loggerManager.setModalHandler(logModal);
|
||||
});
|
||||
</script>
|
||||
{% endblock scripts %}
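
logger-manager.js is likewise not included in this excerpt. Grounded only in the element IDs defined in the template above, a sketch of how such a manager might gather the filter panel state; the endpoint it would fetch is left open because it is not visible here.

// Sketch only: collect filter values from the IDs used in this template.
function collectLogFilters() {
  const categories = Array.from(
    document.querySelectorAll(".category-checkbox:checked")
  ).map((cb) => cb.value);

  return {
    category: categories,
    status: document.getElementById("statusFilter").value,
    start_date: document.getElementById("startDate").value,
    end_date: document.getElementById("endDate").value,
    search_term: document.getElementById("searchTerm").value,
    per_page: document.getElementById("pageSize").value,
  };
}
// A LoggerManager would presumably serialize this object into a query string
// (repeating the category key) and re-render #logsTableBody with the response,
// updating #paginationInfo and the pagination controls as it goes.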
|
@ -8,7 +8,7 @@
|
||||
<div class="collapse navbar-collapse" id="navbarSupportedContent">
|
||||
<ul class="navbar-nav me-auto mb-2 mb-lg-0">
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="{{ url_for('scraper.index') }}">Scraper</a>
|
||||
<a class="nav-link" href="{{ url_for('scraper.index') }}">Control Panel</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="{{ url_for('upload.upload') }}">Import CSV</a>
|
||||
|
@ -1,7 +1,12 @@
|
||||
{% extends "base.html.jinja" %}
|
||||
|
||||
{% block title %}Papers{% endblock title %}
|
||||
|
||||
{% block content %}
|
||||
|
||||
<!-- Include flash messages template -->
|
||||
{% include "partials/flash_messages.html.jinja" %}
|
||||
|
||||
{# --- Sort direction logic for each column --- #}
|
||||
{% set title_sort = 'asc' if sort_by != 'title' or sort_dir == 'desc' else 'desc' %}
|
||||
{% set journal_sort = 'asc' if sort_by != 'journal' or sort_dir == 'desc' else 'desc' %}
|
||||
@ -275,28 +280,14 @@
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
<script>
|
||||
document.addEventListener("DOMContentLoaded", function () {
|
||||
const modal = new bootstrap.Modal(document.getElementById('paperDetailModal'));
|
||||
const content = document.getElementById('paper-detail-content');
|
||||
|
||||
document.querySelectorAll('.paper-link').forEach(link => {
|
||||
link.addEventListener('click', function (e) {
|
||||
e.preventDefault();
|
||||
const url = this.getAttribute('data-url');
|
||||
|
||||
fetch(url)
|
||||
.then(response => response.text())
|
||||
.then(html => {
|
||||
content.innerHTML = html;
|
||||
modal.show();
|
||||
})
|
||||
.catch(err => {
|
||||
content.innerHTML = '<div class="modal-body text-danger">Error loading details.</div>';
|
||||
modal.show();
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
</script>
|
||||
{% endblock content %}
|
||||
|
||||
{% block scripts %}
|
||||
{{ super() }}
|
||||
<script src="{{ url_for('static', filename='js/modal-handler.js') }}"></script>
|
||||
<script>
|
||||
// Use the reusable ModalHandler for paper details
|
||||
const paperModal = new ModalHandler('paperDetailModal', 'paper-detail-content');
|
||||
paperModal.setupClickHandlers('.paper-link');
|
||||
</script>
|
||||
{% endblock scripts %}
|
@ -1,93 +1,145 @@
|
||||
<!-- Server-side flash messages from Flask -->
|
||||
{% with messages = get_flashed_messages(with_categories=true) %}
|
||||
{% if messages %}
|
||||
<div class="server-flash-messages">
|
||||
{% for category, message in messages %}
|
||||
<div class="alert alert-{{ category }} alert-dismissible fade show" role="alert">
|
||||
{{ message }}
|
||||
<button type="button" class="btn-close" data-bs-dismiss="alert" aria-label="Close"></button>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% endwith %}
|
||||
|
||||
<!-- JavaScript flash message container for client-side messages -->
|
||||
<div id="clientFlashContainer"></div>
|
||||
<!-- SVG Icons for Flash Messages -->
|
||||
<svg xmlns="http://www.w3.org/2000/svg" class="d-none">
|
||||
<symbol id="check-circle-fill" viewBox="0 0 16 16">
|
||||
<path
|
||||
d="M16 8A8 8 0 1 1 0 8a8 8 0 0 1 16 0zm-3.97-3.03a.75.75 0 0 0-1.08.022L7.477 9.417 5.384 7.323a.75.75 0 0 0-1.06 1.06L6.97 11.03a.75.75 0 0 0 1.079-.02l3.992-4.99a.75.75 0 0 0-.01-1.05z" />
|
||||
</symbol>
|
||||
<symbol id="info-fill" viewBox="0 0 16 16">
|
||||
<path
|
||||
d="M8 16A8 8 0 1 0 8 0a8 8 0 0 0 0 16zm.93-9.412-1 4.705c-.07.34.029.533.304.533.194 0 .487-.07.686-.246l-.088.416c-.287.346-.92.598-1.465.598-.703 0-1.002-.422-.808-1.319l.738-3.468c.064-.293.006-.399-.287-.47l-.451-.081.082-.381 2.29-.287zM8 5.5a1 1 0 1 1 0-2 1 1 0 0 1 0 2z" />
|
||||
</symbol>
|
||||
<symbol id="exclamation-triangle-fill" viewBox="0 0 16 16">
|
||||
<path
|
||||
d="M8.982 1.566a1.13 1.13 0 0 0-1.96 0L.165 13.233c-.457.778.091 1.767.98 1.767h13.713c.889 0 1.438-.99.98-1.767L8.982 1.566zM8 5c.535 0 .954.462.9.995l-.35 3.507a.552.552 0 0 1-1.1 0L7.1 5.995A.905.905 0 0 1 8 5zm.002 6a1 1 0 1 1 0 2 1 1 0 0 1 0-2z" />
|
||||
</symbol>
|
||||
<symbol id="x-circle-fill" viewBox="0 0 16 16">
|
||||
<path
|
||||
d="M16 8A8 8 0 1 1 0 8a8 8 0 0 1 16 0zM5.354 4.646a.5.5 0 1 0-.708.708L7.293 8l-2.647 2.646a.5.5 0 0 0 .708.708L8 8.707l2.646 2.647a.5.5 0 0 0 .708-.708L8.707 8l2.647-2.646a.5.5 0 0 0-.708-.708L8 7.293 5.354 4.646z" />
|
||||
</symbol>
|
||||
</svg>
|
||||
|
||||
<!-- CSS styles for flash overlay messages -->
|
||||
<style>
|
||||
.client-flash-message {
|
||||
.flash-overlay {
|
||||
position: fixed;
|
||||
top: 30%;
|
||||
left: 50%;
|
||||
transform: translate(-50%, -50%);
|
||||
z-index: 1000;
|
||||
width: 300px;
|
||||
text-align: center;
|
||||
font-weight: bold;
|
||||
padding: 12px;
|
||||
margin-bottom: 20px;
|
||||
border-radius: 6px;
|
||||
top: 20px;
|
||||
right: 20px;
|
||||
z-index: 9999;
|
||||
max-width: 420px;
|
||||
opacity: 1;
|
||||
transition: opacity 5s ease-in-out;
|
||||
transition: all 0.3s ease-in-out;
|
||||
transform: translateX(0);
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.client-flash-message.success {
|
||||
background-color: #d4edda;
|
||||
border-color: #c3e6cb;
|
||||
color: #155724;
|
||||
.flash-content {
|
||||
padding: 16px 20px;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 6px 20px rgba(0, 0, 0, 0.15);
|
||||
display: flex;
|
||||
align-items: flex-start;
|
||||
font-weight: 500;
|
||||
border-left: 4px solid;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.client-flash-message.error {
|
||||
.flash-icon {
|
||||
width: 20px;
|
||||
height: 20px;
|
||||
margin-right: 12px;
|
||||
margin-top: 1px;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.flash-message {
|
||||
flex: 1;
|
||||
line-height: 1.4;
|
||||
}
|
||||
|
||||
.flash-close {
|
||||
background: none;
|
||||
border: none;
|
||||
font-size: 20px;
|
||||
cursor: pointer;
|
||||
padding: 0;
|
||||
margin-left: 12px;
|
||||
opacity: 0.6;
|
||||
line-height: 1;
|
||||
font-weight: bold;
|
||||
flex-shrink: 0;
|
||||
margin-top: -2px;
|
||||
}
|
||||
|
||||
.flash-close:hover {
|
||||
opacity: 1;
|
||||
}
|
||||
|
||||
.flash-success .flash-content {
|
||||
background-color: #d1e7dd;
|
||||
border-left-color: #198754;
|
||||
color: #0f5132;
|
||||
}
|
||||
|
||||
.flash-danger .flash-content {
|
||||
background-color: #f8d7da;
|
||||
border-color: #f5c6cb;
|
||||
border-left-color: #dc3545;
|
||||
color: #721c24;
|
||||
}
|
||||
|
||||
.client-flash-message.info {
|
||||
background-color: #d1ecf1;
|
||||
border-color: #bee5eb;
|
||||
color: #0c5460;
|
||||
}
|
||||
|
||||
.client-flash-message.warning {
|
||||
.flash-warning .flash-content {
|
||||
background-color: #fff3cd;
|
||||
border-color: #ffeeba;
|
||||
color: #856404;
|
||||
border-left-color: #ffc107;
|
||||
color: #664d03;
|
||||
}
|
||||
|
||||
.client-flash-message.fade {
|
||||
.flash-info .flash-content {
|
||||
background-color: #cff4fc;
|
||||
border-left-color: #0dcaf0;
|
||||
color: #055160;
|
||||
}
|
||||
|
||||
.flash-overlay.fade-out {
|
||||
opacity: 0;
|
||||
transform: translateX(100%);
|
||||
}
|
||||
|
||||
/* Stack multiple flash messages with smooth transitions */
|
||||
.flash-overlay {
|
||||
/* Dynamic positioning will be set by JavaScript */
|
||||
}
|
||||
|
||||
/* Ensure proper z-index stacking */
|
||||
.flash-overlay:nth-child(1) {
|
||||
z-index: 9999;
|
||||
}
|
||||
|
||||
.flash-overlay:nth-child(2) {
|
||||
z-index: 9998;
|
||||
}
|
||||
|
||||
.flash-overlay:nth-child(3) {
|
||||
z-index: 9997;
|
||||
}
|
||||
|
||||
.flash-overlay:nth-child(4) {
|
||||
z-index: 9996;
|
||||
}
|
||||
|
||||
.flash-overlay:nth-child(5) {
|
||||
z-index: 9995;
|
||||
}
|
||||
</style>
|
||||
|
||||
<!-- Server-side flash messages from Flask -->
|
||||
{% with messages = get_flashed_messages(with_categories=true) %}
|
||||
{% if messages %}
|
||||
<script>
|
||||
// Global flash message function that can be used from anywhere
|
||||
function showFlashMessage(message, type = 'success', duration = 5000) {
|
||||
const flashMsg = document.createElement('div');
|
||||
flashMsg.className = `client-flash-message ${type}`;
|
||||
flashMsg.textContent = message;
|
||||
|
||||
const container = document.getElementById('clientFlashContainer');
|
||||
container.appendChild(flashMsg);
|
||||
|
||||
// Apply fade effect after some time
|
||||
setTimeout(() => flashMsg.classList.add('fade'), duration - 3000);
|
||||
|
||||
// Remove element after duration
|
||||
setTimeout(() => flashMsg.remove(), duration);
|
||||
|
||||
return flashMsg;
|
||||
}
|
||||
|
||||
// Initialize toast messages if Bootstrap is used
|
||||
// Convert server-side flash messages to overlay messages
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
// Initialize any Bootstrap toasts if they exist
|
||||
if (typeof bootstrap !== 'undefined' && bootstrap.Toast) {
|
||||
const toastElList = [].slice.call(document.querySelectorAll('.toast'));
|
||||
toastElList.map(function (toastEl) {
|
||||
return new bootstrap.Toast(toastEl);
|
||||
});
|
||||
}
|
||||
{% for category, message in messages %}
|
||||
showFlashMessage({{ message| tojson }}, {{ (category if category != 'error' else 'danger')| tojson }});
|
||||
{% endfor %}
|
||||
});
|
||||
</script>
|
||||
{% endif %}
|
||||
{% endwith %}
|
Image file changed (binary diff not shown): 2.7 KiB → 4.1 KiB
@ -1,18 +1,82 @@
|
||||
<div class="modal-header">
|
||||
<h5 class="modal-title">Log Details</h5>
|
||||
<button type="button" class="btn-close" data-bs-dismiss="modal"></button>
|
||||
<h5 class="modal-title"><i class="fas fa-info-circle"></i> Log Details</h5>
|
||||
<button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
|
||||
</div>
|
||||
<div class="modal-body">
|
||||
<p><strong>Timestamp:</strong> {{ log.timestamp }}</p>
|
||||
<p><strong>Category:</strong> {{ log.category }}</p>
|
||||
<p><strong>Action:</strong> {{ log.action }}</p>
|
||||
<p><strong>Description:</strong> {{ log.description }}</p>
|
||||
{% if log.extra_data %}
|
||||
<p><strong>Extra Data:</strong>
|
||||
<pre><code>{{ log.extra_data }}</code></pre>
|
||||
<div class="row">
|
||||
<div class="col-md-6">
|
||||
<p><strong>Timestamp:</strong> <span class="text-muted">{{ log.timestamp }}</span></p>
|
||||
<p><strong>Category:</strong>
|
||||
<span class="badge bg-secondary">{{ log.category.replace('_', ' ').title() }}</span>
|
||||
</p>
|
||||
<p><strong>Action:</strong> <code>{{ log.action }}</code></p>
|
||||
{% if log.status %}
|
||||
<p><strong>Status:</strong>
|
||||
{% if log.status == 'success' %}
|
||||
<span class="badge bg-success">{{ log.status.title() }}</span>
|
||||
{% elif log.status == 'error' %}
|
||||
<span class="badge bg-danger">{{ log.status.title() }}</span>
|
||||
{% elif log.status == 'warning' %}
|
||||
<span class="badge bg-warning">{{ log.status.title() }}</span>
|
||||
{% else %}
|
||||
<span class="badge bg-info">{{ log.status.title() }}</span>
|
||||
{% endif %}
|
||||
</p>
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="col-md-6">
|
||||
{% if log.paper_id %}
|
||||
<p><strong>Paper ID:</strong> <a href="/papers/{{ log.paper_id }}" target="_blank">{{ log.paper_id }}</a></p>
|
||||
{% endif %}
|
||||
{% if log.user_id %}
|
||||
<p><strong>User ID:</strong> {{ log.user_id }}</p>
|
||||
{% endif %}
|
||||
{% if log.config_key %}
|
||||
<p><strong>Config Key:</strong> <code>{{ log.config_key }}</code></p>
|
||||
{% endif %}
|
||||
{% if log.source_ip %}
|
||||
<p><strong>Source IP:</strong> {{ log.source_ip }}</p>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% if log.description %}
|
||||
<div class="mt-3">
|
||||
<p><strong>Description:</strong></p>
|
||||
<div class="alert alert-light">{{ log.description }}</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if log.old_value or log.new_value %}
|
||||
<div class="mt-3">
|
||||
<p><strong>Configuration Changes:</strong></p>
|
||||
<div class="row">
|
||||
{% if log.old_value %}
|
||||
<div class="col-md-6">
|
||||
<label class="form-label"><strong>Old Value:</strong></label>
|
||||
<pre class="bg-light p-2"><code>{{ log.old_value }}</code></pre>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if log.new_value %}
|
||||
<div class="col-md-6">
|
||||
<label class="form-label"><strong>New Value:</strong></label>
|
||||
<pre class="bg-light p-2"><code>{{ log.new_value }}</code></pre>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if log.extra_data %}
|
||||
<div class="mt-3">
|
||||
<p><strong>Additional Data:</strong></p>
|
||||
<pre class="bg-light p-3"
|
||||
style="max-height: 300px; overflow-y: auto;"><code id="extra-data-content">{{ log.extra_data }}</code></pre>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="modal-footer">
|
||||
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
|
||||
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">
|
||||
<i class="fas fa-times"></i> Close
|
||||
</button>
|
||||
</div>
|
@ -0,0 +1,249 @@
|
||||
<!-- Scraper Overview Modal -->
|
||||
<div class="modal fade" id="scraperOverviewModal" tabindex="-1" role="dialog"
|
||||
aria-labelledby="scraperOverviewModalLabel" aria-hidden="true">
|
||||
<div class="modal-dialog modal-xl" role="document">
|
||||
<div class="modal-content">
|
||||
<div class="modal-header">
|
||||
<h5 class="modal-title" id="scraperOverviewModalLabel">
|
||||
<i class="fas fa-cogs"></i> Scraper Modules Overview
|
||||
</h5>
|
||||
<button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
|
||||
</div>
|
||||
<div class="modal-body">
|
||||
<!-- Loading state -->
|
||||
<div id="scraperOverviewLoading" class="text-center py-4">
|
||||
<div class="spinner-border text-primary" role="status">
|
||||
<span class="visually-hidden">Loading...</span>
|
||||
</div>
|
||||
<p class="mt-2 text-muted">Loading scraper information...</p>
|
||||
</div>
|
||||
|
||||
<!-- Error state -->
|
||||
<div id="scraperOverviewError" class="alert alert-danger d-none" role="alert">
|
||||
<h6 class="alert-heading">Error Loading Scrapers</h6>
|
||||
<p id="scraperOverviewErrorMessage"></p>
|
||||
<button class="btn btn-outline-danger btn-sm" onclick="loadScraperOverview()">
|
||||
<i class="fas fa-redo"></i> Retry
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<!-- Content -->
|
||||
<div id="scraperOverviewContent" class="d-none">
|
||||
<!-- Scraper Architecture Overview -->
|
||||
<div class="card mb-4">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-info-circle"></i> How Scraper Modules Work
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<p class="mb-3">
|
||||
SciPaperLoader uses a modular scraper architecture where each scraper module handles
|
||||
specific paper processing stages. Papers flow through different statuses as they are
|
||||
processed by various scrapers.
|
||||
</p>
|
||||
|
||||
<div class="row">
|
||||
<div class="col-md-6">
|
||||
<h6>Key Concepts:</h6>
|
||||
<ul class="small">
|
||||
<li><strong>Input Statuses:</strong> Paper statuses this scraper can process
|
||||
</li>
|
||||
<li><strong>Output Statuses:</strong> Statuses papers get after processing</li>
|
||||
<li><strong>Processing Status:</strong> Temporary status while the scraper works
|
||||
</li>
|
||||
<li><strong>Pipeline:</strong> Scrapers can be chained together</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="col-md-6">
|
||||
<h6>Status Flow Example:</h6>
|
||||
<div class="d-flex align-items-center small">
|
||||
<span class="badge bg-info">New</span>
|
||||
<i class="fas fa-arrow-right mx-2"></i>
|
||||
<span class="badge bg-warning">Processing</span>
|
||||
<i class="fas fa-arrow-right mx-2"></i>
|
||||
<span class="badge bg-success">Done</span>
|
||||
</div>
|
||||
<div class="text-muted mt-1">Papers transition through these statuses</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
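The card above describes the module contract only in prose. As a rough illustration (not the project's actual interface, which lives under `scipaperloader/scrapers/` and is not part of this diff), a scraper module following the described status flow might look like the sketch below; the class name, the `ScrapeResult` helper, and the attribute names are all hypothetical:

```python
# Minimal sketch of the "input status -> processing status -> output status" contract
# described in the overview modal. Names are invented for illustration only.
from dataclasses import dataclass


@dataclass
class ScrapeResult:
    success: bool
    message: str = ""


class DummyFetchScraper:
    """Hypothetical module: picks up 'New' papers, leaves them 'Downloaded' or 'Failed'."""

    INPUT_STATUSES = ["New"]            # statuses this module may pick up
    PROCESSING_STATUS = "Processing"    # temporary status while the module works
    OUTPUT_STATUS_SUCCESS = "Downloaded"
    OUTPUT_STATUS_FAILURE = "Failed"

    def scrape(self, paper) -> ScrapeResult:
        # paper is assumed to expose a mutable .status attribute
        paper.status = self.PROCESSING_STATUS
        try:
            # ... download and store the paper here ...
            paper.status = self.OUTPUT_STATUS_SUCCESS
            return ScrapeResult(success=True)
        except Exception as exc:
            paper.status = self.OUTPUT_STATUS_FAILURE
            return ScrapeResult(success=False, message=str(exc))
```

Chaining, in these terms, simply means that a second module lists the first module's success status (e.g. "Downloaded") among its own input statuses, forming the pipeline the modal refers to.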
|
||||
<!-- Current System Configuration -->
|
||||
<div class="card mb-4">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-server"></i> System Configuration
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="row">
|
||||
<div class="col-md-4">
|
||||
<p><strong>Active Scraper Module:</strong> <span id="currentScraperModule"
|
||||
class="badge bg-primary">Loading...</span></p>
|
||||
<p><strong>Daily Volume Limit:</strong> <span
|
||||
id="currentVolumeLimit">Loading...</span> papers</p>
|
||||
</div>
|
||||
<div class="col-md-4">
|
||||
<p><strong>Total Available Modules:</strong> <span
|
||||
id="totalScraperModules">Loading...</span></p>
|
||||
<p><strong>Processing Pipeline:</strong> <span
|
||||
id="processingPipeline">Multi-stage</span></p>
|
||||
</div>
|
||||
<div class="col-md-4">
|
||||
<p><strong>Current Paper Counts:</strong></p>
|
||||
<div id="paperCountsSummary" class="small">
|
||||
<!-- Will be populated by JavaScript -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Available Scrapers Table -->
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-list"></i> Available Scraper Modules
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="table-responsive">
|
||||
<table class="table table-hover">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Module Name</th>
|
||||
<th>Description</th>
|
||||
<th>Input Statuses</th>
|
||||
<th>Success Output</th>
|
||||
<th>Failure Output</th>
|
||||
<th>Processing Status</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="scrapersTableBody">
|
||||
<!-- Table content will be populated by JavaScript -->
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Publisher Parser Overview -->
|
||||
<div class="card mt-4">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-building"></i> Publisher Parser Overview
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="row mb-3">
|
||||
<div class="col-md-12">
|
||||
<p class="text-muted mb-2">
|
||||
<i class="fas fa-info-circle"></i>
|
||||
Publishers are detected from paper URLs and mapped to specific parser modules
|
||||
for content extraction.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
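The detection logic itself is not part of this changeset, so the following is only a minimal sketch of the idea stated above; the host names, the mapping table, and the `detect_publisher` helper are invented for illustration:

```python
# Illustrative only: map a paper URL's host to a parser module name.
from typing import Optional
from urllib.parse import urlparse

# Hypothetical host-to-parser mapping (values are placeholders).
PARSER_BY_DOMAIN = {
    "link.springer.com": "springer",
    "www.sciencedirect.com": "elsevier",
}


def detect_publisher(paper_url: str) -> Optional[str]:
    """Return the parser name registered for the paper's host, or None if unsupported."""
    host = urlparse(paper_url).netloc.lower()
    return PARSER_BY_DOMAIN.get(host)


print(detect_publisher("https://link.springer.com/article/10.1000/example"))  # -> "springer"
```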
<!-- Publisher Statistics -->
|
||||
<div class="row mb-4" id="publisherStats">
|
||||
<!-- Will be populated by JavaScript -->
|
||||
</div>
|
||||
|
||||
<!-- Publishers Table -->
|
||||
<div class="table-responsive">
|
||||
<table class="table table-hover table-sm">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Publisher</th>
|
||||
<th>Papers</th>
|
||||
<th>Parser Status</th>
|
||||
<th>Parser Available</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="publishersTableBody">
|
||||
<!-- Table content will be populated by JavaScript -->
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Status Flow Diagram -->
|
||||
<div class="card mt-4">
|
||||
<div class="card-header">
|
||||
<h6 class="mb-0">
|
||||
<i class="fas fa-project-diagram"></i> Paper Status Flow Diagram
|
||||
</h6>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div id="statusFlowDiagram" class="text-center py-4">
|
||||
<!-- This will be populated by JavaScript -->
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="modal-footer">
|
||||
<div class="d-flex justify-content-between w-100">
|
||||
<small class="text-muted">
|
||||
<i class="fas fa-lightbulb"></i>
|
||||
Tip: Scrapers can be chained to create complex processing pipelines
|
||||
</small>
|
||||
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
/* Custom styles for the scraper overview modal */
|
||||
#scraperOverviewModal .modal-xl {
|
||||
max-width: 1200px;
|
||||
}
|
||||
|
||||
#scraperOverviewModal .table th {
|
||||
font-size: 0.9rem;
|
||||
background-color: #f8f9fa;
|
||||
}
|
||||
|
||||
#scraperOverviewModal .badge {
|
||||
font-size: 0.75rem;
|
||||
}
|
||||
|
||||
#scraperOverviewModal .status-badge {
|
||||
margin: 2px;
|
||||
display: inline-block;
|
||||
}
|
||||
|
||||
.status-flow-node {
|
||||
display: inline-block;
|
||||
padding: 8px 16px;
|
||||
margin: 4px;
|
||||
border-radius: 20px;
|
||||
font-size: 0.9rem;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
.status-flow-arrow {
|
||||
color: #6c757d;
|
||||
margin: 0 8px;
|
||||
}
|
||||
|
||||
.scraper-description {
|
||||
max-width: 300px;
|
||||
word-break: break-word;
|
||||
}
|
||||
|
||||
.input-status-list {
|
||||
max-width: 150px;
|
||||
}
|
||||
|
||||
.status-output {
|
||||
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
</style>
|
File diff suppressed because it is too large
@ -1,34 +1,14 @@
|
||||
{% extends "base.html.jinja" %} {% block content %}
|
||||
{% extends "base.html.jinja" %}
|
||||
|
||||
{% block title %}Import CSV{% endblock title %}
|
||||
|
||||
{% block content %}
|
||||
<h1>Welcome to SciPaperLoader</h1>
|
||||
|
||||
<div id="results-container"></div>
|
||||
<!-- Include flash messages template -->
|
||||
{% include "partials/flash_messages.html.jinja" %}
|
||||
|
||||
{% if success %}
|
||||
<div class="alert alert-success mt-3">{{ success }}</div>
|
||||
{% endif %} {% if error_message %}
|
||||
<div class="alert alert-warning mt-3">
|
||||
<h4>{{ error_message }}</h4>
|
||||
<table class="table table-sm table-bordered">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Row</th>
|
||||
<th>DOI</th>
|
||||
<th>Error</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for error in error_samples %}
|
||||
<tr>
|
||||
<td>{{ error.row }}</td>
|
||||
<td>{{ error.doi }}</td>
|
||||
<td>{{ error.error }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
<a href="{{ url_for('upload.download_error_log') }}" class="btn btn-outline-secondary">Download Full Error Log</a>
|
||||
</div>
|
||||
{% endif %}
|
||||
<div id="results-container"></div>
|
||||
|
||||
<div class="alert alert-info">
|
||||
<p>
|
||||
@ -88,93 +68,42 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% endblock content %}
|
||||
|
||||
{% block scripts %}
|
||||
{{ super() }}
|
||||
<!-- Configuration data in JSON format for clean separation -->
|
||||
<script type="application/json" id="upload-config">
|
||||
{
|
||||
"statusUrlTemplate": {{ (url_for('upload.task_status', task_id='') ~ '{taskId}')|tojson }}
|
||||
}
|
||||
</script>
|
||||
|
||||
<script src="{{ url_for('static', filename='js/form-handler.js') }}"></script>
|
||||
<script>
|
||||
const form = document.getElementById("upload-form");
|
||||
form.addEventListener("submit", function (e) {
|
||||
e.preventDefault();
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
// Read configuration from JSON
|
||||
const config = JSON.parse(document.getElementById('upload-config').textContent);
|
||||
|
||||
// Display loading state immediately
|
||||
const progressModal = new bootstrap.Modal(document.getElementById("progressModal"));
|
||||
progressModal.show();
|
||||
const progressBar = document.getElementById("progressBar");
|
||||
progressBar.style.width = "5%";
|
||||
progressBar.textContent = "Starting...";
|
||||
|
||||
const formData = new FormData(form);
|
||||
|
||||
// Disable the form while processing
|
||||
const submitButton = form.querySelector("button[type='submit']");
|
||||
submitButton.disabled = true;
|
||||
|
||||
fetch(form.action, {
|
||||
method: "POST",
|
||||
body: formData,
|
||||
})
|
||||
.then((response) => response.json())
|
||||
.then((data) => {
|
||||
if (data.error) {
|
||||
// Handle error
|
||||
progressModal.hide();
|
||||
alert(`Error: ${data.error}`);
|
||||
submitButton.disabled = false;
|
||||
return;
|
||||
}
|
||||
|
||||
const taskId = data.task_id;
|
||||
const interval = setInterval(() => {
|
||||
fetch("{{ url_for('upload.task_status', task_id='') }}" + taskId)
|
||||
.then((response) => response.json())
|
||||
.then((status) => {
|
||||
console.log("Task status:", status);
|
||||
if (status.state === "SUCCESS") {
|
||||
clearInterval(interval);
|
||||
progressBar.style.width = "100%";
|
||||
progressBar.textContent = "Completed!";
|
||||
|
||||
setTimeout(() => {
|
||||
progressModal.hide();
|
||||
showResults(status.result);
|
||||
submitButton.disabled = false;
|
||||
}, 1000);
|
||||
} else if (status.state === "FAILURE") {
|
||||
clearInterval(interval);
|
||||
progressBar.style.width = "100%";
|
||||
progressBar.classList.add("bg-danger");
|
||||
progressBar.textContent = "Failed!";
|
||||
|
||||
setTimeout(() => {
|
||||
progressModal.hide();
|
||||
alert(`Task failed: ${status.error || "Unknown error"}`);
|
||||
submitButton.disabled = false;
|
||||
}, 1000);
|
||||
} else {
|
||||
// Update progress bar with more information
|
||||
const progress = status.progress || 0;
|
||||
progressBar.style.width = `${progress}%`;
|
||||
progressBar.textContent = `${progress}% complete`;
|
||||
document.getElementById("progressStatus").innerText = `Processing... (${status.state})`;
|
||||
}
|
||||
})
|
||||
.catch((err) => {
|
||||
console.error("Failed to check task status:", err);
|
||||
});
|
||||
}, 1000);
|
||||
})
|
||||
.catch((err) => {
|
||||
console.error("Upload failed:", err);
|
||||
progressModal.hide();
|
||||
alert("Upload failed. Please try again.");
|
||||
submitButton.disabled = false;
|
||||
// Initialize form handler with custom callbacks
|
||||
const uploadFormHandler = new FormHandler('upload-form', {
|
||||
statusUrlTemplate: config.statusUrlTemplate,
|
||||
onSuccess: showResults,
|
||||
onError: (error) => showFlashMessage(`Upload failed: ${error}`, 'error')
|
||||
});
|
||||
});
|
||||
|
||||
const showResults = (result) => {
|
||||
// Show main success message as overlay
|
||||
const message = `Upload completed! Added: ${result.added}, Updated: ${result.updated}, Skipped: ${result.skipped}, Errors: ${result.error_count}`;
|
||||
showFlashMessage(message, 'success');
|
||||
|
||||
let resultHTML = `<div class="alert alert-success">${message}</div>`;
|
||||
// Build detailed results HTML for the results container
|
||||
let resultHTML = '';
|
||||
|
||||
// Add skipped records information
|
||||
if (result.skipped > 0) {
|
||||
showFlashMessage(`${result.skipped} records were skipped`, 'info');
|
||||
resultHTML += `
|
||||
<div class="alert alert-info">
|
||||
<h4>${result.skipped} records were skipped</h4>
|
||||
@ -205,6 +134,7 @@
|
||||
|
||||
// Existing error display code
|
||||
if (result.error_count > 0) {
|
||||
showFlashMessage(`${result.error_count} errors occurred during upload`, 'warning');
|
||||
resultHTML += `
|
||||
<div class="alert alert-warning">
|
||||
<h4>Some errors occurred (${result.error_count} total)</h4>
|
||||
@ -238,7 +168,8 @@
|
||||
</div>`;
|
||||
}
|
||||
|
||||
// Display detailed results in container
|
||||
document.getElementById("results-container").innerHTML = resultHTML;
|
||||
};
|
||||
</script>
|
||||
{% endblock content %}
|
||||
{% endblock scripts %}
|
131
tests/test_csv_upload.py
Normal file
@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script to verify CSV upload functionality works with APScheduler.
|
||||
"""
|
||||
import requests
|
||||
import time
|
||||
import io
|
||||
import csv
|
||||
from scipaperloader import create_app
|
||||
|
||||
def create_test_csv():
|
||||
"""Create a simple test CSV file."""
|
||||
csv_content = """title,doi,issn,journal,alternative_id,published_online
|
||||
Test Paper 1,10.1000/test_upload_001,1234-5678,Test Journal,ALT001,2024-01-01
|
||||
Test Paper 2,10.1000/test_upload_002,1234-5678,Test Journal,ALT002,2024-01-02
|
||||
Test Paper 3,10.1000/test_upload_003,1234-5678,Test Journal,ALT003,2024-01-03
|
||||
"""
|
||||
return csv_content
|
||||
|
||||
def test_csv_upload():
|
||||
"""Test the CSV upload functionality."""
|
||||
print("🧪 Testing CSV Upload Functionality")
|
||||
print("=" * 50)
|
||||
|
||||
# Create Flask app
|
||||
app = create_app()
|
||||
|
||||
with app.test_client() as client:
|
||||
# Create test CSV
|
||||
csv_content = create_test_csv()
|
||||
|
||||
# Prepare file data
|
||||
csv_file = io.BytesIO(csv_content.encode('utf-8'))
|
||||
csv_file.name = 'test_upload.csv'
|
||||
|
||||
print("📤 Uploading CSV file...")
|
||||
|
||||
# Make upload request
|
||||
response = client.post('/upload/', data={
|
||||
'file': (csv_file, 'test_upload.csv'),
|
||||
'delimiter': ',',
|
||||
'duplicate_strategy': 'skip'
|
||||
}, content_type='multipart/form-data')
|
||||
|
||||
print(f"Response Status: {response.status_code}")
|
||||
print(f"Response Data: {response.get_json()}")
|
||||
|
||||
if response.status_code == 200:
|
||||
response_data = response.get_json()
|
||||
if 'task_id' in response_data:
|
||||
task_id = response_data['task_id']
|
||||
print(f"✅ Task scheduled successfully: {task_id}")
|
||||
|
||||
# Monitor task progress
|
||||
print("\n📊 Monitoring task progress...")
|
||||
for i in range(30): # Wait up to 30 seconds
|
||||
progress_response = client.get(f'/upload/task_status/{task_id}')
|
||||
if progress_response.status_code == 200:
|
||||
progress_data = progress_response.get_json()
|
||||
print(f"Progress: {progress_data}")
|
||||
|
||||
if progress_data.get('state') == 'SUCCESS':
|
||||
print("✅ CSV upload completed successfully!")
|
||||
result = progress_data.get('result', {})
|
||||
print(f" Added: {result.get('added', 0)}")
|
||||
print(f" Skipped: {result.get('skipped', 0)}")
|
||||
print(f" Errors: {result.get('error_count', 0)}")
|
||||
return True
|
||||
elif progress_data.get('state') == 'FAILURE':
|
||||
print(f"❌ CSV upload failed: {progress_data.get('error')}")
|
||||
return False
|
||||
else:
|
||||
print(f"❌ Failed to get task status: {progress_response.status_code}")
|
||||
return False
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
print("⏰ Task did not complete within 30 seconds")
|
||||
return False
|
||||
else:
|
||||
print(f"❌ No task_id in response: {response_data}")
|
||||
return False
|
||||
else:
|
||||
print(f"❌ Upload request failed: {response.status_code}")
|
||||
print(f"Response: {response.get_data(as_text=True)}")
|
||||
return False
|
||||
|
||||
def check_scheduler_status():
|
||||
"""Check APScheduler status."""
|
||||
print("\n🔍 Checking APScheduler Status")
|
||||
print("=" * 50)
|
||||
|
||||
app = create_app()
|
||||
with app.app_context():
|
||||
from scipaperloader.scheduler import _scheduler
|
||||
|
||||
if not _scheduler:
|
||||
print("❌ APScheduler not initialized")
|
||||
return False
|
||||
|
||||
if not _scheduler.running:
|
||||
print("❌ APScheduler not running")
|
||||
return False
|
||||
|
||||
jobs = _scheduler.get_jobs()
|
||||
print(f"✅ APScheduler running with {len(jobs)} jobs")
|
||||
|
||||
# Show current jobs
|
||||
for job in jobs:
|
||||
print(f" - {job.id}: {job.name}")
|
||||
|
||||
return True
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("🚀 CSV Upload Test Suite")
|
||||
print("=" * 50)
|
||||
|
||||
# First check scheduler status
|
||||
if not check_scheduler_status():
|
||||
print("❌ APScheduler issues detected, cannot proceed with test")
|
||||
exit(1)
|
||||
|
||||
# Run the upload test
|
||||
success = test_csv_upload()
|
||||
|
||||
if success:
|
||||
print("\n🎉 All tests passed! CSV upload is working correctly.")
|
||||
exit(0)
|
||||
else:
|
||||
print("\n❌ Test failed! CSV upload needs debugging.")
|
||||
exit(1)
|
397
tests/test_scheduler_functionality.py
Normal file
@ -0,0 +1,397 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive test for APScheduler functionality in SciPaperLoader.
|
||||
Tests job scheduling, execution, revocation, and hourly scheduler functionality.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import threading
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from scipaperloader import create_app
|
||||
from scipaperloader.models import PaperMetadata, ScraperState, ActivityLog, ScheduleConfig, VolumeConfig
|
||||
from scipaperloader.scrapers.manager import ScraperManager
|
||||
from scipaperloader.db import db
|
||||
|
||||
|
||||
def test_scheduler_functionality():
|
||||
"""Comprehensive test of APScheduler functionality."""
|
||||
|
||||
print("🧪 Testing APScheduler Functionality")
|
||||
print("=" * 50)
|
||||
|
||||
# Create test app with in-memory database
|
||||
app = create_app({
|
||||
'TESTING': True,
|
||||
'SQLALCHEMY_DATABASE_URI': 'sqlite:///:memory:',
|
||||
})
|
||||
|
||||
with app.app_context():
|
||||
# Test 1: Basic scheduler availability
|
||||
print("\n📋 Test 1: Scheduler Initialization")
|
||||
scheduler = app.config.get('SCHEDULER')
|
||||
if not scheduler:
|
||||
print("❌ APScheduler not found in app config")
|
||||
return False
|
||||
|
||||
print("✅ APScheduler available and initialized")
|
||||
print(f"📊 Initial job count: {scheduler.get_job_count()}")
|
||||
|
||||
# Test 2: Database table creation
|
||||
print("\n📋 Test 2: APScheduler Database Tables")
|
||||
try:
|
||||
# Check if we can query jobs (which requires tables to exist)
|
||||
jobs = scheduler.get_paper_jobs()
|
||||
print("✅ APScheduler database tables exist and accessible")
|
||||
print(f"📋 Current paper jobs: {len(jobs)}")
|
||||
except Exception as e:
|
||||
print(f"❌ APScheduler database tables not accessible: {e}")
|
||||
return False
|
||||
|
||||
# Test 3: Job scheduling functionality
|
||||
print("\n📋 Test 3: Job Scheduling")
|
||||
|
||||
# Create test paper
|
||||
test_paper = PaperMetadata(
|
||||
title="Test Paper for Scheduler",
|
||||
doi="10.1000/test_scheduler_001",
|
||||
issn="1234-5678",
|
||||
journal="Test Journal",
|
||||
status="New"
|
||||
)
|
||||
db.session.add(test_paper)
|
||||
db.session.commit()
|
||||
|
||||
# Schedule a paper for processing in 30 seconds (longer delay)
|
||||
try:
|
||||
job_id = scheduler.schedule_paper_processing(
|
||||
paper_id=test_paper.id,
|
||||
delay_seconds=30 # Increased delay to 30 seconds
|
||||
# Removed explicit job_id to allow default "paper_job_" prefix
|
||||
)
|
||||
print(f"✅ Paper scheduling works: Job ID {job_id}")
|
||||
except Exception as e:
|
||||
print(f"❌ Paper scheduling failed: {e}")
|
||||
return False
|
||||
|
||||
# Verify job was scheduled
|
||||
jobs_after = scheduler.get_paper_jobs()
|
||||
if len(jobs_after) == 0:
|
||||
print("❌ No jobs found after scheduling")
|
||||
return False
|
||||
|
||||
print(f"✅ Job successfully scheduled: {len(jobs_after)} paper job(s) found")
|
||||
|
||||
# Test 4: Job information retrieval
|
||||
print("\n📋 Test 4: Job Information Retrieval")
|
||||
|
||||
scheduled_job = jobs_after[0]
|
||||
print(f"✅ Job details accessible:")
|
||||
print(f" 📝 Job ID: {scheduled_job['id']}")
|
||||
print(f" 📝 Job Name: {scheduled_job['name']}")
|
||||
print(f" 📝 Next Run Time: {scheduled_job['next_run_time']}")
|
||||
print(f" 📝 Args: {scheduled_job['args']}")
|
||||
|
||||
# Test 5: Job revocation
|
||||
print("\n📋 Test 5: Job Revocation")
|
||||
|
||||
initial_count = len(jobs_after)
|
||||
revoked_count = scheduler.revoke_all_scraper_jobs()
|
||||
|
||||
if revoked_count != initial_count:
|
||||
print(f"⚠️ Warning: Expected to revoke {initial_count} jobs, but revoked {revoked_count}")
|
||||
else:
|
||||
print(f"✅ Job revocation works: {revoked_count} job(s) revoked")
|
||||
|
||||
# Verify jobs were revoked
|
||||
jobs_after_revocation = scheduler.get_paper_jobs()
|
||||
if len(jobs_after_revocation) > 0:
|
||||
print(f"❌ Jobs still exist after revocation: {len(jobs_after_revocation)}")
|
||||
return False
|
||||
|
||||
print("✅ All paper jobs successfully revoked")
|
||||
|
||||
# Test 6: Multiple job scheduling
|
||||
print("\n📋 Test 6: Multiple Job Scheduling")
|
||||
|
||||
# Create more test papers
|
||||
test_papers = []
|
||||
for i in range(3):
|
||||
paper = PaperMetadata(
|
||||
title=f"Test Paper {i+1}",
|
||||
doi=f"10.1000/test_scheduler_{i+2:03d}",
|
||||
issn="1234-5678",
|
||||
journal="Test Journal",
|
||||
status="New"
|
||||
)
|
||||
db.session.add(paper)
|
||||
test_papers.append(paper)
|
||||
|
||||
db.session.commit()
|
||||
|
||||
# Schedule multiple papers
|
||||
scheduled_jobs = []
|
||||
for i, paper in enumerate(test_papers):
|
||||
job_id = scheduler.schedule_paper_processing(
|
||||
paper_id=paper.id,
|
||||
delay_seconds=10 + i # Stagger the scheduling
|
||||
# Removed explicit job_id to allow default "paper_job_" prefix
|
||||
)
|
||||
scheduled_jobs.append(job_id)
|
||||
|
||||
print(f"✅ Multiple job scheduling works: {len(scheduled_jobs)} jobs scheduled")
|
||||
|
||||
# Verify all jobs are scheduled
|
||||
all_jobs = scheduler.get_paper_jobs()
|
||||
if len(all_jobs) != len(test_papers):
|
||||
print(f"❌ Expected {len(test_papers)} jobs, found {len(all_jobs)}")
|
||||
return False
|
||||
|
||||
print(f"✅ All jobs properly scheduled: {len(all_jobs)} total jobs")
|
||||
|
||||
# Test 7: ScraperManager integration
|
||||
print("\n📋 Test 7: ScraperManager Integration")
|
||||
|
||||
manager = ScraperManager()
|
||||
|
||||
# Test paper selection
|
||||
papers = manager.select_papers_for_processing(limit=2)
|
||||
print(f"✅ ScraperManager paper selection: {len(papers)} papers selected")
|
||||
|
||||
# Test scraper state management with APScheduler
|
||||
start_result = manager.start_scraper()
|
||||
if start_result["status"] != "success":
|
||||
print(f"❌ Failed to start scraper: {start_result['message']}")
|
||||
return False
|
||||
|
||||
print("✅ Scraper started successfully")
|
||||
|
||||
# Test job clearing through manager
|
||||
cleared_count = manager._clear_delayed_tasks_from_apscheduler()
|
||||
print(f"✅ ScraperManager job clearing: {cleared_count} jobs cleared")
|
||||
|
||||
# Verify jobs were cleared
|
||||
remaining_jobs = scheduler.get_paper_jobs()
|
||||
if len(remaining_jobs) > 0:
|
||||
print(f"❌ Jobs still exist after manager clearing: {len(remaining_jobs)}")
|
||||
return False
|
||||
|
||||
print("✅ ScraperManager successfully clears APScheduler jobs")
|
||||
|
||||
# Test 8: Hourly scheduler configuration
|
||||
print("\n📋 Test 8: Hourly Scheduler Configuration")
|
||||
|
||||
# Ensure the hourly job is scheduled correctly
|
||||
all_scheduler_jobs = scheduler._scheduler.get_jobs() if hasattr(scheduler, '_scheduler') and scheduler._scheduler else []
|
||||
hourly_jobs = [job for job in all_scheduler_jobs if job.id == 'hourly_scraper_main']
|
||||
|
||||
if not hourly_jobs:
|
||||
print("❌ Hourly scheduler job not found")
|
||||
return False
|
||||
|
||||
hourly_job = hourly_jobs[0]
|
||||
print("✅ Hourly scheduler job found:")
|
||||
print(f" 📝 Job ID: {hourly_job.id}")
|
||||
print(f" 📝 Job Name: {hourly_job.name}")
|
||||
print(f" 📝 Trigger: {hourly_job.trigger}")
|
||||
print(f" 📝 Next Run: {hourly_job.next_run_time}")
|
||||
|
||||
# Test 9: Configuration-based scheduling
|
||||
print("\n📋 Test 9: Configuration-based Scheduling")
|
||||
|
||||
# Set up volume configuration
|
||||
volume_config = VolumeConfig.query.first()
|
||||
if not volume_config:
|
||||
volume_config = VolumeConfig(volume=10) # 10 papers per day
|
||||
db.session.add(volume_config)
|
||||
db.session.commit()
|
||||
|
||||
# Test quota calculation
|
||||
quota = manager.get_current_hour_quota()
|
||||
print(f"✅ Hourly quota calculation: {quota} papers per hour")
|
||||
|
||||
if quota < 0:
|
||||
print("❌ Invalid quota calculation")
|
||||
return False
|
||||
|
||||
# Test 10: Activity logging integration
|
||||
print("\n📋 Test 10: Activity Logging Integration")
|
||||
|
||||
# Check recent APScheduler-related logs
|
||||
recent_logs = ActivityLog.query.filter(
|
||||
ActivityLog.action.like('%apscheduler%')
|
||||
).order_by(ActivityLog.timestamp.desc()).limit(5).all()
|
||||
|
||||
print(f"✅ APScheduler activity logging: {len(recent_logs)} related log entries")
|
||||
|
||||
if recent_logs:
|
||||
for log in recent_logs[:3]:
|
||||
print(f" 📝 {log.action}: {log.description}")
|
||||
|
||||
# Test 11: Error handling
|
||||
print("\n📋 Test 11: Error Handling")
|
||||
|
||||
# Test scheduling with invalid paper ID
|
||||
try:
|
||||
scheduler.schedule_paper_processing(
|
||||
paper_id=99999, # Non-existent paper
|
||||
delay_seconds=1,
|
||||
job_id="test_error_job"
|
||||
)
|
||||
print("✅ Scheduling with invalid paper ID handled gracefully")
|
||||
except Exception as e:
|
||||
print(f"✅ Scheduling with invalid paper ID properly raises exception: {e}")
|
||||
|
||||
# Test 12: Cleanup and shutdown
|
||||
print("\n📋 Test 12: Cleanup and Shutdown")
|
||||
|
||||
# Stop scraper
|
||||
stop_result = manager.stop_scraper()
|
||||
if stop_result["status"] != "success":
|
||||
print(f"❌ Failed to stop scraper: {stop_result['message']}")
|
||||
return False
|
||||
|
||||
print("✅ Scraper stopped successfully")
|
||||
|
||||
# Final job count should be minimal (only hourly scheduler)
|
||||
final_job_count = scheduler.get_job_count()
|
||||
final_paper_jobs = len(scheduler.get_paper_jobs())
|
||||
|
||||
print(f"📊 Final state:")
|
||||
print(f" 📝 Total jobs: {final_job_count}")
|
||||
print(f" 📝 Paper jobs: {final_paper_jobs}")
|
||||
|
||||
if final_paper_jobs > 0:
|
||||
print("❌ Paper jobs still exist after cleanup")
|
||||
return False
|
||||
|
||||
print("✅ Cleanup completed successfully")
|
||||
|
||||
print("\n🎉 ALL SCHEDULER TESTS PASSED!")
|
||||
print("\n📋 Test Summary:")
|
||||
print(" ✅ APScheduler initialization works")
|
||||
print(" ✅ Database tables created and accessible")
|
||||
print(" ✅ Job scheduling functionality works")
|
||||
print(" ✅ Job information retrieval works")
|
||||
print(" ✅ Job revocation works")
|
||||
print(" ✅ Multiple job scheduling works")
|
||||
print(" ✅ ScraperManager integration works")
|
||||
print(" ✅ Hourly scheduler configured correctly")
|
||||
print(" ✅ Configuration-based scheduling works")
|
||||
print(" ✅ Activity logging integration works")
|
||||
print(" ✅ Error handling works")
|
||||
print(" ✅ Cleanup and shutdown works")
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def test_job_execution():
|
||||
"""Test that jobs actually execute (requires waiting)."""
|
||||
print("\n🔄 Testing Job Execution (5-second test)")
|
||||
print("-" * 40)
|
||||
|
||||
app = create_app({
|
||||
'TESTING': True,
|
||||
'SQLALCHEMY_DATABASE_URI': 'sqlite:///:memory:',
|
||||
})
|
||||
|
||||
with app.app_context():
|
||||
# Initialize database and scheduler
|
||||
db.create_all()
|
||||
scheduler = app.config.get('SCHEDULER')
|
||||
if not scheduler:
|
||||
print("❌ Scheduler not initialized")
|
||||
return False
|
||||
|
||||
# Create test paper
|
||||
test_paper = PaperMetadata(
|
||||
title="Test Paper for Execution",
|
||||
doi="10.1000/test_execution",
|
||||
issn="1234-5678",
|
||||
journal="Test Journal",
|
||||
status="Pending"
|
||||
)
|
||||
db.session.add(test_paper)
|
||||
db.session.commit()
|
||||
|
||||
# Verify paper is added to the database
|
||||
test_paper_id = test_paper.id
|
||||
if not test_paper_id:
|
||||
print("❌ Test paper not added to the database")
|
||||
return False
|
||||
|
||||
# Schedule paper for processing in 2 seconds
|
||||
job_id = scheduler.schedule_paper_processing(
|
||||
paper_id=test_paper_id,
|
||||
delay_seconds=2
|
||||
)
|
||||
|
||||
print(f"📅 Scheduled job {job_id} for execution in 2 seconds")
|
||||
|
||||
# Wait and check for execution
|
||||
print("⏳ Waiting for job execution...")
|
||||
time.sleep(3)
|
||||
|
||||
# Check if job completed (should be removed from scheduler)
|
||||
remaining_jobs = scheduler.get_paper_jobs()
|
||||
|
||||
if remaining_jobs:
|
||||
print(f"⚠️ Job still in scheduler: {len(remaining_jobs)} remaining")
|
||||
for job in remaining_jobs:
|
||||
print(f" 📝 Job ID: {job['id']}, Next Run Time: {job['next_run_time']}")
|
||||
else:
|
||||
print("✅ Job executed and removed from scheduler")
|
||||
|
||||
# Check activity logs for execution evidence
|
||||
execution_logs = ActivityLog.query.filter(
|
||||
ActivityLog.action.like('%process_single_paper%')
|
||||
).order_by(ActivityLog.timestamp.desc()).limit(3).all()
|
||||
|
||||
if execution_logs:
|
||||
print("✅ Job execution logged in activity:")
|
||||
for log in execution_logs:
|
||||
print(f" 📝 {log.action}: {log.description}")
|
||||
else:
|
||||
print("⚠️ No execution logs found")
|
||||
|
||||
# Validate job execution status in the database
|
||||
updated_paper = PaperMetadata.query.get(test_paper_id)
|
||||
if updated_paper:
|
||||
print(f"🔍 Retrieved paper: {updated_paper.title}, Status: {updated_paper.status}")
|
||||
if updated_paper.status == "Done":
|
||||
print("✅ Paper status updated to 'Done'")
|
||||
else:
|
||||
print(f"❌ Paper status not updated: {updated_paper.status}")
|
||||
else:
|
||||
print("❌ Paper not found in the database")
|
||||
|
||||
return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print(f"📅 Starting scheduler tests at {datetime.now()}")
|
||||
|
||||
try:
|
||||
# Run main functionality tests
|
||||
success = test_scheduler_functionality()
|
||||
|
||||
if success:
|
||||
print("\n" + "="*50)
|
||||
# Run execution test if main tests pass
|
||||
test_job_execution()
|
||||
|
||||
print(f"\n📅 Tests completed at {datetime.now()}")
|
||||
sys.exit(0 if success else 1)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n⏹️ Tests interrupted by user")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"\n❌ Test error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
@ -18,4 +18,5 @@ def client(app):
|
||||
|
||||
def test_index(client):
|
||||
response = client.get("/")
|
||||
assert b"It works!" in response.data
|
||||
# Updated assertion to check for actual content in the index page
|
||||
assert b"Welcome to SciPaperLoader" in response.data
|
||||
|
@ -10,7 +10,7 @@ especially for addressing issues with the scraper module.
|
||||
**Symptoms:**
|
||||
- Web interface shows scraper as stopped but papers are still being processed
|
||||
- `/scraper/stop` endpoint returns success but processing continues
|
||||
- Active tasks show up in Celery inspector
|
||||
- Active tasks show up in APScheduler inspector
|
||||
|
||||
**Solutions:**
|
||||
|
||||
@ -24,7 +24,7 @@ python tools/diagnostics/emergency_stop.py
|
||||
|
||||
The emergency stop performs these actions (a Python sketch follows the list):
|
||||
- Sets scraper state to inactive in the database
|
||||
- Revokes all running, reserved, and scheduled Celery tasks
|
||||
- Revokes all running and scheduled APScheduler tasks
|
||||
- Purges all task queues
|
||||
- Reverts papers with "Pending" status to their previous state
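
A minimal sketch of that sequence, pieced together from the APIs used elsewhere in this changeset (`app.config['SCHEDULER']`, `scheduler.revoke_all_scraper_jobs()`, and the `PaperMetadata` status field); `tools/diagnostics/emergency_stop.py` remains the authoritative implementation:

```python
# Illustrative emergency-stop sequence; see tools/diagnostics/emergency_stop.py
# for the real script.
from scipaperloader import create_app
from scipaperloader.db import db
from scipaperloader.models import PaperMetadata, ScraperState

app = create_app()
with app.app_context():
    # 1. Clear the scraper flags so no new work is picked up.
    #    (The real script also flips the active flag; that call is outside this diff.)
    ScraperState.set_paused(False)

    # 2. Revoke every scheduled APScheduler job created by the scraper.
    scheduler = app.config.get('SCHEDULER')
    if scheduler:
        revoked = scheduler.revoke_all_scraper_jobs()
        print(f"Revoked {revoked} APScheduler jobs")

    # 3. Return papers that were mid-flight to a safe status.
    #    (The bullet list also mentions reverting "Pending" papers to their
    #    previous state; that step is not shown in this diff.)
    PaperMetadata.query.filter_by(status="Processing").update({"status": "Pending"})
    db.session.commit()
```

The sketch needs the Flask application context, which is why every diagnostics script in this changeset starts by calling `create_app()`.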
|
||||
|
||||
@ -33,12 +33,12 @@ The emergency stop performs these actions:
|
||||
**Symptoms:**
|
||||
- Code changes don't seem to have any effect
|
||||
- Bug fixes don't work even though the code is updated
|
||||
- Workers might be using cached versions of modified code
|
||||
- APScheduler might be using cached versions of modified code
|
||||
|
||||
**Solution:**
|
||||
|
||||
```bash
|
||||
# Use the quick fix to stop tasks and restart workers
|
||||
# Use the quick fix to stop tasks and restart the application
|
||||
make diagnostics # Then select option 6 (Quick fix)
|
||||
|
||||
# Or directly:
|
||||
@ -57,7 +57,7 @@ python tools/diagnostics/diagnose_scraper.py
|
||||
|
||||
This tool will:
|
||||
- Show current scraper state
|
||||
- List all active, scheduled, and reserved tasks
|
||||
- List all active and scheduled APScheduler tasks
|
||||
- Display recent activity and error logs
|
||||
|
||||
## Preventative Measures
|
||||
@ -67,11 +67,10 @@ This tool will:
|
||||
- Deploying code changes
|
||||
- Modifying the database
|
||||
|
||||
2. **Monitor task queue size** using Flower web interface:
|
||||
2. **Monitor APScheduler jobs** through the diagnostic tools:
|
||||
```bash
|
||||
make celery-flower
|
||||
make diagnostics # Then select option 2 (Inspect tasks)
|
||||
```
|
||||
Then visit http://localhost:5555
|
||||
|
||||
3. **Check logs for failed tasks** regularly in the Logger tab of the application
|
||||
|
||||
|
@ -7,14 +7,14 @@ This directory contains various scripts for diagnosing issues, debugging, and ha
|
||||
### Scraper Management
|
||||
|
||||
- **emergency_stop.py**: Force stops all scraper activities, revokes running tasks, and reverts papers from "Pending" state
|
||||
- **quick_fix.py**: A simplified emergency stop that also restarts Celery workers to ensure code changes are applied
|
||||
- **quick_fix.py**: A simplified emergency stop that also stops Flask processes to ensure code changes are applied
|
||||
- **test_reversion.py**: Tests the paper reversion functionality when stopping the scraper
|
||||
|
||||
### Monitoring and Diagnostics
|
||||
|
||||
- **check_state.py**: Checks the current state of the scraper in the database
|
||||
- **diagnose_scraper.py**: Comprehensive diagnostic tool that examines tasks, logs, and scraper state
|
||||
- **inspect_tasks.py**: Displays currently running, scheduled, and reserved Celery tasks
|
||||
- **inspect_tasks.py**: Displays currently running and scheduled APScheduler tasks
|
||||
|
||||
## Usage
|
||||
|
||||
@ -59,5 +59,5 @@ python tools/diagnostics/quick_fix.py
|
||||
## Notes
|
||||
|
||||
- Always run these scripts from the project root directory
|
||||
- Some scripts may require a running Redis server
|
||||
- Some scripts may require a running Flask application with APScheduler
|
||||
- After using emergency tools, the application may need to be restarted completely
|
||||
|
@ -3,7 +3,6 @@ Diagnose and fix scraper stopping issues.
|
||||
"""
|
||||
|
||||
from scipaperloader import create_app
|
||||
from scipaperloader.celery import celery
|
||||
from scipaperloader.models import ScraperState, ActivityLog
|
||||
from scipaperloader.scrapers.factory import get_scraper
|
||||
|
||||
@ -18,21 +17,15 @@ def check_scraper_status():
|
||||
else:
|
||||
print("No scraper state found in database")
|
||||
|
||||
def check_celery_tasks():
|
||||
"""Check currently running Celery tasks."""
|
||||
i = celery.control.inspect()
|
||||
|
||||
print("\n=== ACTIVE TASKS ===")
|
||||
active_tasks = i.active() or {}
|
||||
for worker, tasks in active_tasks.items():
|
||||
for task in tasks:
|
||||
print(f"Worker: {worker}, Task: {task.get('name', 'Unknown')}, ID: {task.get('id', 'Unknown')}")
|
||||
|
||||
print("\n=== SCHEDULED TASKS ===")
|
||||
scheduled_tasks = i.scheduled() or {}
|
||||
for worker, tasks in scheduled_tasks.items():
|
||||
for task in tasks:
|
||||
print(f"Worker: {worker}, Task: {task.get('name', 'Unknown')}, ID: {task.get('id', 'Unknown')}")
|
||||
def check_scheduler_jobs():
|
||||
"""Check the current jobs in APScheduler."""
|
||||
with app.app_context():
|
||||
scheduler = app.config.get('SCHEDULER')
|
||||
if not scheduler:
|
||||
print("❌ APScheduler not found in app config")
|
||||
else:
|
||||
jobs = scheduler.get_paper_jobs()
|
||||
print("Scheduled jobs:", jobs)
|
||||
|
||||
def check_recent_logs():
|
||||
"""Check recent activity logs for clues."""
|
||||
@ -60,41 +53,26 @@ def force_stop_scraper():
|
||||
print("Set scraper state to inactive")
|
||||
|
||||
# Revoke all tasks
|
||||
i = celery.control.inspect()
|
||||
revoked_ids = []
|
||||
|
||||
# Check all queues
|
||||
for queue_name, queue_func in [
|
||||
("scheduled", i.scheduled),
|
||||
("active", i.active),
|
||||
("reserved", i.reserved)
|
||||
]:
|
||||
queue = queue_func() or {}
|
||||
for worker, tasks in queue.items():
|
||||
for task in tasks:
|
||||
task_id = task.get('id')
|
||||
if task_id and task_id not in revoked_ids:
|
||||
celery.control.revoke(task_id, terminate=True)
|
||||
revoked_ids.append(task_id)
|
||||
print(f"Revoked task: {task_id}")
|
||||
|
||||
# Purge all queues
|
||||
celery.control.purge()
|
||||
print("Purged all task queues")
|
||||
scheduler = app.config.get('SCHEDULER')
|
||||
if not scheduler:
|
||||
print("❌ APScheduler not found in app config")
|
||||
else:
|
||||
revoked_count = scheduler.revoke_all_scraper_jobs()
|
||||
print(f"✅ Revoked {revoked_count} jobs from APScheduler")
|
||||
|
||||
# Log the action
|
||||
ActivityLog.log_scraper_command(
|
||||
action="force_stop_scraper",
|
||||
status="success",
|
||||
description=f"Force stopped scraper, revoked {len(revoked_ids)} tasks"
|
||||
description=f"Force stopped scraper, revoked {revoked_count} tasks"
|
||||
)
|
||||
|
||||
print(f"\nRevoked {len(revoked_ids)} tasks in total")
|
||||
print(f"\nRevoked {revoked_count} tasks in total")
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("=== SCRAPER STATUS DIAGNOSTIC TOOL ===")
|
||||
check_scraper_status()
|
||||
check_celery_tasks()
|
||||
check_scheduler_jobs()
|
||||
check_recent_logs()
|
||||
|
||||
stop_confirmation = input("\nDo you want to force stop the scraper? (y/n): ")
|
||||
|
@ -23,7 +23,6 @@ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../.
|
||||
from scipaperloader import create_app
|
||||
from scipaperloader.db import db
|
||||
from scipaperloader.models import PaperMetadata, ActivityLog, ScraperState
|
||||
from scipaperloader.celery import celery
|
||||
|
||||
app = create_app()
|
||||
|
||||
@ -38,46 +37,18 @@ def emergency_stop():
|
||||
ScraperState.set_paused(False)
|
||||
print("✓ Set scraper state to inactive")
|
||||
|
||||
# 2. Revoke all tasks
|
||||
print("\nRevoking running tasks...")
|
||||
try:
|
||||
i = celery.control.inspect()
|
||||
active = i.active() or {}
|
||||
scheduled = i.scheduled() or {}
|
||||
reserved = i.reserved() or {}
|
||||
# 2. Revoke all jobs in APScheduler
|
||||
scheduler = app.config.get('SCHEDULER')
|
||||
if scheduler:
|
||||
revoked_count = scheduler.revoke_all_scraper_jobs()
|
||||
print(f"✅ Revoked {revoked_count} jobs from APScheduler")
|
||||
else:
|
||||
print("❌ APScheduler not found in app config")
|
||||
|
||||
revoked_count = 0
|
||||
|
||||
# Revoke active tasks
|
||||
for worker, tasks in active.items():
|
||||
for task in tasks:
|
||||
if 'id' in task:
|
||||
celery.control.revoke(task['id'], terminate=True)
|
||||
revoked_count += 1
|
||||
print(f" Revoked active task: {task.get('name', 'unknown')}")
|
||||
|
||||
# Revoke scheduled tasks
|
||||
for worker, tasks in scheduled.items():
|
||||
for task in tasks:
|
||||
if 'id' in task:
|
||||
celery.control.revoke(task['id'], terminate=True)
|
||||
revoked_count += 1
|
||||
|
||||
# Revoke reserved tasks
|
||||
for worker, tasks in reserved.items():
|
||||
for task in tasks:
|
||||
if 'id' in task:
|
||||
celery.control.revoke(task['id'], terminate=True)
|
||||
revoked_count += 1
|
||||
|
||||
print(f"✓ Revoked {revoked_count} tasks")
|
||||
|
||||
# 3. Purge queues
|
||||
celery.control.purge()
|
||||
print("✓ Purged all task queues")
|
||||
|
||||
except Exception as e:
|
||||
print(f"⚠ Error revoking tasks: {str(e)}")
|
||||
# 3. Revert all papers to 'Pending' state
|
||||
PaperMetadata.query.filter_by(status="Processing").update({"status": "Pending"})
|
||||
db.session.commit()
|
||||
print("✅ Reverted all 'Processing' papers to 'Pending' state")
|
||||
|
||||
# 4. Revert papers in "Pending" status
|
||||
try:
|
||||
|
@ -1,11 +1,78 @@
#!/usr/bin/env python3
"""
Inspect current Celery tasks (active, reserved, and scheduled)
Inspect current APScheduler jobs (active and scheduled).
"""

from scipaperloader.celery import celery
import sys
import os
from datetime import datetime

i = celery.control.inspect()
print("Active tasks:", i.active())
print("Reserved tasks:", i.reserved())
print("Scheduled tasks:", i.scheduled())
# Add project root to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

from scipaperloader import create_app
from scipaperloader.models import ScraperState

def main():
print("=== APScheduler Task Inspector ===")
print(f"Time: {datetime.now()}\n")

app = create_app()

with app.app_context():
# Check scraper state
scraper_state = ScraperState.get_current_state()
print(f"🔄 Scraper State:")
print(f" Active: {'✅' if scraper_state.is_active else '❌'} {scraper_state.is_active}")
print(f" Paused: {'⏸️' if scraper_state.is_paused else '▶️'} {scraper_state.is_paused}")
print()

# Check APScheduler
scheduler = app.config.get('SCHEDULER')
if not scheduler:
print("❌ APScheduler not found in app config")
return

print("📋 APScheduler Status:")
# Access the underlying scheduler
if hasattr(scheduler, 'scheduler') and scheduler.scheduler:
print(f" Running: {'✅' if scheduler.scheduler.running else '❌'} {scheduler.scheduler.running}")
else:
print("❌ APScheduler instance not accessible")
print()

# Get all jobs
if hasattr(scheduler, 'scheduler') and scheduler.scheduler:
all_jobs = scheduler.scheduler.get_jobs()
else:
all_jobs = []
paper_jobs = scheduler.get_paper_jobs()

print(f"📊 Job Statistics:")
print(f" Total jobs: {len(all_jobs)}")
print(f" Paper processing jobs: {len(paper_jobs)}")
print()

if paper_jobs:
print("📝 Active Paper Processing Jobs:")
for job in paper_jobs:
next_run = job.get('next_run_time', 'Not scheduled')
print(f" • {job['id']}")
print(f" Next run: {next_run}")
print(f" Name: {job.get('name', 'N/A')}")
if job.get('args'):
print(f" Paper ID: {job['args'][0] if job['args'] else 'N/A'}")
print()
else:
print("✅ No active paper processing jobs")

# Show other jobs if any
other_jobs = [job for job in all_jobs if not any(pattern in job.id for pattern in ['paper_process_', 'test_paper_process_', 'process_paper_'])]
if other_jobs:
print(f"🔧 Other Scheduled Jobs ({len(other_jobs)}):")
for job in other_jobs:
next_run = job.next_run_time.strftime('%Y-%m-%d %H:%M:%S') if job.next_run_time else 'Not scheduled'
print(f" • {job.id} - Next run: {next_run}")

if __name__ == "__main__":
main()
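The inspector above indexes paper jobs as dicts (`job['id']`, `job.get('args')`) but treats `other_jobs` as raw APScheduler `Job` objects (`job.id`, `job.next_run_time`), so `get_paper_jobs()` presumably serializes jobs before returning them. A minimal sketch of that assumed shape, for illustration only and not the project's implementation:

from apscheduler.schedulers.background import BackgroundScheduler

# Hypothetical sketch: return paper-processing jobs as plain dicts so callers
# don't have to touch APScheduler Job objects directly.
def get_paper_jobs(scheduler: BackgroundScheduler) -> list[dict]:
    prefixes = ('paper_process_', 'test_paper_process_', 'process_paper_')
    jobs = []
    for job in scheduler.get_jobs():
        if job.id.startswith(prefixes):
            jobs.append({
                'id': job.id,
                'name': job.name,
                'args': list(job.args),
                'next_run_time': str(job.next_run_time) if job.next_run_time else None,
            })
    return jobs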
@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
Quick fix script to stop all running scraper tasks and restart Celery workers.
This ensures the updated code is loaded and tasks are properly terminated.
Quick fix script to stop all running scraper tasks using APScheduler.
This ensures all scheduled tasks are properly terminated.
"""

import os
@ -9,45 +9,55 @@ import sys
import signal
import subprocess
import time
from datetime import datetime
from datetime import datetime, UTC

# Add project root to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

def kill_celery_processes():
"""Kill all running Celery processes"""
print("Killing Celery processes...")
def stop_apscheduler_jobs():
"""Stop all APScheduler jobs through the Flask app"""
print("Stopping APScheduler jobs...")
try:
# Get all celery processes
result = subprocess.run(['pgrep', '-f', 'celery'], capture_output=True, text=True)
from scipaperloader import create_app

app = create_app()
with app.app_context():
scheduler = app.config.get('SCHEDULER')
if scheduler:
revoked_count = scheduler.revoke_all_scraper_jobs()
print(f"✓ Revoked {revoked_count} APScheduler jobs")
else:
print("❌ APScheduler not found in app config")

except Exception as e:
print(f"⚠ Error stopping APScheduler jobs: {e}")

def kill_python_processes():
"""Kill any running Python processes that might be Flask/APScheduler workers"""
print("Checking for running Flask/APScheduler processes...")
try:
# Look for Flask processes
result = subprocess.run(['pgrep', '-f', 'flask'], capture_output=True, text=True)
if result.returncode == 0:
pids = result.stdout.strip().split('\n')
for pid in pids:
if pid:
try:
# Check if this is our process before killing
cmdline_result = subprocess.run(['ps', '-p', pid, '-o', 'cmd='], capture_output=True, text=True)
if 'scipaperloader' in cmdline_result.stdout:
os.kill(int(pid), signal.SIGTERM)
print(f" Killed process {pid}")
except ProcessLookupError:
pass  # Process already dead
print(f" Killed Flask process {pid}")
except (ProcessLookupError, ValueError):
pass  # Process already dead or invalid PID

# Wait a moment for graceful shutdown
time.sleep(2)
else:
print("✓ No Flask processes found")

# Force kill any remaining processes
result = subprocess.run(['pgrep', '-f', 'celery'], capture_output=True, text=True)
if result.returncode == 0:
pids = result.stdout.strip().split('\n')
for pid in pids:
if pid:
try:
os.kill(int(pid), signal.SIGKILL)
print(f" Force killed process {pid}")
except ProcessLookupError:
pass

print("✓ All Celery processes terminated")
except Exception as e:
print(f"⚠ Error killing processes: {e}")
print(f"⚠ Error checking processes: {e}")

def stop_scraper_state():
"""Set scraper state to inactive using Flask app context"""
@ -55,6 +65,7 @@ def stop_scraper_state():
from scipaperloader import create_app
from scipaperloader.models import ScraperState, PaperMetadata
from scipaperloader.db import db
from scipaperloader.scrapers.factory import get_scraper

app = create_app()
with app.app_context():
@ -63,41 +74,57 @@ def stop_scraper_state():
ScraperState.set_paused(False)
print("✓ Set scraper state to inactive")

# Revert any pending papers to "New" status (simple approach since we don't have previous_status data yet)
pending_papers = PaperMetadata.query.filter_by(status="Pending").all()
# Get scraper configuration for proper status reversion
scraper = get_scraper()
input_statuses = scraper.get_input_statuses()
output_statuses = scraper.get_output_statuses()
processing_status = output_statuses.get("processing", "Processing")

# Revert any papers in processing status
processing_papers = PaperMetadata.query.filter_by(status=processing_status).all()
reverted_count = 0

for paper in pending_papers:
paper.status = "New"  # Simple fallback - revert all to "New"
if processing_papers and input_statuses:
revert_status = input_statuses[0]  # Use first input status as default

for paper in processing_papers:
# Try to use previous_status if available, otherwise use first input status
if hasattr(paper, 'previous_status') and paper.previous_status:
paper.status = paper.previous_status
else:
paper.status = revert_status
paper.updated_at = datetime.now(UTC)
reverted_count += 1

if reverted_count > 0:
db.session.commit()
print(f"✓ Reverted {reverted_count} papers from 'Pending' to 'New'")
print(f"✓ Reverted {reverted_count} papers from '{processing_status}' to previous status")
else:
print("✓ No pending papers to revert")
print("✓ No papers in processing status to revert")

except Exception as e:
print(f"⚠ Error setting scraper state: {e}")

def main():
print("=== QUICK SCRAPER FIX ===")
print("=== QUICK SCRAPER FIX (APScheduler) ===")
print(f"Time: {datetime.now()}")
print()

# Step 1: Stop scraper state
# Step 1: Stop scraper state and revert papers
stop_scraper_state()

# Step 2: Kill all Celery processes
kill_celery_processes()
# Step 2: Stop all APScheduler jobs
stop_apscheduler_jobs()

# Step 3: Kill any running Flask processes
kill_python_processes()

print()
print("=== FIX COMPLETE ===")
print("The scraper has been stopped and all tasks terminated.")
print("You can now restart the Celery workers with:")
print(" make celery")
print("or")
print("You can now restart the application with:")
print(" make run")
print("or")
print(" python -m flask --app scipaperloader run")

if __name__ == "__main__":
main()
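Both this script and the test below lean on the scraper's status contract (`get_input_statuses()` / `get_output_statuses()`). A minimal illustration of the assumed shapes, with example values only; the real values come from the configured scraper module:

# Illustration only: example values, not the project's actual configuration.
input_statuses = ["New"]                  # statuses the scraper picks papers up from
output_statuses = {
    "processing": "Processing",           # transient status while a paper is being worked on
    "success": "Done",                    # example terminal statuses (assumed)
    "failure": "Failed",
}

def revert_status_for(paper) -> str:
    # Reversion rule used above: prefer the paper's stored previous_status,
    # otherwise fall back to the first input status.
    return getattr(paper, "previous_status", None) or input_statuses[0]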
@ -1,16 +1,17 @@
#!/usr/bin/env python3
"""
Test script for verifying the paper reversion fix.
Test script for verifying the paper reversion fix with APScheduler.
This script:
1. Simulates stopping the scraper
2. Checks that all pending papers were reverted to their previous status
3. Ensures all running tasks were terminated
1. Creates test papers and simulates processing
2. Tests the stop_scraper functionality
3. Checks that all pending papers were reverted to their previous status
4. Ensures all running tasks were terminated
"""

import os
import sys
import time
from datetime import datetime
from datetime import datetime, UTC, timedelta
from sqlalchemy import func
from flask import Flask

@ -21,81 +22,136 @@ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../.
from scipaperloader import create_app
from scipaperloader.db import db
from scipaperloader.models import PaperMetadata, ActivityLog, ScraperState
from scipaperloader.celery import celery
from scipaperloader.scrapers.factory import get_scraper
from scipaperloader.scrapers.manager import ScraperManager

print("[DEBUG] Initializing Flask app...")
app = create_app()

print("[DEBUG] Flask app initialized.")

def test_stop_scraper():
"""Test the stop_scraper functionality"""
"""Test the stop_scraper functionality with proper APScheduler integration"""

print("[DEBUG] Entering app context...")
with app.app_context():
# First check current scraper state
scraper_state = ScraperState.get_current_state()
print(f"Current scraper state: active={scraper_state.is_active}, paused={scraper_state.is_paused}")
print("[DEBUG] App context entered.")

# Check if there are any papers in "Pending" state
pending_count = PaperMetadata.query.filter_by(status="Pending").count()
print(f"Papers in 'Pending' state before stopping: {pending_count}")

if pending_count == 0:
print("No papers in 'Pending' state to test with.")
print("Would you like to create a test paper in Pending state? (y/n)")
choice = input().lower()
if choice == 'y':
# Create a test paper
paper = PaperMetadata(
title="Test Paper for Reversion",
doi="10.1234/test.123",
status="Pending",
previous_status="New",  # Test value we expect to be reverted to
created_at=datetime.utcnow(),
updated_at=datetime.utcnow()
)
db.session.add(paper)
# Clear existing test data
print("[DEBUG] Clearing existing test data...")
PaperMetadata.query.filter(PaperMetadata.doi.like('10.1234/test%')).delete()
db.session.commit()
print(f"Created test paper with ID {paper.id}, status='Pending', previous_status='New'")
pending_count = 1
print("[DEBUG] Existing test data cleared.")

# Simulate the stop_scraper API call
from scipaperloader.blueprints.scraper import revert_pending_papers
print("Reverting pending papers...")
reverted = revert_pending_papers()
print(f"Reverted {reverted} papers from 'Pending' state")
# Get scraper configuration
scraper = get_scraper()
input_statuses = scraper.get_input_statuses()
output_statuses = scraper.get_output_statuses()

# Check if any papers are still in "Pending" state
still_pending = PaperMetadata.query.filter_by(status="Pending").count()
print(f"Papers still in 'Pending' state after stopping: {still_pending}")
if not input_statuses:
print("❌ No input statuses found for current scraper")
return

# List any that were reverted and their current status
if reverted > 0:
print("\nPapers that were reverted:")
recent_logs = ActivityLog.query.filter_by(action="revert_pending").order_by(
ActivityLog.timestamp.desc()).limit(10).all()
input_status = input_statuses[0]  # Use first input status
processing_status = output_statuses.get("processing", "Processing")

for log in recent_logs:
paper = PaperMetadata.query.get(log.paper_id)
if paper:
print(f"Paper ID {paper.id}: '{paper.title}' - Now status='{paper.status}'")
print(f"[DEBUG] Using input status: {input_status}")
print(f"[DEBUG] Using processing status: {processing_status}")

# Check active celery tasks
i = celery.control.inspect()
active = i.active() or {}
reserved = i.reserved() or {}
scheduled = i.scheduled() or {}
# Create test papers in input status
test_papers = []
print("[DEBUG] Creating test papers...")
for i in range(3):
test_paper = PaperMetadata()
test_paper.title = f"Test Paper {i+1}"
test_paper.doi = f"10.1234/test{i+1}"
test_paper.status = input_status
test_paper.created_at = datetime.now(UTC)
test_paper.updated_at = datetime.now(UTC)
db.session.add(test_paper)
test_papers.append(test_paper)
db.session.commit()
print(f"[DEBUG] Created {len(test_papers)} test papers in '{input_status}' status.")

active_count = sum(len(tasks) for worker, tasks in active.items())
reserved_count = sum(len(tasks) for worker, tasks in reserved.items())
scheduled_count = sum(len(tasks) for worker, tasks in scheduled.items())
# Simulate some papers being moved to processing status
print("[DEBUG] Simulating papers in processing...")
for i, paper in enumerate(test_papers[:2]):  # Move first 2 papers to processing
paper.previous_status = paper.status  # Store previous status
paper.status = processing_status
paper.updated_at = datetime.now(UTC)
db.session.commit()
print(f"[DEBUG] Moved 2 papers to '{processing_status}' status.")

print(f"\nCurrently {active_count} active, {reserved_count} reserved, and {scheduled_count} scheduled tasks")
# Check current scraper state
scraper_state = ScraperState.get_current_state()
print(f"[DEBUG] Current scraper state: active={scraper_state.is_active}, paused={scraper_state.is_paused}")

# Print conclusion
if still_pending == 0 and reverted > 0:
print("\nSUCCESS: All pending papers were properly reverted!")
elif still_pending > 0:
print(f"\nWARNING: {still_pending} papers are still in 'Pending' state!")
elif pending_count == 0 and reverted == 0:
print("\nNo papers to revert. Can't fully test.")
# Check paper counts before stopping
input_count = PaperMetadata.query.filter_by(status=input_status).count()
processing_count = PaperMetadata.query.filter_by(status=processing_status).count()
print(f"[DEBUG] Papers before stopping: {input_count} in '{input_status}', {processing_count} in '{processing_status}'")

if __name__ == "__main__":
test_stop_scraper()
# Test APScheduler job management
scheduler = app.config.get('SCHEDULER')
if scheduler:
print("[DEBUG] Testing APScheduler job management...")

# Create some test jobs using the correct API
for paper in test_papers:
job_id = scheduler.schedule_paper_processing(
paper_id=paper.id,
delay_seconds=60,  # 1 minute from now
job_id=f"test_paper_process_{paper.id}"
)
print(f"[DEBUG] Scheduled job {job_id} for paper {paper.id}")

jobs_before = len(scheduler.get_paper_jobs())
print(f"[DEBUG] Created {jobs_before} test jobs in APScheduler")

# Test the manager's stop_scraper method
print("[DEBUG] Testing ScraperManager.stop_scraper()...")
manager = ScraperManager()
result = manager.stop_scraper()

print(f"[DEBUG] stop_scraper result: {result}")

# Check jobs after stopping
jobs_after = len(scheduler.get_paper_jobs())
print(f"[DEBUG] Jobs after stopping: {jobs_after} (should be 0)")

if jobs_after == 0:
print("✅ All APScheduler jobs successfully revoked")
else:
print(f"❌ {jobs_after} jobs still exist after revocation")
else:
print("❌ APScheduler not found in app config")

# Check paper counts after stopping
input_count_after = PaperMetadata.query.filter_by(status=input_status).count()
processing_count_after = PaperMetadata.query.filter_by(status=processing_status).count()
print(f"[DEBUG] Papers after stopping: {input_count_after} in '{input_status}', {processing_count_after} in '{processing_status}'")

# Verify that processing papers were reverted
if processing_count_after == 0 and input_count_after >= processing_count:
print("✅ Papers successfully reverted from processing to previous status")
else:
print(f"❌ Paper reversion failed: expected 0 processing papers, got {processing_count_after}")

# Check scraper state after stopping
scraper_state_after = ScraperState.get_current_state()
print(f"[DEBUG] Scraper state after stopping: active={scraper_state_after.is_active}, paused={scraper_state_after.is_paused}")

if not scraper_state_after.is_active and not scraper_state_after.is_paused:
print("✅ Scraper state correctly set to inactive")
else:
print("❌ Scraper state not properly updated")

# Clean up test data
print("[DEBUG] Cleaning up test data...")
PaperMetadata.query.filter(PaperMetadata.doi.like('10.1234/test%')).delete()
db.session.commit()
print("[DEBUG] Test data cleaned up.")

print("[DEBUG] Starting test_stop_scraper...")
test_stop_scraper()
print("[DEBUG] test_stop_scraper completed.")
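The test drives `schedule_paper_processing(paper_id=..., delay_seconds=..., job_id=...)`. Under APScheduler, a delayed one-shot job like that typically maps onto `add_job()` with a `'date'` trigger; a minimal sketch under that assumption (the `process_paper` callable and the wrapper signature are placeholders taken from this changeset, not the project's verbatim code):

from datetime import datetime, timedelta, UTC
from apscheduler.schedulers.background import BackgroundScheduler

def process_paper(paper_id: int) -> None:
    """Placeholder for the real paper-processing task."""
    print(f"processing paper {paper_id}")

# Hypothetical sketch: register a job that fires once after delay_seconds.
def schedule_paper_processing(scheduler: BackgroundScheduler, paper_id: int,
                              delay_seconds: int = 0, job_id: str | None = None) -> str:
    job_id = job_id or f"paper_process_{paper_id}"
    run_at = datetime.now(UTC) + timedelta(seconds=delay_seconds)
    scheduler.add_job(
        process_paper,             # callable to run
        trigger='date',            # fire exactly once...
        run_date=run_at,           # ...at this time
        args=[paper_id],
        id=job_id,
        replace_existing=True,     # re-scheduling the same paper overwrites the old job
    )
    return job_id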