Implements asynchronous task management in the input module.
parent 5d63e28a61
commit 05f4c8b517
4 .gitignore (vendored)
@@ -12,4 +12,6 @@ dist/
*.db
*.R
migrations/
31 Makefile
@@ -1,10 +1,13 @@
# List of phony targets (targets that don't represent files)
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev celery celery-flower redis run-all

# Define Python and pip executables inside virtual environment
PYTHON := venv/bin/python
PIP := venv/bin/pip

# Celery worker command
CELERY := venv/bin/celery

# Default target that runs the application
all: run

@@ -83,11 +86,11 @@ todos:
	@grep -r "TODO\|FIXME" scipaperloader || echo "No TODOs found"

# Reset the database: delete, initialize, and migrate
reset-db:
reset-db: venv
	rm -f $(DB_PATH)
	flask db init || true
	flask db migrate -m "Initial migration"
	flask db upgrade
	$(PYTHON) -m flask --app scipaperloader db init || true
	$(PYTHON) -m flask --app scipaperloader db migrate -m "Initial migration"
	$(PYTHON) -m flask --app scipaperloader db upgrade

# Create and set up virtual environment
venv:
@@ -130,3 +133,21 @@ dist: format-check lint mypy test

# Set up complete development environment
dev: clean venv

# Start Celery worker for processing tasks
celery: venv
	$(CELERY) -A celery_worker:celery worker --loglevel=info

# Monitor Celery tasks with flower web interface
celery-flower: venv
	$(PIP) install flower
	$(CELERY) -A celery_worker:celery flower --port=5555

# Check if Redis is running, start if needed
redis:
	@redis-cli ping > /dev/null 2>&1 || (echo "Starting Redis server..." && redis-server --daemonize yes)

# Run complete application stack (Flask app + Celery worker + Redis)
run-all: redis
	@echo "Starting Flask and Celery..."
	@$(MAKE) -j2 run celery
45 README.md
@@ -14,7 +14,8 @@ And open it in the browser at [http://localhost:5000/](http://localhost:5000/)

## Prerequisites

Python >=3.8
- Python >=3.8
- Redis (for Celery task queue)

## Development environment

@@ -40,12 +41,44 @@ Python >=3.8
add development dependencies under `project.optional-dependencies.*`; run
`make clean && make venv` to reinstall the environment

## Asynchronous Task Processing with Celery

SciPaperLoader uses Celery for processing large CSV uploads and other background tasks. This allows the application to handle large datasets reliably without blocking the web interface.
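The pattern is standard Celery usage; a minimal sketch (with a hypothetical `example_task` standing in for the real `process_csv` task added in this commit) looks like this:

```python
# Minimal sketch of the task pattern used here. `example_task` is a
# hypothetical stand-in for process_csv in scipaperloader/blueprints/upload.py.
from scipaperloader.celery import celery


@celery.task(bind=True)
def example_task(self, csv_content):
    # The ContextTask set up in scipaperloader/celery.py pushes the Flask app
    # context, so the database and other extensions are available here.
    self.update_state(state="PROGRESS", meta={"progress": 50})
    return {"rows": csv_content.count("\n")}


# In a request handler: enqueue the work and return the task id to the client.
task = example_task.delay("doi,title\n10.1000/xyz123,Example paper\n")
print(task.id)
```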
### Running Celery Components

- `make redis`: ensures Redis server is running (required for Celery)
- `make celery`: starts a Celery worker to process background tasks
- `make celery-flower`: starts Flower, a web interface for monitoring Celery tasks at http://localhost:5555
- `make run-all`: runs the entire stack (Flask app + Celery worker + Redis) in development mode

### How It Works

When you upload a CSV file through the web interface:

1. The file is sent to the server
2. A Celery task is created to process the file asynchronously
3. The browser shows a progress bar with real-time updates
4. The results are displayed when processing is complete

This architecture allows SciPaperLoader to handle CSV files with thousands of papers without timing out or blocking the web interface.
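The same flow can be driven from a script. The sketch below assumes the app runs locally on port 5000, that the upload blueprint is mounted at the site root, and that the form field names match `upload.html.jinja`:

```python
# Rough illustration of the upload-then-poll flow (assumed base URL and
# mount point; adjust to your deployment).
import time

import requests

BASE = "http://localhost:5000"

with open("papers.csv", "rb") as fh:
    resp = requests.post(
        BASE + "/",
        files={"file": fh},
        data={"delimiter": ",", "duplicate_strategy": "skip"},
    )
task_id = resp.json()["task_id"]

# Poll the task_status endpoint until the Celery task finishes.
while True:
    status = requests.get(f"{BASE}/task_status/{task_id}").json()
    if status["state"] in ("SUCCESS", "FAILURE"):
        print(status)
        break
    print(f"progress: {status.get('progress', 0)}%")
    time.sleep(1)
```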
## Configuration

Default configuration is loaded from `scipaperloader.defaults` and can be
overridden by environment variables with a `FLASK_` prefix. See
[Configuring from Environment Variables](https://flask.palletsprojects.com/en/3.0.x/config/#configuring-from-environment-variables).

### Celery Configuration

The following environment variables can be set to configure Celery:

- `FLASK_CELERY_BROKER_URL`: Redis URL for the message broker (default: `redis://localhost:6379/0`)
- `FLASK_CELERY_RESULT_BACKEND`: Redis URL for storing task results (default: `redis://localhost:6379/0`)
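These values land in the Flask config and are copied onto the Celery instance. A condensed sketch, assuming the `FLASK_` variables are read with Flask's `from_prefixed_env` as the linked docs describe:

```python
# Condensed illustration of how the settings flow into Celery (mirrors
# create_app() and configure_celery() in this commit; from_prefixed_env is an
# assumption about how the FLASK_ variables are loaded).
from celery import Celery
from flask import Flask

app = Flask(__name__)
app.config.from_prefixed_env()  # picks up FLASK_CELERY_BROKER_URL, etc.
app.config['CELERY_BROKER_URL'] = app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0')
app.config['CELERY_RESULT_BACKEND'] = app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0')

celery = Celery('scipaperloader')
celery.conf.update(
    broker_url=app.config['CELERY_BROKER_URL'],
    result_backend=app.config['CELERY_RESULT_BACKEND'],
)
```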
Consider using
[dotenv](https://flask.palletsprojects.com/en/3.0.x/cli/#environment-variables-from-dotenv).

@@ -58,4 +91,12 @@ deliver to your server, or copy in your `Dockerfile`, and install it with `pip`.

You must set a
[SECRET_KEY](https://flask.palletsprojects.com/en/3.0.x/tutorial/deploy/#configure-the-secret-key)
in production to a secret and stable value.

### Deploying with Celery

When deploying to production:

1. Configure a production-ready Redis instance or use a managed service
2. Run Celery workers as system services or in Docker containers
3. Consider setting up monitoring for your Celery tasks and workers
7 celery_worker.py (new file)
@@ -0,0 +1,7 @@
from scipaperloader.celery import celery, configure_celery

# Configure celery with Flask app
configure_celery()

if __name__ == '__main__':
    celery.start()
@@ -13,6 +13,10 @@ dependencies = [
    "flask-wtf>=1.2.2,<2",
    "pyzotero>=1.6.11,<2",
    "pandas>=2.2.3,<3",
    "celery>=5.5.1,<6",
    "redis>=5.2.1,<6",
    "flower>=2.0.1,<3",
    "flask-migrate>=4.1.0,<5",
]

[project.optional-dependencies]
@@ -1,5 +1,5 @@
from flask import Flask, request

from flask_migrate import Migrate  # Add this line
from .config import Config
from .db import db
from .models import init_schedule_config
@@ -10,10 +10,15 @@ def create_app(test_config=None):
    app = Flask(__name__)
    app.config.from_object(Config)

    # Celery configuration
    app.config['CELERY_BROKER_URL'] = app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0')
    app.config['CELERY_RESULT_BACKEND'] = app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0')

    if test_config:
        app.config.update(test_config)

    db.init_app(app)
    migrate = Migrate(app, db)  # Add this line to initialize Flask-Migrate

    with app.app_context():
        db.create_all()
@@ -27,10 +32,19 @@ def create_app(test_config=None):

    @app.before_request
    def before_request():
        # Skip logging for static files, health checks, or other frequent requests
        if request.path.startswith('/static/') or request.path == '/health' or request.path == '/favicon.ico':
            return

        # Skip task status checks to avoid log spam
        if request.path.startswith('/task_status/'):
            return

        action = request.endpoint or request.path or "unknown_request"
        ActivityLog.log_gui_interaction(
            action=request.endpoint,
            description=f"Request to {request.endpoint}",
            action=action,
            description=f"Request to {request.path}",
            extra={"method": request.method, "url": request.url}
        )

    return app

    return app
@@ -116,7 +116,7 @@ def export_papers():
            [
                paper.id,
                paper.title,
                getattr(paper, "journal", ""),
                paper.journal,
                paper.doi,
                paper.issn,
                paper.status,
@@ -3,26 +3,39 @@ import codecs
import csv
import datetime
from io import StringIO
import json

import pandas as pd
from flask import (
    Blueprint,
    flash,
    jsonify,
    redirect,
    render_template,
    request,
    send_file,
    session,
    url_for,
    current_app
)

from ..db import db
from ..models import PaperMetadata
from ..models import PaperMetadata, ActivityLog
from ..celery import celery  # Import the celery instance directly

bp = Blueprint("upload", __name__)

REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
CHUNK_SIZE = 100  # Number of rows to process per batch

def parse_date(date_str):
    """Parse date string into datetime object."""
    if not date_str or pd.isna(date_str):
        return None
    try:
        return datetime.datetime.strptime(date_str, "%Y-%m-%d")
    except ValueError:
        return None

@bp.route("/", methods=["GET", "POST"])
def upload():
@@ -32,136 +45,214 @@ def upload():
        duplicate_strategy = request.form.get("duplicate_strategy", "skip")

        if not file:
            return render_template("upload.html.jinja", error="No file selected.")
            return jsonify({"error": "No file selected."})

        try:
            stream = codecs.iterdecode(file.stream, "utf-8")
            content = "".join(stream)
            df = pd.read_csv(StringIO(content), delimiter=delimiter)
        except Exception as e:
            return render_template("upload.html.jinja", error=f"Failed to read CSV file: {e}")
        stream = codecs.iterdecode(file.stream, "utf-8")
        content = "".join(stream)

        missing = REQUIRED_COLUMNS - set(df.columns)
        if missing:
            return render_template(
                "upload.html.jinja", error=f"Missing required columns: {', '.join(missing)}"
            )

        # Optional: parse 'published_online' to date
        def parse_date(val):
            if pd.isna(val):
                return None
            try:
                return pd.to_datetime(val).date()
            except Exception:
                return None

        # Count statistics
        added_count = 0
        skipped_count = 0
        updated_count = 0
        error_count = 0
        # Trigger the Celery task
        task = process_csv.delay(content, delimiter, duplicate_strategy)

        # Collect error information
        errors = []

        # Process each row
        for index, row in df.iterrows():
            try:
                # Get DOI from row for error reporting
                doi = str(row.get("doi", "N/A"))

                # Validate required fields
                for field in ["title", "doi", "issn"]:
                    if pd.isna(row.get(field)) or not str(row.get(field)).strip():
                        raise ValueError(f"Missing required field: {field}")

                # Check if paper with this DOI already exists
                existing = PaperMetadata.query.filter_by(doi=doi).first()

                if existing:
                    if duplicate_strategy == 'update':
                        # Update existing record
                        existing.title = row["title"]
                        existing.alt_id = row.get("alternative_id")
                        existing.issn = row["issn"]
                        existing.journal = row.get("journal")
                        existing.type = row.get("type")
                        existing.language = row.get("language")
                        existing.published_online = parse_date(row.get("published_online"))
                        updated_count += 1
        return jsonify({"task_id": task.id})

    return render_template("upload.html.jinja")

@celery.task(bind=True)
def process_csv(self, file_content, delimiter, duplicate_strategy):
    """Process CSV file and import paper metadata."""

    # With the ContextTask in place, we're already inside an app context
    added_count = skipped_count = updated_count = error_count = 0
    errors = []
    skipped_records = []  # Add this to track skipped records

    try:
        # Log the start of import using ActivityLog model
        ActivityLog.log_import_activity(
            action="start_csv_import",
            status="processing",
            description=f"Starting CSV import with strategy: {duplicate_strategy}",
            file_size=len(file_content),
            delimiter=delimiter
        )

        # Set initial progress percentage
        self.update_state(state='PROGRESS', meta={'progress': 10})

        # Read CSV into chunks
        csv_buffer = StringIO(file_content)
        # Count total chunks
        csv_buffer.seek(0)
        total_chunks = len(list(pd.read_csv(csv_buffer, delimiter=delimiter, chunksize=CHUNK_SIZE)))
        csv_buffer.seek(0)

        # Process each chunk of rows
        for chunk_idx, chunk in enumerate(pd.read_csv(csv_buffer, delimiter=delimiter, chunksize=CHUNK_SIZE)):
            for index, row in chunk.iterrows():
                try:
                    doi = str(row.get("doi", "N/A"))
                    # Validate required fields
                    if pd.isna(row.get("title")) or pd.isna(row.get("doi")) or pd.isna(row.get("issn")):
                        raise ValueError("Missing required fields")

                    # Try finding an existing record based on DOI
                    existing = db.session.query(PaperMetadata).filter_by(doi=doi).first()
                    if existing:
                        if duplicate_strategy == "update":
                            existing.title = row["title"]
                            existing.alt_id = row.get("alternative_id")
                            existing.issn = row["issn"]
                            existing.journal = row.get("journal")
                            existing.published_online = parse_date(row.get("published_online"))
                            updated_count += 1
                        else:
                            # Track why this record was skipped
                            skipped_records.append({
                                "row": index + 2,
                                "doi": doi,
                                "reason": f"Duplicate DOI found and strategy is '{duplicate_strategy}'"
                            })
                            skipped_count += 1
                            continue
                    else:
                        # Skip this record
                        skipped_count += 1
                        continue
                    else:
                        # Create new record
                        metadata = PaperMetadata(
                            title=row["title"],
                            doi=doi,
                            alt_id=row.get("alternative_id"),
                            issn=row["issn"],
                            journal=row.get("journal"),
                            type=row.get("type"),
                            language=row.get("language"),
                            published_online=parse_date(row.get("published_online")),
                            status="New",
                            file_path=None,
                            error_msg=None,
                        )
                        db.session.add(metadata)
                        added_count += 1

                except Exception as e:
                    error_count += 1
                    errors.append({
                        "row": index + 2,  # +2 because index is 0-based and we have a header row
                        "doi": row.get("doi", "N/A"),
                        "error": str(e)
                    })
                    continue  # Skip this row and continue with the next
                    metadata = PaperMetadata(
                        title=row["title"],
                        doi=doi,
                        alt_id=row.get("alternative_id"),
                        issn=row["issn"],
                        journal=row.get("journal"),
                        published_online=parse_date(row.get("published_online")),
                        status="New",
                    )
                    db.session.add(metadata)
                    added_count += 1
                except Exception as e:
                    error_count += 1
                    errors.append({"row": index + 2, "doi": row.get("doi", "N/A"), "error": str(e)})

            try:
                # Commit the chunk and roll session fresh
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                return render_template(
                    "upload.html.jinja", error=f"Failed to save data to database: {e}"
                )

            # Prepare error samples for display
            error_samples = errors[:5] if errors else []

            error_message = None
            if errors:
                error_message = f"Encountered {len(errors)} errors. First 5 shown below."

            # Store the full errors list in the session for potential download
            if errors:
            # Log periodic progress every 5 chunks
            if (chunk_idx + 1) % 5 == 0:
                ActivityLog.log_import_activity(
                    action="import_progress",
                    status="processing",
                    description=f"Processed {chunk_idx+1}/{total_chunks} chunks",
                    current_stats={
                        "added": added_count,
                        "updated": updated_count,
                        "skipped": skipped_count,
                        "errors": error_count
                    }
                )

            progress = min(90, 10 + int((chunk_idx + 1) * 80 / total_chunks))
            self.update_state(state='PROGRESS', meta={'progress': progress})

        # Final progress update and completion log
        self.update_state(state='PROGRESS', meta={'progress': 100})
        ActivityLog.log_import_activity(
            action="complete_csv_import",
            status="success",
            description="CSV import completed",
            stats={
                "added": added_count,
                "updated": updated_count,
                "skipped": skipped_count,
                "errors": error_count
            }
        )

    except Exception as e:
        db.session.rollback()
        ActivityLog.log_error(
            error_message="CSV import failed",
            exception=e,
            severity="error",
            source="upload.process_csv"
        )
        return {'error': str(e), 'progress': 0}
    finally:
        db.session.remove()

    # If there were errors, store an error CSV for potential download
    if errors:
        try:
            error_csv = StringIO()
            writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
            writer.writeheader()
            writer.writerows(errors)
            session["error_data"] = error_csv.getvalue()
            ActivityLog.log_import_activity(
                action="import_errors",
                status="error",
                description=f"Import completed with {error_count} errors",
                error_csv=error_csv.getvalue(),
                task_id=self.request.id,
                error_count=error_count
            )
        except Exception:
            # Do not fail the task if error logging fails
            pass

    return render_template(
        "upload.html.jinja",
        success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}",
        error_message=error_message,
        error_samples=error_samples
    )
    # Update the return value to include skipped records information
    return {
        "added": added_count,
        "updated": updated_count,
        "skipped": skipped_count,
        "skipped_records": skipped_records[:5],  # Include up to 5 examples
        "skipped_reason_summary": "Records were skipped because they already exist in the database. Use 'update' strategy to update them.",
        "errors": errors[:5],
        "error_count": error_count,
        "task_id": self.request.id
    }

@bp.route("/task_status/<task_id>")
def task_status(task_id):
    """Get status of background task."""
    task = celery.AsyncResult(task_id)

    if task.state == "PENDING":
        response = {"state": task.state, "progress": 0}
    elif task.state == "PROGRESS":
        response = {
            "state": task.state,
            "progress": task.info.get("progress", 0)
        }
    elif task.state == "SUCCESS":
        response = {
            "state": task.state,
            "result": task.result
        }
    else:  # FAILURE, REVOKED, etc.
        response = {
            "state": task.state,
            "error": str(task.info) if task.info else "Unknown error"
        }

    return jsonify(response)

    return render_template("upload.html.jinja")


@bp.route("/download_error_log")
def download_error_log():
    error_data = session.get("error_data")
    if not error_data:
@bp.route("/download_error_log/<task_id>")
def download_error_log(task_id):
    # Find the most recent error log for this task
    error_log = ActivityLog.query.filter(
        ActivityLog.action == "import_errors",
        ActivityLog.extra_data.like(f'%"{task_id}"%')  # Search in JSON
    ).order_by(ActivityLog.timestamp.desc()).first()

    if not error_log:
        flash("No error data available.")
        return redirect(url_for("upload.upload"))

    # Get the CSV data from extra_data
    extra_data = error_log.get_extra_data()
    error_csv = extra_data.get("error_csv")

    if not error_csv:
        flash("Error data format is invalid.")
        return redirect(url_for("upload.upload"))

    buffer = StringIO(error_data)
    buffer = StringIO(error_csv)
    return send_file(
        buffer,
        mimetype="text/csv",
43 scipaperloader/celery.py (new file)
@@ -0,0 +1,43 @@
from celery import Celery

# Create Celery instance without Flask app initially
celery = Celery(
    'scipaperloader',
    broker='redis://localhost:6379/0',
    backend='redis://localhost:6379/0',
)

def configure_celery(app=None):
    """Configure Celery with the Flask app settings and ensure tasks run in the app context."""
    if app is None:
        # Import here to avoid circular import
        from scipaperloader import create_app
        app = create_app()

    # Update Celery configuration using the app settings
    celery.conf.update(
        broker_url=app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0'),
        result_backend=app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0'),
        task_serializer='json',
        accept_content=['json'],
        result_serializer='json',
        timezone='UTC',
        enable_utc=True,
        task_time_limit=3600,  # 1 hour max runtime
        task_soft_time_limit=3000,  # 50 minutes soft limit
        worker_max_tasks_per_child=10,  # Restart workers after 10 tasks
        worker_max_memory_per_child=1000000,  # 1GB memory limit
        task_acks_late=True,  # Acknowledge tasks after completion
        task_reject_on_worker_lost=True,  # Requeue tasks if worker dies
    )

    # Create a custom task class that pushes the Flask application context
    class ContextTask(celery.Task):
        abstract = True

        def __call__(self, *args, **kwargs):
            with app.app_context():
                return self.run(*args, **kwargs)

    celery.Task = ContextTask
    return celery
@@ -12,6 +12,7 @@ class ActivityCategory(Enum):
    SCRAPER_COMMAND = "scraper_command"
    SCRAPER_ACTIVITY = "scraper_activity"
    SYSTEM = "system"
    DATA_IMPORT = "data_import"


class ErrorSeverity(Enum):
@@ -164,6 +165,20 @@ class ActivityLog(db.Model):
        db.session.commit()
        return log

    @classmethod
    def log_import_activity(cls, action, status=None, description=None, user_id=None, **extra):
        """Log data import activities (CSV uploads, bulk imports, etc.)."""
        log = cls(
            category=ActivityCategory.DATA_IMPORT.value,
            action=action,
            status=status,
            description=description,
            user_id=user_id
        )
        log.set_extra_data(extra)
        db.session.add(log)
        db.session.commit()
        return log

class PaperMetadata(db.Model):
    id = db.Column(db.Integer, primary_key=True)
@@ -171,6 +186,7 @@ class PaperMetadata(db.Model):
    doi = db.Column(db.String, unique=True, index=True)
    alt_id = db.Column(db.String)
    issn = db.Column(db.String(32))
    journal = db.Column(db.String(255))
    type = db.Column(db.String(50))
    language = db.Column(db.String(50))
    published_online = db.Column(db.Date)  # or DateTime/String
@@ -1,5 +1,9 @@
.message {
  padding: 10px;
  font-size: 1.3em;
  font-family: Arial, sans-serif;
}

.progress-bar {
  width: 0%;
}
@@ -1,6 +1,8 @@
{% extends "base.html.jinja" %} {% block content %}
<h1>Welcome to SciPaperLoader</h1>

<div id="results-container"></div>

{% if success %}
<div class="alert alert-success mt-3">{{ success }}</div>
{% endif %} {% if error_message %}
@@ -40,24 +42,9 @@
<li><code>issn</code> – the ISSN of the journal</li>
<li><code>title</code> – the title of the paper</li>
</ul>
<p>
  The format of your CSV should resemble the response structure of the
  Crossref API's <code>/journals/{issn}/works</code> endpoint.
</p>
</div>

<form method="post" action="{{ url_for('upload.upload') }}" enctype="multipart/form-data">
  <div class="mb-3">
    <label class="form-label">How to handle duplicate DOIs:</label>
    <div class="form-check">
      <input class="form-check-input" type="radio" name="duplicate_strategy" value="skip" id="skip" checked />
      <label class="form-check-label" for="skip">Skip duplicate entries</label>
    </div>
    <div class="form-check">
      <input class="form-check-input" type="radio" name="duplicate_strategy" value="update" id="update" />
      <label class="form-check-label" for="update">Update existing entries</label>
    </div>
  </div>
<form method="post" action="{{ url_for('upload.upload') }}" enctype="multipart/form-data" id="upload-form">
  <div class="form-group">
    <label for="file">Upload CSV File</label>
    <input type="file" name="file" id="file" class="form-control" required />
@@ -73,4 +60,175 @@
  </div>
  <button type="submit" class="btn btn-primary mt-3">Upload</button>
</form>

<!-- Progress Modal -->
<div id="progressModal" class="modal fade" tabindex="-1">
  <div class="modal-dialog">
    <div class="modal-content">
      <div class="modal-header">
        <h5 class="modal-title">Processing Your Upload</h5>
      </div>
      <div class="modal-body">
        <div class="progress">
          <div id="progressBar" class="progress-bar" role="progressbar">0%</div>
        </div>
        <p id="progressStatus" class="mt-2 text-center">Starting...</p>
      </div>
    </div>
  </div>
</div>

<script>
  const form = document.getElementById("upload-form");
  form.addEventListener("submit", function (e) {
    e.preventDefault();

    // Display loading state immediately
    const progressModal = new bootstrap.Modal(document.getElementById("progressModal"));
    progressModal.show();
    const progressBar = document.getElementById("progressBar");
    progressBar.style.width = "5%";
    progressBar.textContent = "Starting...";

    const formData = new FormData(form);

    // Disable the form while processing
    const submitButton = form.querySelector("button[type='submit']");
    submitButton.disabled = true;

    fetch(form.action, {
      method: "POST",
      body: formData,
    })
      .then((response) => response.json())
      .then((data) => {
        if (data.error) {
          // Handle error
          progressModal.hide();
          alert(`Error: ${data.error}`);
          submitButton.disabled = false;
          return;
        }

        const taskId = data.task_id;
        const interval = setInterval(() => {
          fetch("{{ url_for('upload.task_status', task_id='') }}" + taskId)
            .then((response) => response.json())
            .then((status) => {
              console.log("Task status:", status);
              if (status.state === "SUCCESS") {
                clearInterval(interval);
                progressBar.style.width = "100%";
                progressBar.textContent = "Completed!";

                setTimeout(() => {
                  progressModal.hide();
                  showResults(status.result);
                  submitButton.disabled = false;
                }, 1000);
              } else if (status.state === "FAILURE") {
                clearInterval(interval);
                progressBar.style.width = "100%";
                progressBar.classList.add("bg-danger");
                progressBar.textContent = "Failed!";

                setTimeout(() => {
                  progressModal.hide();
                  alert(`Task failed: ${status.error || "Unknown error"}`);
                  submitButton.disabled = false;
                }, 1000);
              } else {
                // Update progress bar with more information
                const progress = status.progress || 0;
                progressBar.style.width = `${progress}%`;
                progressBar.textContent = `${progress}% complete`;
                document.getElementById("progressStatus").innerText = `Processing... (${status.state})`;
              }
            })
            .catch((err) => {
              console.error("Failed to check task status:", err);
            });
        }, 1000);
      })
      .catch((err) => {
        console.error("Upload failed:", err);
        progressModal.hide();
        alert("Upload failed. Please try again.");
        submitButton.disabled = false;
      });
  });

  const showResults = (result) => {
    const message = `Upload completed! Added: ${result.added}, Updated: ${result.updated}, Skipped: ${result.skipped}, Errors: ${result.error_count}`;

    let resultHTML = `<div class="alert alert-success">${message}</div>`;

    // Add skipped records information
    if (result.skipped > 0) {
      resultHTML += `
        <div class="alert alert-info">
          <h4>${result.skipped} records were skipped</h4>
          <p>${result.skipped_reason_summary || "Records were skipped because they already exist in the database."}</p>
          ${result.skipped_records && result.skipped_records.length > 0 ? `
          <p>Examples of skipped records:</p>
          <table class="table table-sm table-bordered">
            <thead>
              <tr>
                <th>Row</th>
                <th>DOI</th>
                <th>Reason</th>
              </tr>
            </thead>
            <tbody>
              ${result.skipped_records.map(record => `
              <tr>
                <td>${record.row}</td>
                <td>${record.doi}</td>
                <td>${record.reason}</td>
              </tr>
              `).join('')}
            </tbody>
          </table>
          ` : ''}
        </div>`;
    }

    // Existing error display code
    if (result.error_count > 0) {
      resultHTML += `
        <div class="alert alert-warning">
          <h4>Some errors occurred (${result.error_count} total)</h4>
          <p>Showing first ${result.errors.length} of ${result.error_count} errors:</p>
          <table class="table table-sm table-bordered">
            <thead>
              <tr>
                <th>Row</th>
                <th>DOI</th>
                <th>Error</th>
              </tr>
            </thead>
            <tbody>`;

      result.errors.forEach(error => {
        resultHTML += `
          <tr>
            <td>${error.row}</td>
            <td>${error.doi}</td>
            <td>${error.error}</td>
          </tr>`;
      });

      resultHTML += `
          </tbody>
        </table>
        <p class="mt-2">Download the complete error log with all ${result.error_count} errors:</p>
        <a href="/upload/download_error_log/${result.task_id}" class="btn btn-outline-secondary">
          Download Full Error Log
        </a>
      </div>`;
    }

    document.getElementById("results-container").innerHTML = resultHTML;
  };
</script>
{% endblock content %}
1641 testdata.csv (new file)
File diff suppressed because it is too large