Implements asynchronous task management in the input module.
parent 5d63e28a61
commit 05f4c8b517
.gitignore (vendored): 2 changes

@@ -13,3 +13,5 @@ dist/
 *.db
 *.R
 
+
+migrations/
Makefile: 31 changes

@@ -1,10 +1,13 @@
 # List of phony targets (targets that don't represent files)
-.PHONY: all clean venv run format format-check lint mypy test dist reformat dev
+.PHONY: all clean venv run format format-check lint mypy test dist reformat dev celery celery-flower redis run-all
 
 # Define Python and pip executables inside virtual environment
 PYTHON := venv/bin/python
 PIP := venv/bin/pip
 
+# Celery worker command
+CELERY := venv/bin/celery
+
 # Default target that runs the application
 all: run
 
@@ -83,11 +86,11 @@ todos:
 	@grep -r "TODO\|FIXME" scipaperloader || echo "No TODOs found"
 
 # Reset the database: delete, initialize, and migrate
-reset-db:
+reset-db: venv
 	rm -f $(DB_PATH)
-	flask db init || true
-	flask db migrate -m "Initial migration"
-	flask db upgrade
+	$(PYTHON) -m flask --app scipaperloader db init || true
+	$(PYTHON) -m flask --app scipaperloader db migrate -m "Initial migration"
+	$(PYTHON) -m flask --app scipaperloader db upgrade
 
 # Create and set up virtual environment
 venv:
@@ -130,3 +133,21 @@ dist: format-check lint mypy test
 
 # Set up complete development environment
 dev: clean venv
+
+# Start Celery worker for processing tasks
+celery: venv
+	$(CELERY) -A celery_worker:celery worker --loglevel=info
+
+# Monitor Celery tasks with flower web interface
+celery-flower: venv
+	$(PIP) install flower
+	$(CELERY) -A celery_worker:celery flower --port=5555
+
+# Check if Redis is running, start if needed
+redis:
+	@redis-cli ping > /dev/null 2>&1 || (echo "Starting Redis server..." && redis-server --daemonize yes)
+
+# Run complete application stack (Flask app + Celery worker + Redis)
+run-all: redis
+	@echo "Starting Flask and Celery..."
+	@$(MAKE) -j2 run celery
README.md: 43 changes

@@ -14,7 +14,8 @@ And open it in the browser at [http://localhost:5000/](http://localhost:5000/)
 
 ## Prerequisites
 
-Python >=3.8
+- Python >=3.8
+- Redis (for Celery task queue)
 
 ## Development environment
 
@@ -40,12 +41,44 @@ Python >=3.8
 add development dependencies under `project.optional-dependencies.*`; run
 `make clean && make venv` to reinstall the environment
 
+## Asynchronous Task Processing with Celery
+
+SciPaperLoader uses Celery for processing large CSV uploads and other background tasks. This allows the application to handle large datasets reliably without blocking the web interface.
+
+### Running Celery Components
+
+- `make redis`: ensures Redis server is running (required for Celery)
+- `make celery`: starts a Celery worker to process background tasks
+- `make celery-flower`: starts Flower, a web interface for monitoring Celery tasks at http://localhost:5555
+- `make run-all`: runs the entire stack (Flask app + Celery worker + Redis) in development mode
+
+### How It Works
+
+When you upload a CSV file through the web interface:
+
+1. The file is sent to the server
+2. A Celery task is created to process the file asynchronously
+3. The browser shows a progress bar with real-time updates
+4. The results are displayed when processing is complete
+
+This architecture allows SciPaperLoader to handle CSV files with thousands of papers without timing out or blocking the web interface.
+
 ## Configuration
 
 Default configuration is loaded from `scipaperloader.defaults` and can be
 overriden by environment variables with a `FLASK_` prefix. See
 [Configuring from Environment Variables](https://flask.palletsprojects.com/en/3.0.x/config/#configuring-from-environment-variables).
 
+### Celery Configuration
+
+The following environment variables can be set to configure Celery:
+
+- `FLASK_CELERY_BROKER_URL`: Redis URL for the message broker (default: `redis://localhost:6379/0`)
+- `FLASK_CELERY_RESULT_BACKEND`: Redis URL for storing task results (default: `redis://localhost:6379/0`)
+
 Consider using
 [dotenv](https://flask.palletsprojects.com/en/3.0.x/cli/#environment-variables-from-dotenv).
 
@@ -59,3 +92,11 @@ deliver to your server, or copy in your `Dockerfile`, and insall it with `pip`.
 You must set a
 [SECRET_KEY](https://flask.palletsprojects.com/en/3.0.x/tutorial/deploy/#configure-the-secret-key)
 in production to a secret and stable value.
+
+### Deploying with Celery
+
+When deploying to production:
+
+1. Configure a production-ready Redis instance or use a managed service
+2. Run Celery workers as system services or in Docker containers
+3. Consider setting up monitoring for your Celery tasks and workers
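Aside (not part of the commit): a quick way to sanity-check the broker settings documented above is to ping Redis with the `redis` package that this commit adds as a dependency. This is only a sketch; the URL fallback mirrors the documented default.

```python
# Minimal broker connectivity check (sketch, not part of the commit).
import os
import redis

broker_url = os.environ.get("FLASK_CELERY_BROKER_URL", "redis://localhost:6379/0")
client = redis.Redis.from_url(broker_url)
print(client.ping())  # True when the Redis broker is reachable
```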
celery_worker.py: 7 lines (new file)

@@ -0,0 +1,7 @@
+from scipaperloader.celery import celery, configure_celery
+
+# Configure celery with Flask app
+configure_celery()
+
+if __name__ == '__main__':
+    celery.start()
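Aside (not part of the commit): this module is what the new Makefile targets reference via `-A celery_worker:celery`. If you prefer not to go through the `celery` CLI, a worker can also be started programmatically; the following is a sketch that assumes Celery 5's `worker_main` is available.

```python
# Sketch: start a worker without the celery CLI (assumes Celery 5's worker_main).
from celery_worker import celery

if __name__ == "__main__":
    celery.worker_main(argv=["worker", "--loglevel=info"])
```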
pyproject.toml

@@ -13,6 +13,10 @@ dependencies = [
     "flask-wtf>=1.2.2,<2",
     "pyzotero>=1.6.11,<2",
     "pandas>=2.2.3,<3",
+    "celery>=5.5.1,<6",
+    "redis>=5.2.1,<6",
+    "flower>=2.0.1,<3",
+    "flask-migrate>=4.1.0,<5",
 ]
 
 [project.optional-dependencies]
scipaperloader/__init__.py

@@ -1,5 +1,5 @@
 from flask import Flask, request
+from flask_migrate import Migrate  # Add this line
 from .config import Config
 from .db import db
 from .models import init_schedule_config
@@ -10,10 +10,15 @@ def create_app(test_config=None):
     app = Flask(__name__)
     app.config.from_object(Config)
 
+    # Celery configuration
+    app.config['CELERY_BROKER_URL'] = app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0')
+    app.config['CELERY_RESULT_BACKEND'] = app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0')
+
     if test_config:
         app.config.update(test_config)
 
     db.init_app(app)
+    migrate = Migrate(app, db)  # Add this line to initialize Flask-Migrate
 
     with app.app_context():
         db.create_all()
@@ -27,9 +32,18 @@ def create_app(test_config=None):
 
     @app.before_request
     def before_request():
+        # Skip logging for static files, health checks, or other frequent requests
+        if request.path.startswith('/static/') or request.path == '/health' or request.path == '/favicon.ico':
+            return
+
+        # Skip task status checks to avoid log spam
+        if request.path.startswith('/task_status/'):
+            return
+
+        action = request.endpoint or request.path or "unknown_request"
         ActivityLog.log_gui_interaction(
-            action=request.endpoint,
-            description=f"Request to {request.endpoint}",
+            action=action,
+            description=f"Request to {request.path}",
             extra={"method": request.method, "url": request.url}
         )
 
@@ -116,7 +116,7 @@ def export_papers():
             [
                 paper.id,
                 paper.title,
-                getattr(paper, "journal", ""),
+                paper.journal,
                 paper.doi,
                 paper.issn,
                 paper.status,
@@ -3,26 +3,39 @@ import codecs
 import csv
 import datetime
 from io import StringIO
+import json
 
 import pandas as pd
 from flask import (
     Blueprint,
     flash,
+    jsonify,
     redirect,
     render_template,
     request,
     send_file,
     session,
     url_for,
+    current_app
 )
 
 from ..db import db
-from ..models import PaperMetadata
+from ..models import PaperMetadata, ActivityLog
+from ..celery import celery  # Import the celery instance directly
 
 bp = Blueprint("upload", __name__)
 
 REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
+CHUNK_SIZE = 100  # Number of rows to process per batch
+
+
+def parse_date(date_str):
+    """Parse date string into datetime object."""
+    if not date_str or pd.isna(date_str):
+        return None
+    try:
+        return datetime.datetime.strptime(date_str, "%Y-%m-%d")
+    except ValueError:
+        return None
 
 @bp.route("/", methods=["GET", "POST"])
 def upload():
@@ -32,136 +45,214 @@ def upload():
         duplicate_strategy = request.form.get("duplicate_strategy", "skip")
 
         if not file:
-            return render_template("upload.html.jinja", error="No file selected.")
+            return jsonify({"error": "No file selected."})
 
-        try:
-            stream = codecs.iterdecode(file.stream, "utf-8")
-            content = "".join(stream)
-            df = pd.read_csv(StringIO(content), delimiter=delimiter)
-        except Exception as e:
-            return render_template("upload.html.jinja", error=f"Failed to read CSV file: {e}")
-
-        missing = REQUIRED_COLUMNS - set(df.columns)
-        if missing:
-            return render_template(
-                "upload.html.jinja", error=f"Missing required columns: {', '.join(missing)}"
-            )
-
-        # Optional: parse 'published_online' to date
-        def parse_date(val):
-            if pd.isna(val):
-                return None
-            try:
-                return pd.to_datetime(val).date()
-            except Exception:
-                return None
-
-        # Count statistics
-        added_count = 0
-        skipped_count = 0
-        updated_count = 0
-        error_count = 0
-
-        # Collect error information
-        errors = []
-
-        # Process each row
-        for index, row in df.iterrows():
-            try:
-                # Get DOI from row for error reporting
-                doi = str(row.get("doi", "N/A"))
-
-                # Validate required fields
-                for field in ["title", "doi", "issn"]:
-                    if pd.isna(row.get(field)) or not str(row.get(field)).strip():
-                        raise ValueError(f"Missing required field: {field}")
-
-                # Check if paper with this DOI already exists
-                existing = PaperMetadata.query.filter_by(doi=doi).first()
-
-                if existing:
-                    if duplicate_strategy == 'update':
-                        # Update existing record
-                        existing.title = row["title"]
-                        existing.alt_id = row.get("alternative_id")
-                        existing.issn = row["issn"]
-                        existing.journal = row.get("journal")
-                        existing.type = row.get("type")
-                        existing.language = row.get("language")
-                        existing.published_online = parse_date(row.get("published_online"))
-                        updated_count += 1
-                    else:
-                        # Skip this record
-                        skipped_count += 1
-                        continue
-                else:
-                    # Create new record
-                    metadata = PaperMetadata(
-                        title=row["title"],
-                        doi=doi,
-                        alt_id=row.get("alternative_id"),
-                        issn=row["issn"],
-                        journal=row.get("journal"),
-                        type=row.get("type"),
-                        language=row.get("language"),
-                        published_online=parse_date(row.get("published_online")),
-                        status="New",
-                        file_path=None,
-                        error_msg=None,
-                    )
-                    db.session.add(metadata)
-                    added_count += 1
-
-            except Exception as e:
-                error_count += 1
-                errors.append({
-                    "row": index + 2,  # +2 because index is 0-based and we have a header row
-                    "doi": row.get("doi", "N/A"),
-                    "error": str(e)
-                })
-                continue  # Skip this row and continue with the next
-
-        try:
-            db.session.commit()
-        except Exception as e:
-            db.session.rollback()
-            return render_template(
-                "upload.html.jinja", error=f"Failed to save data to database: {e}"
-            )
-
-        # Prepare error samples for display
-        error_samples = errors[:5] if errors else []
-
-        error_message = None
-        if errors:
-            error_message = f"Encountered {len(errors)} errors. First 5 shown below."
-
-        # Store the full errors list in the session for potential download
-        if errors:
-            error_csv = StringIO()
-            writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
-            writer.writeheader()
-            writer.writerows(errors)
-            session["error_data"] = error_csv.getvalue()
-
-        return render_template(
-            "upload.html.jinja",
-            success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}",
-            error_message=error_message,
-            error_samples=error_samples
-        )
-
-    return render_template("upload.html.jinja")
-
-
-@bp.route("/download_error_log")
-def download_error_log():
-    error_data = session.get("error_data")
-    if not error_data:
-        flash("No error data available.")
-        return redirect(url_for("upload.upload"))
-
-    buffer = StringIO(error_data)
+        stream = codecs.iterdecode(file.stream, "utf-8")
+        content = "".join(stream)
+
+        # Trigger the Celery task
+        task = process_csv.delay(content, delimiter, duplicate_strategy)
+
+        return jsonify({"task_id": task.id})
+
+    return render_template("upload.html.jinja")
+
+
+@celery.task(bind=True)
+def process_csv(self, file_content, delimiter, duplicate_strategy):
+    """Process CSV file and import paper metadata."""
+
+    # With the ContextTask in place, we're already inside an app context
+    added_count = skipped_count = updated_count = error_count = 0
+    errors = []
+    skipped_records = []  # Add this to track skipped records
+
+    try:
+        # Log the start of import using ActivityLog model
+        ActivityLog.log_import_activity(
+            action="start_csv_import",
+            status="processing",
+            description=f"Starting CSV import with strategy: {duplicate_strategy}",
+            file_size=len(file_content),
+            delimiter=delimiter
+        )
+
+        # Set initial progress percentage
+        self.update_state(state='PROGRESS', meta={'progress': 10})
+
+        # Read CSV into chunks
+        csv_buffer = StringIO(file_content)
+        # Count total chunks
+        csv_buffer.seek(0)
+        total_chunks = len(list(pd.read_csv(csv_buffer, delimiter=delimiter, chunksize=CHUNK_SIZE)))
+        csv_buffer.seek(0)
+
+        # Process each chunk of rows
+        for chunk_idx, chunk in enumerate(pd.read_csv(csv_buffer, delimiter=delimiter, chunksize=CHUNK_SIZE)):
+            for index, row in chunk.iterrows():
+                try:
+                    doi = str(row.get("doi", "N/A"))
+                    # Validate required fields
+                    if pd.isna(row.get("title")) or pd.isna(row.get("doi")) or pd.isna(row.get("issn")):
+                        raise ValueError("Missing required fields")
+
+                    # Try finding an existing record based on DOI
+                    existing = db.session.query(PaperMetadata).filter_by(doi=doi).first()
+                    if existing:
+                        if duplicate_strategy == "update":
+                            existing.title = row["title"]
+                            existing.alt_id = row.get("alternative_id")
+                            existing.issn = row["issn"]
+                            existing.journal = row.get("journal")
+                            existing.published_online = parse_date(row.get("published_online"))
+                            updated_count += 1
+                        else:
+                            # Track why this record was skipped
+                            skipped_records.append({
+                                "row": index + 2,
+                                "doi": doi,
+                                "reason": f"Duplicate DOI found and strategy is '{duplicate_strategy}'"
+                            })
+                            skipped_count += 1
+                            continue
+                    else:
+                        metadata = PaperMetadata(
+                            title=row["title"],
+                            doi=doi,
+                            alt_id=row.get("alternative_id"),
+                            issn=row["issn"],
+                            journal=row.get("journal"),
+                            published_online=parse_date(row.get("published_online")),
+                            status="New",
+                        )
+                        db.session.add(metadata)
+                        added_count += 1
+                except Exception as e:
+                    error_count += 1
+                    errors.append({"row": index + 2, "doi": row.get("doi", "N/A"), "error": str(e)})
+
+            # Commit the chunk and roll session fresh
+            db.session.commit()
+
+            # Log periodic progress every 5 chunks
+            if (chunk_idx + 1) % 5 == 0:
+                ActivityLog.log_import_activity(
+                    action="import_progress",
+                    status="processing",
+                    description=f"Processed {chunk_idx+1}/{total_chunks} chunks",
+                    current_stats={
+                        "added": added_count,
+                        "updated": updated_count,
+                        "skipped": skipped_count,
+                        "errors": error_count
+                    }
+                )
+
+            progress = min(90, 10 + int((chunk_idx + 1) * 80 / total_chunks))
+            self.update_state(state='PROGRESS', meta={'progress': progress})
+
+        # Final progress update and completion log
+        self.update_state(state='PROGRESS', meta={'progress': 100})
+        ActivityLog.log_import_activity(
+            action="complete_csv_import",
+            status="success",
+            description="CSV import completed",
+            stats={
+                "added": added_count,
+                "updated": updated_count,
+                "skipped": skipped_count,
+                "errors": error_count
+            }
+        )
+
+    except Exception as e:
+        db.session.rollback()
+        ActivityLog.log_error(
+            error_message="CSV import failed",
+            exception=e,
+            severity="error",
+            source="upload.process_csv"
+        )
+        return {'error': str(e), 'progress': 0}
+    finally:
+        db.session.remove()
+
+    # If there were errors, store an error CSV for potential download
+    if errors:
+        try:
+            error_csv = StringIO()
+            writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
+            writer.writeheader()
+            writer.writerows(errors)
+            ActivityLog.log_import_activity(
+                action="import_errors",
+                status="error",
+                description=f"Import completed with {error_count} errors",
+                error_csv=error_csv.getvalue(),
+                task_id=self.request.id,
+                error_count=error_count
+            )
+        except Exception:
+            # Do not fail the task if error logging fails
+            pass
+
+    # Update the return value to include skipped records information
+    return {
+        "added": added_count,
+        "updated": updated_count,
+        "skipped": skipped_count,
+        "skipped_records": skipped_records[:5],  # Include up to 5 examples
+        "skipped_reason_summary": "Records were skipped because they already exist in the database. Use 'update' strategy to update them.",
+        "errors": errors[:5],
+        "error_count": error_count,
+        "task_id": self.request.id
+    }
+
+
+@bp.route("/task_status/<task_id>")
+def task_status(task_id):
+    """Get status of background task."""
+    task = celery.AsyncResult(task_id)
+
+    if task.state == "PENDING":
+        response = {"state": task.state, "progress": 0}
+    elif task.state == "PROGRESS":
+        response = {
+            "state": task.state,
+            "progress": task.info.get("progress", 0)
+        }
+    elif task.state == "SUCCESS":
+        response = {
+            "state": task.state,
+            "result": task.result
+        }
+    else:  # FAILURE, REVOKED, etc.
+        response = {
+            "state": task.state,
+            "error": str(task.info) if task.info else "Unknown error"
+        }
+
+    return jsonify(response)
+
+
+@bp.route("/download_error_log/<task_id>")
+def download_error_log(task_id):
+    # Find the most recent error log for this task
+    error_log = ActivityLog.query.filter(
+        ActivityLog.action == "import_errors",
+        ActivityLog.extra_data.like(f'%"{task_id}"%')  # Search in JSON
+    ).order_by(ActivityLog.timestamp.desc()).first()
+
+    if not error_log:
+        flash("No error data available.")
+        return redirect(url_for("upload.upload"))
+
+    # Get the CSV data from extra_data
+    extra_data = error_log.get_extra_data()
+    error_csv = extra_data.get("error_csv")
+
+    if not error_csv:
+        flash("Error data format is invalid.")
+        return redirect(url_for("upload.upload"))
+
+    buffer = StringIO(error_csv)
     return send_file(
         buffer,
         mimetype="text/csv",
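Aside (not part of the commit): the new endpoints can be driven end to end by uploading a CSV and then polling `/task_status/<task_id>` until the task finishes. The sketch below assumes the `requests` package, a local file named `papers.csv`, and that the blueprint is mounted under `/upload`, as the template's download link suggests.

```python
# Sketch: upload a CSV and poll the task status endpoint (assumptions noted above).
import time
import requests

BASE = "http://localhost:5000/upload"

with open("papers.csv", "rb") as fh:
    resp = requests.post(f"{BASE}/", files={"file": fh},
                         data={"delimiter": ",", "duplicate_strategy": "skip"})
task_id = resp.json()["task_id"]

while True:
    status = requests.get(f"{BASE}/task_status/{task_id}").json()
    if status["state"] in ("SUCCESS", "FAILURE"):
        print(status)
        break
    time.sleep(1)
```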
scipaperloader/celery.py: 43 lines (new file)

@@ -0,0 +1,43 @@
+from celery import Celery
+
+# Create Celery instance without Flask app initially
+celery = Celery(
+    'scipaperloader',
+    broker='redis://localhost:6379/0',
+    backend='redis://localhost:6379/0',
+)
+
+
+def configure_celery(app=None):
+    """Configure Celery with the Flask app settings and ensure tasks run in the app context."""
+    if app is None:
+        # Import here to avoid circular import
+        from scipaperloader import create_app
+        app = create_app()
+
+    # Update Celery configuration using the app settings
+    celery.conf.update(
+        broker_url=app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0'),
+        result_backend=app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0'),
+        task_serializer='json',
+        accept_content=['json'],
+        result_serializer='json',
+        timezone='UTC',
+        enable_utc=True,
+        task_time_limit=3600,  # 1 hour max runtime
+        task_soft_time_limit=3000,  # 50 minutes soft limit
+        worker_max_tasks_per_child=10,  # Restart workers after 10 tasks
+        worker_max_memory_per_child=1000000,  # 1GB memory limit
+        task_acks_late=True,  # Acknowledge tasks after completion
+        task_reject_on_worker_lost=True,  # Requeue tasks if worker dies
+    )
+
+    # Create a custom task class that pushes the Flask application context
+    class ContextTask(celery.Task):
+        abstract = True
+
+        def __call__(self, *args, **kwargs):
+            with app.app_context():
+                return self.run(*args, **kwargs)
+
+    celery.Task = ContextTask
+    return celery
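Aside (not part of the commit): the `ContextTask` wrapper is what lets `process_csv` above touch the database without opening an application context by hand. A small illustrative sketch follows; the `count_papers` task is hypothetical and only demonstrates the pattern.

```python
# Sketch: any task registered on this Celery app runs inside app_context,
# so Flask-SQLAlchemy queries work directly. `count_papers` is hypothetical.
from scipaperloader.celery import celery, configure_celery
from scipaperloader.models import PaperMetadata

configure_celery()

@celery.task
def count_papers():
    return PaperMetadata.query.count()
```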
@@ -12,6 +12,7 @@ class ActivityCategory(Enum):
     SCRAPER_COMMAND = "scraper_command"
     SCRAPER_ACTIVITY = "scraper_activity"
     SYSTEM = "system"
+    DATA_IMPORT = "data_import"
 
 
 class ErrorSeverity(Enum):
@@ -164,6 +165,20 @@ class ActivityLog(db.Model):
         db.session.commit()
         return log
 
+    @classmethod
+    def log_import_activity(cls, action, status=None, description=None, user_id=None, **extra):
+        """Log data import activities (CSV uploads, bulk imports, etc.)."""
+        log = cls(
+            category=ActivityCategory.DATA_IMPORT.value,
+            action=action,
+            status=status,
+            description=description,
+            user_id=user_id
+        )
+        log.set_extra_data(extra)
+        db.session.add(log)
+        db.session.commit()
+        return log
+
 
 class PaperMetadata(db.Model):
     id = db.Column(db.Integer, primary_key=True)
@@ -171,6 +186,7 @@ class PaperMetadata(db.Model):
     doi = db.Column(db.String, unique=True, index=True)
     alt_id = db.Column(db.String)
     issn = db.Column(db.String(32))
+    journal = db.Column(db.String(255))
     type = db.Column(db.String(50))
     language = db.Column(db.String(50))
     published_online = db.Column(db.Date)  # or DateTime/String
@@ -1,5 +1,9 @@
 .message {
     padding: 10px;
     font-size: 1.3em;
     font-family: Arial, sans-serif;
+}
+
+.progress-bar {
+    width: 0%;
 }
@@ -1,6 +1,8 @@
 {% extends "base.html.jinja" %} {% block content %}
 <h1>Welcome to SciPaperLoader</h1>
 
+<div id="results-container"></div>
+
 {% if success %}
 <div class="alert alert-success mt-3">{{ success }}</div>
 {% endif %} {% if error_message %}
@@ -40,24 +42,9 @@
   <li><code>issn</code> – the ISSN of the journal</li>
   <li><code>title</code> – the title of the paper</li>
 </ul>
-<p>
-  The format of your CSV should resemble the response structure of the
-  Crossref API's <code>/journals/{issn}/works</code> endpoint.
-</p>
 </div>
 
-<form method="post" action="{{ url_for('upload.upload') }}" enctype="multipart/form-data">
-  <div class="mb-3">
-    <label class="form-label">How to handle duplicate DOIs:</label>
-    <div class="form-check">
-      <input class="form-check-input" type="radio" name="duplicate_strategy" value="skip" id="skip" checked />
-      <label class="form-check-label" for="skip">Skip duplicate entries</label>
-    </div>
-    <div class="form-check">
-      <input class="form-check-input" type="radio" name="duplicate_strategy" value="update" id="update" />
-      <label class="form-check-label" for="update">Update existing entries</label>
-    </div>
-  </div>
+<form method="post" action="{{ url_for('upload.upload') }}" enctype="multipart/form-data" id="upload-form">
   <div class="form-group">
     <label for="file">Upload CSV File</label>
     <input type="file" name="file" id="file" class="form-control" required />
@@ -73,4 +60,175 @@
   </div>
   <button type="submit" class="btn btn-primary mt-3">Upload</button>
 </form>
+
+<!-- Progress Modal -->
+<div id="progressModal" class="modal fade" tabindex="-1">
+  <div class="modal-dialog">
+    <div class="modal-content">
+      <div class="modal-header">
+        <h5 class="modal-title">Processing Your Upload</h5>
+      </div>
+      <div class="modal-body">
+        <div class="progress">
+          <div id="progressBar" class="progress-bar" role="progressbar">0%</div>
+        </div>
+        <p id="progressStatus" class="mt-2 text-center">Starting...</p>
+      </div>
+    </div>
+  </div>
+</div>
+
+<script>
+  const form = document.getElementById("upload-form");
+  form.addEventListener("submit", function (e) {
+    e.preventDefault();
+
+    // Display loading state immediately
+    const progressModal = new bootstrap.Modal(document.getElementById("progressModal"));
+    progressModal.show();
+    const progressBar = document.getElementById("progressBar");
+    progressBar.style.width = "5%";
+    progressBar.textContent = "Starting...";
+
+    const formData = new FormData(form);
+
+    // Disable the form while processing
+    const submitButton = form.querySelector("button[type='submit']");
+    submitButton.disabled = true;
+
+    fetch(form.action, {
+      method: "POST",
+      body: formData,
+    })
+      .then((response) => response.json())
+      .then((data) => {
+        if (data.error) {
+          // Handle error
+          progressModal.hide();
+          alert(`Error: ${data.error}`);
+          submitButton.disabled = false;
+          return;
+        }
+
+        const taskId = data.task_id;
+        const interval = setInterval(() => {
+          fetch("{{ url_for('upload.task_status', task_id='') }}" + taskId)
+            .then((response) => response.json())
+            .then((status) => {
+              console.log("Task status:", status);
+              if (status.state === "SUCCESS") {
+                clearInterval(interval);
+                progressBar.style.width = "100%";
+                progressBar.textContent = "Completed!";
+
+                setTimeout(() => {
+                  progressModal.hide();
+                  showResults(status.result);
+                  submitButton.disabled = false;
+                }, 1000);
+              } else if (status.state === "FAILURE") {
+                clearInterval(interval);
+                progressBar.style.width = "100%";
+                progressBar.classList.add("bg-danger");
+                progressBar.textContent = "Failed!";
+
+                setTimeout(() => {
+                  progressModal.hide();
+                  alert(`Task failed: ${status.error || "Unknown error"}`);
+                  submitButton.disabled = false;
+                }, 1000);
+              } else {
+                // Update progress bar with more information
+                const progress = status.progress || 0;
+                progressBar.style.width = `${progress}%`;
+                progressBar.textContent = `${progress}% complete`;
+                document.getElementById("progressStatus").innerText = `Processing... (${status.state})`;
+              }
+            })
+            .catch((err) => {
+              console.error("Failed to check task status:", err);
+            });
+        }, 1000);
+      })
+      .catch((err) => {
+        console.error("Upload failed:", err);
+        progressModal.hide();
+        alert("Upload failed. Please try again.");
+        submitButton.disabled = false;
+      });
+  });
+
+  const showResults = (result) => {
+    const message = `Upload completed! Added: ${result.added}, Updated: ${result.updated}, Skipped: ${result.skipped}, Errors: ${result.error_count}`;
+
+    let resultHTML = `<div class="alert alert-success">${message}</div>`;
+
+    // Add skipped records information
+    if (result.skipped > 0) {
+      resultHTML += `
+        <div class="alert alert-info">
+          <h4>${result.skipped} records were skipped</h4>
+          <p>${result.skipped_reason_summary || "Records were skipped because they already exist in the database."}</p>
+          ${result.skipped_records && result.skipped_records.length > 0 ? `
+            <p>Examples of skipped records:</p>
+            <table class="table table-sm table-bordered">
+              <thead>
+                <tr>
+                  <th>Row</th>
+                  <th>DOI</th>
+                  <th>Reason</th>
+                </tr>
+              </thead>
+              <tbody>
+                ${result.skipped_records.map(record => `
+                  <tr>
+                    <td>${record.row}</td>
+                    <td>${record.doi}</td>
+                    <td>${record.reason}</td>
+                  </tr>
+                `).join('')}
+              </tbody>
+            </table>
+          ` : ''}
+        </div>`;
+    }
+
+    // Existing error display code
+    if (result.error_count > 0) {
+      resultHTML += `
+        <div class="alert alert-warning">
+          <h4>Some errors occurred (${result.error_count} total)</h4>
+          <p>Showing first ${result.errors.length} of ${result.error_count} errors:</p>
+          <table class="table table-sm table-bordered">
+            <thead>
+              <tr>
+                <th>Row</th>
+                <th>DOI</th>
+                <th>Error</th>
+              </tr>
+            </thead>
+            <tbody>`;
+
+      result.errors.forEach(error => {
+        resultHTML += `
+          <tr>
+            <td>${error.row}</td>
+            <td>${error.doi}</td>
+            <td>${error.error}</td>
+          </tr>`;
+      });
+
+      resultHTML += `
+          </tbody>
+        </table>
+        <p class="mt-2">Download the complete error log with all ${result.error_count} errors:</p>
+        <a href="/upload/download_error_log/${result.task_id}" class="btn btn-outline-secondary">
+          Download Full Error Log
+        </a>
+      </div>`;
+    }
+
+    document.getElementById("results-container").innerHTML = resultHTML;
+  };
+</script>
 {% endblock content %}
testdata.csv: 1641 lines (new file)
File diff suppressed because it is too large.