Implements asynchronous task management in the input module.
parent 5d63e28a61
commit 05f4c8b517
4 .gitignore (vendored)
@@ -12,4 +12,6 @@ dist/
*.db
*.R
migrations/
31 Makefile
@@ -1,10 +1,13 @@
# List of phony targets (targets that don't represent files)
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev
.PHONY: all clean venv run format format-check lint mypy test dist reformat dev celery celery-flower redis run-all

# Define Python and pip executables inside virtual environment
PYTHON := venv/bin/python
PIP := venv/bin/pip

# Celery worker command
CELERY := venv/bin/celery

# Default target that runs the application
all: run

@@ -83,11 +86,11 @@ todos:
	@grep -r "TODO\|FIXME" scipaperloader || echo "No TODOs found"

# Reset the database: delete, initialize, and migrate
reset-db:
reset-db: venv
	rm -f $(DB_PATH)
	flask db init || true
	flask db migrate -m "Initial migration"
	flask db upgrade
	$(PYTHON) -m flask --app scipaperloader db init || true
	$(PYTHON) -m flask --app scipaperloader db migrate -m "Initial migration"
	$(PYTHON) -m flask --app scipaperloader db upgrade

# Create and set up virtual environment
venv:
@@ -130,3 +133,21 @@ dist: format-check lint mypy test

# Set up complete development environment
dev: clean venv

# Start Celery worker for processing tasks
celery: venv
	$(CELERY) -A celery_worker:celery worker --loglevel=info

# Monitor Celery tasks with flower web interface
celery-flower: venv
	$(PIP) install flower
	$(CELERY) -A celery_worker:celery flower --port=5555

# Check if Redis is running, start if needed
redis:
	@redis-cli ping > /dev/null 2>&1 || (echo "Starting Redis server..." && redis-server --daemonize yes)

# Run complete application stack (Flask app + Celery worker + Redis)
run-all: redis
	@echo "Starting Flask and Celery..."
	@$(MAKE) -j2 run celery
45 README.md
@@ -14,7 +14,8 @@ And open it in the browser at [http://localhost:5000/](http://localhost:5000/)

## Prerequisites

Python >=3.8
- Python >=3.8
- Redis (for Celery task queue)

## Development environment

@@ -40,12 +41,44 @@ Python >=3.8
add development dependencies under `project.optional-dependencies.*`; run
`make clean && make venv` to reinstall the environment

## Asynchronous Task Processing with Celery

SciPaperLoader uses Celery for processing large CSV uploads and other background tasks. This allows the application to handle large datasets reliably without blocking the web interface.
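The pattern is standard Celery usage; a minimal sketch (with a hypothetical `example_task` standing in for the real `process_csv` task added in this commit) looks like this:

```python
# Minimal sketch of the task pattern used here. `example_task` is a
# hypothetical stand-in for process_csv in scipaperloader/blueprints/upload.py.
from scipaperloader.celery import celery


@celery.task(bind=True)
def example_task(self, csv_content):
    # The ContextTask set up in scipaperloader/celery.py pushes the Flask app
    # context, so the database and other extensions are available here.
    self.update_state(state="PROGRESS", meta={"progress": 50})
    return {"rows": csv_content.count("\n")}


# In a request handler: enqueue the work and return the task id to the client.
task = example_task.delay("doi,title\n10.1000/xyz123,Example paper\n")
print(task.id)
```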
### Running Celery Components

- `make redis`: ensures Redis server is running (required for Celery)
- `make celery`: starts a Celery worker to process background tasks
- `make celery-flower`: starts Flower, a web interface for monitoring Celery tasks at http://localhost:5555
- `make run-all`: runs the entire stack (Flask app + Celery worker + Redis) in development mode

### How It Works

When you upload a CSV file through the web interface:

1. The file is sent to the server
2. A Celery task is created to process the file asynchronously
3. The browser shows a progress bar with real-time updates
4. The results are displayed when processing is complete

This architecture allows SciPaperLoader to handle CSV files with thousands of papers without timing out or blocking the web interface.
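The same flow can be driven from a script. The sketch below assumes the app runs locally on port 5000, that the upload blueprint is mounted at the site root, and that the form field names match `upload.html.jinja`:

```python
# Rough illustration of the upload-then-poll flow (assumed base URL and
# mount point; adjust to your deployment).
import time

import requests

BASE = "http://localhost:5000"

with open("papers.csv", "rb") as fh:
    resp = requests.post(
        BASE + "/",
        files={"file": fh},
        data={"delimiter": ",", "duplicate_strategy": "skip"},
    )
task_id = resp.json()["task_id"]

# Poll the task_status endpoint until the Celery task finishes.
while True:
    status = requests.get(f"{BASE}/task_status/{task_id}").json()
    if status["state"] in ("SUCCESS", "FAILURE"):
        print(status)
        break
    print(f"progress: {status.get('progress', 0)}%")
    time.sleep(1)
```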
## Configuration

Default configuration is loaded from `scipaperloader.defaults` and can be
overridden by environment variables with a `FLASK_` prefix. See
[Configuring from Environment Variables](https://flask.palletsprojects.com/en/3.0.x/config/#configuring-from-environment-variables).

### Celery Configuration

The following environment variables can be set to configure Celery:

- `FLASK_CELERY_BROKER_URL`: Redis URL for the message broker (default: `redis://localhost:6379/0`)
- `FLASK_CELERY_RESULT_BACKEND`: Redis URL for storing task results (default: `redis://localhost:6379/0`)
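These values land in the Flask config and are copied onto the Celery instance. A condensed sketch, assuming the `FLASK_` variables are read with Flask's `from_prefixed_env` as the linked docs describe:

```python
# Condensed illustration of how the settings flow into Celery (mirrors
# create_app() and configure_celery() in this commit; from_prefixed_env is an
# assumption about how the FLASK_ variables are loaded).
from celery import Celery
from flask import Flask

app = Flask(__name__)
app.config.from_prefixed_env()  # picks up FLASK_CELERY_BROKER_URL, etc.
app.config['CELERY_BROKER_URL'] = app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0')
app.config['CELERY_RESULT_BACKEND'] = app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0')

celery = Celery('scipaperloader')
celery.conf.update(
    broker_url=app.config['CELERY_BROKER_URL'],
    result_backend=app.config['CELERY_RESULT_BACKEND'],
)
```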
Consider using
[dotenv](https://flask.palletsprojects.com/en/3.0.x/cli/#environment-variables-from-dotenv).

@@ -58,4 +91,12 @@ deliver to your server, or copy in your `Dockerfile`, and install it with `pip`.

You must set a
[SECRET_KEY](https://flask.palletsprojects.com/en/3.0.x/tutorial/deploy/#configure-the-secret-key)
in production to a secret and stable value.

### Deploying with Celery

When deploying to production:

1. Configure a production-ready Redis instance or use a managed service
2. Run Celery workers as system services or in Docker containers
3. Consider setting up monitoring for your Celery tasks and workers
7 celery_worker.py (new file)
@@ -0,0 +1,7 @@
from scipaperloader.celery import celery, configure_celery

# Configure celery with Flask app
configure_celery()

if __name__ == '__main__':
    celery.start()
@@ -13,6 +13,10 @@ dependencies = [
    "flask-wtf>=1.2.2,<2",
    "pyzotero>=1.6.11,<2",
    "pandas>=2.2.3,<3",
    "celery>=5.5.1,<6",
    "redis>=5.2.1,<6",
    "flower>=2.0.1,<3",
    "flask-migrate>=4.1.0,<5",
]

[project.optional-dependencies]
@@ -1,5 +1,5 @@
from flask import Flask, request

from flask_migrate import Migrate  # Add this line
from .config import Config
from .db import db
from .models import init_schedule_config
@@ -10,10 +10,15 @@ def create_app(test_config=None):
    app = Flask(__name__)
    app.config.from_object(Config)

    # Celery configuration
    app.config['CELERY_BROKER_URL'] = app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0')
    app.config['CELERY_RESULT_BACKEND'] = app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0')

    if test_config:
        app.config.update(test_config)

    db.init_app(app)
    migrate = Migrate(app, db)  # Add this line to initialize Flask-Migrate

    with app.app_context():
        db.create_all()
@@ -27,10 +32,19 @@ def create_app(test_config=None):

    @app.before_request
    def before_request():
        # Skip logging for static files, health checks, or other frequent requests
        if request.path.startswith('/static/') or request.path == '/health' or request.path == '/favicon.ico':
            return

        # Skip task status checks to avoid log spam
        if request.path.startswith('/task_status/'):
            return

        action = request.endpoint or request.path or "unknown_request"
        ActivityLog.log_gui_interaction(
            action=request.endpoint,
            description=f"Request to {request.endpoint}",
            action=action,
            description=f"Request to {request.path}",
            extra={"method": request.method, "url": request.url}
        )

    return app

    return app
@@ -116,7 +116,7 @@ def export_papers():
            [
                paper.id,
                paper.title,
                getattr(paper, "journal", ""),
                paper.journal,
                paper.doi,
                paper.issn,
                paper.status,
@@ -3,26 +3,39 @@ import codecs
import csv
import datetime
from io import StringIO
import json

import pandas as pd
from flask import (
    Blueprint,
    flash,
    jsonify,
    redirect,
    render_template,
    request,
    send_file,
    session,
    url_for,
    current_app
)

from ..db import db
from ..models import PaperMetadata
from ..models import PaperMetadata, ActivityLog
from ..celery import celery  # Import the celery instance directly

bp = Blueprint("upload", __name__)

REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
CHUNK_SIZE = 100  # Number of rows to process per batch

def parse_date(date_str):
    """Parse date string into datetime object."""
    if not date_str or pd.isna(date_str):
        return None
    try:
        return datetime.datetime.strptime(date_str, "%Y-%m-%d")
    except ValueError:
        return None

@bp.route("/", methods=["GET", "POST"])
def upload():
@@ -32,136 +45,214 @@ def upload():
        duplicate_strategy = request.form.get("duplicate_strategy", "skip")

        if not file:
            return render_template("upload.html.jinja", error="No file selected.")
            return jsonify({"error": "No file selected."})

        try:
            stream = codecs.iterdecode(file.stream, "utf-8")
            content = "".join(stream)
            df = pd.read_csv(StringIO(content), delimiter=delimiter)
        except Exception as e:
            return render_template("upload.html.jinja", error=f"Failed to read CSV file: {e}")
        stream = codecs.iterdecode(file.stream, "utf-8")
        content = "".join(stream)

        missing = REQUIRED_COLUMNS - set(df.columns)
        if missing:
            return render_template(
                "upload.html.jinja", error=f"Missing required columns: {', '.join(missing)}"
            )

        # Optional: parse 'published_online' to date
        def parse_date(val):
            if pd.isna(val):
                return None
            try:
                return pd.to_datetime(val).date()
            except Exception:
                return None

        # Count statistics
        added_count = 0
        skipped_count = 0
        updated_count = 0
        error_count = 0
        # Trigger the Celery task
        task = process_csv.delay(content, delimiter, duplicate_strategy)

        # Collect error information
        errors = []

        # Process each row
        for index, row in df.iterrows():
            try:
                # Get DOI from row for error reporting
                doi = str(row.get("doi", "N/A"))

                # Validate required fields
                for field in ["title", "doi", "issn"]:
                    if pd.isna(row.get(field)) or not str(row.get(field)).strip():
                        raise ValueError(f"Missing required field: {field}")

                # Check if paper with this DOI already exists
                existing = PaperMetadata.query.filter_by(doi=doi).first()

                if existing:
                    if duplicate_strategy == 'update':
                        # Update existing record
                        existing.title = row["title"]
                        existing.alt_id = row.get("alternative_id")
                        existing.issn = row["issn"]
                        existing.journal = row.get("journal")
                        existing.type = row.get("type")
                        existing.language = row.get("language")
                        existing.published_online = parse_date(row.get("published_online"))
                        updated_count += 1
        return jsonify({"task_id": task.id})

    return render_template("upload.html.jinja")

@celery.task(bind=True)
def process_csv(self, file_content, delimiter, duplicate_strategy):
    """Process CSV file and import paper metadata."""

    # With the ContextTask in place, we're already inside an app context
    added_count = skipped_count = updated_count = error_count = 0
    errors = []
    skipped_records = []  # Add this to track skipped records

    try:
        # Log the start of import using ActivityLog model
        ActivityLog.log_import_activity(
            action="start_csv_import",
            status="processing",
            description=f"Starting CSV import with strategy: {duplicate_strategy}",
            file_size=len(file_content),
            delimiter=delimiter
        )

        # Set initial progress percentage
        self.update_state(state='PROGRESS', meta={'progress': 10})

        # Read CSV into chunks
        csv_buffer = StringIO(file_content)
        # Count total chunks
        csv_buffer.seek(0)
        total_chunks = len(list(pd.read_csv(csv_buffer, delimiter=delimiter, chunksize=CHUNK_SIZE)))
        csv_buffer.seek(0)

        # Process each chunk of rows
        for chunk_idx, chunk in enumerate(pd.read_csv(csv_buffer, delimiter=delimiter, chunksize=CHUNK_SIZE)):
            for index, row in chunk.iterrows():
                try:
                    doi = str(row.get("doi", "N/A"))
                    # Validate required fields
                    if pd.isna(row.get("title")) or pd.isna(row.get("doi")) or pd.isna(row.get("issn")):
                        raise ValueError("Missing required fields")

                    # Try finding an existing record based on DOI
                    existing = db.session.query(PaperMetadata).filter_by(doi=doi).first()
                    if existing:
                        if duplicate_strategy == "update":
                            existing.title = row["title"]
                            existing.alt_id = row.get("alternative_id")
                            existing.issn = row["issn"]
                            existing.journal = row.get("journal")
                            existing.published_online = parse_date(row.get("published_online"))
                            updated_count += 1
                        else:
                            # Track why this record was skipped
                            skipped_records.append({
                                "row": index + 2,
                                "doi": doi,
                                "reason": f"Duplicate DOI found and strategy is '{duplicate_strategy}'"
                            })
                            skipped_count += 1
                            continue
                    else:
                        # Skip this record
                        skipped_count += 1
                        continue
                    else:
                        # Create new record
                        metadata = PaperMetadata(
                            title=row["title"],
                            doi=doi,
                            alt_id=row.get("alternative_id"),
                            issn=row["issn"],
                            journal=row.get("journal"),
                            type=row.get("type"),
                            language=row.get("language"),
                            published_online=parse_date(row.get("published_online")),
                            status="New",
                            file_path=None,
                            error_msg=None,
                        )
                        db.session.add(metadata)
                        added_count += 1

                except Exception as e:
                    error_count += 1
                    errors.append({
                        "row": index + 2,  # +2 because index is 0-based and we have a header row
                        "doi": row.get("doi", "N/A"),
                        "error": str(e)
                    })
                    continue  # Skip this row and continue with the next
                    metadata = PaperMetadata(
                        title=row["title"],
                        doi=doi,
                        alt_id=row.get("alternative_id"),
                        issn=row["issn"],
                        journal=row.get("journal"),
                        published_online=parse_date(row.get("published_online")),
                        status="New",
                    )
                    db.session.add(metadata)
                    added_count += 1
                except Exception as e:
                    error_count += 1
                    errors.append({"row": index + 2, "doi": row.get("doi", "N/A"), "error": str(e)})

            try:
                # Commit the chunk and roll session fresh
                db.session.commit()
            except Exception as e:
                db.session.rollback()
                return render_template(
                    "upload.html.jinja", error=f"Failed to save data to database: {e}"
                )

            # Prepare error samples for display
            error_samples = errors[:5] if errors else []

            error_message = None
            if errors:
                error_message = f"Encountered {len(errors)} errors. First 5 shown below."

            # Store the full errors list in the session for potential download
            if errors:
            # Log periodic progress every 5 chunks
            if (chunk_idx + 1) % 5 == 0:
                ActivityLog.log_import_activity(
                    action="import_progress",
                    status="processing",
                    description=f"Processed {chunk_idx+1}/{total_chunks} chunks",
                    current_stats={
                        "added": added_count,
                        "updated": updated_count,
                        "skipped": skipped_count,
                        "errors": error_count
                    }
                )

            progress = min(90, 10 + int((chunk_idx + 1) * 80 / total_chunks))
            self.update_state(state='PROGRESS', meta={'progress': progress})

        # Final progress update and completion log
        self.update_state(state='PROGRESS', meta={'progress': 100})
        ActivityLog.log_import_activity(
            action="complete_csv_import",
            status="success",
            description="CSV import completed",
            stats={
                "added": added_count,
                "updated": updated_count,
                "skipped": skipped_count,
                "errors": error_count
            }
        )

    except Exception as e:
        db.session.rollback()
        ActivityLog.log_error(
            error_message="CSV import failed",
            exception=e,
            severity="error",
            source="upload.process_csv"
        )
        return {'error': str(e), 'progress': 0}
    finally:
        db.session.remove()

    # If there were errors, store an error CSV for potential download
    if errors:
        try:
            error_csv = StringIO()
            writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
            writer.writeheader()
            writer.writerows(errors)
            session["error_data"] = error_csv.getvalue()
            ActivityLog.log_import_activity(
                action="import_errors",
                status="error",
                description=f"Import completed with {error_count} errors",
                error_csv=error_csv.getvalue(),
                task_id=self.request.id,
                error_count=error_count
            )
        except Exception:
            # Do not fail the task if error logging fails
            pass

    return render_template(
        "upload.html.jinja",
        success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}",
        error_message=error_message,
        error_samples=error_samples
    )
    # Update the return value to include skipped records information
    return {
        "added": added_count,
        "updated": updated_count,
        "skipped": skipped_count,
        "skipped_records": skipped_records[:5],  # Include up to 5 examples
        "skipped_reason_summary": "Records were skipped because they already exist in the database. Use 'update' strategy to update them.",
        "errors": errors[:5],
        "error_count": error_count,
        "task_id": self.request.id
    }

@bp.route("/task_status/<task_id>")
def task_status(task_id):
    """Get status of background task."""
    task = celery.AsyncResult(task_id)

    if task.state == "PENDING":
        response = {"state": task.state, "progress": 0}
    elif task.state == "PROGRESS":
        response = {
            "state": task.state,
            "progress": task.info.get("progress", 0)
        }
    elif task.state == "SUCCESS":
        response = {
            "state": task.state,
            "result": task.result
        }
    else:  # FAILURE, REVOKED, etc.
        response = {
            "state": task.state,
            "error": str(task.info) if task.info else "Unknown error"
        }

    return jsonify(response)

    return render_template("upload.html.jinja")


@bp.route("/download_error_log")
def download_error_log():
    error_data = session.get("error_data")
    if not error_data:
@bp.route("/download_error_log/<task_id>")
def download_error_log(task_id):
    # Find the most recent error log for this task
    error_log = ActivityLog.query.filter(
        ActivityLog.action == "import_errors",
        ActivityLog.extra_data.like(f'%"{task_id}"%')  # Search in JSON
    ).order_by(ActivityLog.timestamp.desc()).first()

    if not error_log:
        flash("No error data available.")
        return redirect(url_for("upload.upload"))

    # Get the CSV data from extra_data
    extra_data = error_log.get_extra_data()
    error_csv = extra_data.get("error_csv")

    if not error_csv:
        flash("Error data format is invalid.")
        return redirect(url_for("upload.upload"))

    buffer = StringIO(error_data)
    buffer = StringIO(error_csv)
    return send_file(
        buffer,
        mimetype="text/csv",
43 scipaperloader/celery.py (new file)
@@ -0,0 +1,43 @@
from celery import Celery

# Create Celery instance without Flask app initially
celery = Celery(
    'scipaperloader',
    broker='redis://localhost:6379/0',
    backend='redis://localhost:6379/0',
)

def configure_celery(app=None):
    """Configure Celery with the Flask app settings and ensure tasks run in the app context."""
    if app is None:
        # Import here to avoid circular import
        from scipaperloader import create_app
        app = create_app()

    # Update Celery configuration using the app settings
    celery.conf.update(
        broker_url=app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0'),
        result_backend=app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0'),
        task_serializer='json',
        accept_content=['json'],
        result_serializer='json',
        timezone='UTC',
        enable_utc=True,
        task_time_limit=3600,  # 1 hour max runtime
        task_soft_time_limit=3000,  # 50 minutes soft limit
        worker_max_tasks_per_child=10,  # Restart workers after 10 tasks
        worker_max_memory_per_child=1000000,  # 1GB memory limit
        task_acks_late=True,  # Acknowledge tasks after completion
        task_reject_on_worker_lost=True,  # Requeue tasks if worker dies
    )

    # Create a custom task class that pushes the Flask application context
    class ContextTask(celery.Task):
        abstract = True

        def __call__(self, *args, **kwargs):
            with app.app_context():
                return self.run(*args, **kwargs)

    celery.Task = ContextTask
    return celery
@@ -12,6 +12,7 @@ class ActivityCategory(Enum):
    SCRAPER_COMMAND = "scraper_command"
    SCRAPER_ACTIVITY = "scraper_activity"
    SYSTEM = "system"
    DATA_IMPORT = "data_import"


class ErrorSeverity(Enum):
@@ -164,6 +165,20 @@ class ActivityLog(db.Model):
        db.session.commit()
        return log

    @classmethod
    def log_import_activity(cls, action, status=None, description=None, user_id=None, **extra):
        """Log data import activities (CSV uploads, bulk imports, etc.)."""
        log = cls(
            category=ActivityCategory.DATA_IMPORT.value,
            action=action,
            status=status,
            description=description,
            user_id=user_id
        )
        log.set_extra_data(extra)
        db.session.add(log)
        db.session.commit()
        return log

class PaperMetadata(db.Model):
    id = db.Column(db.Integer, primary_key=True)
@@ -171,6 +186,7 @@ class PaperMetadata(db.Model):
    doi = db.Column(db.String, unique=True, index=True)
    alt_id = db.Column(db.String)
    issn = db.Column(db.String(32))
    journal = db.Column(db.String(255))
    type = db.Column(db.String(50))
    language = db.Column(db.String(50))
    published_online = db.Column(db.Date)  # or DateTime/String
@@ -1,5 +1,9 @@
.message {
  padding: 10px;
  font-size: 1.3em;
  font-family: Arial, sans-serif;
}

.progress-bar {
  width: 0%;
}
@@ -1,6 +1,8 @@
{% extends "base.html.jinja" %} {% block content %}
<h1>Welcome to SciPaperLoader</h1>

<div id="results-container"></div>

{% if success %}
<div class="alert alert-success mt-3">{{ success }}</div>
{% endif %} {% if error_message %}
@@ -40,24 +42,9 @@
<li><code>issn</code> – the ISSN of the journal</li>
<li><code>title</code> – the title of the paper</li>
</ul>
<p>
  The format of your CSV should resemble the response structure of the
  Crossref API's <code>/journals/{issn}/works</code> endpoint.
</p>
</div>

<form method="post" action="{{ url_for('upload.upload') }}" enctype="multipart/form-data">
  <div class="mb-3">
    <label class="form-label">How to handle duplicate DOIs:</label>
    <div class="form-check">
      <input class="form-check-input" type="radio" name="duplicate_strategy" value="skip" id="skip" checked />
      <label class="form-check-label" for="skip">Skip duplicate entries</label>
    </div>
    <div class="form-check">
      <input class="form-check-input" type="radio" name="duplicate_strategy" value="update" id="update" />
      <label class="form-check-label" for="update">Update existing entries</label>
    </div>
  </div>
<form method="post" action="{{ url_for('upload.upload') }}" enctype="multipart/form-data" id="upload-form">
  <div class="form-group">
    <label for="file">Upload CSV File</label>
    <input type="file" name="file" id="file" class="form-control" required />
@@ -73,4 +60,175 @@
  </div>
  <button type="submit" class="btn btn-primary mt-3">Upload</button>
</form>

<!-- Progress Modal -->
<div id="progressModal" class="modal fade" tabindex="-1">
  <div class="modal-dialog">
    <div class="modal-content">
      <div class="modal-header">
        <h5 class="modal-title">Processing Your Upload</h5>
      </div>
      <div class="modal-body">
        <div class="progress">
          <div id="progressBar" class="progress-bar" role="progressbar">0%</div>
        </div>
        <p id="progressStatus" class="mt-2 text-center">Starting...</p>
      </div>
    </div>
  </div>
</div>

<script>
  const form = document.getElementById("upload-form");
  form.addEventListener("submit", function (e) {
    e.preventDefault();

    // Display loading state immediately
    const progressModal = new bootstrap.Modal(document.getElementById("progressModal"));
    progressModal.show();
    const progressBar = document.getElementById("progressBar");
    progressBar.style.width = "5%";
    progressBar.textContent = "Starting...";

    const formData = new FormData(form);

    // Disable the form while processing
    const submitButton = form.querySelector("button[type='submit']");
    submitButton.disabled = true;

    fetch(form.action, {
      method: "POST",
      body: formData,
    })
      .then((response) => response.json())
      .then((data) => {
        if (data.error) {
          // Handle error
          progressModal.hide();
          alert(`Error: ${data.error}`);
          submitButton.disabled = false;
          return;
        }

        const taskId = data.task_id;
        const interval = setInterval(() => {
          fetch("{{ url_for('upload.task_status', task_id='') }}" + taskId)
            .then((response) => response.json())
            .then((status) => {
              console.log("Task status:", status);
              if (status.state === "SUCCESS") {
                clearInterval(interval);
                progressBar.style.width = "100%";
                progressBar.textContent = "Completed!";

                setTimeout(() => {
                  progressModal.hide();
                  showResults(status.result);
                  submitButton.disabled = false;
                }, 1000);
              } else if (status.state === "FAILURE") {
                clearInterval(interval);
                progressBar.style.width = "100%";
                progressBar.classList.add("bg-danger");
                progressBar.textContent = "Failed!";

                setTimeout(() => {
                  progressModal.hide();
                  alert(`Task failed: ${status.error || "Unknown error"}`);
                  submitButton.disabled = false;
                }, 1000);
              } else {
                // Update progress bar with more information
                const progress = status.progress || 0;
                progressBar.style.width = `${progress}%`;
                progressBar.textContent = `${progress}% complete`;
                document.getElementById("progressStatus").innerText = `Processing... (${status.state})`;
              }
            })
            .catch((err) => {
              console.error("Failed to check task status:", err);
            });
        }, 1000);
      })
      .catch((err) => {
        console.error("Upload failed:", err);
        progressModal.hide();
        alert("Upload failed. Please try again.");
        submitButton.disabled = false;
      });
  });

  const showResults = (result) => {
    const message = `Upload completed! Added: ${result.added}, Updated: ${result.updated}, Skipped: ${result.skipped}, Errors: ${result.error_count}`;

    let resultHTML = `<div class="alert alert-success">${message}</div>`;

    // Add skipped records information
    if (result.skipped > 0) {
      resultHTML += `
        <div class="alert alert-info">
          <h4>${result.skipped} records were skipped</h4>
          <p>${result.skipped_reason_summary || "Records were skipped because they already exist in the database."}</p>
          ${result.skipped_records && result.skipped_records.length > 0 ? `
          <p>Examples of skipped records:</p>
          <table class="table table-sm table-bordered">
            <thead>
              <tr>
                <th>Row</th>
                <th>DOI</th>
                <th>Reason</th>
              </tr>
            </thead>
            <tbody>
              ${result.skipped_records.map(record => `
              <tr>
                <td>${record.row}</td>
                <td>${record.doi}</td>
                <td>${record.reason}</td>
              </tr>
              `).join('')}
            </tbody>
          </table>
          ` : ''}
        </div>`;
    }

    // Existing error display code
    if (result.error_count > 0) {
      resultHTML += `
        <div class="alert alert-warning">
          <h4>Some errors occurred (${result.error_count} total)</h4>
          <p>Showing first ${result.errors.length} of ${result.error_count} errors:</p>
          <table class="table table-sm table-bordered">
            <thead>
              <tr>
                <th>Row</th>
                <th>DOI</th>
                <th>Error</th>
              </tr>
            </thead>
            <tbody>`;

      result.errors.forEach(error => {
        resultHTML += `
          <tr>
            <td>${error.row}</td>
            <td>${error.doi}</td>
            <td>${error.error}</td>
          </tr>`;
      });

      resultHTML += `
          </tbody>
        </table>
        <p class="mt-2">Download the complete error log with all ${result.error_count} errors:</p>
        <a href="/upload/download_error_log/${result.task_id}" class="btn btn-outline-secondary">
          Download Full Error Log
        </a>
      </div>`;
    }

    document.getElementById("results-container").innerHTML = resultHTML;
  };
</script>
{% endblock content %}
1641 testdata.csv (new file)
File diff suppressed because it is too large