Implements asynchronous task management in the input module.
parent 5d63e28a61
commit 05f4c8b517
.gitignore (vendored): 2 changes

@@ -13,3 +13,5 @@ dist/
 *.db
 *.R
 
+
+migrations/
Makefile: 31 changes

@@ -1,10 +1,13 @@
 # List of phony targets (targets that don't represent files)
-.PHONY: all clean venv run format format-check lint mypy test dist reformat dev
+.PHONY: all clean venv run format format-check lint mypy test dist reformat dev celery celery-flower redis run-all
 
 # Define Python and pip executables inside virtual environment
 PYTHON := venv/bin/python
 PIP := venv/bin/pip
 
+# Celery worker command
+CELERY := venv/bin/celery
+
 # Default target that runs the application
 all: run
 
@@ -83,11 +86,11 @@ todos:
 	@grep -r "TODO\|FIXME" scipaperloader || echo "No TODOs found"
 
 # Reset the database: delete, initialize, and migrate
-reset-db:
+reset-db: venv
 	rm -f $(DB_PATH)
-	flask db init || true
-	flask db migrate -m "Initial migration"
-	flask db upgrade
+	$(PYTHON) -m flask --app scipaperloader db init || true
+	$(PYTHON) -m flask --app scipaperloader db migrate -m "Initial migration"
+	$(PYTHON) -m flask --app scipaperloader db upgrade
 
 # Create and set up virtual environment
 venv:
@@ -130,3 +133,21 @@ dist: format-check lint mypy test
 
 # Set up complete development environment
 dev: clean venv
+
+# Start Celery worker for processing tasks
+celery: venv
+	$(CELERY) -A celery_worker:celery worker --loglevel=info
+
+# Monitor Celery tasks with flower web interface
+celery-flower: venv
+	$(PIP) install flower
+	$(CELERY) -A celery_worker:celery flower --port=5555
+
+# Check if Redis is running, start if needed
+redis:
+	@redis-cli ping > /dev/null 2>&1 || (echo "Starting Redis server..." && redis-server --daemonize yes)
+
+# Run complete application stack (Flask app + Celery worker + Redis)
+run-all: redis
+	@echo "Starting Flask and Celery..."
+	@$(MAKE) -j2 run celery
README.md: 43 changes

@@ -14,7 +14,8 @@ And open it in the browser at [http://localhost:5000/](http://localhost:5000/)
 
 ## Prerequisites
 
-Python >=3.8
+- Python >=3.8
+- Redis (for Celery task queue)
 
 ## Development environment
 
@@ -40,12 +41,44 @@ Python >=3.8
 add development dependencies under `project.optional-dependencies.*`; run
 `make clean && make venv` to reinstall the environment
 
+## Asynchronous Task Processing with Celery
+
+SciPaperLoader uses Celery for processing large CSV uploads and other background tasks. This allows the application to handle large datasets reliably without blocking the web interface.
+
+### Running Celery Components
+
+- `make redis`: ensures Redis server is running (required for Celery)
+- `make celery`: starts a Celery worker to process background tasks
+- `make celery-flower`: starts Flower, a web interface for monitoring Celery tasks at http://localhost:5555
+- `make run-all`: runs the entire stack (Flask app + Celery worker + Redis) in development mode
+
+### How It Works
+
+When you upload a CSV file through the web interface:
+
+1. The file is sent to the server
+2. A Celery task is created to process the file asynchronously
+3. The browser shows a progress bar with real-time updates
+4. The results are displayed when processing is complete
+
+This architecture allows SciPaperLoader to handle CSV files with thousands of papers without timing out or blocking the web interface.
+
 ## Configuration
 
 Default configuration is loaded from `scipaperloader.defaults` and can be
 overriden by environment variables with a `FLASK_` prefix. See
 [Configuring from Environment Variables](https://flask.palletsprojects.com/en/3.0.x/config/#configuring-from-environment-variables).
 
+### Celery Configuration
+
+The following environment variables can be set to configure Celery:
+
+- `FLASK_CELERY_BROKER_URL`: Redis URL for the message broker (default: `redis://localhost:6379/0`)
+- `FLASK_CELERY_RESULT_BACKEND`: Redis URL for storing task results (default: `redis://localhost:6379/0`)
+
 Consider using
 [dotenv](https://flask.palletsprojects.com/en/3.0.x/cli/#environment-variables-from-dotenv).
 
@@ -59,3 +92,11 @@ deliver to your server, or copy in your `Dockerfile`, and insall it with `pip`.
 You must set a
 [SECRET_KEY](https://flask.palletsprojects.com/en/3.0.x/tutorial/deploy/#configure-the-secret-key)
 in production to a secret and stable value.
+
+### Deploying with Celery
+
+When deploying to production:
+
+1. Configure a production-ready Redis instance or use a managed service
+2. Run Celery workers as system services or in Docker containers
+3. Consider setting up monitoring for your Celery tasks and workers
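Aside (not part of the commit): a quick way to sanity-check the broker settings documented above is to ping Redis with the `redis` package that this commit adds as a dependency. This is only a sketch; the URL fallback mirrors the documented default.

```python
# Minimal broker connectivity check (sketch, not part of the commit).
import os
import redis

broker_url = os.environ.get("FLASK_CELERY_BROKER_URL", "redis://localhost:6379/0")
client = redis.Redis.from_url(broker_url)
print(client.ping())  # True when the Redis broker is reachable
```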
celery_worker.py: 7 lines (new file)

@@ -0,0 +1,7 @@
+from scipaperloader.celery import celery, configure_celery
+
+# Configure celery with Flask app
+configure_celery()
+
+if __name__ == '__main__':
+    celery.start()
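Aside (not part of the commit): this module is what the new Makefile targets reference via `-A celery_worker:celery`. If you prefer not to go through the `celery` CLI, a worker can also be started programmatically; the following is a sketch that assumes Celery 5's `worker_main` is available.

```python
# Sketch: start a worker without the celery CLI (assumes Celery 5's worker_main).
from celery_worker import celery

if __name__ == "__main__":
    celery.worker_main(argv=["worker", "--loglevel=info"])
```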
pyproject.toml

@@ -13,6 +13,10 @@ dependencies = [
     "flask-wtf>=1.2.2,<2",
     "pyzotero>=1.6.11,<2",
     "pandas>=2.2.3,<3",
+    "celery>=5.5.1,<6",
+    "redis>=5.2.1,<6",
+    "flower>=2.0.1,<3",
+    "flask-migrate>=4.1.0,<5",
 ]
 
 [project.optional-dependencies]
scipaperloader/__init__.py

@@ -1,5 +1,5 @@
 from flask import Flask, request
+from flask_migrate import Migrate  # Add this line
 from .config import Config
 from .db import db
 from .models import init_schedule_config
@@ -10,10 +10,15 @@ def create_app(test_config=None):
     app = Flask(__name__)
     app.config.from_object(Config)
 
+    # Celery configuration
+    app.config['CELERY_BROKER_URL'] = app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0')
+    app.config['CELERY_RESULT_BACKEND'] = app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0')
+
     if test_config:
         app.config.update(test_config)
 
     db.init_app(app)
+    migrate = Migrate(app, db)  # Add this line to initialize Flask-Migrate
 
     with app.app_context():
         db.create_all()
@@ -27,9 +32,18 @@ def create_app(test_config=None):
 
     @app.before_request
     def before_request():
+        # Skip logging for static files, health checks, or other frequent requests
+        if request.path.startswith('/static/') or request.path == '/health' or request.path == '/favicon.ico':
+            return
+
+        # Skip task status checks to avoid log spam
+        if request.path.startswith('/task_status/'):
+            return
+
+        action = request.endpoint or request.path or "unknown_request"
         ActivityLog.log_gui_interaction(
-            action=request.endpoint,
-            description=f"Request to {request.endpoint}",
+            action=action,
+            description=f"Request to {request.path}",
             extra={"method": request.method, "url": request.url}
         )
 
@@ -116,7 +116,7 @@ def export_papers():
             [
                 paper.id,
                 paper.title,
-                getattr(paper, "journal", ""),
+                paper.journal,
                 paper.doi,
                 paper.issn,
                 paper.status,
@@ -3,26 +3,39 @@ import codecs
 import csv
 import datetime
 from io import StringIO
+import json
 
 import pandas as pd
 from flask import (
     Blueprint,
     flash,
+    jsonify,
     redirect,
     render_template,
     request,
     send_file,
     session,
     url_for,
+    current_app
 )
 
 from ..db import db
-from ..models import PaperMetadata
+from ..models import PaperMetadata, ActivityLog
+from ..celery import celery  # Import the celery instance directly
 
 bp = Blueprint("upload", __name__)
 
 REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
+CHUNK_SIZE = 100  # Number of rows to process per batch
+
+
+def parse_date(date_str):
+    """Parse date string into datetime object."""
+    if not date_str or pd.isna(date_str):
+        return None
+    try:
+        return datetime.datetime.strptime(date_str, "%Y-%m-%d")
+    except ValueError:
+        return None
 
 @bp.route("/", methods=["GET", "POST"])
 def upload():
@@ -32,136 +45,214 @@ def upload():
         duplicate_strategy = request.form.get("duplicate_strategy", "skip")
 
         if not file:
-            return render_template("upload.html.jinja", error="No file selected.")
+            return jsonify({"error": "No file selected."})
 
-        try:
-            stream = codecs.iterdecode(file.stream, "utf-8")
-            content = "".join(stream)
-            df = pd.read_csv(StringIO(content), delimiter=delimiter)
-        except Exception as e:
-            return render_template("upload.html.jinja", error=f"Failed to read CSV file: {e}")
-
-        missing = REQUIRED_COLUMNS - set(df.columns)
-        if missing:
-            return render_template(
-                "upload.html.jinja", error=f"Missing required columns: {', '.join(missing)}"
-            )
-
-        # Optional: parse 'published_online' to date
-        def parse_date(val):
-            if pd.isna(val):
-                return None
-            try:
-                return pd.to_datetime(val).date()
-            except Exception:
-                return None
-
-        # Count statistics
-        added_count = 0
-        skipped_count = 0
-        updated_count = 0
-        error_count = 0
-
-        # Collect error information
-        errors = []
-
-        # Process each row
-        for index, row in df.iterrows():
-            try:
-                # Get DOI from row for error reporting
-                doi = str(row.get("doi", "N/A"))
-
-                # Validate required fields
-                for field in ["title", "doi", "issn"]:
-                    if pd.isna(row.get(field)) or not str(row.get(field)).strip():
-                        raise ValueError(f"Missing required field: {field}")
-
-                # Check if paper with this DOI already exists
-                existing = PaperMetadata.query.filter_by(doi=doi).first()
-
-                if existing:
-                    if duplicate_strategy == 'update':
-                        # Update existing record
-                        existing.title = row["title"]
-                        existing.alt_id = row.get("alternative_id")
-                        existing.issn = row["issn"]
-                        existing.journal = row.get("journal")
-                        existing.type = row.get("type")
-                        existing.language = row.get("language")
-                        existing.published_online = parse_date(row.get("published_online"))
-                        updated_count += 1
-                    else:
-                        # Skip this record
-                        skipped_count += 1
-                        continue
-                else:
-                    # Create new record
-                    metadata = PaperMetadata(
-                        title=row["title"],
-                        doi=doi,
-                        alt_id=row.get("alternative_id"),
-                        issn=row["issn"],
-                        journal=row.get("journal"),
-                        type=row.get("type"),
-                        language=row.get("language"),
-                        published_online=parse_date(row.get("published_online")),
-                        status="New",
-                        file_path=None,
-                        error_msg=None,
-                    )
-                    db.session.add(metadata)
-                    added_count += 1
-
-            except Exception as e:
-                error_count += 1
-                errors.append({
-                    "row": index + 2,  # +2 because index is 0-based and we have a header row
-                    "doi": row.get("doi", "N/A"),
-                    "error": str(e)
-                })
-                continue  # Skip this row and continue with the next
-
-        try:
-            db.session.commit()
-        except Exception as e:
-            db.session.rollback()
-            return render_template(
-                "upload.html.jinja", error=f"Failed to save data to database: {e}"
-            )
-
-        # Prepare error samples for display
-        error_samples = errors[:5] if errors else []
-
-        error_message = None
-        if errors:
-            error_message = f"Encountered {len(errors)} errors. First 5 shown below."
-
-        # Store the full errors list in the session for potential download
-        if errors:
-            error_csv = StringIO()
-            writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
-            writer.writeheader()
-            writer.writerows(errors)
-            session["error_data"] = error_csv.getvalue()
-
-        return render_template(
-            "upload.html.jinja",
-            success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}",
-            error_message=error_message,
-            error_samples=error_samples
-        )
-
-    return render_template("upload.html.jinja")
-
-
-@bp.route("/download_error_log")
-def download_error_log():
-    error_data = session.get("error_data")
-    if not error_data:
-        flash("No error data available.")
-        return redirect(url_for("upload.upload"))
-
-    buffer = StringIO(error_data)
+        stream = codecs.iterdecode(file.stream, "utf-8")
+        content = "".join(stream)
+
+        # Trigger the Celery task
+        task = process_csv.delay(content, delimiter, duplicate_strategy)
+
+        return jsonify({"task_id": task.id})
+
+    return render_template("upload.html.jinja")
+
+
+@celery.task(bind=True)
+def process_csv(self, file_content, delimiter, duplicate_strategy):
+    """Process CSV file and import paper metadata."""
+
+    # With the ContextTask in place, we're already inside an app context
+    added_count = skipped_count = updated_count = error_count = 0
+    errors = []
+    skipped_records = []  # Add this to track skipped records
+
+    try:
+        # Log the start of import using ActivityLog model
+        ActivityLog.log_import_activity(
+            action="start_csv_import",
+            status="processing",
+            description=f"Starting CSV import with strategy: {duplicate_strategy}",
+            file_size=len(file_content),
+            delimiter=delimiter
+        )
+
+        # Set initial progress percentage
+        self.update_state(state='PROGRESS', meta={'progress': 10})
+
+        # Read CSV into chunks
+        csv_buffer = StringIO(file_content)
+        # Count total chunks
+        csv_buffer.seek(0)
+        total_chunks = len(list(pd.read_csv(csv_buffer, delimiter=delimiter, chunksize=CHUNK_SIZE)))
+        csv_buffer.seek(0)
+
+        # Process each chunk of rows
+        for chunk_idx, chunk in enumerate(pd.read_csv(csv_buffer, delimiter=delimiter, chunksize=CHUNK_SIZE)):
+            for index, row in chunk.iterrows():
+                try:
+                    doi = str(row.get("doi", "N/A"))
+                    # Validate required fields
+                    if pd.isna(row.get("title")) or pd.isna(row.get("doi")) or pd.isna(row.get("issn")):
+                        raise ValueError("Missing required fields")
+
+                    # Try finding an existing record based on DOI
+                    existing = db.session.query(PaperMetadata).filter_by(doi=doi).first()
+                    if existing:
+                        if duplicate_strategy == "update":
+                            existing.title = row["title"]
+                            existing.alt_id = row.get("alternative_id")
+                            existing.issn = row["issn"]
+                            existing.journal = row.get("journal")
+                            existing.published_online = parse_date(row.get("published_online"))
+                            updated_count += 1
+                        else:
+                            # Track why this record was skipped
+                            skipped_records.append({
+                                "row": index + 2,
+                                "doi": doi,
+                                "reason": f"Duplicate DOI found and strategy is '{duplicate_strategy}'"
+                            })
+                            skipped_count += 1
+                            continue
+                    else:
+                        metadata = PaperMetadata(
+                            title=row["title"],
+                            doi=doi,
+                            alt_id=row.get("alternative_id"),
+                            issn=row["issn"],
+                            journal=row.get("journal"),
+                            published_online=parse_date(row.get("published_online")),
+                            status="New",
+                        )
+                        db.session.add(metadata)
+                        added_count += 1
+                except Exception as e:
+                    error_count += 1
+                    errors.append({"row": index + 2, "doi": row.get("doi", "N/A"), "error": str(e)})
+
+            # Commit the chunk and roll session fresh
+            db.session.commit()
+
+            # Log periodic progress every 5 chunks
+            if (chunk_idx + 1) % 5 == 0:
+                ActivityLog.log_import_activity(
+                    action="import_progress",
+                    status="processing",
+                    description=f"Processed {chunk_idx+1}/{total_chunks} chunks",
+                    current_stats={
+                        "added": added_count,
+                        "updated": updated_count,
+                        "skipped": skipped_count,
+                        "errors": error_count
+                    }
+                )
+
+            progress = min(90, 10 + int((chunk_idx + 1) * 80 / total_chunks))
+            self.update_state(state='PROGRESS', meta={'progress': progress})
+
+        # Final progress update and completion log
+        self.update_state(state='PROGRESS', meta={'progress': 100})
+        ActivityLog.log_import_activity(
+            action="complete_csv_import",
+            status="success",
+            description="CSV import completed",
+            stats={
+                "added": added_count,
+                "updated": updated_count,
+                "skipped": skipped_count,
+                "errors": error_count
+            }
+        )
+
+    except Exception as e:
+        db.session.rollback()
+        ActivityLog.log_error(
+            error_message="CSV import failed",
+            exception=e,
+            severity="error",
+            source="upload.process_csv"
+        )
+        return {'error': str(e), 'progress': 0}
+    finally:
+        db.session.remove()
+
+    # If there were errors, store an error CSV for potential download
+    if errors:
+        try:
+            error_csv = StringIO()
+            writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
+            writer.writeheader()
+            writer.writerows(errors)
+            ActivityLog.log_import_activity(
+                action="import_errors",
+                status="error",
+                description=f"Import completed with {error_count} errors",
+                error_csv=error_csv.getvalue(),
+                task_id=self.request.id,
+                error_count=error_count
+            )
+        except Exception:
+            # Do not fail the task if error logging fails
+            pass
+
+    # Update the return value to include skipped records information
+    return {
+        "added": added_count,
+        "updated": updated_count,
+        "skipped": skipped_count,
+        "skipped_records": skipped_records[:5],  # Include up to 5 examples
+        "skipped_reason_summary": "Records were skipped because they already exist in the database. Use 'update' strategy to update them.",
+        "errors": errors[:5],
+        "error_count": error_count,
+        "task_id": self.request.id
+    }
+
+
+@bp.route("/task_status/<task_id>")
+def task_status(task_id):
+    """Get status of background task."""
+    task = celery.AsyncResult(task_id)
+
+    if task.state == "PENDING":
+        response = {"state": task.state, "progress": 0}
+    elif task.state == "PROGRESS":
+        response = {
+            "state": task.state,
+            "progress": task.info.get("progress", 0)
+        }
+    elif task.state == "SUCCESS":
+        response = {
+            "state": task.state,
+            "result": task.result
+        }
+    else:  # FAILURE, REVOKED, etc.
+        response = {
+            "state": task.state,
+            "error": str(task.info) if task.info else "Unknown error"
+        }
+
+    return jsonify(response)
+
+
+@bp.route("/download_error_log/<task_id>")
+def download_error_log(task_id):
+    # Find the most recent error log for this task
+    error_log = ActivityLog.query.filter(
+        ActivityLog.action == "import_errors",
+        ActivityLog.extra_data.like(f'%"{task_id}"%')  # Search in JSON
+    ).order_by(ActivityLog.timestamp.desc()).first()
+
+    if not error_log:
+        flash("No error data available.")
+        return redirect(url_for("upload.upload"))
+
+    # Get the CSV data from extra_data
+    extra_data = error_log.get_extra_data()
+    error_csv = extra_data.get("error_csv")
+
+    if not error_csv:
+        flash("Error data format is invalid.")
+        return redirect(url_for("upload.upload"))
+
+    buffer = StringIO(error_csv)
     return send_file(
         buffer,
         mimetype="text/csv",
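Aside (not part of the commit): the new endpoints can be driven end to end by uploading a CSV and then polling `/task_status/<task_id>` until the task finishes. The sketch below assumes the `requests` package, a local file named `papers.csv`, and that the blueprint is mounted under `/upload`, as the template's download link suggests.

```python
# Sketch: upload a CSV and poll the task status endpoint (assumptions noted above).
import time
import requests

BASE = "http://localhost:5000/upload"

with open("papers.csv", "rb") as fh:
    resp = requests.post(f"{BASE}/", files={"file": fh},
                         data={"delimiter": ",", "duplicate_strategy": "skip"})
task_id = resp.json()["task_id"]

while True:
    status = requests.get(f"{BASE}/task_status/{task_id}").json()
    if status["state"] in ("SUCCESS", "FAILURE"):
        print(status)
        break
    time.sleep(1)
```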
scipaperloader/celery.py: 43 lines (new file)

@@ -0,0 +1,43 @@
+from celery import Celery
+
+# Create Celery instance without Flask app initially
+celery = Celery(
+    'scipaperloader',
+    broker='redis://localhost:6379/0',
+    backend='redis://localhost:6379/0',
+)
+
+
+def configure_celery(app=None):
+    """Configure Celery with the Flask app settings and ensure tasks run in the app context."""
+    if app is None:
+        # Import here to avoid circular import
+        from scipaperloader import create_app
+        app = create_app()
+
+    # Update Celery configuration using the app settings
+    celery.conf.update(
+        broker_url=app.config.get('CELERY_BROKER_URL', 'redis://localhost:6379/0'),
+        result_backend=app.config.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0'),
+        task_serializer='json',
+        accept_content=['json'],
+        result_serializer='json',
+        timezone='UTC',
+        enable_utc=True,
+        task_time_limit=3600,  # 1 hour max runtime
+        task_soft_time_limit=3000,  # 50 minutes soft limit
+        worker_max_tasks_per_child=10,  # Restart workers after 10 tasks
+        worker_max_memory_per_child=1000000,  # 1GB memory limit
+        task_acks_late=True,  # Acknowledge tasks after completion
+        task_reject_on_worker_lost=True,  # Requeue tasks if worker dies
+    )
+
+    # Create a custom task class that pushes the Flask application context
+    class ContextTask(celery.Task):
+        abstract = True
+
+        def __call__(self, *args, **kwargs):
+            with app.app_context():
+                return self.run(*args, **kwargs)
+
+    celery.Task = ContextTask
+    return celery
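Aside (not part of the commit): the `ContextTask` wrapper is what lets `process_csv` above touch the database without opening an application context by hand. A small illustrative sketch follows; the `count_papers` task is hypothetical and only demonstrates the pattern.

```python
# Sketch: any task registered on this Celery app runs inside app_context,
# so Flask-SQLAlchemy queries work directly. `count_papers` is hypothetical.
from scipaperloader.celery import celery, configure_celery
from scipaperloader.models import PaperMetadata

configure_celery()

@celery.task
def count_papers():
    return PaperMetadata.query.count()
```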
@@ -12,6 +12,7 @@ class ActivityCategory(Enum):
     SCRAPER_COMMAND = "scraper_command"
     SCRAPER_ACTIVITY = "scraper_activity"
     SYSTEM = "system"
+    DATA_IMPORT = "data_import"
 
 
 class ErrorSeverity(Enum):
@@ -164,6 +165,20 @@ class ActivityLog(db.Model):
         db.session.commit()
         return log
 
+    @classmethod
+    def log_import_activity(cls, action, status=None, description=None, user_id=None, **extra):
+        """Log data import activities (CSV uploads, bulk imports, etc.)."""
+        log = cls(
+            category=ActivityCategory.DATA_IMPORT.value,
+            action=action,
+            status=status,
+            description=description,
+            user_id=user_id
+        )
+        log.set_extra_data(extra)
+        db.session.add(log)
+        db.session.commit()
+        return log
+
 
 class PaperMetadata(db.Model):
     id = db.Column(db.Integer, primary_key=True)
@@ -171,6 +186,7 @@ class PaperMetadata(db.Model):
     doi = db.Column(db.String, unique=True, index=True)
     alt_id = db.Column(db.String)
     issn = db.Column(db.String(32))
+    journal = db.Column(db.String(255))
     type = db.Column(db.String(50))
     language = db.Column(db.String(50))
     published_online = db.Column(db.Date)  # or DateTime/String
@@ -1,5 +1,9 @@
 .message {
     padding: 10px;
     font-size: 1.3em;
     font-family: Arial, sans-serif;
+}
+
+.progress-bar {
+    width: 0%;
 }
@@ -1,6 +1,8 @@
 {% extends "base.html.jinja" %} {% block content %}
 <h1>Welcome to SciPaperLoader</h1>
 
+<div id="results-container"></div>
+
 {% if success %}
 <div class="alert alert-success mt-3">{{ success }}</div>
 {% endif %} {% if error_message %}
@@ -40,24 +42,9 @@
   <li><code>issn</code> – the ISSN of the journal</li>
   <li><code>title</code> – the title of the paper</li>
 </ul>
-<p>
-  The format of your CSV should resemble the response structure of the
-  Crossref API's <code>/journals/{issn}/works</code> endpoint.
-</p>
 </div>
 
-<form method="post" action="{{ url_for('upload.upload') }}" enctype="multipart/form-data">
-  <div class="mb-3">
-    <label class="form-label">How to handle duplicate DOIs:</label>
-    <div class="form-check">
-      <input class="form-check-input" type="radio" name="duplicate_strategy" value="skip" id="skip" checked />
-      <label class="form-check-label" for="skip">Skip duplicate entries</label>
-    </div>
-    <div class="form-check">
-      <input class="form-check-input" type="radio" name="duplicate_strategy" value="update" id="update" />
-      <label class="form-check-label" for="update">Update existing entries</label>
-    </div>
-  </div>
+<form method="post" action="{{ url_for('upload.upload') }}" enctype="multipart/form-data" id="upload-form">
   <div class="form-group">
     <label for="file">Upload CSV File</label>
     <input type="file" name="file" id="file" class="form-control" required />
@@ -73,4 +60,175 @@
   </div>
   <button type="submit" class="btn btn-primary mt-3">Upload</button>
 </form>
+
+<!-- Progress Modal -->
+<div id="progressModal" class="modal fade" tabindex="-1">
+  <div class="modal-dialog">
+    <div class="modal-content">
+      <div class="modal-header">
+        <h5 class="modal-title">Processing Your Upload</h5>
+      </div>
+      <div class="modal-body">
+        <div class="progress">
+          <div id="progressBar" class="progress-bar" role="progressbar">0%</div>
+        </div>
+        <p id="progressStatus" class="mt-2 text-center">Starting...</p>
+      </div>
+    </div>
+  </div>
+</div>
+
+<script>
+  const form = document.getElementById("upload-form");
+  form.addEventListener("submit", function (e) {
+    e.preventDefault();
+
+    // Display loading state immediately
+    const progressModal = new bootstrap.Modal(document.getElementById("progressModal"));
+    progressModal.show();
+    const progressBar = document.getElementById("progressBar");
+    progressBar.style.width = "5%";
+    progressBar.textContent = "Starting...";
+
+    const formData = new FormData(form);
+
+    // Disable the form while processing
+    const submitButton = form.querySelector("button[type='submit']");
+    submitButton.disabled = true;
+
+    fetch(form.action, {
+      method: "POST",
+      body: formData,
+    })
+      .then((response) => response.json())
+      .then((data) => {
+        if (data.error) {
+          // Handle error
+          progressModal.hide();
+          alert(`Error: ${data.error}`);
+          submitButton.disabled = false;
+          return;
+        }
+
+        const taskId = data.task_id;
+        const interval = setInterval(() => {
+          fetch("{{ url_for('upload.task_status', task_id='') }}" + taskId)
+            .then((response) => response.json())
+            .then((status) => {
+              console.log("Task status:", status);
+              if (status.state === "SUCCESS") {
+                clearInterval(interval);
+                progressBar.style.width = "100%";
+                progressBar.textContent = "Completed!";
+
+                setTimeout(() => {
+                  progressModal.hide();
+                  showResults(status.result);
+                  submitButton.disabled = false;
+                }, 1000);
+              } else if (status.state === "FAILURE") {
+                clearInterval(interval);
+                progressBar.style.width = "100%";
+                progressBar.classList.add("bg-danger");
+                progressBar.textContent = "Failed!";
+
+                setTimeout(() => {
+                  progressModal.hide();
+                  alert(`Task failed: ${status.error || "Unknown error"}`);
+                  submitButton.disabled = false;
+                }, 1000);
+              } else {
+                // Update progress bar with more information
+                const progress = status.progress || 0;
+                progressBar.style.width = `${progress}%`;
+                progressBar.textContent = `${progress}% complete`;
+                document.getElementById("progressStatus").innerText = `Processing... (${status.state})`;
+              }
+            })
+            .catch((err) => {
+              console.error("Failed to check task status:", err);
+            });
+        }, 1000);
+      })
+      .catch((err) => {
+        console.error("Upload failed:", err);
+        progressModal.hide();
+        alert("Upload failed. Please try again.");
+        submitButton.disabled = false;
+      });
+  });
+
+  const showResults = (result) => {
+    const message = `Upload completed! Added: ${result.added}, Updated: ${result.updated}, Skipped: ${result.skipped}, Errors: ${result.error_count}`;
+
+    let resultHTML = `<div class="alert alert-success">${message}</div>`;
+
+    // Add skipped records information
+    if (result.skipped > 0) {
+      resultHTML += `
+        <div class="alert alert-info">
+          <h4>${result.skipped} records were skipped</h4>
+          <p>${result.skipped_reason_summary || "Records were skipped because they already exist in the database."}</p>
+          ${result.skipped_records && result.skipped_records.length > 0 ? `
+            <p>Examples of skipped records:</p>
+            <table class="table table-sm table-bordered">
+              <thead>
+                <tr>
+                  <th>Row</th>
+                  <th>DOI</th>
+                  <th>Reason</th>
+                </tr>
+              </thead>
+              <tbody>
+                ${result.skipped_records.map(record => `
+                  <tr>
+                    <td>${record.row}</td>
+                    <td>${record.doi}</td>
+                    <td>${record.reason}</td>
+                  </tr>
+                `).join('')}
+              </tbody>
+            </table>
+          ` : ''}
+        </div>`;
+    }
+
+    // Existing error display code
+    if (result.error_count > 0) {
+      resultHTML += `
+        <div class="alert alert-warning">
+          <h4>Some errors occurred (${result.error_count} total)</h4>
+          <p>Showing first ${result.errors.length} of ${result.error_count} errors:</p>
+          <table class="table table-sm table-bordered">
+            <thead>
+              <tr>
+                <th>Row</th>
+                <th>DOI</th>
+                <th>Error</th>
+              </tr>
+            </thead>
+            <tbody>`;
+
+      result.errors.forEach(error => {
+        resultHTML += `
+          <tr>
+            <td>${error.row}</td>
+            <td>${error.doi}</td>
+            <td>${error.error}</td>
+          </tr>`;
+      });
+
+      resultHTML += `
+          </tbody>
+        </table>
+        <p class="mt-2">Download the complete error log with all ${result.error_count} errors:</p>
+        <a href="/upload/download_error_log/${result.task_id}" class="btn btn-outline-secondary">
+          Download Full Error Log
+        </a>
+      </div>`;
+    }
+
+    document.getElementById("results-container").innerHTML = resultHTML;
+  };
+</script>
 {% endblock content %}
testdata.csv: 1641 lines (new file)
File diff suppressed because it is too large.