From 3a73aaf8aa82783e2d928215c7f2233eab96edbf Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Tue, 1 Apr 2025 23:17:34 +0200
Subject: [PATCH] Add error handling to CSV import

---
 scipaperloader/templates/upload.html |  49 ++++++++---
 scipaperloader/views.py              | 117 ++++++++++++++++++++++----
 2 files changed, 142 insertions(+), 24 deletions(-)

diff --git a/scipaperloader/templates/upload.html b/scipaperloader/templates/upload.html
index e83533b..5c23d20 100644
--- a/scipaperloader/templates/upload.html
+++ b/scipaperloader/templates/upload.html
@@ -1,7 +1,35 @@
 {% extends 'base.html' %}
 
 {% block content %}
 <h1>Welcome to SciPaperLoader</h1>
 
-<p>Your paper scraping tool is ready.</p>
+
+{% if success %}
+<div>{{ success }}</div>
+{% endif %}
+
+{% if error_message %}
+<div>
+  <p>{{ error_message }}</p>
+  <table>
+    <thead>
+      <tr>
+        <th>Row</th>
+        <th>DOI</th>
+        <th>Error</th>
+      </tr>
+    </thead>
+    <tbody>
+      {% for error in error_samples %}
+      <tr>
+        <td>{{ error.row }}</td>
+        <td>{{ error.doi }}</td>
+        <td>{{ error.error }}</td>
+      </tr>
+      {% endfor %}
+    </tbody>
+  </table>
+  <a href="{{ url_for('main.download_error_log') }}">Download Full Error Log</a>
+</div>
+{% endif %}
 <p>Instructions: Please upload a CSV file containing academic paper metadata. The file must include the following columns:</p>
@@ -16,6 +44,17 @@
+    <div>
+      <label for="duplicate_strategy">Duplicate handling:</label>
+      <select name="duplicate_strategy" id="duplicate_strategy">
+        <option value="skip" selected>Skip rows whose DOI already exists</option>
+        <option value="update">Update the existing record</option>
+      </select>
+    </div>
@@ -31,12 +70,4 @@
-
-{% if error %}
-<div>{{ error }}</div>
-{% endif %}
-
-{% if success %}
-<div>{{ success }}</div>
-{% endif %}
 {% endblock %}
diff --git a/scipaperloader/views.py b/scipaperloader/views.py
index 128737b..1720c2a 100644
--- a/scipaperloader/views.py
+++ b/scipaperloader/views.py
@@ -13,8 +13,10 @@ from flask import (
     render_template,
     request,
     send_file,
+    session,  # session storage for the downloadable error log
     url_for,
 )
+
 from sqlalchemy import asc, desc
 
 from .db import db
@@ -36,6 +38,7 @@ def upload():
     if request.method == "POST":
         file = request.files.get("file")
         delimiter = request.form.get("delimiter", ",")
+        duplicate_strategy = request.form.get("duplicate_strategy", "skip")
 
         if not file:
             return render_template("upload.html", error="No file selected.")
@@ -62,20 +65,70 @@
             except Exception:
                 return None
 
-        for _, row in df.iterrows():
-            metadata = PaperMetadata(
-                title=row["title"],
-                doi=row["doi"],
-                alt_id=row.get("alternative_id"),
-                issn=row["issn"],
-                type=row.get("type"),
-                language=row.get("language"),
-                published_online=parse_date(row.get("published_online")),
-                status="New",
-                file_path=None,
-                error_msg=None,
-            )
-            db.session.add(metadata)
+        # Count statistics
+        added_count = 0
+        skipped_count = 0
+        updated_count = 0
+        error_count = 0
+
+        # Collect error information
+        errors = []
+
+        # Process each row
+        for index, row in df.iterrows():
+            try:
+                # Get DOI from row for error reporting
+                doi = str(row.get("doi", "N/A"))
+
+                # Validate required fields
+                for field in ["title", "doi", "issn"]:
+                    if pd.isna(row.get(field)) or not str(row.get(field)).strip():
+                        raise ValueError(f"Missing required field: {field}")
+
+                # Check if a paper with this DOI already exists
+                existing = PaperMetadata.query.filter_by(doi=doi).first()
+
+                if existing:
+                    if duplicate_strategy == "update":
+                        # Update the existing record
+                        existing.title = row["title"]
+                        existing.alt_id = row.get("alternative_id")
+                        existing.issn = row["issn"]
+                        existing.journal = row.get("journal")
+                        existing.type = row.get("type")
+                        existing.language = row.get("language")
+                        existing.published_online = parse_date(row.get("published_online"))
+                        updated_count += 1
+                    else:
+                        # Skip this record
+                        skipped_count += 1
+                        continue
+                else:
+                    # Create a new record
+                    metadata = PaperMetadata(
+                        title=row["title"],
+                        doi=doi,
+                        alt_id=row.get("alternative_id"),
+                        issn=row["issn"],
+                        journal=row.get("journal"),
+                        type=row.get("type"),
+                        language=row.get("language"),
+                        published_online=parse_date(row.get("published_online")),
+                        status="New",
+                        file_path=None,
+                        error_msg=None,
+                    )
+                    db.session.add(metadata)
+                    added_count += 1
+
+            except Exception as e:
+                error_count += 1
+                errors.append({
+                    "row": index + 2,  # +2 because index is 0-based and we have a header row
+                    "doi": row.get("doi", "N/A"),
+                    "error": str(e)
+                })
+                continue  # Skip this row and continue with the next
 
         try:
             db.session.commit()
@@ -85,12 +138,46 @@
                 "upload.html", error=f"Failed to save data to database: {e}"
             )
 
+        # Prepare error samples for display
+        error_samples = errors[:5] if errors else []
+
+        error_message = None
+        if errors:
+            error_message = f"Encountered {len(errors)} errors. First 5 shown below."
+
+        # Store the full errors list in the session for potential download
+        if errors:
+            error_csv = StringIO()
+            writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
+            writer.writeheader()
+            writer.writerows(errors)
+            session["error_data"] = error_csv.getvalue()
+
         return render_template(
-            "upload.html", success="File uploaded and validated successfully!"
+            "upload.html",
+            success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}",
+            error_message=error_message,
+            error_samples=error_samples
         )
 
     return render_template("upload.html")
 
 
+# Add a route to download the error log
+@bp.route("/download_error_log")
+def download_error_log():
+    error_data = session.get("error_data")
+    if not error_data:
+        flash("No error data available.")
+        return redirect(url_for("main.upload"))
+
+    buffer = StringIO(error_data)
+    return send_file(
+        buffer,
+        mimetype="text/csv",
+        as_attachment=True,
+        download_name=f"upload_errors_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+    )
+
+
 @bp.route("/papers")
 def list_papers():
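
Note on the new download route (a reviewer sketch, not part of the patch): Werkzeug's send_file expects a file object opened in binary mode, so returning a StringIO buffer may fail with a type error once the response body is streamed to the WSGI server. The sketch below shows the same route built around a BytesIO wrapper instead. It assumes Flask 2.x (the patch already relies on the download_name parameter) and reuses the route name, session key, and blueprint from the patch; the "from .views import bp" import and the UTF-8 encoding choice are illustrative assumptions, not taken from the repository.

import datetime
from io import BytesIO

from flask import flash, redirect, send_file, session, url_for

from .views import bp  # assumption: the "main" blueprint is defined in views.py


@bp.route("/download_error_log")
def download_error_log():
    # Same session lookup as in the patch.
    error_data = session.get("error_data")
    if not error_data:
        flash("No error data available.")
        return redirect(url_for("main.upload"))

    # Encode the stored CSV text so send_file receives a bytes-mode stream.
    buffer = BytesIO(error_data.encode("utf-8"))
    return send_file(
        buffer,
        mimetype="text/csv",
        as_attachment=True,
        download_name=f"upload_errors_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
    )

Only the buffer construction differs from the patched route; everything else, including the session-based handoff from upload(), can stay as written.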