From 3a73aaf8aa82783e2d928215c7f2233eab96edbf Mon Sep 17 00:00:00 2001
From: Michael Beck
Date: Tue, 1 Apr 2025 23:17:34 +0200
Subject: [PATCH] Add error handling to CSV import

---
 scipaperloader/templates/upload.html |  49 ++++++++---
 scipaperloader/views.py              | 117 ++++++++++++++++++++++----
 2 files changed, 142 insertions(+), 24 deletions(-)

diff --git a/scipaperloader/templates/upload.html b/scipaperloader/templates/upload.html
index e83533b..5c23d20 100644
--- a/scipaperloader/templates/upload.html
+++ b/scipaperloader/templates/upload.html
@@ -1,7 +1,35 @@
 {% extends 'base.html' %}
 
 {% block content %}
 <h1>Welcome to SciPaperLoader</h1>
 
-<p>Your paper scraping tool is ready.</p>
+
+{% if success %}
+<div>{{ success }}</div>
+{% endif %}
+
+{% if error_message %}
+<div>
+  <p>{{ error_message }}</p>
+  <table>
+    <thead>
+      <tr>
+        <th>Row</th>
+        <th>DOI</th>
+        <th>Error</th>
+      </tr>
+    </thead>
+    <tbody>
+      {% for error in error_samples %}
+      <tr>
+        <td>{{ error.row }}</td>
+        <td>{{ error.doi }}</td>
+        <td>{{ error.error }}</td>
+      </tr>
+      {% endfor %}
+    </tbody>
+  </table>
+  <a href="{{ url_for('main.download_error_log') }}">Download Full Error Log</a>
+</div>
+{% endif %}
 <p>Instructions: Please upload a CSV file containing academic paper metadata. The file must include the following columns:</p>
@@ -16,6 +44,17 @@
+    <div>
+      <label for="duplicate_strategy">Duplicate handling:</label>
+      <select name="duplicate_strategy" id="duplicate_strategy">
+        <option value="skip" selected>Skip rows whose DOI already exists</option>
+        <option value="update">Update the existing record</option>
+      </select>
+    </div>
@@ -31,12 +70,4 @@
-
-{% if error %}
-<div>{{ error }}</div>
-{% endif %}
-
-{% if success %}
-<div>{{ success }}</div>
-{% endif %}
 {% endblock %}
diff --git a/scipaperloader/views.py b/scipaperloader/views.py
index 128737b..1720c2a 100644
--- a/scipaperloader/views.py
+++ b/scipaperloader/views.py
@@ -13,8 +13,10 @@ from flask import (
     render_template,
     request,
     send_file,
+    session,  # session storage for the downloadable error log
     url_for,
 )
+
 from sqlalchemy import asc, desc
 
 from .db import db
@@ -36,6 +38,7 @@ def upload():
     if request.method == "POST":
         file = request.files.get("file")
         delimiter = request.form.get("delimiter", ",")
+        duplicate_strategy = request.form.get("duplicate_strategy", "skip")
 
         if not file:
             return render_template("upload.html", error="No file selected.")
@@ -62,20 +65,70 @@
             except Exception:
                 return None
 
-        for _, row in df.iterrows():
-            metadata = PaperMetadata(
-                title=row["title"],
-                doi=row["doi"],
-                alt_id=row.get("alternative_id"),
-                issn=row["issn"],
-                type=row.get("type"),
-                language=row.get("language"),
-                published_online=parse_date(row.get("published_online")),
-                status="New",
-                file_path=None,
-                error_msg=None,
-            )
-            db.session.add(metadata)
+        # Count statistics
+        added_count = 0
+        skipped_count = 0
+        updated_count = 0
+        error_count = 0
+
+        # Collect error information
+        errors = []
+
+        # Process each row
+        for index, row in df.iterrows():
+            try:
+                # Get DOI from row for error reporting
+                doi = str(row.get("doi", "N/A"))
+
+                # Validate required fields
+                for field in ["title", "doi", "issn"]:
+                    if pd.isna(row.get(field)) or not str(row.get(field)).strip():
+                        raise ValueError(f"Missing required field: {field}")
+
+                # Check if a paper with this DOI already exists
+                existing = PaperMetadata.query.filter_by(doi=doi).first()
+
+                if existing:
+                    if duplicate_strategy == "update":
+                        # Update the existing record
+                        existing.title = row["title"]
+                        existing.alt_id = row.get("alternative_id")
+                        existing.issn = row["issn"]
+                        existing.journal = row.get("journal")
+                        existing.type = row.get("type")
+                        existing.language = row.get("language")
+                        existing.published_online = parse_date(row.get("published_online"))
+                        updated_count += 1
+                    else:
+                        # Skip this record
+                        skipped_count += 1
+                        continue
+                else:
+                    # Create a new record
+                    metadata = PaperMetadata(
+                        title=row["title"],
+                        doi=doi,
+                        alt_id=row.get("alternative_id"),
+                        issn=row["issn"],
+                        journal=row.get("journal"),
+                        type=row.get("type"),
+                        language=row.get("language"),
+                        published_online=parse_date(row.get("published_online")),
+                        status="New",
+                        file_path=None,
+                        error_msg=None,
+                    )
+                    db.session.add(metadata)
+                    added_count += 1
+
+            except Exception as e:
+                error_count += 1
+                errors.append({
+                    "row": index + 2,  # +2 because index is 0-based and we have a header row
+                    "doi": row.get("doi", "N/A"),
+                    "error": str(e)
+                })
+                continue  # Skip this row and continue with the next
 
         try:
             db.session.commit()
@@ -85,12 +138,46 @@
                 "upload.html", error=f"Failed to save data to database: {e}"
             )
 
+        # Prepare error samples for display
+        error_samples = errors[:5] if errors else []
+
+        error_message = None
+        if errors:
+            error_message = f"Encountered {len(errors)} errors. First 5 shown below."
+
+        # Store the full errors list in the session for potential download
+        if errors:
+            error_csv = StringIO()
+            writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
+            writer.writeheader()
+            writer.writerows(errors)
+            session["error_data"] = error_csv.getvalue()
+
         return render_template(
-            "upload.html", success="File uploaded and validated successfully!"
+            "upload.html",
+            success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}",
+            error_message=error_message,
+            error_samples=error_samples
         )
 
     return render_template("upload.html")
 
 
+# Add a route to download the error log
+@bp.route("/download_error_log")
+def download_error_log():
+    error_data = session.get("error_data")
+    if not error_data:
+        flash("No error data available.")
+        return redirect(url_for("main.upload"))
+
+    buffer = StringIO(error_data)
+    return send_file(
+        buffer,
+        mimetype="text/csv",
+        as_attachment=True,
+        download_name=f"upload_errors_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+    )
+
+
 @bp.route("/papers")
 def list_papers():
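
Note on the new download route (a reviewer sketch, not part of the patch): Werkzeug's send_file expects a file object opened in binary mode, so returning a StringIO buffer may fail with a type error once the response body is streamed to the WSGI server. The sketch below shows the same route built around a BytesIO wrapper instead. It assumes Flask 2.x (the patch already relies on the download_name parameter) and reuses the route name, session key, and blueprint from the patch; the "from .views import bp" import and the UTF-8 encoding choice are illustrative assumptions, not taken from the repository.

import datetime
from io import BytesIO

from flask import flash, redirect, send_file, session, url_for

from .views import bp  # assumption: the "main" blueprint is defined in views.py


@bp.route("/download_error_log")
def download_error_log():
    # Same session lookup as in the patch.
    error_data = session.get("error_data")
    if not error_data:
        flash("No error data available.")
        return redirect(url_for("main.upload"))

    # Encode the stored CSV text so send_file receives a bytes-mode stream.
    buffer = BytesIO(error_data.encode("utf-8"))
    return send_file(
        buffer,
        mimetype="text/csv",
        as_attachment=True,
        download_name=f"upload_errors_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
    )

Only the buffer construction differs from the patched route; everything else, including the session-based handoff from upload(), can stay as written.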