added error handling to csv import

This commit is contained in:
Michael Beck 2025-04-01 23:17:34 +02:00
parent d2b99ec0d7
commit 3a73aaf8aa
2 changed files with 142 additions and 24 deletions

View File

@ -1,7 +1,35 @@
{% extends 'base.html' %} {% extends 'base.html' %}
{% block content %} {% block content %}
<h1>Welcome to SciPaperLoader</h1> <h1>Welcome to SciPaperLoader</h1>
<p>Your paper scraping tool is ready.</p>
{% if success %}
<div class="alert alert-success mt-3">{{ success }}</div>
{% endif %}
{% if error_message %}
<div class="alert alert-warning mt-3">
<h4>{{ error_message }}</h4>
<table class="table table-sm table-bordered">
<thead>
<tr>
<th>Row</th>
<th>DOI</th>
<th>Error</th>
</tr>
</thead>
<tbody>
{% for error in error_samples %}
<tr>
<td>{{ error.row }}</td>
<td>{{ error.doi }}</td>
<td>{{ error.error }}</td>
</tr>
{% endfor %}
</tbody>
</table>
<a href="{{ url_for('main.download_error_log') }}" class="btn btn-outline-secondary">Download Full Error Log</a>
</div>
{% endif %}
<div class="alert alert-info"> <div class="alert alert-info">
<p><strong>Instructions:</strong> Please upload a CSV file containing academic paper metadata. The file must include the following columns:</p> <p><strong>Instructions:</strong> Please upload a CSV file containing academic paper metadata. The file must include the following columns:</p>
@ -16,6 +44,17 @@
</div> </div>
<form method="POST" action="{{ url_for('main.upload') }}" enctype="multipart/form-data"> <form method="POST" action="{{ url_for('main.upload') }}" enctype="multipart/form-data">
<div class="mb-3">
<label class="form-label">How to handle duplicate DOIs:</label>
<div class="form-check">
<input class="form-check-input" type="radio" name="duplicate_strategy" value="skip" id="skip" checked>
<label class="form-check-label" for="skip">Skip duplicate entries</label>
</div>
<div class="form-check">
<input class="form-check-input" type="radio" name="duplicate_strategy" value="update" id="update">
<label class="form-check-label" for="update">Update existing entries</label>
</div>
</div>
<div class="form-group"> <div class="form-group">
<label for="file">Upload CSV File</label> <label for="file">Upload CSV File</label>
<input type="file" name="file" id="file" class="form-control" required> <input type="file" name="file" id="file" class="form-control" required>
@ -31,12 +70,4 @@
</div> </div>
<button type="submit" class="btn btn-primary mt-3">Upload</button> <button type="submit" class="btn btn-primary mt-3">Upload</button>
</form> </form>
{% if error %}
<div class="alert alert-danger mt-3">{{ error }}</div>
{% endif %}
{% if success %}
<div class="alert alert-success mt-3">{{ success }}</div>
{% endif %}
{% endblock %} {% endblock %}

View File

@ -13,8 +13,10 @@ from flask import (
render_template, render_template,
request, request,
send_file, send_file,
session, # Add this line
url_for, url_for,
) )
from sqlalchemy import asc, desc from sqlalchemy import asc, desc
from .db import db from .db import db
@ -36,6 +38,7 @@ def upload():
if request.method == "POST": if request.method == "POST":
file = request.files.get("file") file = request.files.get("file")
delimiter = request.form.get("delimiter", ",") delimiter = request.form.get("delimiter", ",")
duplicate_strategy = request.form.get("duplicate_strategy", "skip")
if not file: if not file:
return render_template("upload.html", error="No file selected.") return render_template("upload.html", error="No file selected.")
@ -62,12 +65,52 @@ def upload():
except Exception: except Exception:
return None return None
for _, row in df.iterrows(): # Count statistics
added_count = 0
skipped_count = 0
updated_count = 0
error_count = 0
# Collect error information
errors = []
# Process each row
for index, row in df.iterrows():
try:
# Get DOI from row for error reporting
doi = str(row.get("doi", "N/A"))
# Validate required fields
for field in ["title", "doi", "issn"]:
if pd.isna(row.get(field)) or not str(row.get(field)).strip():
raise ValueError(f"Missing required field: {field}")
# Check if paper with this DOI already exists
existing = PaperMetadata.query.filter_by(doi=doi).first()
if existing:
if duplicate_strategy == 'update':
# Update existing record
existing.title = row["title"]
existing.alt_id = row.get("alternative_id")
existing.issn = row["issn"]
existing.journal = row.get("journal")
existing.type = row.get("type")
existing.language = row.get("language")
existing.published_online = parse_date(row.get("published_online"))
updated_count += 1
else:
# Skip this record
skipped_count += 1
continue
else:
# Create new record
metadata = PaperMetadata( metadata = PaperMetadata(
title=row["title"], title=row["title"],
doi=row["doi"], doi=doi,
alt_id=row.get("alternative_id"), alt_id=row.get("alternative_id"),
issn=row["issn"], issn=row["issn"],
journal=row.get("journal"),
type=row.get("type"), type=row.get("type"),
language=row.get("language"), language=row.get("language"),
published_online=parse_date(row.get("published_online")), published_online=parse_date(row.get("published_online")),
@ -76,6 +119,16 @@ def upload():
error_msg=None, error_msg=None,
) )
db.session.add(metadata) db.session.add(metadata)
added_count += 1
except Exception as e:
error_count += 1
errors.append({
"row": index + 2, # +2 because index is 0-based and we have a header row
"doi": row.get("doi", "N/A"),
"error": str(e)
})
continue # Skip this row and continue with the next
try: try:
db.session.commit() db.session.commit()
@ -85,12 +138,46 @@ def upload():
"upload.html", error=f"Failed to save data to database: {e}" "upload.html", error=f"Failed to save data to database: {e}"
) )
# Prepare error samples for display
error_samples = errors[:5] if errors else []
error_message = None
if errors:
error_message = f"Encountered {len(errors)} errors. First 5 shown below."
# Store the full errors list in the session for potential download
if errors:
error_csv = StringIO()
writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
writer.writeheader()
writer.writerows(errors)
session["error_data"] = error_csv.getvalue()
return render_template( return render_template(
"upload.html", success="File uploaded and validated successfully!" "upload.html",
success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}",
error_message=error_message,
error_samples=error_samples
) )
return render_template("upload.html") return render_template("upload.html")
# Add a route to download the error log
@bp.route("/download_error_log")
def download_error_log():
    """Send the error log of the most recent CSV upload as a CSV attachment.

    The log is read from ``session["error_data"]``, which the upload view
    fills with CSV text (columns ``row``, ``doi``, ``error``) whenever rows
    fail to import.

    Returns:
        A file-download response containing the CSV, or a redirect to the
        upload page (with a flashed message) when the session holds no log.
    """
    # Local import: the module's import block is not guaranteed to provide
    # BytesIO, and a function-scope import keeps this block self-contained.
    from io import BytesIO

    error_data = session.get("error_data")
    if not error_data:
        flash("No error data available.")
        return redirect(url_for("main.upload"))

    # send_file() requires a binary file object; a text-mode StringIO is
    # rejected by Flask/Werkzeug 2.x, so encode the CSV text first.
    # NOTE(review): if the app uses Flask's default cookie-backed session,
    # a large error log may exceed the cookie size limit - confirm.
    buffer = BytesIO(error_data.encode("utf-8"))
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    return send_file(
        buffer,
        mimetype="text/csv",
        as_attachment=True,
        download_name=f"upload_errors_{timestamp}.csv",
    )
@bp.route("/papers") @bp.route("/papers")
def list_papers(): def list_papers():