added error handling to csv import
This commit is contained in:
parent
d2b99ec0d7
commit
3a73aaf8aa
@ -1,7 +1,35 @@
|
||||
{% extends 'base.html' %}
|
||||
{% block content %}
|
||||
<h1>Welcome to SciPaperLoader</h1>
|
||||
<p>Your paper scraping tool is ready.</p>
|
||||
|
||||
{% if success %}
|
||||
<div class="alert alert-success mt-3">{{ success }}</div>
|
||||
{% endif %}
|
||||
|
||||
{% if error_message %}
|
||||
<div class="alert alert-warning mt-3">
|
||||
<h4>{{ error_message }}</h4>
|
||||
<table class="table table-sm table-bordered">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Row</th>
|
||||
<th>DOI</th>
|
||||
<th>Error</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for error in error_samples %}
|
||||
<tr>
|
||||
<td>{{ error.row }}</td>
|
||||
<td>{{ error.doi }}</td>
|
||||
<td>{{ error.error }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
<a href="{{ url_for('main.download_error_log') }}" class="btn btn-outline-secondary">Download Full Error Log</a>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<div class="alert alert-info">
|
||||
<p><strong>Instructions:</strong> Please upload a CSV file containing academic paper metadata. The file must include the following columns:</p>
|
||||
@ -16,6 +44,17 @@
|
||||
</div>
|
||||
|
||||
<form method="POST" action="{{ url_for('main.upload') }}" enctype="multipart/form-data">
|
||||
<div class="mb-3">
|
||||
<label class="form-label">How to handle duplicate DOIs:</label>
|
||||
<div class="form-check">
|
||||
<input class="form-check-input" type="radio" name="duplicate_strategy" value="skip" id="skip" checked>
|
||||
<label class="form-check-label" for="skip">Skip duplicate entries</label>
|
||||
</div>
|
||||
<div class="form-check">
|
||||
<input class="form-check-input" type="radio" name="duplicate_strategy" value="update" id="update">
|
||||
<label class="form-check-label" for="update">Update existing entries</label>
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label for="file">Upload CSV File</label>
|
||||
<input type="file" name="file" id="file" class="form-control" required>
|
||||
@ -31,12 +70,4 @@
|
||||
</div>
|
||||
<button type="submit" class="btn btn-primary mt-3">Upload</button>
|
||||
</form>
|
||||
|
||||
{% if error %}
|
||||
<div class="alert alert-danger mt-3">{{ error }}</div>
|
||||
{% endif %}
|
||||
|
||||
{% if success %}
|
||||
<div class="alert alert-success mt-3">{{ success }}</div>
|
||||
{% endif %}
|
||||
{% endblock %}
|
||||
|
@ -13,8 +13,10 @@ from flask import (
|
||||
render_template,
|
||||
request,
|
||||
send_file,
|
||||
session, # Add this line
|
||||
url_for,
|
||||
)
|
||||
|
||||
from sqlalchemy import asc, desc
|
||||
|
||||
from .db import db
|
||||
@ -36,6 +38,7 @@ def upload():
|
||||
if request.method == "POST":
|
||||
file = request.files.get("file")
|
||||
delimiter = request.form.get("delimiter", ",")
|
||||
duplicate_strategy = request.form.get("duplicate_strategy", "skip")
|
||||
|
||||
if not file:
|
||||
return render_template("upload.html", error="No file selected.")
|
||||
@ -62,12 +65,52 @@ def upload():
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
for _, row in df.iterrows():
|
||||
# Count statistics
|
||||
added_count = 0
|
||||
skipped_count = 0
|
||||
updated_count = 0
|
||||
error_count = 0
|
||||
|
||||
# Collect error information
|
||||
errors = []
|
||||
|
||||
# Process each row
|
||||
for index, row in df.iterrows():
|
||||
try:
|
||||
# Get DOI from row for error reporting
|
||||
doi = str(row.get("doi", "N/A"))
|
||||
|
||||
# Validate required fields
|
||||
for field in ["title", "doi", "issn"]:
|
||||
if pd.isna(row.get(field)) or not str(row.get(field)).strip():
|
||||
raise ValueError(f"Missing required field: {field}")
|
||||
|
||||
# Check if paper with this DOI already exists
|
||||
existing = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
|
||||
if existing:
|
||||
if duplicate_strategy == 'update':
|
||||
# Update existing record
|
||||
existing.title = row["title"]
|
||||
existing.alt_id = row.get("alternative_id")
|
||||
existing.issn = row["issn"]
|
||||
existing.journal = row.get("journal")
|
||||
existing.type = row.get("type")
|
||||
existing.language = row.get("language")
|
||||
existing.published_online = parse_date(row.get("published_online"))
|
||||
updated_count += 1
|
||||
else:
|
||||
# Skip this record
|
||||
skipped_count += 1
|
||||
continue
|
||||
else:
|
||||
# Create new record
|
||||
metadata = PaperMetadata(
|
||||
title=row["title"],
|
||||
doi=row["doi"],
|
||||
doi=doi,
|
||||
alt_id=row.get("alternative_id"),
|
||||
issn=row["issn"],
|
||||
journal=row.get("journal"),
|
||||
type=row.get("type"),
|
||||
language=row.get("language"),
|
||||
published_online=parse_date(row.get("published_online")),
|
||||
@ -76,6 +119,16 @@ def upload():
|
||||
error_msg=None,
|
||||
)
|
||||
db.session.add(metadata)
|
||||
added_count += 1
|
||||
|
||||
except Exception as e:
|
||||
error_count += 1
|
||||
errors.append({
|
||||
"row": index + 2, # +2 because index is 0-based and we have a header row
|
||||
"doi": row.get("doi", "N/A"),
|
||||
"error": str(e)
|
||||
})
|
||||
continue # Skip this row and continue with the next
|
||||
|
||||
try:
|
||||
db.session.commit()
|
||||
@ -85,12 +138,46 @@ def upload():
|
||||
"upload.html", error=f"Failed to save data to database: {e}"
|
||||
)
|
||||
|
||||
# Prepare error samples for display
|
||||
error_samples = errors[:5] if errors else []
|
||||
|
||||
error_message = None
|
||||
if errors:
|
||||
error_message = f"Encountered {len(errors)} errors. First 5 shown below."
|
||||
|
||||
# Store the full errors list in the session for potential download
|
||||
if errors:
|
||||
error_csv = StringIO()
|
||||
writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
|
||||
writer.writeheader()
|
||||
writer.writerows(errors)
|
||||
session["error_data"] = error_csv.getvalue()
|
||||
|
||||
return render_template(
|
||||
"upload.html", success="File uploaded and validated successfully!"
|
||||
"upload.html",
|
||||
success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}",
|
||||
error_message=error_message,
|
||||
error_samples=error_samples
|
||||
)
|
||||
|
||||
return render_template("upload.html")
|
||||
|
||||
# Add a route to download the error log
|
||||
@bp.route("/download_error_log")
def download_error_log():
    """Send the stored upload error log to the client as a CSV attachment.

    Reads the CSV text kept under ``session["error_data"]`` (written by the
    upload view when rows fail to import). When nothing is stored, a message
    is flashed and the client is redirected to the upload page.

    Returns:
        A file-download response containing the error CSV, or a redirect
        response to ``main.upload`` when no error data is available.
    """
    # Local import keeps this block self-contained; only the text-mode
    # StringIO is known to be imported at module level.
    from io import BytesIO

    error_data = session.get("error_data")
    if not error_data:
        flash("No error data available.")
        return redirect(url_for("main.upload"))

    # send_file needs a *binary* file object: a text-mode StringIO is rejected
    # by Flask/Werkzeug 2.x, so encode the CSV text before streaming it.
    buffer = BytesIO(error_data.encode("utf-8"))
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    return send_file(
        buffer,
        mimetype="text/csv",
        as_attachment=True,
        download_name=f"upload_errors_{timestamp}.csv",
    )
|
||||
|
||||
|
||||
@bp.route("/papers")
|
||||
def list_papers():
|
||||
|
Loading…
x
Reference in New Issue
Block a user