diff --git a/scipaperloader/templates/upload.html b/scipaperloader/templates/upload.html
index e83533b..5c23d20 100644
--- a/scipaperloader/templates/upload.html
+++ b/scipaperloader/templates/upload.html
@@ -1,7 +1,35 @@
{% extends 'base.html' %}
{% block content %}
Welcome to SciPaperLoader
-Your paper scraping tool is ready.
+
+{% if success %}
+ {{ success }}
+{% endif %}
+
+{% if error_message %}
+
+
{{ error_message }}
+
+
+
+ Row |
+ DOI |
+ Error |
+
+
+
+ {% for error in error_samples %}
+
+ {{ error.row }} |
+ {{ error.doi }} |
+ {{ error.error }} |
+
+ {% endfor %}
+
+
+
Download Full Error Log
+
+{% endif %}
Instructions: Please upload a CSV file containing academic paper metadata. The file must include the following columns:
@@ -16,6 +44,17 @@
-
-{% if error %}
- {{ error }}
-{% endif %}
-
-{% if success %}
- {{ success }}
-{% endif %}
{% endblock %}
diff --git a/scipaperloader/views.py b/scipaperloader/views.py
index 128737b..1720c2a 100644
--- a/scipaperloader/views.py
+++ b/scipaperloader/views.py
@@ -13,8 +13,10 @@ from flask import (
render_template,
request,
send_file,
+ session, # Add this line
url_for,
)
+
from sqlalchemy import asc, desc
from .db import db
@@ -36,6 +38,7 @@ def upload():
if request.method == "POST":
file = request.files.get("file")
delimiter = request.form.get("delimiter", ",")
+ duplicate_strategy = request.form.get("duplicate_strategy", "skip")
if not file:
return render_template("upload.html", error="No file selected.")
@@ -62,20 +65,70 @@ def upload():
except Exception:
return None
- for _, row in df.iterrows():
- metadata = PaperMetadata(
- title=row["title"],
- doi=row["doi"],
- alt_id=row.get("alternative_id"),
- issn=row["issn"],
- type=row.get("type"),
- language=row.get("language"),
- published_online=parse_date(row.get("published_online")),
- status="New",
- file_path=None,
- error_msg=None,
- )
- db.session.add(metadata)
+ # Count statistics
+ added_count = 0
+ skipped_count = 0
+ updated_count = 0
+ error_count = 0
+
+ # Collect error information
+ errors = []
+
+ # Process each row
+ for index, row in df.iterrows():
+ try:
+ # Get DOI from row for error reporting
+ doi = str(row.get("doi", "N/A"))
+
+ # Validate required fields
+ for field in ["title", "doi", "issn"]:
+ if pd.isna(row.get(field)) or not str(row.get(field)).strip():
+ raise ValueError(f"Missing required field: {field}")
+
+ # Check if paper with this DOI already exists
+ existing = PaperMetadata.query.filter_by(doi=doi).first()
+
+ if existing:
+ if duplicate_strategy == 'update':
+ # Update existing record
+ existing.title = row["title"]
+ existing.alt_id = row.get("alternative_id")
+ existing.issn = row["issn"]
+ existing.journal = row.get("journal")
+ existing.type = row.get("type")
+ existing.language = row.get("language")
+ existing.published_online = parse_date(row.get("published_online"))
+ updated_count += 1
+ else:
+ # Skip this record
+ skipped_count += 1
+ continue
+ else:
+ # Create new record
+ metadata = PaperMetadata(
+ title=row["title"],
+ doi=doi,
+ alt_id=row.get("alternative_id"),
+ issn=row["issn"],
+ journal=row.get("journal"),
+ type=row.get("type"),
+ language=row.get("language"),
+ published_online=parse_date(row.get("published_online")),
+ status="New",
+ file_path=None,
+ error_msg=None,
+ )
+ db.session.add(metadata)
+ added_count += 1
+
+ except Exception as e:
+ error_count += 1
+ errors.append({
+ "row": index + 2, # +2 because index is 0-based and we have a header row
+ "doi": row.get("doi", "N/A"),
+ "error": str(e)
+ })
+ continue # Skip this row and continue with the next
try:
db.session.commit()
@@ -85,12 +138,46 @@ def upload():
"upload.html", error=f"Failed to save data to database: {e}"
)
+ # Prepare error samples for display
+ error_samples = errors[:5] if errors else []
+
+ error_message = None
+ if errors:
+ error_message = f"Encountered {len(errors)} errors. First 5 shown below."
+
+ # Store the full errors list in the session for potential download
+ if errors:
+ error_csv = StringIO()
+ writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
+ writer.writeheader()
+ writer.writerows(errors)
+ session["error_data"] = error_csv.getvalue()
+
return render_template(
- "upload.html", success="File uploaded and validated successfully!"
+ "upload.html",
+ success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}",
+ error_message=error_message,
+ error_samples=error_samples
)
return render_template("upload.html")
+# Add a route to download the error log
+@bp.route("/download_error_log")
+def download_error_log():
+    """Send the error CSV that the last upload stored in the session, or redirect to the upload page if none exists."""
+    from io import BytesIO  # send_file only accepts binary streams; a StringIO raises ValueError
+    error_data = session.get("error_data")
+    if not error_data:
+        flash("No error data available.")
+        return redirect(url_for("main.upload"))
+    return send_file(
+        BytesIO(error_data.encode("utf-8")),  # the session holds the CSV as text; encode it for the response
+        mimetype="text/csv",
+        as_attachment=True,
+        download_name=f"upload_errors_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
+    )
+
@bp.route("/papers")
def list_papers():