# (export artifact — original file metadata: 170 lines, 5.8 KiB, Python)

"""Upload functionality for paper metadata."""
import codecs
import csv
import datetime
from io import BytesIO, StringIO

import pandas as pd
from flask import (
    Blueprint,
    flash,
    redirect,
    render_template,
    request,
    send_file,
    session,
    url_for,
)

from ..db import db
from ..models import PaperMetadata
# Flask blueprint grouping the upload-related routes defined in this module.
bp = Blueprint("upload", __name__)
# Columns that every uploaded CSV must contain; validated before row processing.
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
@bp.route("/", methods=["GET", "POST"])
def upload():
    """Upload paper metadata from a CSV file.

    GET renders the upload form. POST reads the uploaded file as UTF-8 CSV
    (delimiter taken from the form, default ","), validates that all
    REQUIRED_COLUMNS are present, then processes each row:

    * rows missing title/doi/issn are recorded as errors and skipped;
    * rows whose DOI already exists are updated or skipped depending on the
      ``duplicate_strategy`` form field ("update" / "skip", default "skip");
    * remaining rows are inserted as new ``PaperMetadata`` records.

    Per-row errors are summarized on the page (first 5) and the full error
    list is stored in the session as CSV for later download.
    NOTE(review): large error lists may exceed the cookie-session size limit
    if the app uses the default client-side session — confirm session backend.
    """
    if request.method != "POST":
        return render_template("upload.html")

    file = request.files.get("file")
    delimiter = request.form.get("delimiter", ",")
    duplicate_strategy = request.form.get("duplicate_strategy", "skip")
    if not file:
        return render_template("upload.html", error="No file selected.")

    try:
        stream = codecs.iterdecode(file.stream, "utf-8")
        content = "".join(stream)
        df = pd.read_csv(StringIO(content), delimiter=delimiter)
    except Exception as e:
        return render_template("upload.html", error=f"Failed to read CSV file: {e}")

    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        return render_template(
            "upload.html", error=f"Missing required columns: {', '.join(missing)}"
        )

    def parse_date(val):
        """Parse 'published_online' values to a date; None for NaN/unparseable."""
        if pd.isna(val):
            return None
        try:
            return pd.to_datetime(val).date()
        except Exception:
            return None

    def clean(val):
        """Normalize pandas missing values (NaN) to None.

        Without this, optional columns absent from a row would be persisted
        as float NaN instead of SQL NULL.
        """
        return None if pd.isna(val) else val

    # Per-row processing statistics.
    added_count = 0
    skipped_count = 0
    updated_count = 0
    error_count = 0
    errors = []  # one dict per failed row: {"row", "doi", "error"}

    for index, row in df.iterrows():
        try:
            # Capture the DOI up front so it is available for error reporting
            # even when later validation fails.
            doi = str(row.get("doi", "N/A"))

            # Hard requirements per row; empty/whitespace-only counts as missing.
            for field in ["title", "doi", "issn"]:
                if pd.isna(row.get(field)) or not str(row.get(field)).strip():
                    raise ValueError(f"Missing required field: {field}")

            existing = PaperMetadata.query.filter_by(doi=doi).first()
            if existing:
                if duplicate_strategy == 'update':
                    # Overwrite the existing record's fields with the CSV row.
                    existing.title = row["title"]
                    existing.alt_id = clean(row.get("alternative_id"))
                    existing.issn = row["issn"]
                    existing.journal = clean(row.get("journal"))
                    existing.type = clean(row.get("type"))
                    existing.language = clean(row.get("language"))
                    existing.published_online = parse_date(row.get("published_online"))
                    updated_count += 1
                else:
                    # duplicate_strategy == "skip": leave the existing record alone.
                    skipped_count += 1
                    continue
            else:
                metadata = PaperMetadata(
                    title=row["title"],
                    doi=doi,
                    alt_id=clean(row.get("alternative_id")),
                    issn=row["issn"],
                    journal=clean(row.get("journal")),
                    type=clean(row.get("type")),
                    language=clean(row.get("language")),
                    published_online=parse_date(row.get("published_online")),
                    status="New",
                    file_path=None,
                    error_msg=None,
                )
                db.session.add(metadata)
                added_count += 1
        except Exception as e:
            # Record the failure and move on to the next row.
            error_count += 1
            errors.append({
                "row": index + 2,  # +2: 0-based index plus the CSV header row
                "doi": row.get("doi", "N/A"),
                "error": str(e)
            })
            continue

    try:
        db.session.commit()
    except Exception as e:
        db.session.rollback()
        return render_template(
            "upload.html", error=f"Failed to save data to database: {e}"
        )

    # Summarize errors for display and stash the full list for CSV download.
    error_samples = errors[:5] if errors else []
    error_message = None
    if errors:
        error_message = f"Encountered {len(errors)} errors. First 5 shown below."
        error_csv = StringIO()
        writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
        writer.writeheader()
        writer.writerows(errors)
        session["error_data"] = error_csv.getvalue()

    return render_template(
        "upload.html",
        success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}",
        error_message=error_message,
        error_samples=error_samples
    )
@bp.route("/download_error_log")
def download_error_log():
    """Download the most recent upload error log (stored in the session) as CSV.

    Redirects back to the upload page with a flash message when no error
    data is available.
    """
    error_data = session.get("error_data")
    if not error_data:
        flash("No error data available.")
        return redirect(url_for("upload.upload"))
    # Fix: send_file requires a binary stream; passing a StringIO raises on
    # modern Flask/Werkzeug. Encode the CSV text and serve it as bytes.
    buffer = BytesIO(error_data.encode("utf-8"))
    return send_file(
        buffer,
        mimetype="text/csv",
        as_attachment=True,
        download_name=f"upload_errors_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    )