170 lines
5.8 KiB
Python
170 lines
5.8 KiB
Python
"""Upload functionality for paper metadata."""
|
|
import codecs
|
|
import csv
|
|
import datetime
|
|
from io import StringIO
|
|
|
|
import pandas as pd
|
|
from flask import (
|
|
Blueprint,
|
|
flash,
|
|
redirect,
|
|
render_template,
|
|
request,
|
|
send_file,
|
|
session,
|
|
url_for,
|
|
)
|
|
|
|
from ..db import db
|
|
from ..models import PaperMetadata
|
|
|
|
# Blueprint under which this module's routes (the upload form and the
# error-log download) are registered.
bp = Blueprint("upload", __name__)
|
|
|
|
# Column headers an uploaded CSV must contain; the upload view rejects a file
# when any of these is absent from its header row.
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
|
|
|
|
|
|
def _clean_cell(value):
    """Return a CSV cell as stripped text, or None when it is missing or blank."""
    if pd.isna(value):
        return None
    text = str(value).strip()
    return text or None


def _parse_date(value):
    """Convert a cell to a ``datetime.date``; return None when missing or unparseable."""
    if pd.isna(value):
        return None
    try:
        parsed = pd.to_datetime(value)
    except Exception:
        # 'published_online' is optional, so a bad date is dropped rather than
        # failing the whole row.
        return None
    # to_datetime can yield NaT (e.g. for the text "NaT"); treat it as missing.
    return None if pd.isna(parsed) else parsed.date()


def _row_fields(row):
    """Map one CSV row to the PaperMetadata attributes that are set on add and update."""
    return {
        "title": _clean_cell(row.get("title")),
        "alt_id": _clean_cell(row.get("alternative_id")),
        "issn": _clean_cell(row.get("issn")),
        "journal": _clean_cell(row.get("journal")),
        "type": _clean_cell(row.get("type")),
        "language": _clean_cell(row.get("language")),
        "published_online": _parse_date(row.get("published_online")),
    }


@bp.route("/", methods=["GET", "POST"])
def upload():
    """Render the upload form and import an uploaded CSV of paper metadata.

    GET renders ``upload.html.jinja``.

    POST reads these request values:
      * ``file`` -- the uploaded CSV (UTF-8; a leading BOM is tolerated).
      * ``delimiter`` -- CSV delimiter, defaults to ``","``.
      * ``duplicate_strategy`` -- ``"update"`` overwrites a record whose DOI
        already exists; any other value (default ``"skip"``) leaves it as is.

    Every row needs a non-blank ``title``, ``doi`` and ``issn``. Rows that fail
    are collected (row number, DOI, message) instead of aborting the import;
    the first five are passed to the template and the full list is stored as
    CSV text in ``session["error_data"]`` for the download route.

    Returns:
        The rendered ``upload.html.jinja`` template, with ``error`` set on a
        fatal problem, or ``success`` / ``error_message`` / ``error_samples``
        after processing.
    """
    if request.method != "POST":
        return render_template("upload.html.jinja")

    # Drop any error log left over from a previous upload so the download
    # route never serves stale data.
    session.pop("error_data", None)

    file = request.files.get("file")
    # An empty form value falls back to a comma instead of reaching pandas.
    delimiter = request.form.get("delimiter") or ","
    duplicate_strategy = request.form.get("duplicate_strategy", "skip")

    if not file:
        return render_template("upload.html.jinja", error="No file selected.")

    try:
        content = file.stream.read().decode("utf-8-sig")
        # dtype=str keeps identifiers (ISSN, alternative id, ...) as text;
        # letting pandas infer types would turn e.g. "00280836" into 280836.
        df = pd.read_csv(StringIO(content), delimiter=delimiter, dtype=str)
    except Exception as e:
        return render_template("upload.html.jinja", error=f"Failed to read CSV file: {e}")

    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        # Sorted so the message is stable between requests.
        return render_template(
            "upload.html.jinja",
            error=f"Missing required columns: {', '.join(sorted(missing))}",
        )

    added_count = 0
    skipped_count = 0
    updated_count = 0
    errors = []

    for index, row in df.iterrows():
        # Resolved outside the try so it is available for error reporting.
        doi = _clean_cell(row.get("doi"))
        try:
            fields = _row_fields(row)

            required = (("title", fields["title"]), ("doi", doi), ("issn", fields["issn"]))
            for field, value in required:
                if not value:
                    raise ValueError(f"Missing required field: {field}")

            existing = PaperMetadata.query.filter_by(doi=doi).first()
            if existing is None:
                db.session.add(
                    PaperMetadata(
                        doi=doi,
                        status="New",
                        file_path=None,
                        error_msg=None,
                        **fields,
                    )
                )
                added_count += 1
            elif duplicate_strategy == "update":
                for attr, value in fields.items():
                    setattr(existing, attr, value)
                updated_count += 1
            else:
                skipped_count += 1
        except Exception as e:
            # A bad row is recorded and skipped; the rest of the file still loads.
            errors.append(
                {
                    "row": index + 2,  # +2: index is 0-based and the file has a header row
                    "doi": doi or "N/A",
                    "error": str(e),
                }
            )

    error_count = len(errors)

    try:
        db.session.commit()
    except Exception as e:
        db.session.rollback()
        return render_template(
            "upload.html.jinja", error=f"Failed to save data to database: {e}"
        )

    error_message = None
    if errors:
        error_message = f"Encountered {len(errors)} errors. First 5 shown below."

        # Keep the full list available for the error-log download.
        # NOTE(review): if the app uses Flask's default cookie-backed session,
        # a long error list may exceed the cookie size limit -- confirm the
        # session backend.
        error_csv = StringIO()
        writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
        writer.writeheader()
        writer.writerows(errors)
        session["error_data"] = error_csv.getvalue()

    return render_template(
        "upload.html.jinja",
        success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}",
        error_message=error_message,
        error_samples=errors[:5],
    )
|
|
|
|
|
|
@bp.route("/download_error_log")
def download_error_log():
    """Send the error log of the last upload as a CSV attachment.

    The log is the CSV text stored in ``session["error_data"]``. When nothing
    is stored, a message is flashed and the client is redirected to the
    upload page.

    Returns:
        A ``text/csv`` attachment named ``upload_errors_<timestamp>.csv``, or
        a redirect to ``upload.upload``.
    """
    # Local import keeps this block self-contained; the module only imports
    # StringIO at file level.
    from io import BytesIO

    error_data = session.get("error_data")
    if not error_data:
        flash("No error data available.")
        return redirect(url_for("upload.upload"))

    # send_file needs a binary file object; a text-mode StringIO is rejected,
    # so the stored text is encoded to bytes first.
    buffer = BytesIO(error_data.encode("utf-8"))
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    return send_file(
        buffer,
        mimetype="text/csv",
        as_attachment=True,
        download_name=f"upload_errors_{timestamp}.csv",
    )