"""Upload functionality for paper metadata.""" import codecs import csv import datetime from io import StringIO import pandas as pd from flask import ( Blueprint, flash, redirect, render_template, request, send_file, session, url_for, ) from ..db import db from ..models import PaperMetadata bp = Blueprint("upload", __name__) REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"} @bp.route("/", methods=["GET", "POST"]) def upload(): if request.method == "POST": file = request.files.get("file") delimiter = request.form.get("delimiter", ",") duplicate_strategy = request.form.get("duplicate_strategy", "skip") if not file: return render_template("upload.html.jina", error="No file selected.") try: stream = codecs.iterdecode(file.stream, "utf-8") content = "".join(stream) df = pd.read_csv(StringIO(content), delimiter=delimiter) except Exception as e: return render_template("upload.html.jina", error=f"Failed to read CSV file: {e}") missing = REQUIRED_COLUMNS - set(df.columns) if missing: return render_template( "upload.html.jina", error=f"Missing required columns: {', '.join(missing)}" ) # Optional: parse 'published_online' to date def parse_date(val): if pd.isna(val): return None try: return pd.to_datetime(val).date() except Exception: return None # Count statistics added_count = 0 skipped_count = 0 updated_count = 0 error_count = 0 # Collect error information errors = [] # Process each row for index, row in df.iterrows(): try: # Get DOI from row for error reporting doi = str(row.get("doi", "N/A")) # Validate required fields for field in ["title", "doi", "issn"]: if pd.isna(row.get(field)) or not str(row.get(field)).strip(): raise ValueError(f"Missing required field: {field}") # Check if paper with this DOI already exists existing = PaperMetadata.query.filter_by(doi=doi).first() if existing: if duplicate_strategy == 'update': # Update existing record existing.title = row["title"] existing.alt_id = row.get("alternative_id") existing.issn = row["issn"] existing.journal = row.get("journal") existing.type = row.get("type") existing.language = row.get("language") existing.published_online = parse_date(row.get("published_online")) updated_count += 1 else: # Skip this record skipped_count += 1 continue else: # Create new record metadata = PaperMetadata( title=row["title"], doi=doi, alt_id=row.get("alternative_id"), issn=row["issn"], journal=row.get("journal"), type=row.get("type"), language=row.get("language"), published_online=parse_date(row.get("published_online")), status="New", file_path=None, error_msg=None, ) db.session.add(metadata) added_count += 1 except Exception as e: error_count += 1 errors.append({ "row": index + 2, # +2 because index is 0-based and we have a header row "doi": row.get("doi", "N/A"), "error": str(e) }) continue # Skip this row and continue with the next try: db.session.commit() except Exception as e: db.session.rollback() return render_template( "upload.html.jina", error=f"Failed to save data to database: {e}" ) # Prepare error samples for display error_samples = errors[:5] if errors else [] error_message = None if errors: error_message = f"Encountered {len(errors)} errors. First 5 shown below." # Store the full errors list in the session for potential download if errors: error_csv = StringIO() writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"]) writer.writeheader() writer.writerows(errors) session["error_data"] = error_csv.getvalue() return render_template( "upload.html.jina", success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}", error_message=error_message, error_samples=error_samples ) return render_template("upload.html.jina") @bp.route("/download_error_log") def download_error_log(): error_data = session.get("error_data") if not error_data: flash("No error data available.") return redirect(url_for("upload.upload")) buffer = StringIO(error_data) return send_file( buffer, mimetype="text/csv", as_attachment=True, download_name=f"upload_errors_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" )