# (export artifact — original file metadata: 170 lines, 5.8 KiB, Python)

"""Upload functionality for paper metadata."""
import codecs
import csv
import datetime
from io import BytesIO, StringIO

import pandas as pd
from flask import (
    Blueprint,
    flash,
    redirect,
    render_template,
    request,
    send_file,
    session,
    url_for,
)

from ..db import db
from ..models import PaperMetadata
# Flask blueprint grouping the upload-related routes defined in this module.
bp = Blueprint("upload", __name__)
# Columns that every uploaded CSV must contain; validated before row processing.
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
@bp.route("/", methods=["GET", "POST"])
def upload():
    """Upload paper metadata from a CSV file.

    GET renders the upload form. POST reads the uploaded file as UTF-8 CSV
    (delimiter taken from the form, default ","), validates that all
    REQUIRED_COLUMNS are present, then processes each row:

    * rows missing title/doi/issn are recorded as errors and skipped;
    * rows whose DOI already exists are updated or skipped depending on the
      ``duplicate_strategy`` form field ("update" / "skip", default "skip");
    * remaining rows are inserted as new ``PaperMetadata`` records.

    Per-row errors are summarized on the page (first 5) and the full error
    list is stored in the session as CSV for later download.
    NOTE(review): large error lists may exceed the cookie-session size limit
    if the app uses the default client-side session — confirm session backend.
    """
    if request.method != "POST":
        return render_template("upload.html")

    file = request.files.get("file")
    delimiter = request.form.get("delimiter", ",")
    duplicate_strategy = request.form.get("duplicate_strategy", "skip")
    if not file:
        return render_template("upload.html", error="No file selected.")

    try:
        stream = codecs.iterdecode(file.stream, "utf-8")
        content = "".join(stream)
        df = pd.read_csv(StringIO(content), delimiter=delimiter)
    except Exception as e:
        return render_template("upload.html", error=f"Failed to read CSV file: {e}")

    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        return render_template(
            "upload.html", error=f"Missing required columns: {', '.join(missing)}"
        )

    def parse_date(val):
        """Parse 'published_online' values to a date; None for NaN/unparseable."""
        if pd.isna(val):
            return None
        try:
            return pd.to_datetime(val).date()
        except Exception:
            return None

    def clean(val):
        """Normalize pandas missing values (NaN) to None.

        Without this, optional columns absent from a row would be persisted
        as float NaN instead of SQL NULL.
        """
        return None if pd.isna(val) else val

    # Per-row processing statistics.
    added_count = 0
    skipped_count = 0
    updated_count = 0
    error_count = 0
    errors = []  # one dict per failed row: {"row", "doi", "error"}

    for index, row in df.iterrows():
        try:
            # Capture the DOI up front so it is available for error reporting
            # even when later validation fails.
            doi = str(row.get("doi", "N/A"))

            # Hard requirements per row; empty/whitespace-only counts as missing.
            for field in ["title", "doi", "issn"]:
                if pd.isna(row.get(field)) or not str(row.get(field)).strip():
                    raise ValueError(f"Missing required field: {field}")

            existing = PaperMetadata.query.filter_by(doi=doi).first()
            if existing:
                if duplicate_strategy == 'update':
                    # Overwrite the existing record's fields with the CSV row.
                    existing.title = row["title"]
                    existing.alt_id = clean(row.get("alternative_id"))
                    existing.issn = row["issn"]
                    existing.journal = clean(row.get("journal"))
                    existing.type = clean(row.get("type"))
                    existing.language = clean(row.get("language"))
                    existing.published_online = parse_date(row.get("published_online"))
                    updated_count += 1
                else:
                    # duplicate_strategy == "skip": leave the existing record alone.
                    skipped_count += 1
                    continue
            else:
                metadata = PaperMetadata(
                    title=row["title"],
                    doi=doi,
                    alt_id=clean(row.get("alternative_id")),
                    issn=row["issn"],
                    journal=clean(row.get("journal")),
                    type=clean(row.get("type")),
                    language=clean(row.get("language")),
                    published_online=parse_date(row.get("published_online")),
                    status="New",
                    file_path=None,
                    error_msg=None,
                )
                db.session.add(metadata)
                added_count += 1
        except Exception as e:
            # Record the failure and move on to the next row.
            error_count += 1
            errors.append({
                "row": index + 2,  # +2: 0-based index plus the CSV header row
                "doi": row.get("doi", "N/A"),
                "error": str(e)
            })
            continue

    try:
        db.session.commit()
    except Exception as e:
        db.session.rollback()
        return render_template(
            "upload.html", error=f"Failed to save data to database: {e}"
        )

    # Summarize errors for display and stash the full list for CSV download.
    error_samples = errors[:5] if errors else []
    error_message = None
    if errors:
        error_message = f"Encountered {len(errors)} errors. First 5 shown below."
        error_csv = StringIO()
        writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"])
        writer.writeheader()
        writer.writerows(errors)
        session["error_data"] = error_csv.getvalue()

    return render_template(
        "upload.html",
        success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}",
        error_message=error_message,
        error_samples=error_samples
    )
@bp.route("/download_error_log")
def download_error_log():
    """Download the most recent upload error log (stored in the session) as CSV.

    Redirects back to the upload page with a flash message when no error
    data is available.
    """
    error_data = session.get("error_data")
    if not error_data:
        flash("No error data available.")
        return redirect(url_for("upload.upload"))
    # Fix: send_file requires a binary stream; passing a StringIO raises on
    # modern Flask/Werkzeug. Encode the CSV text and serve it as bytes.
    buffer = BytesIO(error_data.encode("utf-8"))
    return send_file(
        buffer,
        mimetype="text/csv",
        as_attachment=True,
        download_name=f"upload_errors_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    )