adds csv importer

This commit is contained in:
Michael Beck 2025-04-01 21:13:55 +02:00
parent dd707e2a9f
commit 59b6404b99
5 changed files with 103 additions and 12 deletions

View File

@ -1,6 +1,6 @@
from .db import db from .db import db
class Paper(db.Model): class PaperMetadata(db.Model):
id = db.Column(db.Integer, primary_key=True) id = db.Column(db.Integer, primary_key=True)
title = db.Column(db.Text) title = db.Column(db.Text)
doi = db.Column(db.String, unique=True, index=True) doi = db.Column(db.String, unique=True, index=True)
@ -8,10 +8,12 @@ class Paper(db.Model):
issn = db.Column(db.String(32)) issn = db.Column(db.String(32))
type = db.Column(db.String(50)) type = db.Column(db.String(50))
language = db.Column(db.String(50)) language = db.Column(db.String(50))
published_date = db.Column(db.Date) # or DateTime/String published_online = db.Column(db.Date) # or DateTime/String
status = db.Column(db.String(10)) # 'Pending','Done','Failed' status = db.Column(db.String(10)) # 'Pending','Done','Failed'
file_path = db.Column(db.Text) file_path = db.Column(db.Text)
error_msg = db.Column(db.Text) error_msg = db.Column(db.Text)
created_at = db.Column(db.DateTime, default=db.func.current_timestamp())
updated_at = db.Column(db.DateTime, default=db.func.current_timestamp(), onupdate=db.func.current_timestamp())
# plus maybe timestamps for created/updated # plus maybe timestamps for created/updated
class ScheduleConfig(db.Model): class ScheduleConfig(db.Model):

View File

@ -1,6 +1,6 @@
import time import time
from .db import db from .db import db
from .models import Paper from .models import PaperMetadata
def run_scraper(): def run_scraper():
while True: while True:

View File

@ -17,7 +17,7 @@
<div class="collapse navbar-collapse" id="navbarSupportedContent"> <div class="collapse navbar-collapse" id="navbarSupportedContent">
<ul class="navbar-nav me-auto mb-2 mb-lg-0"> <ul class="navbar-nav me-auto mb-2 mb-lg-0">
<li class="nav-item"> <li class="nav-item">
<a class="nav-link" href="/import">Import CSV</a> <a class="nav-link" href="/upload">Import CSV</a>
</li> </li>
<li class="nav-item"> <li class="nav-item">
<a class="nav-link" href="/papers">Papers</a> <a class="nav-link" href="/papers">Papers</a>

View File

@ -2,5 +2,41 @@
{% block content %}
{# CSV upload page: explains the expected columns, then posts the file and
   the chosen delimiter to the `main.upload` view. The view passes back
   `error` or `success` to render the result banner below the form. #}
<h1>Welcome to SciPaperLoader</h1>
<p>Your paper scraping tool is ready.</p>
<div class="alert alert-info">
  <p><strong>Instructions:</strong> Please upload a CSV file containing academic paper metadata. The file must include the following columns:</p>
  <ul>
    <li><code>alternative_id</code> &ndash; an alternative title or abbreviation</li>
    <li><code>journal</code> &ndash; the journal name</li>
    <li><code>doi</code> &ndash; the digital object identifier</li>
    <li><code>issn</code> &ndash; the ISSN of the journal</li>
    <li><code>title</code> &ndash; the title of the paper</li>
  </ul>
  <p>The format of your CSV should resemble the response structure of the Crossref API's <code>/journals/{issn}/works</code> endpoint.</p>
</div>
<form method="POST" action="{{ url_for('main.upload') }}" enctype="multipart/form-data">
  <div class="form-group">
    <label for="file">Upload CSV File</label>
    <input type="file" name="file" id="file" class="form-control" required>
  </div>
  <div class="form-group mt-3">
    <label for="delimiter">Choose CSV Delimiter</label>
    <select name="delimiter" id="delimiter" class="form-control">
      <option value=",">Comma (,)</option>
      <option value=";">Semicolon (;)</option>
      {# HTML has no backslash escapes: "\t" would be submitted as the two
         characters backslash + t. &#9; is a real tab character. #}
      <option value="&#9;">Tab (\t)</option>
      <option value="|">Pipe (|)</option>
    </select>
  </div>
  <button type="submit" class="btn btn-primary mt-3">Upload</button>
</form>
{% if error %}
<div class="alert alert-danger mt-3">{{ error }}</div>
{% endif %}
{% if success %}
<div class="alert alert-success mt-3">{{ success }}</div>
{% endif %}
{% endblock %}

View File

@ -1,6 +1,9 @@
from flask import Blueprint, render_template, current_app, request, flash, redirect, url_for from flask import Blueprint, render_template, current_app, request, flash, redirect, url_for
from .models import ScheduleConfig, VolumeConfig from .models import ScheduleConfig, VolumeConfig, PaperMetadata
from .db import db from .db import db
import pandas as pd
from io import StringIO
import codecs
bp = Blueprint('main', __name__) bp = Blueprint('main', __name__)
@ -9,13 +12,63 @@ bp = Blueprint('main', __name__)
def index(): def index():
return render_template("index.html") return render_template("index.html")
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
def _none_if_missing(value):
    """Return ``None`` for pandas missing markers (NaN/NaT/None), else *value*."""
    return None if pd.isna(value) else value


def _parse_date(value):
    """Convert a CSV cell to a ``datetime.date``.

    Returns ``None`` when the cell is missing or cannot be parsed, so an
    unparseable date never aborts the whole import.
    """
    if pd.isna(value):
        return None
    try:
        parsed = pd.to_datetime(value)
    except (ValueError, TypeError, OverflowError):
        return None
    # to_datetime can yield NaT (e.g. for the string "NaT"); NaT.date() is
    # still NaT, which must not reach the Date column.
    return None if pd.isna(parsed) else parsed.date()


@bp.route('/upload', methods=['GET', 'POST'])
def upload():
    """Render the CSV upload form and, on POST, import the uploaded rows.

    Form fields read on POST:
        file: the uploaded CSV, decoded as UTF-8.
        delimiter: a single separator character (default ``,``). The
            two-character text ``\\t`` is accepted as an alias for a tab.

    Every column in ``REQUIRED_COLUMNS`` must be present. Each row becomes a
    ``PaperMetadata`` record; all rows are committed in one transaction and
    rolled back together on failure.

    Returns:
        The rendered ``upload.html`` template, with ``error`` or ``success``
        set after a POST.
    """
    if request.method != 'POST':
        return render_template('upload.html')

    file = request.files.get('file')
    delimiter = request.form.get('delimiter', ',')

    if not file:
        return render_template('upload.html', error="No file selected.")

    # An HTML form cannot express escapes, so a tab may arrive as the literal
    # characters backslash + "t"; translate it to a real tab.
    if delimiter == '\\t':
        delimiter = '\t'
    # pandas interprets multi-character separators as regular expressions;
    # never pass user-controlled regexes to the parser.
    if len(delimiter) != 1:
        return render_template(
            'upload.html',
            error="Failed to read CSV file: delimiter must be a single character.",
        )

    try:
        content = file.stream.read().decode('utf-8')
        # dtype=str keeps identifiers such as ISSNs/DOIs verbatim instead of
        # letting type inference turn them into numbers (dropping leading zeros).
        df = pd.read_csv(StringIO(content), delimiter=delimiter, dtype=str)
    except Exception as e:
        # Request boundary: any decode/parse failure is reported to the user.
        return render_template('upload.html', error=f"Failed to read CSV file: {e}")

    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        return render_template(
            'upload.html',
            error=f"Missing required columns: {', '.join(sorted(missing))}",
        )

    try:
        for _, row in df.iterrows():
            # Empty cells are NaN in pandas; store them as NULL instead.
            metadata = PaperMetadata(
                title=_none_if_missing(row['title']),
                doi=_none_if_missing(row['doi']),
                alt_id=_none_if_missing(row.get('alternative_id')),
                issn=_none_if_missing(row['issn']),
                type=_none_if_missing(row.get('type')),
                language=_none_if_missing(row.get('language')),
                published_online=_parse_date(row.get('published_online')),
                status=None,
                file_path=None,
                error_msg=None,
            )
            db.session.add(metadata)
        db.session.commit()
    except Exception as e:
        # Covers both model construction and commit errors (e.g. a duplicate
        # DOI); roll back so the session stays usable for later requests.
        db.session.rollback()
        return render_template('upload.html', error=f"Failed to save data to database: {e}")

    return render_template('upload.html', success="File uploaded and validated successfully!")
@bp.route("/papers") @bp.route("/papers")