adds csv importer

This commit is contained in:
Michael Beck 2025-04-01 21:13:55 +02:00
parent dd707e2a9f
commit 59b6404b99
5 changed files with 103 additions and 12 deletions

View File

@ -1,6 +1,6 @@
from .db import db
class Paper(db.Model):
class PaperMetadata(db.Model):
id = db.Column(db.Integer, primary_key=True)
title = db.Column(db.Text)
doi = db.Column(db.String, unique=True, index=True)
@ -8,10 +8,12 @@ class Paper(db.Model):
issn = db.Column(db.String(32))
type = db.Column(db.String(50))
language = db.Column(db.String(50))
published_date = db.Column(db.Date) # or DateTime/String
published_online = db.Column(db.Date) # or DateTime/String
status = db.Column(db.String(10)) # 'Pending','Done','Failed'
file_path = db.Column(db.Text)
error_msg = db.Column(db.Text)
created_at = db.Column(db.DateTime, default=db.func.current_timestamp())
updated_at = db.Column(db.DateTime, default=db.func.current_timestamp(), onupdate=db.func.current_timestamp())
# plus maybe timestamps for created/updated
class ScheduleConfig(db.Model):

View File

@ -1,6 +1,6 @@
import time
from .db import db
from .models import Paper
from .models import PaperMetadata
def run_scraper():
while True:

View File

@ -17,7 +17,7 @@
<div class="collapse navbar-collapse" id="navbarSupportedContent">
<ul class="navbar-nav me-auto mb-2 mb-lg-0">
<li class="nav-item">
<a class="nav-link" href="/import">Import CSV</a>
<a class="nav-link" href="/upload">Import CSV</a>
</li>
<li class="nav-item">
<a class="nav-link" href="/papers">Papers</a>

View File

@ -2,5 +2,41 @@
{% block content %}
<h1>Welcome to SciPaperLoader</h1>
<p>Your paper scraping tool is ready.</p>
<a href="{{ url_for('main.upload') }}" class="btn btn-primary">Upload CSV</a>
<div class="alert alert-info">
<p><strong>Instructions:</strong> Please upload a CSV file containing academic paper metadata. The file must include the following columns:</p>
<ul>
<li><code>alternative_id</code> an alternative title or abbreviation</li>
<li><code>journal</code> the journal name</li>
<li><code>doi</code> the digital object identifier</li>
<li><code>issn</code> the ISSN of the journal</li>
<li><code>title</code> the title of the paper</li>
</ul>
<p>The format of your CSV should resemble the response structure of the Crossref API's <code>/journals/{issn}/works</code> endpoint.</p>
</div>
<form method="POST" action="{{ url_for('main.upload') }}" enctype="multipart/form-data">
<div class="form-group">
<label for="file">Upload CSV File</label>
<input type="file" name="file" id="file" class="form-control" required>
</div>
<div class="form-group mt-3">
<label for="delimiter">Choose CSV Delimiter</label>
<select name="delimiter" id="delimiter" class="form-control">
<option value=",">Comma (,)</option>
<option value=";">Semicolon (;)</option>
<option value="\t">Tab (\t)</option>
<option value="|">Pipe (|)</option>
</select>
</div>
<button type="submit" class="btn btn-primary mt-3">Upload</button>
</form>
{% if error %}
<div class="alert alert-danger mt-3">{{ error }}</div>
{% endif %}
{% if success %}
<div class="alert alert-success mt-3">{{ success }}</div>
{% endif %}
{% endblock %}

View File

@ -1,6 +1,9 @@
from flask import Blueprint, render_template, current_app, request, flash, redirect, url_for
from .models import ScheduleConfig, VolumeConfig
from .models import ScheduleConfig, VolumeConfig, PaperMetadata
from .db import db
import pandas as pd
from io import StringIO
import codecs
bp = Blueprint('main', __name__)
@ -9,13 +12,63 @@ bp = Blueprint('main', __name__)
def index():
return render_template("index.html")
# Column headers every uploaded CSV must contain; upload() rejects a file
# with an error message if any of these are absent from its header row.
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
def _parse_date(val):
    """Return *val* as a ``datetime.date``, or ``None`` if missing or unparseable."""
    if pd.isna(val):
        return None
    try:
        return pd.to_datetime(val).date()
    except Exception:
        # Deliberately best-effort: one malformed date must not abort the import.
        return None


def _none_if_missing(val):
    """Map pandas' missing-value markers (NaN/NaT/None) to ``None``."""
    return None if pd.isna(val) else val


@bp.route("/upload", methods=["GET", "POST"])
def upload():
    """Render the CSV upload form and, on POST, import the uploaded rows.

    GET renders ``upload.html``. POST reads the uploaded ``file`` using the
    posted ``delimiter`` (default ``,``), checks that every column in
    ``REQUIRED_COLUMNS`` is present, creates one ``PaperMetadata`` per row and
    commits them in a single transaction.

    Returns:
        The rendered ``upload.html`` template, with ``error`` set when the
        file is missing, unreadable, lacks required columns or cannot be
        saved, and ``success`` set when all rows were committed.
    """
    if request.method != "POST":
        return render_template("upload.html")

    file = request.files.get("file")
    delimiter = request.form.get("delimiter", ",")
    # The HTML form submits the tab option as the two literal characters
    # backslash + "t"; translate it to a real tab so pandas does not have to
    # interpret it as a regular expression.
    if delimiter == "\\t":
        delimiter = "\t"

    if not file:
        return render_template("upload.html", error="No file selected.")

    try:
        # "utf-8-sig" strips a leading byte-order mark (common in spreadsheet
        # exports) so it cannot end up glued to the first column name.
        content = file.stream.read().decode("utf-8-sig")
        # dtype=str keeps identifiers such as ISSNs and alternative ids as
        # text instead of letting pandas coerce numeric-looking values.
        df = pd.read_csv(StringIO(content), delimiter=delimiter, dtype=str)
    except Exception as e:
        # Request boundary: any read/parse failure is reported to the user.
        return render_template("upload.html", error=f"Failed to read CSV file: {e}")

    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        # Sorted so the message is stable between requests.
        return render_template(
            "upload.html",
            error=f"Missing required columns: {', '.join(sorted(missing))}",
        )

    # NOTE(review): "journal" is required in the CSV but is not passed to
    # PaperMetadata here - confirm whether the model should store it.
    for _, row in df.iterrows():
        metadata = PaperMetadata(
            title=_none_if_missing(row["title"]),
            doi=_none_if_missing(row["doi"]),
            alt_id=_none_if_missing(row.get("alternative_id")),
            issn=_none_if_missing(row["issn"]),
            type=_none_if_missing(row.get("type")),
            language=_none_if_missing(row.get("language")),
            published_online=_parse_date(row.get("published_online")),
            status=None,
            file_path=None,
            error_msg=None,
        )
        db.session.add(metadata)

    try:
        # Single commit: if any row fails, none of the file is stored.
        db.session.commit()
    except Exception as e:
        db.session.rollback()
        return render_template("upload.html", error=f"Failed to save data to database: {e}")

    return render_template("upload.html", success="File uploaded and validated successfully!")
@bp.route("/papers")