adds csv importer
This commit is contained in:
parent
dd707e2a9f
commit
59b6404b99
@ -1,6 +1,6 @@
|
||||
from .db import db
|
||||
|
||||
class Paper(db.Model):
|
||||
class PaperMetadata(db.Model):
|
||||
id = db.Column(db.Integer, primary_key=True)
|
||||
title = db.Column(db.Text)
|
||||
doi = db.Column(db.String, unique=True, index=True)
|
||||
@ -8,10 +8,12 @@ class Paper(db.Model):
|
||||
issn = db.Column(db.String(32))
|
||||
type = db.Column(db.String(50))
|
||||
language = db.Column(db.String(50))
|
||||
published_date = db.Column(db.Date) # or DateTime/String
|
||||
published_online = db.Column(db.Date) # or DateTime/String
|
||||
status = db.Column(db.String(10)) # 'Pending','Done','Failed'
|
||||
file_path = db.Column(db.Text)
|
||||
error_msg = db.Column(db.Text)
|
||||
created_at = db.Column(db.DateTime, default=db.func.current_timestamp())
|
||||
updated_at = db.Column(db.DateTime, default=db.func.current_timestamp(), onupdate=db.func.current_timestamp())
|
||||
# created_at / updated_at above default to the current DB timestamp; updated_at is also refreshed on update
|
||||
|
||||
class ScheduleConfig(db.Model):
|
||||
|
@ -1,6 +1,6 @@
|
||||
import time
|
||||
from .db import db
|
||||
from .models import Paper
|
||||
from .models import PaperMetadata
|
||||
|
||||
def run_scraper():
|
||||
while True:
|
||||
|
@ -17,7 +17,7 @@
|
||||
<div class="collapse navbar-collapse" id="navbarSupportedContent">
|
||||
<ul class="navbar-nav me-auto mb-2 mb-lg-0">
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="/import">Import CSV</a>
|
||||
<a class="nav-link" href="/upload">Import CSV</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="/papers">Papers</a>
|
||||
|
@ -2,5 +2,41 @@
|
||||
{% block content %}
|
||||
<h1>Welcome to SciPaperLoader</h1>
|
||||
<p>Your paper scraping tool is ready.</p>
|
||||
<a href="{{ url_for('main.upload') }}" class="btn btn-primary">Upload CSV</a>
|
||||
|
||||
<div class="alert alert-info">
|
||||
<p><strong>Instructions:</strong> Please upload a CSV file containing academic paper metadata. The file must include the following columns:</p>
|
||||
<ul>
|
||||
<li><code>alternative_id</code> – an alternative title or abbreviation</li>
|
||||
<li><code>journal</code> – the journal name</li>
|
||||
<li><code>doi</code> – the digital object identifier</li>
|
||||
<li><code>issn</code> – the ISSN of the journal</li>
|
||||
<li><code>title</code> – the title of the paper</li>
|
||||
</ul>
|
||||
<p>The format of your CSV should resemble the response structure of the Crossref API's <code>/journals/{issn}/works</code> endpoint.</p>
|
||||
</div>
|
||||
|
||||
<form method="POST" action="{{ url_for('main.upload') }}" enctype="multipart/form-data">
|
||||
<div class="form-group">
|
||||
<label for="file">Upload CSV File</label>
|
||||
<input type="file" name="file" id="file" class="form-control" required>
|
||||
</div>
|
||||
<div class="form-group mt-3">
|
||||
<label for="delimiter">Choose CSV Delimiter</label>
|
||||
<select name="delimiter" id="delimiter" class="form-control">
|
||||
<option value=",">Comma (,)</option>
|
||||
<option value=";">Semicolon (;)</option>
|
||||
<option value="\t">Tab (\t)</option>
|
||||
<option value="|">Pipe (|)</option>
|
||||
</select>
|
||||
</div>
|
||||
<button type="submit" class="btn btn-primary mt-3">Upload</button>
|
||||
</form>
|
||||
|
||||
{% if error %}
|
||||
<div class="alert alert-danger mt-3">{{ error }}</div>
|
||||
{% endif %}
|
||||
|
||||
{% if success %}
|
||||
<div class="alert alert-success mt-3">{{ success }}</div>
|
||||
{% endif %}
|
||||
{% endblock %}
|
@ -1,6 +1,9 @@
|
||||
from flask import Blueprint, render_template, current_app, request, flash, redirect, url_for
|
||||
from .models import ScheduleConfig, VolumeConfig
|
||||
from .models import ScheduleConfig, VolumeConfig, PaperMetadata
|
||||
from .db import db
|
||||
import pandas as pd
|
||||
from io import StringIO
|
||||
import codecs
|
||||
|
||||
bp = Blueprint('main', __name__)
|
||||
|
||||
@ -9,13 +12,63 @@ bp = Blueprint('main', __name__)
|
||||
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    landing_page = render_template("index.html")
    return landing_page
|
||||
|
||||
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
|
||||
|
||||
def _nan_to_none(value):
    """Return ``None`` for pandas missing-value markers, otherwise *value* unchanged."""
    return None if pd.isna(value) else value


def _parse_date(value):
    """Convert a CSV cell to a ``datetime.date``.

    Returns ``None`` when the cell is missing or cannot be parsed as a date.
    """
    if pd.isna(value):
        return None
    try:
        return pd.to_datetime(value).date()
    except Exception:
        # Deliberate best-effort: an unparseable date must not abort the import.
        return None


@bp.route('/upload', methods=['GET', 'POST'])
def upload():
    """Show the CSV upload form and, on POST, import the uploaded paper metadata.

    GET renders ``upload.html``. POST reads the uploaded file using the
    delimiter chosen in the form, checks that every column named in
    ``REQUIRED_COLUMNS`` is present, and adds one ``PaperMetadata`` row per CSV
    line before committing them in a single transaction.

    Returns:
        The rendered ``upload.html`` template, with ``error`` set when the file
        is missing, unreadable, lacks required columns, or the commit fails,
        and ``success`` set when all rows were committed.
    """
    if request.method != 'POST':
        return render_template('upload.html')

    uploaded = request.files.get('file')
    delimiter = request.form.get('delimiter', ',')
    # The form's "Tab" option submits the two characters backslash + "t".
    # Map that to a real tab so pandas does not treat it as a regex separator.
    if delimiter == '\\t':
        delimiter = '\t'

    if not uploaded:
        return render_template('upload.html', error="No file selected.")

    try:
        # utf-8-sig strips a leading BOM (common in spreadsheet exports), which
        # would otherwise be glued onto the first column name.
        content = uploaded.stream.read().decode('utf-8-sig')
        # dtype=str keeps identifiers such as ISSNs verbatim instead of letting
        # pandas infer numbers and drop leading zeros.
        df = pd.read_csv(StringIO(content), delimiter=delimiter, dtype=str)
    except Exception as e:
        return render_template('upload.html', error=f"Failed to read CSV file: {e}")

    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        # Sorted so the message is stable between requests.
        return render_template('upload.html', error=f"Missing required columns: {', '.join(sorted(missing))}")

    for _, row in df.iterrows():
        # Empty cells arrive as NaN; store them as NULL rather than a float.
        metadata = PaperMetadata(
            title=_nan_to_none(row['title']),
            doi=_nan_to_none(row['doi']),
            alt_id=_nan_to_none(row.get('alternative_id')),
            issn=_nan_to_none(row['issn']),
            type=_nan_to_none(row.get('type')),
            language=_nan_to_none(row.get('language')),
            published_online=_parse_date(row.get('published_online')),
            status=None,
            file_path=None,
            error_msg=None,
        )
        db.session.add(metadata)

    try:
        db.session.commit()
    except Exception as e:
        # Roll back so the session stays usable for later requests.
        db.session.rollback()
        return render_template('upload.html', error=f"Failed to save data to database: {e}")

    return render_template('upload.html', success="File uploaded and validated successfully!")
|
||||
|
||||
|
||||
|
||||
@bp.route("/papers")
|
||||
|
Loading…
x
Reference in New Issue
Block a user