adds csv importer
This commit is contained in:
parent
dd707e2a9f
commit
59b6404b99
@ -1,6 +1,6 @@
|
|||||||
from .db import db
|
from .db import db
|
||||||
|
|
||||||
class Paper(db.Model):
|
class PaperMetadata(db.Model):
|
||||||
id = db.Column(db.Integer, primary_key=True)
|
id = db.Column(db.Integer, primary_key=True)
|
||||||
title = db.Column(db.Text)
|
title = db.Column(db.Text)
|
||||||
doi = db.Column(db.String, unique=True, index=True)
|
doi = db.Column(db.String, unique=True, index=True)
|
||||||
@ -8,10 +8,12 @@ class Paper(db.Model):
|
|||||||
issn = db.Column(db.String(32))
|
issn = db.Column(db.String(32))
|
||||||
type = db.Column(db.String(50))
|
type = db.Column(db.String(50))
|
||||||
language = db.Column(db.String(50))
|
language = db.Column(db.String(50))
|
||||||
published_date = db.Column(db.Date) # or DateTime/String
|
published_online = db.Column(db.Date) # or DateTime/String
|
||||||
status = db.Column(db.String(10)) # 'Pending','Done','Failed'
|
status = db.Column(db.String(10)) # 'Pending','Done','Failed'
|
||||||
file_path = db.Column(db.Text)
|
file_path = db.Column(db.Text)
|
||||||
error_msg = db.Column(db.Text)
|
error_msg = db.Column(db.Text)
|
||||||
|
created_at = db.Column(db.DateTime, default=db.func.current_timestamp())
|
||||||
|
updated_at = db.Column(db.DateTime, default=db.func.current_timestamp(), onupdate=db.func.current_timestamp())
|
||||||
# plus maybe timestamps for created/updated
|
# created_at / updated_at above are filled from the database's current timestamp
|
||||||
|
|
||||||
class ScheduleConfig(db.Model):
|
class ScheduleConfig(db.Model):
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
import time
|
import time
|
||||||
from .db import db
|
from .db import db
|
||||||
from .models import Paper
|
from .models import PaperMetadata
|
||||||
|
|
||||||
def run_scraper():
|
def run_scraper():
|
||||||
while True:
|
while True:
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
<div class="collapse navbar-collapse" id="navbarSupportedContent">
|
<div class="collapse navbar-collapse" id="navbarSupportedContent">
|
||||||
<ul class="navbar-nav me-auto mb-2 mb-lg-0">
|
<ul class="navbar-nav me-auto mb-2 mb-lg-0">
|
||||||
<li class="nav-item">
|
<li class="nav-item">
|
||||||
<a class="nav-link" href="/import">Import CSV</a>
|
<a class="nav-link" href="/upload">Import CSV</a>
|
||||||
</li>
|
</li>
|
||||||
<li class="nav-item">
|
<li class="nav-item">
|
||||||
<a class="nav-link" href="/papers">Papers</a>
|
<a class="nav-link" href="/papers">Papers</a>
|
||||||
|
@ -2,5 +2,41 @@
|
|||||||
{% block content %}
|
{% block content %}
|
||||||
<h1>Welcome to SciPaperLoader</h1>
|
<h1>Welcome to SciPaperLoader</h1>
|
||||||
<p>Your paper scraping tool is ready.</p>
|
<p>Your paper scraping tool is ready.</p>
|
||||||
<a href="{{ url_for('main.upload') }}" class="btn btn-primary">Upload CSV</a>
|
|
||||||
{% endblock %}
|
<div class="alert alert-info">
|
||||||
|
<p><strong>Instructions:</strong> Please upload a CSV file containing academic paper metadata. The file must include the following columns:</p>
|
||||||
|
<ul>
|
||||||
|
<li><code>alternative_id</code> – an alternative title or abbreviation</li>
|
||||||
|
<li><code>journal</code> – the journal name</li>
|
||||||
|
<li><code>doi</code> – the digital object identifier</li>
|
||||||
|
<li><code>issn</code> – the ISSN of the journal</li>
|
||||||
|
<li><code>title</code> – the title of the paper</li>
|
||||||
|
</ul>
|
||||||
|
<p>The format of your CSV should resemble the response structure of the Crossref API's <code>/journals/{issn}/works</code> endpoint.</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<form method="POST" action="{{ url_for('main.upload') }}" enctype="multipart/form-data">
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="file">Upload CSV File</label>
|
||||||
|
<input type="file" name="file" id="file" class="form-control" required>
|
||||||
|
</div>
|
||||||
|
<div class="form-group mt-3">
|
||||||
|
<label for="delimiter">Choose CSV Delimiter</label>
|
||||||
|
<select name="delimiter" id="delimiter" class="form-control">
|
||||||
|
<option value=",">Comma (,)</option>
|
||||||
|
<option value=";">Semicolon (;)</option>
|
||||||
|
<option value="\t">Tab (\t)</option>
|
||||||
|
<option value="|">Pipe (|)</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<button type="submit" class="btn btn-primary mt-3">Upload</button>
|
||||||
|
</form>
|
||||||
|
|
||||||
|
{% if error %}
|
||||||
|
<div class="alert alert-danger mt-3">{{ error }}</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% if success %}
|
||||||
|
<div class="alert alert-success mt-3">{{ success }}</div>
|
||||||
|
{% endif %}
|
||||||
|
{% endblock %}
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
from flask import Blueprint, render_template, current_app, request, flash, redirect, url_for
|
from flask import Blueprint, render_template, current_app, request, flash, redirect, url_for
|
||||||
from .models import ScheduleConfig, VolumeConfig
|
from .models import ScheduleConfig, VolumeConfig, PaperMetadata
|
||||||
from .db import db
|
from .db import db
|
||||||
|
import pandas as pd
|
||||||
|
from io import StringIO
|
||||||
|
import codecs
|
||||||
|
|
||||||
bp = Blueprint('main', __name__)
|
bp = Blueprint('main', __name__)
|
||||||
|
|
||||||
@ -9,13 +12,63 @@ bp = Blueprint('main', __name__)
|
|||||||
def index():
|
def index():
|
||||||
return render_template("index.html")
|
return render_template("index.html")
|
||||||
|
|
||||||
|
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
|
||||||
|
|
||||||
# Delimiters offered by the upload form, mapped to the single character that
# is handed to pandas.  The form submits its tab option as the two-character
# text "\t" (HTML has no backslash escapes), so that spelling and a real tab
# are both accepted.  Anything else is rejected: pandas treats a separator
# longer than one character as a regular expression, and the value comes
# straight from the request.
_CSV_DELIMITERS = {
    ",": ",",
    ";": ";",
    "|": "|",
    "\t": "\t",
    "\\t": "\t",
}


def _none_if_missing(value):
    """Return None for a pandas missing value (NaN/NaT/None), else *value*."""
    return None if pd.isna(value) else value


def _parse_date(value):
    """Convert a CSV cell to a ``datetime.date``.

    Returns None when the cell is missing or cannot be parsed as a date.
    """
    if pd.isna(value):
        return None
    try:
        parsed = pd.to_datetime(value)
    except (ValueError, TypeError, OverflowError):
        return None
    # A cell such as "NaT" parses successfully but still carries no date.
    return None if pd.isna(parsed) else parsed.date()


def _row_to_metadata(row):
    """Build an unsaved PaperMetadata from one CSV record (a column->value dict)."""
    # NOTE(review): 'journal' is a required column but is not persisted here;
    # the full PaperMetadata column list is not visible from this view --
    # confirm whether the model has a field for it.
    return PaperMetadata(
        title=_none_if_missing(row['title']),
        doi=_none_if_missing(row['doi']),
        alt_id=_none_if_missing(row.get('alternative_id')),
        issn=_none_if_missing(row['issn']),
        type=_none_if_missing(row.get('type')),
        language=_none_if_missing(row.get('language')),
        published_online=_parse_date(row.get('published_online')),
        status=None,
        file_path=None,
        error_msg=None,
    )


@bp.route('/upload', methods=['GET', 'POST'])
def upload():
    """Render the CSV upload page and import an uploaded CSV on POST.

    GET renders ``upload.html``.  POST reads the uploaded ``file`` as UTF-8
    text (a leading byte-order mark is ignored), splits it on the form's
    ``delimiter`` (default ``,``), checks that every column named in
    ``REQUIRED_COLUMNS`` is present, and stores one ``PaperMetadata`` row per
    CSV record in a single transaction.

    Returns:
        The rendered ``upload.html`` template, with ``error`` set when the
        file is missing, the delimiter is unsupported, the CSV cannot be
        read, columns are missing, or the commit fails; with ``success`` set
        when the import was committed.
    """
    if request.method != 'POST':
        return render_template('upload.html')

    uploaded = request.files.get('file')
    if not uploaded:
        return render_template('upload.html', error="No file selected.")

    delimiter = _CSV_DELIMITERS.get(request.form.get('delimiter', ','))
    if delimiter is None:
        return render_template('upload.html', error="Unsupported CSV delimiter.")

    try:
        # 'utf-8-sig' drops a leading BOM so it cannot end up inside the
        # first column name and trigger a bogus "missing column" error.
        content = uploaded.stream.read().decode('utf-8-sig')
        # dtype=str keeps identifiers such as DOIs/ISSNs as text and avoids
        # passing numpy scalar types to the database driver.
        df = pd.read_csv(StringIO(content), delimiter=delimiter, dtype=str)
    except (UnicodeDecodeError, ValueError) as e:
        # pandas' ParserError and EmptyDataError are ValueError subclasses.
        return render_template('upload.html', error=f"Failed to read CSV file: {e}")

    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        # Sorted so the message is stable from one request to the next.
        return render_template(
            'upload.html',
            error=f"Missing required columns: {', '.join(sorted(missing))}",
        )

    db.session.add_all(
        [_row_to_metadata(record) for record in df.to_dict(orient='records')]
    )

    try:
        db.session.commit()
    except Exception as e:
        # Request boundary: undo the partial import, keep the traceback in
        # the application log, and report the failure on the page.
        db.session.rollback()
        current_app.logger.exception("CSV import could not be committed")
        return render_template('upload.html', error=f"Failed to save data to database: {e}")

    return render_template('upload.html', success="File uploaded and validated successfully!")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@bp.route("/papers")
|
@bp.route("/papers")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user