This commit is contained in:
Michael Beck 2025-04-01 23:06:15 +02:00
parent 7a41e531bd
commit d2b99ec0d7
6 changed files with 131 additions and 86 deletions

View File

@ -1,8 +1,10 @@
from flask import Flask
from .config import Config
from .db import db
from .models import init_schedule_config
def create_app(test_config=None):
app = Flask(__name__)
app.config.from_object(Config)
@ -18,9 +20,10 @@ def create_app(test_config=None):
@app.context_processor
def inject_app_title():
return {'app_title': app.config['APP_TITLE']}
return {"app_title": app.config["APP_TITLE"]}
from . import views
app.register_blueprint(views.bp)
return app

View File

@ -1,7 +1,8 @@
import os
class Config:
    """Default Flask settings for the application.

    The application factory loads this class with
    ``app.config.from_object(Config)``. Every value except
    ``SQLALCHEMY_TRACK_MODIFICATIONS`` is read from an environment variable
    and falls back to the literal default shown here when it is unset.
    """

    # NOTE(review): the "dev" fallback looks intended for local development
    # only -- confirm SECRET_KEY is always set in deployed environments.
    SECRET_KEY = os.environ.get("SECRET_KEY", "dev")

    # Read from DATABASE_URL; falls back to the "sqlite:///papers.db" URI.
    SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///papers.db")

    # Fixed value; not configurable through the environment.
    SQLALCHEMY_TRACK_MODIFICATIONS = False

    # Exposed to templates as ``app_title`` by the app's context processor.
    APP_TITLE = os.environ.get("APP_TITLE", "SciPaperLoader")

View File

@ -1,5 +1,6 @@
from .db import db
class PaperMetadata(db.Model):
id = db.Column(db.Integer, primary_key=True)
title = db.Column(db.Text)
@ -9,21 +10,28 @@ class PaperMetadata(db.Model):
type = db.Column(db.String(50))
language = db.Column(db.String(50))
published_online = db.Column(db.Date) # or DateTime/String
status = db.Column(db.String(10)) # 'Pending','Done','Failed'
status = db.Column(db.String(10)) # 'Pending','Done','Failed'
file_path = db.Column(db.Text)
error_msg = db.Column(db.Text)
created_at = db.Column(db.DateTime, default=db.func.current_timestamp())
updated_at = db.Column(db.DateTime, default=db.func.current_timestamp(), onupdate=db.func.current_timestamp())
updated_at = db.Column(
db.DateTime,
default=db.func.current_timestamp(),
onupdate=db.func.current_timestamp(),
)
# plus maybe timestamps for created/updated
class ScheduleConfig(db.Model):
    """One row per hour of the day, holding that hour's weight.

    ``hour`` is the primary key, so there is at most one row per hour value.
    """

    hour = db.Column(db.Integer, primary_key=True)  # 0-23
    weight = db.Column(db.Float)  # weight
class VolumeConfig(db.Model):
    """Stores the configured daily scraping volume."""

    id = db.Column(db.Integer, primary_key=True)
    volume = db.Column(db.Float)  # volume of papers to scrape per day
def init_schedule_config():
"""Initialize ScheduleConfig with default values if empty"""
if ScheduleConfig.query.count() == 0:
@ -39,7 +47,7 @@ def init_schedule_config():
# Evening hours (medium volume)
*[(hour, 0.5) for hour in range(17, 21)],
# Late evening (high volume)
*[(hour, 0.8) for hour in range(21, 24)]
*[(hour, 0.8) for hour in range(21, 24)],
]
for hour, weight in default_schedule:

View File

@ -1,7 +1,9 @@
import time
from .db import db
from .models import PaperMetadata
def run_scraper():
while True:
with db.app.app_context():

View File

@ -1,15 +1,26 @@
from flask import Blueprint, render_template, current_app, request, flash, redirect, url_for, send_file
from .models import ScheduleConfig, VolumeConfig, PaperMetadata
from .db import db
import pandas as pd
from io import StringIO
import codecs
import csv
import datetime
import io
import csv
from io import StringIO
import pandas as pd
from flask import (
Blueprint,
current_app,
flash,
redirect,
render_template,
request,
send_file,
url_for,
)
from sqlalchemy import asc, desc
bp = Blueprint('main', __name__)
from .db import db
from .models import PaperMetadata, ScheduleConfig, VolumeConfig
bp = Blueprint("main", __name__)
@bp.route("/")
@ -20,25 +31,27 @@ def index():
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
@bp.route('/upload', methods=['GET', 'POST'])
@bp.route("/upload", methods=["GET", "POST"])
def upload():
if request.method == 'POST':
file = request.files.get('file')
delimiter = request.form.get('delimiter', ',')
if request.method == "POST":
file = request.files.get("file")
delimiter = request.form.get("delimiter", ",")
if not file:
return render_template('upload.html', error="No file selected.")
return render_template("upload.html", error="No file selected.")
try:
stream = codecs.iterdecode(file.stream, 'utf-8')
content = ''.join(stream)
stream = codecs.iterdecode(file.stream, "utf-8")
content = "".join(stream)
df = pd.read_csv(StringIO(content), delimiter=delimiter)
except Exception as e:
return render_template('upload.html', error=f"Failed to read CSV file: {e}")
return render_template("upload.html", error=f"Failed to read CSV file: {e}")
missing = REQUIRED_COLUMNS - set(df.columns)
if missing:
return render_template('upload.html', error=f"Missing required columns: {', '.join(missing)}")
return render_template(
"upload.html", error=f"Missing required columns: {', '.join(missing)}"
)
# Optional: parse 'published_online' to date
def parse_date(val):
@ -51,16 +64,16 @@ def upload():
for _, row in df.iterrows():
metadata = PaperMetadata(
title=row['title'],
doi=row['doi'],
alt_id=row.get('alternative_id'),
issn=row['issn'],
type=row.get('type'),
language=row.get('language'),
published_online=parse_date(row.get('published_online')),
title=row["title"],
doi=row["doi"],
alt_id=row.get("alternative_id"),
issn=row["issn"],
type=row.get("type"),
language=row.get("language"),
published_online=parse_date(row.get("published_online")),
status="New",
file_path=None,
error_msg=None
error_msg=None,
)
db.session.add(metadata)
@ -68,26 +81,30 @@ def upload():
db.session.commit()
except Exception as e:
db.session.rollback()
return render_template('upload.html', error=f"Failed to save data to database: {e}")
return render_template(
"upload.html", error=f"Failed to save data to database: {e}"
)
return render_template('upload.html', success="File uploaded and validated successfully!")
return render_template(
"upload.html", success="File uploaded and validated successfully!"
)
return render_template('upload.html')
return render_template("upload.html")
@bp.route('/papers')
@bp.route("/papers")
def list_papers():
page = request.args.get('page', 1, type=int)
page = request.args.get("page", 1, type=int)
per_page = 50
# Filters
status = request.args.get('status')
created_from = request.args.get('created_from')
created_to = request.args.get('created_to')
updated_from = request.args.get('updated_from')
updated_to = request.args.get('updated_to')
sort_by = request.args.get('sort_by', 'created_at')
sort_dir = request.args.get('sort_dir', 'desc')
status = request.args.get("status")
created_from = request.args.get("created_from")
created_to = request.args.get("created_to")
updated_from = request.args.get("updated_from")
updated_to = request.args.get("updated_to")
sort_by = request.args.get("sort_by", "created_at")
sort_dir = request.args.get("sort_dir", "desc")
query = PaperMetadata.query
@ -97,8 +114,9 @@ def list_papers():
def parse_date(val):
from datetime import datetime
try:
return datetime.strptime(val, '%Y-%m-%d')
return datetime.strptime(val, "%Y-%m-%d")
except (ValueError, TypeError):
return None
@ -113,7 +131,7 @@ def list_papers():
# Sorting
sort_col = getattr(PaperMetadata, sort_by, PaperMetadata.created_at)
sort_func = desc if sort_dir == 'desc' else asc
sort_func = desc if sort_dir == "desc" else asc
query = query.order_by(sort_func(sort_col))
# Pagination
@ -129,7 +147,7 @@ def list_papers():
status_counts = {status: count for status, count in status_counts}
return render_template(
'papers.html',
"papers.html",
papers=pagination.items,
pagination=pagination,
total_papers=total_papers,
@ -139,18 +157,18 @@ def list_papers():
)
@bp.route('/papers/export')
@bp.route("/papers/export")
def export_papers():
query = PaperMetadata.query
# Filters
status = request.args.get('status')
created_from = request.args.get('created_from')
created_to = request.args.get('created_to')
updated_from = request.args.get('updated_from')
updated_to = request.args.get('updated_to')
sort_by = request.args.get('sort_by', 'created_at')
sort_dir = request.args.get('sort_dir', 'desc')
status = request.args.get("status")
created_from = request.args.get("created_from")
created_to = request.args.get("created_to")
updated_from = request.args.get("updated_from")
updated_to = request.args.get("updated_to")
sort_by = request.args.get("sort_by", "created_at")
sort_dir = request.args.get("sort_dir", "desc")
query = PaperMetadata.query
@ -166,35 +184,40 @@ def export_papers():
output = io.StringIO()
writer = csv.writer(output)
writer.writerow(['ID', 'Title', 'Journal', 'DOI', 'ISSN',
'Status', 'Created At', 'Updated At'])
writer.writerow(
["ID", "Title", "Journal", "DOI", "ISSN", "Status", "Created At", "Updated At"]
)
for paper in query:
writer.writerow([
paper.id,
paper.title,
getattr(paper, 'journal', ''),
paper.doi,
paper.issn,
paper.status,
paper.created_at,
paper.updated_at
])
writer.writerow(
[
paper.id,
paper.title,
getattr(paper, "journal", ""),
paper.doi,
paper.issn,
paper.status,
paper.created_at,
paper.updated_at,
]
)
output.seek(0)
return send_file(io.BytesIO(output.read().encode('utf-8')),
mimetype='text/csv',
as_attachment=True,
download_name='papers.csv')
return send_file(
io.BytesIO(output.read().encode("utf-8")),
mimetype="text/csv",
as_attachment=True,
download_name="papers.csv",
)
from flask import jsonify, render_template
@bp.route("/papers/<int:paper_id>/detail")
def paper_detail(paper_id):
    """Render the detail partial for a single paper.

    Args:
        paper_id: Integer primary key of the ``PaperMetadata`` row, taken
            from the URL.

    Returns:
        The rendered ``partials/paper_detail_modal.html`` template with the
        looked-up row passed as ``paper``. The lookup uses ``get_or_404``,
        so a missing row aborts the request with a 404 instead of returning.
    """
    paper = PaperMetadata.query.get_or_404(paper_id)
    return render_template("partials/paper_detail_modal.html", paper=paper)
@bp.route("/schedule", methods=["GET", "POST"])
@ -202,10 +225,10 @@ def schedule():
if request.method == "POST":
try:
# Check if we're updating volume or schedule
if 'total_volume' in request.form:
if "total_volume" in request.form:
# Volume update
try:
new_volume = float(request.form.get('total_volume', 0))
new_volume = float(request.form.get("total_volume", 0))
if new_volume <= 0 or new_volume > 1000:
raise ValueError("Volume must be between 1 and 1000")
@ -234,7 +257,8 @@ def schedule():
weight = float(request.form.get(key, 0))
if weight < 0 or weight > 5:
raise ValueError(
f"Weight for hour {hour} must be between 0 and 5")
f"Weight for hour {hour} must be between 0 and 5"
)
except ValueError:
raise ValueError(f"Invalid weight value for hour {hour}")
@ -255,10 +279,17 @@ def schedule():
db.session.rollback()
flash(f"Error updating schedule: {str(e)}", "error")
schedule = {sc.hour: sc.weight for sc in ScheduleConfig.query.order_by(
ScheduleConfig.hour).all()}
schedule = {
sc.hour: sc.weight
for sc in ScheduleConfig.query.order_by(ScheduleConfig.hour).all()
}
volume = VolumeConfig.query.first()
return render_template("schedule.html", schedule=schedule, volume=volume.volume, app_title="PaperScraper")
return render_template(
"schedule.html",
schedule=schedule,
volume=volume.volume,
app_title="PaperScraper",
)
@bp.route("/logs")