This commit is contained in:
Michael Beck 2025-04-01 23:06:15 +02:00
parent 7a41e531bd
commit d2b99ec0d7
6 changed files with 131 additions and 86 deletions

View File

@ -1,8 +1,10 @@
from flask import Flask from flask import Flask
from .config import Config from .config import Config
from .db import db from .db import db
from .models import init_schedule_config from .models import init_schedule_config
def create_app(test_config=None): def create_app(test_config=None):
app = Flask(__name__) app = Flask(__name__)
app.config.from_object(Config) app.config.from_object(Config)
@ -18,9 +20,10 @@ def create_app(test_config=None):
@app.context_processor @app.context_processor
def inject_app_title(): def inject_app_title():
return {'app_title': app.config['APP_TITLE']} return {"app_title": app.config["APP_TITLE"]}
from . import views from . import views
app.register_blueprint(views.bp) app.register_blueprint(views.bp)
return app return app

View File

@ -1,7 +1,8 @@
import os import os
class Config: class Config:
SECRET_KEY = os.environ.get('SECRET_KEY', 'dev') SECRET_KEY = os.environ.get("SECRET_KEY", "dev")
SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL', 'sqlite:///papers.db') SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///papers.db")
SQLALCHEMY_TRACK_MODIFICATIONS = False SQLALCHEMY_TRACK_MODIFICATIONS = False
APP_TITLE = os.environ.get('APP_TITLE', 'SciPaperLoader') APP_TITLE = os.environ.get("APP_TITLE", "SciPaperLoader")

View File

@ -1,3 +1,3 @@
from flask_sqlalchemy import SQLAlchemy from flask_sqlalchemy import SQLAlchemy
db = SQLAlchemy() db = SQLAlchemy()

View File

@ -1,5 +1,6 @@
from .db import db from .db import db
class PaperMetadata(db.Model): class PaperMetadata(db.Model):
id = db.Column(db.Integer, primary_key=True) id = db.Column(db.Integer, primary_key=True)
title = db.Column(db.Text) title = db.Column(db.Text)
@ -9,21 +10,28 @@ class PaperMetadata(db.Model):
type = db.Column(db.String(50)) type = db.Column(db.String(50))
language = db.Column(db.String(50)) language = db.Column(db.String(50))
published_online = db.Column(db.Date) # or DateTime/String published_online = db.Column(db.Date) # or DateTime/String
status = db.Column(db.String(10)) # 'Pending','Done','Failed' status = db.Column(db.String(10)) # 'Pending','Done','Failed'
file_path = db.Column(db.Text) file_path = db.Column(db.Text)
error_msg = db.Column(db.Text) error_msg = db.Column(db.Text)
created_at = db.Column(db.DateTime, default=db.func.current_timestamp()) created_at = db.Column(db.DateTime, default=db.func.current_timestamp())
updated_at = db.Column(db.DateTime, default=db.func.current_timestamp(), onupdate=db.func.current_timestamp()) updated_at = db.Column(
db.DateTime,
default=db.func.current_timestamp(),
onupdate=db.func.current_timestamp(),
)
# plus maybe timestamps for created/updated # plus maybe timestamps for created/updated
class ScheduleConfig(db.Model): class ScheduleConfig(db.Model):
hour = db.Column(db.Integer, primary_key=True) # 0-23 hour = db.Column(db.Integer, primary_key=True) # 0-23
weight = db.Column(db.Float) # weight weight = db.Column(db.Float) # weight
class VolumeConfig(db.Model): class VolumeConfig(db.Model):
id = db.Column(db.Integer, primary_key=True) id = db.Column(db.Integer, primary_key=True)
volume = db.Column(db.Float) # volume of papers to scrape per day volume = db.Column(db.Float) # volume of papers to scrape per day
def init_schedule_config(): def init_schedule_config():
"""Initialize ScheduleConfig with default values if empty""" """Initialize ScheduleConfig with default values if empty"""
if ScheduleConfig.query.count() == 0: if ScheduleConfig.query.count() == 0:
@ -39,17 +47,17 @@ def init_schedule_config():
# Evening hours (medium volume) # Evening hours (medium volume)
*[(hour, 0.5) for hour in range(17, 21)], *[(hour, 0.5) for hour in range(17, 21)],
# Late evening (high volume) # Late evening (high volume)
*[(hour, 0.8) for hour in range(21, 24)] *[(hour, 0.8) for hour in range(21, 24)],
] ]
for hour, weight in default_schedule: for hour, weight in default_schedule:
config = ScheduleConfig(hour=hour, weight=weight) config = ScheduleConfig(hour=hour, weight=weight)
db.session.add(config) db.session.add(config)
db.session.commit() db.session.commit()
if VolumeConfig.query.count() == 0: if VolumeConfig.query.count() == 0:
# Default volume configuration # Default volume configuration
default_volume = VolumeConfig(volume=100) default_volume = VolumeConfig(volume=100)
db.session.add(default_volume) db.session.add(default_volume)
db.session.commit() db.session.commit()

View File

@ -1,7 +1,9 @@
import time import time
from .db import db from .db import db
from .models import PaperMetadata from .models import PaperMetadata
def run_scraper(): def run_scraper():
while True: while True:
with db.app.app_context(): with db.app.app_context():

View File

@ -1,15 +1,26 @@
from flask import Blueprint, render_template, current_app, request, flash, redirect, url_for, send_file
from .models import ScheduleConfig, VolumeConfig, PaperMetadata
from .db import db
import pandas as pd
from io import StringIO
import codecs import codecs
import csv
import datetime import datetime
import io import io
import csv from io import StringIO
import pandas as pd
from flask import (
Blueprint,
current_app,
flash,
redirect,
render_template,
request,
send_file,
url_for,
)
from sqlalchemy import asc, desc from sqlalchemy import asc, desc
bp = Blueprint('main', __name__) from .db import db
from .models import PaperMetadata, ScheduleConfig, VolumeConfig
bp = Blueprint("main", __name__)
@bp.route("/") @bp.route("/")
@ -20,25 +31,27 @@ def index():
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"} REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
@bp.route('/upload', methods=['GET', 'POST']) @bp.route("/upload", methods=["GET", "POST"])
def upload(): def upload():
if request.method == 'POST': if request.method == "POST":
file = request.files.get('file') file = request.files.get("file")
delimiter = request.form.get('delimiter', ',') delimiter = request.form.get("delimiter", ",")
if not file: if not file:
return render_template('upload.html', error="No file selected.") return render_template("upload.html", error="No file selected.")
try: try:
stream = codecs.iterdecode(file.stream, 'utf-8') stream = codecs.iterdecode(file.stream, "utf-8")
content = ''.join(stream) content = "".join(stream)
df = pd.read_csv(StringIO(content), delimiter=delimiter) df = pd.read_csv(StringIO(content), delimiter=delimiter)
except Exception as e: except Exception as e:
return render_template('upload.html', error=f"Failed to read CSV file: {e}") return render_template("upload.html", error=f"Failed to read CSV file: {e}")
missing = REQUIRED_COLUMNS - set(df.columns) missing = REQUIRED_COLUMNS - set(df.columns)
if missing: if missing:
return render_template('upload.html', error=f"Missing required columns: {', '.join(missing)}") return render_template(
"upload.html", error=f"Missing required columns: {', '.join(missing)}"
)
# Optional: parse 'published_online' to date # Optional: parse 'published_online' to date
def parse_date(val): def parse_date(val):
@ -51,16 +64,16 @@ def upload():
for _, row in df.iterrows(): for _, row in df.iterrows():
metadata = PaperMetadata( metadata = PaperMetadata(
title=row['title'], title=row["title"],
doi=row['doi'], doi=row["doi"],
alt_id=row.get('alternative_id'), alt_id=row.get("alternative_id"),
issn=row['issn'], issn=row["issn"],
type=row.get('type'), type=row.get("type"),
language=row.get('language'), language=row.get("language"),
published_online=parse_date(row.get('published_online')), published_online=parse_date(row.get("published_online")),
status="New", status="New",
file_path=None, file_path=None,
error_msg=None error_msg=None,
) )
db.session.add(metadata) db.session.add(metadata)
@ -68,26 +81,30 @@ def upload():
db.session.commit() db.session.commit()
except Exception as e: except Exception as e:
db.session.rollback() db.session.rollback()
return render_template('upload.html', error=f"Failed to save data to database: {e}") return render_template(
"upload.html", error=f"Failed to save data to database: {e}"
)
return render_template('upload.html', success="File uploaded and validated successfully!") return render_template(
"upload.html", success="File uploaded and validated successfully!"
)
return render_template('upload.html') return render_template("upload.html")
@bp.route('/papers') @bp.route("/papers")
def list_papers(): def list_papers():
page = request.args.get('page', 1, type=int) page = request.args.get("page", 1, type=int)
per_page = 50 per_page = 50
# Filters # Filters
status = request.args.get('status') status = request.args.get("status")
created_from = request.args.get('created_from') created_from = request.args.get("created_from")
created_to = request.args.get('created_to') created_to = request.args.get("created_to")
updated_from = request.args.get('updated_from') updated_from = request.args.get("updated_from")
updated_to = request.args.get('updated_to') updated_to = request.args.get("updated_to")
sort_by = request.args.get('sort_by', 'created_at') sort_by = request.args.get("sort_by", "created_at")
sort_dir = request.args.get('sort_dir', 'desc') sort_dir = request.args.get("sort_dir", "desc")
query = PaperMetadata.query query = PaperMetadata.query
@ -97,8 +114,9 @@ def list_papers():
def parse_date(val): def parse_date(val):
from datetime import datetime from datetime import datetime
try: try:
return datetime.strptime(val, '%Y-%m-%d') return datetime.strptime(val, "%Y-%m-%d")
except (ValueError, TypeError): except (ValueError, TypeError):
return None return None
@ -113,7 +131,7 @@ def list_papers():
# Sorting # Sorting
sort_col = getattr(PaperMetadata, sort_by, PaperMetadata.created_at) sort_col = getattr(PaperMetadata, sort_by, PaperMetadata.created_at)
sort_func = desc if sort_dir == 'desc' else asc sort_func = desc if sort_dir == "desc" else asc
query = query.order_by(sort_func(sort_col)) query = query.order_by(sort_func(sort_col))
# Pagination # Pagination
@ -129,7 +147,7 @@ def list_papers():
status_counts = {status: count for status, count in status_counts} status_counts = {status: count for status, count in status_counts}
return render_template( return render_template(
'papers.html', "papers.html",
papers=pagination.items, papers=pagination.items,
pagination=pagination, pagination=pagination,
total_papers=total_papers, total_papers=total_papers,
@ -139,18 +157,18 @@ def list_papers():
) )
@bp.route('/papers/export') @bp.route("/papers/export")
def export_papers(): def export_papers():
query = PaperMetadata.query query = PaperMetadata.query
# Filters # Filters
status = request.args.get('status') status = request.args.get("status")
created_from = request.args.get('created_from') created_from = request.args.get("created_from")
created_to = request.args.get('created_to') created_to = request.args.get("created_to")
updated_from = request.args.get('updated_from') updated_from = request.args.get("updated_from")
updated_to = request.args.get('updated_to') updated_to = request.args.get("updated_to")
sort_by = request.args.get('sort_by', 'created_at') sort_by = request.args.get("sort_by", "created_at")
sort_dir = request.args.get('sort_dir', 'desc') sort_dir = request.args.get("sort_dir", "desc")
query = PaperMetadata.query query = PaperMetadata.query
@ -166,35 +184,40 @@ def export_papers():
output = io.StringIO() output = io.StringIO()
writer = csv.writer(output) writer = csv.writer(output)
writer.writerow(['ID', 'Title', 'Journal', 'DOI', 'ISSN', writer.writerow(
'Status', 'Created At', 'Updated At']) ["ID", "Title", "Journal", "DOI", "ISSN", "Status", "Created At", "Updated At"]
)
for paper in query: for paper in query:
writer.writerow([ writer.writerow(
paper.id, [
paper.title, paper.id,
getattr(paper, 'journal', ''), paper.title,
paper.doi, getattr(paper, "journal", ""),
paper.issn, paper.doi,
paper.status, paper.issn,
paper.created_at, paper.status,
paper.updated_at paper.created_at,
]) paper.updated_at,
]
)
output.seek(0) output.seek(0)
return send_file(io.BytesIO(output.read().encode('utf-8')), return send_file(
mimetype='text/csv', io.BytesIO(output.read().encode("utf-8")),
as_attachment=True, mimetype="text/csv",
download_name='papers.csv') as_attachment=True,
download_name="papers.csv",
)
from flask import jsonify, render_template from flask import jsonify, render_template
@bp.route('/papers/<int:paper_id>/detail') @bp.route("/papers/<int:paper_id>/detail")
def paper_detail(paper_id): def paper_detail(paper_id):
paper = PaperMetadata.query.get_or_404(paper_id) paper = PaperMetadata.query.get_or_404(paper_id)
return render_template('partials/paper_detail_modal.html', paper=paper) return render_template("partials/paper_detail_modal.html", paper=paper)
@bp.route("/schedule", methods=["GET", "POST"]) @bp.route("/schedule", methods=["GET", "POST"])
@ -202,10 +225,10 @@ def schedule():
if request.method == "POST": if request.method == "POST":
try: try:
# Check if we're updating volume or schedule # Check if we're updating volume or schedule
if 'total_volume' in request.form: if "total_volume" in request.form:
# Volume update # Volume update
try: try:
new_volume = float(request.form.get('total_volume', 0)) new_volume = float(request.form.get("total_volume", 0))
if new_volume <= 0 or new_volume > 1000: if new_volume <= 0 or new_volume > 1000:
raise ValueError("Volume must be between 1 and 1000") raise ValueError("Volume must be between 1 and 1000")
@ -234,7 +257,8 @@ def schedule():
weight = float(request.form.get(key, 0)) weight = float(request.form.get(key, 0))
if weight < 0 or weight > 5: if weight < 0 or weight > 5:
raise ValueError( raise ValueError(
f"Weight for hour {hour} must be between 0 and 5") f"Weight for hour {hour} must be between 0 and 5"
)
except ValueError: except ValueError:
raise ValueError(f"Invalid weight value for hour {hour}") raise ValueError(f"Invalid weight value for hour {hour}")
@ -255,10 +279,17 @@ def schedule():
db.session.rollback() db.session.rollback()
flash(f"Error updating schedule: {str(e)}", "error") flash(f"Error updating schedule: {str(e)}", "error")
schedule = {sc.hour: sc.weight for sc in ScheduleConfig.query.order_by( schedule = {
ScheduleConfig.hour).all()} sc.hour: sc.weight
for sc in ScheduleConfig.query.order_by(ScheduleConfig.hour).all()
}
volume = VolumeConfig.query.first() volume = VolumeConfig.query.first()
return render_template("schedule.html", schedule=schedule, volume=volume.volume, app_title="PaperScraper") return render_template(
"schedule.html",
schedule=schedule,
volume=volume.volume,
app_title="PaperScraper",
)
@bp.route("/logs") @bp.route("/logs")