From d2b99ec0d769c81051a9ee3dfe706b49cd7324f6 Mon Sep 17 00:00:00 2001 From: Michael Beck Date: Tue, 1 Apr 2025 23:06:15 +0200 Subject: [PATCH] formats --- scipaperloader/__init__.py | 5 +- scipaperloader/config.py | 7 +- scipaperloader/db.py | 2 +- scipaperloader/models.py | 28 +++--- scipaperloader/scraper.py | 2 + scipaperloader/views.py | 173 ++++++++++++++++++++++--------------- 6 files changed, 131 insertions(+), 86 deletions(-) diff --git a/scipaperloader/__init__.py b/scipaperloader/__init__.py index 6d48825..534e9c4 100644 --- a/scipaperloader/__init__.py +++ b/scipaperloader/__init__.py @@ -1,8 +1,10 @@ from flask import Flask + from .config import Config from .db import db from .models import init_schedule_config + def create_app(test_config=None): app = Flask(__name__) app.config.from_object(Config) @@ -18,9 +20,10 @@ def create_app(test_config=None): @app.context_processor def inject_app_title(): - return {'app_title': app.config['APP_TITLE']} + return {"app_title": app.config["APP_TITLE"]} from . import views + app.register_blueprint(views.bp) return app diff --git a/scipaperloader/config.py b/scipaperloader/config.py index 4159388..a4d59ea 100644 --- a/scipaperloader/config.py +++ b/scipaperloader/config.py @@ -1,7 +1,8 @@ import os + class Config: - SECRET_KEY = os.environ.get('SECRET_KEY', 'dev') - SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL', 'sqlite:///papers.db') + SECRET_KEY = os.environ.get("SECRET_KEY", "dev") + SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///papers.db") SQLALCHEMY_TRACK_MODIFICATIONS = False - APP_TITLE = os.environ.get('APP_TITLE', 'SciPaperLoader') + APP_TITLE = os.environ.get("APP_TITLE", "SciPaperLoader") diff --git a/scipaperloader/db.py b/scipaperloader/db.py index 2e1eeb6..f0b13d6 100644 --- a/scipaperloader/db.py +++ b/scipaperloader/db.py @@ -1,3 +1,3 @@ from flask_sqlalchemy import SQLAlchemy -db = SQLAlchemy() \ No newline at end of file +db = SQLAlchemy() diff --git a/scipaperloader/models.py b/scipaperloader/models.py index 2e1def8..9c3d40c 100644 --- a/scipaperloader/models.py +++ b/scipaperloader/models.py @@ -1,5 +1,6 @@ from .db import db + class PaperMetadata(db.Model): id = db.Column(db.Integer, primary_key=True) title = db.Column(db.Text) @@ -9,21 +10,28 @@ class PaperMetadata(db.Model): type = db.Column(db.String(50)) language = db.Column(db.String(50)) published_online = db.Column(db.Date) # or DateTime/String - status = db.Column(db.String(10)) # 'Pending','Done','Failed' + status = db.Column(db.String(10)) # 'Pending','Done','Failed' file_path = db.Column(db.Text) error_msg = db.Column(db.Text) created_at = db.Column(db.DateTime, default=db.func.current_timestamp()) - updated_at = db.Column(db.DateTime, default=db.func.current_timestamp(), onupdate=db.func.current_timestamp()) + updated_at = db.Column( + db.DateTime, + default=db.func.current_timestamp(), + onupdate=db.func.current_timestamp(), + ) # plus maybe timestamps for created/updated - + + class ScheduleConfig(db.Model): hour = db.Column(db.Integer, primary_key=True) # 0-23 weight = db.Column(db.Float) # weight - + + class VolumeConfig(db.Model): id = db.Column(db.Integer, primary_key=True) volume = db.Column(db.Float) # volume of papers to scrape per day - + + def init_schedule_config(): """Initialize ScheduleConfig with default values if empty""" if ScheduleConfig.query.count() == 0: @@ -39,17 +47,17 @@ def init_schedule_config(): # Evening hours (medium volume) *[(hour, 0.5) for hour in range(17, 21)], # Late evening (high volume) - *[(hour, 0.8) for hour in range(21, 24)] + *[(hour, 0.8) for hour in range(21, 24)], ] - + for hour, weight in default_schedule: config = ScheduleConfig(hour=hour, weight=weight) db.session.add(config) - + db.session.commit() - + if VolumeConfig.query.count() == 0: # Default volume configuration default_volume = VolumeConfig(volume=100) db.session.add(default_volume) - db.session.commit() \ No newline at end of file + db.session.commit() diff --git a/scipaperloader/scraper.py b/scipaperloader/scraper.py index 7e62501..7e386e6 100644 --- a/scipaperloader/scraper.py +++ b/scipaperloader/scraper.py @@ -1,7 +1,9 @@ import time + from .db import db from .models import PaperMetadata + def run_scraper(): while True: with db.app.app_context(): diff --git a/scipaperloader/views.py b/scipaperloader/views.py index 0a7c7f6..128737b 100644 --- a/scipaperloader/views.py +++ b/scipaperloader/views.py @@ -1,15 +1,26 @@ -from flask import Blueprint, render_template, current_app, request, flash, redirect, url_for, send_file -from .models import ScheduleConfig, VolumeConfig, PaperMetadata -from .db import db -import pandas as pd -from io import StringIO import codecs +import csv import datetime import io -import csv +from io import StringIO + +import pandas as pd +from flask import ( + Blueprint, + current_app, + flash, + redirect, + render_template, + request, + send_file, + url_for, +) from sqlalchemy import asc, desc -bp = Blueprint('main', __name__) +from .db import db +from .models import PaperMetadata, ScheduleConfig, VolumeConfig + +bp = Blueprint("main", __name__) @bp.route("/") @@ -20,25 +31,27 @@ def index(): REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"} -@bp.route('/upload', methods=['GET', 'POST']) +@bp.route("/upload", methods=["GET", "POST"]) def upload(): - if request.method == 'POST': - file = request.files.get('file') - delimiter = request.form.get('delimiter', ',') + if request.method == "POST": + file = request.files.get("file") + delimiter = request.form.get("delimiter", ",") if not file: - return render_template('upload.html', error="No file selected.") + return render_template("upload.html", error="No file selected.") try: - stream = codecs.iterdecode(file.stream, 'utf-8') - content = ''.join(stream) + stream = codecs.iterdecode(file.stream, "utf-8") + content = "".join(stream) df = pd.read_csv(StringIO(content), delimiter=delimiter) except Exception as e: - return render_template('upload.html', error=f"Failed to read CSV file: {e}") + return render_template("upload.html", error=f"Failed to read CSV file: {e}") missing = REQUIRED_COLUMNS - set(df.columns) if missing: - return render_template('upload.html', error=f"Missing required columns: {', '.join(missing)}") + return render_template( + "upload.html", error=f"Missing required columns: {', '.join(missing)}" + ) # Optional: parse 'published_online' to date def parse_date(val): @@ -51,16 +64,16 @@ def upload(): for _, row in df.iterrows(): metadata = PaperMetadata( - title=row['title'], - doi=row['doi'], - alt_id=row.get('alternative_id'), - issn=row['issn'], - type=row.get('type'), - language=row.get('language'), - published_online=parse_date(row.get('published_online')), + title=row["title"], + doi=row["doi"], + alt_id=row.get("alternative_id"), + issn=row["issn"], + type=row.get("type"), + language=row.get("language"), + published_online=parse_date(row.get("published_online")), status="New", file_path=None, - error_msg=None + error_msg=None, ) db.session.add(metadata) @@ -68,26 +81,30 @@ def upload(): db.session.commit() except Exception as e: db.session.rollback() - return render_template('upload.html', error=f"Failed to save data to database: {e}") + return render_template( + "upload.html", error=f"Failed to save data to database: {e}" + ) - return render_template('upload.html', success="File uploaded and validated successfully!") + return render_template( + "upload.html", success="File uploaded and validated successfully!" + ) - return render_template('upload.html') + return render_template("upload.html") -@bp.route('/papers') +@bp.route("/papers") def list_papers(): - page = request.args.get('page', 1, type=int) + page = request.args.get("page", 1, type=int) per_page = 50 # Filters - status = request.args.get('status') - created_from = request.args.get('created_from') - created_to = request.args.get('created_to') - updated_from = request.args.get('updated_from') - updated_to = request.args.get('updated_to') - sort_by = request.args.get('sort_by', 'created_at') - sort_dir = request.args.get('sort_dir', 'desc') + status = request.args.get("status") + created_from = request.args.get("created_from") + created_to = request.args.get("created_to") + updated_from = request.args.get("updated_from") + updated_to = request.args.get("updated_to") + sort_by = request.args.get("sort_by", "created_at") + sort_dir = request.args.get("sort_dir", "desc") query = PaperMetadata.query @@ -97,8 +114,9 @@ def list_papers(): def parse_date(val): from datetime import datetime + try: - return datetime.strptime(val, '%Y-%m-%d') + return datetime.strptime(val, "%Y-%m-%d") except (ValueError, TypeError): return None @@ -113,7 +131,7 @@ def list_papers(): # Sorting sort_col = getattr(PaperMetadata, sort_by, PaperMetadata.created_at) - sort_func = desc if sort_dir == 'desc' else asc + sort_func = desc if sort_dir == "desc" else asc query = query.order_by(sort_func(sort_col)) # Pagination @@ -129,7 +147,7 @@ def list_papers(): status_counts = {status: count for status, count in status_counts} return render_template( - 'papers.html', + "papers.html", papers=pagination.items, pagination=pagination, total_papers=total_papers, @@ -139,18 +157,18 @@ def list_papers(): ) -@bp.route('/papers/export') +@bp.route("/papers/export") def export_papers(): query = PaperMetadata.query # Filters - status = request.args.get('status') - created_from = request.args.get('created_from') - created_to = request.args.get('created_to') - updated_from = request.args.get('updated_from') - updated_to = request.args.get('updated_to') - sort_by = request.args.get('sort_by', 'created_at') - sort_dir = request.args.get('sort_dir', 'desc') + status = request.args.get("status") + created_from = request.args.get("created_from") + created_to = request.args.get("created_to") + updated_from = request.args.get("updated_from") + updated_to = request.args.get("updated_to") + sort_by = request.args.get("sort_by", "created_at") + sort_dir = request.args.get("sort_dir", "desc") query = PaperMetadata.query @@ -166,35 +184,40 @@ def export_papers(): output = io.StringIO() writer = csv.writer(output) - writer.writerow(['ID', 'Title', 'Journal', 'DOI', 'ISSN', - 'Status', 'Created At', 'Updated At']) + writer.writerow( + ["ID", "Title", "Journal", "DOI", "ISSN", "Status", "Created At", "Updated At"] + ) for paper in query: - writer.writerow([ - paper.id, - paper.title, - getattr(paper, 'journal', ''), - paper.doi, - paper.issn, - paper.status, - paper.created_at, - paper.updated_at - ]) + writer.writerow( + [ + paper.id, + paper.title, + getattr(paper, "journal", ""), + paper.doi, + paper.issn, + paper.status, + paper.created_at, + paper.updated_at, + ] + ) output.seek(0) - return send_file(io.BytesIO(output.read().encode('utf-8')), - mimetype='text/csv', - as_attachment=True, - download_name='papers.csv') + return send_file( + io.BytesIO(output.read().encode("utf-8")), + mimetype="text/csv", + as_attachment=True, + download_name="papers.csv", + ) from flask import jsonify, render_template -@bp.route('/papers//detail') +@bp.route("/papers//detail") def paper_detail(paper_id): paper = PaperMetadata.query.get_or_404(paper_id) - return render_template('partials/paper_detail_modal.html', paper=paper) + return render_template("partials/paper_detail_modal.html", paper=paper) @bp.route("/schedule", methods=["GET", "POST"]) @@ -202,10 +225,10 @@ def schedule(): if request.method == "POST": try: # Check if we're updating volume or schedule - if 'total_volume' in request.form: + if "total_volume" in request.form: # Volume update try: - new_volume = float(request.form.get('total_volume', 0)) + new_volume = float(request.form.get("total_volume", 0)) if new_volume <= 0 or new_volume > 1000: raise ValueError("Volume must be between 1 and 1000") @@ -234,7 +257,8 @@ def schedule(): weight = float(request.form.get(key, 0)) if weight < 0 or weight > 5: raise ValueError( - f"Weight for hour {hour} must be between 0 and 5") + f"Weight for hour {hour} must be between 0 and 5" + ) except ValueError: raise ValueError(f"Invalid weight value for hour {hour}") @@ -255,10 +279,17 @@ def schedule(): db.session.rollback() flash(f"Error updating schedule: {str(e)}", "error") - schedule = {sc.hour: sc.weight for sc in ScheduleConfig.query.order_by( - ScheduleConfig.hour).all()} + schedule = { + sc.hour: sc.weight + for sc in ScheduleConfig.query.order_by(ScheduleConfig.hour).all() + } volume = VolumeConfig.query.first() - return render_template("schedule.html", schedule=schedule, volume=volume.volume, app_title="PaperScraper") + return render_template( + "schedule.html", + schedule=schedule, + volume=volume.volume, + app_title="PaperScraper", + ) @bp.route("/logs")