diff --git a/scipaperloader/__init__.py b/scipaperloader/__init__.py index 534e9c4..8fbdff7 100644 --- a/scipaperloader/__init__.py +++ b/scipaperloader/__init__.py @@ -3,6 +3,7 @@ from flask import Flask from .config import Config from .db import db from .models import init_schedule_config +from .blueprints import register_blueprints def create_app(test_config=None): @@ -22,8 +23,6 @@ def create_app(test_config=None): def inject_app_title(): return {"app_title": app.config["APP_TITLE"]} - from . import views - - app.register_blueprint(views.bp) + register_blueprints(app) return app diff --git a/scipaperloader/blueprints/__init__.py b/scipaperloader/blueprints/__init__.py new file mode 100644 index 0000000..c3b1eac --- /dev/null +++ b/scipaperloader/blueprints/__init__.py @@ -0,0 +1,15 @@ +"""Blueprint registration module.""" +from flask import Flask + +from .main import bp as main_bp +from .papers import bp as papers_bp +from .upload import bp as upload_bp +from .schedule import bp as schedule_bp + + +def register_blueprints(app: Flask): + """Register all blueprints with the Flask application.""" + app.register_blueprint(main_bp) + app.register_blueprint(papers_bp, url_prefix='/papers') + app.register_blueprint(upload_bp, url_prefix='/upload') + app.register_blueprint(schedule_bp, url_prefix='/schedule') \ No newline at end of file diff --git a/scipaperloader/blueprints/main.py b/scipaperloader/blueprints/main.py new file mode 100644 index 0000000..be4cd58 --- /dev/null +++ b/scipaperloader/blueprints/main.py @@ -0,0 +1,19 @@ +"""Main routes for the application.""" +from flask import Blueprint, render_template + +bp = Blueprint("main", __name__) + + +@bp.route("/") +def index(): + return render_template("index.html") + + +@bp.route("/logs") +def logs(): + return render_template("logs.html", app_title="PaperScraper") + + +@bp.route("/about") +def about(): + return render_template("about.html", app_title="PaperScraper") \ No newline at end of file diff --git a/scipaperloader/blueprints/papers.py b/scipaperloader/blueprints/papers.py new file mode 100644 index 0000000..890271e --- /dev/null +++ b/scipaperloader/blueprints/papers.py @@ -0,0 +1,140 @@ +"""Paper management routes.""" +import csv +import datetime +import io + +from flask import ( + Blueprint, + render_template, + request, + send_file, +) +from sqlalchemy import asc, desc + +from ..db import db +from ..models import PaperMetadata + +bp = Blueprint("papers", __name__) + + +@bp.route("/") +def list_papers(): + page = request.args.get("page", 1, type=int) + per_page = 50 + + # Filters + status = request.args.get("status") + created_from = request.args.get("created_from") + created_to = request.args.get("created_to") + updated_from = request.args.get("updated_from") + updated_to = request.args.get("updated_to") + sort_by = request.args.get("sort_by", "created_at") + sort_dir = request.args.get("sort_dir", "desc") + + query = PaperMetadata.query + + # Apply filters + if status: + query = query.filter(PaperMetadata.status == status) + + def parse_date(val): + from datetime import datetime + + try: + return datetime.strptime(val, "%Y-%m-%d") + except (ValueError, TypeError): + return None + + if created_from := parse_date(created_from): + query = query.filter(PaperMetadata.created_at >= created_from) + if created_to := parse_date(created_to): + query = query.filter(PaperMetadata.created_at <= created_to) + if updated_from := parse_date(updated_from): + query = query.filter(PaperMetadata.updated_at >= updated_from) + if updated_to := parse_date(updated_to): + query = query.filter(PaperMetadata.updated_at <= updated_to) + + # Sorting + sort_col = getattr(PaperMetadata, sort_by, PaperMetadata.created_at) + sort_func = desc if sort_dir == "desc" else asc + query = query.order_by(sort_func(sort_col)) + + # Pagination + pagination = query.paginate(page=page, per_page=per_page, error_out=False) + + # Statistics + total_papers = PaperMetadata.query.count() + status_counts = ( + db.session.query(PaperMetadata.status, db.func.count(PaperMetadata.status)) + .group_by(PaperMetadata.status) + .all() + ) + status_counts = {status: count for status, count in status_counts} + + return render_template( + "papers.html", + papers=pagination.items, + pagination=pagination, + total_papers=total_papers, + status_counts=status_counts, + sort_by=sort_by, + sort_dir=sort_dir, + ) + + +@bp.route("/export") +def export_papers(): + # Filters + status = request.args.get("status") + created_from = request.args.get("created_from") + created_to = request.args.get("created_to") + updated_from = request.args.get("updated_from") + updated_to = request.args.get("updated_to") + sort_by = request.args.get("sort_by", "created_at") + sort_dir = request.args.get("sort_dir", "desc") + + query = PaperMetadata.query + + # Apply filters + if status: + query = query.filter(PaperMetadata.status == status) + + def parse_date(val): + try: + return datetime.datetime.strptime(val, "%Y-%m-%d") + except Exception: + return None + + output = io.StringIO() + writer = csv.writer(output) + writer.writerow( + ["ID", "Title", "Journal", "DOI", "ISSN", "Status", "Created At", "Updated At"] + ) + + for paper in query: + writer.writerow( + [ + paper.id, + paper.title, + getattr(paper, "journal", ""), + paper.doi, + paper.issn, + paper.status, + paper.created_at, + paper.updated_at, + ] + ) + + output.seek(0) + return send_file( + io.BytesIO(output.read().encode("utf-8")), + mimetype="text/csv", + as_attachment=True, + download_name="papers.csv", + ) + + +@bp.route("//detail") +def paper_detail(paper_id): + paper = PaperMetadata.query.get_or_404(paper_id) + return render_template("partials/paper_detail_modal.html", paper=paper) \ No newline at end of file diff --git a/scipaperloader/blueprints/schedule.py b/scipaperloader/blueprints/schedule.py new file mode 100644 index 0000000..7d4f878 --- /dev/null +++ b/scipaperloader/blueprints/schedule.py @@ -0,0 +1,79 @@ +"""Schedule configuration routes.""" +from flask import Blueprint, flash, render_template, request + +from ..db import db +from ..models import ScheduleConfig, VolumeConfig + +bp = Blueprint("schedule", __name__) + + +@bp.route("/", methods=["GET", "POST"]) +def schedule(): + if request.method == "POST": + try: + # Check if we're updating volume or schedule + if "total_volume" in request.form: + # Volume update + try: + new_volume = float(request.form.get("total_volume", 0)) + if new_volume <= 0 or new_volume > 1000: + raise ValueError("Volume must be between 1 and 1000") + + volume_config = VolumeConfig.query.first() + if not volume_config: + volume_config = VolumeConfig(volume=new_volume) + db.session.add(volume_config) + else: + volume_config.volume = new_volume + + db.session.commit() + flash("Volume updated successfully!", "success") + + except ValueError as e: + db.session.rollback() + flash(f"Error updating volume: {str(e)}", "error") + else: + # Schedule update logic + # Validate form data + for hour in range(24): + key = f"hour_{hour}" + if key not in request.form: + raise ValueError(f"Missing data for hour {hour}") + + try: + weight = float(request.form.get(key, 0)) + if weight < 0 or weight > 5: + raise ValueError( + f"Weight for hour {hour} must be between 0 and 5" + ) + except ValueError: + raise ValueError(f"Invalid weight value for hour {hour}") + + # Update database if validation passes + for hour in range(24): + key = f"hour_{hour}" + weight = float(request.form.get(key, 0)) + config = ScheduleConfig.query.get(hour) + if config: + config.weight = weight + else: + db.session.add(ScheduleConfig(hour=hour, weight=weight)) + + db.session.commit() + flash("Schedule updated successfully!", "success") + + except ValueError as e: + db.session.rollback() + flash(f"Error updating schedule: {str(e)}", "error") + + schedule = { + sc.hour: sc.weight + for sc in ScheduleConfig.query.order_by(ScheduleConfig.hour).all() + } + volume = VolumeConfig.query.first() + return render_template( + "schedule.html", + schedule=schedule, + volume=volume.volume if volume else 0, + app_title="PaperScraper", + ) \ No newline at end of file diff --git a/scipaperloader/blueprints/upload.py b/scipaperloader/blueprints/upload.py new file mode 100644 index 0000000..07bc7d1 --- /dev/null +++ b/scipaperloader/blueprints/upload.py @@ -0,0 +1,170 @@ +"""Upload functionality for paper metadata.""" +import codecs +import csv +import datetime +from io import StringIO + +import pandas as pd +from flask import ( + Blueprint, + flash, + redirect, + render_template, + request, + send_file, + session, + url_for, +) + +from ..db import db +from ..models import PaperMetadata + +bp = Blueprint("upload", __name__) + +REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"} + + +@bp.route("/", methods=["GET", "POST"]) +def upload(): + if request.method == "POST": + file = request.files.get("file") + delimiter = request.form.get("delimiter", ",") + duplicate_strategy = request.form.get("duplicate_strategy", "skip") + + if not file: + return render_template("upload.html", error="No file selected.") + + try: + stream = codecs.iterdecode(file.stream, "utf-8") + content = "".join(stream) + df = pd.read_csv(StringIO(content), delimiter=delimiter) + except Exception as e: + return render_template("upload.html", error=f"Failed to read CSV file: {e}") + + missing = REQUIRED_COLUMNS - set(df.columns) + if missing: + return render_template( + "upload.html", error=f"Missing required columns: {', '.join(missing)}" + ) + + # Optional: parse 'published_online' to date + def parse_date(val): + if pd.isna(val): + return None + try: + return pd.to_datetime(val).date() + except Exception: + return None + + # Count statistics + added_count = 0 + skipped_count = 0 + updated_count = 0 + error_count = 0 + + # Collect error information + errors = [] + + # Process each row + for index, row in df.iterrows(): + try: + # Get DOI from row for error reporting + doi = str(row.get("doi", "N/A")) + + # Validate required fields + for field in ["title", "doi", "issn"]: + if pd.isna(row.get(field)) or not str(row.get(field)).strip(): + raise ValueError(f"Missing required field: {field}") + + # Check if paper with this DOI already exists + existing = PaperMetadata.query.filter_by(doi=doi).first() + + if existing: + if duplicate_strategy == 'update': + # Update existing record + existing.title = row["title"] + existing.alt_id = row.get("alternative_id") + existing.issn = row["issn"] + existing.journal = row.get("journal") + existing.type = row.get("type") + existing.language = row.get("language") + existing.published_online = parse_date(row.get("published_online")) + updated_count += 1 + else: + # Skip this record + skipped_count += 1 + continue + else: + # Create new record + metadata = PaperMetadata( + title=row["title"], + doi=doi, + alt_id=row.get("alternative_id"), + issn=row["issn"], + journal=row.get("journal"), + type=row.get("type"), + language=row.get("language"), + published_online=parse_date(row.get("published_online")), + status="New", + file_path=None, + error_msg=None, + ) + db.session.add(metadata) + added_count += 1 + + except Exception as e: + error_count += 1 + errors.append({ + "row": index + 2, # +2 because index is 0-based and we have a header row + "doi": row.get("doi", "N/A"), + "error": str(e) + }) + continue # Skip this row and continue with the next + + try: + db.session.commit() + except Exception as e: + db.session.rollback() + return render_template( + "upload.html", error=f"Failed to save data to database: {e}" + ) + + # Prepare error samples for display + error_samples = errors[:5] if errors else [] + + error_message = None + if errors: + error_message = f"Encountered {len(errors)} errors. First 5 shown below." + + # Store the full errors list in the session for potential download + if errors: + error_csv = StringIO() + writer = csv.DictWriter(error_csv, fieldnames=["row", "doi", "error"]) + writer.writeheader() + writer.writerows(errors) + session["error_data"] = error_csv.getvalue() + + return render_template( + "upload.html", + success=f"File processed! Added: {added_count}, Updated: {updated_count}, Skipped: {skipped_count}, Errors: {error_count}", + error_message=error_message, + error_samples=error_samples + ) + + return render_template("upload.html") + + +@bp.route("/download_error_log") +def download_error_log(): + error_data = session.get("error_data") + if not error_data: + flash("No error data available.") + return redirect(url_for("upload.upload")) + + buffer = StringIO(error_data) + return send_file( + buffer, + mimetype="text/csv", + as_attachment=True, + download_name=f"upload_errors_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" + ) \ No newline at end of file diff --git a/scipaperloader/templates/index.html b/scipaperloader/templates/index.html index 252fb0f..e9d28a6 100644 --- a/scipaperloader/templates/index.html +++ b/scipaperloader/templates/index.html @@ -16,7 +16,7 @@ (title, DOI, ISSN, etc.) are stored. Errors are reported without aborting the batch.

- Upload Now + Upload Now @@ -43,7 +43,7 @@ inspect errors. Files are stored on disk in structured folders per DOI.

- Browse Papers @@ -59,7 +59,7 @@ volume (e.g. 2/hour at daytime, 0 at night) to match your bandwidth or usage pattern.

- Adjust Schedule diff --git a/scipaperloader/templates/nav.html b/scipaperloader/templates/nav.html index f1924c2..cf96974 100644 --- a/scipaperloader/templates/nav.html +++ b/scipaperloader/templates/nav.html @@ -17,13 +17,13 @@ @@ -137,44 +137,44 @@ {% for paper in papers %} - + @@ -193,7 +193,7 @@
  • {% set params = request.args.to_dict() %} {% set _ = params.pop('page', None) %} - +
  • @@ -208,7 +208,7 @@
  • {% set params = request.args.to_dict() %} {% set _ = params.pop('page', None) %} - {{ page_num }} + {{ page_num }}
  • {% else %}
  • @@ -219,7 +219,7 @@
  • {% set params = request.args.to_dict() %} {% set _ = params.pop('page', None) %} - +
  • diff --git a/scipaperloader/templates/schedule.html b/scipaperloader/templates/schedule.html index caea561..f60cff8 100644 --- a/scipaperloader/templates/schedule.html +++ b/scipaperloader/templates/schedule.html @@ -120,7 +120,7 @@
    @@ -139,7 +139,7 @@

    Current Schedule

    - +
    {% set params = request.args.to_dict() %} {% set params = params.update({'sort_by': 'title', 'sort_dir': title_sort}) or params %} - Title + Title {% set params = request.args.to_dict() %} {% set params = params.update({'sort_by': 'journal', 'sort_dir': journal_sort}) or params %} - Journal + Journal {% set params = request.args.to_dict() %} {% set params = params.update({'sort_by': 'doi', 'sort_dir': doi_sort}) or params %} - DOI + DOI {% set params = request.args.to_dict() %} {% set params = params.update({'sort_by': 'issn', 'sort_dir': issn_sort}) or params %} - ISSN + ISSN {% set params = request.args.to_dict() %} {% set params = params.update({'sort_by': 'status', 'sort_dir': status_sort}) or params %} - Status + Status {% set params = request.args.to_dict() %} {% set params = params.update({'sort_by': 'created_at', 'sort_dir': created_sort}) or params %} - Created + Created {% set params = request.args.to_dict() %} {% set params = params.update({'sort_by': 'updated_at', 'sort_dir': updated_sort}) or params %} - Updated + Updated
    {{ paper.title }}{{ paper.title }} {{ paper.journal }} {{ paper.doi }} {{ paper.issn }}