formats
This commit is contained in:
parent
7a41e531bd
commit
d2b99ec0d7
@ -1,8 +1,10 @@
|
||||
from flask import Flask
|
||||
|
||||
from .config import Config
|
||||
from .db import db
|
||||
from .models import init_schedule_config
|
||||
|
||||
|
||||
def create_app(test_config=None):
|
||||
app = Flask(__name__)
|
||||
app.config.from_object(Config)
|
||||
@ -18,9 +20,10 @@ def create_app(test_config=None):
|
||||
|
||||
@app.context_processor
|
||||
def inject_app_title():
|
||||
return {'app_title': app.config['APP_TITLE']}
|
||||
return {"app_title": app.config["APP_TITLE"]}
|
||||
|
||||
from . import views
|
||||
|
||||
app.register_blueprint(views.bp)
|
||||
|
||||
return app
|
||||
|
@ -1,7 +1,8 @@
|
||||
import os
|
||||
|
||||
|
||||
class Config:
|
||||
SECRET_KEY = os.environ.get('SECRET_KEY', 'dev')
|
||||
SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL', 'sqlite:///papers.db')
|
||||
SECRET_KEY = os.environ.get("SECRET_KEY", "dev")
|
||||
SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///papers.db")
|
||||
SQLALCHEMY_TRACK_MODIFICATIONS = False
|
||||
APP_TITLE = os.environ.get('APP_TITLE', 'SciPaperLoader')
|
||||
APP_TITLE = os.environ.get("APP_TITLE", "SciPaperLoader")
|
||||
|
@ -1,5 +1,6 @@
|
||||
from .db import db
|
||||
|
||||
|
||||
class PaperMetadata(db.Model):
|
||||
id = db.Column(db.Integer, primary_key=True)
|
||||
title = db.Column(db.Text)
|
||||
@ -9,21 +10,28 @@ class PaperMetadata(db.Model):
|
||||
type = db.Column(db.String(50))
|
||||
language = db.Column(db.String(50))
|
||||
published_online = db.Column(db.Date) # or DateTime/String
|
||||
status = db.Column(db.String(10)) # 'Pending','Done','Failed'
|
||||
status = db.Column(db.String(10)) # 'Pending','Done','Failed'
|
||||
file_path = db.Column(db.Text)
|
||||
error_msg = db.Column(db.Text)
|
||||
created_at = db.Column(db.DateTime, default=db.func.current_timestamp())
|
||||
updated_at = db.Column(db.DateTime, default=db.func.current_timestamp(), onupdate=db.func.current_timestamp())
|
||||
updated_at = db.Column(
|
||||
db.DateTime,
|
||||
default=db.func.current_timestamp(),
|
||||
onupdate=db.func.current_timestamp(),
|
||||
)
|
||||
# plus maybe timestamps for created/updated
|
||||
|
||||
|
||||
class ScheduleConfig(db.Model):
|
||||
hour = db.Column(db.Integer, primary_key=True) # 0-23
|
||||
weight = db.Column(db.Float) # weight
|
||||
|
||||
|
||||
class VolumeConfig(db.Model):
|
||||
id = db.Column(db.Integer, primary_key=True)
|
||||
volume = db.Column(db.Float) # volume of papers to scrape per day
|
||||
|
||||
|
||||
def init_schedule_config():
|
||||
"""Initialize ScheduleConfig with default values if empty"""
|
||||
if ScheduleConfig.query.count() == 0:
|
||||
@ -39,7 +47,7 @@ def init_schedule_config():
|
||||
# Evening hours (medium volume)
|
||||
*[(hour, 0.5) for hour in range(17, 21)],
|
||||
# Late evening (high volume)
|
||||
*[(hour, 0.8) for hour in range(21, 24)]
|
||||
*[(hour, 0.8) for hour in range(21, 24)],
|
||||
]
|
||||
|
||||
for hour, weight in default_schedule:
|
||||
|
@ -1,7 +1,9 @@
|
||||
import time
|
||||
|
||||
from .db import db
|
||||
from .models import PaperMetadata
|
||||
|
||||
|
||||
def run_scraper():
|
||||
while True:
|
||||
with db.app.app_context():
|
||||
|
@ -1,15 +1,26 @@
|
||||
from flask import Blueprint, render_template, current_app, request, flash, redirect, url_for, send_file
|
||||
from .models import ScheduleConfig, VolumeConfig, PaperMetadata
|
||||
from .db import db
|
||||
import pandas as pd
|
||||
from io import StringIO
|
||||
import codecs
|
||||
import csv
|
||||
import datetime
|
||||
import io
|
||||
import csv
|
||||
from io import StringIO
|
||||
|
||||
import pandas as pd
|
||||
from flask import (
|
||||
Blueprint,
|
||||
current_app,
|
||||
flash,
|
||||
redirect,
|
||||
render_template,
|
||||
request,
|
||||
send_file,
|
||||
url_for,
|
||||
)
|
||||
from sqlalchemy import asc, desc
|
||||
|
||||
bp = Blueprint('main', __name__)
|
||||
from .db import db
|
||||
from .models import PaperMetadata, ScheduleConfig, VolumeConfig
|
||||
|
||||
bp = Blueprint("main", __name__)
|
||||
|
||||
|
||||
@bp.route("/")
|
||||
@ -20,25 +31,27 @@ def index():
|
||||
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
|
||||
|
||||
|
||||
@bp.route('/upload', methods=['GET', 'POST'])
|
||||
@bp.route("/upload", methods=["GET", "POST"])
|
||||
def upload():
|
||||
if request.method == 'POST':
|
||||
file = request.files.get('file')
|
||||
delimiter = request.form.get('delimiter', ',')
|
||||
if request.method == "POST":
|
||||
file = request.files.get("file")
|
||||
delimiter = request.form.get("delimiter", ",")
|
||||
|
||||
if not file:
|
||||
return render_template('upload.html', error="No file selected.")
|
||||
return render_template("upload.html", error="No file selected.")
|
||||
|
||||
try:
|
||||
stream = codecs.iterdecode(file.stream, 'utf-8')
|
||||
content = ''.join(stream)
|
||||
stream = codecs.iterdecode(file.stream, "utf-8")
|
||||
content = "".join(stream)
|
||||
df = pd.read_csv(StringIO(content), delimiter=delimiter)
|
||||
except Exception as e:
|
||||
return render_template('upload.html', error=f"Failed to read CSV file: {e}")
|
||||
return render_template("upload.html", error=f"Failed to read CSV file: {e}")
|
||||
|
||||
missing = REQUIRED_COLUMNS - set(df.columns)
|
||||
if missing:
|
||||
return render_template('upload.html', error=f"Missing required columns: {', '.join(missing)}")
|
||||
return render_template(
|
||||
"upload.html", error=f"Missing required columns: {', '.join(missing)}"
|
||||
)
|
||||
|
||||
# Optional: parse 'published_online' to date
|
||||
def parse_date(val):
|
||||
@ -51,16 +64,16 @@ def upload():
|
||||
|
||||
for _, row in df.iterrows():
|
||||
metadata = PaperMetadata(
|
||||
title=row['title'],
|
||||
doi=row['doi'],
|
||||
alt_id=row.get('alternative_id'),
|
||||
issn=row['issn'],
|
||||
type=row.get('type'),
|
||||
language=row.get('language'),
|
||||
published_online=parse_date(row.get('published_online')),
|
||||
title=row["title"],
|
||||
doi=row["doi"],
|
||||
alt_id=row.get("alternative_id"),
|
||||
issn=row["issn"],
|
||||
type=row.get("type"),
|
||||
language=row.get("language"),
|
||||
published_online=parse_date(row.get("published_online")),
|
||||
status="New",
|
||||
file_path=None,
|
||||
error_msg=None
|
||||
error_msg=None,
|
||||
)
|
||||
db.session.add(metadata)
|
||||
|
||||
@ -68,26 +81,30 @@ def upload():
|
||||
db.session.commit()
|
||||
except Exception as e:
|
||||
db.session.rollback()
|
||||
return render_template('upload.html', error=f"Failed to save data to database: {e}")
|
||||
return render_template(
|
||||
"upload.html", error=f"Failed to save data to database: {e}"
|
||||
)
|
||||
|
||||
return render_template('upload.html', success="File uploaded and validated successfully!")
|
||||
return render_template(
|
||||
"upload.html", success="File uploaded and validated successfully!"
|
||||
)
|
||||
|
||||
return render_template('upload.html')
|
||||
return render_template("upload.html")
|
||||
|
||||
|
||||
@bp.route('/papers')
|
||||
@bp.route("/papers")
|
||||
def list_papers():
|
||||
page = request.args.get('page', 1, type=int)
|
||||
page = request.args.get("page", 1, type=int)
|
||||
per_page = 50
|
||||
|
||||
# Filters
|
||||
status = request.args.get('status')
|
||||
created_from = request.args.get('created_from')
|
||||
created_to = request.args.get('created_to')
|
||||
updated_from = request.args.get('updated_from')
|
||||
updated_to = request.args.get('updated_to')
|
||||
sort_by = request.args.get('sort_by', 'created_at')
|
||||
sort_dir = request.args.get('sort_dir', 'desc')
|
||||
status = request.args.get("status")
|
||||
created_from = request.args.get("created_from")
|
||||
created_to = request.args.get("created_to")
|
||||
updated_from = request.args.get("updated_from")
|
||||
updated_to = request.args.get("updated_to")
|
||||
sort_by = request.args.get("sort_by", "created_at")
|
||||
sort_dir = request.args.get("sort_dir", "desc")
|
||||
|
||||
query = PaperMetadata.query
|
||||
|
||||
@ -97,8 +114,9 @@ def list_papers():
|
||||
|
||||
def parse_date(val):
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
return datetime.strptime(val, '%Y-%m-%d')
|
||||
return datetime.strptime(val, "%Y-%m-%d")
|
||||
except (ValueError, TypeError):
|
||||
return None
|
||||
|
||||
@ -113,7 +131,7 @@ def list_papers():
|
||||
|
||||
# Sorting
|
||||
sort_col = getattr(PaperMetadata, sort_by, PaperMetadata.created_at)
|
||||
sort_func = desc if sort_dir == 'desc' else asc
|
||||
sort_func = desc if sort_dir == "desc" else asc
|
||||
query = query.order_by(sort_func(sort_col))
|
||||
|
||||
# Pagination
|
||||
@ -129,7 +147,7 @@ def list_papers():
|
||||
status_counts = {status: count for status, count in status_counts}
|
||||
|
||||
return render_template(
|
||||
'papers.html',
|
||||
"papers.html",
|
||||
papers=pagination.items,
|
||||
pagination=pagination,
|
||||
total_papers=total_papers,
|
||||
@ -139,18 +157,18 @@ def list_papers():
|
||||
)
|
||||
|
||||
|
||||
@bp.route('/papers/export')
|
||||
@bp.route("/papers/export")
|
||||
def export_papers():
|
||||
query = PaperMetadata.query
|
||||
|
||||
# Filters
|
||||
status = request.args.get('status')
|
||||
created_from = request.args.get('created_from')
|
||||
created_to = request.args.get('created_to')
|
||||
updated_from = request.args.get('updated_from')
|
||||
updated_to = request.args.get('updated_to')
|
||||
sort_by = request.args.get('sort_by', 'created_at')
|
||||
sort_dir = request.args.get('sort_dir', 'desc')
|
||||
status = request.args.get("status")
|
||||
created_from = request.args.get("created_from")
|
||||
created_to = request.args.get("created_to")
|
||||
updated_from = request.args.get("updated_from")
|
||||
updated_to = request.args.get("updated_to")
|
||||
sort_by = request.args.get("sort_by", "created_at")
|
||||
sort_dir = request.args.get("sort_dir", "desc")
|
||||
|
||||
query = PaperMetadata.query
|
||||
|
||||
@ -166,35 +184,40 @@ def export_papers():
|
||||
|
||||
output = io.StringIO()
|
||||
writer = csv.writer(output)
|
||||
writer.writerow(['ID', 'Title', 'Journal', 'DOI', 'ISSN',
|
||||
'Status', 'Created At', 'Updated At'])
|
||||
writer.writerow(
|
||||
["ID", "Title", "Journal", "DOI", "ISSN", "Status", "Created At", "Updated At"]
|
||||
)
|
||||
|
||||
for paper in query:
|
||||
writer.writerow([
|
||||
paper.id,
|
||||
paper.title,
|
||||
getattr(paper, 'journal', ''),
|
||||
paper.doi,
|
||||
paper.issn,
|
||||
paper.status,
|
||||
paper.created_at,
|
||||
paper.updated_at
|
||||
])
|
||||
writer.writerow(
|
||||
[
|
||||
paper.id,
|
||||
paper.title,
|
||||
getattr(paper, "journal", ""),
|
||||
paper.doi,
|
||||
paper.issn,
|
||||
paper.status,
|
||||
paper.created_at,
|
||||
paper.updated_at,
|
||||
]
|
||||
)
|
||||
|
||||
output.seek(0)
|
||||
return send_file(io.BytesIO(output.read().encode('utf-8')),
|
||||
mimetype='text/csv',
|
||||
as_attachment=True,
|
||||
download_name='papers.csv')
|
||||
return send_file(
|
||||
io.BytesIO(output.read().encode("utf-8")),
|
||||
mimetype="text/csv",
|
||||
as_attachment=True,
|
||||
download_name="papers.csv",
|
||||
)
|
||||
|
||||
from flask import jsonify, render_template
|
||||
|
||||
|
||||
@bp.route('/papers/<int:paper_id>/detail')
|
||||
@bp.route("/papers/<int:paper_id>/detail")
|
||||
def paper_detail(paper_id):
|
||||
paper = PaperMetadata.query.get_or_404(paper_id)
|
||||
|
||||
return render_template('partials/paper_detail_modal.html', paper=paper)
|
||||
return render_template("partials/paper_detail_modal.html", paper=paper)
|
||||
|
||||
|
||||
@bp.route("/schedule", methods=["GET", "POST"])
|
||||
@ -202,10 +225,10 @@ def schedule():
|
||||
if request.method == "POST":
|
||||
try:
|
||||
# Check if we're updating volume or schedule
|
||||
if 'total_volume' in request.form:
|
||||
if "total_volume" in request.form:
|
||||
# Volume update
|
||||
try:
|
||||
new_volume = float(request.form.get('total_volume', 0))
|
||||
new_volume = float(request.form.get("total_volume", 0))
|
||||
if new_volume <= 0 or new_volume > 1000:
|
||||
raise ValueError("Volume must be between 1 and 1000")
|
||||
|
||||
@ -234,7 +257,8 @@ def schedule():
|
||||
weight = float(request.form.get(key, 0))
|
||||
if weight < 0 or weight > 5:
|
||||
raise ValueError(
|
||||
f"Weight for hour {hour} must be between 0 and 5")
|
||||
f"Weight for hour {hour} must be between 0 and 5"
|
||||
)
|
||||
except ValueError:
|
||||
raise ValueError(f"Invalid weight value for hour {hour}")
|
||||
|
||||
@ -255,10 +279,17 @@ def schedule():
|
||||
db.session.rollback()
|
||||
flash(f"Error updating schedule: {str(e)}", "error")
|
||||
|
||||
schedule = {sc.hour: sc.weight for sc in ScheduleConfig.query.order_by(
|
||||
ScheduleConfig.hour).all()}
|
||||
schedule = {
|
||||
sc.hour: sc.weight
|
||||
for sc in ScheduleConfig.query.order_by(ScheduleConfig.hour).all()
|
||||
}
|
||||
volume = VolumeConfig.query.first()
|
||||
return render_template("schedule.html", schedule=schedule, volume=volume.volume, app_title="PaperScraper")
|
||||
return render_template(
|
||||
"schedule.html",
|
||||
schedule=schedule,
|
||||
volume=volume.volume,
|
||||
app_title="PaperScraper",
|
||||
)
|
||||
|
||||
|
||||
@bp.route("/logs")
|
||||
|
Loading…
x
Reference in New Issue
Block a user