formats
This commit is contained in:
parent
7a41e531bd
commit
d2b99ec0d7
@ -1,8 +1,10 @@
|
|||||||
from flask import Flask
|
from flask import Flask
|
||||||
|
|
||||||
from .config import Config
|
from .config import Config
|
||||||
from .db import db
|
from .db import db
|
||||||
from .models import init_schedule_config
|
from .models import init_schedule_config
|
||||||
|
|
||||||
|
|
||||||
def create_app(test_config=None):
|
def create_app(test_config=None):
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
app.config.from_object(Config)
|
app.config.from_object(Config)
|
||||||
@ -18,9 +20,10 @@ def create_app(test_config=None):
|
|||||||
|
|
||||||
@app.context_processor
|
@app.context_processor
|
||||||
def inject_app_title():
|
def inject_app_title():
|
||||||
return {'app_title': app.config['APP_TITLE']}
|
return {"app_title": app.config["APP_TITLE"]}
|
||||||
|
|
||||||
from . import views
|
from . import views
|
||||||
|
|
||||||
app.register_blueprint(views.bp)
|
app.register_blueprint(views.bp)
|
||||||
|
|
||||||
return app
|
return app
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
SECRET_KEY = os.environ.get('SECRET_KEY', 'dev')
|
SECRET_KEY = os.environ.get("SECRET_KEY", "dev")
|
||||||
SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL', 'sqlite:///papers.db')
|
SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///papers.db")
|
||||||
SQLALCHEMY_TRACK_MODIFICATIONS = False
|
SQLALCHEMY_TRACK_MODIFICATIONS = False
|
||||||
APP_TITLE = os.environ.get('APP_TITLE', 'SciPaperLoader')
|
APP_TITLE = os.environ.get("APP_TITLE", "SciPaperLoader")
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
from .db import db
|
from .db import db
|
||||||
|
|
||||||
|
|
||||||
class PaperMetadata(db.Model):
|
class PaperMetadata(db.Model):
|
||||||
id = db.Column(db.Integer, primary_key=True)
|
id = db.Column(db.Integer, primary_key=True)
|
||||||
title = db.Column(db.Text)
|
title = db.Column(db.Text)
|
||||||
@ -13,17 +14,24 @@ class PaperMetadata(db.Model):
|
|||||||
file_path = db.Column(db.Text)
|
file_path = db.Column(db.Text)
|
||||||
error_msg = db.Column(db.Text)
|
error_msg = db.Column(db.Text)
|
||||||
created_at = db.Column(db.DateTime, default=db.func.current_timestamp())
|
created_at = db.Column(db.DateTime, default=db.func.current_timestamp())
|
||||||
updated_at = db.Column(db.DateTime, default=db.func.current_timestamp(), onupdate=db.func.current_timestamp())
|
updated_at = db.Column(
|
||||||
|
db.DateTime,
|
||||||
|
default=db.func.current_timestamp(),
|
||||||
|
onupdate=db.func.current_timestamp(),
|
||||||
|
)
|
||||||
# plus maybe timestamps for created/updated
|
# plus maybe timestamps for created/updated
|
||||||
|
|
||||||
|
|
||||||
class ScheduleConfig(db.Model):
|
class ScheduleConfig(db.Model):
|
||||||
hour = db.Column(db.Integer, primary_key=True) # 0-23
|
hour = db.Column(db.Integer, primary_key=True) # 0-23
|
||||||
weight = db.Column(db.Float) # weight
|
weight = db.Column(db.Float) # weight
|
||||||
|
|
||||||
|
|
||||||
class VolumeConfig(db.Model):
|
class VolumeConfig(db.Model):
|
||||||
id = db.Column(db.Integer, primary_key=True)
|
id = db.Column(db.Integer, primary_key=True)
|
||||||
volume = db.Column(db.Float) # volume of papers to scrape per day
|
volume = db.Column(db.Float) # volume of papers to scrape per day
|
||||||
|
|
||||||
|
|
||||||
def init_schedule_config():
|
def init_schedule_config():
|
||||||
"""Initialize ScheduleConfig with default values if empty"""
|
"""Initialize ScheduleConfig with default values if empty"""
|
||||||
if ScheduleConfig.query.count() == 0:
|
if ScheduleConfig.query.count() == 0:
|
||||||
@ -39,7 +47,7 @@ def init_schedule_config():
|
|||||||
# Evening hours (medium volume)
|
# Evening hours (medium volume)
|
||||||
*[(hour, 0.5) for hour in range(17, 21)],
|
*[(hour, 0.5) for hour in range(17, 21)],
|
||||||
# Late evening (high volume)
|
# Late evening (high volume)
|
||||||
*[(hour, 0.8) for hour in range(21, 24)]
|
*[(hour, 0.8) for hour in range(21, 24)],
|
||||||
]
|
]
|
||||||
|
|
||||||
for hour, weight in default_schedule:
|
for hour, weight in default_schedule:
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
import time
|
import time
|
||||||
|
|
||||||
from .db import db
|
from .db import db
|
||||||
from .models import PaperMetadata
|
from .models import PaperMetadata
|
||||||
|
|
||||||
|
|
||||||
def run_scraper():
|
def run_scraper():
|
||||||
while True:
|
while True:
|
||||||
with db.app.app_context():
|
with db.app.app_context():
|
||||||
|
@ -1,15 +1,26 @@
|
|||||||
from flask import Blueprint, render_template, current_app, request, flash, redirect, url_for, send_file
|
|
||||||
from .models import ScheduleConfig, VolumeConfig, PaperMetadata
|
|
||||||
from .db import db
|
|
||||||
import pandas as pd
|
|
||||||
from io import StringIO
|
|
||||||
import codecs
|
import codecs
|
||||||
|
import csv
|
||||||
import datetime
|
import datetime
|
||||||
import io
|
import io
|
||||||
import csv
|
from io import StringIO
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from flask import (
|
||||||
|
Blueprint,
|
||||||
|
current_app,
|
||||||
|
flash,
|
||||||
|
redirect,
|
||||||
|
render_template,
|
||||||
|
request,
|
||||||
|
send_file,
|
||||||
|
url_for,
|
||||||
|
)
|
||||||
from sqlalchemy import asc, desc
|
from sqlalchemy import asc, desc
|
||||||
|
|
||||||
bp = Blueprint('main', __name__)
|
from .db import db
|
||||||
|
from .models import PaperMetadata, ScheduleConfig, VolumeConfig
|
||||||
|
|
||||||
|
bp = Blueprint("main", __name__)
|
||||||
|
|
||||||
|
|
||||||
@bp.route("/")
|
@bp.route("/")
|
||||||
@ -20,25 +31,27 @@ def index():
|
|||||||
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
|
REQUIRED_COLUMNS = {"alternative_id", "journal", "doi", "issn", "title"}
|
||||||
|
|
||||||
|
|
||||||
@bp.route('/upload', methods=['GET', 'POST'])
|
@bp.route("/upload", methods=["GET", "POST"])
|
||||||
def upload():
|
def upload():
|
||||||
if request.method == 'POST':
|
if request.method == "POST":
|
||||||
file = request.files.get('file')
|
file = request.files.get("file")
|
||||||
delimiter = request.form.get('delimiter', ',')
|
delimiter = request.form.get("delimiter", ",")
|
||||||
|
|
||||||
if not file:
|
if not file:
|
||||||
return render_template('upload.html', error="No file selected.")
|
return render_template("upload.html", error="No file selected.")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
stream = codecs.iterdecode(file.stream, 'utf-8')
|
stream = codecs.iterdecode(file.stream, "utf-8")
|
||||||
content = ''.join(stream)
|
content = "".join(stream)
|
||||||
df = pd.read_csv(StringIO(content), delimiter=delimiter)
|
df = pd.read_csv(StringIO(content), delimiter=delimiter)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return render_template('upload.html', error=f"Failed to read CSV file: {e}")
|
return render_template("upload.html", error=f"Failed to read CSV file: {e}")
|
||||||
|
|
||||||
missing = REQUIRED_COLUMNS - set(df.columns)
|
missing = REQUIRED_COLUMNS - set(df.columns)
|
||||||
if missing:
|
if missing:
|
||||||
return render_template('upload.html', error=f"Missing required columns: {', '.join(missing)}")
|
return render_template(
|
||||||
|
"upload.html", error=f"Missing required columns: {', '.join(missing)}"
|
||||||
|
)
|
||||||
|
|
||||||
# Optional: parse 'published_online' to date
|
# Optional: parse 'published_online' to date
|
||||||
def parse_date(val):
|
def parse_date(val):
|
||||||
@ -51,16 +64,16 @@ def upload():
|
|||||||
|
|
||||||
for _, row in df.iterrows():
|
for _, row in df.iterrows():
|
||||||
metadata = PaperMetadata(
|
metadata = PaperMetadata(
|
||||||
title=row['title'],
|
title=row["title"],
|
||||||
doi=row['doi'],
|
doi=row["doi"],
|
||||||
alt_id=row.get('alternative_id'),
|
alt_id=row.get("alternative_id"),
|
||||||
issn=row['issn'],
|
issn=row["issn"],
|
||||||
type=row.get('type'),
|
type=row.get("type"),
|
||||||
language=row.get('language'),
|
language=row.get("language"),
|
||||||
published_online=parse_date(row.get('published_online')),
|
published_online=parse_date(row.get("published_online")),
|
||||||
status="New",
|
status="New",
|
||||||
file_path=None,
|
file_path=None,
|
||||||
error_msg=None
|
error_msg=None,
|
||||||
)
|
)
|
||||||
db.session.add(metadata)
|
db.session.add(metadata)
|
||||||
|
|
||||||
@ -68,26 +81,30 @@ def upload():
|
|||||||
db.session.commit()
|
db.session.commit()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
db.session.rollback()
|
db.session.rollback()
|
||||||
return render_template('upload.html', error=f"Failed to save data to database: {e}")
|
return render_template(
|
||||||
|
"upload.html", error=f"Failed to save data to database: {e}"
|
||||||
|
)
|
||||||
|
|
||||||
return render_template('upload.html', success="File uploaded and validated successfully!")
|
return render_template(
|
||||||
|
"upload.html", success="File uploaded and validated successfully!"
|
||||||
|
)
|
||||||
|
|
||||||
return render_template('upload.html')
|
return render_template("upload.html")
|
||||||
|
|
||||||
|
|
||||||
@bp.route('/papers')
|
@bp.route("/papers")
|
||||||
def list_papers():
|
def list_papers():
|
||||||
page = request.args.get('page', 1, type=int)
|
page = request.args.get("page", 1, type=int)
|
||||||
per_page = 50
|
per_page = 50
|
||||||
|
|
||||||
# Filters
|
# Filters
|
||||||
status = request.args.get('status')
|
status = request.args.get("status")
|
||||||
created_from = request.args.get('created_from')
|
created_from = request.args.get("created_from")
|
||||||
created_to = request.args.get('created_to')
|
created_to = request.args.get("created_to")
|
||||||
updated_from = request.args.get('updated_from')
|
updated_from = request.args.get("updated_from")
|
||||||
updated_to = request.args.get('updated_to')
|
updated_to = request.args.get("updated_to")
|
||||||
sort_by = request.args.get('sort_by', 'created_at')
|
sort_by = request.args.get("sort_by", "created_at")
|
||||||
sort_dir = request.args.get('sort_dir', 'desc')
|
sort_dir = request.args.get("sort_dir", "desc")
|
||||||
|
|
||||||
query = PaperMetadata.query
|
query = PaperMetadata.query
|
||||||
|
|
||||||
@ -97,8 +114,9 @@ def list_papers():
|
|||||||
|
|
||||||
def parse_date(val):
|
def parse_date(val):
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return datetime.strptime(val, '%Y-%m-%d')
|
return datetime.strptime(val, "%Y-%m-%d")
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -113,7 +131,7 @@ def list_papers():
|
|||||||
|
|
||||||
# Sorting
|
# Sorting
|
||||||
sort_col = getattr(PaperMetadata, sort_by, PaperMetadata.created_at)
|
sort_col = getattr(PaperMetadata, sort_by, PaperMetadata.created_at)
|
||||||
sort_func = desc if sort_dir == 'desc' else asc
|
sort_func = desc if sort_dir == "desc" else asc
|
||||||
query = query.order_by(sort_func(sort_col))
|
query = query.order_by(sort_func(sort_col))
|
||||||
|
|
||||||
# Pagination
|
# Pagination
|
||||||
@ -129,7 +147,7 @@ def list_papers():
|
|||||||
status_counts = {status: count for status, count in status_counts}
|
status_counts = {status: count for status, count in status_counts}
|
||||||
|
|
||||||
return render_template(
|
return render_template(
|
||||||
'papers.html',
|
"papers.html",
|
||||||
papers=pagination.items,
|
papers=pagination.items,
|
||||||
pagination=pagination,
|
pagination=pagination,
|
||||||
total_papers=total_papers,
|
total_papers=total_papers,
|
||||||
@ -139,18 +157,18 @@ def list_papers():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@bp.route('/papers/export')
|
@bp.route("/papers/export")
|
||||||
def export_papers():
|
def export_papers():
|
||||||
query = PaperMetadata.query
|
query = PaperMetadata.query
|
||||||
|
|
||||||
# Filters
|
# Filters
|
||||||
status = request.args.get('status')
|
status = request.args.get("status")
|
||||||
created_from = request.args.get('created_from')
|
created_from = request.args.get("created_from")
|
||||||
created_to = request.args.get('created_to')
|
created_to = request.args.get("created_to")
|
||||||
updated_from = request.args.get('updated_from')
|
updated_from = request.args.get("updated_from")
|
||||||
updated_to = request.args.get('updated_to')
|
updated_to = request.args.get("updated_to")
|
||||||
sort_by = request.args.get('sort_by', 'created_at')
|
sort_by = request.args.get("sort_by", "created_at")
|
||||||
sort_dir = request.args.get('sort_dir', 'desc')
|
sort_dir = request.args.get("sort_dir", "desc")
|
||||||
|
|
||||||
query = PaperMetadata.query
|
query = PaperMetadata.query
|
||||||
|
|
||||||
@ -166,35 +184,40 @@ def export_papers():
|
|||||||
|
|
||||||
output = io.StringIO()
|
output = io.StringIO()
|
||||||
writer = csv.writer(output)
|
writer = csv.writer(output)
|
||||||
writer.writerow(['ID', 'Title', 'Journal', 'DOI', 'ISSN',
|
writer.writerow(
|
||||||
'Status', 'Created At', 'Updated At'])
|
["ID", "Title", "Journal", "DOI", "ISSN", "Status", "Created At", "Updated At"]
|
||||||
|
)
|
||||||
|
|
||||||
for paper in query:
|
for paper in query:
|
||||||
writer.writerow([
|
writer.writerow(
|
||||||
|
[
|
||||||
paper.id,
|
paper.id,
|
||||||
paper.title,
|
paper.title,
|
||||||
getattr(paper, 'journal', ''),
|
getattr(paper, "journal", ""),
|
||||||
paper.doi,
|
paper.doi,
|
||||||
paper.issn,
|
paper.issn,
|
||||||
paper.status,
|
paper.status,
|
||||||
paper.created_at,
|
paper.created_at,
|
||||||
paper.updated_at
|
paper.updated_at,
|
||||||
])
|
]
|
||||||
|
)
|
||||||
|
|
||||||
output.seek(0)
|
output.seek(0)
|
||||||
return send_file(io.BytesIO(output.read().encode('utf-8')),
|
return send_file(
|
||||||
mimetype='text/csv',
|
io.BytesIO(output.read().encode("utf-8")),
|
||||||
|
mimetype="text/csv",
|
||||||
as_attachment=True,
|
as_attachment=True,
|
||||||
download_name='papers.csv')
|
download_name="papers.csv",
|
||||||
|
)
|
||||||
|
|
||||||
from flask import jsonify, render_template
|
from flask import jsonify, render_template
|
||||||
|
|
||||||
|
|
||||||
@bp.route('/papers/<int:paper_id>/detail')
|
@bp.route("/papers/<int:paper_id>/detail")
|
||||||
def paper_detail(paper_id):
|
def paper_detail(paper_id):
|
||||||
paper = PaperMetadata.query.get_or_404(paper_id)
|
paper = PaperMetadata.query.get_or_404(paper_id)
|
||||||
|
|
||||||
return render_template('partials/paper_detail_modal.html', paper=paper)
|
return render_template("partials/paper_detail_modal.html", paper=paper)
|
||||||
|
|
||||||
|
|
||||||
@bp.route("/schedule", methods=["GET", "POST"])
|
@bp.route("/schedule", methods=["GET", "POST"])
|
||||||
@ -202,10 +225,10 @@ def schedule():
|
|||||||
if request.method == "POST":
|
if request.method == "POST":
|
||||||
try:
|
try:
|
||||||
# Check if we're updating volume or schedule
|
# Check if we're updating volume or schedule
|
||||||
if 'total_volume' in request.form:
|
if "total_volume" in request.form:
|
||||||
# Volume update
|
# Volume update
|
||||||
try:
|
try:
|
||||||
new_volume = float(request.form.get('total_volume', 0))
|
new_volume = float(request.form.get("total_volume", 0))
|
||||||
if new_volume <= 0 or new_volume > 1000:
|
if new_volume <= 0 or new_volume > 1000:
|
||||||
raise ValueError("Volume must be between 1 and 1000")
|
raise ValueError("Volume must be between 1 and 1000")
|
||||||
|
|
||||||
@ -234,7 +257,8 @@ def schedule():
|
|||||||
weight = float(request.form.get(key, 0))
|
weight = float(request.form.get(key, 0))
|
||||||
if weight < 0 or weight > 5:
|
if weight < 0 or weight > 5:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Weight for hour {hour} must be between 0 and 5")
|
f"Weight for hour {hour} must be between 0 and 5"
|
||||||
|
)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
raise ValueError(f"Invalid weight value for hour {hour}")
|
raise ValueError(f"Invalid weight value for hour {hour}")
|
||||||
|
|
||||||
@ -255,10 +279,17 @@ def schedule():
|
|||||||
db.session.rollback()
|
db.session.rollback()
|
||||||
flash(f"Error updating schedule: {str(e)}", "error")
|
flash(f"Error updating schedule: {str(e)}", "error")
|
||||||
|
|
||||||
schedule = {sc.hour: sc.weight for sc in ScheduleConfig.query.order_by(
|
schedule = {
|
||||||
ScheduleConfig.hour).all()}
|
sc.hour: sc.weight
|
||||||
|
for sc in ScheduleConfig.query.order_by(ScheduleConfig.hour).all()
|
||||||
|
}
|
||||||
volume = VolumeConfig.query.first()
|
volume = VolumeConfig.query.first()
|
||||||
return render_template("schedule.html", schedule=schedule, volume=volume.volume, app_title="PaperScraper")
|
return render_template(
|
||||||
|
"schedule.html",
|
||||||
|
schedule=schedule,
|
||||||
|
volume=volume.volume,
|
||||||
|
app_title="PaperScraper",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@bp.route("/logs")
|
@bp.route("/logs")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user