Compare commits: 11f086aa64 ... 987c76969b

3 commits: 987c76969b, 012163ba3f, 8f2375215d
@@ -1,6 +1,7 @@
 from datetime import datetime
 from flask import Blueprint, jsonify, request
-from ..models import ActivityLog, ActivityCategory
+from ..models import ActivityLog, ActivityCategory, PaperMetadata
+from .. import db

 bp = Blueprint("api", __name__, url_prefix="/api")

@@ -47,4 +48,91 @@ def get_activity_logs():
         }
         result.append(log_data)

     return jsonify(result)
+
+
+@bp.route("/papers")
+def search_papers():
+    """
+    Search for papers by title, DOI, or ID.
+
+    Query parameters:
+    - query: Search term (required)
+    - limit: Maximum number of results (default: 10)
+    """
+    query = request.args.get('query', '')
+    limit = int(request.args.get('limit', 10))
+
+    if not query:
+        return jsonify({
+            "success": False,
+            "message": "Search query is required",
+            "papers": []
+        })
+
+    # Try to parse query as an ID first
+    try:
+        paper_id = int(query)
+        paper_by_id = PaperMetadata.query.get(paper_id)
+        if paper_by_id:
+            return jsonify({
+                "success": True,
+                "papers": [{
+                    "id": paper_by_id.id,
+                    "title": paper_by_id.title,
+                    "doi": paper_by_id.doi,
+                    "journal": paper_by_id.journal,
+                    "status": paper_by_id.status,
+                    "created_at": paper_by_id.created_at.isoformat() if paper_by_id.created_at else None,
+                    "updated_at": paper_by_id.updated_at.isoformat() if paper_by_id.updated_at else None
+                }]
+            })
+    except ValueError:
+        pass  # Not an ID, continue with text search
+
+    # Search in title and DOI
+    search_term = f"%{query}%"
+    papers = PaperMetadata.query.filter(
+        db.or_(
+            PaperMetadata.title.ilike(search_term),
+            PaperMetadata.doi.ilike(search_term)
+        )
+    ).limit(limit).all()
+
+    return jsonify({
+        "success": True,
+        "papers": [{
+            "id": paper.id,
+            "title": paper.title,
+            "doi": paper.doi,
+            "journal": paper.journal,
+            "status": paper.status,
+            "created_at": paper.created_at.isoformat() if paper.created_at else None,
+            "updated_at": paper.updated_at.isoformat() if paper.updated_at else None
+        } for paper in papers]
+    })
+
+
+@bp.route("/papers/<int:paper_id>")
+def get_paper(paper_id):
+    """Get details of a single paper by ID."""
+    paper = PaperMetadata.query.get(paper_id)
+
+    if not paper:
+        return jsonify({
+            "success": False,
+            "message": f"Paper with ID {paper_id} not found"
+        })
+
+    return jsonify({
+        "success": True,
+        "paper": {
+            "id": paper.id,
+            "title": paper.title,
+            "doi": paper.doi,
+            "journal": paper.journal,
+            "status": paper.status,
+            "error_msg": paper.error_msg,
+            "file_path": paper.file_path,
+            "created_at": paper.created_at.isoformat() if paper.created_at else None,
+            "updated_at": paper.updated_at.isoformat() if paper.updated_at else None
+        }
+    })
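Not part of the diff: a minimal client-side sketch of how the two new endpoints could be exercised, assuming the app is served locally on port 5000 and that the requests package is installed; the query string and paper ID are made-up example values.

import requests

BASE = "http://localhost:5000"  # assumed development host/port

# Free-text search over title and DOI (a bare numeric query is first tried as an ID)
resp = requests.get(f"{BASE}/api/papers", params={"query": "dummy", "limit": 5})
payload = resp.json()
if payload.get("success"):
    for paper in payload["papers"]:
        print(paper["id"], paper["status"], paper["doi"])

# Detail view for a single paper; ID 1 is an example value
print(requests.get(f"{BASE}/api/papers/1").json())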
@@ -1,10 +1,11 @@
 """Configuration management blueprint."""
-from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify
+from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify, current_app
 from ..db import db
 # Import the new model
-from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig
+from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig, PaperMetadata
 from ..defaults import MAX_VOLUME
 import os  # Import os for path validation
+from scipaperloader.scrapers import __path__ as scrapers_path

 bp = Blueprint("config", __name__, url_prefix="/config")

@@ -69,25 +70,31 @@ def _update_download_path(new_path):
         # Try to create it if it doesn't exist
         try:
             os.makedirs(new_path, exist_ok=True)
-            ActivityLog.log_system_activity(
+            ActivityLog.log_scraper_activity(
                 action="create_directory",
                 status="info",
                 description=f"Created download directory: {new_path}"
             )
         except OSError as e:
-            ActivityLog.log_system_activity(
-                action="create_directory",
-                status="error",
-                description=f"Failed to create download directory: {new_path}, Error: {str(e)}"
+            ActivityLog.log_error(
+                error_message=f"Failed to create download directory: {new_path}, Error: {str(e)}",
+                source="update_download_path"
             )
             return False, f"Path '{new_path}' is not a valid directory and could not be created: {e}", None

+    # Check if the path is readable
+    if not os.access(new_path, os.R_OK):
+        ActivityLog.log_error(
+            error_message=f"Download path '{new_path}' is not readable.",
+            source="check_directory_permissions"
+        )
+        return False, f"Path '{new_path}' exists but is not readable by the application.", None
+
     # Check if the path is writable
     if not os.access(new_path, os.W_OK):
-        ActivityLog.log_system_activity(
-            action="check_directory_permissions",
-            status="error",
-            description=f"Download path '{new_path}' is not writable."
+        ActivityLog.log_error(
+            error_message=f"Download path '{new_path}' is not writable.",
+            source="check_directory_permissions"
         )
         return False, f"Path '{new_path}' exists but is not writable by the application.", None
     # --- End of validation ---

@@ -281,6 +288,46 @@ def update_schedule():
     return redirect(url_for("config.schedule"))


+@bp.route("/update/scraper_module", methods=["POST"])
+def update_scraper_module():
+    """Update the scraper module configuration."""
+    from ..models import ScraperModuleConfig
+
+    new_scraper_module = request.form.get("scraper_module")
+    if not new_scraper_module:
+        flash("Scraper module cannot be empty.", "error")
+        return redirect(url_for("config.general"))
+
+    # Validate that the module exists and is valid
+    from scipaperloader.scrapers.factory import get_available_scrapers
+    available_modules = [m["name"] for m in get_available_scrapers()]
+
+    if new_scraper_module not in available_modules:
+        flash(f"Invalid scraper module: {new_scraper_module}", "error")
+        return redirect(url_for("config.general"))
+
+    # Update the database configuration
+    ScraperModuleConfig.set_module(new_scraper_module)
+    flash(f"Scraper module updated to '{new_scraper_module}'.", "success")
+    return redirect(url_for("config.general"))
+
+
+@bp.context_processor
+def inject_scraper_modules():
+    """Inject available scraper modules into the template context."""
+    from scipaperloader.scrapers.factory import get_available_scrapers
+    from ..models import ScraperModuleConfig
+
+    available_scrapers = get_available_scrapers()
+    current_module = ScraperModuleConfig.get_current_module()
+
+    return {
+        "available_scraper_modules": [s["name"] for s in available_scrapers],
+        "current_scraper_module": current_module,
+        "scraper_details": {s["name"]: s for s in available_scrapers}
+    }
+
+
 @bp.route("/api/schedule/stats")
 def schedule_stats():
     """Get statistics about the current schedule configuration."""

@@ -361,4 +408,36 @@ def api_update_config():
         return jsonify({
             "success": False,
             "message": f"Unexpected error: {str(e)}"
         })
+
+
+@bp.route("/delete_all_papers", methods=["POST"])
+def delete_all_papers():
+    """Delete all paper records from the database."""
+    try:
+        # Count papers before deletion for logging purposes
+        paper_count = PaperMetadata.query.count()
+
+        # Delete all records from the PaperMetadata table
+        PaperMetadata.query.delete()
+        db.session.commit()
+
+        # Log the action
+        ActivityLog.log_config_change(
+            config_key="database",
+            old_value=f"{paper_count} papers",
+            new_value="0 papers",
+            description=f"Deleted all {paper_count} papers from the database"
+        )
+
+        flash(f"Successfully deleted all {paper_count} papers from the database.", "success")
+    except Exception as e:
+        db.session.rollback()
+        flash(f"Failed to delete papers: {str(e)}", "error")
+        ActivityLog.log_error(
+            error_message=f"Failed to delete all papers: {str(e)}",
+            exception=e,
+            source="config.delete_all_papers"
+        )
+
+    return redirect(url_for("config.general"))
@@ -6,12 +6,13 @@ import os  # Import os for path joining
 from datetime import datetime, timedelta
 from flask import Blueprint, jsonify, render_template, request, current_app, flash
 # Import the new model
-from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory, ScheduleConfig, ScraperState, DownloadPathConfig
+from ..models import VolumeConfig, ActivityLog, PaperMetadata, ActivityCategory, ScheduleConfig, ScraperState, DownloadPathConfig, ScraperModuleConfig
 from ..db import db
 from ..celery import celery
 from ..defaults import MAX_VOLUME
 from celery.schedules import crontab
 from sqlalchemy import func
+from scipaperloader.scrapers.factory import get_scraper, get_available_scrapers

 bp = Blueprint("scraper", __name__, url_prefix="/scraper")

@@ -153,7 +154,7 @@ def stop_scraper():

     # Stop any running tasks
     task_types_to_revoke = [
-        'scipaperloader.blueprints.scraper.dummy_process_paper',
+        'scipaperloader.blueprints.scraper.process_paper',
         'scipaperloader.blueprints.scraper.dummy_scheduled_scraper',
         'scipaperloader.blueprints.scraper.run_periodic_dummy_scraper'
     ]

@@ -224,7 +225,7 @@ def pause_scraper():
     # Just revoke processing tasks, but leave the periodic tasks running
     # so it can continue to check the state (which is now paused)
     task_types_to_revoke = [
-        'scipaperloader.blueprints.scraper.dummy_process_paper',
+        'scipaperloader.blueprints.scraper.process_paper',
         'scipaperloader.blueprints.scraper.dummy_scheduled_scraper'
     ]

@@ -373,70 +374,7 @@ def update_config():
         return jsonify({"success": False, "message": f"Unexpected error: {str(e)}"})


-@celery.task(bind=True)
-def dummy_scrape_paper(self):
-    """Simulate scraping a single paper."""
-    # Simulate success or failure
-    success = random.random() > 0.3  # 70% success rate
-
-    # Simulate processing time
-    import time
-    time.sleep(random.randint(2, 5))  # 2-5 seconds
-
-    if success:
-        # Create a dummy paper
-        new_paper = PaperMetadata(
-            title=f"Dummy Paper {random.randint(1000, 9999)}",
-            doi=f"10.1234/dummy.{random.randint(1000, 9999)}",
-            journal=random.choice([
-                "Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
-                "Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
-            ]),
-            type="article",
-            language="en",
-            published_online=datetime.now().date(),
-            status="Done",
-            file_path="/path/to/dummy/paper.pdf"
-        )
-
-        db.session.add(new_paper)
-        db.session.commit()
-
-        # Log the successful scrape
-        ActivityLog.log_scraper_activity(
-            action="scrape_paper",
-            paper_id=new_paper.id,
-            status="success",
-            description=f"Successfully scraped paper {new_paper.doi}"
-        )
-
-        return {
-            "success": True,
-            "paper_id": new_paper.id,
-            "title": new_paper.title,
-            "doi": new_paper.doi
-        }
-    else:
-        # Log the failed scrape
-        error_message = random.choice([
-            "Connection timeout",
-            "404 Not Found",
-            "Access denied",
-            "Invalid DOI format",
-            "PDF download failed",
-            "Rate limited by publisher"
-        ])
-
-        ActivityLog.log_scraper_activity(
-            action="scrape_paper",
-            status="error",
-            description=f"Failed to scrape paper: {error_message}"
-        )
-
-        return {
-            "success": False,
-            "error": error_message
-        }
-
-
 @celery.task

@@ -545,11 +483,11 @@ def dummy_scheduled_scraper():
         )

         # --- Now schedule processing for the newly selected "Pending" papers ---
-        # (Assuming dummy_process_paper takes a paper_id)
+        # (Using the new modular process_paper task)
         # Add random delays for processing within the hour (e.g., up to 3600 seconds)
         for paper_id in selected_paper_ids:
             delay = random.uniform(1, 3500)  # Random delay up to ~58 minutes
-            dummy_process_paper.apply_async(args=[paper_id], countdown=delay)
+            process_paper.apply_async(args=[paper_id], countdown=delay)

         ActivityLog.log_scraper_activity(
             action="schedule_processing",

@@ -568,109 +506,6 @@ def dummy_scheduled_scraper():
         return False


-@celery.task(bind=True)
-def dummy_process_paper(self, paper_id):
-    """
-    Process a single paper for the dummy scraper.
-
-    Args:
-        paper_id (int): ID of the paper to process
-    """
-    # First check if the scraper is still active and not paused
-    scraper_state = ScraperState.get_current_state()
-    if not scraper_state.is_active or scraper_state.is_paused:
-        # Log that task was skipped due to scraper being stopped or paused
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            status="info",
-            description=f"Skipped processing paper ID {paper_id} because scraper is {'paused' if scraper_state.is_paused else 'stopped'}"
-        )
-        return False
-
-    # Get the paper from database
-    paper = PaperMetadata.query.get(paper_id)
-    if not paper:
-        # Log error if paper not found
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            status="error",
-            description=f"Paper with ID {paper_id} not found"
-        )
-        return False
-
-    # Simulate random success/failure (70% success rate)
-    success = random.random() < 0.7
-
-    # Simulate processing time (1-5 seconds)
-    process_time = random.uniform(1, 5)
-    time.sleep(process_time)
-
-    # Check again if scraper is still active and not paused after the time delay
-    # This ensures we don't process papers if the scraper was stopped during the delay
-    scraper_state = ScraperState.get_current_state()
-    if not scraper_state.is_active or scraper_state.is_paused:
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            status="info",
-            description=f"Cancelled processing paper ID {paper_id} because scraper is {'paused' if scraper_state.is_paused else 'stopped'}"
-        )
-        return False
-
-    if success:
-        # --- Get configured download path ---
-        download_base_path = DownloadPathConfig.get_path()
-        # Ensure the base path exists (optional, but good practice)
-        # os.makedirs(download_base_path, exist_ok=True)
-
-        # --- Construct the file path ---
-        # Sanitize DOI for use in filename
-        safe_doi = paper.doi.replace('/', '_').replace(':', '_')
-        filename = f"{safe_doi}.pdf"
-        full_path = os.path.join(download_base_path, filename)
-
-        # Update paper status to "Done" and set the file path
-        paper.status = "Done"
-        paper.file_path = full_path  # Use the constructed path
-
-        # Log success
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            paper_id=paper.id,
-            status="success",
-            description=f"Successfully processed paper: {paper.doi}. File at: {full_path}"  # Log path
-        )
-    else:
-        # Update paper status to "Failed"
-        paper.status = "Failed"
-
-        # Generate random error message
-        error_message = random.choice([
-            "Publisher website unavailable",
-            "No PDF download link found",
-            "Access restricted",
-            "Download timeout",
-            "Invalid DOI",
-            "Rate limited by publisher"
-        ])
-        paper.error_msg = error_message
-
-        # Log failure
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            paper_id=paper.id,
-            status="error",
-            description=f"Failed to process paper: {error_message}"
-        )
-
-    # Update the timestamp
-    paper.updated_at = datetime.utcnow()
-
-    # Commit changes to database
-    db.session.commit()
-
-    return success
-
-
 @celery.task(bind=True)
 def process_paper_batch(self, paper_ids):
     """

@@ -914,3 +749,168 @@ def calculate_papers_for_current_hour():
     )

     return papers_this_hour
+
+
+@celery.task(bind=True)
+def process_paper(self, paper_id):
+    """Process a paper using the configured scraper."""
+    from scipaperloader.models import PaperMetadata
+    paper = PaperMetadata.query.get(paper_id)
+    if not paper:
+        return {"status": "error", "message": f"Paper with ID {paper_id} not found"}
+
+    scraper = get_scraper()
+    result = scraper.scrape(paper.doi)
+
+    return {
+        "paper_id": paper_id,
+        "status": result.status,
+        "message": result.message
+    }
+
+
+@celery.task(bind=True)
+def process_paper_with_scraper(self, paper_id, scraper_module):
+    """Process a paper using a specific scraper module."""
+    from scipaperloader.models import PaperMetadata
+    import importlib
+    from ..scrapers.base import BaseScraper
+
+    paper = PaperMetadata.query.get(paper_id)
+    if not paper:
+        return {"status": "error", "message": f"Paper with ID {paper_id} not found"}
+
+    try:
+        # Import the specified scraper module
+        module = importlib.import_module(f"scipaperloader.scrapers.{scraper_module}")
+        cls = getattr(module, "Scraper")
+
+        # Validate that it's a BaseScraper
+        if not issubclass(cls, BaseScraper):
+            error_msg = f"Scraper class in module '{scraper_module}' does not inherit from BaseScraper"
+            ActivityLog.log_error(
+                error_message=error_msg,
+                source="process_paper_with_scraper"
+            )
+            return {"status": "error", "message": error_msg}
+
+        # Instantiate and use the scraper
+        scraper = cls()
+        result = scraper.scrape(paper.doi)
+
+        return {
+            "paper_id": paper_id,
+            "status": result.status,
+            "message": result.message,
+            "scraper": scraper_module
+        }
+
+    except (ImportError, AttributeError) as e:
+        error_msg = f"Failed to load scraper module '{scraper_module}': {str(e)}"
+        ActivityLog.log_error(
+            error_message=error_msg,
+            source="process_paper_with_scraper"
+        )
+        return {"status": "error", "message": error_msg}
+    except Exception as e:
+        error_msg = f"Error processing paper with scraper '{scraper_module}': {str(e)}"
+        ActivityLog.log_error(
+            error_message=error_msg,
+            source="process_paper_with_scraper",
+            exception=e
+        )
+        return {"status": "error", "message": error_msg}
+
+
+@bp.route("/process_single/<int:paper_id>", methods=["POST"])
+def process_single_paper(paper_id):
+    """Process a single paper by ID."""
+    try:
+        # Check if paper exists
+        paper = PaperMetadata.query.get(paper_id)
+        if not paper:
+            return jsonify({
+                "success": False,
+                "message": f"Paper with ID {paper_id} not found"
+            })
+
+        # Get the scraper module name from the request
+        scraper_module = None
+        if request.is_json and request.json:
+            scraper_module = request.json.get('scraper_module')
+
+        # Update status to Pending
+        old_status = paper.status
+        paper.status = "Pending"
+        paper.updated_at = datetime.utcnow()
+        db.session.commit()
+
+        # Log that we're processing this paper
+        ActivityLog.log_scraper_activity(
+            action="manual_process_paper",
+            paper_id=paper_id,
+            status="pending",
+            description=f"Manual processing initiated for paper: {paper.title}" +
+                        (f" using {scraper_module} scraper" if scraper_module else "")
+        )
+
+        # Start the task (without delay since it's manual)
+        if scraper_module:
+            task = process_paper_with_scraper.delay(paper_id, scraper_module)
+        else:
+            task = process_paper.delay(paper_id)
+
+        return jsonify({
+            "success": True,
+            "task_id": task.id,
+            "message": f"Processing paper '{paper.title}' (ID: {paper_id})" +
+                       (f" using {scraper_module} scraper" if scraper_module else "") +
+                       f". Previous status: {old_status}"
+        })
+
+    except Exception as e:
+        db.session.rollback()
+        ActivityLog.log_error(
+            error_message=f"Failed to process paper {paper_id}: {str(e)}",
+            exception=e,
+            source="process_single_paper"
+        )
+        return jsonify({
+            "success": False,
+            "message": f"Error: {str(e)}"
+        })
+
+
+@bp.route("/available_scrapers")
+def available_scrapers():
+    """Get list of available scraper modules."""
+    from scipaperloader.scrapers.factory import get_available_scrapers
+    from ..models import ScraperModuleConfig
+
+    try:
+        scrapers = get_available_scrapers()
+        current_module = ScraperModuleConfig.get_current_module()
+
+        return jsonify({
+            "success": True,
+            "scrapers": [
+                {
+                    "name": s["name"],
+                    "description": s["description"],
+                    "is_current": s["name"] == current_module
+                } for s in scrapers
+            ],
+            "current": current_module
+        })
+
+    except Exception as e:
+        ActivityLog.log_error(
+            error_message=f"Failed to get available scrapers: {str(e)}",
+            source="available_scrapers"
+        )
+        return jsonify({
+            "success": False,
+            "message": f"Error: {str(e)}",
+            "scrapers": []
+        })
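Not part of the diff: a rough sketch of driving the new manual-processing route from Python. The base URL, the paper ID 42, and the "dummy" module name are assumptions for illustration; the response keys match the route above.

import requests

BASE = "http://localhost:5000"  # assumed development host/port

# Queue one paper for immediate processing with an explicitly chosen scraper module;
# omit "scraper_module" (or send an empty value) to use the configured default.
resp = requests.post(f"{BASE}/scraper/process_single/42", json={"scraper_module": "dummy"})
print(resp.json())  # keys per the route above: success, task_id, message

# The scraper dropdown in the UI is filled from this endpoint:
print(requests.get(f"{BASE}/scraper/available_scrapers").json())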
@@ -6,3 +6,4 @@ class Config:
     SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///papers.db")
     SQLALCHEMY_TRACK_MODIFICATIONS = False
     APP_TITLE = os.environ.get("APP_TITLE", "SciPaperLoader")
+    SCRAPER_MODULE = os.environ.get("SCRAPER_MODULE", "dummy")
@@ -277,6 +277,40 @@ class ScraperState(db.Model):
         return state.is_active and not state.is_paused


+class ScraperModuleConfig(db.Model):
+    """Model to store the configured scraper module."""
+    id = db.Column(db.Integer, primary_key=True)
+    module_name = db.Column(db.String(100), default="dummy")
+
+    @classmethod
+    def get_current_module(cls):
+        """Get the currently configured scraper module."""
+        config = cls.query.first()
+        if not config:
+            config = cls(module_name="dummy")
+            db.session.add(config)
+            db.session.commit()
+        return config.module_name
+
+    @classmethod
+    def set_module(cls, module_name):
+        """Set the scraper module."""
+        config = cls.query.first()
+        if not config:
+            config = cls(module_name=module_name)
+            db.session.add(config)
+        else:
+            old_value = config.module_name
+            config.module_name = module_name
+            ActivityLog.log_config_change(
+                config_key="scraper_module",
+                old_value=old_value,
+                new_value=module_name,
+                description="Updated scraper module configuration"
+            )
+        db.session.commit()
+        return config
+
+
 def init_schedule_config():
     """Initialize ScheduleConfig with default values if empty"""
     if ScheduleConfig.query.count() == 0:
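Not part of the diff: a rough usage sketch of the new model, e.g. from a flask shell session where an application context is already active.

from scipaperloader.models import ScraperModuleConfig

print(ScraperModuleConfig.get_current_module())  # "dummy" on first use; the row is created on demand
ScraperModuleConfig.set_module("dummy")          # replacing an existing value also logs a config change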
scipaperloader/scrapers/__init__.py (new file, 2 lines)

# This package contains all scraper modules.
# Each scraper should implement the BaseScraper interface from base.py.
scipaperloader/scrapers/base.py (new file, 34 lines)

from abc import ABC, abstractmethod
from typing import NamedTuple, Optional, Dict
from datetime import datetime


class ScrapeResult(NamedTuple):
    status: str                            # "success", "error", "skipped"
    message: str                           # human-readable status
    data: Optional[Dict]                   # any extra payload (file_path, metadata, etc.)
    duration: Optional[float] = None       # processing time in seconds
    timestamp: Optional[datetime] = None   # when the operation completed


class BaseScraper(ABC):
    """Base class for all scraper implementations."""

    @abstractmethod
    def scrape(self, doi: str) -> ScrapeResult:
        """
        Fetch metadata and/or download paper for the given DOI.

        Args:
            doi: The DOI of the paper to scrape

        Returns:
            ScrapeResult with status, message, and optional data
        """
        pass

    def get_name(self) -> str:
        """Return the name of this scraper."""
        return self.__class__.__name__

    def get_description(self) -> str:
        """Return a description of this scraper."""
        return getattr(self.__class__, "__doc__", "No description available")
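Not part of the diff: a minimal sketch of what a third-party scraper module satisfying this interface could look like. The class must be named Scraper so the factory below can discover it; the no-op behaviour here is purely illustrative.

from datetime import datetime

from scipaperloader.scrapers.base import BaseScraper, ScrapeResult


class Scraper(BaseScraper):
    """Example no-op scraper: records the DOI without downloading anything."""

    def scrape(self, doi: str) -> ScrapeResult:
        # A real implementation would fetch metadata and/or the PDF here.
        return ScrapeResult(
            status="skipped",
            message=f"No-op scraper invoked for {doi}",
            data=None,
            duration=0.0,
            timestamp=datetime.utcnow(),
        )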
scipaperloader/scrapers/dummy.py (new file, 191 lines)

import time
import random
import os
from datetime import datetime
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db


class Scraper(BaseScraper):
    """Dummy scraper for testing purposes that simulates paper downloading."""

    def scrape(self, doi: str) -> ScrapeResult:
        """Simulate scraping a paper with realistic timing and random success/failure."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Simulate processing time (1-3 seconds)
        processing_time = random.uniform(1, 3)
        time.sleep(processing_time)

        # Simulate 80% success rate
        success = random.random() < 0.8

        if success:
            # Get download path and create an actual dummy file
            download_path = DownloadPathConfig.get_path()
            file_name = f"{doi.replace('/', '_')}.pdf"
            file_path = f"{download_path}/{file_name}"

            # Check if the path is readable and writable
            if not os.path.exists(download_path):
                try:
                    # Create directory if it doesn't exist
                    os.makedirs(download_path, exist_ok=True)
                except OSError as e:
                    error_msg = f"Failed to create download directory: {str(e)}"
                    paper.status = "Failed"
                    paper.error_msg = error_msg

                    ActivityLog.log_scraper_activity(
                        action="dummy_scrape_path_error",
                        status="error",
                        description=error_msg,
                        paper_id=paper.id
                    )

                    return ScrapeResult(
                        status="error",
                        message=error_msg,
                        data={"error_code": "path_creation_error"},
                        duration=time.time() - start_time,
                        timestamp=datetime.utcnow()
                    )

            # Check if the path is readable
            if not os.access(download_path, os.R_OK):
                error_msg = f"Download path '{download_path}' is not readable"
                paper.status = "Failed"
                paper.error_msg = error_msg

                ActivityLog.log_scraper_activity(
                    action="dummy_scrape_path_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id
                )

                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "path_read_error"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Check if the path is writable
            if not os.access(download_path, os.W_OK):
                error_msg = f"Download path '{download_path}' is not writable"
                paper.status = "Failed"
                paper.error_msg = error_msg

                ActivityLog.log_scraper_activity(
                    action="dummy_scrape_path_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id
                )

                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "path_write_error"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Create a simple dummy PDF file
            try:
                with open(file_path, 'w') as f:
                    f.write(f"Dummy PDF file for paper with DOI: {doi}\n")
                    f.write(f"Title: {paper.title}\n")
                    f.write(f"Journal: {paper.journal}\n")
                    f.write(f"Generated: {datetime.utcnow().isoformat()}\n")
                    f.write("\nThis is a dummy file created by the SciPaperLoader dummy scraper.\n")

                # Update paper status
                paper.status = "Done"
                paper.file_path = file_path
                paper.error_msg = None
            except Exception as e:
                # Handle file creation errors
                error_msg = f"Failed to create dummy file: {str(e)}"
                paper.status = "Failed"
                paper.error_msg = error_msg

                ActivityLog.log_scraper_activity(
                    action="dummy_scrape_file_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id
                )

                return ScrapeResult(
                    status="error",
                    message=error_msg,
                    data={"error_code": "file_creation_error"},
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            # Log success
            ActivityLog.log_scraper_activity(
                action="dummy_scrape",
                status="success",
                description=f"Successfully scraped {doi}",
                paper_id=paper.id
            )

            result = ScrapeResult(
                status="success",
                message=f"Successfully scraped {doi}",
                data={
                    "file_path": file_path,
                    "title": paper.title,
                    "journal": paper.journal
                },
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )
        else:
            # Simulate failure
            error_messages = [
                "Paper not found in database",
                "Access denied by publisher",
                "Rate limit exceeded",
                "Network timeout",
                "Invalid DOI format"
            ]
            error_msg = random.choice(error_messages)

            paper.status = "Failed"
            paper.error_msg = error_msg

            # Log failure
            ActivityLog.log_scraper_activity(
                action="dummy_scrape",
                status="error",
                description=f"Failed to scrape {doi}: {error_msg}",
                paper_id=paper.id
            )

            result = ScrapeResult(
                status="error",
                message=f"Failed to scrape {doi}: {error_msg}",
                data={"error_code": "dummy_error"},
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        db.session.commit()
        return result
scipaperloader/scrapers/factory.py (new file, 59 lines)

import importlib
from flask import current_app
from .base import BaseScraper


def get_scraper() -> BaseScraper:
    """Load the configured scraper module dynamically with error handling."""
    from ..models import ScraperModuleConfig, ActivityLog

    try:
        # Get module name from database first, fallback to config
        name = ScraperModuleConfig.get_current_module()
        if not name:
            name = current_app.config.get("SCRAPER_MODULE", "dummy")

        module = importlib.import_module(f"scipaperloader.scrapers.{name}")
        cls = getattr(module, "Scraper")

        # Validate that it's actually a BaseScraper
        if not issubclass(cls, BaseScraper):
            raise TypeError(f"Scraper class in module '{name}' does not inherit from BaseScraper")

        return cls()

    except (ImportError, AttributeError, TypeError) as e:
        ActivityLog.log_error(
            error_message=f"Failed to load scraper module '{name}': {str(e)}",
            source="scraper_factory",
            severity="error"
        )
        # Fallback to dummy scraper
        from .dummy import Scraper as DummyScraper
        return DummyScraper()


def get_available_scrapers():
    """Get list of available scraper modules."""
    import os
    from scipaperloader.scrapers import __path__ as scrapers_path

    modules = []
    scrapers_dir = scrapers_path[0]

    for filename in os.listdir(scrapers_dir):
        if filename.endswith(".py") and filename not in ("__init__.py", "base.py", "factory.py"):
            module_name = filename[:-3]
            try:
                # Try to import and validate the module
                module = importlib.import_module(f"scipaperloader.scrapers.{module_name}")
                cls = getattr(module, "Scraper", None)
                if cls and issubclass(cls, BaseScraper):
                    modules.append({
                        "name": module_name,
                        "class": cls,
                        "description": getattr(cls, "__doc__", "No description available")
                    })
            except (ImportError, AttributeError, TypeError):
                # Skip invalid modules
                pass

    return modules
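Not part of the diff: a rough sketch of how the factory is consumed (compare the new process_paper task above), e.g. from a flask shell session with an active application context; the DOI is made up.

from scipaperloader.scrapers.factory import get_scraper, get_available_scrapers

print([s["name"] for s in get_available_scrapers()])  # e.g. ['dummy']

scraper = get_scraper()  # resolves ScraperModuleConfig, falls back to the dummy scraper on error
result = scraper.scrape("10.1234/dummy.0001")  # made-up DOI; the dummy scraper expects a matching PaperMetadata row
print(result.status, result.message)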
@@ -9,52 +9,112 @@
 <!-- include flash messages template -->
 {% include "partials/flash_messages.html.jinja" %}

-<form action="{{ url_for('config.update_general') }}" method="post">
-    <div class="form-section">
-        <h6>Scraper Volume</h6>
-        <p class="text-muted">Configure the total number of papers to scrape per day.</p>
+<div class="row">
+    <!-- General Settings Column -->
+    <div class="col-md-6">
+        <form action="{{ url_for('config.update_general') }}" method="post">
+            <div class="form-section">
+                <h6>Scraper Volume</h6>
+                <p class="text-muted">Configure the total number of papers to scrape per day.</p>
+
         <div class="mb-3">
             <label for="totalVolume" class="form-label">Papers per day:</label>
-            <input type="number" class="form-control" id="totalVolume" name="total_volume" min="1"
-                max="{{ max_volume }}" value="{{ volume_config.volume }}" required>
+            <input type="number" class="form-control" id="totalVolume" name="total_volume"
+                min="1" max="{{ max_volume }}" value="{{ volume_config.volume }}" required>
             <div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
         </div>
+            </div>
+
+            <div class="form-section">
+                <h6>Download Path</h6>
+                <p class="text-muted">Base directory where scraped paper files will be stored.</p>
+                <div class="mb-3">
+                    <label for="downloadPath" class="form-label">Download Directory:</label>
+                    <input type="text" class="form-control" id="downloadPath" name="download_path"
+                        value="{{ download_path_config.path }}" required>
+                    <div class="form-text">Enter the full path to the download directory (e.g.,
+                        /data/papers).
+                        Ensure the directory exists and the application has write permissions.</div>
+                </div>
+            </div>
+
+            <div class="form-section">
+                <h6>System Settings</h6>
+                <p class="text-muted">Configure general system behavior.</p>
+
+                <div class="mb-3 form-check">
+                    <input type="checkbox" class="form-check-input" id="enableNotifications" checked>
+                    <label class="form-check-label" for="enableNotifications">
+                        Enable email notifications
+                    </label>
+                </div>
+
+                <div class="mb-3 form-check">
+                    <input type="checkbox" class="form-check-input" id="enableLogging" checked>
+                    <label class="form-check-label" for="enableLogging">
+                        Enable detailed activity logging
+                    </label>
+                </div>
+            </div>
+
+            <button type="submit" class="btn btn-primary">Save General Settings</button>
+        </form>
     </div>

-    <div class="form-section">
-        <h6>Download Path</h6>
-        <p class="text-muted">Base directory where scraped paper files will be stored.</p>
-        <div class="mb-3">
-            <label for="downloadPath" class="form-label">Download Directory:</label>
-            <input type="text" class="form-control" id="downloadPath" name="download_path"
-                value="{{ download_path_config.path }}" required>
-            <div class="form-text">Enter the full path to the download directory (e.g., /data/papers).
-                Ensure the directory exists and the application has write permissions.</div>
-        </div>
-    </div>
-
-    <div class="form-section">
-        <h6>System Settings</h6>
-        <p class="text-muted">Configure general system behavior.</p>
-
-        <div class="mb-3 form-check">
-            <input type="checkbox" class="form-check-input" id="enableNotifications" checked>
-            <label class="form-check-label" for="enableNotifications">
-                Enable email notifications
-            </label>
-        </div>
-
-        <div class="mb-3 form-check">
-            <input type="checkbox" class="form-check-input" id="enableLogging" checked>
-            <label class="form-check-label" for="enableLogging">
-                Enable detailed activity logging
-            </label>
-        </div>
-    </div>
-
-    <button type="submit" class="btn btn-primary">Save General Settings</button>
-</form>
+    <!-- Scraper Module Column -->
+    <div class="col-md-6">
+        <form method="post" action="{{ url_for('config.update_scraper_module') }}">
+            <div class="form-section">
+                <h6>Scraper Module</h6>
+                <p class="text-muted">Select which scraper module to use for processing papers.</p>
+
+                <div class="mb-3">
+                    <label for="scraper_module" class="form-label">Active Scraper Module:</label>
+                    <select class="form-control" id="scraper_module" name="scraper_module">
+                        {% for module in available_scraper_modules %}
+                        <option value="{{ module }}" {% if module==current_scraper_module %} selected
+                            {%endif %}>
+                            {{ module }}
+                            {% if scraper_details[module] %}
+                            - {{ scraper_details[module].description[:50] }}...
+                            {% endif %}
+                        </option>
+                        {% endfor %}
+                    </select>
+                    <div class="form-text">
+                        Current module: <strong>{{ current_scraper_module }}</strong>
+                    </div>
+                </div>
+            </div>
+            <button type="submit" class="btn btn-primary">Update Scraper Module</button>
+        </form>
+    </div>
+</div>
+
+<!-- Database Management Section -->
+<div class="row mt-4">
+    <div class="col-12">
+        <div class="card border-danger">
+            <div class="card-header bg-danger text-white">
+                <h5>Database Management</h5>
+            </div>
+            <div class="card-body">
+                <div class="form-section">
+                    <h6>Delete All Papers</h6>
+                    <p class="text-muted">This action will permanently delete all paper records from the
+                        database. This cannot be undone.</p>
+
+                    <form method="post" action="{{ url_for('config.delete_all_papers') }}" class="mt-3"
+                        onsubmit="return confirm('WARNING: You are about to delete ALL papers from the database. This action cannot be undone. Are you sure you want to proceed?');">
+                        <button type="submit" class="btn btn-danger">
+                            <i class="fas fa-trash-alt"></i> Delete All Papers
+                        </button>
+                    </form>
+                </div>
+            </div>
+        </div>
+    </div>
+</div>
@ -36,6 +36,28 @@
|
|||||||
max-width: 350px;
|
max-width: 350px;
|
||||||
z-index: 1050;
|
z-index: 1050;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.search-results-container {
|
||||||
|
max-height: 300px;
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Paper status badges */
|
||||||
|
.badge-new {
|
||||||
|
background-color: #17a2b8;
|
||||||
|
}
|
||||||
|
|
||||||
|
.badge-pending {
|
||||||
|
background-color: #ffc107;
|
||||||
|
}
|
||||||
|
|
||||||
|
.badge-done {
|
||||||
|
background-color: #28a745;
|
||||||
|
}
|
||||||
|
|
||||||
|
.badge-failed {
|
||||||
|
background-color: #dc3545;
|
||||||
|
}
|
||||||
</style>
|
</style>
|
||||||
{% endblock styles %}
|
{% endblock styles %}
|
||||||
|
|
||||||
@ -89,6 +111,61 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- New row for single paper processing -->
|
||||||
|
<div class="row mb-4">
|
||||||
|
<div class="col-12">
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-header">
|
||||||
|
<h5>Process Single Paper</h5>
|
||||||
|
</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<div class="row">
|
||||||
|
<div class="col-md-6">
|
||||||
|
<form id="searchPaperForm" class="mb-3">
|
||||||
|
<div class="input-group">
|
||||||
|
<input type="text" id="paperSearchInput" class="form-control"
|
||||||
|
placeholder="Search paper by title, DOI, or ID...">
|
||||||
|
<button class="btn btn-outline-secondary" type="submit">Search</button>
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
<div class="col-md-6">
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="scraperSelect">Scraper Module:</label>
|
||||||
|
<select class="form-control" id="scraperSelect">
|
||||||
|
<option value="">Use default system scraper</option>
|
||||||
|
<!-- Available scrapers will be populated here -->
|
||||||
|
</select>
|
||||||
|
<div class="form-text">
|
||||||
|
Select which scraper to use for processing the paper
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="searchResults" class="mt-3 search-results-container d-none">
|
||||||
|
<table class="table table-hover table-striped">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>ID</th>
|
||||||
|
<th>Title</th>
|
||||||
|
<th>DOI</th>
|
||||||
|
<th>Status</th>
|
||||||
|
<th>Actions</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody id="paperSearchResults">
|
||||||
|
<!-- Search results will be populated here -->
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div id="processingStatus" class="alert alert-info mt-3 d-none"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="row mb-4">
|
<div class="row mb-4">
|
||||||
<div class="col-12">
|
<div class="col-12">
|
||||||
<div class="card">
|
<div class="card">
|
||||||
@ -164,12 +241,19 @@
|
|||||||
const resetButton = document.getElementById('resetButton');
|
const resetButton = document.getElementById('resetButton');
|
||||||
const notificationsToggle = document.getElementById('notificationsToggle');
|
const notificationsToggle = document.getElementById('notificationsToggle');
|
||||||
const activityLog = document.getElementById('activityLog');
|
const activityLog = document.getElementById('activityLog');
|
||||||
|
const searchForm = document.getElementById('searchPaperForm');
|
||||||
|
const searchInput = document.getElementById('paperSearchInput');
|
||||||
|
const searchResults = document.getElementById('searchResults');
|
||||||
|
const processingStatus = document.getElementById('processingStatus');
|
||||||
|
const paperSearchResults = document.getElementById('paperSearchResults');
|
||||||
|
const scraperSelect = document.getElementById('scraperSelect');
|
||||||
|
|
||||||
// Initialize the page
|
// Initialize the page
|
||||||
document.addEventListener('DOMContentLoaded', function () {
|
document.addEventListener('DOMContentLoaded', function () {
|
||||||
initStatusPolling();
|
initStatusPolling();
|
||||||
loadActivityStats(currentTimeRange);
|
loadActivityStats(currentTimeRange);
|
||||||
loadRecentActivity();
|
loadRecentActivity();
|
||||||
|
loadAvailableScrapers();
|
||||||
|
|
||||||
// Initialize event listeners
|
// Initialize event listeners
|
||||||
startButton.addEventListener('click', startScraper);
|
startButton.addEventListener('click', startScraper);
|
||||||
@ -177,6 +261,10 @@
|
|||||||
stopButton.addEventListener('click', stopScraper);
|
stopButton.addEventListener('click', stopScraper);
|
||||||
resetButton.addEventListener('click', resetScraper);
|
resetButton.addEventListener('click', resetScraper);
|
||||||
notificationsToggle.addEventListener('click', toggleNotifications);
|
notificationsToggle.addEventListener('click', toggleNotifications);
|
||||||
|
searchForm.addEventListener('submit', function (e) {
|
||||||
|
e.preventDefault();
|
||||||
|
searchPapers();
|
||||||
|
});
|
||||||
|
|
||||||
document.getElementById('volumeForm').addEventListener('submit', function (e) {
|
document.getElementById('volumeForm').addEventListener('submit', function (e) {
|
||||||
e.preventDefault();
|
e.preventDefault();
|
||||||
@ -193,6 +281,185 @@
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Load available scraper modules
|
||||||
|
function loadAvailableScrapers() {
|
||||||
|
fetch('/scraper/available_scrapers')
|
||||||
|
.then(response => response.json())
|
||||||
|
.then(data => {
|
||||||
|
if (data.success && data.scrapers && data.scrapers.length > 0) {
|
||||||
|
// Clear previous options except the default one
|
||||||
|
while (scraperSelect.options.length > 1) {
|
||||||
|
scraperSelect.remove(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add each scraper as an option
|
||||||
|
data.scrapers.forEach(scraper => {
|
||||||
|
const option = document.createElement('option');
|
||||||
|
option.value = scraper.name;
|
||||||
|
option.textContent = `${scraper.name} - ${scraper.description.substring(0, 50)}${scraper.description.length > 50 ? '...' : ''}`;
|
||||||
|
if (scraper.is_current) {
|
||||||
|
option.textContent += ' (system default)';
|
||||||
|
}
|
||||||
|
scraperSelect.appendChild(option);
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
// If no scrapers or error, add a note
|
||||||
|
const option = document.createElement('option');
|
||||||
|
option.disabled = true;
|
||||||
|
option.textContent = 'No scrapers available';
|
||||||
|
scraperSelect.appendChild(option);
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.catch(error => {
|
||||||
|
console.error('Error loading scrapers:', error);
|
||||||
|
const option = document.createElement('option');
|
||||||
|
option.disabled = true;
|
||||||
|
option.textContent = 'Error loading scrapers';
|
||||||
|
scraperSelect.appendChild(option);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|

// Search papers function
function searchPapers() {
    const query = searchInput.value.trim();

    if (!query) {
        showFlashMessage('Please enter a search term', 'warning');
        return;
    }

    // Show loading message
    paperSearchResults.innerHTML = '<tr><td colspan="5" class="text-center">Searching papers...</td></tr>';
    searchResults.classList.remove('d-none');

    // Fetch papers from API
    fetch(`/api/papers?query=${encodeURIComponent(query)}`)
        .then(response => response.json())
        .then(data => {
            if (!data.papers || data.papers.length === 0) {
                paperSearchResults.innerHTML = '<tr><td colspan="5" class="text-center">No papers found matching your search</td></tr>';
                return;
            }

            paperSearchResults.innerHTML = '';

            data.papers.forEach(paper => {
                const row = document.createElement('tr');

                // Create status badge
                let statusBadge = '';
                if (paper.status === 'New') {
                    statusBadge = '<span class="badge bg-info">New</span>';
                } else if (paper.status === 'Pending') {
                    statusBadge = '<span class="badge bg-warning text-dark">Pending</span>';
                } else if (paper.status === 'Done') {
                    statusBadge = '<span class="badge bg-success">Done</span>';
                } else if (paper.status === 'Failed') {
                    statusBadge = '<span class="badge bg-danger">Failed</span>';
                } else {
                    statusBadge = `<span class="badge bg-secondary">${paper.status}</span>`;
                }

                // Create process button (enabled only for papers not in 'Pending' status)
                const processButtonDisabled = paper.status === 'Pending' ? 'disabled' : '';

                // Truncate title if too long
                const truncatedTitle = paper.title.length > 70 ? paper.title.substring(0, 70) + '...' : paper.title;

                row.innerHTML = `
                    <td>${paper.id}</td>
                    <td title="${paper.title}">${truncatedTitle}</td>
                    <td>${paper.doi || 'N/A'}</td>
                    <td>${statusBadge}</td>
                    <td>
                        <button class="btn btn-sm btn-primary process-paper-btn"
                            data-paper-id="${paper.id}"
                            ${processButtonDisabled}>
                            Process Now
                        </button>
                    </td>
                `;

                paperSearchResults.appendChild(row);
            });

            // Add event listeners to the process buttons
            document.querySelectorAll('.process-paper-btn').forEach(btn => {
                btn.addEventListener('click', function () {
                    processSinglePaper(this.getAttribute('data-paper-id'));
                });
            });
        })
        .catch(error => {
            console.error('Error searching papers:', error);
            paperSearchResults.innerHTML = '<tr><td colspan="5" class="text-center">Error searching papers</td></tr>';
        });
}
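// Usage sketch (assumed wiring; the listener itself is outside this hunk): a search form
// submit handler would call searchPapers(), for example:
//   searchForm.addEventListener('submit', (event) => {
//       event.preventDefault();
//       searchPapers();
//   });
// where `searchForm` is a hypothetical form element wrapping `searchInput`.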

// Process a single paper
function processSinglePaper(paperId) {
    // Disable all process buttons to prevent multiple clicks
    document.querySelectorAll('.process-paper-btn').forEach(btn => {
        btn.disabled = true;
    });

    // Show processing status
    processingStatus.textContent = 'Processing paper...';
    processingStatus.classList.remove('d-none');

    // Get selected scraper
    const selectedScraper = scraperSelect.value;

    // Send request to process the paper
    fetch(`/scraper/process_single/${paperId}`, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify({
            scraper_module: selectedScraper
        })
    })
        .then(response => response.json())
        .then(data => {
            if (data.success) {
                processingStatus.textContent = data.message;
                processingStatus.className = 'alert alert-success mt-3';

                // Update status in the search results
                const row = document.querySelector(`.process-paper-btn[data-paper-id="${paperId}"]`).closest('tr');
                const statusCell = row.querySelector('td:nth-child(4)');
                statusCell.innerHTML = '<span class="badge bg-warning text-dark">Pending</span>';

                // Show notification
                showFlashMessage(data.message, 'success');

                // Set up polling to check paper status and refresh activity
                pollPaperStatus(paperId, 3000, 20);
            } else {
                processingStatus.textContent = data.message;
                processingStatus.className = 'alert alert-danger mt-3';
                showFlashMessage(data.message, 'error');
            }
        })
        .catch(error => {
            console.error('Error processing paper:', error);
            processingStatus.textContent = 'Error: Could not process paper';
            processingStatus.className = 'alert alert-danger mt-3';
            showFlashMessage('Error processing paper', 'error');
        })
        .finally(() => {
            // Re-enable the process buttons after a short delay
            setTimeout(() => {
                document.querySelectorAll('.process-paper-btn').forEach(btn => {
                    if (btn.getAttribute('data-paper-id') !== paperId) {
                        btn.disabled = false;
                    }
                });
            }, 1000);
        });
}
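// Expected response shape for POST /scraper/process_single/<paperId>, inferred from the
// handler above (only `success` and `message` are read from the JSON body):
//   { "success": true,  "message": "..." }  on acceptance
//   { "success": false, "message": "..." }  on failure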

// Status polling
function initStatusPolling() {
    updateStatus();
@ -285,39 +552,39 @@
    if (confirm("Are you sure you want to reset the scraper? This will stop all current tasks, optionally clear non-pending papers, and restart the scraper.")) {
        // Disable button to prevent multiple clicks
        resetButton.disabled = true;

        // Show a loading message
        showFlashMessage('Resetting scraper, please wait...', 'info');

        fetch('/scraper/reset', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify({
                clear_papers: true // You could make this configurable with a checkbox
            })
        })
            .then(response => response.json())
            .then(data => {
                if (data.success) {
                    showFlashMessage('Scraper has been completely reset and restarted', 'success');
                    // Update everything
                    updateStatus();
                    loadActivityStats(currentTimeRange);
                    setTimeout(() => { loadRecentActivity(); }, 1000);
                } else {
                    showFlashMessage(data.message || 'Error resetting scraper', 'error');
                }
                // Re-enable button
                resetButton.disabled = false;
            })
            .catch(error => {
                console.error("Error resetting scraper:", error);
                showFlashMessage('Error resetting scraper: ' + error.message, 'error');
                // Re-enable button
                resetButton.disabled = false;
            });
    }
}
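// Sketch (assumption): to make clear_papers user-configurable as the comment above suggests,
// the request body could read the flag from a checkbox, e.g.
//   body: JSON.stringify({
//       clear_papers: document.getElementById('clearPapersCheckbox')?.checked ?? false
//   })
// where 'clearPapersCheckbox' is a hypothetical element id, not one defined in this template.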
@ -345,6 +612,97 @@
    notificationsEnabled = notificationsToggle.checked;
}

// Poll paper status until it changes from Pending
function pollPaperStatus(paperId, interval = 3000, maxAttempts = 20) {
    let attempts = 0;

    // Immediately refresh activity log to show the initial pending status
    loadRecentActivity();

    const checkStatus = () => {
        attempts++;
        console.log(`Checking status of paper ${paperId}, attempt ${attempts}/${maxAttempts}`);

        // Fetch the current paper status
        fetch(`/api/papers/${paperId}`)
            .then(response => response.json())
            .then(data => {
                if (data && data.paper) {
                    const paper = data.paper;
                    console.log(`Paper status: ${paper.status}`);

                    // Update the UI with the current status
                    const row = document.querySelector(`.process-paper-btn[data-paper-id="${paperId}"]`).closest('tr');
                    if (row) {
                        const statusCell = row.querySelector('td:nth-child(4)');
                        let statusBadge = '';

                        if (paper.status === 'New') {
                            statusBadge = '<span class="badge bg-info">New</span>';
                        } else if (paper.status === 'Pending') {
                            statusBadge = '<span class="badge bg-warning text-dark">Pending</span>';
                        } else if (paper.status === 'Done') {
                            statusBadge = '<span class="badge bg-success">Done</span>';
                        } else if (paper.status === 'Failed') {
                            statusBadge = '<span class="badge bg-danger">Failed</span>';
                        } else {
                            statusBadge = `<span class="badge bg-secondary">${paper.status}</span>`;
                        }

                        statusCell.innerHTML = statusBadge;

                        // Update processing status message if status changed
                        if (paper.status !== 'Pending') {
                            if (paper.status === 'Done') {
                                processingStatus.textContent = `Paper processed successfully: ${paper.title}`;
                                processingStatus.className = 'alert alert-success mt-3';
                            } else if (paper.status === 'Failed') {
                                processingStatus.textContent = `Paper processing failed: ${paper.error_msg || 'Unknown error'}`;
                                processingStatus.className = 'alert alert-danger mt-3';
                            }
                        }
                    }

                    // Always refresh activity log
                    loadRecentActivity();

                    // If status is still pending and we haven't reached max attempts, check again
                    if (paper.status === 'Pending' && attempts < maxAttempts) {
                        setTimeout(checkStatus, interval);
                    } else {
                        // If status changed or we reached max attempts, refresh chart data too
                        loadActivityStats(currentTimeRange);

                        // Show notification if status changed
                        if (paper.status !== 'Pending') {
                            const status = paper.status === 'Done' ? 'success' : 'error';
                            const message = paper.status === 'Done'
                                ? `Paper processed successfully: ${paper.title}`
                                : `Paper processing failed: ${paper.error_msg || 'Unknown error'}`;
                            showFlashMessage(message, status);
                        }

                        // If we hit max attempts but status is still pending, show a message
                        if (paper.status === 'Pending' && attempts >= maxAttempts) {
                            processingStatus.textContent = 'Paper is still being processed. Check the activity log for updates.';
                            processingStatus.className = 'alert alert-info mt-3';
                        }
                    }
                }
            })
            .catch(error => {
                console.error(`Error polling paper status: ${error}`);
                // If there's an error, we can still try again if under max attempts
                if (attempts < maxAttempts) {
                    setTimeout(checkStatus, interval);
                }
            });
    };

    // Start checking
    setTimeout(checkStatus, interval);
}
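// Timing note: with the defaults passed in above (interval = 3000 ms, maxAttempts = 20),
// polling gives up after roughly 20 * 3 s = 60 s and leaves the "still being processed"
// message in place if the paper has not left the 'Pending' state by then.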

// Load data functions
function loadActivityStats(hours) {
    fetch(`/scraper/stats?hours=${hours}`)
@ -359,8 +717,10 @@
        .then(response => response.json())
        .then(data => {
            renderActivityLog(data);
            console.log("Activity log refreshed with latest data");
        })
        .catch((error) => {
            console.error("Failed to load activity logs:", error);
            // If the API endpoint doesn't exist, just show a message
            activityLog.innerHTML = '<tr><td colspan="4" class="text-center">Activity log API not available</td></tr>';
        });
@ -467,6 +827,26 @@
        });
}

// Flash message function
function showFlashMessage(message, type) {
    const flashContainer = document.createElement('div');
    flashContainer.className = `alert alert-${type === 'error' ? 'danger' : type} alert-dismissible fade show notification`;
    flashContainer.innerHTML = `
        ${message}
        <button type="button" class="btn-close" data-bs-dismiss="alert" aria-label="Close"></button>
    `;

    document.body.appendChild(flashContainer);

    // Auto dismiss after 5 seconds
    setTimeout(() => {
        flashContainer.classList.remove('show');
        setTimeout(() => {
            flashContainer.remove();
        }, 150); // Remove after fade out animation
    }, 5000);
}

// WebSocket for real-time notifications
function setupWebSocket() {
    // If WebSocket is available, implement it here