modularizes the scraper methods
parent 11f086aa64
commit 8f2375215d
@@ -1,10 +1,11 @@
 """Configuration management blueprint."""
-from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify
+from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify, current_app
 from ..db import db
 # Import the new model
 from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig
 from ..defaults import MAX_VOLUME
 import os  # Import os for path validation
+from scipaperloader.scrapers import __path__ as scrapers_path
 
 bp = Blueprint("config", __name__, url_prefix="/config")
 
@@ -281,6 +282,46 @@ def update_schedule():
     return redirect(url_for("config.schedule"))
 
 
+@bp.route("/update/scraper_module", methods=["POST"])
+def update_scraper_module():
+    """Update the scraper module configuration."""
+    from ..models import ScraperModuleConfig
+
+    new_scraper_module = request.form.get("scraper_module")
+    if not new_scraper_module:
+        flash("Scraper module cannot be empty.", "error")
+        return redirect(url_for("config.general"))
+
+    # Validate that the module exists and is valid
+    from scipaperloader.scrapers.factory import get_available_scrapers
+    available_modules = [m["name"] for m in get_available_scrapers()]
+
+    if new_scraper_module not in available_modules:
+        flash(f"Invalid scraper module: {new_scraper_module}", "error")
+        return redirect(url_for("config.general"))
+
+    # Update the database configuration
+    ScraperModuleConfig.set_module(new_scraper_module)
+    flash(f"Scraper module updated to '{new_scraper_module}'.", "success")
+    return redirect(url_for("config.general"))
+
+
+@bp.context_processor
+def inject_scraper_modules():
+    """Inject available scraper modules into the template context."""
+    from scipaperloader.scrapers.factory import get_available_scrapers
+    from ..models import ScraperModuleConfig
+
+    available_scrapers = get_available_scrapers()
+    current_module = ScraperModuleConfig.get_current_module()
+
+    return {
+        "available_scraper_modules": [s["name"] for s in available_scrapers],
+        "current_scraper_module": current_module,
+        "scraper_details": {s["name"]: s for s in available_scrapers}
+    }
+
+
 @bp.route("/api/schedule/stats")
 def schedule_stats():
     """Get statistics about the current schedule configuration."""
@@ -12,6 +12,7 @@ from ..celery import celery
 from ..defaults import MAX_VOLUME
 from celery.schedules import crontab
 from sqlalchemy import func
+from scipaperloader.scrapers.factory import get_scraper
 
 bp = Blueprint("scraper", __name__, url_prefix="/scraper")
 
@@ -153,7 +154,7 @@ def stop_scraper():
 
     # Stop any running tasks
     task_types_to_revoke = [
-        'scipaperloader.blueprints.scraper.dummy_process_paper',
+        'scipaperloader.blueprints.scraper.process_paper',
         'scipaperloader.blueprints.scraper.dummy_scheduled_scraper',
         'scipaperloader.blueprints.scraper.run_periodic_dummy_scraper'
     ]
@@ -224,7 +225,7 @@ def pause_scraper():
     # Just revoke processing tasks, but leave the periodic tasks running
     # so it can continue to check the state (which is now paused)
     task_types_to_revoke = [
-        'scipaperloader.blueprints.scraper.dummy_process_paper',
+        'scipaperloader.blueprints.scraper.process_paper',
         'scipaperloader.blueprints.scraper.dummy_scheduled_scraper'
     ]
 
@@ -373,70 +374,7 @@ def update_config():
         return jsonify({"success": False, "message": f"Unexpected error: {str(e)}"})
 
 
-@celery.task(bind=True)
-def dummy_scrape_paper(self):
-    """Simulate scraping a single paper."""
-    # Simulate success or failure
-    success = random.random() > 0.3  # 70% success rate
-
-    # Simulate processing time
-    import time
-    time.sleep(random.randint(2, 5))  # 2-5 seconds
-
-    if success:
-        # Create a dummy paper
-        new_paper = PaperMetadata(
-            title=f"Dummy Paper {random.randint(1000, 9999)}",
-            doi=f"10.1234/dummy.{random.randint(1000, 9999)}",
-            journal=random.choice([
-                "Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
-                "Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
-            ]),
-            type="article",
-            language="en",
-            published_online=datetime.now().date(),
-            status="Done",
-            file_path="/path/to/dummy/paper.pdf"
-        )
-
-        db.session.add(new_paper)
-        db.session.commit()
-
-        # Log the successful scrape
-        ActivityLog.log_scraper_activity(
-            action="scrape_paper",
-            paper_id=new_paper.id,
-            status="success",
-            description=f"Successfully scraped paper {new_paper.doi}"
-        )
-
-        return {
-            "success": True,
-            "paper_id": new_paper.id,
-            "title": new_paper.title,
-            "doi": new_paper.doi
-        }
-    else:
-        # Log the failed scrape
-        error_message = random.choice([
-            "Connection timeout",
-            "404 Not Found",
-            "Access denied",
-            "Invalid DOI format",
-            "PDF download failed",
-            "Rate limited by publisher"
-        ])
-
-        ActivityLog.log_scraper_activity(
-            action="scrape_paper",
-            status="error",
-            description=f"Failed to scrape paper: {error_message}"
-        )
-
-        return {
-            "success": False,
-            "error": error_message
-        }
-
-
 @celery.task
@@ -545,11 +483,11 @@ def dummy_scheduled_scraper():
     )
 
     # --- Now schedule processing for the newly selected "Pending" papers ---
-    # (Assuming dummy_process_paper takes a paper_id)
+    # (Using the new modular process_paper task)
     # Add random delays for processing within the hour (e.g., up to 3600 seconds)
     for paper_id in selected_paper_ids:
         delay = random.uniform(1, 3500)  # Random delay up to ~58 minutes
-        dummy_process_paper.apply_async(args=[paper_id], countdown=delay)
+        process_paper.apply_async(args=[paper_id], countdown=delay)
 
     ActivityLog.log_scraper_activity(
         action="schedule_processing",
@@ -568,109 +506,6 @@ def dummy_scheduled_scraper():
         return False
 
 
-@celery.task(bind=True)
-def dummy_process_paper(self, paper_id):
-    """
-    Process a single paper for the dummy scraper.
-
-    Args:
-        paper_id (int): ID of the paper to process
-    """
-    # First check if the scraper is still active and not paused
-    scraper_state = ScraperState.get_current_state()
-    if not scraper_state.is_active or scraper_state.is_paused:
-        # Log that task was skipped due to scraper being stopped or paused
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            status="info",
-            description=f"Skipped processing paper ID {paper_id} because scraper is {'paused' if scraper_state.is_paused else 'stopped'}"
-        )
-        return False
-
-    # Get the paper from database
-    paper = PaperMetadata.query.get(paper_id)
-    if not paper:
-        # Log error if paper not found
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            status="error",
-            description=f"Paper with ID {paper_id} not found"
-        )
-        return False
-
-    # Simulate random success/failure (70% success rate)
-    success = random.random() < 0.7
-
-    # Simulate processing time (1-5 seconds)
-    process_time = random.uniform(1, 5)
-    time.sleep(process_time)
-
-    # Check again if scraper is still active and not paused after the time delay
-    # This ensures we don't process papers if the scraper was stopped during the delay
-    scraper_state = ScraperState.get_current_state()
-    if not scraper_state.is_active or scraper_state.is_paused:
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            status="info",
-            description=f"Cancelled processing paper ID {paper_id} because scraper is {'paused' if scraper_state.is_paused else 'stopped'}"
-        )
-        return False
-
-    if success:
-        # --- Get configured download path ---
-        download_base_path = DownloadPathConfig.get_path()
-        # Ensure the base path exists (optional, but good practice)
-        # os.makedirs(download_base_path, exist_ok=True)
-
-        # --- Construct the file path ---
-        # Sanitize DOI for use in filename
-        safe_doi = paper.doi.replace('/', '_').replace(':', '_')
-        filename = f"{safe_doi}.pdf"
-        full_path = os.path.join(download_base_path, filename)
-
-        # Update paper status to "Done" and set the file path
-        paper.status = "Done"
-        paper.file_path = full_path  # Use the constructed path
-
-        # Log success
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            paper_id=paper.id,
-            status="success",
-            description=f"Successfully processed paper: {paper.doi}. File at: {full_path}"  # Log path
-        )
-    else:
-        # Update paper status to "Failed"
-        paper.status = "Failed"
-
-        # Generate random error message
-        error_message = random.choice([
-            "Publisher website unavailable",
-            "No PDF download link found",
-            "Access restricted",
-            "Download timeout",
-            "Invalid DOI",
-            "Rate limited by publisher"
-        ])
-        paper.error_msg = error_message
-
-        # Log failure
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            paper_id=paper.id,
-            status="error",
-            description=f"Failed to process paper: {error_message}"
-        )
-
-    # Update the timestamp
-    paper.updated_at = datetime.utcnow()
-
-    # Commit changes to database
-    db.session.commit()
-
-    return success
-
-
 @celery.task(bind=True)
 def process_paper_batch(self, paper_ids):
     """
@@ -914,3 +749,21 @@ def calculate_papers_for_current_hour():
     )
 
     return papers_this_hour
+
+
+@celery.task(bind=True)
+def process_paper(self, paper_id):
+    """Process a paper using the configured scraper."""
+    from scipaperloader.models import PaperMetadata
+    paper = PaperMetadata.query.get(paper_id)
+    if not paper:
+        return {"status": "error", "message": f"Paper with ID {paper_id} not found"}
+
+    scraper = get_scraper()
+    result = scraper.scrape(paper.doi)
+
+    return {
+        "paper_id": paper_id,
+        "status": result.status,
+        "message": result.message
+    }
@@ -6,3 +6,4 @@ class Config:
     SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///papers.db")
     SQLALCHEMY_TRACK_MODIFICATIONS = False
     APP_TITLE = os.environ.get("APP_TITLE", "SciPaperLoader")
+    SCRAPER_MODULE = os.environ.get("SCRAPER_MODULE", "dummy")
@@ -277,6 +277,40 @@ class ScraperState(db.Model):
         return state.is_active and not state.is_paused
 
 
+class ScraperModuleConfig(db.Model):
+    """Model to store the configured scraper module."""
+    id = db.Column(db.Integer, primary_key=True)
+    module_name = db.Column(db.String(100), default="dummy")
+
+    @classmethod
+    def get_current_module(cls):
+        """Get the currently configured scraper module."""
+        config = cls.query.first()
+        if not config:
+            config = cls(module_name="dummy")
+            db.session.add(config)
+            db.session.commit()
+        return config.module_name
+
+    @classmethod
+    def set_module(cls, module_name):
+        """Set the scraper module."""
+        config = cls.query.first()
+        if not config:
+            config = cls(module_name=module_name)
+            db.session.add(config)
+        else:
+            old_value = config.module_name
+            config.module_name = module_name
+            ActivityLog.log_config_change(
+                config_key="scraper_module",
+                old_value=old_value,
+                new_value=module_name,
+                description="Updated scraper module configuration"
+            )
+        db.session.commit()
+        return config
+
 def init_schedule_config():
     """Initialize ScheduleConfig with default values if empty"""
     if ScheduleConfig.query.count() == 0:
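For quick verification, a minimal sketch of switching the active module by hand; this is not part of the commit and assumes it is run inside `flask shell`, which pushes an application context automatically:

# Sketch only: run inside `flask shell` (app context is pushed automatically).
from scipaperloader.models import ScraperModuleConfig

ScraperModuleConfig.set_module("dummy")          # persists the choice and logs the config change
print(ScraperModuleConfig.get_current_module())  # -> "dummy"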
scipaperloader/scrapers/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
+# This package contains all scraper modules.
+# Each scraper should implement the BaseScraper interface from base.py.
scipaperloader/scrapers/base.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+from abc import ABC, abstractmethod
+from typing import NamedTuple, Optional, Dict
+from datetime import datetime
+
+class ScrapeResult(NamedTuple):
+    status: str                           # "success", "error", "skipped"
+    message: str                          # human-readable status
+    data: Optional[Dict]                  # any extra payload (file_path, metadata, etc.)
+    duration: Optional[float] = None      # processing time in seconds
+    timestamp: Optional[datetime] = None  # when the operation completed
+
+class BaseScraper(ABC):
+    """Base class for all scraper implementations."""
+
+    @abstractmethod
+    def scrape(self, doi: str) -> ScrapeResult:
+        """
+        Fetch metadata and/or download paper for the given DOI.
+
+        Args:
+            doi: The DOI of the paper to scrape
+
+        Returns:
+            ScrapeResult with status, message, and optional data
+        """
+        pass
+
+    def get_name(self) -> str:
+        """Return the name of this scraper."""
+        return self.__class__.__name__
+
+    def get_description(self) -> str:
+        """Return a description of this scraper."""
+        return getattr(self.__class__, "__doc__", "No description available")
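To illustrate the contract above, a hypothetical extra module (not part of this commit) would only need to live in scipaperloader/scrapers/ and expose a Scraper class; the factory added below then discovers it automatically:

# scipaperloader/scrapers/example.py -- illustrative sketch only, not included in this commit.
import time
from datetime import datetime

from .base import BaseScraper, ScrapeResult


class Scraper(BaseScraper):
    """Example scraper that marks every DOI as skipped."""

    def scrape(self, doi: str) -> ScrapeResult:
        start = time.time()
        # A real implementation would fetch metadata and download the PDF here.
        return ScrapeResult(
            status="skipped",
            message=f"Example scraper does not handle {doi}",
            data=None,
            duration=time.time() - start,
            timestamp=datetime.utcnow(),
        )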
scipaperloader/scrapers/dummy.py (new file, 94 lines)
@@ -0,0 +1,94 @@
+import time
+import random
+from datetime import datetime
+from .base import BaseScraper, ScrapeResult
+from flask import current_app
+from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
+from ..db import db
+
+
+class Scraper(BaseScraper):
+    """Dummy scraper for testing purposes that simulates paper downloading."""
+
+    def scrape(self, doi: str) -> ScrapeResult:
+        """Simulate scraping a paper with realistic timing and random success/failure."""
+        start_time = time.time()
+
+        paper = PaperMetadata.query.filter_by(doi=doi).first()
+        if not paper:
+            return ScrapeResult(
+                status="error",
+                message=f"No paper found for DOI {doi}",
+                data=None,
+                duration=time.time() - start_time,
+                timestamp=datetime.utcnow()
+            )
+
+        # Simulate processing time (1-3 seconds)
+        processing_time = random.uniform(1, 3)
+        time.sleep(processing_time)
+
+        # Simulate 80% success rate
+        success = random.random() < 0.8
+
+        if success:
+            # Get download path and simulate file creation
+            download_path = DownloadPathConfig.get_path()
+            file_name = f"{doi.replace('/', '_')}.pdf"
+            file_path = f"{download_path}/{file_name}"
+
+            # Update paper status
+            paper.status = "Done"
+            paper.file_path = file_path
+            paper.error_msg = None
+
+            # Log success
+            ActivityLog.log_scraper_activity(
+                action="dummy_scrape",
+                status="success",
+                description=f"Successfully scraped {doi}",
+                paper_id=paper.id
+            )
+
+            result = ScrapeResult(
+                status="success",
+                message=f"Successfully scraped {doi}",
+                data={
+                    "file_path": file_path,
+                    "title": paper.title,
+                    "journal": paper.journal
+                },
+                duration=time.time() - start_time,
+                timestamp=datetime.utcnow()
+            )
+        else:
+            # Simulate failure
+            error_messages = [
+                "Paper not found in database",
+                "Access denied by publisher",
+                "Rate limit exceeded",
+                "Network timeout",
+                "Invalid DOI format"
+            ]
+            error_msg = random.choice(error_messages)
+
+            paper.status = "Failed"
+            paper.error_msg = error_msg
+
+            # Log failure
+            ActivityLog.log_scraper_activity(
+                action="dummy_scrape",
+                status="error",
+                description=f"Failed to scrape {doi}: {error_msg}",
+                paper_id=paper.id
+            )
+
+            result = ScrapeResult(
+                status="error",
+                message=f"Failed to scrape {doi}: {error_msg}",
+                data={"error_code": "dummy_error"},
+                duration=time.time() - start_time,
+                timestamp=datetime.utcnow()
+            )
+
+        db.session.commit()
+        return result
scipaperloader/scrapers/factory.py (new file, 59 lines)
@@ -0,0 +1,59 @@
+import importlib
+from flask import current_app
+from .base import BaseScraper
+
+
+def get_scraper() -> BaseScraper:
+    """Load the configured scraper module dynamically with error handling."""
+    from ..models import ScraperModuleConfig, ActivityLog
+
+    try:
+        # Get module name from database first, fallback to config
+        name = ScraperModuleConfig.get_current_module()
+        if not name:
+            name = current_app.config.get("SCRAPER_MODULE", "dummy")
+
+        module = importlib.import_module(f"scipaperloader.scrapers.{name}")
+        cls = getattr(module, "Scraper")
+
+        # Validate that it's actually a BaseScraper
+        if not issubclass(cls, BaseScraper):
+            raise TypeError(f"Scraper class in module '{name}' does not inherit from BaseScraper")
+
+        return cls()
+
+    except (ImportError, AttributeError, TypeError) as e:
+        ActivityLog.log_error(
+            error_message=f"Failed to load scraper module '{name}': {str(e)}",
+            source="scraper_factory",
+            severity="error"
+        )
+        # Fallback to dummy scraper
+        from .dummy import Scraper as DummyScraper
+        return DummyScraper()
+
+
+def get_available_scrapers():
+    """Get list of available scraper modules."""
+    import os
+    from scipaperloader.scrapers import __path__ as scrapers_path
+
+    modules = []
+    scrapers_dir = scrapers_path[0]
+
+    for filename in os.listdir(scrapers_dir):
+        if filename.endswith(".py") and filename not in ("__init__.py", "base.py", "factory.py"):
+            module_name = filename[:-3]
+            try:
+                # Try to import and validate the module
+                module = importlib.import_module(f"scipaperloader.scrapers.{module_name}")
+                cls = getattr(module, "Scraper", None)
+                if cls and issubclass(cls, BaseScraper):
+                    modules.append({
+                        "name": module_name,
+                        "class": cls,
+                        "description": getattr(cls, "__doc__", "No description available")
+                    })
+            except (ImportError, AttributeError, TypeError):
+                # Skip invalid modules
+                pass
+
+    return modules
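As a rough usage sketch (not part of the commit), the factory can be exercised from an application context; create_app is assumed here as the app factory name and the DOI is a placeholder:

# Sketch only: listing modules and running the configured scraper.
from scipaperloader import create_app  # assumed factory name
from scipaperloader.scrapers.factory import get_scraper, get_available_scrapers

app = create_app()
with app.app_context():
    # List every module that ships a valid Scraper class.
    for mod in get_available_scrapers():
        print(mod["name"], "-", mod["description"])

    scraper = get_scraper()                         # falls back to the dummy scraper on error
    result = scraper.scrape("10.1234/placeholder")  # placeholder DOI
    print(result.status, result.message)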
@@ -9,52 +9,87 @@
 <!-- include flash messages template -->
 {% include "partials/flash_messages.html.jinja" %}
 
-<form action="{{ url_for('config.update_general') }}" method="post">
-    <div class="form-section">
-        <h6>Scraper Volume</h6>
-        <p class="text-muted">Configure the total number of papers to scrape per day.</p>
-        <div class="mb-3">
-            <label for="totalVolume" class="form-label">Papers per day:</label>
-            <input type="number" class="form-control" id="totalVolume" name="total_volume" min="1"
-                max="{{ max_volume }}" value="{{ volume_config.volume }}" required>
-            <div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
-        </div>
-    </div>
-
-    <div class="form-section">
-        <h6>Download Path</h6>
-        <p class="text-muted">Base directory where scraped paper files will be stored.</p>
-        <div class="mb-3">
-            <label for="downloadPath" class="form-label">Download Directory:</label>
-            <input type="text" class="form-control" id="downloadPath" name="download_path"
-                value="{{ download_path_config.path }}" required>
-            <div class="form-text">Enter the full path to the download directory (e.g., /data/papers).
-                Ensure the directory exists and the application has write permissions.</div>
-        </div>
-    </div>
-
-    <div class="form-section">
-        <h6>System Settings</h6>
-        <p class="text-muted">Configure general system behavior.</p>
-
-        <div class="mb-3 form-check">
-            <input type="checkbox" class="form-check-input" id="enableNotifications" checked>
-            <label class="form-check-label" for="enableNotifications">
-                Enable email notifications
-            </label>
-        </div>
-
-        <div class="mb-3 form-check">
-            <input type="checkbox" class="form-check-input" id="enableLogging" checked>
-            <label class="form-check-label" for="enableLogging">
-                Enable detailed activity logging
-            </label>
-        </div>
-    </div>
-
-    <button type="submit" class="btn btn-primary">Save General Settings</button>
-</form>
+<div class="row">
+    <!-- General Settings Column -->
+    <div class="col-md-6">
+        <form action="{{ url_for('config.update_general') }}" method="post">
+            <div class="form-section">
+                <h6>Scraper Volume</h6>
+                <p class="text-muted">Configure the total number of papers to scrape per day.</p>
+
+                <div class="mb-3">
+                    <label for="totalVolume" class="form-label">Papers per day:</label>
+                    <input type="number" class="form-control" id="totalVolume" name="total_volume"
+                        min="1" max="{{ max_volume }}" value="{{ volume_config.volume }}" required>
+                    <div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
+                </div>
+            </div>
+
+            <div class="form-section">
+                <h6>Download Path</h6>
+                <p class="text-muted">Base directory where scraped paper files will be stored.</p>
+                <div class="mb-3">
+                    <label for="downloadPath" class="form-label">Download Directory:</label>
+                    <input type="text" class="form-control" id="downloadPath" name="download_path"
+                        value="{{ download_path_config.path }}" required>
+                    <div class="form-text">Enter the full path to the download directory (e.g.,
+                        /data/papers).
+                        Ensure the directory exists and the application has write permissions.</div>
+                </div>
+            </div>
+
+            <div class="form-section">
+                <h6>System Settings</h6>
+                <p class="text-muted">Configure general system behavior.</p>
+
+                <div class="mb-3 form-check">
+                    <input type="checkbox" class="form-check-input" id="enableNotifications" checked>
+                    <label class="form-check-label" for="enableNotifications">
+                        Enable email notifications
+                    </label>
+                </div>
+
+                <div class="mb-3 form-check">
+                    <input type="checkbox" class="form-check-input" id="enableLogging" checked>
+                    <label class="form-check-label" for="enableLogging">
+                        Enable detailed activity logging
+                    </label>
+                </div>
+            </div>
+
+            <button type="submit" class="btn btn-primary">Save General Settings</button>
+        </form>
+    </div>
+
+    <!-- Scraper Module Column -->
+    <div class="col-md-6">
+        <form method="post" action="{{ url_for('config.update_scraper_module') }}">
+            <div class="form-section">
+                <h6>Scraper Module</h6>
+                <p class="text-muted">Select which scraper module to use for processing papers.</p>
+
+                <div class="mb-3">
+                    <label for="scraper_module" class="form-label">Active Scraper Module:</label>
+                    <select class="form-control" id="scraper_module" name="scraper_module">
+                        {% for module in available_scraper_modules %}
+                        <option value="{{ module }}" {% if module==current_scraper_module %} selected
+                            {%endif %}>
+                            {{ module }}
+                            {% if scraper_details[module] %}
+                            - {{ scraper_details[module].description[:50] }}...
+                            {% endif %}
+                        </option>
+                        {% endfor %}
+                    </select>
+                    <div class="form-text">
+                        Current module: <strong>{{ current_scraper_module }}</strong>
+                    </div>
+                </div>
+            </div>
+            <button type="submit" class="btn btn-primary">Update Scraper Module</button>
+        </form>
+    </div>
+</div>
 </div>
 </div>
 </div>