modularizes the scraper methods

Michael Beck 2025-05-23 14:32:41 +02:00
parent 11f086aa64
commit 8f2375215d
9 changed files with 366 additions and 213 deletions

View File

@@ -1,10 +1,11 @@
"""Configuration management blueprint."""
from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify
from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify, current_app
from ..db import db
# Import the new model
from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig
from ..defaults import MAX_VOLUME
import os # Import os for path validation
from scipaperloader.scrapers import __path__ as scrapers_path
bp = Blueprint("config", __name__, url_prefix="/config")
@@ -281,6 +282,46 @@ def update_schedule():
return redirect(url_for("config.schedule"))
@bp.route("/update/scraper_module", methods=["POST"])
def update_scraper_module():
"""Update the scraper module configuration."""
from ..models import ScraperModuleConfig
new_scraper_module = request.form.get("scraper_module")
if not new_scraper_module:
flash("Scraper module cannot be empty.", "error")
return redirect(url_for("config.general"))
# Validate that the module exists and is valid
from scipaperloader.scrapers.factory import get_available_scrapers
available_modules = [m["name"] for m in get_available_scrapers()]
if new_scraper_module not in available_modules:
flash(f"Invalid scraper module: {new_scraper_module}", "error")
return redirect(url_for("config.general"))
# Update the database configuration
ScraperModuleConfig.set_module(new_scraper_module)
flash(f"Scraper module updated to '{new_scraper_module}'.", "success")
return redirect(url_for("config.general"))
@bp.context_processor
def inject_scraper_modules():
"""Inject available scraper modules into the template context."""
from scipaperloader.scrapers.factory import get_available_scrapers
from ..models import ScraperModuleConfig
available_scrapers = get_available_scrapers()
current_module = ScraperModuleConfig.get_current_module()
return {
"available_scraper_modules": [s["name"] for s in available_scrapers],
"current_scraper_module": current_module,
"scraper_details": {s["name"]: s for s in available_scrapers}
}
@bp.route("/api/schedule/stats")
def schedule_stats():
"""Get statistics about the current schedule configuration."""

View File

@@ -12,6 +12,7 @@ from ..celery import celery
from ..defaults import MAX_VOLUME
from celery.schedules import crontab
from sqlalchemy import func
from scipaperloader.scrapers.factory import get_scraper
bp = Blueprint("scraper", __name__, url_prefix="/scraper")
@@ -153,7 +154,7 @@ def stop_scraper():
# Stop any running tasks
task_types_to_revoke = [
'scipaperloader.blueprints.scraper.dummy_process_paper',
'scipaperloader.blueprints.scraper.process_paper',
'scipaperloader.blueprints.scraper.dummy_scheduled_scraper',
'scipaperloader.blueprints.scraper.run_periodic_dummy_scraper'
]
@@ -224,7 +225,7 @@ def pause_scraper():
# Just revoke processing tasks, but leave the periodic tasks running
# so it can continue to check the state (which is now paused)
task_types_to_revoke = [
'scipaperloader.blueprints.scraper.dummy_process_paper',
'scipaperloader.blueprints.scraper.process_paper',
'scipaperloader.blueprints.scraper.dummy_scheduled_scraper'
]
@@ -373,70 +374,7 @@ def update_config():
return jsonify({"success": False, "message": f"Unexpected error: {str(e)}"})
@celery.task(bind=True)
def dummy_scrape_paper(self):
"""Simulate scraping a single paper."""
# Simulate success or failure
success = random.random() > 0.3 # 70% success rate
# Simulate processing time
import time
time.sleep(random.randint(2, 5)) # 2-5 seconds
if success:
# Create a dummy paper
new_paper = PaperMetadata(
title=f"Dummy Paper {random.randint(1000, 9999)}",
doi=f"10.1234/dummy.{random.randint(1000, 9999)}",
journal=random.choice([
"Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
"Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
]),
type="article",
language="en",
published_online=datetime.now().date(),
status="Done",
file_path="/path/to/dummy/paper.pdf"
)
db.session.add(new_paper)
db.session.commit()
# Log the successful scrape
ActivityLog.log_scraper_activity(
action="scrape_paper",
paper_id=new_paper.id,
status="success",
description=f"Successfully scraped paper {new_paper.doi}"
)
return {
"success": True,
"paper_id": new_paper.id,
"title": new_paper.title,
"doi": new_paper.doi
}
else:
# Log the failed scrape
error_message = random.choice([
"Connection timeout",
"404 Not Found",
"Access denied",
"Invalid DOI format",
"PDF download failed",
"Rate limited by publisher"
])
ActivityLog.log_scraper_activity(
action="scrape_paper",
status="error",
description=f"Failed to scrape paper: {error_message}"
)
return {
"success": False,
"error": error_message
}
@celery.task
@@ -545,11 +483,11 @@ def dummy_scheduled_scraper():
)
# --- Now schedule processing for the newly selected "Pending" papers ---
# (Assuming dummy_process_paper takes a paper_id)
# (Using the new modular process_paper task)
# Add random delays for processing within the hour (e.g., up to 3600 seconds)
for paper_id in selected_paper_ids:
delay = random.uniform(1, 3500) # Random delay up to ~58 minutes
dummy_process_paper.apply_async(args=[paper_id], countdown=delay)
process_paper.apply_async(args=[paper_id], countdown=delay)
ActivityLog.log_scraper_activity(
action="schedule_processing",
@@ -568,109 +506,6 @@ def dummy_scheduled_scraper():
return False
@celery.task(bind=True)
def dummy_process_paper(self, paper_id):
"""
Process a single paper for the dummy scraper.
Args:
paper_id (int): ID of the paper to process
"""
# First check if the scraper is still active and not paused
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active or scraper_state.is_paused:
# Log that task was skipped due to scraper being stopped or paused
ActivityLog.log_scraper_activity(
action="process_paper",
status="info",
description=f"Skipped processing paper ID {paper_id} because scraper is {'paused' if scraper_state.is_paused else 'stopped'}"
)
return False
# Get the paper from database
paper = PaperMetadata.query.get(paper_id)
if not paper:
# Log error if paper not found
ActivityLog.log_scraper_activity(
action="process_paper",
status="error",
description=f"Paper with ID {paper_id} not found"
)
return False
# Simulate random success/failure (70% success rate)
success = random.random() < 0.7
# Simulate processing time (1-5 seconds)
process_time = random.uniform(1, 5)
time.sleep(process_time)
# Check again if scraper is still active and not paused after the time delay
# This ensures we don't process papers if the scraper was stopped during the delay
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active or scraper_state.is_paused:
ActivityLog.log_scraper_activity(
action="process_paper",
status="info",
description=f"Cancelled processing paper ID {paper_id} because scraper is {'paused' if scraper_state.is_paused else 'stopped'}"
)
return False
if success:
# --- Get configured download path ---
download_base_path = DownloadPathConfig.get_path()
# Ensure the base path exists (optional, but good practice)
# os.makedirs(download_base_path, exist_ok=True)
# --- Construct the file path ---
# Sanitize DOI for use in filename
safe_doi = paper.doi.replace('/', '_').replace(':', '_')
filename = f"{safe_doi}.pdf"
full_path = os.path.join(download_base_path, filename)
# Update paper status to "Done" and set the file path
paper.status = "Done"
paper.file_path = full_path # Use the constructed path
# Log success
ActivityLog.log_scraper_activity(
action="process_paper",
paper_id=paper.id,
status="success",
description=f"Successfully processed paper: {paper.doi}. File at: {full_path}" # Log path
)
else:
# Update paper status to "Failed"
paper.status = "Failed"
# Generate random error message
error_message = random.choice([
"Publisher website unavailable",
"No PDF download link found",
"Access restricted",
"Download timeout",
"Invalid DOI",
"Rate limited by publisher"
])
paper.error_msg = error_message
# Log failure
ActivityLog.log_scraper_activity(
action="process_paper",
paper_id=paper.id,
status="error",
description=f"Failed to process paper: {error_message}"
)
# Update the timestamp
paper.updated_at = datetime.utcnow()
# Commit changes to database
db.session.commit()
return success
@celery.task(bind=True)
def process_paper_batch(self, paper_ids):
"""
@@ -914,3 +749,21 @@ def calculate_papers_for_current_hour():
)
return papers_this_hour
@celery.task(bind=True)
def process_paper(self, paper_id):
"""Process a paper using the configured scraper."""
from scipaperloader.models import PaperMetadata
paper = PaperMetadata.query.get(paper_id)
if not paper:
return {"status": "error", "message": f"Paper with ID {paper_id} not found"}
scraper = get_scraper()
result = scraper.scrape(paper.doi)
return {
"paper_id": paper_id,
"status": result.status,
"message": result.message
}
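A sketch of how the new modular task might be dispatched or run eagerly in a test; the paper variable, the 30-second countdown, and the eager call are illustrative:

from scipaperloader.blueprints.scraper import process_paper

# Queue a paper for the configured scraper with a short delay.
process_paper.apply_async(args=[paper.id], countdown=30)

# Or execute it synchronously without a worker, e.g. in a test:
result = process_paper.apply(args=[paper.id]).get()
print(result["status"], result["message"])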

View File

@@ -6,3 +6,4 @@ class Config:
SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///papers.db")
SQLALCHEMY_TRACK_MODIFICATIONS = False
APP_TITLE = os.environ.get("APP_TITLE", "SciPaperLoader")
SCRAPER_MODULE = os.environ.get("SCRAPER_MODULE", "dummy")
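The environment variable only provides the initial default; the factory consults the database-backed ScraperModuleConfig first and falls back to this setting. An illustrative snippet:

import os

# Hypothetical: set before the Flask app is created, used only until a
# ScraperModuleConfig row exists in the database.
os.environ["SCRAPER_MODULE"] = "dummy"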

View File

@@ -277,6 +277,40 @@ class ScraperState(db.Model):
return state.is_active and not state.is_paused
class ScraperModuleConfig(db.Model):
"""Model to store the configured scraper module."""
id = db.Column(db.Integer, primary_key=True)
module_name = db.Column(db.String(100), default="dummy")
@classmethod
def get_current_module(cls):
"""Get the currently configured scraper module."""
config = cls.query.first()
if not config:
config = cls(module_name="dummy")
db.session.add(config)
db.session.commit()
return config.module_name
@classmethod
def set_module(cls, module_name):
"""Set the scraper module."""
config = cls.query.first()
if not config:
config = cls(module_name=module_name)
db.session.add(config)
else:
old_value = config.module_name
config.module_name = module_name
ActivityLog.log_config_change(
config_key="scraper_module",
old_value=old_value,
new_value=module_name,
description="Updated scraper module configuration"
)
db.session.commit()
return config
def init_schedule_config():
"""Initialize ScheduleConfig with default values if empty"""
if ScheduleConfig.query.count() == 0:
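Typical use of the new model from a shell or task, assuming an active application context:

from scipaperloader.models import ScraperModuleConfig

# Persist a new module choice; updates to an existing row are also recorded
# via ActivityLog.log_config_change.
ScraperModuleConfig.set_module("dummy")
print(ScraperModuleConfig.get_current_module())  # -> "dummy"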

View File

@@ -0,0 +1,2 @@
# This package contains all scraper modules.
# Each scraper should implement the BaseScraper interface from base.py.

View File

@@ -0,0 +1,34 @@
from abc import ABC, abstractmethod
from typing import NamedTuple, Optional, Dict
from datetime import datetime
class ScrapeResult(NamedTuple):
status: str # "success", "error", "skipped"
message: str # human-readable status
data: Optional[Dict] # any extra payload (file_path, metadata, etc.)
duration: Optional[float] = None # processing time in seconds
timestamp: Optional[datetime] = None # when the operation completed
class BaseScraper(ABC):
"""Base class for all scraper implementations."""
@abstractmethod
def scrape(self, doi: str) -> ScrapeResult:
"""
Fetch metadata and/or download paper for the given DOI.
Args:
doi: The DOI of the paper to scrape
Returns:
ScrapeResult with status, message, and optional data
"""
pass
def get_name(self) -> str:
"""Return the name of this scraper."""
return self.__class__.__name__
def get_description(self) -> str:
"""Return a description of this scraper."""
return self.__class__.__doc__ or "No description available"
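A minimal sketch of an additional scraper module written against this interface. The module name and fetch logic are hypothetical; the class must be named Scraper because the factory looks it up by that name:

# scipaperloader/scrapers/example.py (hypothetical module)
import time
from datetime import datetime

from .base import BaseScraper, ScrapeResult

class Scraper(BaseScraper):
    """Example scraper that records where a downloaded PDF would be stored."""

    def scrape(self, doi: str) -> ScrapeResult:
        start = time.time()
        # A real implementation would resolve the DOI and download the file here.
        return ScrapeResult(
            status="success",
            message=f"Fetched {doi}",
            data={"file_path": f"/tmp/{doi.replace('/', '_')}.pdf"},
            duration=time.time() - start,
            timestamp=datetime.utcnow(),
        )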

View File

@@ -0,0 +1,94 @@
import time
import random
from datetime import datetime
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
class Scraper(BaseScraper):
"""Dummy scraper for testing purposes that simulates paper downloading."""
def scrape(self, doi: str) -> ScrapeResult:
"""Simulate scraping a paper with realistic timing and random success/failure."""
start_time = time.time()
paper = PaperMetadata.query.filter_by(doi=doi).first()
if not paper:
return ScrapeResult(
status="error",
message=f"No paper found for DOI {doi}",
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Simulate processing time (1-3 seconds)
processing_time = random.uniform(1, 3)
time.sleep(processing_time)
# Simulate 80% success rate
success = random.random() < 0.8
if success:
# Get download path and simulate file creation
download_path = DownloadPathConfig.get_path()
file_name = f"{doi.replace('/', '_')}.pdf"
file_path = f"{download_path}/{file_name}"
# Update paper status
paper.status = "Done"
paper.file_path = file_path
paper.error_msg = None
# Log success
ActivityLog.log_scraper_activity(
action="dummy_scrape",
status="success",
description=f"Successfully scraped {doi}",
paper_id=paper.id
)
result = ScrapeResult(
status="success",
message=f"Successfully scraped {doi}",
data={
"file_path": file_path,
"title": paper.title,
"journal": paper.journal
},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
else:
# Simulate failure
error_messages = [
"Paper not found in database",
"Access denied by publisher",
"Rate limit exceeded",
"Network timeout",
"Invalid DOI format"
]
error_msg = random.choice(error_messages)
paper.status = "Failed"
paper.error_msg = error_msg
# Log failure
ActivityLog.log_scraper_activity(
action="dummy_scrape",
status="error",
description=f"Failed to scrape {doi}: {error_msg}",
paper_id=paper.id
)
result = ScrapeResult(
status="error",
message=f"Failed to scrape {doi}: {error_msg}",
data={"error_code": "dummy_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
db.session.commit()
return result

View File

@@ -0,0 +1,59 @@
import importlib
from flask import current_app
from .base import BaseScraper
def get_scraper() -> BaseScraper:
"""Load the configured scraper module dynamically with error handling."""
from ..models import ScraperModuleConfig, ActivityLog
try:
# Get module name from database first, fallback to config
name = ScraperModuleConfig.get_current_module()
if not name:
name = current_app.config.get("SCRAPER_MODULE", "dummy")
module = importlib.import_module(f"scipaperloader.scrapers.{name}")
cls = getattr(module, "Scraper")
# Validate that it's actually a BaseScraper
if not issubclass(cls, BaseScraper):
raise TypeError(f"Scraper class in module '{name}' does not inherit from BaseScraper")
return cls()
except (ImportError, AttributeError, TypeError) as e:
ActivityLog.log_error(
error_message=f"Failed to load scraper module '{name}': {str(e)}",
source="scraper_factory",
severity="error"
)
# Fallback to dummy scraper
from .dummy import Scraper as DummyScraper
return DummyScraper()
def get_available_scrapers():
"""Get list of available scraper modules."""
import os
from scipaperloader.scrapers import __path__ as scrapers_path
modules = []
scrapers_dir = scrapers_path[0]
for filename in os.listdir(scrapers_dir):
if filename.endswith(".py") and filename not in ("__init__.py", "base.py", "factory.py"):
module_name = filename[:-3]
try:
# Try to import and validate the module
module = importlib.import_module(f"scipaperloader.scrapers.{module_name}")
cls = getattr(module, "Scraper", None)
if cls and issubclass(cls, BaseScraper):
modules.append({
"name": module_name,
"class": cls,
"description": getattr(cls, "__doc__", "No description available")
})
except (ImportError, AttributeError, TypeError):
# Skip invalid modules
pass
return modules
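Typical use of the factory, assuming an active application context; the DOI and the printed values are illustrative:

from scipaperloader.scrapers.factory import get_scraper, get_available_scrapers

scraper = get_scraper()  # loads the configured module, falls back to the dummy scraper on error
result = scraper.scrape("10.1234/example.doi")  # hypothetical DOI
print(result.status, result.message)

# Enumerate the scraper modules discovered in scipaperloader/scrapers/:
print([m["name"] for m in get_available_scrapers()])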

View File

@@ -9,52 +9,87 @@
<!-- include flash messages template -->
{% include "partials/flash_messages.html.jinja" %}
<form action="{{ url_for('config.update_general') }}" method="post">
<div class="form-section">
<h6>Scraper Volume</h6>
<p class="text-muted">Configure the total number of papers to scrape per day.</p>
<div class="row">
<!-- General Settings Column -->
<div class="col-md-6">
<form action="{{ url_for('config.update_general') }}" method="post">
<div class="form-section">
<h6>Scraper Volume</h6>
<p class="text-muted">Configure the total number of papers to scrape per day.</p>
<div class="mb-3">
<label for="totalVolume" class="form-label">Papers per day:</label>
<input type="number" class="form-control" id="totalVolume" name="total_volume" min="1"
max="{{ max_volume }}" value="{{ volume_config.volume }}" required>
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
</div>
<div class="mb-3">
<label for="totalVolume" class="form-label">Papers per day:</label>
<input type="number" class="form-control" id="totalVolume" name="total_volume"
min="1" max="{{ max_volume }}" value="{{ volume_config.volume }}" required>
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
</div>
</div>
<div class="form-section">
<h6>Download Path</h6>
<p class="text-muted">Base directory where scraped paper files will be stored.</p>
<div class="mb-3">
<label for="downloadPath" class="form-label">Download Directory:</label>
<input type="text" class="form-control" id="downloadPath" name="download_path"
value="{{ download_path_config.path }}" required>
<div class="form-text">Enter the full path to the download directory (e.g.,
/data/papers).
Ensure the directory exists and the application has write permissions.</div>
</div>
</div>
<div class="form-section">
<h6>System Settings</h6>
<p class="text-muted">Configure general system behavior.</p>
<div class="mb-3 form-check">
<input type="checkbox" class="form-check-input" id="enableNotifications" checked>
<label class="form-check-label" for="enableNotifications">
Enable email notifications
</label>
</div>
<div class="mb-3 form-check">
<input type="checkbox" class="form-check-input" id="enableLogging" checked>
<label class="form-check-label" for="enableLogging">
Enable detailed activity logging
</label>
</div>
</div>
<button type="submit" class="btn btn-primary">Save General Settings</button>
</form>
</div>
<div class="form-section">
<h6>Download Path</h6>
<p class="text-muted">Base directory where scraped paper files will be stored.</p>
<div class="mb-3">
<label for="downloadPath" class="form-label">Download Directory:</label>
<input type="text" class="form-control" id="downloadPath" name="download_path"
value="{{ download_path_config.path }}" required>
<div class="form-text">Enter the full path to the download directory (e.g., /data/papers).
Ensure the directory exists and the application has write permissions.</div>
</div>
<!-- Scraper Module Column -->
<div class="col-md-6">
<form method="post" action="{{ url_for('config.update_scraper_module') }}">
<div class="form-section">
<h6>Scraper Module</h6>
<p class="text-muted">Select which scraper module to use for processing papers.</p>
<div class="mb-3">
<label for="scraper_module" class="form-label">Active Scraper Module:</label>
<select class="form-control" id="scraper_module" name="scraper_module">
{% for module in available_scraper_modules %}
<option value="{{ module }}" {% if module==current_scraper_module %} selected
{%endif %}>
{{ module }}
{% if scraper_details[module] %}
- {{ scraper_details[module].description[:50] }}...
{% endif %}
</option>
{% endfor %}
</select>
<div class="form-text">
Current module: <strong>{{ current_scraper_module }}</strong>
</div>
</div>
</div>
<button type="submit" class="btn btn-primary">Update Scraper Module</button>
</form>
</div>
<div class="form-section">
<h6>System Settings</h6>
<p class="text-muted">Configure general system behavior.</p>
<div class="mb-3 form-check">
<input type="checkbox" class="form-check-input" id="enableNotifications" checked>
<label class="form-check-label" for="enableNotifications">
Enable email notifications
</label>
</div>
<div class="mb-3 form-check">
<input type="checkbox" class="form-check-input" id="enableLogging" checked>
<label class="form-check-label" for="enableLogging">
Enable detailed activity logging
</label>
</div>
</div>
<button type="submit" class="btn btn-primary">Save General Settings</button>
</form>
</div>
</div>
</div>
</div>