modularizes the scraper methods
parent 11f086aa64
commit 8f2375215d
@@ -1,10 +1,11 @@
 """Configuration management blueprint."""
-from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify
+from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify, current_app
 from ..db import db
 # Import the new model
 from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig
 from ..defaults import MAX_VOLUME
 import os  # Import os for path validation
+from scipaperloader.scrapers import __path__ as scrapers_path
 
 bp = Blueprint("config", __name__, url_prefix="/config")
 
@@ -281,6 +282,46 @@ def update_schedule():
     return redirect(url_for("config.schedule"))
 
 
+@bp.route("/update/scraper_module", methods=["POST"])
+def update_scraper_module():
+    """Update the scraper module configuration."""
+    from ..models import ScraperModuleConfig
+
+    new_scraper_module = request.form.get("scraper_module")
+    if not new_scraper_module:
+        flash("Scraper module cannot be empty.", "error")
+        return redirect(url_for("config.general"))
+
+    # Validate that the module exists and is valid
+    from scipaperloader.scrapers.factory import get_available_scrapers
+    available_modules = [m["name"] for m in get_available_scrapers()]
+
+    if new_scraper_module not in available_modules:
+        flash(f"Invalid scraper module: {new_scraper_module}", "error")
+        return redirect(url_for("config.general"))
+
+    # Update the database configuration
+    ScraperModuleConfig.set_module(new_scraper_module)
+    flash(f"Scraper module updated to '{new_scraper_module}'.", "success")
+    return redirect(url_for("config.general"))
+
+
+@bp.context_processor
+def inject_scraper_modules():
+    """Inject available scraper modules into the template context."""
+    from scipaperloader.scrapers.factory import get_available_scrapers
+    from ..models import ScraperModuleConfig
+
+    available_scrapers = get_available_scrapers()
+    current_module = ScraperModuleConfig.get_current_module()
+
+    return {
+        "available_scraper_modules": [s["name"] for s in available_scrapers],
+        "current_scraper_module": current_module,
+        "scraper_details": {s["name"]: s for s in available_scrapers}
+    }
+
+
 @bp.route("/api/schedule/stats")
 def schedule_stats():
     """Get statistics about the current schedule configuration."""
@@ -12,6 +12,7 @@ from ..celery import celery
 from ..defaults import MAX_VOLUME
 from celery.schedules import crontab
 from sqlalchemy import func
+from scipaperloader.scrapers.factory import get_scraper
 
 bp = Blueprint("scraper", __name__, url_prefix="/scraper")
 
@@ -153,7 +154,7 @@ def stop_scraper():
 
     # Stop any running tasks
     task_types_to_revoke = [
-        'scipaperloader.blueprints.scraper.dummy_process_paper',
+        'scipaperloader.blueprints.scraper.process_paper',
         'scipaperloader.blueprints.scraper.dummy_scheduled_scraper',
         'scipaperloader.blueprints.scraper.run_periodic_dummy_scraper'
     ]
@@ -224,7 +225,7 @@ def pause_scraper():
     # Just revoke processing tasks, but leave the periodic tasks running
     # so it can continue to check the state (which is now paused)
     task_types_to_revoke = [
-        'scipaperloader.blueprints.scraper.dummy_process_paper',
+        'scipaperloader.blueprints.scraper.process_paper',
         'scipaperloader.blueprints.scraper.dummy_scheduled_scraper'
     ]
 
@@ -373,70 +374,7 @@ def update_config():
         return jsonify({"success": False, "message": f"Unexpected error: {str(e)}"})
 
 
-@celery.task(bind=True)
-def dummy_scrape_paper(self):
-    """Simulate scraping a single paper."""
-    # Simulate success or failure
-    success = random.random() > 0.3  # 70% success rate
-
-    # Simulate processing time
-    import time
-    time.sleep(random.randint(2, 5))  # 2-5 seconds
-
-    if success:
-        # Create a dummy paper
-        new_paper = PaperMetadata(
-            title=f"Dummy Paper {random.randint(1000, 9999)}",
-            doi=f"10.1234/dummy.{random.randint(1000, 9999)}",
-            journal=random.choice([
-                "Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
-                "Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
-            ]),
-            type="article",
-            language="en",
-            published_online=datetime.now().date(),
-            status="Done",
-            file_path="/path/to/dummy/paper.pdf"
-        )
-
-        db.session.add(new_paper)
-        db.session.commit()
-
-        # Log the successful scrape
-        ActivityLog.log_scraper_activity(
-            action="scrape_paper",
-            paper_id=new_paper.id,
-            status="success",
-            description=f"Successfully scraped paper {new_paper.doi}"
-        )
-
-        return {
-            "success": True,
-            "paper_id": new_paper.id,
-            "title": new_paper.title,
-            "doi": new_paper.doi
-        }
-    else:
-        # Log the failed scrape
-        error_message = random.choice([
-            "Connection timeout",
-            "404 Not Found",
-            "Access denied",
-            "Invalid DOI format",
-            "PDF download failed",
-            "Rate limited by publisher"
-        ])
-
-        ActivityLog.log_scraper_activity(
-            action="scrape_paper",
-            status="error",
-            description=f"Failed to scrape paper: {error_message}"
-        )
-
-        return {
-            "success": False,
-            "error": error_message
-        }
-
-
 @celery.task
@@ -545,11 +483,11 @@ def dummy_scheduled_scraper():
     )
 
     # --- Now schedule processing for the newly selected "Pending" papers ---
-    # (Assuming dummy_process_paper takes a paper_id)
+    # (Using the new modular process_paper task)
     # Add random delays for processing within the hour (e.g., up to 3600 seconds)
     for paper_id in selected_paper_ids:
         delay = random.uniform(1, 3500)  # Random delay up to ~58 minutes
-        dummy_process_paper.apply_async(args=[paper_id], countdown=delay)
+        process_paper.apply_async(args=[paper_id], countdown=delay)
 
     ActivityLog.log_scraper_activity(
         action="schedule_processing",
@@ -568,109 +506,6 @@ def dummy_scheduled_scraper():
         return False
 
 
-@celery.task(bind=True)
-def dummy_process_paper(self, paper_id):
-    """
-    Process a single paper for the dummy scraper.
-
-    Args:
-        paper_id (int): ID of the paper to process
-    """
-    # First check if the scraper is still active and not paused
-    scraper_state = ScraperState.get_current_state()
-    if not scraper_state.is_active or scraper_state.is_paused:
-        # Log that task was skipped due to scraper being stopped or paused
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            status="info",
-            description=f"Skipped processing paper ID {paper_id} because scraper is {'paused' if scraper_state.is_paused else 'stopped'}"
-        )
-        return False
-
-    # Get the paper from database
-    paper = PaperMetadata.query.get(paper_id)
-    if not paper:
-        # Log error if paper not found
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            status="error",
-            description=f"Paper with ID {paper_id} not found"
-        )
-        return False
-
-    # Simulate random success/failure (70% success rate)
-    success = random.random() < 0.7
-
-    # Simulate processing time (1-5 seconds)
-    process_time = random.uniform(1, 5)
-    time.sleep(process_time)
-
-    # Check again if scraper is still active and not paused after the time delay
-    # This ensures we don't process papers if the scraper was stopped during the delay
-    scraper_state = ScraperState.get_current_state()
-    if not scraper_state.is_active or scraper_state.is_paused:
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            status="info",
-            description=f"Cancelled processing paper ID {paper_id} because scraper is {'paused' if scraper_state.is_paused else 'stopped'}"
-        )
-        return False
-
-    if success:
-        # --- Get configured download path ---
-        download_base_path = DownloadPathConfig.get_path()
-        # Ensure the base path exists (optional, but good practice)
-        # os.makedirs(download_base_path, exist_ok=True)
-
-        # --- Construct the file path ---
-        # Sanitize DOI for use in filename
-        safe_doi = paper.doi.replace('/', '_').replace(':', '_')
-        filename = f"{safe_doi}.pdf"
-        full_path = os.path.join(download_base_path, filename)
-
-        # Update paper status to "Done" and set the file path
-        paper.status = "Done"
-        paper.file_path = full_path  # Use the constructed path
-
-        # Log success
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            paper_id=paper.id,
-            status="success",
-            description=f"Successfully processed paper: {paper.doi}. File at: {full_path}"  # Log path
-        )
-    else:
-        # Update paper status to "Failed"
-        paper.status = "Failed"
-
-        # Generate random error message
-        error_message = random.choice([
-            "Publisher website unavailable",
-            "No PDF download link found",
-            "Access restricted",
-            "Download timeout",
-            "Invalid DOI",
-            "Rate limited by publisher"
-        ])
-        paper.error_msg = error_message
-
-        # Log failure
-        ActivityLog.log_scraper_activity(
-            action="process_paper",
-            paper_id=paper.id,
-            status="error",
-            description=f"Failed to process paper: {error_message}"
-        )
-
-    # Update the timestamp
-    paper.updated_at = datetime.utcnow()
-
-    # Commit changes to database
-    db.session.commit()
-
-    return success
-
-
 @celery.task(bind=True)
 def process_paper_batch(self, paper_ids):
     """
@@ -914,3 +749,21 @@ def calculate_papers_for_current_hour():
     )
 
     return papers_this_hour
+
+
+@celery.task(bind=True)
+def process_paper(self, paper_id):
+    """Process a paper using the configured scraper."""
+    from scipaperloader.models import PaperMetadata
+    paper = PaperMetadata.query.get(paper_id)
+    if not paper:
+        return {"status": "error", "message": f"Paper with ID {paper_id} not found"}
+
+    scraper = get_scraper()
+    result = scraper.scrape(paper.doi)
+
+    return {
+        "paper_id": paper_id,
+        "status": result.status,
+        "message": result.message
+    }
@@ -6,3 +6,4 @@ class Config:
     SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///papers.db")
     SQLALCHEMY_TRACK_MODIFICATIONS = False
     APP_TITLE = os.environ.get("APP_TITLE", "SciPaperLoader")
+    SCRAPER_MODULE = os.environ.get("SCRAPER_MODULE", "dummy")
@@ -277,6 +277,40 @@ class ScraperState(db.Model):
         return state.is_active and not state.is_paused
 
 
+class ScraperModuleConfig(db.Model):
+    """Model to store the configured scraper module."""
+    id = db.Column(db.Integer, primary_key=True)
+    module_name = db.Column(db.String(100), default="dummy")
+
+    @classmethod
+    def get_current_module(cls):
+        """Get the currently configured scraper module."""
+        config = cls.query.first()
+        if not config:
+            config = cls(module_name="dummy")
+            db.session.add(config)
+            db.session.commit()
+        return config.module_name
+
+    @classmethod
+    def set_module(cls, module_name):
+        """Set the scraper module."""
+        config = cls.query.first()
+        if not config:
+            config = cls(module_name=module_name)
+            db.session.add(config)
+        else:
+            old_value = config.module_name
+            config.module_name = module_name
+            ActivityLog.log_config_change(
+                config_key="scraper_module",
+                old_value=old_value,
+                new_value=module_name,
+                description="Updated scraper module configuration"
+            )
+        db.session.commit()
+        return config
+
 def init_schedule_config():
     """Initialize ScheduleConfig with default values if empty"""
     if ScheduleConfig.query.count() == 0:
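For quick verification, a minimal sketch of switching the active module by hand; this is not part of the commit and assumes it is run inside `flask shell`, which pushes an application context automatically:

# Sketch only: run inside `flask shell` (app context is pushed automatically).
from scipaperloader.models import ScraperModuleConfig

ScraperModuleConfig.set_module("dummy")          # persists the choice and logs the config change
print(ScraperModuleConfig.get_current_module())  # -> "dummy"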
scipaperloader/scrapers/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
+# This package contains all scraper modules.
+# Each scraper should implement the BaseScraper interface from base.py.
scipaperloader/scrapers/base.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+from abc import ABC, abstractmethod
+from typing import NamedTuple, Optional, Dict
+from datetime import datetime
+
+class ScrapeResult(NamedTuple):
+    status: str                           # "success", "error", "skipped"
+    message: str                          # human-readable status
+    data: Optional[Dict]                  # any extra payload (file_path, metadata, etc.)
+    duration: Optional[float] = None      # processing time in seconds
+    timestamp: Optional[datetime] = None  # when the operation completed
+
+class BaseScraper(ABC):
+    """Base class for all scraper implementations."""
+
+    @abstractmethod
+    def scrape(self, doi: str) -> ScrapeResult:
+        """
+        Fetch metadata and/or download paper for the given DOI.
+
+        Args:
+            doi: The DOI of the paper to scrape
+
+        Returns:
+            ScrapeResult with status, message, and optional data
+        """
+        pass
+
+    def get_name(self) -> str:
+        """Return the name of this scraper."""
+        return self.__class__.__name__
+
+    def get_description(self) -> str:
+        """Return a description of this scraper."""
+        return getattr(self.__class__, "__doc__", "No description available")
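To illustrate the contract above, a hypothetical extra module (not part of this commit) would only need to live in scipaperloader/scrapers/ and expose a Scraper class; the factory added below then discovers it automatically:

# scipaperloader/scrapers/example.py -- illustrative sketch only, not included in this commit.
import time
from datetime import datetime

from .base import BaseScraper, ScrapeResult


class Scraper(BaseScraper):
    """Example scraper that marks every DOI as skipped."""

    def scrape(self, doi: str) -> ScrapeResult:
        start = time.time()
        # A real implementation would fetch metadata and download the PDF here.
        return ScrapeResult(
            status="skipped",
            message=f"Example scraper does not handle {doi}",
            data=None,
            duration=time.time() - start,
            timestamp=datetime.utcnow(),
        )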
scipaperloader/scrapers/dummy.py (new file, 94 lines)
@@ -0,0 +1,94 @@
+import time
+import random
+from datetime import datetime
+from .base import BaseScraper, ScrapeResult
+from flask import current_app
+from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
+from ..db import db
+
+
+class Scraper(BaseScraper):
+    """Dummy scraper for testing purposes that simulates paper downloading."""
+
+    def scrape(self, doi: str) -> ScrapeResult:
+        """Simulate scraping a paper with realistic timing and random success/failure."""
+        start_time = time.time()
+
+        paper = PaperMetadata.query.filter_by(doi=doi).first()
+        if not paper:
+            return ScrapeResult(
+                status="error",
+                message=f"No paper found for DOI {doi}",
+                data=None,
+                duration=time.time() - start_time,
+                timestamp=datetime.utcnow()
+            )
+
+        # Simulate processing time (1-3 seconds)
+        processing_time = random.uniform(1, 3)
+        time.sleep(processing_time)
+
+        # Simulate 80% success rate
+        success = random.random() < 0.8
+
+        if success:
+            # Get download path and simulate file creation
+            download_path = DownloadPathConfig.get_path()
+            file_name = f"{doi.replace('/', '_')}.pdf"
+            file_path = f"{download_path}/{file_name}"
+
+            # Update paper status
+            paper.status = "Done"
+            paper.file_path = file_path
+            paper.error_msg = None
+
+            # Log success
+            ActivityLog.log_scraper_activity(
+                action="dummy_scrape",
+                status="success",
+                description=f"Successfully scraped {doi}",
+                paper_id=paper.id
+            )
+
+            result = ScrapeResult(
+                status="success",
+                message=f"Successfully scraped {doi}",
+                data={
+                    "file_path": file_path,
+                    "title": paper.title,
+                    "journal": paper.journal
+                },
+                duration=time.time() - start_time,
+                timestamp=datetime.utcnow()
+            )
+        else:
+            # Simulate failure
+            error_messages = [
+                "Paper not found in database",
+                "Access denied by publisher",
+                "Rate limit exceeded",
+                "Network timeout",
+                "Invalid DOI format"
+            ]
+            error_msg = random.choice(error_messages)
+
+            paper.status = "Failed"
+            paper.error_msg = error_msg
+
+            # Log failure
+            ActivityLog.log_scraper_activity(
+                action="dummy_scrape",
+                status="error",
+                description=f"Failed to scrape {doi}: {error_msg}",
+                paper_id=paper.id
+            )
+
+            result = ScrapeResult(
+                status="error",
+                message=f"Failed to scrape {doi}: {error_msg}",
+                data={"error_code": "dummy_error"},
+                duration=time.time() - start_time,
+                timestamp=datetime.utcnow()
+            )
+
+        db.session.commit()
+        return result
scipaperloader/scrapers/factory.py (new file, 59 lines)
@@ -0,0 +1,59 @@
+import importlib
+from flask import current_app
+from .base import BaseScraper
+
+
+def get_scraper() -> BaseScraper:
+    """Load the configured scraper module dynamically with error handling."""
+    from ..models import ScraperModuleConfig, ActivityLog
+
+    try:
+        # Get module name from database first, fallback to config
+        name = ScraperModuleConfig.get_current_module()
+        if not name:
+            name = current_app.config.get("SCRAPER_MODULE", "dummy")
+
+        module = importlib.import_module(f"scipaperloader.scrapers.{name}")
+        cls = getattr(module, "Scraper")
+
+        # Validate that it's actually a BaseScraper
+        if not issubclass(cls, BaseScraper):
+            raise TypeError(f"Scraper class in module '{name}' does not inherit from BaseScraper")
+
+        return cls()
+
+    except (ImportError, AttributeError, TypeError) as e:
+        ActivityLog.log_error(
+            error_message=f"Failed to load scraper module '{name}': {str(e)}",
+            source="scraper_factory",
+            severity="error"
+        )
+        # Fallback to dummy scraper
+        from .dummy import Scraper as DummyScraper
+        return DummyScraper()
+
+
+def get_available_scrapers():
+    """Get list of available scraper modules."""
+    import os
+    from scipaperloader.scrapers import __path__ as scrapers_path
+
+    modules = []
+    scrapers_dir = scrapers_path[0]
+
+    for filename in os.listdir(scrapers_dir):
+        if filename.endswith(".py") and filename not in ("__init__.py", "base.py", "factory.py"):
+            module_name = filename[:-3]
+            try:
+                # Try to import and validate the module
+                module = importlib.import_module(f"scipaperloader.scrapers.{module_name}")
+                cls = getattr(module, "Scraper", None)
+                if cls and issubclass(cls, BaseScraper):
+                    modules.append({
+                        "name": module_name,
+                        "class": cls,
+                        "description": getattr(cls, "__doc__", "No description available")
+                    })
+            except (ImportError, AttributeError, TypeError):
+                # Skip invalid modules
+                pass
+
+    return modules
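As a rough usage sketch (not part of the commit), the factory can be exercised from an application context; create_app is assumed here as the app factory name and the DOI is a placeholder:

# Sketch only: listing modules and running the configured scraper.
from scipaperloader import create_app  # assumed factory name
from scipaperloader.scrapers.factory import get_scraper, get_available_scrapers

app = create_app()
with app.app_context():
    # List every module that ships a valid Scraper class.
    for mod in get_available_scrapers():
        print(mod["name"], "-", mod["description"])

    scraper = get_scraper()                         # falls back to the dummy scraper on error
    result = scraper.scrape("10.1234/placeholder")  # placeholder DOI
    print(result.status, result.message)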
@@ -9,52 +9,87 @@
 <!-- include flash messages template -->
 {% include "partials/flash_messages.html.jinja" %}
 
-<form action="{{ url_for('config.update_general') }}" method="post">
-    <div class="form-section">
-        <h6>Scraper Volume</h6>
-        <p class="text-muted">Configure the total number of papers to scrape per day.</p>
-        <div class="mb-3">
-            <label for="totalVolume" class="form-label">Papers per day:</label>
-            <input type="number" class="form-control" id="totalVolume" name="total_volume" min="1"
-                max="{{ max_volume }}" value="{{ volume_config.volume }}" required>
-            <div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
-        </div>
-    </div>
-
-    <div class="form-section">
-        <h6>Download Path</h6>
-        <p class="text-muted">Base directory where scraped paper files will be stored.</p>
-        <div class="mb-3">
-            <label for="downloadPath" class="form-label">Download Directory:</label>
-            <input type="text" class="form-control" id="downloadPath" name="download_path"
-                value="{{ download_path_config.path }}" required>
-            <div class="form-text">Enter the full path to the download directory (e.g., /data/papers).
-                Ensure the directory exists and the application has write permissions.</div>
-        </div>
-    </div>
-
-    <div class="form-section">
-        <h6>System Settings</h6>
-        <p class="text-muted">Configure general system behavior.</p>
-
-        <div class="mb-3 form-check">
-            <input type="checkbox" class="form-check-input" id="enableNotifications" checked>
-            <label class="form-check-label" for="enableNotifications">
-                Enable email notifications
-            </label>
-        </div>
-
-        <div class="mb-3 form-check">
-            <input type="checkbox" class="form-check-input" id="enableLogging" checked>
-            <label class="form-check-label" for="enableLogging">
-                Enable detailed activity logging
-            </label>
-        </div>
-    </div>
-
-    <button type="submit" class="btn btn-primary">Save General Settings</button>
-</form>
+<div class="row">
+    <!-- General Settings Column -->
+    <div class="col-md-6">
+        <form action="{{ url_for('config.update_general') }}" method="post">
+            <div class="form-section">
+                <h6>Scraper Volume</h6>
+                <p class="text-muted">Configure the total number of papers to scrape per day.</p>
+
+                <div class="mb-3">
+                    <label for="totalVolume" class="form-label">Papers per day:</label>
+                    <input type="number" class="form-control" id="totalVolume" name="total_volume"
+                        min="1" max="{{ max_volume }}" value="{{ volume_config.volume }}" required>
+                    <div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
+                </div>
+            </div>
+
+            <div class="form-section">
+                <h6>Download Path</h6>
+                <p class="text-muted">Base directory where scraped paper files will be stored.</p>
+                <div class="mb-3">
+                    <label for="downloadPath" class="form-label">Download Directory:</label>
+                    <input type="text" class="form-control" id="downloadPath" name="download_path"
+                        value="{{ download_path_config.path }}" required>
+                    <div class="form-text">Enter the full path to the download directory (e.g.,
+                        /data/papers).
+                        Ensure the directory exists and the application has write permissions.</div>
+                </div>
+            </div>
+
+            <div class="form-section">
+                <h6>System Settings</h6>
+                <p class="text-muted">Configure general system behavior.</p>
+
+                <div class="mb-3 form-check">
+                    <input type="checkbox" class="form-check-input" id="enableNotifications" checked>
+                    <label class="form-check-label" for="enableNotifications">
+                        Enable email notifications
+                    </label>
+                </div>
+
+                <div class="mb-3 form-check">
+                    <input type="checkbox" class="form-check-input" id="enableLogging" checked>
+                    <label class="form-check-label" for="enableLogging">
+                        Enable detailed activity logging
+                    </label>
+                </div>
+            </div>
+
+            <button type="submit" class="btn btn-primary">Save General Settings</button>
+        </form>
+    </div>
+
+    <!-- Scraper Module Column -->
+    <div class="col-md-6">
+        <form method="post" action="{{ url_for('config.update_scraper_module') }}">
+            <div class="form-section">
+                <h6>Scraper Module</h6>
+                <p class="text-muted">Select which scraper module to use for processing papers.</p>
+
+                <div class="mb-3">
+                    <label for="scraper_module" class="form-label">Active Scraper Module:</label>
+                    <select class="form-control" id="scraper_module" name="scraper_module">
+                        {% for module in available_scraper_modules %}
+                        <option value="{{ module }}" {% if module==current_scraper_module %} selected
+                            {%endif %}>
+                            {{ module }}
+                            {% if scraper_details[module] %}
+                            - {{ scraper_details[module].description[:50] }}...
+                            {% endif %}
+                        </option>
+                        {% endfor %}
+                    </select>
+                    <div class="form-text">
+                        Current module: <strong>{{ current_scraper_module }}</strong>
+                    </div>
+                </div>
+            </div>
+            <button type="submit" class="btn btn-primary">Update Scraper Module</button>
+        </form>
+    </div>
+</div>
 </div>
 </div>
 </div>