modularizes the scraper methods

Michael Beck 2025-05-23 14:32:41 +02:00
parent 11f086aa64
commit 8f2375215d
9 changed files with 366 additions and 213 deletions

View File

@@ -1,10 +1,11 @@
"""Configuration management blueprint."""
from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify
from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify, current_app
from ..db import db
# Import the new model
from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig
from ..defaults import MAX_VOLUME
import os # Import os for path validation
from scipaperloader.scrapers import __path__ as scrapers_path
bp = Blueprint("config", __name__, url_prefix="/config")
@@ -281,6 +282,46 @@ def update_schedule():
return redirect(url_for("config.schedule"))
@bp.route("/update/scraper_module", methods=["POST"])
def update_scraper_module():
"""Update the scraper module configuration."""
from ..models import ScraperModuleConfig
new_scraper_module = request.form.get("scraper_module")
if not new_scraper_module:
flash("Scraper module cannot be empty.", "error")
return redirect(url_for("config.general"))
# Validate that the module exists and is valid
from scipaperloader.scrapers.factory import get_available_scrapers
available_modules = [m["name"] for m in get_available_scrapers()]
if new_scraper_module not in available_modules:
flash(f"Invalid scraper module: {new_scraper_module}", "error")
return redirect(url_for("config.general"))
# Update the database configuration
ScraperModuleConfig.set_module(new_scraper_module)
flash(f"Scraper module updated to '{new_scraper_module}'.", "success")
return redirect(url_for("config.general"))
@bp.context_processor
def inject_scraper_modules():
"""Inject available scraper modules into the template context."""
from scipaperloader.scrapers.factory import get_available_scrapers
from ..models import ScraperModuleConfig
available_scrapers = get_available_scrapers()
current_module = ScraperModuleConfig.get_current_module()
return {
"available_scraper_modules": [s["name"] for s in available_scrapers],
"current_scraper_module": current_module,
"scraper_details": {s["name"]: s for s in available_scrapers}
}
@bp.route("/api/schedule/stats")
def schedule_stats():
"""Get statistics about the current schedule configuration."""

View File

@@ -12,6 +12,7 @@ from ..celery import celery
from ..defaults import MAX_VOLUME
from celery.schedules import crontab
from sqlalchemy import func
from scipaperloader.scrapers.factory import get_scraper
bp = Blueprint("scraper", __name__, url_prefix="/scraper")
@@ -153,7 +154,7 @@ def stop_scraper():
# Stop any running tasks
task_types_to_revoke = [
'scipaperloader.blueprints.scraper.dummy_process_paper',
'scipaperloader.blueprints.scraper.process_paper',
'scipaperloader.blueprints.scraper.dummy_scheduled_scraper',
'scipaperloader.blueprints.scraper.run_periodic_dummy_scraper'
]
@@ -224,7 +225,7 @@ def pause_scraper():
# Just revoke processing tasks, but leave the periodic tasks running
# so it can continue to check the state (which is now paused)
task_types_to_revoke = [
'scipaperloader.blueprints.scraper.dummy_process_paper',
'scipaperloader.blueprints.scraper.process_paper',
'scipaperloader.blueprints.scraper.dummy_scheduled_scraper'
]
@@ -373,70 +374,7 @@ def update_config():
return jsonify({"success": False, "message": f"Unexpected error: {str(e)}"})
@celery.task(bind=True)
def dummy_scrape_paper(self):
"""Simulate scraping a single paper."""
# Simulate success or failure
success = random.random() > 0.3 # 70% success rate
# Simulate processing time
import time
time.sleep(random.randint(2, 5)) # 2-5 seconds
if success:
# Create a dummy paper
new_paper = PaperMetadata(
title=f"Dummy Paper {random.randint(1000, 9999)}",
doi=f"10.1234/dummy.{random.randint(1000, 9999)}",
journal=random.choice([
"Nature", "Science", "PLOS ONE", "Journal of Dummy Research",
"Proceedings of the Dummy Society", "Cell", "Dummy Review Letters"
]),
type="article",
language="en",
published_online=datetime.now().date(),
status="Done",
file_path="/path/to/dummy/paper.pdf"
)
db.session.add(new_paper)
db.session.commit()
# Log the successful scrape
ActivityLog.log_scraper_activity(
action="scrape_paper",
paper_id=new_paper.id,
status="success",
description=f"Successfully scraped paper {new_paper.doi}"
)
return {
"success": True,
"paper_id": new_paper.id,
"title": new_paper.title,
"doi": new_paper.doi
}
else:
# Log the failed scrape
error_message = random.choice([
"Connection timeout",
"404 Not Found",
"Access denied",
"Invalid DOI format",
"PDF download failed",
"Rate limited by publisher"
])
ActivityLog.log_scraper_activity(
action="scrape_paper",
status="error",
description=f"Failed to scrape paper: {error_message}"
)
return {
"success": False,
"error": error_message
}
@celery.task
@@ -545,11 +483,11 @@ def dummy_scheduled_scraper():
)
# --- Now schedule processing for the newly selected "Pending" papers ---
# (Assuming dummy_process_paper takes a paper_id)
# (Using the new modular process_paper task)
# Add random delays for processing within the hour (e.g., up to 3600 seconds)
for paper_id in selected_paper_ids:
delay = random.uniform(1, 3500) # Random delay up to ~58 minutes
dummy_process_paper.apply_async(args=[paper_id], countdown=delay)
process_paper.apply_async(args=[paper_id], countdown=delay)
ActivityLog.log_scraper_activity(
action="schedule_processing",
@@ -568,109 +506,6 @@ def dummy_scheduled_scraper():
return False
@celery.task(bind=True)
def dummy_process_paper(self, paper_id):
"""
Process a single paper for the dummy scraper.
Args:
paper_id (int): ID of the paper to process
"""
# First check if the scraper is still active and not paused
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active or scraper_state.is_paused:
# Log that task was skipped due to scraper being stopped or paused
ActivityLog.log_scraper_activity(
action="process_paper",
status="info",
description=f"Skipped processing paper ID {paper_id} because scraper is {'paused' if scraper_state.is_paused else 'stopped'}"
)
return False
# Get the paper from database
paper = PaperMetadata.query.get(paper_id)
if not paper:
# Log error if paper not found
ActivityLog.log_scraper_activity(
action="process_paper",
status="error",
description=f"Paper with ID {paper_id} not found"
)
return False
# Simulate random success/failure (70% success rate)
success = random.random() < 0.7
# Simulate processing time (1-5 seconds)
process_time = random.uniform(1, 5)
time.sleep(process_time)
# Check again if scraper is still active and not paused after the time delay
# This ensures we don't process papers if the scraper was stopped during the delay
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active or scraper_state.is_paused:
ActivityLog.log_scraper_activity(
action="process_paper",
status="info",
description=f"Cancelled processing paper ID {paper_id} because scraper is {'paused' if scraper_state.is_paused else 'stopped'}"
)
return False
if success:
# --- Get configured download path ---
download_base_path = DownloadPathConfig.get_path()
# Ensure the base path exists (optional, but good practice)
# os.makedirs(download_base_path, exist_ok=True)
# --- Construct the file path ---
# Sanitize DOI for use in filename
safe_doi = paper.doi.replace('/', '_').replace(':', '_')
filename = f"{safe_doi}.pdf"
full_path = os.path.join(download_base_path, filename)
# Update paper status to "Done" and set the file path
paper.status = "Done"
paper.file_path = full_path # Use the constructed path
# Log success
ActivityLog.log_scraper_activity(
action="process_paper",
paper_id=paper.id,
status="success",
description=f"Successfully processed paper: {paper.doi}. File at: {full_path}" # Log path
)
else:
# Update paper status to "Failed"
paper.status = "Failed"
# Generate random error message
error_message = random.choice([
"Publisher website unavailable",
"No PDF download link found",
"Access restricted",
"Download timeout",
"Invalid DOI",
"Rate limited by publisher"
])
paper.error_msg = error_message
# Log failure
ActivityLog.log_scraper_activity(
action="process_paper",
paper_id=paper.id,
status="error",
description=f"Failed to process paper: {error_message}"
)
# Update the timestamp
paper.updated_at = datetime.utcnow()
# Commit changes to database
db.session.commit()
return success
@celery.task(bind=True)
def process_paper_batch(self, paper_ids):
"""
@@ -914,3 +749,21 @@ def calculate_papers_for_current_hour():
)
return papers_this_hour
@celery.task(bind=True)
def process_paper(self, paper_id):
"""Process a paper using the configured scraper."""
from scipaperloader.models import PaperMetadata
paper = PaperMetadata.query.get(paper_id)
if not paper:
return {"status": "error", "message": f"Paper with ID {paper_id} not found"}
scraper = get_scraper()
result = scraper.scrape(paper.doi)
return {
"paper_id": paper_id,
"status": result.status,
"message": result.message
}
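A sketch of how the new modular task might be dispatched or run eagerly in a test; the paper variable, the 30-second countdown, and the eager call are illustrative:

from scipaperloader.blueprints.scraper import process_paper

# Queue a paper for the configured scraper with a short delay.
process_paper.apply_async(args=[paper.id], countdown=30)

# Or execute it synchronously without a worker, e.g. in a test:
result = process_paper.apply(args=[paper.id]).get()
print(result["status"], result["message"])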

View File

@@ -6,3 +6,4 @@ class Config:
SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///papers.db")
SQLALCHEMY_TRACK_MODIFICATIONS = False
APP_TITLE = os.environ.get("APP_TITLE", "SciPaperLoader")
SCRAPER_MODULE = os.environ.get("SCRAPER_MODULE", "dummy")
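The environment variable only provides the initial default; the factory consults the database-backed ScraperModuleConfig first and falls back to this setting. An illustrative snippet:

import os

# Hypothetical: set before the Flask app is created, used only until a
# ScraperModuleConfig row exists in the database.
os.environ["SCRAPER_MODULE"] = "dummy"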

View File

@@ -277,6 +277,40 @@ class ScraperState(db.Model):
return state.is_active and not state.is_paused
class ScraperModuleConfig(db.Model):
"""Model to store the configured scraper module."""
id = db.Column(db.Integer, primary_key=True)
module_name = db.Column(db.String(100), default="dummy")
@classmethod
def get_current_module(cls):
"""Get the currently configured scraper module."""
config = cls.query.first()
if not config:
config = cls(module_name="dummy")
db.session.add(config)
db.session.commit()
return config.module_name
@classmethod
def set_module(cls, module_name):
"""Set the scraper module."""
config = cls.query.first()
if not config:
config = cls(module_name=module_name)
db.session.add(config)
else:
old_value = config.module_name
config.module_name = module_name
ActivityLog.log_config_change(
config_key="scraper_module",
old_value=old_value,
new_value=module_name,
description="Updated scraper module configuration"
)
db.session.commit()
return config
def init_schedule_config():
"""Initialize ScheduleConfig with default values if empty"""
if ScheduleConfig.query.count() == 0:
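Typical use of the new model from a shell or task, assuming an active application context:

from scipaperloader.models import ScraperModuleConfig

# Persist a new module choice; updates to an existing row are also recorded
# via ActivityLog.log_config_change.
ScraperModuleConfig.set_module("dummy")
print(ScraperModuleConfig.get_current_module())  # -> "dummy"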

View File

@@ -0,0 +1,2 @@
# This package contains all scraper modules.
# Each scraper should implement the BaseScraper interface from base.py.

View File

@@ -0,0 +1,34 @@
from abc import ABC, abstractmethod
from typing import NamedTuple, Optional, Dict
from datetime import datetime
class ScrapeResult(NamedTuple):
status: str # "success", "error", "skipped"
message: str # human-readable status
data: Optional[Dict] # any extra payload (file_path, metadata, etc.)
duration: Optional[float] = None # processing time in seconds
timestamp: Optional[datetime] = None # when the operation completed
class BaseScraper(ABC):
"""Base class for all scraper implementations."""
@abstractmethod
def scrape(self, doi: str) -> ScrapeResult:
"""
Fetch metadata and/or download paper for the given DOI.
Args:
doi: The DOI of the paper to scrape
Returns:
ScrapeResult with status, message, and optional data
"""
pass
def get_name(self) -> str:
"""Return the name of this scraper."""
return self.__class__.__name__
def get_description(self) -> str:
"""Return a description of this scraper."""
return self.__class__.__doc__ or "No description available"
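A minimal sketch of an additional scraper module written against this interface. The module name and fetch logic are hypothetical; the class must be named Scraper because the factory looks it up by that name:

# scipaperloader/scrapers/example.py (hypothetical module)
import time
from datetime import datetime

from .base import BaseScraper, ScrapeResult

class Scraper(BaseScraper):
    """Example scraper that records where a downloaded PDF would be stored."""

    def scrape(self, doi: str) -> ScrapeResult:
        start = time.time()
        # A real implementation would resolve the DOI and download the file here.
        return ScrapeResult(
            status="success",
            message=f"Fetched {doi}",
            data={"file_path": f"/tmp/{doi.replace('/', '_')}.pdf"},
            duration=time.time() - start,
            timestamp=datetime.utcnow(),
        )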

View File

@@ -0,0 +1,94 @@
import time
import random
from datetime import datetime
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
class Scraper(BaseScraper):
"""Dummy scraper for testing purposes that simulates paper downloading."""
def scrape(self, doi: str) -> ScrapeResult:
"""Simulate scraping a paper with realistic timing and random success/failure."""
start_time = time.time()
paper = PaperMetadata.query.filter_by(doi=doi).first()
if not paper:
return ScrapeResult(
status="error",
message=f"No paper found for DOI {doi}",
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Simulate processing time (1-3 seconds)
processing_time = random.uniform(1, 3)
time.sleep(processing_time)
# Simulate 80% success rate
success = random.random() < 0.8
if success:
# Get download path and simulate file creation
download_path = DownloadPathConfig.get_path()
file_name = f"{doi.replace('/', '_')}.pdf"
file_path = f"{download_path}/{file_name}"
# Update paper status
paper.status = "Done"
paper.file_path = file_path
paper.error_msg = None
# Log success
ActivityLog.log_scraper_activity(
action="dummy_scrape",
status="success",
description=f"Successfully scraped {doi}",
paper_id=paper.id
)
result = ScrapeResult(
status="success",
message=f"Successfully scraped {doi}",
data={
"file_path": file_path,
"title": paper.title,
"journal": paper.journal
},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
else:
# Simulate failure
error_messages = [
"Paper not found in database",
"Access denied by publisher",
"Rate limit exceeded",
"Network timeout",
"Invalid DOI format"
]
error_msg = random.choice(error_messages)
paper.status = "Failed"
paper.error_msg = error_msg
# Log failure
ActivityLog.log_scraper_activity(
action="dummy_scrape",
status="error",
description=f"Failed to scrape {doi}: {error_msg}",
paper_id=paper.id
)
result = ScrapeResult(
status="error",
message=f"Failed to scrape {doi}: {error_msg}",
data={"error_code": "dummy_error"},
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
db.session.commit()
return result

View File

@@ -0,0 +1,59 @@
import importlib
from flask import current_app
from .base import BaseScraper
def get_scraper() -> BaseScraper:
"""Load the configured scraper module dynamically with error handling."""
from ..models import ScraperModuleConfig, ActivityLog
try:
# Get module name from database first, fallback to config
name = ScraperModuleConfig.get_current_module()
if not name:
name = current_app.config.get("SCRAPER_MODULE", "dummy")
module = importlib.import_module(f"scipaperloader.scrapers.{name}")
cls = getattr(module, "Scraper")
# Validate that it's actually a BaseScraper
if not issubclass(cls, BaseScraper):
raise TypeError(f"Scraper class in module '{name}' does not inherit from BaseScraper")
return cls()
except (ImportError, AttributeError, TypeError) as e:
ActivityLog.log_error(
error_message=f"Failed to load scraper module '{name}': {str(e)}",
source="scraper_factory",
severity="error"
)
# Fallback to dummy scraper
from .dummy import Scraper as DummyScraper
return DummyScraper()
def get_available_scrapers():
"""Get list of available scraper modules."""
import os
from scipaperloader.scrapers import __path__ as scrapers_path
modules = []
scrapers_dir = scrapers_path[0]
for filename in os.listdir(scrapers_dir):
if filename.endswith(".py") and filename not in ("__init__.py", "base.py", "factory.py"):
module_name = filename[:-3]
try:
# Try to import and validate the module
module = importlib.import_module(f"scipaperloader.scrapers.{module_name}")
cls = getattr(module, "Scraper", None)
if cls and issubclass(cls, BaseScraper):
modules.append({
"name": module_name,
"class": cls,
"description": getattr(cls, "__doc__", "No description available")
})
except (ImportError, AttributeError, TypeError):
# Skip invalid modules
pass
return modules
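Typical use of the factory, assuming an active application context; the DOI and the printed values are illustrative:

from scipaperloader.scrapers.factory import get_scraper, get_available_scrapers

scraper = get_scraper()  # loads the configured module, falls back to the dummy scraper on error
result = scraper.scrape("10.1234/example.doi")  # hypothetical DOI
print(result.status, result.message)

# Enumerate the scraper modules discovered in scipaperloader/scrapers/:
print([m["name"] for m in get_available_scrapers()])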

View File

@@ -9,52 +9,87 @@
<!-- include flash messages template -->
{% include "partials/flash_messages.html.jinja" %}
<form action="{{ url_for('config.update_general') }}" method="post">
<div class="form-section">
<h6>Scraper Volume</h6>
<p class="text-muted">Configure the total number of papers to scrape per day.</p>
<div class="row">
<!-- General Settings Column -->
<div class="col-md-6">
<form action="{{ url_for('config.update_general') }}" method="post">
<div class="form-section">
<h6>Scraper Volume</h6>
<p class="text-muted">Configure the total number of papers to scrape per day.</p>
<div class="mb-3">
<label for="totalVolume" class="form-label">Papers per day:</label>
<input type="number" class="form-control" id="totalVolume" name="total_volume" min="1"
max="{{ max_volume }}" value="{{ volume_config.volume }}" required>
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
</div>
<div class="mb-3">
<label for="totalVolume" class="form-label">Papers per day:</label>
<input type="number" class="form-control" id="totalVolume" name="total_volume"
min="1" max="{{ max_volume }}" value="{{ volume_config.volume }}" required>
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
</div>
</div>
<div class="form-section">
<h6>Download Path</h6>
<p class="text-muted">Base directory where scraped paper files will be stored.</p>
<div class="mb-3">
<label for="downloadPath" class="form-label">Download Directory:</label>
<input type="text" class="form-control" id="downloadPath" name="download_path"
value="{{ download_path_config.path }}" required>
<div class="form-text">Enter the full path to the download directory (e.g.,
/data/papers).
Ensure the directory exists and the application has write permissions.</div>
</div>
</div>
<div class="form-section">
<h6>System Settings</h6>
<p class="text-muted">Configure general system behavior.</p>
<div class="mb-3 form-check">
<input type="checkbox" class="form-check-input" id="enableNotifications" checked>
<label class="form-check-label" for="enableNotifications">
Enable email notifications
</label>
</div>
<div class="mb-3 form-check">
<input type="checkbox" class="form-check-input" id="enableLogging" checked>
<label class="form-check-label" for="enableLogging">
Enable detailed activity logging
</label>
</div>
</div>
<button type="submit" class="btn btn-primary">Save General Settings</button>
</form>
</div>
<div class="form-section">
<h6>Download Path</h6>
<p class="text-muted">Base directory where scraped paper files will be stored.</p>
<div class="mb-3">
<label for="downloadPath" class="form-label">Download Directory:</label>
<input type="text" class="form-control" id="downloadPath" name="download_path"
value="{{ download_path_config.path }}" required>
<div class="form-text">Enter the full path to the download directory (e.g., /data/papers).
Ensure the directory exists and the application has write permissions.</div>
</div>
<!-- Scraper Module Column -->
<div class="col-md-6">
<form method="post" action="{{ url_for('config.update_scraper_module') }}">
<div class="form-section">
<h6>Scraper Module</h6>
<p class="text-muted">Select which scraper module to use for processing papers.</p>
<div class="mb-3">
<label for="scraper_module" class="form-label">Active Scraper Module:</label>
<select class="form-control" id="scraper_module" name="scraper_module">
{% for module in available_scraper_modules %}
<option value="{{ module }}" {% if module==current_scraper_module %} selected
{%endif %}>
{{ module }}
{% if scraper_details[module] %}
- {{ scraper_details[module].description[:50] }}...
{% endif %}
</option>
{% endfor %}
</select>
<div class="form-text">
Current module: <strong>{{ current_scraper_module }}</strong>
</div>
</div>
</div>
<button type="submit" class="btn btn-primary">Update Scraper Module</button>
</form>
</div>
<div class="form-section">
<h6>System Settings</h6>
<p class="text-muted">Configure general system behavior.</p>
<div class="mb-3 form-check">
<input type="checkbox" class="form-check-input" id="enableNotifications" checked>
<label class="form-check-label" for="enableNotifications">
Enable email notifications
</label>
</div>
<div class="mb-3 form-check">
<input type="checkbox" class="form-check-input" id="enableLogging" checked>
<label class="form-check-label" for="enableLogging">
Enable detailed activity logging
</label>
</div>
</div>
<button type="submit" class="btn btn-primary">Save General Settings</button>
</form>
</div>
</div>
</div>
</div>