From 8f2375215db05f904b00f8c7fbd7d83119a0d5c4 Mon Sep 17 00:00:00 2001 From: Michael Beck Date: Fri, 23 May 2025 14:32:41 +0200 Subject: [PATCH] modularizes the scraper methods --- scipaperloader/blueprints/config.py | 43 +++- scipaperloader/blueprints/scraper.py | 193 +++--------------- scipaperloader/config.py | 1 + scipaperloader/models.py | 34 +++ scipaperloader/scrapers/__init__.py | 2 + scipaperloader/scrapers/base.py | 34 +++ scipaperloader/scrapers/dummy.py | 94 +++++++++ scipaperloader/scrapers/factory.py | 59 ++++++ .../templates/config/general.html.jinja | 119 +++++++---- 9 files changed, 366 insertions(+), 213 deletions(-) create mode 100644 scipaperloader/scrapers/__init__.py create mode 100644 scipaperloader/scrapers/base.py create mode 100644 scipaperloader/scrapers/dummy.py create mode 100644 scipaperloader/scrapers/factory.py diff --git a/scipaperloader/blueprints/config.py b/scipaperloader/blueprints/config.py index f33fa01..1a6aa8c 100644 --- a/scipaperloader/blueprints/config.py +++ b/scipaperloader/blueprints/config.py @@ -1,10 +1,11 @@ """Configuration management blueprint.""" -from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify +from flask import Blueprint, render_template, redirect, url_for, request, flash, jsonify, current_app from ..db import db # Import the new model from ..models import VolumeConfig, ScheduleConfig, ActivityLog, DownloadPathConfig from ..defaults import MAX_VOLUME import os # Import os for path validation +from scipaperloader.scrapers import __path__ as scrapers_path bp = Blueprint("config", __name__, url_prefix="/config") @@ -281,6 +282,46 @@ def update_schedule(): return redirect(url_for("config.schedule")) +@bp.route("/update/scraper_module", methods=["POST"]) +def update_scraper_module(): + """Update the scraper module configuration.""" + from ..models import ScraperModuleConfig + + new_scraper_module = request.form.get("scraper_module") + if not new_scraper_module: + flash("Scraper module cannot be empty.", "error") + return redirect(url_for("config.general")) + + # Validate that the module exists and is valid + from scipaperloader.scrapers.factory import get_available_scrapers + available_modules = [m["name"] for m in get_available_scrapers()] + + if new_scraper_module not in available_modules: + flash(f"Invalid scraper module: {new_scraper_module}", "error") + return redirect(url_for("config.general")) + + # Update the database configuration + ScraperModuleConfig.set_module(new_scraper_module) + flash(f"Scraper module updated to '{new_scraper_module}'.", "success") + return redirect(url_for("config.general")) + + +@bp.context_processor +def inject_scraper_modules(): + """Inject available scraper modules into the template context.""" + from scipaperloader.scrapers.factory import get_available_scrapers + from ..models import ScraperModuleConfig + + available_scrapers = get_available_scrapers() + current_module = ScraperModuleConfig.get_current_module() + + return { + "available_scraper_modules": [s["name"] for s in available_scrapers], + "current_scraper_module": current_module, + "scraper_details": {s["name"]: s for s in available_scrapers} + } + + @bp.route("/api/schedule/stats") def schedule_stats(): """Get statistics about the current schedule configuration.""" diff --git a/scipaperloader/blueprints/scraper.py b/scipaperloader/blueprints/scraper.py index 6ab74cf..7592c01 100644 --- a/scipaperloader/blueprints/scraper.py +++ b/scipaperloader/blueprints/scraper.py @@ -12,6 +12,7 @@ from ..celery import 
celery from ..defaults import MAX_VOLUME from celery.schedules import crontab from sqlalchemy import func +from scipaperloader.scrapers.factory import get_scraper bp = Blueprint("scraper", __name__, url_prefix="/scraper") @@ -153,7 +154,7 @@ def stop_scraper(): # Stop any running tasks task_types_to_revoke = [ - 'scipaperloader.blueprints.scraper.dummy_process_paper', + 'scipaperloader.blueprints.scraper.process_paper', 'scipaperloader.blueprints.scraper.dummy_scheduled_scraper', 'scipaperloader.blueprints.scraper.run_periodic_dummy_scraper' ] @@ -224,7 +225,7 @@ def pause_scraper(): # Just revoke processing tasks, but leave the periodic tasks running # so it can continue to check the state (which is now paused) task_types_to_revoke = [ - 'scipaperloader.blueprints.scraper.dummy_process_paper', + 'scipaperloader.blueprints.scraper.process_paper', 'scipaperloader.blueprints.scraper.dummy_scheduled_scraper' ] @@ -373,70 +374,7 @@ def update_config(): return jsonify({"success": False, "message": f"Unexpected error: {str(e)}"}) -@celery.task(bind=True) -def dummy_scrape_paper(self): - """Simulate scraping a single paper.""" - # Simulate success or failure - success = random.random() > 0.3 # 70% success rate - # Simulate processing time - import time - time.sleep(random.randint(2, 5)) # 2-5 seconds - - if success: - # Create a dummy paper - new_paper = PaperMetadata( - title=f"Dummy Paper {random.randint(1000, 9999)}", - doi=f"10.1234/dummy.{random.randint(1000, 9999)}", - journal=random.choice([ - "Nature", "Science", "PLOS ONE", "Journal of Dummy Research", - "Proceedings of the Dummy Society", "Cell", "Dummy Review Letters" - ]), - type="article", - language="en", - published_online=datetime.now().date(), - status="Done", - file_path="/path/to/dummy/paper.pdf" - ) - - db.session.add(new_paper) - db.session.commit() - - # Log the successful scrape - ActivityLog.log_scraper_activity( - action="scrape_paper", - paper_id=new_paper.id, - status="success", - description=f"Successfully scraped paper {new_paper.doi}" - ) - - return { - "success": True, - "paper_id": new_paper.id, - "title": new_paper.title, - "doi": new_paper.doi - } - else: - # Log the failed scrape - error_message = random.choice([ - "Connection timeout", - "404 Not Found", - "Access denied", - "Invalid DOI format", - "PDF download failed", - "Rate limited by publisher" - ]) - - ActivityLog.log_scraper_activity( - action="scrape_paper", - status="error", - description=f"Failed to scrape paper: {error_message}" - ) - - return { - "success": False, - "error": error_message - } @celery.task @@ -545,11 +483,11 @@ def dummy_scheduled_scraper(): ) # --- Now schedule processing for the newly selected "Pending" papers --- - # (Assuming dummy_process_paper takes a paper_id) + # (Using the new modular process_paper task) # Add random delays for processing within the hour (e.g., up to 3600 seconds) for paper_id in selected_paper_ids: delay = random.uniform(1, 3500) # Random delay up to ~58 minutes - dummy_process_paper.apply_async(args=[paper_id], countdown=delay) + process_paper.apply_async(args=[paper_id], countdown=delay) ActivityLog.log_scraper_activity( action="schedule_processing", @@ -568,109 +506,6 @@ def dummy_scheduled_scraper(): return False -@celery.task(bind=True) -def dummy_process_paper(self, paper_id): - """ - Process a single paper for the dummy scraper. 
- - Args: - paper_id (int): ID of the paper to process - """ - # First check if the scraper is still active and not paused - scraper_state = ScraperState.get_current_state() - if not scraper_state.is_active or scraper_state.is_paused: - # Log that task was skipped due to scraper being stopped or paused - ActivityLog.log_scraper_activity( - action="process_paper", - status="info", - description=f"Skipped processing paper ID {paper_id} because scraper is {'paused' if scraper_state.is_paused else 'stopped'}" - ) - return False - - # Get the paper from database - paper = PaperMetadata.query.get(paper_id) - if not paper: - # Log error if paper not found - ActivityLog.log_scraper_activity( - action="process_paper", - status="error", - description=f"Paper with ID {paper_id} not found" - ) - return False - - # Simulate random success/failure (70% success rate) - success = random.random() < 0.7 - - # Simulate processing time (1-5 seconds) - process_time = random.uniform(1, 5) - time.sleep(process_time) - - # Check again if scraper is still active and not paused after the time delay - # This ensures we don't process papers if the scraper was stopped during the delay - scraper_state = ScraperState.get_current_state() - if not scraper_state.is_active or scraper_state.is_paused: - ActivityLog.log_scraper_activity( - action="process_paper", - status="info", - description=f"Cancelled processing paper ID {paper_id} because scraper is {'paused' if scraper_state.is_paused else 'stopped'}" - ) - return False - - if success: - # --- Get configured download path --- - download_base_path = DownloadPathConfig.get_path() - # Ensure the base path exists (optional, but good practice) - # os.makedirs(download_base_path, exist_ok=True) - - # --- Construct the file path --- - # Sanitize DOI for use in filename - safe_doi = paper.doi.replace('/', '_').replace(':', '_') - filename = f"{safe_doi}.pdf" - full_path = os.path.join(download_base_path, filename) - - # Update paper status to "Done" and set the file path - paper.status = "Done" - paper.file_path = full_path # Use the constructed path - - # Log success - ActivityLog.log_scraper_activity( - action="process_paper", - paper_id=paper.id, - status="success", - description=f"Successfully processed paper: {paper.doi}. 
File at: {full_path}" # Log path - ) - else: - # Update paper status to "Failed" - paper.status = "Failed" - - # Generate random error message - error_message = random.choice([ - "Publisher website unavailable", - "No PDF download link found", - "Access restricted", - "Download timeout", - "Invalid DOI", - "Rate limited by publisher" - ]) - paper.error_msg = error_message - - # Log failure - ActivityLog.log_scraper_activity( - action="process_paper", - paper_id=paper.id, - status="error", - description=f"Failed to process paper: {error_message}" - ) - - # Update the timestamp - paper.updated_at = datetime.utcnow() - - # Commit changes to database - db.session.commit() - - return success - - @celery.task(bind=True) def process_paper_batch(self, paper_ids): """ @@ -914,3 +749,21 @@ def calculate_papers_for_current_hour(): ) return papers_this_hour + + +@celery.task(bind=True) +def process_paper(self, paper_id): + """Process a paper using the configured scraper.""" + from scipaperloader.models import PaperMetadata + paper = PaperMetadata.query.get(paper_id) + if not paper: + return {"status": "error", "message": f"Paper with ID {paper_id} not found"} + + scraper = get_scraper() + result = scraper.scrape(paper.doi) + + return { + "paper_id": paper_id, + "status": result.status, + "message": result.message + } diff --git a/scipaperloader/config.py b/scipaperloader/config.py index a4d59ea..7222ec2 100644 --- a/scipaperloader/config.py +++ b/scipaperloader/config.py @@ -6,3 +6,4 @@ class Config: SQLALCHEMY_DATABASE_URI = os.environ.get("DATABASE_URL", "sqlite:///papers.db") SQLALCHEMY_TRACK_MODIFICATIONS = False APP_TITLE = os.environ.get("APP_TITLE", "SciPaperLoader") + SCRAPER_MODULE = os.environ.get("SCRAPER_MODULE", "dummy") diff --git a/scipaperloader/models.py b/scipaperloader/models.py index 781aa6b..7b3c25d 100644 --- a/scipaperloader/models.py +++ b/scipaperloader/models.py @@ -277,6 +277,40 @@ class ScraperState(db.Model): return state.is_active and not state.is_paused +class ScraperModuleConfig(db.Model): + """Model to store the configured scraper module.""" + id = db.Column(db.Integer, primary_key=True) + module_name = db.Column(db.String(100), default="dummy") + + @classmethod + def get_current_module(cls): + """Get the currently configured scraper module.""" + config = cls.query.first() + if not config: + config = cls(module_name="dummy") + db.session.add(config) + db.session.commit() + return config.module_name + + @classmethod + def set_module(cls, module_name): + """Set the scraper module.""" + config = cls.query.first() + if not config: + config = cls(module_name=module_name) + db.session.add(config) + else: + old_value = config.module_name + config.module_name = module_name + ActivityLog.log_config_change( + config_key="scraper_module", + old_value=old_value, + new_value=module_name, + description="Updated scraper module configuration" + ) + db.session.commit() + return config + def init_schedule_config(): """Initialize ScheduleConfig with default values if empty""" if ScheduleConfig.query.count() == 0: diff --git a/scipaperloader/scrapers/__init__.py b/scipaperloader/scrapers/__init__.py new file mode 100644 index 0000000..0e80737 --- /dev/null +++ b/scipaperloader/scrapers/__init__.py @@ -0,0 +1,2 @@ +# This package contains all scraper modules. +# Each scraper should implement the BaseScraper interface from base.py. 
diff --git a/scipaperloader/scrapers/base.py b/scipaperloader/scrapers/base.py new file mode 100644 index 0000000..5cf443c --- /dev/null +++ b/scipaperloader/scrapers/base.py @@ -0,0 +1,34 @@ +from abc import ABC, abstractmethod +from typing import NamedTuple, Optional, Dict +from datetime import datetime + +class ScrapeResult(NamedTuple): + status: str # "success", "error", "skipped" + message: str # human-readable status + data: Optional[Dict] # any extra payload (file_path, metadata, etc.) + duration: Optional[float] = None # processing time in seconds + timestamp: Optional[datetime] = None # when the operation completed + +class BaseScraper(ABC): + """Base class for all scraper implementations.""" + + @abstractmethod + def scrape(self, doi: str) -> ScrapeResult: + """ + Fetch metadata and/or download paper for the given DOI. + + Args: + doi: The DOI of the paper to scrape + + Returns: + ScrapeResult with status, message, and optional data + """ + pass + + def get_name(self) -> str: + """Return the name of this scraper.""" + return self.__class__.__name__ + + def get_description(self) -> str: + """Return a description of this scraper.""" + return getattr(self.__class__, "__doc__", "No description available") diff --git a/scipaperloader/scrapers/dummy.py b/scipaperloader/scrapers/dummy.py new file mode 100644 index 0000000..df60354 --- /dev/null +++ b/scipaperloader/scrapers/dummy.py @@ -0,0 +1,94 @@ +import time +import random +from datetime import datetime +from .base import BaseScraper, ScrapeResult +from flask import current_app +from ..models import PaperMetadata, ActivityLog, DownloadPathConfig +from ..db import db + +class Scraper(BaseScraper): + """Dummy scraper for testing purposes that simulates paper downloading.""" + + def scrape(self, doi: str) -> ScrapeResult: + """Simulate scraping a paper with realistic timing and random success/failure.""" + start_time = time.time() + + paper = PaperMetadata.query.filter_by(doi=doi).first() + if not paper: + return ScrapeResult( + status="error", + message=f"No paper found for DOI {doi}", + data=None, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + # Simulate processing time (1-3 seconds) + processing_time = random.uniform(1, 3) + time.sleep(processing_time) + + # Simulate 80% success rate + success = random.random() < 0.8 + + if success: + # Get download path and simulate file creation + download_path = DownloadPathConfig.get_path() + file_name = f"{doi.replace('/', '_')}.pdf" + file_path = f"{download_path}/{file_name}" + + # Update paper status + paper.status = "Done" + paper.file_path = file_path + paper.error_msg = None + + # Log success + ActivityLog.log_scraper_activity( + action="dummy_scrape", + status="success", + description=f"Successfully scraped {doi}", + paper_id=paper.id + ) + + result = ScrapeResult( + status="success", + message=f"Successfully scraped {doi}", + data={ + "file_path": file_path, + "title": paper.title, + "journal": paper.journal + }, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + else: + # Simulate failure + error_messages = [ + "Paper not found in database", + "Access denied by publisher", + "Rate limit exceeded", + "Network timeout", + "Invalid DOI format" + ] + error_msg = random.choice(error_messages) + + paper.status = "Failed" + paper.error_msg = error_msg + + # Log failure + ActivityLog.log_scraper_activity( + action="dummy_scrape", + status="error", + description=f"Failed to scrape {doi}: {error_msg}", + paper_id=paper.id + ) + + result = 
ScrapeResult( + status="error", + message=f"Failed to scrape {doi}: {error_msg}", + data={"error_code": "dummy_error"}, + duration=time.time() - start_time, + timestamp=datetime.utcnow() + ) + + db.session.commit() + return result diff --git a/scipaperloader/scrapers/factory.py b/scipaperloader/scrapers/factory.py new file mode 100644 index 0000000..080612f --- /dev/null +++ b/scipaperloader/scrapers/factory.py @@ -0,0 +1,59 @@ +import importlib +from flask import current_app +from .base import BaseScraper + +def get_scraper() -> BaseScraper: + """Load the configured scraper module dynamically with error handling.""" + from ..models import ScraperModuleConfig, ActivityLog + + try: + # Get module name from database first, fallback to config + name = ScraperModuleConfig.get_current_module() + if not name: + name = current_app.config.get("SCRAPER_MODULE", "dummy") + + module = importlib.import_module(f"scipaperloader.scrapers.{name}") + cls = getattr(module, "Scraper") + + # Validate that it's actually a BaseScraper + if not issubclass(cls, BaseScraper): + raise TypeError(f"Scraper class in module '{name}' does not inherit from BaseScraper") + + return cls() + + except (ImportError, AttributeError, TypeError) as e: + ActivityLog.log_error( + error_message=f"Failed to load scraper module '{name}': {str(e)}", + source="scraper_factory", + severity="error" + ) + # Fallback to dummy scraper + from .dummy import Scraper as DummyScraper + return DummyScraper() + +def get_available_scrapers(): + """Get list of available scraper modules.""" + import os + from scipaperloader.scrapers import __path__ as scrapers_path + + modules = [] + scrapers_dir = scrapers_path[0] + + for filename in os.listdir(scrapers_dir): + if filename.endswith(".py") and filename not in ("__init__.py", "base.py", "factory.py"): + module_name = filename[:-3] + try: + # Try to import and validate the module + module = importlib.import_module(f"scipaperloader.scrapers.{module_name}") + cls = getattr(module, "Scraper", None) + if cls and issubclass(cls, BaseScraper): + modules.append({ + "name": module_name, + "class": cls, + "description": getattr(cls, "__doc__", "No description available") + }) + except (ImportError, AttributeError, TypeError): + # Skip invalid modules + pass + + return modules diff --git a/scipaperloader/templates/config/general.html.jinja b/scipaperloader/templates/config/general.html.jinja index e68f4d1..a474561 100644 --- a/scipaperloader/templates/config/general.html.jinja +++ b/scipaperloader/templates/config/general.html.jinja @@ -9,52 +9,87 @@ {% include "partials/flash_messages.html.jinja" %} -
 [The body of this hunk — the reworked config/general.html.jinja markup — did not
  survive extraction; only the diff markers and the visible text remain. The
  recoverable content is:

  Removed: the old single-column cards —
    "Scraper Volume" ("Configure the total number of papers to scrape per day.",
    with the help text "Enter a value between 1 and {{ max_volume }}"),
    "Download Path" ("Base directory where scraped paper files will be stored.",
    with the help text "Enter the full path to the download directory (e.g.,
    /data/papers). Ensure the directory exists and the application has write
    permissions."),
    and "System Settings" ("Configure general system behavior.").

  Added: the same three cards in a reorganized layout, plus a new "Scraper Module"
    card ("Select which scraper module to use for processing papers.") containing a
    module selection control and the note "Current module: {{ current_scraper_module }}".]
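
Usage sketch — how a new scraper module would plug into the interface introduced above. This is illustrative and not part of the patch; the module name "example.py" and its no-op behavior are assumptions. The factory only requires that a file in scipaperloader/scrapers/ expose a class named Scraper that subclasses BaseScraper:

    # scipaperloader/scrapers/example.py (hypothetical module, not in this patch)
    import time
    from datetime import datetime

    from .base import BaseScraper, ScrapeResult
    from ..db import db
    from ..models import PaperMetadata


    class Scraper(BaseScraper):
        """Example scraper that marks papers as done without downloading anything."""

        def scrape(self, doi: str) -> ScrapeResult:
            start = time.time()
            paper = PaperMetadata.query.filter_by(doi=doi).first()
            if not paper:
                return ScrapeResult(
                    status="error",
                    message=f"No paper found for DOI {doi}",
                    data=None,
                    duration=time.time() - start,
                    timestamp=datetime.utcnow(),
                )

            # A real module would fetch metadata and the PDF here;
            # this sketch only flips the status.
            paper.status = "Done"
            db.session.commit()

            return ScrapeResult(
                status="success",
                message=f"Marked {doi} as done",
                data={"file_path": None},
                duration=time.time() - start,
                timestamp=datetime.utcnow(),
            )

Because get_available_scrapers() scans the package directory and skips anything that does not subclass BaseScraper, dropping a file like this into scipaperloader/scrapers/ is enough for it to appear in the general settings dropdown.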
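
Configuration sketch — selecting a module and obtaining a scraper through the factory. This assumes an active Flask application context and an initialized database; the DOI value is a placeholder:

    from scipaperloader.models import ScraperModuleConfig
    from scipaperloader.scrapers.factory import get_available_scrapers, get_scraper

    # List everything the factory discovered in scipaperloader/scrapers/.
    for entry in get_available_scrapers():
        print(entry["name"], "-", entry["description"])

    # Persist the choice; get_scraper() falls back to SCRAPER_MODULE ("dummy") if unset.
    ScraperModuleConfig.set_module("dummy")

    # The factory returns an instance of the configured Scraper class.
    scraper = get_scraper()
    result = scraper.scrape("10.1234/dummy.0001")
    print(result.status, result.message)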
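
Task dispatch sketch — queuing the new Celery entry point the same way dummy_scheduled_scraper does after this change. The paper id and countdown are assumptions:

    from scipaperloader.blueprints.scraper import process_paper

    # Schedule one paper for processing in 60 seconds via the configured scraper module.
    process_paper.apply_async(args=[42], countdown=60)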