refines modular scraping system. adds another dummy scraper

This commit is contained in:
Michael Beck 2025-05-26 16:13:42 +02:00
parent ac348696b5
commit 1e97a9cc7b
14 changed files with 1801 additions and 981 deletions

View File

@ -1,9 +1,11 @@
from scipaperloader.celery import celery, configure_celery
# Import all task modules to ensure they are registered with Celery
import scipaperloader.scrapers.tasks # Import new scheduler tasks
import scipaperloader.blueprints.scraper # Import the scraper module with our tasks
# Configure celery with Flask app
configure_celery()
if __name__ == '__main__':
celery.start()
# Start the Celery worker
celery.start(['worker', '--loglevel=info', '--concurrency=2'])
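The worker entry point above only consumes tasks; for the hourly beat_schedule configured further down to fire, a separate beat process also has to run. One possible companion entry point, sketched here as an assumption rather than part of this commit, mirrors the same pattern:

# Hypothetical beat entry point (assumption, not in this commit)
from scipaperloader.celery import celery, configure_celery

configure_celery()

if __name__ == '__main__':
    celery.start(['beat', '--loglevel=info'])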

View File

@ -9,7 +9,7 @@ bp = Blueprint("api", __name__, url_prefix="/api")
def get_activity_logs():
"""Get activity logs with filtering options."""
# Get query parameters
category = request.args.get("category")
categories = request.args.getlist("category") # Changed to getlist for multiple values
action = request.args.get("action")
after = request.args.get("after")
limit = request.args.get("limit", 20, type=int)
@ -17,8 +17,9 @@ def get_activity_logs():
# Build query
query = ActivityLog.query
if category:
query = query.filter(ActivityLog.category == category)
if categories:
# Filter by multiple categories using in_() for SQL IN clause
query = query.filter(ActivityLog.category.in_(categories))
if action:
query = query.filter(ActivityLog.action == action)
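With getlist() and in_(), the endpoint now accepts the category parameter multiple times and combines the values into a single SQL IN clause. A minimal sketch of exercising the filter (host and port are assumptions; the dashboard JavaScript later in this commit calls the same endpoint):

# Example (not part of the commit): query logs across two categories at once
import requests

resp = requests.get(
    "http://localhost:5000/api/activity_logs",
    params=[("category", "scraper_activity"), ("category", "scraper_command"), ("limit", 20)],
)
print(resp.json())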

View File

@ -34,21 +34,8 @@ def _update_volume(new_volume):
if new_volume <= 0 or new_volume > MAX_VOLUME:
return False, f"Volume must be between 1 and {MAX_VOLUME}", None
volume_config = VolumeConfig.query.first()
if not volume_config:
volume_config = VolumeConfig(volume=new_volume)
db.session.add(volume_config)
else:
old_value = volume_config.volume
volume_config.volume = new_volume
ActivityLog.log_config_change(
config_key="scraper_volume",
old_value=old_value,
new_value=new_volume,
description="Updated scraper volume"
)
db.session.commit()
# Use the new class method to set the volume
volume_config = VolumeConfig.set_volume(new_volume)
# Invalidate and recalculate the hourly quota cache
try:

File diff suppressed because it is too large.

View File

@ -32,8 +32,8 @@ def configure_celery(app=None):
task_reject_on_worker_lost=True, # Requeue tasks if worker dies
# Configure Beat schedule for periodic tasks
beat_schedule={
'scheduled-scraper-hourly': {
'task': 'scipaperloader.blueprints.scraper.dummy_scheduled_scraper',
'hourly-scraper-scheduler': {
'task': 'scipaperloader.scrapers.tasks.hourly_scraper_scheduler',
'schedule': crontab(minute=0), # Run at the start of every hour
'options': {'expires': 3600}
},

View File

@ -91,12 +91,13 @@ class ActivityLog(db.Model):
return log
@classmethod
def log_scraper_command(cls, action, status=None, user_id=None, **extra):
def log_scraper_command(cls, action, status=None, description=None, user_id=None, **extra):
"""Log a scraper command (start/stop/pause)."""
log = cls(
category=ActivityCategory.SCRAPER_COMMAND.value,
action=action,
status=status,
description=description,
user_id=user_id
)
log.set_extra_data(extra)
@ -191,6 +192,7 @@ class PaperMetadata(db.Model):
language = db.Column(db.String(50))
published_online = db.Column(db.Date) # or DateTime/String
status = db.Column(db.String(10)) # 'Pending','Done','Failed'
previous_status = db.Column(db.String(10), nullable=True) # Store previous status for reversion
file_path = db.Column(db.Text)
error_msg = db.Column(db.Text)
created_at = db.Column(db.DateTime, default=db.func.current_timestamp())
@ -209,6 +211,35 @@ class ScheduleConfig(db.Model):
class VolumeConfig(db.Model):
id = db.Column(db.Integer, primary_key=True)
volume = db.Column(db.Float) # volume of papers to scrape per day
@classmethod
def get_current_volume(cls):
"""Get the current volume configuration, creating default if needed."""
config = cls.query.first()
if not config:
config = cls(volume=100)
db.session.add(config)
db.session.commit()
return config.volume
@classmethod
def set_volume(cls, new_volume):
"""Set the volume configuration."""
config = cls.query.first()
if not config:
config = cls(volume=new_volume)
db.session.add(config)
else:
old_value = config.volume
config.volume = new_volume
ActivityLog.log_config_change(
config_key="scraper_volume",
old_value=old_value,
new_value=new_volume,
description="Updated scraper volume configuration"
)
db.session.commit()
return config
class DownloadPathConfig(db.Model):
"""Model to store the base path for downloaded files."""
@ -220,7 +251,7 @@ class DownloadPathConfig(db.Model):
"""Get the configured download path, creating default if needed."""
config = cls.query.first()
if not config:
config = cls(path="/path/to/dummy/papers") # Ensure default exists
config = cls(path="/tmp/") # Ensure default exists
db.session.add(config)
db.session.commit()
return config.path
@ -341,6 +372,7 @@ def init_schedule_config():
default_volume = VolumeConfig(volume=100)
db.session.add(default_volume)
db.session.commit()
# Initialize DownloadPathConfig if it doesn't exist
if DownloadPathConfig.query.count() == 0:
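The new VolumeConfig class methods, together with the existing DownloadPathConfig.get_path(), give callers one place to read and update configuration without touching the session directly. A rough usage sketch (requires a Flask application context; the values shown are illustrative):

# Example (not part of the commit): reading and updating config via the class methods
from scipaperloader.models import VolumeConfig, DownloadPathConfig

print(VolumeConfig.get_current_volume())   # e.g. 100.0, created on first access
VolumeConfig.set_volume(250)               # also logs a scraper_volume config change
print(DownloadPathConfig.get_path())       # e.g. "/tmp/", created on first access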

View File

@ -1,2 +1,18 @@
# This package contains all scraper modules.
# Each scraper should implement the BaseScraper interface from base.py.
from .base import BaseScraper, ScrapeResult
from .factory import get_scraper, get_available_scrapers
from .manager import ScraperManager
from .dummy import Scraper as DummyScraper
from .failed_retry import Scraper as FailedRetryScraper
__all__ = [
'BaseScraper',
'ScrapeResult',
'get_scraper',
'get_available_scrapers',
'ScraperManager',
'DummyScraper',
'FailedRetryScraper'
]

View File

@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import NamedTuple, Optional, Dict
from typing import NamedTuple, Optional, Dict, List
from datetime import datetime
class ScrapeResult(NamedTuple):
@ -12,6 +12,12 @@ class ScrapeResult(NamedTuple):
class BaseScraper(ABC):
"""Base class for all scraper implementations."""
# Default input/output statuses - can be overridden by subclasses
INPUT_STATUSES = ["New"] # Which paper statuses this scraper will process
OUTPUT_STATUS_SUCCESS = "Done" # Status to set on successful scraping
OUTPUT_STATUS_FAILURE = "Failed" # Status to set on failed scraping
OUTPUT_STATUS_PROCESSING = "Pending" # Status to set while processing
@abstractmethod
def scrape(self, doi: str) -> ScrapeResult:
"""
@ -32,3 +38,15 @@ class BaseScraper(ABC):
def get_description(self) -> str:
"""Return a description of this scraper."""
return getattr(self.__class__, "__doc__", "No description available")
def get_input_statuses(self) -> List[str]:
"""Return list of paper statuses this scraper can process."""
return self.INPUT_STATUSES
def get_output_statuses(self) -> Dict[str, str]:
"""Return mapping of result types to output statuses."""
return {
"success": self.OUTPUT_STATUS_SUCCESS,
"failure": self.OUTPUT_STATUS_FAILURE,
"processing": self.OUTPUT_STATUS_PROCESSING
}
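The new status attributes turn BaseScraper into a small pipeline contract: each scraper declares which paper statuses it consumes and which it emits, and the manager uses get_input_statuses() / get_output_statuses() to select papers and revert them on stop. A sketch of a hypothetical third stage that would slot into this contract (module name and status values are assumptions):

# Example (not part of the commit): a hypothetical scraper that post-processes "Done" papers
import time
from datetime import datetime
from scipaperloader.scrapers.base import BaseScraper, ScrapeResult

class Scraper(BaseScraper):
    """Hypothetical enrichment stage that runs after successful downloads."""

    INPUT_STATUSES = ["Done"]
    OUTPUT_STATUS_SUCCESS = "Enriched"
    OUTPUT_STATUS_FAILURE = "Done"          # leave the paper retrievable on failure
    OUTPUT_STATUS_PROCESSING = "Enriching"

    def scrape(self, doi: str) -> ScrapeResult:
        start_time = time.time()
        # ... real enrichment work would go here ...
        return ScrapeResult(
            status="success",
            message=f"Enriched metadata for {doi}",
            data={},
            duration=time.time() - start_time,
            timestamp=datetime.utcnow(),
        )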

View File

@ -10,6 +10,12 @@ from ..db import db
class Scraper(BaseScraper):
"""Dummy scraper for testing purposes that simulates paper downloading."""
# This scraper processes "New" papers and outputs "Done"/"Failed"
INPUT_STATUSES = ["New"]
OUTPUT_STATUS_SUCCESS = "Done"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "Pending"
def scrape(self, doi: str) -> ScrapeResult:
"""Simulate scraping a paper with realistic timing and random success/failure."""
start_time = time.time()

View File

@ -1,5 +1,4 @@
import importlib
from flask import current_app
from .base import BaseScraper
def get_scraper() -> BaseScraper:
@ -7,10 +6,16 @@ def get_scraper() -> BaseScraper:
from ..models import ScraperModuleConfig, ActivityLog
try:
# Get module name from database first, fallback to config
# Get module name from database first, fallback to dummy
name = ScraperModuleConfig.get_current_module()
if not name:
name = current_app.config.get("SCRAPER_MODULE", "dummy")
# Only try to access Flask config if we're in app context
try:
from flask import current_app
name = current_app.config.get("SCRAPER_MODULE", "dummy")
except RuntimeError:
# No app context, use dummy
name = "dummy"
module = importlib.import_module(f"scipaperloader.scrapers.{name}")
cls = getattr(module, "Scraper")
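A small usage sketch of the factory and the package exports (run inside the Flask application context; the return shape of get_available_scrapers() is an assumption):

# Example (not part of the commit): resolving the configured scraper
from scipaperloader.scrapers import get_scraper, get_available_scrapers

scraper = get_scraper()              # falls back to the dummy module when nothing is configured
print(scraper.get_name())            # e.g. "dummy"
print(scraper.get_input_statuses())  # ["New"]
print(scraper.get_output_statuses()) # {"success": "Done", "failure": "Failed", "processing": "Pending"}
print(get_available_scrapers())      # listing of importable scraper modules (shape assumed)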

View File

@ -0,0 +1,123 @@
import time
import random
import os
from datetime import datetime
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db
class Scraper(BaseScraper):
"""Retry scraper that attempts to re-process failed papers with different strategies."""
# This scraper specifically targets "Failed" papers and retries them
INPUT_STATUSES = ["Failed"]
OUTPUT_STATUS_SUCCESS = "Done"
OUTPUT_STATUS_FAILURE = "Failed"
OUTPUT_STATUS_PROCESSING = "Retrying"
def scrape(self, doi: str) -> ScrapeResult:
"""Retry scraping a failed paper with enhanced error handling."""
start_time = time.time()
paper = PaperMetadata.query.filter_by(doi=doi).first()
if not paper:
return ScrapeResult(
status="error",
message=f"No paper found for DOI {doi}",
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
# Log retry attempt
ActivityLog.log_scraper_activity(
action="retry_failed_paper",
status="info",
description=f"Retrying failed paper: {paper.title}",
paper_id=paper.id
)
# Simulate longer processing time for retry (2-5 seconds)
processing_time = random.uniform(2, 5)
time.sleep(processing_time)
# Simulate 60% success rate on retry (lower than initial attempt)
success = random.random() < 0.6
result_data = {}
if success:
# Get download path and create dummy file
download_path = DownloadPathConfig.get_path()
file_name = f"{doi.replace('/', '_')}_retry.pdf"
file_path = f"{download_path}/{file_name}"
try:
# Ensure directory exists
os.makedirs(download_path, exist_ok=True)
# Create a dummy PDF file
with open(file_path, 'w') as f:
f.write(f"Dummy PDF content for retry of {doi}")
result_data = {"file_path": file_path}
# Log success
ActivityLog.log_scraper_activity(
action="retry_scrape_success",
status="success",
description=f"Successfully retried {doi} on second attempt",
paper_id=paper.id
)
result = ScrapeResult(
status="success",
message=f"Successfully retried paper {doi}",
data=result_data,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
except Exception as e:
error_msg = f"Failed to save retry file: {str(e)}"
ActivityLog.log_scraper_activity(
action="retry_scrape_file_error",
status="error",
description=error_msg,
paper_id=paper.id
)
result = ScrapeResult(
status="error",
message=error_msg,
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
else:
# Retry failed - generate different error message
error_messages = [
"Retry failed: Still no access to publisher",
"Retry failed: Alternative download methods exhausted",
"Retry failed: DOI appears permanently inaccessible",
"Retry failed: Network timeout persists"
]
error_msg = random.choice(error_messages)
ActivityLog.log_scraper_activity(
action="retry_scrape_failure",
status="error",
description=f"Retry failed for {doi}: {error_msg}",
paper_id=paper.id
)
result = ScrapeResult(
status="error",
message=error_msg,
data=None,
duration=time.time() - start_time,
timestamp=datetime.utcnow()
)
return result

View File

@ -0,0 +1,747 @@
"""
Simplified scraper management system with hourly quota scheduling.
"""
import random
import math
import redis
from datetime import datetime, timedelta
from typing import List, Dict, Optional
from sqlalchemy import func
from ..models import (
PaperMetadata,
ScheduleConfig,
VolumeConfig,
ScraperState,
ActivityLog,
ScraperModuleConfig
)
from ..db import db
from ..cache_utils import get_cached_hourly_quota
from .factory import get_scraper, get_available_scrapers
from ..celery import celery
class ScraperManager:
"""Manages scraper operations with hourly quota-based scheduling."""
def __init__(self):
self.current_scraper = None
self.pending_papers = [] # Track papers being processed
# Initialize Redis client for delayed task management
self.redis_client = None
self._init_redis_client()
def _init_redis_client(self):
"""Initialize Redis client for delayed task management."""
try:
# Use same Redis configuration as Celery
self.redis_client = redis.Redis(
host='localhost',
port=6379,
db=0,
decode_responses=True
)
# Test connection
self.redis_client.ping()
except Exception as e:
ActivityLog.log_error(
error_message=f"Failed to initialize Redis client: {str(e)}",
source="ScraperManager._init_redis_client"
)
self.redis_client = None
def _clear_delayed_tasks_from_redis(self) -> int:
"""Clear delayed tasks from Redis structures used by Celery.
Based on analysis, Celery stores delayed tasks in:
- 'unacked_index': Sorted set containing task IDs with execution timestamps
- 'unacked': Hash containing task data keyed by task ID
Returns:
int: Number of delayed tasks cleared
"""
if not self.redis_client:
try:
ActivityLog.log_error(
error_message="Redis client not available - cannot clear delayed tasks",
source="ScraperManager._clear_delayed_tasks_from_redis"
)
except RuntimeError:
# Working outside application context - just print instead
print("❌ Redis client not available - cannot clear delayed tasks")
return 0
cleared_count = 0
try:
# Define scraper task patterns to identify our tasks
scraper_patterns = [
'process_single_paper',
'process_papers_batch',
'hourly_scraper_scheduler'
]
try:
ActivityLog.log_scraper_activity(
action="check_delayed_tasks",
status="info",
description="Checking Celery delayed task structures (unacked_index, unacked)"
)
except RuntimeError:
print("🔍 Checking Celery delayed task structures (unacked_index, unacked)")
# Check 'unacked_index' (sorted set with task IDs and timestamps)
unacked_index_cleared = 0
if self.redis_client.exists('unacked_index'):
try:
# Get all task IDs from the sorted set
task_ids = self.redis_client.zrange('unacked_index', 0, -1)
if task_ids:
try:
ActivityLog.log_scraper_activity(
action="scan_unacked_index",
status="info",
description=f"Found {len(task_ids)} tasks in 'unacked_index'"
)
except RuntimeError:
print(f"📋 Found {len(task_ids)} tasks in 'unacked_index'")
# Check each task ID against the 'unacked' hash to get task details
scraper_task_ids = []
for task_id in task_ids:
try:
# Get task data from 'unacked' hash
task_data = self.redis_client.hget('unacked', task_id)
if task_data:
# Check if this task contains any of our scraper patterns
if any(pattern in str(task_data) for pattern in scraper_patterns):
scraper_task_ids.append(task_id)
except Exception:
# Skip individual task errors
continue
# Remove scraper task IDs from both structures
for task_id in scraper_task_ids:
try:
# Remove from unacked_index (sorted set)
removed_from_index = self.redis_client.zrem('unacked_index', task_id)
# Remove from unacked (hash)
removed_from_hash = self.redis_client.hdel('unacked', task_id)
if removed_from_index or removed_from_hash:
unacked_index_cleared += 1
except Exception as e:
try:
ActivityLog.log_error(
error_message=f"Error removing delayed task {task_id}: {str(e)}",
source="ScraperManager._clear_delayed_tasks_from_redis"
)
except RuntimeError:
print(f"❌ Error removing delayed task {task_id}: {str(e)}")
continue
cleared_count += unacked_index_cleared
if unacked_index_cleared > 0:
try:
ActivityLog.log_scraper_activity(
action="clear_unacked_tasks",
status="success",
description=f"Cleared {unacked_index_cleared} scraper tasks from unacked structures"
)
except RuntimeError:
print(f"✅ Cleared {unacked_index_cleared} scraper tasks from unacked structures")
else:
try:
ActivityLog.log_scraper_activity(
action="check_unacked_index",
status="info",
description="No tasks found in 'unacked_index'"
)
except RuntimeError:
print(" No tasks found in 'unacked_index'")
except Exception as e:
try:
ActivityLog.log_error(
error_message=f"Error accessing 'unacked_index': {str(e)}",
source="ScraperManager._clear_delayed_tasks_from_redis"
)
except RuntimeError:
print(f"❌ Error accessing 'unacked_index': {str(e)}")
else:
try:
ActivityLog.log_scraper_activity(
action="check_unacked_index",
status="info",
description="'unacked_index' key does not exist - no delayed tasks"
)
except RuntimeError:
print(" 'unacked_index' key does not exist - no delayed tasks")
# Also check the 'celery' queue for immediate tasks (backup check)
celery_cleared = 0
try:
queue_length = self.redis_client.llen('celery')
if queue_length and queue_length > 0:
# Scan for any scraper tasks in the immediate queue
scraper_tasks = []
for i in range(queue_length):
try:
task_data = self.redis_client.lindex('celery', i)
if task_data and any(pattern in str(task_data) for pattern in scraper_patterns):
scraper_tasks.append(task_data)
except Exception:
continue
# Remove scraper tasks from celery queue
for task_data in scraper_tasks:
try:
removed_count = self.redis_client.lrem('celery', 0, task_data)
celery_cleared += removed_count
except Exception:
continue
cleared_count += celery_cleared
if celery_cleared > 0:
try:
ActivityLog.log_scraper_activity(
action="clear_celery_tasks",
status="success",
description=f"Cleared {celery_cleared} scraper tasks from 'celery' queue"
)
except RuntimeError:
print(f"✅ Cleared {celery_cleared} scraper tasks from 'celery' queue")
except Exception as e:
try:
ActivityLog.log_error(
error_message=f"Error checking 'celery' queue: {str(e)}",
source="ScraperManager._clear_delayed_tasks_from_redis"
)
except RuntimeError:
print(f"❌ Error checking 'celery' queue: {str(e)}")
# Summary
if cleared_count > 0:
try:
ActivityLog.log_scraper_activity(
action="clear_delayed_tasks_complete",
status="success",
description=f"Total delayed scraper tasks cleared from Redis: {cleared_count} (unacked: {unacked_index_cleared}, celery: {celery_cleared})"
)
except RuntimeError:
print(f"✅ Total delayed scraper tasks cleared from Redis: {cleared_count} (unacked: {unacked_index_cleared}, celery: {celery_cleared})")
else:
try:
ActivityLog.log_scraper_activity(
action="clear_delayed_tasks_complete",
status="info",
description="No delayed scraper tasks found to clear in Redis"
)
except RuntimeError:
print(" No delayed scraper tasks found to clear in Redis")
return cleared_count
except Exception as e:
try:
ActivityLog.log_error(
error_message=f"Failed to clear delayed tasks from Redis: {str(e)}",
source="ScraperManager._clear_delayed_tasks_from_redis"
)
except RuntimeError:
print(f"❌ Failed to clear delayed tasks from Redis: {str(e)}")
return 0
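# For debugging, the same structures this method walks ('unacked_index', 'unacked' and the
# 'celery' list) can be inspected directly; a minimal sketch (not part of the commit), assuming
# the same localhost Redis instance used above:
import redis

r = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
for task_id, eta in r.zrange('unacked_index', 0, -1, withscores=True):
    print(task_id, eta, (r.hget('unacked', task_id) or '')[:80])
print('immediate queue length:', r.llen('celery'))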
def start_scraper(self) -> Dict[str, str]:
"""Start the scraper system."""
try:
# Get current scraper
self.current_scraper = get_scraper()
# Activate scraper state
ScraperState.set_active(True)
ScraperState.set_paused(False)
scraper_name = self.current_scraper.get_name()
ActivityLog.log_scraper_command(
action="start_scraper",
status="success",
description=f"Started scraper: {scraper_name}. Use /trigger-immediate endpoint to immediately schedule papers instead of waiting for the next hourly boundary."
)
return {"status": "success", "message": "Scraper started successfully. Papers will be scheduled at the next hourly boundary, or use /trigger-immediate to schedule immediately."}
except Exception as e:
ActivityLog.log_error(
error_message=f"Failed to start scraper: {str(e)}",
source="ScraperManager.start_scraper"
)
return {"status": "error", "message": str(e)}
def pause_scraper(self) -> Dict[str, str]:
"""Pause the scraper system."""
try:
ScraperState.set_paused(True)
ActivityLog.log_scraper_command(
action="pause_scraper",
status="success",
description="Scraper paused - processing will halt"
)
return {"status": "success", "message": "Scraper paused"}
except Exception as e:
return {"status": "error", "message": str(e)}
def resume_scraper(self) -> Dict[str, str]:
"""Resume the scraper system."""
try:
ScraperState.set_paused(False)
ActivityLog.log_scraper_command(
action="resume_scraper",
status="success",
description="Scraper resumed - processing will continue"
)
return {"status": "success", "message": "Scraper resumed"}
except Exception as e:
return {"status": "error", "message": str(e)}
def stop_scraper(self) -> Dict[str, str]:
"""Stop the scraper, revoke all running tasks, and revert pending papers."""
try:
# First, revoke all running tasks
revoked_count = 0
delayed_cleared_count = 0
ActivityLog.log_scraper_command(
action="stop_scraper_start",
status="info",
description="Beginning scraper stop process with task revocation and delayed task clearing"
)
try:
# Get Celery inspector to check for running tasks
i = celery.control.inspect()
active = i.active() or {}
scheduled = i.scheduled() or {}
reserved = i.reserved() or {}
# Revoke active tasks
for worker, tasks in active.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
ActivityLog.log_scraper_activity(
action="revoke_task",
status="success",
description=f"Revoked active task: {task.get('name', 'unknown')} (ID: {task['id']})"
)
# Revoke scheduled tasks
for worker, tasks in scheduled.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
ActivityLog.log_scraper_activity(
action="revoke_task",
status="success",
description=f"Revoked scheduled task: {task.get('name', 'unknown')} (ID: {task['id']})"
)
# Revoke reserved tasks
for worker, tasks in reserved.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
ActivityLog.log_scraper_activity(
action="revoke_task",
status="success",
description=f"Revoked reserved task: {task.get('name', 'unknown')} (ID: {task['id']})"
)
# Purge all task queues
celery.control.purge()
ActivityLog.log_scraper_activity(
action="purge_queues",
status="success",
description="Purged all task queues"
)
# **NEW: Clear delayed tasks from Redis sorted sets**
delayed_cleared_count = self._clear_delayed_tasks_from_redis()
# Additional cleanup: revoke any remaining scraper-related tasks by name pattern
try:
# Use broadcast to revoke tasks that match scraper patterns
scraper_task_patterns = [
'process_single_paper',
'process_papers_batch',
'hourly_scraper_scheduler'
]
# Get a fresh inspection of tasks after purge
fresh_inspect = celery.control.inspect()
all_tasks = {}
all_tasks.update(fresh_inspect.active() or {})
all_tasks.update(fresh_inspect.scheduled() or {})
all_tasks.update(fresh_inspect.reserved() or {})
additional_revoked = 0
for worker, tasks in all_tasks.items():
for task in tasks:
task_name = task.get('name', '')
task_id = task.get('id', '')
if any(pattern in task_name for pattern in scraper_task_patterns) and task_id:
celery.control.revoke(task_id, terminate=True)
additional_revoked += 1
ActivityLog.log_scraper_activity(
action="revoke_scraper_task",
status="success",
description=f"Revoked lingering scraper task: {task_name} (ID: {task_id})"
)
if additional_revoked > 0:
ActivityLog.log_scraper_activity(
action="cleanup_scraper_tasks",
status="success",
description=f"Additional cleanup: revoked {additional_revoked} lingering scraper tasks"
)
except Exception as e:
ActivityLog.log_error(
error_message=f"Error during additional scraper task cleanup: {str(e)}",
source="ScraperManager.stop_scraper.cleanup"
)
except Exception as e:
ActivityLog.log_error(
error_message=f"Error revoking tasks: {str(e)}",
source="ScraperManager.stop_scraper"
)
# Continue with paper reversion even if task revocation fails
# Get current scraper to know what status to revert to
scraper = get_scraper()
input_statuses = scraper.get_input_statuses()
# Find papers that are currently being processed
processing_status = scraper.get_output_statuses()["processing"]
pending_papers = PaperMetadata.query.filter_by(status=processing_status).all()
# Revert their status to the first input status
reverted_count = 0
if pending_papers and input_statuses:
revert_status = input_statuses[0] # Use first input status as default
for paper in pending_papers:
# Try to use previous_status if available, otherwise use first input status
if hasattr(paper, 'previous_status') and paper.previous_status:
paper.status = paper.previous_status
else:
paper.status = revert_status
paper.updated_at = datetime.utcnow()
reverted_count += 1
db.session.commit()
ActivityLog.log_scraper_activity(
action="revert_pending_papers",
status="success",
description=f"Reverted {reverted_count} papers from '{processing_status}' to previous status"
)
# Deactivate scraper
ScraperState.set_active(False)
ScraperState.set_paused(False)
ActivityLog.log_scraper_command(
action="stop_scraper",
status="success",
description=f"Scraper stopped. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers."
)
return {
"status": "success",
"message": f"Scraper stopped. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers to previous status."
}
except Exception as e:
ActivityLog.log_error(
error_message=f"Failed to stop scraper: {str(e)}",
source="ScraperManager.stop_scraper"
)
return {"status": "error", "message": str(e)}
def reset_scraper(self) -> Dict[str, str]:
"""Reset scraper state, revoke all running tasks, and clear all processing statuses."""
try:
# First, revoke all running tasks (similar to stop_scraper)
revoked_count = 0
ActivityLog.log_scraper_command(
action="reset_scraper_start",
status="info",
description="Beginning scraper reset process with task revocation"
)
try:
# Get Celery inspector to check for running tasks
i = celery.control.inspect()
active = i.active() or {}
scheduled = i.scheduled() or {}
reserved = i.reserved() or {}
# Revoke all tasks (active, scheduled, reserved)
for queue_name, queue_tasks in [("active", active), ("scheduled", scheduled), ("reserved", reserved)]:
for worker, tasks in queue_tasks.items():
for task in tasks:
if 'id' in task:
celery.control.revoke(task['id'], terminate=True)
revoked_count += 1
ActivityLog.log_scraper_activity(
action="revoke_task",
status="success",
description=f"Revoked {queue_name} task: {task.get('name', 'unknown')} (ID: {task['id']})"
)
# Purge all task queues
celery.control.purge()
ActivityLog.log_scraper_activity(
action="purge_queues",
status="success",
description="Purged all task queues during reset"
)
except Exception as e:
ActivityLog.log_error(
error_message=f"Error revoking tasks during reset: {str(e)}",
source="ScraperManager.reset_scraper"
)
# Continue with paper reversion even if task revocation fails
# Get current scraper configuration
scraper = get_scraper()
input_statuses = scraper.get_input_statuses()
processing_status = scraper.get_output_statuses()["processing"]
# Reset all papers in processing status
pending_papers = PaperMetadata.query.filter_by(status=processing_status).all()
reverted_count = 0
if pending_papers and input_statuses:
revert_status = input_statuses[0]
for paper in pending_papers:
# Try to use previous_status if available, otherwise use first input status
if hasattr(paper, 'previous_status') and paper.previous_status:
paper.status = paper.previous_status
else:
paper.status = revert_status
paper.updated_at = datetime.utcnow()
paper.error_msg = None # Clear any error messages
reverted_count += 1
db.session.commit()
# Reset scraper state
ScraperState.set_active(False)
ScraperState.set_paused(False)
ActivityLog.log_scraper_command(
action="reset_scraper",
status="success",
description=f"Scraper reset. Revoked {revoked_count} tasks and reverted {reverted_count} papers."
)
return {
"status": "success",
"message": f"Scraper reset. Revoked {revoked_count} tasks and reverted {reverted_count} papers to original status."
}
except Exception as e:
return {"status": "error", "message": str(e)}
def get_current_hour_quota(self) -> int:
"""Calculate papers to process in current hour based on schedule."""
try:
return get_cached_hourly_quota(self._calculate_papers_for_current_hour)
except Exception as e:
ActivityLog.log_error(
error_message=f"Error calculating hourly quota: {str(e)}",
source="ScraperManager.get_current_hour_quota"
)
return 0
def _calculate_papers_for_current_hour(self) -> int:
"""Internal method to calculate hourly quota."""
try:
# Get current hour and volume config
current_hour = datetime.now().hour
volume_config = VolumeConfig.get_current_volume()
daily_volume = volume_config if volume_config else 100
# Get schedule config for current hour
schedule_config = ScheduleConfig.query.filter_by(hour=current_hour).first()
current_weight = schedule_config.weight if schedule_config else 1.0
# Get total weight across all hours
total_weight = db.session.query(func.sum(ScheduleConfig.weight)).scalar() or 24.0
# Calculate quota: (current_weight / total_weight) * daily_volume
quota = math.ceil((current_weight / total_weight) * daily_volume)
ActivityLog.log_scraper_activity(
action="calculate_hourly_quota",
status="info",
description=f"Hour {current_hour}: quota={quota} (weight={current_weight}, total_weight={total_weight}, daily_volume={daily_volume})"
)
return max(1, quota) # Ensure at least 1 paper per hour
except Exception as e:
ActivityLog.log_error(
error_message=f"Error in quota calculation: {str(e)}",
source="ScraperManager._calculate_papers_for_current_hour"
)
return 1 # Fallback to 1 paper per hour
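# Worked example of the quota formula above (illustrative numbers, not part of the commit):
# daily_volume = 100 papers/day, current-hour weight = 2.0, total weight across 24 hours = 30.0
#     math.ceil((2.0 / 30.0) * 100)  ->  math.ceil(6.67)  ->  7 papers scheduled for this hour
# The max(1, quota) guard above only kicks in when the current hour's weight is configured as 0.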
def select_papers_for_processing(self, limit: Optional[int] = None) -> List[PaperMetadata]:
"""Select papers for processing based on current scraper configuration."""
try:
scraper = get_scraper()
input_statuses = scraper.get_input_statuses()
if not input_statuses:
return []
# Use provided limit or calculate from hourly quota
papers_needed = limit if limit is not None else self.get_current_hour_quota()
# Query papers with input statuses, randomize selection
papers = (PaperMetadata.query
.filter(PaperMetadata.status.in_(input_statuses))
.order_by(func.random())
.limit(papers_needed)
.all())
ActivityLog.log_scraper_activity(
action="select_papers",
status="info",
description=f"Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})"
)
return papers
except Exception as e:
ActivityLog.log_error(
error_message=f"Error selecting papers: {str(e)}",
source="ScraperManager.select_papers_for_processing"
)
return []
def process_paper(self, paper: PaperMetadata) -> Dict:
"""Process a single paper using the current scraper."""
try:
scraper = get_scraper()
output_statuses = scraper.get_output_statuses()
# Store the previous status before changing it
previous_status = paper.status
# Update paper status to processing
paper.previous_status = previous_status
paper.status = output_statuses["processing"]
paper.updated_at = datetime.utcnow()
db.session.commit()
# Perform scraping
result = scraper.scrape(paper.doi)
# Update paper status based on result
if result.status == "success":
paper.status = output_statuses["success"]
paper.error_msg = None
if result.data and "file_path" in result.data:
paper.file_path = result.data["file_path"]
else:
paper.status = output_statuses["failure"]
paper.error_msg = result.message
paper.updated_at = datetime.utcnow()
db.session.commit()
# Log result
ActivityLog.log_scraper_activity(
action="process_paper",
paper_id=paper.id,
status=result.status,
description=f"Processed {paper.doi}: {result.message}"
)
return {
"paper_id": paper.id,
"status": result.status,
"message": result.message,
"duration": result.duration
}
except Exception as e:
# Revert paper status on error
try:
input_statuses = get_scraper().get_input_statuses()
if input_statuses:
paper.status = input_statuses[0]
paper.error_msg = f"Processing error: {str(e)}"
paper.updated_at = datetime.utcnow()
db.session.commit()
except:
pass # Don't fail if reversion fails
ActivityLog.log_error(
error_message=f"Error processing paper {paper.id}: {str(e)}",
source="ScraperManager.process_paper"
)
return {"paper_id": paper.id, "status": "error", "message": str(e)}
def get_status(self) -> Dict:
"""Get current scraper status."""
scraper_state = ScraperState.get_current_state()
scraper = get_scraper()
# Count papers by status
input_statuses = scraper.get_input_statuses()
output_statuses = scraper.get_output_statuses()
available_count = (PaperMetadata.query
.filter(PaperMetadata.status.in_(input_statuses))
.count())
processing_count = (PaperMetadata.query
.filter_by(status=output_statuses["processing"])
.count())
return {
"active": scraper_state.is_active,
"paused": scraper_state.is_paused,
"current_scraper": scraper.get_name(),
"input_statuses": input_statuses,
"output_statuses": output_statuses,
"available_papers": available_count,
"processing_papers": processing_count,
"current_hour_quota": self.get_current_hour_quota()
}
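Taken together, the manager exposes a small imperative surface that the Flask blueprint and the Celery tasks both drive. A rough end-to-end sketch (requires an application context, a reachable Redis, and at least one paper in an input status; output shapes are abbreviated):

# Example (not part of the commit): driving the manager directly
from scipaperloader.scrapers.manager import ScraperManager

manager = ScraperManager()
print(manager.start_scraper())             # {"status": "success", ...}
print(manager.get_current_hour_quota())    # papers allotted to the current hour

for paper in manager.select_papers_for_processing(limit=2):
    print(manager.process_paper(paper))    # per-paper status / message / duration

print(manager.get_status())                # available/processing counts and current scraper
print(manager.stop_scraper())              # revokes tasks, reverts papers left in the processing status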

View File

@ -0,0 +1,189 @@
"""
Hourly scheduler task that processes papers at random times within each hour.
"""
import random
from datetime import datetime, timedelta
from typing import Optional
from celery import shared_task
from ..models import ScraperState, ActivityLog
from .manager import ScraperManager
@shared_task(bind=True)
def hourly_scraper_scheduler(self):
"""
Hourly task that schedules paper processing at random times within the hour.
This task runs at the beginning of each hour and:
1. Calculates how many papers to process this hour
2. Schedules individual paper processing tasks at random times within the hour
"""
try:
# Check if scraper is active
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active:
ActivityLog.log_scraper_activity(
action="hourly_scheduler",
status="info",
description="Hourly scheduler skipped - scraper not active"
)
# Disable retries for inactive scheduler
self.retry = False
return {"status": "inactive", "papers_scheduled": 0}
if scraper_state.is_paused:
ActivityLog.log_scraper_activity(
action="hourly_scheduler",
status="info",
description="Hourly scheduler skipped - scraper paused"
)
# Disable retries for paused scheduler
self.retry = False
return {"status": "paused", "papers_scheduled": 0}
# Initialize scraper manager
manager = ScraperManager()
# Get papers to process this hour
papers = manager.select_papers_for_processing()
if not papers:
ActivityLog.log_scraper_activity(
action="hourly_scheduler",
status="info",
description="No papers available for processing this hour"
)
return {"status": "empty", "papers_scheduled": 0}
# Schedule papers at random times within the hour (1 second to ~58 minutes in, leaving a buffer before the next run)
scheduled_count = 0
current_time = datetime.now()
for paper in papers:
# Random delay between 1 second and 58 minutes
delay_seconds = random.randint(1, 3480) # Up to 58 minutes
# Schedule the task using Celery's task registry to avoid circular import issues
from ..celery import celery
celery.send_task(
'scipaperloader.scrapers.tasks.process_single_paper',
args=[paper.id],
countdown=delay_seconds
)
scheduled_count += 1
# Log each scheduled paper
schedule_time = current_time + timedelta(seconds=delay_seconds)
ActivityLog.log_scraper_activity(
action="schedule_paper",
paper_id=paper.id,
status="info",
description=f"Scheduled paper {paper.doi} for processing at {schedule_time.strftime('%H:%M:%S')}"
)
ActivityLog.log_scraper_activity(
action="hourly_scheduler",
status="success",
description=f"Scheduled {scheduled_count} papers for random processing within this hour"
)
return {"status": "success", "papers_scheduled": scheduled_count}
except Exception as e:
ActivityLog.log_error(
error_message=f"Hourly scheduler error: {str(e)}",
source="hourly_scraper_scheduler"
)
return {"status": "error", "message": str(e)}
@shared_task(bind=True)
def process_single_paper(self, paper_id: int):
"""
Process a single paper. This task is scheduled at random times within each hour.
Args:
paper_id: ID of the paper to process
"""
try:
# Double-check scraper state before processing
scraper_state = ScraperState.get_current_state()
if not scraper_state.is_active:
ActivityLog.log_scraper_activity(
action="process_single_paper",
paper_id=paper_id,
status="skipped",
description="Skipped processing - scraper not active"
)
# Use Celery's ignore to mark this task as completed without error
self.retry = False
return {"status": "inactive", "paper_id": paper_id}
if scraper_state.is_paused:
ActivityLog.log_scraper_activity(
action="process_single_paper",
paper_id=paper_id,
status="skipped",
description="Skipped processing - scraper paused"
)
# Use Celery's ignore for paused state too
self.retry = False
return {"status": "paused", "paper_id": paper_id}
# Get the paper
from ..models import PaperMetadata
paper = PaperMetadata.query.get(paper_id)
if not paper:
return {"status": "error", "message": f"Paper {paper_id} not found"}
# Process the paper using scraper manager
manager = ScraperManager()
result = manager.process_paper(paper)
return result
except Exception as e:
ActivityLog.log_error(
error_message=f"Error processing paper {paper_id}: {str(e)}",
source="process_single_paper"
)
return {"status": "error", "paper_id": paper_id, "message": str(e)}
@shared_task(bind=True)
def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] = None):
"""
Process multiple papers in a batch for immediate processing.
Args:
paper_ids: List of paper IDs to process
scraper_module: Optional specific scraper module to use
"""
try:
results = []
manager = ScraperManager()
for paper_id in paper_ids:
from ..models import PaperMetadata
paper = PaperMetadata.query.get(paper_id)
if paper:
result = manager.process_paper(paper)
results.append(result)
else:
results.append({
"paper_id": paper_id,
"status": "error",
"message": "Paper not found"
})
return {"results": results, "total_processed": len(results)}
except Exception as e:
ActivityLog.log_error(
error_message=f"Error processing batch: {str(e)}",
source="process_papers_batch"
)
return {"status": "error", "message": str(e)}

View File

@ -29,6 +29,11 @@
height: 400px;
}
.chart-wrapper {
position: relative;
height: 400px;
}
.notification {
position: fixed;
bottom: 20px;
@ -100,132 +105,137 @@
<div class="form-group">
<label for="volumeInput">Papers per day:</label>
<input type="number" class="form-control" id="volumeInput"
value="{{ volume_config.volume if volume_config else 100 }}" min="1"
max="{{ max_volume }}">
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
value="{{ volume_config if volume_config else 100 }}" min="1" max="{{ max_volume }}">
<button type="submit" class="btn btn-primary mt-2">
<i class="fas fa-save"></i> Update Volume
</button>
</div>
<button type="submit" class="btn btn-primary mt-2">Update Volume</button>
</form>
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
</div>
</form>
</div>
</div>
</div>
</div>
<!-- New row for single paper processing -->
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h5>Process Single Paper</h5>
</div>
<div class="card-body">
<div class="row">
<div class="col-md-6">
<form id="searchPaperForm" class="mb-3">
<div class="input-group">
<input type="text" id="paperSearchInput" class="form-control"
placeholder="Search paper by title, DOI, or ID...">
<button class="btn btn-outline-secondary" type="submit">Search</button>
</div>
</form>
</div>
<div class="col-md-6">
<div class="form-group">
<label for="scraperSelect">Scraper Module:</label>
<select class="form-control" id="scraperSelect">
<option value="">Use default system scraper</option>
<!-- Available scrapers will be populated here -->
</select>
<div class="form-text">
Select which scraper to use for processing the paper
</div>
<!-- New row for single paper processing -->
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h5>Process Single Paper</h5>
</div>
<div class="card-body">
<div class="row">
<div class="col-md-6">
<form id="searchPaperForm" class="mb-3">
<div class="input-group">
<input type="text" id="paperSearchInput" class="form-control"
placeholder="Search paper by title, DOI, or ID...">
<button class="btn btn-outline-secondary" type="submit">Search</button>
</div>
</form>
</div>
<div class="col-md-6">
<div class="form-group">
<label for="scraperSelect">Scraper Module:</label>
<select class="form-control" id="scraperSelect">
<option value="">Use default system scraper</option>
<!-- Available scrapers will be populated here -->
</select>
<div class="form-text">
Select which scraper to use for processing the paper
</div>
</div>
</div>
<div id="searchResults" class="mt-3 search-results-container d-none">
<table class="table table-hover table-striped">
<thead>
<tr>
<th>ID</th>
<th>Title</th>
<th>DOI</th>
<th>Status</th>
<th>Actions</th>
</tr>
</thead>
<tbody id="paperSearchResults">
<!-- Search results will be populated here -->
</tbody>
</table>
</div>
<div id="processingStatus" class="alert alert-info mt-3 d-none"></div>
</div>
<div id="searchResults" class="mt-3 search-results-container d-none">
<table class="table table-hover table-striped">
<thead>
<tr>
<th>ID</th>
<th>Title</th>
<th>DOI</th>
<th>Status</th>
<th>Actions</th>
</tr>
</thead>
<tbody id="paperSearchResults">
<!-- Search results will be populated here -->
</tbody>
</table>
</div>
<div id="processingStatus" class="alert alert-info mt-3 d-none"></div>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header d-flex justify-content-between align-items-center">
<h5>Scraping Activity</h5>
<div>
<div class="form-check form-switch">
<input class="form-check-input" type="checkbox" id="notificationsToggle" checked>
<label class="form-check-label" for="notificationsToggle">Show Notifications</label>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header d-flex justify-content-between align-items-center">
<h5>Scraping Activity</h5>
<div>
<div class="form-check form-switch">
<input class="form-check-input" type="checkbox" id="notificationsToggle" checked>
<label class="form-check-label" for="notificationsToggle">Show Notifications</label>
</div>
</div>
<div class="card-body">
<div class="btn-group mb-3">
<button class="btn btn-outline-secondary time-range-btn" data-hours="6">Last 6
hours</button>
<button class="btn btn-outline-secondary time-range-btn active" data-hours="24">Last 24
hours</button>
<button class="btn btn-outline-secondary time-range-btn" data-hours="72">Last 3
days</button>
</div>
<div class="stats-chart" id="activityChart"></div>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h5>Recent Activity</h5>
<div class="card-body">
<div class="btn-group mb-3">
<button class="btn btn-outline-secondary time-range-btn" data-hours="6">Last 6
hours</button>
<button class="btn btn-outline-secondary time-range-btn active" data-hours="24">Last 24
hours</button>
<button class="btn btn-outline-secondary time-range-btn" data-hours="72">Last 3
days</button>
</div>
<div class="card-body">
<div class="table-responsive">
<table class="table table-striped">
<thead>
<tr>
<th>Time</th>
<th>Action</th>
<th>Status</th>
<th>Description</th>
</tr>
</thead>
<tbody id="activityLog">
<tr>
<td colspan="4" class="text-center">Loading activities...</td>
</tr>
</tbody>
</table>
</div>
<div class="chart-wrapper">
<canvas id="activityChart"></canvas>
</div>
</div>
</div>
</div>
</div>
<div class="row mb-4">
<div class="col-12">
<div class="card">
<div class="card-header">
<h5>Recent Activity</h5>
</div>
<div class="card-body">
<div class="table-responsive">
<table class="table table-striped">
<thead>
<tr>
<th>Time</th>
<th>Action</th>
<th>Status</th>
<th>Description</th>
</tr>
</thead>
<tbody id="activityLog">
<tr>
<td colspan="4" class="text-center">Loading activities...</td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
</div>
</div>
</div>
{% endblock content %}
{% block scripts %}
{{ super() }}
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script>
// Global variables for the scraper dashboard
let notificationsEnabled = true;
@ -251,10 +261,14 @@
// Initialize the page
document.addEventListener('DOMContentLoaded', function () {
initStatusPolling();
loadActivityStats(currentTimeRange);
loadRecentActivity();
loadAvailableScrapers();
// Load chart data after a short delay to ensure Chart.js is loaded
setTimeout(() => {
loadActivityStats(currentTimeRange);
}, 100);
// Initialize event listeners
startButton.addEventListener('click', startScraper);
pauseButton.addEventListener('click', togglePauseScraper);
@ -470,13 +484,21 @@
fetch('/scraper/status')
.then(response => response.json())
.then(data => {
if (data.active) {
if (data.paused) {
statusIndicator.className = 'status-indicator status-paused';
console.log('Status data received:', data); // Debug log
// Remove all status classes first
statusIndicator.classList.remove('status-active', 'status-paused', 'status-inactive');
// Handle the new JSON structure with scraper_state
const scraperState = data.scraper_state || data; // Fallback for old structure
if (scraperState.active) {
if (scraperState.paused) {
statusIndicator.classList.add('status-paused');
statusText.textContent = 'Paused';
pauseButton.textContent = 'Resume';
} else {
statusIndicator.className = 'status-indicator status-active';
statusIndicator.classList.add('status-active');
statusText.textContent = 'Active';
pauseButton.textContent = 'Pause';
}
@ -485,13 +507,20 @@
stopButton.disabled = false;
resetButton.disabled = false; // Enable reset when active
} else {
statusIndicator.className = 'status-indicator status-inactive';
statusIndicator.classList.add('status-inactive');
statusText.textContent = 'Inactive';
startButton.disabled = false;
pauseButton.disabled = true;
stopButton.disabled = true;
resetButton.disabled = false; // Enable reset when inactive too
}
})
.catch(error => {
console.error('Error fetching status:', error);
// On error, show inactive state
statusIndicator.classList.remove('status-active', 'status-paused', 'status-inactive');
statusIndicator.classList.add('status-inactive');
statusText.textContent = 'Error';
});
}
@ -499,7 +528,13 @@
function startScraper() {
console.log("Start button clicked - sending request to /scraper/start");
fetch('/scraper/start', { method: 'POST' })
fetch('/scraper/start', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({})
})
.then(response => {
console.log("Response received:", response);
return response.json();
@ -521,7 +556,13 @@
}
function togglePauseScraper() {
fetch('/scraper/pause', { method: 'POST' })
fetch('/scraper/pause', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({})
})
.then(response => response.json())
.then(data => {
if (data.success) {
@ -535,7 +576,13 @@
}
function stopScraper() {
fetch('/scraper/stop', { method: 'POST' })
fetch('/scraper/stop', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify({})
})
.then(response => response.json())
.then(data => {
if (data.success) {
@ -706,14 +753,28 @@
// Load data functions
function loadActivityStats(hours) {
fetch(`/scraper/stats?hours=${hours}`)
.then(response => response.json())
.then(response => {
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
return response.json();
})
.then(data => {
console.log('Stats data loaded:', data);
renderActivityChart(data);
})
.catch(error => {
console.error('Failed to load activity stats:', error);
// Hide the chart or show an error message
const chartContainer = document.getElementById('activityChart').parentElement;
if (chartContainer) {
chartContainer.innerHTML = '<p class="text-muted">Chart data unavailable</p>';
}
});
}
function loadRecentActivity() {
fetch('/api/activity_logs?category=scraper_activity&limit=20')
fetch('/api/activity_logs?category=scraper_activity&category=scraper_command&limit=50')
.then(response => response.json())
.then(data => {
renderActivityLog(data);
@ -728,7 +789,19 @@
// Rendering functions
function renderActivityChart(data) {
const ctx = document.getElementById('activityChart').getContext('2d');
// Check if Chart.js is available
if (typeof Chart === 'undefined') {
console.error('Chart.js is not loaded');
return;
}
const chartElement = document.getElementById('activityChart');
if (!chartElement) {
console.error('Chart canvas element not found');
return;
}
const ctx = chartElement.getContext('2d');
// Extract the data for the chart
const labels = data.map(item => `${item.hour}:00`);
@ -857,7 +930,7 @@
let lastPaperTimestamp = new Date().toISOString();
function checkForNewPapers() {
fetch(`/api/activity_logs?category=scraper_activity&action=scrape_paper&after=${lastPaperTimestamp}&limit=5`)
fetch(`/api/activity_logs?category=scraper_activity&category=scraper_command&action=scrape_paper&after=${lastPaperTimestamp}&limit=5`)
.then(response => response.json())
.then(data => {
if (data && data.length > 0) {