refines modular scraping system. adds another dummy scraper
parent ac348696b5 · commit 1e97a9cc7b
@@ -1,9 +1,11 @@
from scipaperloader.celery import celery, configure_celery
# Import all task modules to ensure they are registered with Celery
import scipaperloader.scrapers.tasks  # Import new scheduler tasks
import scipaperloader.blueprints.scraper  # Import the scraper module with our tasks

# Configure celery with Flask app
configure_celery()

if __name__ == '__main__':
    celery.start()
    # Start the Celery worker
    celery.start(['worker', '--loglevel=info', '--concurrency=2'])
@@ -9,7 +9,7 @@ bp = Blueprint("api", __name__, url_prefix="/api")
def get_activity_logs():
    """Get activity logs with filtering options."""
    # Get query parameters
    category = request.args.get("category")
    categories = request.args.getlist("category")  # Changed to getlist for multiple values
    action = request.args.get("action")
    after = request.args.get("after")
    limit = request.args.get("limit", 20, type=int)
@@ -17,8 +17,9 @@ def get_activity_logs():
    # Build query
    query = ActivityLog.query

    if category:
        query = query.filter(ActivityLog.category == category)
    if categories:
        # Filter by multiple categories using in_() for SQL IN clause
        query = query.filter(ActivityLog.category.in_(categories))

    if action:
        query = query.filter(ActivityLog.action == action)
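Note (example, not part of the diff): because the route now reads the parameter with getlist, repeating ?category=... in the query string yields a list, and in_() renders it as a single SQL IN clause. A minimal sketch of the equivalent query for the dashboard's request:

    # GET /api/activity_logs?category=scraper_activity&category=scraper_command&limit=50
    # request.args.getlist("category") -> ["scraper_activity", "scraper_command"]
    logs = (ActivityLog.query
            .filter(ActivityLog.category.in_(["scraper_activity", "scraper_command"]))
            .limit(50)
            .all())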
@@ -34,21 +34,8 @@ def _update_volume(new_volume):
    if new_volume <= 0 or new_volume > MAX_VOLUME:
        return False, f"Volume must be between 1 and {MAX_VOLUME}", None

    volume_config = VolumeConfig.query.first()
    if not volume_config:
        volume_config = VolumeConfig(volume=new_volume)
        db.session.add(volume_config)
    else:
        old_value = volume_config.volume
        volume_config.volume = new_volume
        ActivityLog.log_config_change(
            config_key="scraper_volume",
            old_value=old_value,
            new_value=new_volume,
            description="Updated scraper volume"
        )

    db.session.commit()
    # Use the new class method to set the volume
    volume_config = VolumeConfig.set_volume(new_volume)

    # Invalidate and recalculate the hourly quota cache
    try:
File diff suppressed because it is too large
@@ -32,8 +32,8 @@ def configure_celery(app=None):
        task_reject_on_worker_lost=True,  # Requeue tasks if worker dies
        # Configure Beat schedule for periodic tasks
        beat_schedule={
            'scheduled-scraper-hourly': {
                'task': 'scipaperloader.blueprints.scraper.dummy_scheduled_scraper',
            'hourly-scraper-scheduler': {
                'task': 'scipaperloader.scrapers.tasks.hourly_scraper_scheduler',
                'schedule': crontab(minute=0),  # Run at the start of every hour
                'options': {'expires': 3600}
            },
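Note (example, not part of the diff): beat_schedule entries are only dispatched when a Celery beat process runs alongside the worker. A minimal sketch of a beat entry point, mirroring the worker start-up shown earlier; the standalone script itself is an assumption, not part of this commit:

    from scipaperloader.celery import celery, configure_celery

    configure_celery()

    if __name__ == '__main__':
        # Dispatches 'hourly-scraper-scheduler' at minute 0 of every hour
        celery.start(['beat', '--loglevel=info'])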
@@ -91,12 +91,13 @@ class ActivityLog(db.Model):
        return log

    @classmethod
    def log_scraper_command(cls, action, status=None, user_id=None, **extra):
    def log_scraper_command(cls, action, status=None, description=None, user_id=None, **extra):
        """Log a scraper command (start/stop/pause)."""
        log = cls(
            category=ActivityCategory.SCRAPER_COMMAND.value,
            action=action,
            status=status,
            description=description,
            user_id=user_id
        )
        log.set_extra_data(extra)
@@ -191,6 +192,7 @@ class PaperMetadata(db.Model):
    language = db.Column(db.String(50))
    published_online = db.Column(db.Date)  # or DateTime/String
    status = db.Column(db.String(10))  # 'Pending','Done','Failed'
    previous_status = db.Column(db.String(10), nullable=True)  # Store previous status for reversion
    file_path = db.Column(db.Text)
    error_msg = db.Column(db.Text)
    created_at = db.Column(db.DateTime, default=db.func.current_timestamp())
@@ -209,6 +211,35 @@ class ScheduleConfig(db.Model):
class VolumeConfig(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    volume = db.Column(db.Float)  # volume of papers to scrape per day

    @classmethod
    def get_current_volume(cls):
        """Get the current volume configuration, creating default if needed."""
        config = cls.query.first()
        if not config:
            config = cls(volume=100)
            db.session.add(config)
            db.session.commit()
        return config.volume

    @classmethod
    def set_volume(cls, new_volume):
        """Set the volume configuration."""
        config = cls.query.first()
        if not config:
            config = cls(volume=new_volume)
            db.session.add(config)
        else:
            old_value = config.volume
            config.volume = new_volume
            ActivityLog.log_config_change(
                config_key="scraper_volume",
                old_value=old_value,
                new_value=new_volume,
                description="Updated scraper volume configuration"
            )
        db.session.commit()
        return config

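Note (example, not part of the diff): the two class methods give callers a single get-or-create path for the volume setting, which is what the slimmed-down _update_volume helper above now relies on. A minimal usage sketch, assuming an active Flask application context:

    from scipaperloader.models import VolumeConfig

    current = VolumeConfig.get_current_volume()  # float; creates the default row (volume=100) if none exists
    config = VolumeConfig.set_volume(150)        # updates the row and logs a "scraper_volume" config change
    print(current, config.volume)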
class DownloadPathConfig(db.Model):
    """Model to store the base path for downloaded files."""
@@ -220,7 +251,7 @@ class DownloadPathConfig(db.Model):
        """Get the configured download path, creating default if needed."""
        config = cls.query.first()
        if not config:
            config = cls(path="/path/to/dummy/papers")  # Ensure default exists
            config = cls(path="/tmp/")  # Ensure default exists
            db.session.add(config)
            db.session.commit()
        return config.path
@@ -341,6 +372,7 @@ def init_schedule_config():
        default_volume = VolumeConfig(volume=100)
        db.session.add(default_volume)
        db.session.commit()

    # Initialize DownloadPathConfig if it doesn't exist
    if DownloadPathConfig.query.count() == 0:
@@ -1,2 +1,18 @@
# This package contains all scraper modules.
# Each scraper should implement the BaseScraper interface from base.py.

from .base import BaseScraper, ScrapeResult
from .factory import get_scraper, get_available_scrapers
from .manager import ScraperManager
from .dummy import Scraper as DummyScraper
from .failed_retry import Scraper as FailedRetryScraper

__all__ = [
    'BaseScraper',
    'ScrapeResult',
    'get_scraper',
    'get_available_scrapers',
    'ScraperManager',
    'DummyScraper',
    'FailedRetryScraper'
]
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import NamedTuple, Optional, Dict
from typing import NamedTuple, Optional, Dict, List
from datetime import datetime

class ScrapeResult(NamedTuple):
@@ -12,6 +12,12 @@ class ScrapeResult(NamedTuple):
class BaseScraper(ABC):
    """Base class for all scraper implementations."""

    # Default input/output statuses - can be overridden by subclasses
    INPUT_STATUSES = ["New"]  # Which paper statuses this scraper will process
    OUTPUT_STATUS_SUCCESS = "Done"  # Status to set on successful scraping
    OUTPUT_STATUS_FAILURE = "Failed"  # Status to set on failed scraping
    OUTPUT_STATUS_PROCESSING = "Pending"  # Status to set while processing

    @abstractmethod
    def scrape(self, doi: str) -> ScrapeResult:
        """
@@ -32,3 +38,15 @@ class BaseScraper(ABC):
    def get_description(self) -> str:
        """Return a description of this scraper."""
        return getattr(self.__class__, "__doc__", "No description available")

    def get_input_statuses(self) -> List[str]:
        """Return list of paper statuses this scraper can process."""
        return self.INPUT_STATUSES

    def get_output_statuses(self) -> Dict[str, str]:
        """Return mapping of result types to output statuses."""
        return {
            "success": self.OUTPUT_STATUS_SUCCESS,
            "failure": self.OUTPUT_STATUS_FAILURE,
            "processing": self.OUTPUT_STATUS_PROCESSING
        }
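Note (example, not part of the diff): a new scraper module only has to subclass BaseScraper, override the status attributes, and implement scrape(); the manager derives paper selection and status transitions from those values. A minimal sketch of a hypothetical module scipaperloader/scrapers/noop.py:

    import time
    from datetime import datetime

    from .base import BaseScraper, ScrapeResult


    class Scraper(BaseScraper):
        """No-op scraper that marks new papers as done without downloading anything."""

        INPUT_STATUSES = ["New"]
        OUTPUT_STATUS_SUCCESS = "Done"
        OUTPUT_STATUS_FAILURE = "Failed"
        OUTPUT_STATUS_PROCESSING = "Pending"

        def scrape(self, doi: str) -> ScrapeResult:
            start_time = time.time()
            # No real work; report success immediately
            return ScrapeResult(
                status="success",
                message=f"No-op scrape for {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow(),
            )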
@@ -10,6 +10,12 @@ from ..db import db
class Scraper(BaseScraper):
    """Dummy scraper for testing purposes that simulates paper downloading."""

    # This scraper processes "New" papers and outputs "Done"/"Failed"
    INPUT_STATUSES = ["New"]
    OUTPUT_STATUS_SUCCESS = "Done"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "Pending"

    def scrape(self, doi: str) -> ScrapeResult:
        """Simulate scraping a paper with realistic timing and random success/failure."""
        start_time = time.time()
@@ -1,5 +1,4 @@
import importlib
from flask import current_app
from .base import BaseScraper

def get_scraper() -> BaseScraper:
@@ -7,10 +6,16 @@ def get_scraper() -> BaseScraper:
    from ..models import ScraperModuleConfig, ActivityLog

    try:
        # Get module name from database first, fallback to config
        # Get module name from database first, fallback to dummy
        name = ScraperModuleConfig.get_current_module()
        if not name:
            name = current_app.config.get("SCRAPER_MODULE", "dummy")
            # Only try to access Flask config if we're in app context
            try:
                from flask import current_app
                name = current_app.config.get("SCRAPER_MODULE", "dummy")
            except RuntimeError:
                # No app context, use dummy
                name = "dummy"

        module = importlib.import_module(f"scipaperloader.scrapers.{name}")
        cls = getattr(module, "Scraper")
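Note (example, not part of the diff): get_scraper() resolves the active module name (database value first, then Flask config, then "dummy") and instantiates that module's Scraper class. A minimal usage sketch with a hypothetical DOI, assuming an application context:

    from scipaperloader.scrapers.factory import get_scraper

    scraper = get_scraper()                     # e.g. the dummy or failed_retry Scraper
    result = scraper.scrape("10.1234/example")  # hypothetical DOI
    print(result.status, result.message)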
scipaperloader/scrapers/failed_retry.py (new file, 123 lines)
@@ -0,0 +1,123 @@
|
||||
import time
|
||||
import random
|
||||
import os
|
||||
from datetime import datetime
|
||||
from .base import BaseScraper, ScrapeResult
|
||||
from flask import current_app
|
||||
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
|
||||
from ..db import db
|
||||
|
||||
class Scraper(BaseScraper):
|
||||
"""Retry scraper that attempts to re-process failed papers with different strategies."""
|
||||
|
||||
# This scraper specifically targets "Failed" papers and retries them
|
||||
INPUT_STATUSES = ["Failed"]
|
||||
OUTPUT_STATUS_SUCCESS = "Done"
|
||||
OUTPUT_STATUS_FAILURE = "Failed"
|
||||
OUTPUT_STATUS_PROCESSING = "Retrying"
|
||||
|
||||
def scrape(self, doi: str) -> ScrapeResult:
|
||||
"""Retry scraping a failed paper with enhanced error handling."""
|
||||
start_time = time.time()
|
||||
|
||||
paper = PaperMetadata.query.filter_by(doi=doi).first()
|
||||
if not paper:
|
||||
return ScrapeResult(
|
||||
status="error",
|
||||
message=f"No paper found for DOI {doi}",
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
# Log retry attempt
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_failed_paper",
|
||||
status="info",
|
||||
description=f"Retrying failed paper: {paper.title}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
|
||||
# Simulate longer processing time for retry (2-5 seconds)
|
||||
processing_time = random.uniform(2, 5)
|
||||
time.sleep(processing_time)
|
||||
|
||||
# Simulate 60% success rate on retry (lower than initial attempt)
|
||||
success = random.random() < 0.6
|
||||
|
||||
result_data = {}
|
||||
|
||||
if success:
|
||||
# Get download path and create dummy file
|
||||
download_path = DownloadPathConfig.get_path()
|
||||
file_name = f"{doi.replace('/', '_')}_retry.pdf"
|
||||
file_path = f"{download_path}/{file_name}"
|
||||
|
||||
try:
|
||||
# Ensure directory exists
|
||||
os.makedirs(download_path, exist_ok=True)
|
||||
|
||||
# Create a dummy PDF file
|
||||
with open(file_path, 'w') as f:
|
||||
f.write(f"Dummy PDF content for retry of {doi}")
|
||||
|
||||
result_data = {"file_path": file_path}
|
||||
|
||||
# Log success
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_scrape_success",
|
||||
status="success",
|
||||
description=f"Successfully retried {doi} on second attempt",
|
||||
paper_id=paper.id
|
||||
)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="success",
|
||||
message=f"Successfully retried paper {doi}",
|
||||
data=result_data,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Failed to save retry file: {str(e)}"
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_scrape_file_error",
|
||||
status="error",
|
||||
description=error_msg,
|
||||
paper_id=paper.id
|
||||
)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
else:
|
||||
# Retry failed - generate different error message
|
||||
error_messages = [
|
||||
"Retry failed: Still no access to publisher",
|
||||
"Retry failed: Alternative download methods exhausted",
|
||||
"Retry failed: DOI appears permanently inaccessible",
|
||||
"Retry failed: Network timeout persists"
|
||||
]
|
||||
error_msg = random.choice(error_messages)
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="retry_scrape_failure",
|
||||
status="error",
|
||||
description=f"Retry failed for {doi}: {error_msg}",
|
||||
paper_id=paper.id
|
||||
)
|
||||
|
||||
result = ScrapeResult(
|
||||
status="error",
|
||||
message=error_msg,
|
||||
data=None,
|
||||
duration=time.time() - start_time,
|
||||
timestamp=datetime.utcnow()
|
||||
)
|
||||
|
||||
return result
|
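Note (example, not part of the diff): the retry scraper plugs into the same status contract as the dummy scraper — it consumes papers in "Failed" status and uses "Retrying" while processing. A minimal sketch via the package exports, assuming an application context and an existing failed paper row; the DOI is hypothetical:

    from scipaperloader.scrapers import FailedRetryScraper

    retry_scraper = FailedRetryScraper()
    print(retry_scraper.get_input_statuses())   # ["Failed"]
    print(retry_scraper.get_output_statuses())  # {"success": "Done", "failure": "Failed", "processing": "Retrying"}
    result = retry_scraper.scrape("10.1234/previously-failed-doi")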
scipaperloader/scrapers/manager.py (new file, 747 lines)
@@ -0,0 +1,747 @@
|
||||
"""
|
||||
Simplified scraper management system with hourly quota scheduling.
|
||||
"""
|
||||
|
||||
import random
|
||||
import math
|
||||
import redis
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Dict, Optional
|
||||
from sqlalchemy import func
|
||||
|
||||
from ..models import (
|
||||
PaperMetadata,
|
||||
ScheduleConfig,
|
||||
VolumeConfig,
|
||||
ScraperState,
|
||||
ActivityLog,
|
||||
ScraperModuleConfig
|
||||
)
|
||||
from ..db import db
|
||||
from ..cache_utils import get_cached_hourly_quota
|
||||
from .factory import get_scraper, get_available_scrapers
|
||||
from ..celery import celery
|
||||
|
||||
|
||||
class ScraperManager:
|
||||
"""Manages scraper operations with hourly quota-based scheduling."""
|
||||
|
||||
def __init__(self):
|
||||
self.current_scraper = None
|
||||
self.pending_papers = [] # Track papers being processed
|
||||
# Initialize Redis client for delayed task management
|
||||
self.redis_client = None
|
||||
self._init_redis_client()
|
||||
|
||||
def _init_redis_client(self):
|
||||
"""Initialize Redis client for delayed task management."""
|
||||
try:
|
||||
# Use same Redis configuration as Celery
|
||||
self.redis_client = redis.Redis(
|
||||
host='localhost',
|
||||
port=6379,
|
||||
db=0,
|
||||
decode_responses=True
|
||||
)
|
||||
# Test connection
|
||||
self.redis_client.ping()
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Failed to initialize Redis client: {str(e)}",
|
||||
source="ScraperManager._init_redis_client"
|
||||
)
|
||||
self.redis_client = None
|
||||
|
||||
def _clear_delayed_tasks_from_redis(self) -> int:
|
||||
"""Clear delayed tasks from Redis structures used by Celery.
|
||||
|
||||
Based on analysis, Celery stores delayed tasks in:
|
||||
- 'unacked_index': Sorted set containing task IDs with execution timestamps
|
||||
- 'unacked': Hash containing task data keyed by task ID
|
||||
|
||||
Returns:
|
||||
int: Number of delayed tasks cleared
|
||||
"""
|
||||
if not self.redis_client:
|
||||
try:
|
||||
ActivityLog.log_error(
|
||||
error_message="Redis client not available - cannot clear delayed tasks",
|
||||
source="ScraperManager._clear_delayed_tasks_from_redis"
|
||||
)
|
||||
except RuntimeError:
|
||||
# Working outside application context - just print instead
|
||||
print("❌ Redis client not available - cannot clear delayed tasks")
|
||||
return 0
|
||||
|
||||
cleared_count = 0
|
||||
try:
|
||||
# Define scraper task patterns to identify our tasks
|
||||
scraper_patterns = [
|
||||
'process_single_paper',
|
||||
'process_papers_batch',
|
||||
'hourly_scraper_scheduler'
|
||||
]
|
||||
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="check_delayed_tasks",
|
||||
status="info",
|
||||
description="Checking Celery delayed task structures (unacked_index, unacked)"
|
||||
)
|
||||
except RuntimeError:
|
||||
print("🔍 Checking Celery delayed task structures (unacked_index, unacked)")
|
||||
|
||||
# Check 'unacked_index' (sorted set with task IDs and timestamps)
|
||||
unacked_index_cleared = 0
|
||||
if self.redis_client.exists('unacked_index'):
|
||||
try:
|
||||
# Get all task IDs from the sorted set
|
||||
task_ids = self.redis_client.zrange('unacked_index', 0, -1)
|
||||
|
||||
if task_ids:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="scan_unacked_index",
|
||||
status="info",
|
||||
description=f"Found {len(task_ids)} tasks in 'unacked_index'"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"📋 Found {len(task_ids)} tasks in 'unacked_index'")
|
||||
|
||||
# Check each task ID against the 'unacked' hash to get task details
|
||||
scraper_task_ids = []
|
||||
for task_id in task_ids:
|
||||
try:
|
||||
# Get task data from 'unacked' hash
|
||||
task_data = self.redis_client.hget('unacked', task_id)
|
||||
if task_data:
|
||||
# Check if this task contains any of our scraper patterns
|
||||
if any(pattern in str(task_data) for pattern in scraper_patterns):
|
||||
scraper_task_ids.append(task_id)
|
||||
except Exception:
|
||||
# Skip individual task errors
|
||||
continue
|
||||
|
||||
# Remove scraper task IDs from both structures
|
||||
for task_id in scraper_task_ids:
|
||||
try:
|
||||
# Remove from unacked_index (sorted set)
|
||||
removed_from_index = self.redis_client.zrem('unacked_index', task_id)
|
||||
# Remove from unacked (hash)
|
||||
removed_from_hash = self.redis_client.hdel('unacked', task_id)
|
||||
|
||||
if removed_from_index or removed_from_hash:
|
||||
unacked_index_cleared += 1
|
||||
|
||||
except Exception as e:
|
||||
try:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error removing delayed task {task_id}: {str(e)}",
|
||||
source="ScraperManager._clear_delayed_tasks_from_redis"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"❌ Error removing delayed task {task_id}: {str(e)}")
|
||||
continue
|
||||
|
||||
cleared_count += unacked_index_cleared
|
||||
|
||||
if unacked_index_cleared > 0:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="clear_unacked_tasks",
|
||||
status="success",
|
||||
description=f"Cleared {unacked_index_cleared} scraper tasks from unacked structures"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"✅ Cleared {unacked_index_cleared} scraper tasks from unacked structures")
|
||||
else:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="check_unacked_index",
|
||||
status="info",
|
||||
description="No tasks found in 'unacked_index'"
|
||||
)
|
||||
except RuntimeError:
|
||||
print("ℹ️ No tasks found in 'unacked_index'")
|
||||
|
||||
except Exception as e:
|
||||
try:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error accessing 'unacked_index': {str(e)}",
|
||||
source="ScraperManager._clear_delayed_tasks_from_redis"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"❌ Error accessing 'unacked_index': {str(e)}")
|
||||
else:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="check_unacked_index",
|
||||
status="info",
|
||||
description="'unacked_index' key does not exist - no delayed tasks"
|
||||
)
|
||||
except RuntimeError:
|
||||
print("ℹ️ 'unacked_index' key does not exist - no delayed tasks")
|
||||
|
||||
# Also check the 'celery' queue for immediate tasks (backup check)
|
||||
celery_cleared = 0
|
||||
try:
|
||||
queue_length = self.redis_client.llen('celery')
|
||||
if queue_length and queue_length > 0:
|
||||
# Scan for any scraper tasks in the immediate queue
|
||||
scraper_tasks = []
|
||||
for i in range(queue_length):
|
||||
try:
|
||||
task_data = self.redis_client.lindex('celery', i)
|
||||
if task_data and any(pattern in str(task_data) for pattern in scraper_patterns):
|
||||
scraper_tasks.append(task_data)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Remove scraper tasks from celery queue
|
||||
for task_data in scraper_tasks:
|
||||
try:
|
||||
removed_count = self.redis_client.lrem('celery', 0, task_data)
|
||||
celery_cleared += removed_count
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
cleared_count += celery_cleared
|
||||
|
||||
if celery_cleared > 0:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="clear_celery_tasks",
|
||||
status="success",
|
||||
description=f"Cleared {celery_cleared} scraper tasks from 'celery' queue"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"✅ Cleared {celery_cleared} scraper tasks from 'celery' queue")
|
||||
|
||||
except Exception as e:
|
||||
try:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error checking 'celery' queue: {str(e)}",
|
||||
source="ScraperManager._clear_delayed_tasks_from_redis"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"❌ Error checking 'celery' queue: {str(e)}")
|
||||
|
||||
# Summary
|
||||
if cleared_count > 0:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="clear_delayed_tasks_complete",
|
||||
status="success",
|
||||
description=f"Total delayed scraper tasks cleared from Redis: {cleared_count} (unacked: {unacked_index_cleared}, celery: {celery_cleared})"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"✅ Total delayed scraper tasks cleared from Redis: {cleared_count} (unacked: {unacked_index_cleared}, celery: {celery_cleared})")
|
||||
else:
|
||||
try:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="clear_delayed_tasks_complete",
|
||||
status="info",
|
||||
description="No delayed scraper tasks found to clear in Redis"
|
||||
)
|
||||
except RuntimeError:
|
||||
print("ℹ️ No delayed scraper tasks found to clear in Redis")
|
||||
|
||||
return cleared_count
|
||||
|
||||
except Exception as e:
|
||||
try:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Failed to clear delayed tasks from Redis: {str(e)}",
|
||||
source="ScraperManager._clear_delayed_tasks_from_redis"
|
||||
)
|
||||
except RuntimeError:
|
||||
print(f"❌ Failed to clear delayed tasks from Redis: {str(e)}")
|
||||
return 0
|
||||
|
||||
def start_scraper(self) -> Dict[str, str]:
|
||||
"""Start the scraper system."""
|
||||
try:
|
||||
# Get current scraper
|
||||
self.current_scraper = get_scraper()
|
||||
|
||||
# Activate scraper state
|
||||
ScraperState.set_active(True)
|
||||
ScraperState.set_paused(False)
|
||||
|
||||
scraper_name = self.current_scraper.get_name()
|
||||
|
||||
ActivityLog.log_scraper_command(
|
||||
action="start_scraper",
|
||||
status="success",
|
||||
description=f"Started scraper: {scraper_name}. Use /trigger-immediate endpoint to immediately schedule papers instead of waiting for the next hourly boundary."
|
||||
)
|
||||
|
||||
return {"status": "success", "message": "Scraper started successfully. Papers will be scheduled at the next hourly boundary, or use /trigger-immediate to schedule immediately."}
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Failed to start scraper: {str(e)}",
|
||||
source="ScraperManager.start_scraper"
|
||||
)
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
def pause_scraper(self) -> Dict[str, str]:
|
||||
"""Pause the scraper system."""
|
||||
try:
|
||||
ScraperState.set_paused(True)
|
||||
|
||||
ActivityLog.log_scraper_command(
|
||||
action="pause_scraper",
|
||||
status="success",
|
||||
description="Scraper paused - processing will halt"
|
||||
)
|
||||
|
||||
return {"status": "success", "message": "Scraper paused"}
|
||||
|
||||
except Exception as e:
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
def resume_scraper(self) -> Dict[str, str]:
|
||||
"""Resume the scraper system."""
|
||||
try:
|
||||
ScraperState.set_paused(False)
|
||||
|
||||
ActivityLog.log_scraper_command(
|
||||
action="resume_scraper",
|
||||
status="success",
|
||||
description="Scraper resumed - processing will continue"
|
||||
)
|
||||
|
||||
return {"status": "success", "message": "Scraper resumed"}
|
||||
|
||||
except Exception as e:
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
def stop_scraper(self) -> Dict[str, str]:
|
||||
"""Stop the scraper, revoke all running tasks, and revert pending papers."""
|
||||
try:
|
||||
# First, revoke all running tasks
|
||||
revoked_count = 0
|
||||
delayed_cleared_count = 0
|
||||
|
||||
ActivityLog.log_scraper_command(
|
||||
action="stop_scraper_start",
|
||||
status="info",
|
||||
description="Beginning scraper stop process with task revocation and delayed task clearing"
|
||||
)
|
||||
|
||||
try:
|
||||
# Get Celery inspector to check for running tasks
|
||||
i = celery.control.inspect()
|
||||
active = i.active() or {}
|
||||
scheduled = i.scheduled() or {}
|
||||
reserved = i.reserved() or {}
|
||||
|
||||
# Revoke active tasks
|
||||
for worker, tasks in active.items():
|
||||
for task in tasks:
|
||||
if 'id' in task:
|
||||
celery.control.revoke(task['id'], terminate=True)
|
||||
revoked_count += 1
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="revoke_task",
|
||||
status="success",
|
||||
description=f"Revoked active task: {task.get('name', 'unknown')} (ID: {task['id']})"
|
||||
)
|
||||
|
||||
# Revoke scheduled tasks
|
||||
for worker, tasks in scheduled.items():
|
||||
for task in tasks:
|
||||
if 'id' in task:
|
||||
celery.control.revoke(task['id'], terminate=True)
|
||||
revoked_count += 1
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="revoke_task",
|
||||
status="success",
|
||||
description=f"Revoked scheduled task: {task.get('name', 'unknown')} (ID: {task['id']})"
|
||||
)
|
||||
|
||||
# Revoke reserved tasks
|
||||
for worker, tasks in reserved.items():
|
||||
for task in tasks:
|
||||
if 'id' in task:
|
||||
celery.control.revoke(task['id'], terminate=True)
|
||||
revoked_count += 1
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="revoke_task",
|
||||
status="success",
|
||||
description=f"Revoked reserved task: {task.get('name', 'unknown')} (ID: {task['id']})"
|
||||
)
|
||||
|
||||
# Purge all task queues
|
||||
celery.control.purge()
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="purge_queues",
|
||||
status="success",
|
||||
description="Purged all task queues"
|
||||
)
|
||||
|
||||
# **NEW: Clear delayed tasks from Redis sorted sets**
|
||||
delayed_cleared_count = self._clear_delayed_tasks_from_redis()
|
||||
|
||||
# Additional cleanup: revoke any remaining scraper-related tasks by name pattern
|
||||
try:
|
||||
# Use broadcast to revoke tasks that match scraper patterns
|
||||
scraper_task_patterns = [
|
||||
'process_single_paper',
|
||||
'process_papers_batch',
|
||||
'hourly_scraper_scheduler'
|
||||
]
|
||||
|
||||
# Get a fresh inspection of tasks after purge
|
||||
fresh_inspect = celery.control.inspect()
|
||||
all_tasks = {}
|
||||
all_tasks.update(fresh_inspect.active() or {})
|
||||
all_tasks.update(fresh_inspect.scheduled() or {})
|
||||
all_tasks.update(fresh_inspect.reserved() or {})
|
||||
|
||||
additional_revoked = 0
|
||||
for worker, tasks in all_tasks.items():
|
||||
for task in tasks:
|
||||
task_name = task.get('name', '')
|
||||
task_id = task.get('id', '')
|
||||
if any(pattern in task_name for pattern in scraper_task_patterns) and task_id:
|
||||
celery.control.revoke(task_id, terminate=True)
|
||||
additional_revoked += 1
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="revoke_scraper_task",
|
||||
status="success",
|
||||
description=f"Revoked lingering scraper task: {task_name} (ID: {task_id})"
|
||||
)
|
||||
|
||||
if additional_revoked > 0:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="cleanup_scraper_tasks",
|
||||
status="success",
|
||||
description=f"Additional cleanup: revoked {additional_revoked} lingering scraper tasks"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error during additional scraper task cleanup: {str(e)}",
|
||||
source="ScraperManager.stop_scraper.cleanup"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error revoking tasks: {str(e)}",
|
||||
source="ScraperManager.stop_scraper"
|
||||
)
|
||||
# Continue with paper reversion even if task revocation fails
|
||||
|
||||
# Get current scraper to know what status to revert to
|
||||
scraper = get_scraper()
|
||||
input_statuses = scraper.get_input_statuses()
|
||||
|
||||
# Find papers that are currently being processed
|
||||
processing_status = scraper.get_output_statuses()["processing"]
|
||||
pending_papers = PaperMetadata.query.filter_by(status=processing_status).all()
|
||||
|
||||
# Revert their status to the first input status
|
||||
reverted_count = 0
|
||||
if pending_papers and input_statuses:
|
||||
revert_status = input_statuses[0] # Use first input status as default
|
||||
|
||||
for paper in pending_papers:
|
||||
# Try to use previous_status if available, otherwise use first input status
|
||||
if hasattr(paper, 'previous_status') and paper.previous_status:
|
||||
paper.status = paper.previous_status
|
||||
else:
|
||||
paper.status = revert_status
|
||||
paper.updated_at = datetime.utcnow()
|
||||
reverted_count += 1
|
||||
|
||||
db.session.commit()
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="revert_pending_papers",
|
||||
status="success",
|
||||
description=f"Reverted {reverted_count} papers from '{processing_status}' to previous status"
|
||||
)
|
||||
|
||||
# Deactivate scraper
|
||||
ScraperState.set_active(False)
|
||||
ScraperState.set_paused(False)
|
||||
|
||||
ActivityLog.log_scraper_command(
|
||||
action="stop_scraper",
|
||||
status="success",
|
||||
description=f"Scraper stopped. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers."
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "success",
|
||||
"message": f"Scraper stopped. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers to previous status."
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Failed to stop scraper: {str(e)}",
|
||||
source="ScraperManager.stop_scraper"
|
||||
)
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
def reset_scraper(self) -> Dict[str, str]:
|
||||
"""Reset scraper state, revoke all running tasks, and clear all processing statuses."""
|
||||
try:
|
||||
# First, revoke all running tasks (similar to stop_scraper)
|
||||
revoked_count = 0
|
||||
|
||||
ActivityLog.log_scraper_command(
|
||||
action="reset_scraper_start",
|
||||
status="info",
|
||||
description="Beginning scraper reset process with task revocation"
|
||||
)
|
||||
|
||||
try:
|
||||
# Get Celery inspector to check for running tasks
|
||||
i = celery.control.inspect()
|
||||
active = i.active() or {}
|
||||
scheduled = i.scheduled() or {}
|
||||
reserved = i.reserved() or {}
|
||||
|
||||
# Revoke all tasks (active, scheduled, reserved)
|
||||
for queue_name, queue_tasks in [("active", active), ("scheduled", scheduled), ("reserved", reserved)]:
|
||||
for worker, tasks in queue_tasks.items():
|
||||
for task in tasks:
|
||||
if 'id' in task:
|
||||
celery.control.revoke(task['id'], terminate=True)
|
||||
revoked_count += 1
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="revoke_task",
|
||||
status="success",
|
||||
description=f"Revoked {queue_name} task: {task.get('name', 'unknown')} (ID: {task['id']})"
|
||||
)
|
||||
|
||||
# Purge all task queues
|
||||
celery.control.purge()
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="purge_queues",
|
||||
status="success",
|
||||
description="Purged all task queues during reset"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error revoking tasks during reset: {str(e)}",
|
||||
source="ScraperManager.reset_scraper"
|
||||
)
|
||||
# Continue with paper reversion even if task revocation fails
|
||||
|
||||
# Get current scraper configuration
|
||||
scraper = get_scraper()
|
||||
input_statuses = scraper.get_input_statuses()
|
||||
processing_status = scraper.get_output_statuses()["processing"]
|
||||
|
||||
# Reset all papers in processing status
|
||||
pending_papers = PaperMetadata.query.filter_by(status=processing_status).all()
|
||||
reverted_count = 0
|
||||
|
||||
if pending_papers and input_statuses:
|
||||
revert_status = input_statuses[0]
|
||||
|
||||
for paper in pending_papers:
|
||||
# Try to use previous_status if available, otherwise use first input status
|
||||
if hasattr(paper, 'previous_status') and paper.previous_status:
|
||||
paper.status = paper.previous_status
|
||||
else:
|
||||
paper.status = revert_status
|
||||
paper.updated_at = datetime.utcnow()
|
||||
paper.error_msg = None # Clear any error messages
|
||||
reverted_count += 1
|
||||
|
||||
db.session.commit()
|
||||
|
||||
# Reset scraper state
|
||||
ScraperState.set_active(False)
|
||||
ScraperState.set_paused(False)
|
||||
|
||||
ActivityLog.log_scraper_command(
|
||||
action="reset_scraper",
|
||||
status="success",
|
||||
description=f"Scraper reset. Revoked {revoked_count} tasks and reverted {reverted_count} papers."
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "success",
|
||||
"message": f"Scraper reset. Revoked {revoked_count} tasks and reverted {reverted_count} papers to original status."
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
def get_current_hour_quota(self) -> int:
|
||||
"""Calculate papers to process in current hour based on schedule."""
|
||||
try:
|
||||
return get_cached_hourly_quota(self._calculate_papers_for_current_hour)
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error calculating hourly quota: {str(e)}",
|
||||
source="ScraperManager.get_current_hour_quota"
|
||||
)
|
||||
return 0
|
||||
|
||||
def _calculate_papers_for_current_hour(self) -> int:
|
||||
"""Internal method to calculate hourly quota."""
|
||||
try:
|
||||
# Get current hour and volume config
|
||||
current_hour = datetime.now().hour
|
||||
volume_config = VolumeConfig.get_current_volume()
|
||||
daily_volume = volume_config if volume_config else 100
|
||||
|
||||
# Get schedule config for current hour
|
||||
schedule_config = ScheduleConfig.query.filter_by(hour=current_hour).first()
|
||||
current_weight = schedule_config.weight if schedule_config else 1.0
|
||||
|
||||
# Get total weight across all hours
|
||||
total_weight = db.session.query(func.sum(ScheduleConfig.weight)).scalar() or 24.0
|
||||
|
||||
# Calculate quota: (current_weight / total_weight) * daily_volume
|
||||
quota = math.ceil((current_weight / total_weight) * daily_volume)
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="calculate_hourly_quota",
|
||||
status="info",
|
||||
description=f"Hour {current_hour}: quota={quota} (weight={current_weight}, total_weight={total_weight}, daily_volume={daily_volume})"
|
||||
)
|
||||
|
||||
return max(1, quota) # Ensure at least 1 paper per hour
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error in quota calculation: {str(e)}",
|
||||
source="ScraperManager._calculate_papers_for_current_hour"
|
||||
)
|
||||
return 1 # Fallback to 1 paper per hour
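Note (worked example, not part of the diff): with a daily volume of 100 papers, a total schedule weight of 24.0 and a weight of 2.0 for the current hour, the quota is ceil((2.0 / 24.0) * 100) = 9 papers for this hour (values are hypothetical):

    import math

    daily_volume, total_weight, current_weight = 100, 24.0, 2.0
    quota = max(1, math.ceil((current_weight / total_weight) * daily_volume))  # -> 9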
|
||||
|
||||
def select_papers_for_processing(self, limit: Optional[int] = None) -> List[PaperMetadata]:
|
||||
"""Select papers for processing based on current scraper configuration."""
|
||||
try:
|
||||
scraper = get_scraper()
|
||||
input_statuses = scraper.get_input_statuses()
|
||||
|
||||
if not input_statuses:
|
||||
return []
|
||||
|
||||
# Use provided limit or calculate from hourly quota
|
||||
papers_needed = limit if limit is not None else self.get_current_hour_quota()
|
||||
|
||||
# Query papers with input statuses, randomize selection
|
||||
papers = (PaperMetadata.query
|
||||
.filter(PaperMetadata.status.in_(input_statuses))
|
||||
.order_by(func.random())
|
||||
.limit(papers_needed)
|
||||
.all())
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="select_papers",
|
||||
status="info",
|
||||
description=f"Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})"
|
||||
)
|
||||
|
||||
return papers
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error selecting papers: {str(e)}",
|
||||
source="ScraperManager.select_papers_for_processing"
|
||||
)
|
||||
return []
|
||||
|
||||
def process_paper(self, paper: PaperMetadata) -> Dict:
|
||||
"""Process a single paper using the current scraper."""
|
||||
try:
|
||||
scraper = get_scraper()
|
||||
output_statuses = scraper.get_output_statuses()
|
||||
|
||||
# Store the previous status before changing it
|
||||
previous_status = paper.status
|
||||
|
||||
# Update paper status to processing
|
||||
paper.previous_status = previous_status
|
||||
paper.status = output_statuses["processing"]
|
||||
paper.updated_at = datetime.utcnow()
|
||||
db.session.commit()
|
||||
|
||||
# Perform scraping
|
||||
result = scraper.scrape(paper.doi)
|
||||
|
||||
# Update paper status based on result
|
||||
if result.status == "success":
|
||||
paper.status = output_statuses["success"]
|
||||
paper.error_msg = None
|
||||
if result.data and "file_path" in result.data:
|
||||
paper.file_path = result.data["file_path"]
|
||||
else:
|
||||
paper.status = output_statuses["failure"]
|
||||
paper.error_msg = result.message
|
||||
|
||||
paper.updated_at = datetime.utcnow()
|
||||
db.session.commit()
|
||||
|
||||
# Log result
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_paper",
|
||||
paper_id=paper.id,
|
||||
status=result.status,
|
||||
description=f"Processed {paper.doi}: {result.message}"
|
||||
)
|
||||
|
||||
return {
|
||||
"paper_id": paper.id,
|
||||
"status": result.status,
|
||||
"message": result.message,
|
||||
"duration": result.duration
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
# Revert paper status on error
|
||||
try:
|
||||
input_statuses = get_scraper().get_input_statuses()
|
||||
if input_statuses:
|
||||
paper.status = input_statuses[0]
|
||||
paper.error_msg = f"Processing error: {str(e)}"
|
||||
paper.updated_at = datetime.utcnow()
|
||||
db.session.commit()
|
||||
except:
|
||||
pass # Don't fail if reversion fails
|
||||
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error processing paper {paper.id}: {str(e)}",
|
||||
source="ScraperManager.process_paper"
|
||||
)
|
||||
|
||||
return {"paper_id": paper.id, "status": "error", "message": str(e)}
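Note (example, not part of the diff): process_paper drives one full status round-trip — previous_status is recorded, the paper moves to the scraper's processing status, and ends in the success or failure status. A minimal sketch of processing one small selection synchronously, assuming an application context:

    from scipaperloader.scrapers.manager import ScraperManager

    manager = ScraperManager()
    for paper in manager.select_papers_for_processing(limit=5):
        outcome = manager.process_paper(paper)
        print(outcome["paper_id"], outcome["status"], outcome["message"])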
|
||||
|
||||
def get_status(self) -> Dict:
|
||||
"""Get current scraper status."""
|
||||
scraper_state = ScraperState.get_current_state()
|
||||
scraper = get_scraper()
|
||||
|
||||
# Count papers by status
|
||||
input_statuses = scraper.get_input_statuses()
|
||||
output_statuses = scraper.get_output_statuses()
|
||||
|
||||
available_count = (PaperMetadata.query
|
||||
.filter(PaperMetadata.status.in_(input_statuses))
|
||||
.count())
|
||||
|
||||
processing_count = (PaperMetadata.query
|
||||
.filter_by(status=output_statuses["processing"])
|
||||
.count())
|
||||
|
||||
return {
|
||||
"active": scraper_state.is_active,
|
||||
"paused": scraper_state.is_paused,
|
||||
"current_scraper": scraper.get_name(),
|
||||
"input_statuses": input_statuses,
|
||||
"output_statuses": output_statuses,
|
||||
"available_papers": available_count,
|
||||
"processing_papers": processing_count,
|
||||
"current_hour_quota": self.get_current_hour_quota()
|
||||
}
|
scipaperloader/scrapers/tasks.py (new file, 189 lines)
@@ -0,0 +1,189 @@
|
||||
"""
|
||||
Hourly scheduler task that processes papers at random times within each hour.
|
||||
"""
|
||||
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional
|
||||
from celery import shared_task
|
||||
|
||||
from ..models import ScraperState, ActivityLog
|
||||
from .manager import ScraperManager
|
||||
|
||||
|
||||
@shared_task(bind=True)
|
||||
def hourly_scraper_scheduler(self):
|
||||
"""
|
||||
Hourly task that schedules paper processing at random times within the hour.
|
||||
|
||||
This task runs at the beginning of each hour and:
|
||||
1. Calculates how many papers to process this hour
|
||||
2. Schedules individual paper processing tasks at random times within the hour
|
||||
"""
|
||||
try:
|
||||
# Check if scraper is active
|
||||
scraper_state = ScraperState.get_current_state()
|
||||
if not scraper_state.is_active:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="hourly_scheduler",
|
||||
status="info",
|
||||
description="Hourly scheduler skipped - scraper not active"
|
||||
)
|
||||
# Disable retries for inactive scheduler
|
||||
self.retry = False
|
||||
return {"status": "inactive", "papers_scheduled": 0}
|
||||
|
||||
if scraper_state.is_paused:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="hourly_scheduler",
|
||||
status="info",
|
||||
description="Hourly scheduler skipped - scraper paused"
|
||||
)
|
||||
# Disable retries for paused scheduler
|
||||
self.retry = False
|
||||
return {"status": "paused", "papers_scheduled": 0}
|
||||
|
||||
# Initialize scraper manager
|
||||
manager = ScraperManager()
|
||||
|
||||
# Get papers to process this hour
|
||||
papers = manager.select_papers_for_processing()
|
||||
|
||||
if not papers:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="hourly_scheduler",
|
||||
status="info",
|
||||
description="No papers available for processing this hour"
|
||||
)
|
||||
return {"status": "empty", "papers_scheduled": 0}
|
||||
|
||||
# Schedule papers at random times within the hour (0-3600 seconds)
|
||||
scheduled_count = 0
|
||||
current_time = datetime.now()
|
||||
|
||||
for paper in papers:
|
||||
# Random delay between 1 second and 58 minutes
|
||||
delay_seconds = random.randint(1, 3480) # Up to 58 minutes
|
||||
|
||||
# Schedule the task using Celery's task registry to avoid circular import issues
|
||||
from ..celery import celery
|
||||
celery.send_task(
|
||||
'scipaperloader.scrapers.tasks.process_single_paper',
|
||||
args=[paper.id],
|
||||
countdown=delay_seconds
|
||||
)
|
||||
|
||||
scheduled_count += 1
|
||||
|
||||
# Log each scheduled paper
|
||||
schedule_time = current_time + timedelta(seconds=delay_seconds)
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="schedule_paper",
|
||||
paper_id=paper.id,
|
||||
status="info",
|
||||
description=f"Scheduled paper {paper.doi} for processing at {schedule_time.strftime('%H:%M:%S')}"
|
||||
)
|
||||
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="hourly_scheduler",
|
||||
status="success",
|
||||
description=f"Scheduled {scheduled_count} papers for random processing within this hour"
|
||||
)
|
||||
|
||||
return {"status": "success", "papers_scheduled": scheduled_count}
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Hourly scheduler error: {str(e)}",
|
||||
source="hourly_scraper_scheduler"
|
||||
)
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
|
||||
@shared_task(bind=True)
|
||||
def process_single_paper(self, paper_id: int):
|
||||
"""
|
||||
Process a single paper. This task is scheduled at random times within each hour.
|
||||
|
||||
Args:
|
||||
paper_id: ID of the paper to process
|
||||
"""
|
||||
try:
|
||||
# Double-check scraper state before processing
|
||||
scraper_state = ScraperState.get_current_state()
|
||||
if not scraper_state.is_active:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_single_paper",
|
||||
paper_id=paper_id,
|
||||
status="skipped",
|
||||
description="Skipped processing - scraper not active"
|
||||
)
|
||||
# Use Celery's ignore to mark this task as completed without error
|
||||
self.retry = False
|
||||
return {"status": "inactive", "paper_id": paper_id}
|
||||
|
||||
if scraper_state.is_paused:
|
||||
ActivityLog.log_scraper_activity(
|
||||
action="process_single_paper",
|
||||
paper_id=paper_id,
|
||||
status="skipped",
|
||||
description="Skipped processing - scraper paused"
|
||||
)
|
||||
# Use Celery's ignore for paused state too
|
||||
self.retry = False
|
||||
return {"status": "paused", "paper_id": paper_id}
|
||||
|
||||
# Get the paper
|
||||
from ..models import PaperMetadata
|
||||
paper = PaperMetadata.query.get(paper_id)
|
||||
if not paper:
|
||||
return {"status": "error", "message": f"Paper {paper_id} not found"}
|
||||
|
||||
# Process the paper using scraper manager
|
||||
manager = ScraperManager()
|
||||
result = manager.process_paper(paper)
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error processing paper {paper_id}: {str(e)}",
|
||||
source="process_single_paper"
|
||||
)
|
||||
return {"status": "error", "paper_id": paper_id, "message": str(e)}
|
||||
|
||||
|
||||
@shared_task(bind=True)
|
||||
def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] = None):
|
||||
"""
|
||||
Process multiple papers in a batch for immediate processing.
|
||||
|
||||
Args:
|
||||
paper_ids: List of paper IDs to process
|
||||
scraper_module: Optional specific scraper module to use
|
||||
"""
|
||||
try:
|
||||
results = []
|
||||
manager = ScraperManager()
|
||||
|
||||
for paper_id in paper_ids:
|
||||
from ..models import PaperMetadata
|
||||
paper = PaperMetadata.query.get(paper_id)
|
||||
if paper:
|
||||
result = manager.process_paper(paper)
|
||||
results.append(result)
|
||||
else:
|
||||
results.append({
|
||||
"paper_id": paper_id,
|
||||
"status": "error",
|
||||
"message": "Paper not found"
|
||||
})
|
||||
|
||||
return {"results": results, "total_processed": len(results)}
|
||||
|
||||
except Exception as e:
|
||||
ActivityLog.log_error(
|
||||
error_message=f"Error processing batch: {str(e)}",
|
||||
source="process_papers_batch"
|
||||
)
|
||||
return {"status": "error", "message": str(e)}
|
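Note (example, not part of the diff): process_papers_batch backs the immediate-processing path and can be dispatched by task name, just as the hourly scheduler dispatches process_single_paper above. A minimal sketch; the paper IDs are hypothetical:

    from scipaperloader.celery import celery

    async_result = celery.send_task(
        'scipaperloader.scrapers.tasks.process_papers_batch',
        args=[[101, 102, 103]],   # list of paper IDs
    )
    print(async_result.id)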
@ -29,6 +29,11 @@
|
||||
height: 400px;
|
||||
}
|
||||
|
||||
.chart-wrapper {
|
||||
position: relative;
|
||||
height: 400px;
|
||||
}
|
||||
|
||||
.notification {
|
||||
position: fixed;
|
||||
bottom: 20px;
|
||||
@ -100,132 +105,137 @@
|
||||
<div class="form-group">
|
||||
<label for="volumeInput">Papers per day:</label>
|
||||
<input type="number" class="form-control" id="volumeInput"
|
||||
value="{{ volume_config.volume if volume_config else 100 }}" min="1"
|
||||
max="{{ max_volume }}">
|
||||
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
|
||||
value="{{ volume_config if volume_config else 100 }}" min="1" max="{{ max_volume }}">
|
||||
<button type="submit" class="btn btn-primary mt-2">
|
||||
<i class="fas fa-save"></i> Update Volume
|
||||
</button>
|
||||
</div>
|
||||
<button type="submit" class="btn btn-primary mt-2">Update Volume</button>
|
||||
</form>
|
||||
<div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- New row for single paper processing -->
|
||||
<div class="row mb-4">
|
||||
<div class="col-12">
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h5>Process Single Paper</h5>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="row">
|
||||
<div class="col-md-6">
|
||||
<form id="searchPaperForm" class="mb-3">
|
||||
<div class="input-group">
|
||||
<input type="text" id="paperSearchInput" class="form-control"
|
||||
placeholder="Search paper by title, DOI, or ID...">
|
||||
<button class="btn btn-outline-secondary" type="submit">Search</button>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
<div class="col-md-6">
|
||||
<div class="form-group">
|
||||
<label for="scraperSelect">Scraper Module:</label>
|
||||
<select class="form-control" id="scraperSelect">
|
||||
<option value="">Use default system scraper</option>
|
||||
<!-- Available scrapers will be populated here -->
|
||||
</select>
|
||||
<div class="form-text">
|
||||
Select which scraper to use for processing the paper
|
||||
</div>
|
||||
<!-- New row for single paper processing -->
|
||||
<div class="row mb-4">
|
||||
<div class="col-12">
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h5>Process Single Paper</h5>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="row">
|
||||
<div class="col-md-6">
|
||||
<form id="searchPaperForm" class="mb-3">
|
||||
<div class="input-group">
|
||||
<input type="text" id="paperSearchInput" class="form-control"
|
||||
placeholder="Search paper by title, DOI, or ID...">
|
||||
<button class="btn btn-outline-secondary" type="submit">Search</button>
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
<div class="col-md-6">
|
||||
<div class="form-group">
|
||||
<label for="scraperSelect">Scraper Module:</label>
|
||||
<select class="form-control" id="scraperSelect">
|
||||
<option value="">Use default system scraper</option>
|
||||
<!-- Available scrapers will be populated here -->
|
||||
</select>
|
||||
<div class="form-text">
|
||||
Select which scraper to use for processing the paper
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="searchResults" class="mt-3 search-results-container d-none">
|
||||
<table class="table table-hover table-striped">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>ID</th>
|
||||
<th>Title</th>
|
||||
<th>DOI</th>
|
||||
<th>Status</th>
|
||||
<th>Actions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="paperSearchResults">
|
||||
<!-- Search results will be populated here -->
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<div id="processingStatus" class="alert alert-info mt-3 d-none"></div>
|
||||
</div>
|
||||
|
||||
<div id="searchResults" class="mt-3 search-results-container d-none">
|
||||
<table class="table table-hover table-striped">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>ID</th>
|
||||
<th>Title</th>
|
||||
<th>DOI</th>
|
||||
<th>Status</th>
|
||||
<th>Actions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="paperSearchResults">
|
||||
<!-- Search results will be populated here -->
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<div id="processingStatus" class="alert alert-info mt-3 d-none"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mb-4">
|
||||
<div class="col-12">
|
||||
<div class="card">
|
||||
<div class="card-header d-flex justify-content-between align-items-center">
|
||||
<h5>Scraping Activity</h5>
|
||||
<div>
|
||||
<div class="form-check form-switch">
|
||||
<input class="form-check-input" type="checkbox" id="notificationsToggle" checked>
|
||||
<label class="form-check-label" for="notificationsToggle">Show Notifications</label>
|
||||
</div>
|
||||
<div class="row mb-4">
|
||||
<div class="col-12">
|
||||
<div class="card">
|
||||
<div class="card-header d-flex justify-content-between align-items-center">
|
||||
<h5>Scraping Activity</h5>
|
||||
<div>
|
||||
<div class="form-check form-switch">
|
||||
<input class="form-check-input" type="checkbox" id="notificationsToggle" checked>
|
||||
<label class="form-check-label" for="notificationsToggle">Show Notifications</label>
|
||||
</div>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="btn-group mb-3">
|
||||
<button class="btn btn-outline-secondary time-range-btn" data-hours="6">Last 6
|
||||
hours</button>
|
||||
<button class="btn btn-outline-secondary time-range-btn active" data-hours="24">Last 24
|
||||
hours</button>
|
||||
<button class="btn btn-outline-secondary time-range-btn" data-hours="72">Last 3
|
||||
days</button>
|
||||
</div>
|
||||
<div class="stats-chart" id="activityChart"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mb-4">
|
||||
<div class="col-12">
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h5>Recent Activity</h5>
|
||||
<div class="card-body">
|
||||
<div class="btn-group mb-3">
|
||||
<button class="btn btn-outline-secondary time-range-btn" data-hours="6">Last 6
|
||||
hours</button>
|
||||
<button class="btn btn-outline-secondary time-range-btn active" data-hours="24">Last 24
|
||||
hours</button>
|
||||
<button class="btn btn-outline-secondary time-range-btn" data-hours="72">Last 3
|
||||
days</button>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="table-responsive">
|
||||
<table class="table table-striped">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Time</th>
|
||||
<th>Action</th>
|
||||
<th>Status</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="activityLog">
|
||||
<tr>
|
||||
<td colspan="4" class="text-center">Loading activities...</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
<div class="chart-wrapper">
|
||||
<canvas id="activityChart"></canvas>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row mb-4">
|
||||
<div class="col-12">
|
||||
<div class="card">
|
||||
<div class="card-header">
|
||||
<h5>Recent Activity</h5>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="table-responsive">
|
||||
<table class="table table-striped">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Time</th>
|
||||
<th>Action</th>
|
||||
<th>Status</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="activityLog">
|
||||
<tr>
|
||||
<td colspan="4" class="text-center">Loading activities...</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endblock content %}
|
||||
|
||||
{% block scripts %}
|
||||
{{ super() }}
|
||||
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||
<script>
|
||||
// Global variables for the scraper dashboard
|
||||
let notificationsEnabled = true;
|
||||
@ -251,10 +261,14 @@
|
||||
// Initialize the page
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
initStatusPolling();
|
||||
loadActivityStats(currentTimeRange);
|
||||
loadRecentActivity();
|
||||
loadAvailableScrapers();
|
||||
|
||||
// Load chart data after a short delay to ensure Chart.js is loaded
|
||||
setTimeout(() => {
|
||||
loadActivityStats(currentTimeRange);
|
||||
}, 100);
|
||||
|
||||
// Initialize event listeners
|
||||
startButton.addEventListener('click', startScraper);
|
||||
pauseButton.addEventListener('click', togglePauseScraper);
|
||||
@ -470,13 +484,21 @@
|
||||
fetch('/scraper/status')
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
if (data.active) {
|
||||
if (data.paused) {
|
||||
statusIndicator.className = 'status-indicator status-paused';
|
||||
console.log('Status data received:', data); // Debug log
|
||||
|
||||
// Remove all status classes first
|
||||
statusIndicator.classList.remove('status-active', 'status-paused', 'status-inactive');
|
||||
|
||||
// Handle the new JSON structure with scraper_state
|
||||
const scraperState = data.scraper_state || data; // Fallback for old structure
|
||||
|
||||
if (scraperState.active) {
|
||||
if (scraperState.paused) {
|
||||
statusIndicator.classList.add('status-paused');
|
||||
statusText.textContent = 'Paused';
|
||||
pauseButton.textContent = 'Resume';
|
||||
} else {
|
||||
statusIndicator.className = 'status-indicator status-active';
|
||||
statusIndicator.classList.add('status-active');
|
||||
statusText.textContent = 'Active';
|
||||
pauseButton.textContent = 'Pause';
|
||||
}
|
||||
@ -485,13 +507,20 @@
|
||||
stopButton.disabled = false;
|
||||
resetButton.disabled = false; // Enable reset when active
|
||||
} else {
|
||||
statusIndicator.className = 'status-indicator status-inactive';
|
||||
statusIndicator.classList.add('status-inactive');
|
||||
statusText.textContent = 'Inactive';
|
||||
startButton.disabled = false;
|
||||
pauseButton.disabled = true;
|
||||
stopButton.disabled = true;
|
||||
resetButton.disabled = false; // Enable reset when inactive too
|
||||
}
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Error fetching status:', error);
|
||||
// On error, show inactive state
|
||||
statusIndicator.classList.remove('status-active', 'status-paused', 'status-inactive');
|
||||
statusIndicator.classList.add('status-inactive');
|
||||
statusText.textContent = 'Error';
|
||||
});
|
||||
}
|
||||
|
||||
@ -499,7 +528,13 @@
|
||||
function startScraper() {
|
||||
console.log("Start button clicked - sending request to /scraper/start");
|
||||
|
||||
fetch('/scraper/start', { method: 'POST' })
|
||||
fetch('/scraper/start', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({})
|
||||
})
|
||||
.then(response => {
|
||||
console.log("Response received:", response);
|
||||
return response.json();
|
||||
@ -521,7 +556,13 @@
|
||||
}
|
||||
|
||||
function togglePauseScraper() {
|
||||
fetch('/scraper/pause', { method: 'POST' })
|
||||
fetch('/scraper/pause', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({})
|
||||
})
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
if (data.success) {
|
||||
@ -535,7 +576,13 @@
|
||||
}
|
||||
|
||||
function stopScraper() {
|
||||
fetch('/scraper/stop', { method: 'POST' })
|
||||
fetch('/scraper/stop', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({})
|
||||
})
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
if (data.success) {
|
||||
@ -706,14 +753,28 @@
|
||||
// Load data functions
|
||||
function loadActivityStats(hours) {
|
||||
fetch(`/scraper/stats?hours=${hours}`)
|
||||
.then(response => response.json())
|
||||
.then(response => {
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP error! status: ${response.status}`);
|
||||
}
|
||||
return response.json();
|
||||
})
|
||||
.then(data => {
|
||||
console.log('Stats data loaded:', data);
|
||||
renderActivityChart(data);
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Failed to load activity stats:', error);
|
||||
// Hide the chart or show an error message
|
||||
const chartContainer = document.getElementById('activityChart').parentElement;
|
||||
if (chartContainer) {
|
||||
chartContainer.innerHTML = '<p class="text-muted">Chart data unavailable</p>';
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
function loadRecentActivity() {
|
||||
fetch('/api/activity_logs?category=scraper_activity&limit=20')
|
||||
fetch('/api/activity_logs?category=scraper_activity&category=scraper_command&limit=50')
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
renderActivityLog(data);
|
||||
@ -728,7 +789,19 @@
|
||||
|
||||
// Rendering functions
|
||||
function renderActivityChart(data) {
|
||||
const ctx = document.getElementById('activityChart').getContext('2d');
|
||||
// Check if Chart.js is available
|
||||
if (typeof Chart === 'undefined') {
|
||||
console.error('Chart.js is not loaded');
|
||||
return;
|
||||
}
|
||||
|
||||
const chartElement = document.getElementById('activityChart');
|
||||
if (!chartElement) {
|
||||
console.error('Chart canvas element not found');
|
||||
return;
|
||||
}
|
||||
|
||||
const ctx = chartElement.getContext('2d');
|
||||
|
||||
// Extract the data for the chart
|
||||
const labels = data.map(item => `${item.hour}:00`);
|
||||
@ -857,7 +930,7 @@
|
||||
let lastPaperTimestamp = new Date().toISOString();
|
||||
|
||||
function checkForNewPapers() {
|
||||
fetch(`/api/activity_logs?category=scraper_activity&action=scrape_paper&after=${lastPaperTimestamp}&limit=5`)
|
||||
fetch(`/api/activity_logs?category=scraper_activity&category=scraper_command&action=scrape_paper&after=${lastPaperTimestamp}&limit=5`)
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
if (data && data.length > 0) {
|
||||
|