refines modular scraping system. adds another dummy scraper
parent ac348696b5
commit 1e97a9cc7b
@@ -1,9 +1,11 @@
 from scipaperloader.celery import celery, configure_celery

 # Import all task modules to ensure they are registered with Celery
+import scipaperloader.scrapers.tasks  # Import new scheduler tasks
 import scipaperloader.blueprints.scraper  # Import the scraper module with our tasks

 # Configure celery with Flask app
 configure_celery()

 if __name__ == '__main__':
-    celery.start()
+    # Start the Celery worker
+    celery.start(['worker', '--loglevel=info', '--concurrency=2'])
@@ -9,7 +9,7 @@ bp = Blueprint("api", __name__, url_prefix="/api")
 def get_activity_logs():
     """Get activity logs with filtering options."""
     # Get query parameters
-    category = request.args.get("category")
+    categories = request.args.getlist("category")  # Changed to getlist for multiple values
     action = request.args.get("action")
     after = request.args.get("after")
     limit = request.args.get("limit", 20, type=int)
@@ -17,8 +17,9 @@ def get_activity_logs():
     # Build query
     query = ActivityLog.query

-    if category:
-        query = query.filter(ActivityLog.category == category)
+    if categories:
+        # Filter by multiple categories using in_() for SQL IN clause
+        query = query.filter(ActivityLog.category.in_(categories))

     if action:
         query = query.filter(ActivityLog.action == action)
@@ -34,21 +34,8 @@ def _update_volume(new_volume):
     if new_volume <= 0 or new_volume > MAX_VOLUME:
         return False, f"Volume must be between 1 and {MAX_VOLUME}", None

-    volume_config = VolumeConfig.query.first()
-    if not volume_config:
-        volume_config = VolumeConfig(volume=new_volume)
-        db.session.add(volume_config)
-    else:
-        old_value = volume_config.volume
-        volume_config.volume = new_volume
-        ActivityLog.log_config_change(
-            config_key="scraper_volume",
-            old_value=old_value,
-            new_value=new_volume,
-            description="Updated scraper volume"
-        )
-
-    db.session.commit()
+    # Use the new class method to set the volume
+    volume_config = VolumeConfig.set_volume(new_volume)

     # Invalidate and recalculate the hourly quota cache
     try:
File diff suppressed because it is too large.
@@ -32,8 +32,8 @@ def configure_celery(app=None):
         task_reject_on_worker_lost=True,  # Requeue tasks if worker dies
         # Configure Beat schedule for periodic tasks
         beat_schedule={
-            'scheduled-scraper-hourly': {
-                'task': 'scipaperloader.blueprints.scraper.dummy_scheduled_scraper',
+            'hourly-scraper-scheduler': {
+                'task': 'scipaperloader.scrapers.tasks.hourly_scraper_scheduler',
                 'schedule': crontab(minute=0),  # Run at the start of every hour
                 'options': {'expires': 3600}
             },
@@ -91,12 +91,13 @@ class ActivityLog(db.Model):
         return log

     @classmethod
-    def log_scraper_command(cls, action, status=None, user_id=None, **extra):
+    def log_scraper_command(cls, action, status=None, description=None, user_id=None, **extra):
         """Log a scraper command (start/stop/pause)."""
         log = cls(
             category=ActivityCategory.SCRAPER_COMMAND.value,
             action=action,
             status=status,
+            description=description,
             user_id=user_id
         )
         log.set_extra_data(extra)
@@ -191,6 +192,7 @@ class PaperMetadata(db.Model):
     language = db.Column(db.String(50))
     published_online = db.Column(db.Date)  # or DateTime/String
     status = db.Column(db.String(10))  # 'Pending','Done','Failed'
+    previous_status = db.Column(db.String(10), nullable=True)  # Store previous status for reversion
     file_path = db.Column(db.Text)
     error_msg = db.Column(db.Text)
     created_at = db.Column(db.DateTime, default=db.func.current_timestamp())
@@ -210,6 +212,35 @@ class VolumeConfig(db.Model):
     id = db.Column(db.Integer, primary_key=True)
     volume = db.Column(db.Float)  # volume of papers to scrape per day

+    @classmethod
+    def get_current_volume(cls):
+        """Get the current volume configuration, creating default if needed."""
+        config = cls.query.first()
+        if not config:
+            config = cls(volume=100)
+            db.session.add(config)
+            db.session.commit()
+        return config.volume
+
+    @classmethod
+    def set_volume(cls, new_volume):
+        """Set the volume configuration."""
+        config = cls.query.first()
+        if not config:
+            config = cls(volume=new_volume)
+            db.session.add(config)
+        else:
+            old_value = config.volume
+            config.volume = new_volume
+            ActivityLog.log_config_change(
+                config_key="scraper_volume",
+                old_value=old_value,
+                new_value=new_volume,
+                description="Updated scraper volume configuration"
+            )
+        db.session.commit()
+        return config
+
 class DownloadPathConfig(db.Model):
     """Model to store the base path for downloaded files."""
     id = db.Column(db.Integer, primary_key=True)
@@ -220,7 +251,7 @@ class DownloadPathConfig(db.Model):
         """Get the configured download path, creating default if needed."""
         config = cls.query.first()
         if not config:
-            config = cls(path="/path/to/dummy/papers")  # Ensure default exists
+            config = cls(path="/tmp/")  # Ensure default exists
             db.session.add(config)
             db.session.commit()
         return config.path
@@ -342,6 +373,7 @@ def init_schedule_config():
         db.session.add(default_volume)
         db.session.commit()

+
     # Initialize DownloadPathConfig if it doesn't exist
     if DownloadPathConfig.query.count() == 0:
         default_path = DownloadPathConfig(path="/path/to/dummy/papers")
@@ -1,2 +1,18 @@
 # This package contains all scraper modules.
 # Each scraper should implement the BaseScraper interface from base.py.
+
+from .base import BaseScraper, ScrapeResult
+from .factory import get_scraper, get_available_scrapers
+from .manager import ScraperManager
+from .dummy import Scraper as DummyScraper
+from .failed_retry import Scraper as FailedRetryScraper
+
+__all__ = [
+    'BaseScraper',
+    'ScrapeResult',
+    'get_scraper',
+    'get_available_scrapers',
+    'ScraperManager',
+    'DummyScraper',
+    'FailedRetryScraper'
+]
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import NamedTuple, Optional, Dict
+from typing import NamedTuple, Optional, Dict, List
 from datetime import datetime

 class ScrapeResult(NamedTuple):
@@ -12,6 +12,12 @@ class ScrapeResult(NamedTuple):
 class BaseScraper(ABC):
     """Base class for all scraper implementations."""

+    # Default input/output statuses - can be overridden by subclasses
+    INPUT_STATUSES = ["New"]  # Which paper statuses this scraper will process
+    OUTPUT_STATUS_SUCCESS = "Done"  # Status to set on successful scraping
+    OUTPUT_STATUS_FAILURE = "Failed"  # Status to set on failed scraping
+    OUTPUT_STATUS_PROCESSING = "Pending"  # Status to set while processing
+
     @abstractmethod
     def scrape(self, doi: str) -> ScrapeResult:
         """
@@ -32,3 +38,15 @@ class BaseScraper(ABC):
     def get_description(self) -> str:
         """Return a description of this scraper."""
         return getattr(self.__class__, "__doc__", "No description available")
+
+    def get_input_statuses(self) -> List[str]:
+        """Return list of paper statuses this scraper can process."""
+        return self.INPUT_STATUSES
+
+    def get_output_statuses(self) -> Dict[str, str]:
+        """Return mapping of result types to output statuses."""
+        return {
+            "success": self.OUTPUT_STATUS_SUCCESS,
+            "failure": self.OUTPUT_STATUS_FAILURE,
+            "processing": self.OUTPUT_STATUS_PROCESSING
+        }
@@ -10,6 +10,12 @@ from ..db import db
 class Scraper(BaseScraper):
     """Dummy scraper for testing purposes that simulates paper downloading."""

+    # This scraper processes "New" papers and outputs "Done"/"Failed"
+    INPUT_STATUSES = ["New"]
+    OUTPUT_STATUS_SUCCESS = "Done"
+    OUTPUT_STATUS_FAILURE = "Failed"
+    OUTPUT_STATUS_PROCESSING = "Pending"
+
     def scrape(self, doi: str) -> ScrapeResult:
         """Simulate scraping a paper with realistic timing and random success/failure."""
         start_time = time.time()
@@ -1,5 +1,4 @@
 import importlib
-from flask import current_app
 from .base import BaseScraper

 def get_scraper() -> BaseScraper:
@@ -7,10 +6,16 @@ def get_scraper() -> BaseScraper:
     from ..models import ScraperModuleConfig, ActivityLog

     try:
-        # Get module name from database first, fallback to config
+        # Get module name from database first, fallback to dummy
         name = ScraperModuleConfig.get_current_module()
         if not name:
-            name = current_app.config.get("SCRAPER_MODULE", "dummy")
+            # Only try to access Flask config if we're in app context
+            try:
+                from flask import current_app
+                name = current_app.config.get("SCRAPER_MODULE", "dummy")
+            except RuntimeError:
+                # No app context, use dummy
+                name = "dummy"

         module = importlib.import_module(f"scipaperloader.scrapers.{name}")
         cls = getattr(module, "Scraper")
scipaperloader/scrapers/failed_retry.py (new file)
@@ -0,0 +1,123 @@
import time
import random
import os
from datetime import datetime
from .base import BaseScraper, ScrapeResult
from flask import current_app
from ..models import PaperMetadata, ActivityLog, DownloadPathConfig
from ..db import db

class Scraper(BaseScraper):
    """Retry scraper that attempts to re-process failed papers with different strategies."""

    # This scraper specifically targets "Failed" papers and retries them
    INPUT_STATUSES = ["Failed"]
    OUTPUT_STATUS_SUCCESS = "Done"
    OUTPUT_STATUS_FAILURE = "Failed"
    OUTPUT_STATUS_PROCESSING = "Retrying"

    def scrape(self, doi: str) -> ScrapeResult:
        """Retry scraping a failed paper with enhanced error handling."""
        start_time = time.time()

        paper = PaperMetadata.query.filter_by(doi=doi).first()
        if not paper:
            return ScrapeResult(
                status="error",
                message=f"No paper found for DOI {doi}",
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        # Log retry attempt
        ActivityLog.log_scraper_activity(
            action="retry_failed_paper",
            status="info",
            description=f"Retrying failed paper: {paper.title}",
            paper_id=paper.id
        )

        # Simulate longer processing time for retry (2-5 seconds)
        processing_time = random.uniform(2, 5)
        time.sleep(processing_time)

        # Simulate 60% success rate on retry (lower than initial attempt)
        success = random.random() < 0.6

        result_data = {}

        if success:
            # Get download path and create dummy file
            download_path = DownloadPathConfig.get_path()
            file_name = f"{doi.replace('/', '_')}_retry.pdf"
            file_path = f"{download_path}/{file_name}"

            try:
                # Ensure directory exists
                os.makedirs(download_path, exist_ok=True)

                # Create a dummy PDF file
                with open(file_path, 'w') as f:
                    f.write(f"Dummy PDF content for retry of {doi}")

                result_data = {"file_path": file_path}

                # Log success
                ActivityLog.log_scraper_activity(
                    action="retry_scrape_success",
                    status="success",
                    description=f"Successfully retried {doi} on second attempt",
                    paper_id=paper.id
                )

                result = ScrapeResult(
                    status="success",
                    message=f"Successfully retried paper {doi}",
                    data=result_data,
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )

            except Exception as e:
                error_msg = f"Failed to save retry file: {str(e)}"
                ActivityLog.log_scraper_activity(
                    action="retry_scrape_file_error",
                    status="error",
                    description=error_msg,
                    paper_id=paper.id
                )

                result = ScrapeResult(
                    status="error",
                    message=error_msg,
                    data=None,
                    duration=time.time() - start_time,
                    timestamp=datetime.utcnow()
                )
        else:
            # Retry failed - generate different error message
            error_messages = [
                "Retry failed: Still no access to publisher",
                "Retry failed: Alternative download methods exhausted",
                "Retry failed: DOI appears permanently inaccessible",
                "Retry failed: Network timeout persists"
            ]
            error_msg = random.choice(error_messages)

            ActivityLog.log_scraper_activity(
                action="retry_scrape_failure",
                status="error",
                description=f"Retry failed for {doi}: {error_msg}",
                paper_id=paper.id
            )

            result = ScrapeResult(
                status="error",
                message=error_msg,
                data=None,
                duration=time.time() - start_time,
                timestamp=datetime.utcnow()
            )

        return result
scipaperloader/scrapers/manager.py (new file)
@@ -0,0 +1,747 @@
"""
Simplified scraper management system with hourly quota scheduling.
"""

import random
import math
import redis
from datetime import datetime, timedelta
from typing import List, Dict, Optional
from sqlalchemy import func

from ..models import (
    PaperMetadata,
    ScheduleConfig,
    VolumeConfig,
    ScraperState,
    ActivityLog,
    ScraperModuleConfig
)
from ..db import db
from ..cache_utils import get_cached_hourly_quota
from .factory import get_scraper, get_available_scrapers
from ..celery import celery


class ScraperManager:
    """Manages scraper operations with hourly quota-based scheduling."""

    def __init__(self):
        self.current_scraper = None
        self.pending_papers = []  # Track papers being processed
        # Initialize Redis client for delayed task management
        self.redis_client = None
        self._init_redis_client()

    def _init_redis_client(self):
        """Initialize Redis client for delayed task management."""
        try:
            # Use same Redis configuration as Celery
            self.redis_client = redis.Redis(
                host='localhost',
                port=6379,
                db=0,
                decode_responses=True
            )
            # Test connection
            self.redis_client.ping()
        except Exception as e:
            ActivityLog.log_error(
                error_message=f"Failed to initialize Redis client: {str(e)}",
                source="ScraperManager._init_redis_client"
            )
            self.redis_client = None

    def _clear_delayed_tasks_from_redis(self) -> int:
        """Clear delayed tasks from Redis structures used by Celery.

        Based on analysis, Celery stores delayed tasks in:
        - 'unacked_index': Sorted set containing task IDs with execution timestamps
        - 'unacked': Hash containing task data keyed by task ID

        Returns:
            int: Number of delayed tasks cleared
        """
        if not self.redis_client:
            try:
                ActivityLog.log_error(
                    error_message="Redis client not available - cannot clear delayed tasks",
                    source="ScraperManager._clear_delayed_tasks_from_redis"
                )
            except RuntimeError:
                # Working outside application context - just print instead
                print("❌ Redis client not available - cannot clear delayed tasks")
            return 0

        cleared_count = 0
        try:
            # Define scraper task patterns to identify our tasks
            scraper_patterns = [
                'process_single_paper',
                'process_papers_batch',
                'hourly_scraper_scheduler'
            ]

            try:
                ActivityLog.log_scraper_activity(
                    action="check_delayed_tasks",
                    status="info",
                    description="Checking Celery delayed task structures (unacked_index, unacked)"
                )
            except RuntimeError:
                print("🔍 Checking Celery delayed task structures (unacked_index, unacked)")

            # Check 'unacked_index' (sorted set with task IDs and timestamps)
            unacked_index_cleared = 0
            if self.redis_client.exists('unacked_index'):
                try:
                    # Get all task IDs from the sorted set
                    task_ids = self.redis_client.zrange('unacked_index', 0, -1)

                    if task_ids:
                        try:
                            ActivityLog.log_scraper_activity(
                                action="scan_unacked_index",
                                status="info",
                                description=f"Found {len(task_ids)} tasks in 'unacked_index'"
                            )
                        except RuntimeError:
                            print(f"📋 Found {len(task_ids)} tasks in 'unacked_index'")

                        # Check each task ID against the 'unacked' hash to get task details
                        scraper_task_ids = []
                        for task_id in task_ids:
                            try:
                                # Get task data from 'unacked' hash
                                task_data = self.redis_client.hget('unacked', task_id)
                                if task_data:
                                    # Check if this task contains any of our scraper patterns
                                    if any(pattern in str(task_data) for pattern in scraper_patterns):
                                        scraper_task_ids.append(task_id)
                            except Exception:
                                # Skip individual task errors
                                continue

                        # Remove scraper task IDs from both structures
                        for task_id in scraper_task_ids:
                            try:
                                # Remove from unacked_index (sorted set)
                                removed_from_index = self.redis_client.zrem('unacked_index', task_id)
                                # Remove from unacked (hash)
                                removed_from_hash = self.redis_client.hdel('unacked', task_id)

                                if removed_from_index or removed_from_hash:
                                    unacked_index_cleared += 1

                            except Exception as e:
                                try:
                                    ActivityLog.log_error(
                                        error_message=f"Error removing delayed task {task_id}: {str(e)}",
                                        source="ScraperManager._clear_delayed_tasks_from_redis"
                                    )
                                except RuntimeError:
                                    print(f"❌ Error removing delayed task {task_id}: {str(e)}")
                                continue

                        cleared_count += unacked_index_cleared

                        if unacked_index_cleared > 0:
                            try:
                                ActivityLog.log_scraper_activity(
                                    action="clear_unacked_tasks",
                                    status="success",
                                    description=f"Cleared {unacked_index_cleared} scraper tasks from unacked structures"
                                )
                            except RuntimeError:
                                print(f"✅ Cleared {unacked_index_cleared} scraper tasks from unacked structures")
                    else:
                        try:
                            ActivityLog.log_scraper_activity(
                                action="check_unacked_index",
                                status="info",
                                description="No tasks found in 'unacked_index'"
                            )
                        except RuntimeError:
                            print("ℹ️ No tasks found in 'unacked_index'")

                except Exception as e:
                    try:
                        ActivityLog.log_error(
                            error_message=f"Error accessing 'unacked_index': {str(e)}",
                            source="ScraperManager._clear_delayed_tasks_from_redis"
                        )
                    except RuntimeError:
                        print(f"❌ Error accessing 'unacked_index': {str(e)}")
            else:
                try:
                    ActivityLog.log_scraper_activity(
                        action="check_unacked_index",
                        status="info",
                        description="'unacked_index' key does not exist - no delayed tasks"
                    )
                except RuntimeError:
                    print("ℹ️ 'unacked_index' key does not exist - no delayed tasks")

            # Also check the 'celery' queue for immediate tasks (backup check)
            celery_cleared = 0
            try:
                queue_length = self.redis_client.llen('celery')
                if queue_length and queue_length > 0:
                    # Scan for any scraper tasks in the immediate queue
                    scraper_tasks = []
                    for i in range(queue_length):
                        try:
                            task_data = self.redis_client.lindex('celery', i)
                            if task_data and any(pattern in str(task_data) for pattern in scraper_patterns):
                                scraper_tasks.append(task_data)
                        except Exception:
                            continue

                    # Remove scraper tasks from celery queue
                    for task_data in scraper_tasks:
                        try:
                            removed_count = self.redis_client.lrem('celery', 0, task_data)
                            celery_cleared += removed_count
                        except Exception:
                            continue

                    cleared_count += celery_cleared

                    if celery_cleared > 0:
                        try:
                            ActivityLog.log_scraper_activity(
                                action="clear_celery_tasks",
                                status="success",
                                description=f"Cleared {celery_cleared} scraper tasks from 'celery' queue"
                            )
                        except RuntimeError:
                            print(f"✅ Cleared {celery_cleared} scraper tasks from 'celery' queue")

            except Exception as e:
                try:
                    ActivityLog.log_error(
                        error_message=f"Error checking 'celery' queue: {str(e)}",
                        source="ScraperManager._clear_delayed_tasks_from_redis"
                    )
                except RuntimeError:
                    print(f"❌ Error checking 'celery' queue: {str(e)}")

            # Summary
            if cleared_count > 0:
                try:
                    ActivityLog.log_scraper_activity(
                        action="clear_delayed_tasks_complete",
                        status="success",
                        description=f"Total delayed scraper tasks cleared from Redis: {cleared_count} (unacked: {unacked_index_cleared}, celery: {celery_cleared})"
                    )
                except RuntimeError:
                    print(f"✅ Total delayed scraper tasks cleared from Redis: {cleared_count} (unacked: {unacked_index_cleared}, celery: {celery_cleared})")
            else:
                try:
                    ActivityLog.log_scraper_activity(
                        action="clear_delayed_tasks_complete",
                        status="info",
                        description="No delayed scraper tasks found to clear in Redis"
                    )
                except RuntimeError:
                    print("ℹ️ No delayed scraper tasks found to clear in Redis")

            return cleared_count

        except Exception as e:
            try:
                ActivityLog.log_error(
                    error_message=f"Failed to clear delayed tasks from Redis: {str(e)}",
                    source="ScraperManager._clear_delayed_tasks_from_redis"
                )
            except RuntimeError:
                print(f"❌ Failed to clear delayed tasks from Redis: {str(e)}")
            return 0

    def start_scraper(self) -> Dict[str, str]:
        """Start the scraper system."""
        try:
            # Get current scraper
            self.current_scraper = get_scraper()

            # Activate scraper state
            ScraperState.set_active(True)
            ScraperState.set_paused(False)

            scraper_name = self.current_scraper.get_name()

            ActivityLog.log_scraper_command(
                action="start_scraper",
                status="success",
                description=f"Started scraper: {scraper_name}. Use /trigger-immediate endpoint to immediately schedule papers instead of waiting for the next hourly boundary."
            )

            return {"status": "success", "message": "Scraper started successfully. Papers will be scheduled at the next hourly boundary, or use /trigger-immediate to schedule immediately."}

        except Exception as e:
            ActivityLog.log_error(
                error_message=f"Failed to start scraper: {str(e)}",
                source="ScraperManager.start_scraper"
            )
            return {"status": "error", "message": str(e)}

    def pause_scraper(self) -> Dict[str, str]:
        """Pause the scraper system."""
        try:
            ScraperState.set_paused(True)

            ActivityLog.log_scraper_command(
                action="pause_scraper",
                status="success",
                description="Scraper paused - processing will halt"
            )

            return {"status": "success", "message": "Scraper paused"}

        except Exception as e:
            return {"status": "error", "message": str(e)}

    def resume_scraper(self) -> Dict[str, str]:
        """Resume the scraper system."""
        try:
            ScraperState.set_paused(False)

            ActivityLog.log_scraper_command(
                action="resume_scraper",
                status="success",
                description="Scraper resumed - processing will continue"
            )

            return {"status": "success", "message": "Scraper resumed"}

        except Exception as e:
            return {"status": "error", "message": str(e)}

    def stop_scraper(self) -> Dict[str, str]:
        """Stop the scraper, revoke all running tasks, and revert pending papers."""
        try:
            # First, revoke all running tasks
            revoked_count = 0
            delayed_cleared_count = 0

            ActivityLog.log_scraper_command(
                action="stop_scraper_start",
                status="info",
                description="Beginning scraper stop process with task revocation and delayed task clearing"
            )

            try:
                # Get Celery inspector to check for running tasks
                i = celery.control.inspect()
                active = i.active() or {}
                scheduled = i.scheduled() or {}
                reserved = i.reserved() or {}

                # Revoke active tasks
                for worker, tasks in active.items():
                    for task in tasks:
                        if 'id' in task:
                            celery.control.revoke(task['id'], terminate=True)
                            revoked_count += 1
                            ActivityLog.log_scraper_activity(
                                action="revoke_task",
                                status="success",
                                description=f"Revoked active task: {task.get('name', 'unknown')} (ID: {task['id']})"
                            )

                # Revoke scheduled tasks
                for worker, tasks in scheduled.items():
                    for task in tasks:
                        if 'id' in task:
                            celery.control.revoke(task['id'], terminate=True)
                            revoked_count += 1
                            ActivityLog.log_scraper_activity(
                                action="revoke_task",
                                status="success",
                                description=f"Revoked scheduled task: {task.get('name', 'unknown')} (ID: {task['id']})"
                            )

                # Revoke reserved tasks
                for worker, tasks in reserved.items():
                    for task in tasks:
                        if 'id' in task:
                            celery.control.revoke(task['id'], terminate=True)
                            revoked_count += 1
                            ActivityLog.log_scraper_activity(
                                action="revoke_task",
                                status="success",
                                description=f"Revoked reserved task: {task.get('name', 'unknown')} (ID: {task['id']})"
                            )

                # Purge all task queues
                celery.control.purge()
                ActivityLog.log_scraper_activity(
                    action="purge_queues",
                    status="success",
                    description="Purged all task queues"
                )

                # **NEW: Clear delayed tasks from Redis sorted sets**
                delayed_cleared_count = self._clear_delayed_tasks_from_redis()

                # Additional cleanup: revoke any remaining scraper-related tasks by name pattern
                try:
                    # Use broadcast to revoke tasks that match scraper patterns
                    scraper_task_patterns = [
                        'process_single_paper',
                        'process_papers_batch',
                        'hourly_scraper_scheduler'
                    ]

                    # Get a fresh inspection of tasks after purge
                    fresh_inspect = celery.control.inspect()
                    all_tasks = {}
                    all_tasks.update(fresh_inspect.active() or {})
                    all_tasks.update(fresh_inspect.scheduled() or {})
                    all_tasks.update(fresh_inspect.reserved() or {})

                    additional_revoked = 0
                    for worker, tasks in all_tasks.items():
                        for task in tasks:
                            task_name = task.get('name', '')
                            task_id = task.get('id', '')
                            if any(pattern in task_name for pattern in scraper_task_patterns) and task_id:
                                celery.control.revoke(task_id, terminate=True)
                                additional_revoked += 1
                                ActivityLog.log_scraper_activity(
                                    action="revoke_scraper_task",
                                    status="success",
                                    description=f"Revoked lingering scraper task: {task_name} (ID: {task_id})"
                                )

                    if additional_revoked > 0:
                        ActivityLog.log_scraper_activity(
                            action="cleanup_scraper_tasks",
                            status="success",
                            description=f"Additional cleanup: revoked {additional_revoked} lingering scraper tasks"
                        )

                except Exception as e:
                    ActivityLog.log_error(
                        error_message=f"Error during additional scraper task cleanup: {str(e)}",
                        source="ScraperManager.stop_scraper.cleanup"
                    )

            except Exception as e:
                ActivityLog.log_error(
                    error_message=f"Error revoking tasks: {str(e)}",
                    source="ScraperManager.stop_scraper"
                )
                # Continue with paper reversion even if task revocation fails

            # Get current scraper to know what status to revert to
            scraper = get_scraper()
            input_statuses = scraper.get_input_statuses()

            # Find papers that are currently being processed
            processing_status = scraper.get_output_statuses()["processing"]
            pending_papers = PaperMetadata.query.filter_by(status=processing_status).all()

            # Revert their status to the first input status
            reverted_count = 0
            if pending_papers and input_statuses:
                revert_status = input_statuses[0]  # Use first input status as default

                for paper in pending_papers:
                    # Try to use previous_status if available, otherwise use first input status
                    if hasattr(paper, 'previous_status') and paper.previous_status:
                        paper.status = paper.previous_status
                    else:
                        paper.status = revert_status
                    paper.updated_at = datetime.utcnow()
                    reverted_count += 1

                db.session.commit()

                ActivityLog.log_scraper_activity(
                    action="revert_pending_papers",
                    status="success",
                    description=f"Reverted {reverted_count} papers from '{processing_status}' to previous status"
                )

            # Deactivate scraper
            ScraperState.set_active(False)
            ScraperState.set_paused(False)

            ActivityLog.log_scraper_command(
                action="stop_scraper",
                status="success",
                description=f"Scraper stopped. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers."
            )

            return {
                "status": "success",
                "message": f"Scraper stopped. Revoked {revoked_count} tasks, cleared {delayed_cleared_count} delayed tasks, and reverted {reverted_count} papers to previous status."
            }

        except Exception as e:
            ActivityLog.log_error(
                error_message=f"Failed to stop scraper: {str(e)}",
                source="ScraperManager.stop_scraper"
            )
            return {"status": "error", "message": str(e)}

    def reset_scraper(self) -> Dict[str, str]:
        """Reset scraper state, revoke all running tasks, and clear all processing statuses."""
        try:
            # First, revoke all running tasks (similar to stop_scraper)
            revoked_count = 0

            ActivityLog.log_scraper_command(
                action="reset_scraper_start",
                status="info",
                description="Beginning scraper reset process with task revocation"
            )

            try:
                # Get Celery inspector to check for running tasks
                i = celery.control.inspect()
                active = i.active() or {}
                scheduled = i.scheduled() or {}
                reserved = i.reserved() or {}

                # Revoke all tasks (active, scheduled, reserved)
                for queue_name, queue_tasks in [("active", active), ("scheduled", scheduled), ("reserved", reserved)]:
                    for worker, tasks in queue_tasks.items():
                        for task in tasks:
                            if 'id' in task:
                                celery.control.revoke(task['id'], terminate=True)
                                revoked_count += 1
                                ActivityLog.log_scraper_activity(
                                    action="revoke_task",
                                    status="success",
                                    description=f"Revoked {queue_name} task: {task.get('name', 'unknown')} (ID: {task['id']})"
                                )

                # Purge all task queues
                celery.control.purge()
                ActivityLog.log_scraper_activity(
                    action="purge_queues",
                    status="success",
                    description="Purged all task queues during reset"
                )

            except Exception as e:
                ActivityLog.log_error(
                    error_message=f"Error revoking tasks during reset: {str(e)}",
                    source="ScraperManager.reset_scraper"
                )
                # Continue with paper reversion even if task revocation fails

            # Get current scraper configuration
            scraper = get_scraper()
            input_statuses = scraper.get_input_statuses()
            processing_status = scraper.get_output_statuses()["processing"]

            # Reset all papers in processing status
            pending_papers = PaperMetadata.query.filter_by(status=processing_status).all()
            reverted_count = 0

            if pending_papers and input_statuses:
                revert_status = input_statuses[0]

                for paper in pending_papers:
                    # Try to use previous_status if available, otherwise use first input status
                    if hasattr(paper, 'previous_status') and paper.previous_status:
                        paper.status = paper.previous_status
                    else:
                        paper.status = revert_status
                    paper.updated_at = datetime.utcnow()
                    paper.error_msg = None  # Clear any error messages
                    reverted_count += 1

                db.session.commit()

            # Reset scraper state
            ScraperState.set_active(False)
            ScraperState.set_paused(False)

            ActivityLog.log_scraper_command(
                action="reset_scraper",
                status="success",
                description=f"Scraper reset. Revoked {revoked_count} tasks and reverted {reverted_count} papers."
            )

            return {
                "status": "success",
                "message": f"Scraper reset. Revoked {revoked_count} tasks and reverted {reverted_count} papers to original status."
            }

        except Exception as e:
            return {"status": "error", "message": str(e)}

    def get_current_hour_quota(self) -> int:
        """Calculate papers to process in current hour based on schedule."""
        try:
            return get_cached_hourly_quota(self._calculate_papers_for_current_hour)
        except Exception as e:
            ActivityLog.log_error(
                error_message=f"Error calculating hourly quota: {str(e)}",
                source="ScraperManager.get_current_hour_quota"
            )
            return 0

    def _calculate_papers_for_current_hour(self) -> int:
        """Internal method to calculate hourly quota."""
        try:
            # Get current hour and volume config
            current_hour = datetime.now().hour
            volume_config = VolumeConfig.get_current_volume()
            daily_volume = volume_config if volume_config else 100

            # Get schedule config for current hour
            schedule_config = ScheduleConfig.query.filter_by(hour=current_hour).first()
            current_weight = schedule_config.weight if schedule_config else 1.0

            # Get total weight across all hours
            total_weight = db.session.query(func.sum(ScheduleConfig.weight)).scalar() or 24.0

            # Calculate quota: (current_weight / total_weight) * daily_volume
            quota = math.ceil((current_weight / total_weight) * daily_volume)

            ActivityLog.log_scraper_activity(
                action="calculate_hourly_quota",
                status="info",
                description=f"Hour {current_hour}: quota={quota} (weight={current_weight}, total_weight={total_weight}, daily_volume={daily_volume})"
            )

            return max(1, quota)  # Ensure at least 1 paper per hour

        except Exception as e:
            ActivityLog.log_error(
                error_message=f"Error in quota calculation: {str(e)}",
                source="ScraperManager._calculate_papers_for_current_hour"
            )
            return 1  # Fallback to 1 paper per hour

    def select_papers_for_processing(self, limit: Optional[int] = None) -> List[PaperMetadata]:
        """Select papers for processing based on current scraper configuration."""
        try:
            scraper = get_scraper()
            input_statuses = scraper.get_input_statuses()

            if not input_statuses:
                return []

            # Use provided limit or calculate from hourly quota
            papers_needed = limit if limit is not None else self.get_current_hour_quota()

            # Query papers with input statuses, randomize selection
            papers = (PaperMetadata.query
                      .filter(PaperMetadata.status.in_(input_statuses))
                      .order_by(func.random())
                      .limit(papers_needed)
                      .all())

            ActivityLog.log_scraper_activity(
                action="select_papers",
                status="info",
                description=f"Selected {len(papers)} papers from statuses {input_statuses} (requested: {papers_needed})"
            )

            return papers

        except Exception as e:
            ActivityLog.log_error(
                error_message=f"Error selecting papers: {str(e)}",
                source="ScraperManager.select_papers_for_processing"
            )
            return []

    def process_paper(self, paper: PaperMetadata) -> Dict:
        """Process a single paper using the current scraper."""
        try:
            scraper = get_scraper()
            output_statuses = scraper.get_output_statuses()

            # Store the previous status before changing it
            previous_status = paper.status

            # Update paper status to processing
            paper.previous_status = previous_status
            paper.status = output_statuses["processing"]
            paper.updated_at = datetime.utcnow()
            db.session.commit()

            # Perform scraping
            result = scraper.scrape(paper.doi)

            # Update paper status based on result
            if result.status == "success":
                paper.status = output_statuses["success"]
                paper.error_msg = None
                if result.data and "file_path" in result.data:
                    paper.file_path = result.data["file_path"]
            else:
                paper.status = output_statuses["failure"]
                paper.error_msg = result.message

            paper.updated_at = datetime.utcnow()
            db.session.commit()

            # Log result
            ActivityLog.log_scraper_activity(
                action="process_paper",
                paper_id=paper.id,
                status=result.status,
                description=f"Processed {paper.doi}: {result.message}"
            )

            return {
                "paper_id": paper.id,
                "status": result.status,
                "message": result.message,
                "duration": result.duration
            }

        except Exception as e:
            # Revert paper status on error
            try:
                input_statuses = get_scraper().get_input_statuses()
                if input_statuses:
                    paper.status = input_statuses[0]
                    paper.error_msg = f"Processing error: {str(e)}"
                    paper.updated_at = datetime.utcnow()
                    db.session.commit()
            except:
                pass  # Don't fail if reversion fails

            ActivityLog.log_error(
                error_message=f"Error processing paper {paper.id}: {str(e)}",
                source="ScraperManager.process_paper"
            )

            return {"paper_id": paper.id, "status": "error", "message": str(e)}

    def get_status(self) -> Dict:
        """Get current scraper status."""
        scraper_state = ScraperState.get_current_state()
        scraper = get_scraper()

        # Count papers by status
        input_statuses = scraper.get_input_statuses()
        output_statuses = scraper.get_output_statuses()

        available_count = (PaperMetadata.query
                           .filter(PaperMetadata.status.in_(input_statuses))
                           .count())

        processing_count = (PaperMetadata.query
                            .filter_by(status=output_statuses["processing"])
                            .count())

        return {
            "active": scraper_state.is_active,
            "paused": scraper_state.is_paused,
            "current_scraper": scraper.get_name(),
            "input_statuses": input_statuses,
            "output_statuses": output_statuses,
            "available_papers": available_count,
            "processing_papers": processing_count,
            "current_hour_quota": self.get_current_hour_quota()
        }
scipaperloader/scrapers/tasks.py (new file)
@@ -0,0 +1,189 @@
"""
Hourly scheduler task that processes papers at random times within each hour.
"""

import random
from datetime import datetime, timedelta
from typing import Optional
from celery import shared_task

from ..models import ScraperState, ActivityLog
from .manager import ScraperManager


@shared_task(bind=True)
def hourly_scraper_scheduler(self):
    """
    Hourly task that schedules paper processing at random times within the hour.

    This task runs at the beginning of each hour and:
    1. Calculates how many papers to process this hour
    2. Schedules individual paper processing tasks at random times within the hour
    """
    try:
        # Check if scraper is active
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper not active"
            )
            # Disable retries for inactive scheduler
            self.retry = False
            return {"status": "inactive", "papers_scheduled": 0}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="Hourly scheduler skipped - scraper paused"
            )
            # Disable retries for paused scheduler
            self.retry = False
            return {"status": "paused", "papers_scheduled": 0}

        # Initialize scraper manager
        manager = ScraperManager()

        # Get papers to process this hour
        papers = manager.select_papers_for_processing()

        if not papers:
            ActivityLog.log_scraper_activity(
                action="hourly_scheduler",
                status="info",
                description="No papers available for processing this hour"
            )
            return {"status": "empty", "papers_scheduled": 0}

        # Schedule papers at random times within the hour (0-3600 seconds)
        scheduled_count = 0
        current_time = datetime.now()

        for paper in papers:
            # Random delay between 1 second and 58 minutes
            delay_seconds = random.randint(1, 3480)  # Up to 58 minutes

            # Schedule the task using Celery's task registry to avoid circular import issues
            from ..celery import celery
            celery.send_task(
                'scipaperloader.scrapers.tasks.process_single_paper',
                args=[paper.id],
                countdown=delay_seconds
            )

            scheduled_count += 1

            # Log each scheduled paper
            schedule_time = current_time + timedelta(seconds=delay_seconds)
            ActivityLog.log_scraper_activity(
                action="schedule_paper",
                paper_id=paper.id,
                status="info",
                description=f"Scheduled paper {paper.doi} for processing at {schedule_time.strftime('%H:%M:%S')}"
            )

        ActivityLog.log_scraper_activity(
            action="hourly_scheduler",
            status="success",
            description=f"Scheduled {scheduled_count} papers for random processing within this hour"
        )

        return {"status": "success", "papers_scheduled": scheduled_count}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Hourly scheduler error: {str(e)}",
            source="hourly_scraper_scheduler"
        )
        return {"status": "error", "message": str(e)}


@shared_task(bind=True)
def process_single_paper(self, paper_id: int):
    """
    Process a single paper. This task is scheduled at random times within each hour.

    Args:
        paper_id: ID of the paper to process
    """
    try:
        # Double-check scraper state before processing
        scraper_state = ScraperState.get_current_state()
        if not scraper_state.is_active:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Skipped processing - scraper not active"
            )
            # Use Celery's ignore to mark this task as completed without error
            self.retry = False
            return {"status": "inactive", "paper_id": paper_id}

        if scraper_state.is_paused:
            ActivityLog.log_scraper_activity(
                action="process_single_paper",
                paper_id=paper_id,
                status="skipped",
                description="Skipped processing - scraper paused"
            )
            # Use Celery's ignore for paused state too
            self.retry = False
            return {"status": "paused", "paper_id": paper_id}

        # Get the paper
        from ..models import PaperMetadata
        paper = PaperMetadata.query.get(paper_id)
        if not paper:
            return {"status": "error", "message": f"Paper {paper_id} not found"}

        # Process the paper using scraper manager
        manager = ScraperManager()
        result = manager.process_paper(paper)

        return result

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing paper {paper_id}: {str(e)}",
            source="process_single_paper"
        )
        return {"status": "error", "paper_id": paper_id, "message": str(e)}


@shared_task(bind=True)
def process_papers_batch(self, paper_ids: list, scraper_module: Optional[str] = None):
    """
    Process multiple papers in a batch for immediate processing.

    Args:
        paper_ids: List of paper IDs to process
        scraper_module: Optional specific scraper module to use
    """
    try:
        results = []
        manager = ScraperManager()

        for paper_id in paper_ids:
            from ..models import PaperMetadata
            paper = PaperMetadata.query.get(paper_id)
            if paper:
                result = manager.process_paper(paper)
                results.append(result)
            else:
                results.append({
                    "paper_id": paper_id,
                    "status": "error",
                    "message": "Paper not found"
                })

        return {"results": results, "total_processed": len(results)}

    except Exception as e:
        ActivityLog.log_error(
            error_message=f"Error processing batch: {str(e)}",
            source="process_papers_batch"
        )
        return {"status": "error", "message": str(e)}
@@ -29,6 +29,11 @@
         height: 400px;
     }

+    .chart-wrapper {
+        position: relative;
+        height: 400px;
+    }
+
     .notification {
         position: fixed;
         bottom: 20px;
@@ -100,19 +105,21 @@
                 <div class="form-group">
                     <label for="volumeInput">Papers per day:</label>
                     <input type="number" class="form-control" id="volumeInput"
-                        value="{{ volume_config.volume if volume_config else 100 }}" min="1"
-                        max="{{ max_volume }}">
+                        value="{{ volume_config if volume_config else 100 }}" min="1" max="{{ max_volume }}">
+                    <button type="submit" class="btn btn-primary mt-2">
+                        <i class="fas fa-save"></i> Update Volume
+                    </button>
+                </div>
                     <div class="form-text">Enter a value between 1 and {{ max_volume }}</div>
                 </div>
-                <button type="submit" class="btn btn-primary mt-2">Update Volume</button>
             </form>
         </div>
     </div>
 </div>
 </div>

 <!-- New row for single paper processing -->
 <div class="row mb-4">
     <div class="col-12">
         <div class="card">
             <div class="card-header">
@@ -164,9 +171,9 @@
                 </div>
             </div>
         </div>
     </div>

 <div class="row mb-4">
     <div class="col-12">
         <div class="card">
             <div class="card-header d-flex justify-content-between align-items-center">
@@ -187,13 +194,15 @@
 <button class="btn btn-outline-secondary time-range-btn" data-hours="72">Last 3
     days</button>
 </div>
-<div class="stats-chart" id="activityChart"></div>
+<div class="chart-wrapper">
+    <canvas id="activityChart"></canvas>
 </div>
 </div>
 </div>
 </div>
+</div>

 <div class="row mb-4">
 <div class="col-12">
 <div class="card">
 <div class="card-header">
@@ -220,12 +229,13 @@
 </div>
 </div>
 </div>
 </div>
 </div>
 {% endblock content %}

 {% block scripts %}
 {{ super() }}
+<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
 <script>
     // Global variables for the scraper dashboard
     let notificationsEnabled = true;
@@ -251,10 +261,14 @@
     // Initialize the page
     document.addEventListener('DOMContentLoaded', function () {
         initStatusPolling();
-        loadActivityStats(currentTimeRange);
         loadRecentActivity();
         loadAvailableScrapers();

+        // Load chart data after a short delay to ensure Chart.js is loaded
+        setTimeout(() => {
+            loadActivityStats(currentTimeRange);
+        }, 100);
+
         // Initialize event listeners
         startButton.addEventListener('click', startScraper);
         pauseButton.addEventListener('click', togglePauseScraper);
@@ -470,13 +484,21 @@
     fetch('/scraper/status')
         .then(response => response.json())
         .then(data => {
-            if (data.active) {
-                if (data.paused) {
-                    statusIndicator.className = 'status-indicator status-paused';
+            console.log('Status data received:', data); // Debug log
+
+            // Remove all status classes first
+            statusIndicator.classList.remove('status-active', 'status-paused', 'status-inactive');
+
+            // Handle the new JSON structure with scraper_state
+            const scraperState = data.scraper_state || data; // Fallback for old structure
+
+            if (scraperState.active) {
+                if (scraperState.paused) {
+                    statusIndicator.classList.add('status-paused');
                     statusText.textContent = 'Paused';
                     pauseButton.textContent = 'Resume';
                 } else {
-                    statusIndicator.className = 'status-indicator status-active';
+                    statusIndicator.classList.add('status-active');
                     statusText.textContent = 'Active';
                     pauseButton.textContent = 'Pause';
                 }
@@ -485,13 +507,20 @@
                 stopButton.disabled = false;
                 resetButton.disabled = false; // Enable reset when active
             } else {
-                statusIndicator.className = 'status-indicator status-inactive';
+                statusIndicator.classList.add('status-inactive');
                 statusText.textContent = 'Inactive';
                 startButton.disabled = false;
                 pauseButton.disabled = true;
                 stopButton.disabled = true;
                 resetButton.disabled = false; // Enable reset when inactive too
             }
+        })
+        .catch(error => {
+            console.error('Error fetching status:', error);
+            // On error, show inactive state
+            statusIndicator.classList.remove('status-active', 'status-paused', 'status-inactive');
+            statusIndicator.classList.add('status-inactive');
+            statusText.textContent = 'Error';
         });
 }

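For reference, the updated polling code reads data.scraper_state and falls back to the flat shape, so the status endpoint is expected to return something like the sketch below. Apart from active and paused, which are implied by the JavaScript above, the payload is an assumption, not code from this commit.

# Hypothetical /scraper/status response shape -- only scraper_state.active and
# scraper_state.paused are implied by the polling code; the rest is illustrative.
from flask import jsonify

def scraper_status():
    return jsonify({
        "scraper_state": {
            "active": True,
            "paused": False,
        }
    })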
@@ -499,7 +528,13 @@
 function startScraper() {
     console.log("Start button clicked - sending request to /scraper/start");

-    fetch('/scraper/start', { method: 'POST' })
+    fetch('/scraper/start', {
+        method: 'POST',
+        headers: {
+            'Content-Type': 'application/json'
+        },
+        body: JSON.stringify({})
+    })
         .then(response => {
             console.log("Response received:", response);
             return response.json();
@@ -521,7 +556,13 @@
 }

 function togglePauseScraper() {
-    fetch('/scraper/pause', { method: 'POST' })
+    fetch('/scraper/pause', {
+        method: 'POST',
+        headers: {
+            'Content-Type': 'application/json'
+        },
+        body: JSON.stringify({})
+    })
         .then(response => response.json())
         .then(data => {
             if (data.success) {
@@ -535,7 +576,13 @@
 }

 function stopScraper() {
-    fetch('/scraper/stop', { method: 'POST' })
+    fetch('/scraper/stop', {
+        method: 'POST',
+        headers: {
+            'Content-Type': 'application/json'
+        },
+        body: JSON.stringify({})
+    })
         .then(response => response.json())
         .then(data => {
             if (data.success) {
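Side note on the explicit Content-Type header and empty JSON body added to these three POST requests: they let a Flask handler read the request with request.get_json() without content-type errors. A minimal, assumed server-side pattern is sketched below; the actual /scraper/start, /scraper/pause and /scraper/stop views are not shown in this diff.

# Hypothetical handler sketch -- the real scraper control routes are not part of
# this hunk; the point is the tolerant JSON parsing.
from flask import request, jsonify

def start_scraper():
    payload = request.get_json(silent=True) or {}  # tolerate an empty or missing body
    # ... trigger the scraper here, optionally using values from payload ...
    return jsonify({"success": True})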
@@ -706,14 +753,28 @@
 // Load data functions
 function loadActivityStats(hours) {
     fetch(`/scraper/stats?hours=${hours}`)
-        .then(response => response.json())
+        .then(response => {
+            if (!response.ok) {
+                throw new Error(`HTTP error! status: ${response.status}`);
+            }
+            return response.json();
+        })
         .then(data => {
+            console.log('Stats data loaded:', data);
             renderActivityChart(data);
+        })
+        .catch(error => {
+            console.error('Failed to load activity stats:', error);
+            // Hide the chart or show an error message
+            const chartContainer = document.getElementById('activityChart').parentElement;
+            if (chartContainer) {
+                chartContainer.innerHTML = '<p class="text-muted">Chart data unavailable</p>';
+            }
         });
 }

 function loadRecentActivity() {
-    fetch('/api/activity_logs?category=scraper_activity&limit=20')
+    fetch('/api/activity_logs?category=scraper_activity&category=scraper_command&limit=50')
         .then(response => response.json())
         .then(data => {
             renderActivityLog(data);
@@ -728,7 +789,19 @@

 // Rendering functions
 function renderActivityChart(data) {
-    const ctx = document.getElementById('activityChart').getContext('2d');
+    // Check if Chart.js is available
+    if (typeof Chart === 'undefined') {
+        console.error('Chart.js is not loaded');
+        return;
+    }
+
+    const chartElement = document.getElementById('activityChart');
+    if (!chartElement) {
+        console.error('Chart canvas element not found');
+        return;
+    }
+
+    const ctx = chartElement.getContext('2d');

     // Extract the data for the chart
     const labels = data.map(item => `${item.hour}:00`);
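For reference, renderActivityChart maps over the stats payload and reads item.hour, so /scraper/stats is expected to return a JSON array with one object per hour. Only the hour field is implied by the code shown; the count field in the sketch below is an assumption.

# Hypothetical /scraper/stats response -- `hour` is implied by the template code,
# the `count` key is illustrative only.
from flask import jsonify

def scraper_stats():
    return jsonify([
        {"hour": 9, "count": 12},
        {"hour": 10, "count": 8},
    ])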
@@ -857,7 +930,7 @@
 let lastPaperTimestamp = new Date().toISOString();

 function checkForNewPapers() {
-    fetch(`/api/activity_logs?category=scraper_activity&action=scrape_paper&after=${lastPaperTimestamp}&limit=5`)
+    fetch(`/api/activity_logs?category=scraper_activity&category=scraper_command&action=scrape_paper&after=${lastPaperTimestamp}&limit=5`)
         .then(response => response.json())
         .then(data => {
             if (data && data.length > 0) {