CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-jupyterhub

A multi-user server for Jupyter notebooks that provides authentication, spawning, and proxying for multiple users simultaneously

Pending
Overview
Eval results
Files

docs/monitoring-metrics.md

Monitoring and Metrics

JupyterHub provides comprehensive monitoring and metrics collection capabilities through Prometheus integration. The system tracks user activity, server performance, resource usage, and system health for operational visibility and capacity planning.

Capabilities

Prometheus Metrics

Core Prometheus metrics exposed by JupyterHub for monitoring and alerting.

# User metrics
# NOTE: TOTAL_USERS is written with .set() by the periodic collector, so it
# must be a Gauge — prometheus_client Counters only support .inc().
TOTAL_USERS: Gauge = Gauge(
    'jupyterhub_total_users',
    'Total number of users in JupyterHub database'
)

# Users with activity inside the trailing 24-hour window; updated by the
# periodic collector alongside TOTAL_USERS.
ACTIVE_USERS: Gauge = Gauge(
    'jupyterhub_active_users',
    'Number of users active in the last 24 hours'
)

RUNNING_SERVERS: Gauge = Gauge(
    'jupyterhub_running_servers',
    'Number of currently running servers'
)

PENDING_SERVERS: Gauge = Gauge(
    'jupyterhub_pending_servers',
    'Number of servers in pending state'
)

# Request metrics
REQUEST_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_request_duration_seconds',
    'Time spent handling HTTP requests',
    ['method', 'handler', 'code']
)

REQUEST_COUNT: Counter = Counter(
    'jupyterhub_request_count_total',
    'Total number of HTTP requests',
    ['method', 'handler', 'code']
)

# Authentication metrics
LOGIN_SUCCESS: Counter = Counter(
    'jupyterhub_login_success_total',
    'Total number of successful logins'
)

LOGIN_FAILURE: Counter = Counter(
    'jupyterhub_login_failure_total',
    'Total number of failed logins'
)

# Spawner metrics
SPAWN_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_spawn_duration_seconds',
    'Time spent spawning servers',
    ['spawner_class']
)

SPAWN_SUCCESS: Counter = Counter(
    'jupyterhub_spawn_success_total',
    'Total number of successful server spawns',
    ['spawner_class']
)

SPAWN_FAILURE: Counter = Counter(
    'jupyterhub_spawn_failure_total',
    'Total number of failed server spawns',
    ['spawner_class', 'error_type']
)

# Hub metrics
HUB_RESPONSE_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_hub_response_duration_seconds',
    'Time for Hub to respond to requests'
)

API_REQUEST_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_api_request_duration_seconds',
    'Time spent handling API requests',
    ['method', 'endpoint', 'status']
)

# Hub process resource gauges, updated from psutil readings by the periodic
# collector (these names are also referenced by the alerting rules and the
# Grafana dashboard metric lists below).
HUB_MEMORY_USAGE_BYTES: Gauge = Gauge(
    'jupyterhub_hub_memory_usage_bytes',
    'Hub memory usage in bytes'
)

HUB_MEMORY_TOTAL_BYTES: Gauge = Gauge(
    'jupyterhub_hub_memory_total_bytes',
    'Total system memory in bytes'
)

HUB_CPU_USAGE_PERCENT: Gauge = Gauge(
    'jupyterhub_hub_cpu_usage_percent',
    'Hub CPU usage percent'
)

Metrics Collection System

Automated metrics collection and periodic updates.

class PeriodicMetricsCollector:
    """
    Periodic metrics collector for JupyterHub system statistics.

    Collects and updates Prometheus metrics at regular intervals to provide
    current system state information.
    """

    def __init__(self, app, interval: int = 60):
        """
        Initialize metrics collector.

        Args:
            app: JupyterHub application instance (provides .db, .spawners,
                and optionally .log)
            interval: Collection interval in seconds
        """
        self.app = app
        self.interval = interval
        # Loop flag; flipped by stop() to end the start() loop.
        self.running = False

    async def start(self):
        """
        Start periodic metrics collection.

        Runs until stop() is called. A failing collection pass is logged
        (when the app exposes a logger) and skipped, so a transient error
        such as a dropped database connection no longer kills the loop
        permanently, as the original unguarded loop would.
        """
        self.running = True
        while self.running:
            try:
                await self.collect_metrics()
            except Exception as e:
                # Best effort: record the problem and retry on the next tick.
                log = getattr(self.app, 'log', None)
                if log is not None:
                    log.warning("Metrics collection failed: %s", e)
            await asyncio.sleep(self.interval)

    def stop(self):
        """Stop metrics collection after the current pass finishes."""
        self.running = False

    async def collect_metrics(self):
        """
        Collect and update all metrics.

        Gathers current system state and updates Prometheus metrics.
        """
        await self.collect_user_metrics()
        await self.collect_server_metrics()
        await self.collect_hub_metrics()

    async def collect_user_metrics(self):
        """Collect user-related metrics."""
        # Total users registered in the hub database
        total_users = self.app.db.query(User).count()
        TOTAL_USERS.set(total_users)

        # Active users: anyone with activity in the trailing 24 hours
        cutoff = datetime.utcnow() - timedelta(hours=24)
        active_users = self.app.db.query(User).filter(
            User.last_activity > cutoff
        ).count()

        ACTIVE_USERS.set(active_users)

    async def collect_server_metrics(self):
        """Collect server-related metrics."""
        # A server row with a URL is considered running
        running_servers = self.app.db.query(Server).filter(
            Server.url.isnot(None)
        ).count()
        RUNNING_SERVERS.set(running_servers)

        # Spawners mid-start/stop report a truthy .pending state
        pending_servers = len([
            spawner for spawner in self.app.spawners.values()
            if spawner.pending
        ])
        PENDING_SERVERS.set(pending_servers)

    async def collect_hub_metrics(self):
        """Collect Hub system metrics via psutil."""
        # Imported lazily so the collector is usable without psutil until
        # hub resource metrics are actually requested.
        import psutil

        # Memory usage — NOTE(review): virtual_memory() is system-wide, not
        # hub-process-specific; confirm that is the intended measurement.
        memory = psutil.virtual_memory()
        HUB_MEMORY_USAGE_BYTES.set(memory.used)
        HUB_MEMORY_TOTAL_BYTES.set(memory.total)

        # CPU usage since the previous psutil.cpu_percent() call
        cpu_percent = psutil.cpu_percent()
        HUB_CPU_USAGE_PERCENT.set(cpu_percent)

Custom Metrics Integration

Tools for adding custom metrics to JupyterHub applications.

from prometheus_client import Counter, Gauge, Histogram, Summary

# Custom metric definitions
# Counter: monotonically increasing event tally, labelled per event type/user.
CUSTOM_COUNTER: Counter = Counter(
    'jupyterhub_custom_events_total',
    'Total custom events',
    ['event_type', 'user']
)

# Gauge: point-in-time resource reading that may go up or down.
CUSTOM_GAUGE: Gauge = Gauge(
    'jupyterhub_custom_resource_usage',
    'Custom resource usage',
    ['resource_type', 'user']
)

# Histogram: latency distribution (exposes _bucket/_sum/_count series).
CUSTOM_HISTOGRAM: Histogram = Histogram(
    'jupyterhub_custom_operation_duration_seconds',
    'Custom operation duration',
    ['operation', 'status']
)

def record_custom_event(event_type: str, user: str | None = None):
    """
    Record a custom event metric.

    Args:
        event_type: Type of event to record
        user: Username associated with event (optional)
    """
    # Fall back to a fixed label value so the 'user' label is always
    # populated and the time-series label set stays well-defined.
    CUSTOM_COUNTER.labels(
        event_type=event_type,
        user=user or 'anonymous'
    ).inc()

def update_custom_gauge(resource_type: str, value: float, user: str | None = None):
    """
    Update a custom gauge metric.

    Args:
        resource_type: Type of resource being measured
        value: Current resource value
        user: Username associated with resource (optional)
    """
    # 'system' marks resources that are not attributable to a single user.
    CUSTOM_GAUGE.labels(
        resource_type=resource_type,
        user=user or 'system'
    ).set(value)

def time_custom_operation(operation: str, status: str = 'success'):
    """
    Time a custom operation.

    Args:
        operation: Name of the operation
        status: Operation status (success, error, etc.)

    Returns:
        Timer context manager
    """
    labelled = CUSTOM_HISTOGRAM.labels(operation=operation, status=status)
    # prometheus_client's .time() returns a timer usable as a context
    # manager (or decorator) that observes the elapsed duration on exit.
    return labelled.time()

Health Check System

Health monitoring and status reporting for JupyterHub components.

class HealthChecker:
    """
    Health check system for JupyterHub components.

    Provides endpoints and utilities for monitoring system health
    and component status.
    """

    def __init__(self, app):
        """
        Initialize health checker.

        Args:
            app: JupyterHub application instance
        """
        self.app = app
        # name -> {'func', 'interval', 'last_run', 'status'}
        self.checks = {}

    def register_check(self, name: str, check_func: callable, interval: int = 60):
        """
        Register a health check function.

        Args:
            name: Check name
            check_func: Async function returning a truthy value when healthy
            interval: Check interval in seconds
        """
        self.checks[name] = {
            'func': check_func,
            'interval': interval,
            'last_run': None,
            'status': 'unknown'
        }

    async def run_checks(self) -> Dict[str, Any]:
        """
        Run all registered health checks.

        Returns:
            Dictionary of check results with status and timing. A check
            that raises is reported with status 'error' rather than
            aborting the remaining checks.
        """
        results = {}

        for name, check in self.checks.items():
            try:
                # perf_counter is monotonic, so measured durations are
                # immune to wall-clock adjustments (unlike time.time()).
                start_time = time.perf_counter()
                status = await check['func']()
                duration = time.perf_counter() - start_time

                results[name] = {
                    'status': 'healthy' if status else 'unhealthy',
                    'duration': duration,
                    'timestamp': datetime.utcnow().isoformat()
                }
            except Exception as e:
                results[name] = {
                    'status': 'error',
                    'error': str(e),
                    'timestamp': datetime.utcnow().isoformat()
                }

        return results

    async def database_health_check(self) -> bool:
        """Check database connectivity with a trivial count query."""
        try:
            user_count = self.app.db.query(User).count()
            return user_count >= 0
        except Exception:
            return False

    async def spawner_health_check(self) -> bool:
        """Check spawner system health."""
        try:
            # Touching the spawner registry is enough to prove the
            # subsystem is reachable; the actual count is irrelevant.
            len(self.app.spawners)
            return True
        except Exception:
            return False

    async def proxy_health_check(self) -> bool:
        """Check proxy health and connectivity by listing its routes."""
        try:
            await self.app.proxy.get_routes()
            return True
        except Exception:
            return False

Usage Examples

Basic Metrics Integration

from jupyterhub.metrics import SPAWN_SUCCESS, SPAWN_FAILURE, SPAWN_DURATION_SECONDS
import time

class MonitoredSpawner(LocalProcessSpawner):
    """Spawner that records spawn outcome and timing metrics."""

    async def start(self):
        """Start the server, recording success/failure/duration metrics."""
        started_at = time.time()
        cls_name = type(self).__name__

        try:
            result = await super().start()
        except Exception as exc:
            # Failures are labelled by exception class so alerts can
            # distinguish error categories.
            SPAWN_FAILURE.labels(
                spawner_class=cls_name,
                error_type=type(exc).__name__
            ).inc()
            raise
        else:
            # Only successful spawns contribute to the duration histogram.
            SPAWN_SUCCESS.labels(spawner_class=cls_name).inc()
            SPAWN_DURATION_SECONDS.labels(spawner_class=cls_name).observe(
                time.time() - started_at
            )
            return result

Custom Metrics for User Activity

from prometheus_client import Counter, Histogram

# Custom user activity metrics
# NOTE(review): per-username labels create one time series per user, which
# can blow up cardinality on large hubs — confirm this is acceptable.
USER_LOGIN_COUNTER = Counter(
    'jupyterhub_user_login_total',
    'Total user logins',
    ['username', 'authenticator']
)

# Launch-time distribution, labelled by user and spawner implementation.
NOTEBOOK_LAUNCH_DURATION = Histogram(
    'jupyterhub_notebook_launch_duration_seconds',
    'Time to launch notebook server',
    ['username', 'spawner_type']
)

class MetricsAuthenticator(PAMAuthenticator):
    """Authenticator that counts successful logins per user."""

    async def authenticate(self, handler, data):
        """Authenticate via PAM, incrementing the login counter on success."""
        user = data.get('username', 'unknown')
        auth_name = type(self).__name__

        outcome = await super().authenticate(handler, data)

        if outcome:
            # Only successful authentications are counted here; failures
            # are left to the hub's own login-failure metric.
            USER_LOGIN_COUNTER.labels(
                username=user,
                authenticator=auth_name
            ).inc()

        return outcome

class MetricsSpawner(LocalProcessSpawner):
    """Spawner with launch time metrics.

    Launch duration is observed for successful and failed starts alike,
    so slow failures remain visible in the histogram.
    """

    async def start(self):
        """Start server with launch time tracking.

        Uses try/finally so the duration is recorded exactly once on every
        path, replacing the duplicated observe() calls on the success and
        failure branches.
        """
        start_time = time.time()
        username = self.user.name
        spawner_type = self.__class__.__name__

        try:
            return await super().start()
        finally:
            # Runs on success and on exception; re-raising is implicit.
            duration = time.time() - start_time
            NOTEBOOK_LAUNCH_DURATION.labels(
                username=username,
                spawner_type=spawner_type
            ).observe(duration)

Health Monitoring Setup

from jupyterhub.app import JupyterHub
from .monitoring import HealthChecker

class MonitoredJupyterHub(JupyterHub):
    """JupyterHub with health monitoring."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.health_checker = HealthChecker(self)
        # Reference to the background monitoring task (set in start()).
        self._health_task = None
        self.setup_health_checks()

    def setup_health_checks(self):
        """Register health check functions with per-check intervals."""
        self.health_checker.register_check(
            'database',
            self.health_checker.database_health_check,
            interval=30
        )

        self.health_checker.register_check(
            'proxy',
            self.health_checker.proxy_health_check,
            interval=60
        )

        self.health_checker.register_check(
            'spawners',
            self.health_checker.spawner_health_check,
            interval=120
        )

    async def start(self):
        """Start Hub, then launch the background health-monitoring task."""
        await super().start()

        # Keep a reference: the event loop holds only weak references to
        # tasks, so an unreferenced task may be garbage-collected mid-flight.
        self._health_task = asyncio.create_task(self.periodic_health_checks())

    async def periodic_health_checks(self):
        """Run health checks forever, logging any non-healthy result."""
        while True:
            try:
                health_results = await self.health_checker.run_checks()

                # Log health status for anything that is not healthy
                for check_name, result in health_results.items():
                    if result['status'] != 'healthy':
                        self.log.warning(f"Health check {check_name}: {result['status']}")

                await asyncio.sleep(60)
            except Exception as e:
                self.log.error(f"Health check error: {e}")
                await asyncio.sleep(300)  # Back off longer on unexpected errors

Grafana Dashboard Integration

# Example metrics for Grafana dashboard
# Metric names grouped by dashboard row; each list feeds one panel group.
DASHBOARD_METRICS = {
    # Who is using the hub and how often they log in
    'user_metrics': [
        'jupyterhub_total_users',
        'jupyterhub_active_users',
        'jupyterhub_user_login_total'
    ],
    # Server lifecycle: running/pending counts and spawn outcomes
    'server_metrics': [
        'jupyterhub_running_servers',
        'jupyterhub_pending_servers', 
        'jupyterhub_spawn_duration_seconds',
        'jupyterhub_spawn_success_total',
        'jupyterhub_spawn_failure_total'
    ],
    # Request/response latency histograms
    'performance_metrics': [
        'jupyterhub_request_duration_seconds',
        'jupyterhub_api_request_duration_seconds',
        'jupyterhub_hub_response_duration_seconds'
    ],
    # Hub host resource usage
    'system_metrics': [
        'jupyterhub_hub_memory_usage_bytes',
        'jupyterhub_hub_cpu_usage_percent'
    ]
}

def generate_grafana_queries():
    """Return example PromQL queries keyed by dashboard panel name."""
    # Logins accumulated over the trailing day
    active_users_24h = '''
            increase(jupyterhub_user_login_total[24h])
        '''
    # Mean spawn latency: histogram sum rate over count rate
    average_spawn_time = '''
            rate(jupyterhub_spawn_duration_seconds_sum[5m]) / 
            rate(jupyterhub_spawn_duration_seconds_count[5m])
        '''
    # Percentage of spawns that succeeded in the last 5 minutes
    server_success_rate = '''
            rate(jupyterhub_spawn_success_total[5m]) / 
            (rate(jupyterhub_spawn_success_total[5m]) + 
             rate(jupyterhub_spawn_failure_total[5m])) * 100
        '''
    # API requests per second
    api_request_rate = '''
            rate(jupyterhub_api_request_duration_seconds_count[5m])
        '''
    return {
        'active_users_24h': active_users_24h,
        'average_spawn_time': average_spawn_time,
        'server_success_rate': server_success_rate,
        'api_request_rate': api_request_rate,
    }

Alerting Configuration

# Prometheus alerting rules for JupyterHub
# Each entry maps to one Prometheus rule: 'expr' is the PromQL condition,
# 'for' is how long it must hold before firing, and severity/summary are
# the alert annotations.
ALERTING_RULES = {
    # Fires when more than 10% as many spawns fail as succeed
    'high_spawn_failure_rate': {
        'expr': '''
            rate(jupyterhub_spawn_failure_total[5m]) / 
            rate(jupyterhub_spawn_success_total[5m]) > 0.1
        ''',
        'for': '5m',
        'severity': 'warning',
        'summary': 'High spawn failure rate detected'
    },
    # Fires when memory usage exceeds 90% of total
    'hub_memory_high': {
        'expr': '''
            jupyterhub_hub_memory_usage_bytes / 
            jupyterhub_hub_memory_total_bytes > 0.9
        ''',
        'for': '2m',
        'severity': 'critical',
        'summary': 'Hub memory usage critical'
    },
    # Fires when no single-user server has been running for 10 minutes
    'no_running_servers': {
        'expr': 'jupyterhub_running_servers == 0',
        'for': '10m',
        'severity': 'warning',
        'summary': 'No servers currently running'
    }
}

class AlertManager:
    """Evaluates metric alert rules and dispatches webhook notifications."""

    def __init__(self, webhook_url=None):
        # Destination for alert payloads; notifications are skipped when unset.
        self.webhook_url = webhook_url
        self.alerts = {}

    async def check_alerts(self, metrics):
        """Evaluate every configured rule against the given metrics."""
        for alert_name, config in ALERTING_RULES.items():
            triggered = self.evaluate_condition(config['expr'], metrics)
            if triggered:
                await self.trigger_alert(alert_name, config)

    def evaluate_condition(self, expr, metrics):
        """Evaluate alert expression against metrics.

        Placeholder: a real implementation would evaluate the PromQL
        expression against the supplied metrics; this stub never fires.
        """
        return False

    async def trigger_alert(self, name, config):
        """POST an alert payload to the configured webhook, if any."""
        if not self.webhook_url:
            return

        alert_data = {
            'alert': name,
            'severity': config['severity'],
            'summary': config['summary'],
            'timestamp': datetime.utcnow().isoformat()
        }

        async with aiohttp.ClientSession() as session:
            await session.post(self.webhook_url, json=alert_data)

Advanced Monitoring Patterns

Multi-dimensional Metrics

# Resource usage metrics per user and server
# One gauge with a resource_type label instead of a gauge per resource kind;
# server_name may be empty for the user's default server.
RESOURCE_USAGE = Gauge(
    'jupyterhub_resource_usage',
    'Resource usage by dimension',
    ['resource_type', 'username', 'server_name']
)

def update_resource_metrics(user, server_name=''):
    """Refresh the CPU and memory usage gauges for one user's server."""
    # Readings come from deployment-specific helpers; cpu first, then
    # memory, matching the gauge update order.
    usage_by_type = {
        'cpu': get_cpu_usage(user.name, server_name),
        'memory': get_memory_usage(user.name, server_name),
    }

    for resource_type, value in usage_by_type.items():
        RESOURCE_USAGE.labels(
            resource_type=resource_type,
            username=user.name,
            server_name=server_name
        ).set(value)

Event-Based Metrics

from jupyterhub.metrics import record_custom_event

class EventMetricsHub(JupyterHub):
    """Hub with event-based metrics collection."""

    async def login_user(self, user):
        """Login user, recording a 'user_login' event on completion."""
        result = await super().login_user(user)

        # Record login event
        record_custom_event('user_login', user.name)

        return result

    async def spawn_server(self, user, server_name=''):
        """Spawn server, recording start/success/failure events."""
        record_custom_event('server_spawn_start', user.name)

        try:
            result = await super().spawn_server(user, server_name)
            record_custom_event('server_spawn_success', user.name)
            return result
        except Exception:
            # The exception object was previously bound but unused;
            # re-raise unchanged after recording the failure event.
            record_custom_event('server_spawn_failure', user.name)
            raise

Install with Tessl CLI

npx tessl i tessl/pypi-jupyterhub

docs

authentication.md

configuration-utilities.md

core-application.md

database-models.md

index.md

monitoring-metrics.md

rbac-permissions.md

rest-api.md

services-oauth.md

singleuser-integration.md

spawners.md

tile.json