A multi-user server for Jupyter notebooks that provides authentication, spawning, and proxying for multiple users simultaneously
—
JupyterHub provides comprehensive monitoring and metrics collection capabilities through Prometheus integration. The system tracks user activity, server performance, resource usage, and system health for operational visibility and capacity planning.
Core Prometheus metrics exposed by JupyterHub for monitoring and alerting.
# Gauge metrics (current state; values can go up and down).
# NOTE: TOTAL_USERS is a Gauge rather than a Counter because the periodic
# collector updates it with .set() -- prometheus_client Counters only
# support .inc(), so declaring it as a Counter would raise at runtime.
TOTAL_USERS: Gauge = Gauge(
    'jupyterhub_total_users',
    'Total number of users in JupyterHub database'
)
RUNNING_SERVERS: Gauge = Gauge(
    'jupyterhub_running_servers',
    'Number of currently running servers'
)
PENDING_SERVERS: Gauge = Gauge(
    'jupyterhub_pending_servers',
    'Number of servers in pending state'
)

# Request metrics, labelled by HTTP method, handler and response code.
REQUEST_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_request_duration_seconds',
    'Time spent handling HTTP requests',
    ['method', 'handler', 'code']
)
REQUEST_COUNT: Counter = Counter(
    'jupyterhub_request_count_total',
    'Total number of HTTP requests',
    ['method', 'handler', 'code']
)

# Authentication metrics.
LOGIN_SUCCESS: Counter = Counter(
    'jupyterhub_login_success_total',
    'Total number of successful logins'
)
LOGIN_FAILURE: Counter = Counter(
    'jupyterhub_login_failure_total',
    'Total number of failed logins'
)

# Spawner metrics, labelled by spawner class (and error type on failure).
SPAWN_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_spawn_duration_seconds',
    'Time spent spawning servers',
    ['spawner_class']
)
SPAWN_SUCCESS: Counter = Counter(
    'jupyterhub_spawn_success_total',
    'Total number of successful server spawns',
    ['spawner_class']
)
SPAWN_FAILURE: Counter = Counter(
    'jupyterhub_spawn_failure_total',
    'Total number of failed server spawns',
    ['spawner_class', 'error_type']
)

# Hub metrics.
HUB_RESPONSE_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_hub_response_duration_seconds',
    'Time for Hub to respond to requests'
)
API_REQUEST_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_api_request_duration_seconds',
    'Time spent handling API requests',
    ['method', 'endpoint', 'status']
)

# Automated metrics collection and periodic updates.
class PeriodicMetricsCollector:
    """
    Periodic metrics collector for JupyterHub system statistics.

    Collects and updates metrics at regular intervals to provide
    current system state information.
    """

    def __init__(self, app, interval: int = 60):
        """
        Initialize metrics collector.

        Args:
            app: JupyterHub application instance
            interval: Collection interval in seconds
        """
        self.app = app
        self.interval = interval
        self.running = False

    async def start(self):
        """Start periodic metrics collection.

        Runs until stop() is called.  A failure in a single collection pass
        is logged and the loop continues -- previously any exception raised
        by collect_metrics() silently terminated collection for good.
        """
        import logging
        log = logging.getLogger(__name__)
        self.running = True
        while self.running:
            try:
                await self.collect_metrics()
            except Exception:
                # Keep collecting on the next interval even if one pass fails.
                log.exception("Periodic metrics collection failed")
            await asyncio.sleep(self.interval)

    def stop(self):
        """Stop metrics collection"""
        self.running = False

    async def collect_metrics(self):
        """
        Collect and update all metrics.

        Gathers current system state and updates Prometheus metrics.
        """
        await self.collect_user_metrics()
        await self.collect_server_metrics()
        await self.collect_hub_metrics()

    async def collect_user_metrics(self):
        """Collect user-related metrics (total and recently-active users)."""
        # Total users in the database.
        total_users = self.app.db.query(User).count()
        TOTAL_USERS.set(total_users)
        # Active users: any recorded activity within the last 24 hours.
        cutoff = datetime.utcnow() - timedelta(hours=24)
        active_users = self.app.db.query(User).filter(
            User.last_activity > cutoff
        ).count()
        ACTIVE_USERS.set(active_users)

    async def collect_server_metrics(self):
        """Collect server-related metrics (running and pending servers)."""
        # A server row with a non-null URL is counted as running.
        running_servers = self.app.db.query(Server).filter(
            Server.url.isnot(None)
        ).count()
        RUNNING_SERVERS.set(running_servers)
        # Spawners flagged as pending are still starting or stopping.
        pending_servers = len([
            spawner for spawner in self.app.spawners.values()
            if spawner.pending
        ])
        PENDING_SERVERS.set(pending_servers)

    async def collect_hub_metrics(self):
        """Collect Hub process/system metrics (memory and CPU via psutil)."""
        # Imported lazily so the collector can be constructed without psutil.
        import psutil

        memory = psutil.virtual_memory()
        HUB_MEMORY_USAGE_BYTES.set(memory.used)
        HUB_MEMORY_TOTAL_BYTES.set(memory.total)
        cpu_percent = psutil.cpu_percent()
        HUB_CPU_USAGE_PERCENT.set(cpu_percent)

# Tools for adding custom metrics to JupyterHub applications.
from prometheus_client import Counter, Gauge, Histogram, Summary

# --- Custom metric definitions ---------------------------------------------

# Count of arbitrary application events, keyed by event type and user.
CUSTOM_COUNTER: Counter = Counter(
    'jupyterhub_custom_events_total',
    'Total custom events',
    ['event_type', 'user'],
)

# Point-in-time resource usage, keyed by resource type and user.
CUSTOM_GAUGE: Gauge = Gauge(
    'jupyterhub_custom_resource_usage',
    'Custom resource usage',
    ['resource_type', 'user'],
)

# Latency distribution for arbitrary operations, keyed by operation/status.
CUSTOM_HISTOGRAM: Histogram = Histogram(
    'jupyterhub_custom_operation_duration_seconds',
    'Custom operation duration',
    ['operation', 'status'],
)
def record_custom_event(event_type: str, user: "str | None" = None):
    """
    Record a custom event metric.

    Increments CUSTOM_COUNTER for the given event type; events without a
    user are attributed to the 'anonymous' label.

    Args:
        event_type: Type of event to record
        user: Username associated with event (optional)
    """
    # Annotation made explicitly Optional (as a string form, so no typing
    # import is needed): `user: str = None` was an implicit Optional,
    # which modern type checkers reject.
    CUSTOM_COUNTER.labels(
        event_type=event_type,
        user=user or 'anonymous'
    ).inc()
def update_custom_gauge(resource_type: str, value: float, user: "str | None" = None):
    """
    Update a custom gauge metric.

    Sets CUSTOM_GAUGE for the given resource type; values without a user
    are attributed to the 'system' label.

    Args:
        resource_type: Type of resource being measured
        value: Current resource value
        user: Username associated with resource (optional)
    """
    # Annotation made explicitly Optional (string form, no import needed):
    # `user: str = None` was an implicit Optional.
    CUSTOM_GAUGE.labels(
        resource_type=resource_type,
        user=user or 'system'
    ).set(value)
def time_custom_operation(operation: str, status: str = 'success'):
    """
    Decorator to time custom operations.

    Args:
        operation: Name of the operation
        status: Operation status (success, error, etc.)

    Returns:
        Timer context manager
    """
    # Resolve the labelled child first, then hand back its timer object.
    labelled = CUSTOM_HISTOGRAM.labels(operation=operation, status=status)
    return labelled.time()

# Health monitoring and status reporting for JupyterHub components.
class HealthChecker:
    """
    Health check system for JupyterHub components.

    Provides endpoints and utilities for monitoring system health
    and component status.
    """

    def __init__(self, app):
        """
        Initialize health checker.

        Args:
            app: JupyterHub application instance
        """
        self.app = app
        # name -> {'func', 'interval', 'last_run', 'status'}
        self.checks = {}

    def register_check(self, name: str, check_func: callable, interval: int = 60):
        """
        Register a health check function.

        Args:
            name: Check name
            check_func: Async callable returning a truthy health status
            interval: Check interval in seconds
        """
        self.checks[name] = {
            'func': check_func,
            'interval': interval,
            'last_run': None,
            'status': 'unknown'
        }

    async def run_checks(self) -> Dict[str, Any]:
        """
        Run all registered health checks.

        Also refreshes each registered check's stored 'status' and
        'last_run' fields -- previously these were initialized at
        registration but never updated.

        Returns:
            Dictionary of check results with status and timing
        """
        results = {}
        for name, check in self.checks.items():
            try:
                start_time = time.time()
                status = await check['func']()
                duration = time.time() - start_time
                results[name] = {
                    'status': 'healthy' if status else 'unhealthy',
                    'duration': duration,
                    'timestamp': datetime.utcnow().isoformat()
                }
            except Exception as e:
                results[name] = {
                    'status': 'error',
                    'error': str(e),
                    'timestamp': datetime.utcnow().isoformat()
                }
            # Keep the registry's view of each check current.
            check['status'] = results[name]['status']
            check['last_run'] = results[name]['timestamp']
        return results

    async def database_health_check(self) -> bool:
        """Check database connectivity and basic operations"""
        try:
            # A successful count() proves the connection and basic queries work.
            user_count = self.app.db.query(User).count()
            return user_count >= 0
        except Exception:
            return False

    async def spawner_health_check(self) -> bool:
        """Check spawner system health"""
        try:
            # Accessing the spawner registry proves the subsystem is reachable.
            len(self.app.spawners)
            return True
        except Exception:
            return False

    async def proxy_health_check(self) -> bool:
        """Check proxy health and connectivity"""
        try:
            # Fetching routes exercises the full proxy API round-trip.
            await self.app.proxy.get_routes()
            return True
        except Exception:
            return False
from jupyterhub.metrics import SPAWN_SUCCESS, SPAWN_FAILURE, SPAWN_DURATION_SECONDS
import time


class MonitoredSpawner(LocalProcessSpawner):
    """Spawner with metrics collection"""

    async def start(self):
        """Start server with metrics collection.

        Increments SPAWN_SUCCESS and observes SPAWN_DURATION_SECONDS on a
        successful start; on failure increments SPAWN_FAILURE (labelled
        with the exception class name) and re-raises.
        """
        start_time = time.time()
        spawner_class = self.__class__.__name__
        try:
            result = await super().start()
        except Exception as e:
            # Record the failure, labelled by exception type, then propagate.
            SPAWN_FAILURE.labels(
                spawner_class=spawner_class,
                error_type=type(e).__name__
            ).inc()
            raise
        # Duration is only observed for successful spawns.
        SPAWN_SUCCESS.labels(spawner_class=spawner_class).inc()
        SPAWN_DURATION_SECONDS.labels(spawner_class=spawner_class).observe(
            time.time() - start_time
        )
        return result

from prometheus_client import Counter, Histogram
# Custom user activity metrics.
# NOTE(review): labelling by username creates one time series per user --
# high-cardinality labels are a known Prometheus scaling concern; confirm
# the expected user count before enabling this in production.
USER_LOGIN_COUNTER = Counter(
    'jupyterhub_user_login_total',
    'Total user logins',
    ['username', 'authenticator']
)
# Launch-time distribution, per user and spawner implementation.
NOTEBOOK_LAUNCH_DURATION = Histogram(
    'jupyterhub_notebook_launch_duration_seconds',
    'Time to launch notebook server',
    ['username', 'spawner_type']
)
class MetricsAuthenticator(PAMAuthenticator):
    """Authenticator with login metrics"""

    async def authenticate(self, handler, data):
        """Authenticate with metrics collection"""
        username = data.get('username', 'unknown')
        result = await super().authenticate(handler, data)
        if result:
            # Count the successful login, labelled by authenticator class.
            USER_LOGIN_COUNTER.labels(
                username=username,
                authenticator=self.__class__.__name__
            ).inc()
        return result
class MetricsSpawner(LocalProcessSpawner):
    """Spawner with launch time metrics"""

    async def start(self):
        """Start server with launch time tracking.

        Observes NOTEBOOK_LAUNCH_DURATION for both successful and failed
        launch attempts; failures re-raise after the observation.
        """
        start_time = time.time()
        username = self.user.name
        spawner_type = self.__class__.__name__
        try:
            return await super().start()
        finally:
            # try/finally replaces the previously duplicated success/failure
            # blocks: the duration is observed exactly once on every path.
            duration = time.time() - start_time
            NOTEBOOK_LAUNCH_DURATION.labels(
                username=username,
                spawner_type=spawner_type
            ).observe(duration)

from jupyterhub.app import JupyterHub
from .monitoring import HealthChecker


class MonitoredJupyterHub(JupyterHub):
    """JupyterHub with health monitoring"""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.health_checker = HealthChecker(self)
        self.setup_health_checks()

    def setup_health_checks(self):
        """Register health check functions"""
        self.health_checker.register_check(
            'database',
            self.health_checker.database_health_check,
            interval=30
        )
        self.health_checker.register_check(
            'proxy',
            self.health_checker.proxy_health_check,
            interval=60
        )
        self.health_checker.register_check(
            'spawners',
            self.health_checker.spawner_health_check,
            interval=120
        )

    async def start(self):
        """Start Hub with health monitoring"""
        await super().start()
        # Keep a reference to the task: asyncio holds only weak references
        # to tasks, so an unreferenced task may be garbage-collected and
        # silently stop running.
        self._health_task = asyncio.create_task(self.periodic_health_checks())

    async def periodic_health_checks(self):
        """Run periodic health checks, logging any unhealthy component."""
        while True:
            try:
                health_results = await self.health_checker.run_checks()
                # Log anything that is not healthy.
                for check_name, result in health_results.items():
                    if result['status'] != 'healthy':
                        self.log.warning(f"Health check {check_name}: {result['status']}")
                await asyncio.sleep(60)
            except Exception as e:
                self.log.error(f"Health check error: {e}")
                await asyncio.sleep(300)  # Wait longer on error

# Example metrics for Grafana dashboard
# Metric names grouped by dashboard category -- one Grafana panel group each.
DASHBOARD_METRICS = {
    'user_metrics': [
        'jupyterhub_total_users',
        'jupyterhub_active_users',
        'jupyterhub_user_login_total',
    ],
    'server_metrics': [
        'jupyterhub_running_servers',
        'jupyterhub_pending_servers',
        'jupyterhub_spawn_duration_seconds',
        'jupyterhub_spawn_success_total',
        'jupyterhub_spawn_failure_total',
    ],
    'performance_metrics': [
        'jupyterhub_request_duration_seconds',
        'jupyterhub_api_request_duration_seconds',
        'jupyterhub_hub_response_duration_seconds',
    ],
    'system_metrics': [
        'jupyterhub_hub_memory_usage_bytes',
        'jupyterhub_hub_cpu_usage_percent',
    ],
}
def generate_grafana_queries():
    """Generate Grafana query examples.

    Returns:
        dict mapping a panel name to an example PromQL query string.
    """
    queries = {
        # Logins accumulated over the last 24 hours.
        'active_users_24h': '''
        increase(jupyterhub_user_login_total[24h])
        ''',
        # Mean spawn latency: histogram sum-rate divided by count-rate.
        'average_spawn_time': '''
        rate(jupyterhub_spawn_duration_seconds_sum[5m]) /
        rate(jupyterhub_spawn_duration_seconds_count[5m])
        ''',
        # Percentage of spawn attempts that succeed.
        'server_success_rate': '''
        rate(jupyterhub_spawn_success_total[5m]) /
        (rate(jupyterhub_spawn_success_total[5m]) +
        rate(jupyterhub_spawn_failure_total[5m])) * 100
        ''',
        # API throughput (requests per second over 5 minutes).
        'api_request_rate': '''
        rate(jupyterhub_api_request_duration_seconds_count[5m])
        '''
    }
    return queries

# Prometheus alerting rules for JupyterHub
ALERTING_RULES = {
'high_spawn_failure_rate': {
'expr': '''
rate(jupyterhub_spawn_failure_total[5m]) /
rate(jupyterhub_spawn_success_total[5m]) > 0.1
''',
'for': '5m',
'severity': 'warning',
'summary': 'High spawn failure rate detected'
},
'hub_memory_high': {
'expr': '''
jupyterhub_hub_memory_usage_bytes /
jupyterhub_hub_memory_total_bytes > 0.9
''',
'for': '2m',
'severity': 'critical',
'summary': 'Hub memory usage critical'
},
'no_running_servers': {
'expr': 'jupyterhub_running_servers == 0',
'for': '10m',
'severity': 'warning',
'summary': 'No servers currently running'
}
}
class AlertManager:
    """Alert management for JupyterHub metrics"""

    def __init__(self, webhook_url=None):
        # Optional webhook target; without one, triggered alerts are no-ops.
        self.webhook_url = webhook_url
        self.alerts = {}

    async def check_alerts(self, metrics):
        """Check metrics against alert conditions"""
        for alert_name, config in ALERTING_RULES.items():
            triggered = self.evaluate_condition(config['expr'], metrics)
            if triggered:
                await self.trigger_alert(alert_name, config)

    def evaluate_condition(self, expr, metrics):
        """Evaluate alert expression against metrics"""
        # Implementation depends on metrics evaluation system
        return False

    async def trigger_alert(self, name, config):
        """Trigger alert notification"""
        if not self.webhook_url:
            return
        alert_data = {
            'alert': name,
            'severity': config['severity'],
            'summary': config['summary'],
            'timestamp': datetime.utcnow().isoformat()
        }
        # Send webhook notification
        async with aiohttp.ClientSession() as session:
            await session.post(self.webhook_url, json=alert_data)

# Resource usage metrics per user and server
# Per-user/per-server resource usage as a single gauge with a
# 'resource_type' dimension ('cpu' or 'memory' below).
RESOURCE_USAGE = Gauge(
    'jupyterhub_resource_usage',
    'Resource usage by dimension',
    ['resource_type', 'username', 'server_name']
)


def update_resource_metrics(user, server_name=''):
    """Update resource usage metrics for user.

    Args:
        user: user object; only ``user.name`` is read here
        server_name: named-server identifier ('' for the default server)
    """
    # Get resource usage (implementation specific).
    # NOTE(review): get_cpu_usage/get_memory_usage are assumed to be
    # provided elsewhere and to return numeric values -- confirm their
    # units (cores vs percent, bytes vs MB) before dashboarding.
    cpu_usage = get_cpu_usage(user.name, server_name)
    memory_usage = get_memory_usage(user.name, server_name)
    # Update metrics
    RESOURCE_USAGE.labels(
        resource_type='cpu',
        username=user.name,
        server_name=server_name
    ).set(cpu_usage)
    RESOURCE_USAGE.labels(
        resource_type='memory',
        username=user.name,
        server_name=server_name
    ).set(memory_usage)

from jupyterhub.metrics import record_custom_event
class EventMetricsHub(JupyterHub):
    """Hub with event-based metrics collection"""

    async def login_user(self, user):
        """Login user with event metrics"""
        result = await super().login_user(user)
        # Record the login as a custom event attributed to the user.
        record_custom_event('user_login', user.name)
        return result

    async def spawn_server(self, user, server_name=''):
        """Spawn server with event metrics.

        Emits start/success/failure events around the spawn; failures
        re-raise after being recorded.
        """
        record_custom_event('server_spawn_start', user.name)
        try:
            result = await super().spawn_server(user, server_name)
        except Exception:
            # The exception binding (`as e`) was unused; record the failure
            # event and propagate unchanged.
            record_custom_event('server_spawn_failure', user.name)
            raise
        record_custom_event('server_spawn_success', user.name)
        return result
# Install with Tessl CLI:
#   npx tessl i tessl/pypi-jupyterhub