# Monitoring and Metrics

JupyterHub provides comprehensive monitoring and metrics collection capabilities through Prometheus integration. The system tracks user activity, server performance, resource usage, and system health for operational visibility and capacity planning.

## Capabilities

### Prometheus Metrics

Core Prometheus metrics exposed by JupyterHub for monitoring and alerting.

```python { .api }
11
# User and server state metrics
# These hold a current value that is overwritten on every collection pass
# via ``.set()``, so they must be gauges: a Prometheus Counter can only be
# incremented and has no ``set()`` method.
TOTAL_USERS: Gauge = Gauge(
    'jupyterhub_total_users',
    'Total number of users in JupyterHub database'
)

ACTIVE_USERS: Gauge = Gauge(
    'jupyterhub_active_users',
    'Number of users with activity in the last 24 hours'
)

RUNNING_SERVERS: Gauge = Gauge(
    'jupyterhub_running_servers',
    'Number of currently running servers'
)

PENDING_SERVERS: Gauge = Gauge(
    'jupyterhub_pending_servers',
    'Number of servers in pending state'
)

# Request metrics
REQUEST_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_request_duration_seconds',
    'Time spent handling HTTP requests',
    ['method', 'handler', 'code']
)

REQUEST_COUNT: Counter = Counter(
    'jupyterhub_request_count_total',
    'Total number of HTTP requests',
    ['method', 'handler', 'code']
)

# Authentication metrics
LOGIN_SUCCESS: Counter = Counter(
    'jupyterhub_login_success_total',
    'Total number of successful logins'
)

LOGIN_FAILURE: Counter = Counter(
    'jupyterhub_login_failure_total',
    'Total number of failed logins'
)

# Spawner metrics
SPAWN_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_spawn_duration_seconds',
    'Time spent spawning servers',
    ['spawner_class']
)

SPAWN_SUCCESS: Counter = Counter(
    'jupyterhub_spawn_success_total',
    'Total number of successful server spawns',
    ['spawner_class']
)

SPAWN_FAILURE: Counter = Counter(
    'jupyterhub_spawn_failure_total',
    'Total number of failed server spawns',
    ['spawner_class', 'error_type']
)

# Hub metrics
HUB_RESPONSE_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_hub_response_duration_seconds',
    'Time for Hub to respond to requests'
)

API_REQUEST_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_api_request_duration_seconds',
    'Time spent handling API requests',
    ['method', 'endpoint', 'status']
)

# Resource metrics written by the periodic collector
HUB_MEMORY_USAGE_BYTES: Gauge = Gauge(
    'jupyterhub_hub_memory_usage_bytes',
    'Memory in use as reported by psutil.virtual_memory(), in bytes'
)

HUB_MEMORY_TOTAL_BYTES: Gauge = Gauge(
    'jupyterhub_hub_memory_total_bytes',
    'Total memory as reported by psutil.virtual_memory(), in bytes'
)

HUB_CPU_USAGE_PERCENT: Gauge = Gauge(
    'jupyterhub_hub_cpu_usage_percent',
    'CPU utilisation as reported by psutil.cpu_percent(), in percent'
)
81
```

### Metrics Collection System

Automated metrics collection and periodic updates.

```python { .api }
88
class PeriodicMetricsCollector:
    """
    Periodic metrics collector for JupyterHub system statistics.

    Runs ``collect_metrics()`` in a loop, sleeping ``interval`` seconds
    between passes, until ``stop()`` is called.
    """

    def __init__(self, app, interval: int = 60):
        """
        Initialize metrics collector.

        Args:
            app: JupyterHub application instance; the collector reads
                ``app.db`` and ``app.spawners`` from it
            interval: Collection interval in seconds
        """
        self.app = app
        self.interval = interval
        self.running = False

    async def start(self):
        """
        Run collection passes until ``stop()`` is called.

        A failing pass is logged and the loop carries on with the next
        interval, so one transient error (for example a database hiccup)
        does not permanently freeze every gauge at its last value.
        """
        import logging

        log = logging.getLogger(__name__)
        self.running = True
        while self.running:
            try:
                await self.collect_metrics()
            except Exception:
                log.exception(
                    "Metrics collection failed; retrying in %s seconds",
                    self.interval,
                )
            await asyncio.sleep(self.interval)

    def stop(self):
        """
        Ask the collection loop to exit.

        The loop checks the flag only after its current sleep finishes, so
        ``start()`` may take up to ``interval`` seconds to return.
        """
        self.running = False

    async def collect_metrics(self):
        """
        Collect and update all metrics.

        Runs the user, server and hub collectors in that order.
        """
        await self.collect_user_metrics()
        await self.collect_server_metrics()
        await self.collect_hub_metrics()

    async def collect_user_metrics(self):
        """Update the total-user and active-user gauges from the database"""
        # NOTE: ``.set()`` requires TOTAL_USERS to be a Gauge.
        total_users = self.app.db.query(User).count()
        TOTAL_USERS.set(total_users)

        # Active users: activity within the last 24 hours.
        # ``utcnow()`` yields a naive datetime.
        # NOTE(review): assumes User.last_activity is stored as naive UTC — confirm.
        cutoff = datetime.utcnow() - timedelta(hours=24)
        active_users = self.app.db.query(User).filter(
            User.last_activity > cutoff
        ).count()
        ACTIVE_USERS.set(active_users)

    async def collect_server_metrics(self):
        """Update the running-server and pending-server gauges"""
        # A server row with a non-null URL is counted as running
        running_servers = self.app.db.query(Server).filter(
            Server.url.isnot(None)
        ).count()
        RUNNING_SERVERS.set(running_servers)

        # Count pending spawners without building a throwaway list
        pending_servers = sum(
            1 for spawner in self.app.spawners.values() if spawner.pending
        )
        PENDING_SERVERS.set(pending_servers)

    async def collect_hub_metrics(self):
        """Update the memory and CPU gauges from psutil"""
        import psutil

        # NOTE(review): virtual_memory() and cpu_percent() describe the whole
        # machine, not just the Hub process, despite the HUB_* gauge names —
        # confirm this is the intended scope.
        memory = psutil.virtual_memory()
        HUB_MEMORY_USAGE_BYTES.set(memory.used)
        HUB_MEMORY_TOTAL_BYTES.set(memory.total)

        cpu_percent = psutil.cpu_percent()
        HUB_CPU_USAGE_PERCENT.set(cpu_percent)
172
```

### Custom Metrics Integration

Tools for adding custom metrics to JupyterHub applications.

```python { .api }
179
from prometheus_client import Counter, Gauge, Histogram, Summary
180
181
# Custom metric definitions
# Each metric declares its label names up front; callers supply one value
# per label through ``.labels(...)`` before recording.
CUSTOM_COUNTER: Counter = Counter(
    'jupyterhub_custom_events_total',
    'Total custom events',
    labelnames=['event_type', 'user'],
)

CUSTOM_GAUGE: Gauge = Gauge(
    'jupyterhub_custom_resource_usage',
    'Custom resource usage',
    labelnames=['resource_type', 'user'],
)

CUSTOM_HISTOGRAM: Histogram = Histogram(
    'jupyterhub_custom_operation_duration_seconds',
    'Custom operation duration',
    labelnames=['operation', 'status'],
)
199
200
def record_custom_event(event_type: str, user: str = None):
    """
    Increment the custom event counter by one.

    Args:
        event_type: Value for the ``event_type`` label
        user: Value for the ``user`` label; any falsy value (``None`` or
            an empty string) is recorded as ``'anonymous'``
    """
    user_label = user if user else 'anonymous'
    series = CUSTOM_COUNTER.labels(event_type=event_type, user=user_label)
    series.inc()
212
213
def update_custom_gauge(resource_type: str, value: float, user: str = None):
    """
    Set the custom gauge to a new value.

    Args:
        resource_type: Value for the ``resource_type`` label
        value: Reading to store in the gauge
        user: Value for the ``user`` label; any falsy value (``None`` or
            an empty string) is recorded as ``'system'``
    """
    user_label = user if user else 'system'
    series = CUSTOM_GAUGE.labels(resource_type=resource_type, user=user_label)
    series.set(value)
226
227
def time_custom_operation(operation: str, status: str = 'success'):
    """
    Build a timer that records into the custom operation histogram.

    The ``status`` label is fixed when this function is called, before the
    timed code runs, so it cannot reflect how the operation turned out.

    Args:
        operation: Value for the ``operation`` label
        status: Value for the ``status`` label (success, error, etc.)

    Returns:
        The object returned by the histogram's ``time()`` method, which
        prometheus_client documents as usable both as a context manager
        and as a function decorator
    """
    series = CUSTOM_HISTOGRAM.labels(operation=operation, status=status)
    return series.time()
242
```

### Health Check System

Health monitoring and status reporting for JupyterHub components.

```python { .api }
249
class HealthChecker:
    """
    Health check system for JupyterHub components.

    Keeps a registry of named check functions, runs them on demand and
    remembers the most recent outcome of each.
    """

    def __init__(self, app):
        """
        Initialize health checker.

        Args:
            app: JupyterHub application instance; the built-in checks read
                ``app.db``, ``app.spawners`` and ``app.proxy`` from it
        """
        self.app = app
        self.checks = {}

    def register_check(self, name: str, check_func: callable, interval: int = 60):
        """
        Register a health check function.

        Registering an existing name replaces the earlier entry.

        Args:
            name: Check name
            check_func: Zero-argument function or coroutine function whose
                truthy/falsy result means healthy/unhealthy
            interval: Intended check interval in seconds. It is stored with
                the check but not enforced here: ``run_checks()`` runs
                every registered check on each call.
        """
        self.checks[name] = {
            'func': check_func,
            'interval': interval,
            'last_run': None,
            'status': 'unknown',
        }

    async def run_checks(self) -> Dict[str, Any]:
        """
        Run all registered health checks.

        Each registry entry's ``last_run`` and ``status`` fields are updated
        with the outcome of this run.

        Returns:
            Dictionary mapping check name to a result with ``status``
            (``'healthy'``, ``'unhealthy'`` or ``'error'``), ``duration`` in
            seconds and an ISO-format ``timestamp``; error results also
            carry the exception text under ``error``
        """
        import inspect

        results = {}

        # Iterate over a snapshot: a check may register further checks while
        # it is awaited, which would otherwise break iteration of the dict.
        for name, check in list(self.checks.items()):
            # Monotonic clock, so a wall-clock adjustment cannot produce a
            # negative or inflated duration
            started = time.perf_counter()
            try:
                outcome = check['func']()
                # Accept plain functions as well as coroutine functions
                if inspect.isawaitable(outcome):
                    outcome = await outcome
                result = {'status': 'healthy' if outcome else 'unhealthy'}
            except Exception as e:
                result = {'status': 'error', 'error': str(e)}

            result['duration'] = time.perf_counter() - started
            result['timestamp'] = datetime.utcnow().isoformat()

            check['last_run'] = result['timestamp']
            check['status'] = result['status']
            results[name] = result

        return results

    async def database_health_check(self) -> bool:
        """Return True if a simple user-count query succeeds, else False"""
        try:
            user_count = self.app.db.query(User).count()
            return user_count >= 0
        except Exception:
            return False

    async def spawner_health_check(self) -> bool:
        """Return True if the app's spawner collection can be read, else False"""
        try:
            # Only probes that the collection is accessible; the size itself
            # is not used
            len(self.app.spawners)
            return True
        except Exception:
            return False

    async def proxy_health_check(self) -> bool:
        """Return True if the proxy answers a route listing, else False"""
        try:
            await self.app.proxy.get_routes()
            return True
        except Exception:
            return False
338
```

## Usage Examples

### Basic Metrics Integration

```python
345
from jupyterhub.metrics import SPAWN_SUCCESS, SPAWN_FAILURE, SPAWN_DURATION_SECONDS
346
import time
347
348
class MonitoredSpawner(LocalProcessSpawner):
    """Spawner that records spawn outcome and duration metrics"""

    async def start(self):
        """
        Start the server and record spawn metrics.

        A successful start increments the success counter and observes the
        elapsed time. A failure increments the failure counter, labelled
        with the exception class name, and the exception is re-raised.
        No duration is observed for failures.
        """
        began = time.time()
        class_label = self.__class__.__name__

        try:
            outcome = await super().start()

            SPAWN_SUCCESS.labels(spawner_class=class_label).inc()
            elapsed = time.time() - began
            SPAWN_DURATION_SECONDS.labels(spawner_class=class_label).observe(elapsed)

            return outcome

        except Exception as exc:
            failure_labels = {
                'spawner_class': class_label,
                'error_type': type(exc).__name__,
            }
            SPAWN_FAILURE.labels(**failure_labels).inc()
            raise
375
```

### Custom Metrics for User Activity

```python
380
from prometheus_client import Counter, Histogram
381
382
# Custom user activity metrics
# Both metrics carry a per-user label, so each distinct username produces
# its own time series.
USER_LOGIN_COUNTER = Counter(
    'jupyterhub_user_login_total',
    'Total user logins',
    labelnames=['username', 'authenticator'],
)

NOTEBOOK_LAUNCH_DURATION = Histogram(
    'jupyterhub_notebook_launch_duration_seconds',
    'Time to launch notebook server',
    labelnames=['username', 'spawner_type'],
)
394
395
class MetricsAuthenticator(PAMAuthenticator):
    """Authenticator that counts successful logins"""

    async def authenticate(self, handler, data):
        """
        Delegate to the parent authenticator and count successful logins.

        The counter is labelled with the username as submitted in ``data``
        (``'unknown'`` when the key is absent) and with this class's name.
        Nothing is recorded when the parent returns a falsy result.

        Returns:
            Whatever the parent ``authenticate`` returned, unchanged
        """
        submitted_name = data.get('username', 'unknown')
        class_label = self.__class__.__name__

        outcome = await super().authenticate(handler, data)
        if not outcome:
            return outcome

        login_series = USER_LOGIN_COUNTER.labels(
            username=submitted_name,
            authenticator=class_label,
        )
        login_series.inc()
        return outcome
413
414
class MetricsSpawner(LocalProcessSpawner):
    """Spawner that records how long each launch attempt takes"""

    async def start(self):
        """
        Start the server and observe the launch duration.

        The elapsed time is observed whether the parent ``start`` succeeds
        or raises an ``Exception``; in the latter case the exception is
        re-raised after the observation.
        """
        began = time.time()
        owner = self.user.name
        class_label = self.__class__.__name__

        def observe_elapsed():
            # Shared by the success and failure paths
            elapsed = time.time() - began
            NOTEBOOK_LAUNCH_DURATION.labels(
                username=owner,
                spawner_type=class_label,
            ).observe(elapsed)

        try:
            outcome = await super().start()
            observe_elapsed()
            return outcome
        except Exception:
            # Failed launch attempts are timed as well
            observe_elapsed()
            raise
442
```

### Health Monitoring Setup

```python
447
from jupyterhub.app import JupyterHub
448
from .monitoring import HealthChecker
449
450
class MonitoredJupyterHub(JupyterHub):
    """JupyterHub with health monitoring"""

    def __init__(self, **kwargs):
        """Create the Hub, attach a HealthChecker and register its checks"""
        super().__init__(**kwargs)
        self.health_checker = HealthChecker(self)
        # Handle of the background monitoring task, set by start()
        self._health_task = None
        self.setup_health_checks()

    def setup_health_checks(self):
        """Register the database, proxy and spawner health checks"""
        self.health_checker.register_check(
            'database',
            self.health_checker.database_health_check,
            interval=30
        )

        self.health_checker.register_check(
            'proxy',
            self.health_checker.proxy_health_check,
            interval=60
        )

        self.health_checker.register_check(
            'spawners',
            self.health_checker.spawner_health_check,
            interval=120
        )

    async def start(self):
        """Start the Hub, then launch health monitoring in the background"""
        await super().start()

        # Keep a strong reference to the task: the event loop holds only a
        # weak reference, so an unreferenced task may be garbage-collected
        # before it finishes.
        self._health_task = asyncio.create_task(self.periodic_health_checks())

    async def periodic_health_checks(self):
        """
        Run all health checks in an endless loop.

        Every check that is not healthy is logged as a warning. The loop
        sleeps 60 seconds after a normal pass and 300 seconds after a pass
        that raised.
        """
        while True:
            try:
                health_results = await self.health_checker.run_checks()

                # Log health status
                for check_name, result in health_results.items():
                    if result['status'] != 'healthy':
                        self.log.warning(f"Health check {check_name}: {result['status']}")

                await asyncio.sleep(60)
            except Exception as e:
                self.log.error(f"Health check error: {e}")
                await asyncio.sleep(300)  # Wait longer on error
500
```

### Grafana Dashboard Integration

```python
505
# Example metrics for Grafana dashboard
# Maps a dashboard section to the Prometheus metric names it displays.
DASHBOARD_METRICS = dict(
    user_metrics=[
        'jupyterhub_total_users',
        'jupyterhub_active_users',
        'jupyterhub_user_login_total',
    ],
    server_metrics=[
        'jupyterhub_running_servers',
        'jupyterhub_pending_servers',
        'jupyterhub_spawn_duration_seconds',
        'jupyterhub_spawn_success_total',
        'jupyterhub_spawn_failure_total',
    ],
    performance_metrics=[
        'jupyterhub_request_duration_seconds',
        'jupyterhub_api_request_duration_seconds',
        'jupyterhub_hub_response_duration_seconds',
    ],
    system_metrics=[
        'jupyterhub_hub_memory_usage_bytes',
        'jupyterhub_hub_cpu_usage_percent',
    ],
)
529
530
def generate_grafana_queries():
    """
    Generate Grafana query examples.

    Returns:
        Dictionary mapping a panel name to a PromQL expression string
    """
    queries = {
        # NOTE(review): this is the per-series increase in logins over 24h,
        # not a count of distinct users — confirm which the panel should show.
        'active_users_24h': '''
            increase(jupyterhub_user_login_total[24h])
        ''',
        # Both operands come from the same histogram and share its labels,
        # so they match series-for-series.
        'average_spawn_time': '''
            rate(jupyterhub_spawn_duration_seconds_sum[5m]) /
            rate(jupyterhub_spawn_duration_seconds_count[5m])
        ''',
        # The success and failure counters carry different label sets
        # (failures add ``error_type``), so each side is aggregated with
        # sum() before being combined; unaggregated, PromQL's one-to-one
        # label matching pairs no series and the panel stays empty.
        # ``or vector(0)`` keeps the result defined when no failure series
        # exists yet.
        'server_success_rate': '''
            sum(rate(jupyterhub_spawn_success_total[5m])) /
            (sum(rate(jupyterhub_spawn_success_total[5m])) +
            (sum(rate(jupyterhub_spawn_failure_total[5m])) or vector(0))) * 100
        ''',
        'api_request_rate': '''
            rate(jupyterhub_api_request_duration_seconds_count[5m])
        '''
    }
    return queries
550
```

### Alerting Configuration

```python
555
# Prometheus alerting rules for JupyterHub
# Maps a rule name to its PromQL expression, hold duration, severity and
# human-readable summary.
ALERTING_RULES = {
    'high_spawn_failure_rate': {
        # Failure rate = failures / (successes + failures). Each side is
        # aggregated with sum() because the two counters carry different
        # label sets (failures add ``error_type``) and would not match
        # series-for-series. ``or vector(0)`` keeps the rule able to fire
        # when every spawn is failing and no success series exists.
        'expr': '''
            sum(rate(jupyterhub_spawn_failure_total[5m])) /
            ((sum(rate(jupyterhub_spawn_success_total[5m])) or vector(0)) +
            sum(rate(jupyterhub_spawn_failure_total[5m]))) > 0.1
        ''',
        'for': '5m',
        'severity': 'warning',
        'summary': 'High spawn failure rate detected'
    },
    'hub_memory_high': {
        'expr': '''
            jupyterhub_hub_memory_usage_bytes /
            jupyterhub_hub_memory_total_bytes > 0.9
        ''',
        'for': '2m',
        'severity': 'critical',
        'summary': 'Hub memory usage critical'
    },
    'no_running_servers': {
        'expr': 'jupyterhub_running_servers == 0',
        'for': '10m',
        'severity': 'warning',
        'summary': 'No servers currently running'
    }
}
582
583
class AlertManager:
    """Alert management for JupyterHub metrics"""

    def __init__(self, webhook_url=None):
        """
        Args:
            webhook_url: URL that alert notifications are POSTed to; when
                falsy, triggered alerts are silently dropped
        """
        self.webhook_url = webhook_url
        self.alerts = {}

    async def check_alerts(self, metrics):
        """Trigger every rule in ALERTING_RULES whose condition holds"""
        for alert_name, config in ALERTING_RULES.items():
            if not self.evaluate_condition(config['expr'], metrics):
                continue
            await self.trigger_alert(alert_name, config)

    def evaluate_condition(self, expr, metrics):
        """
        Evaluate alert expression against metrics.

        Placeholder: always returns False, so no alert fires until this is
        replaced with a real evaluator.
        """
        # Implementation depends on metrics evaluation system
        return False

    async def trigger_alert(self, name, config):
        """
        POST an alert notification to the configured webhook.

        Does nothing when no webhook URL is configured. The payload carries
        the alert name, the rule's severity and summary, and a timestamp.
        """
        if not self.webhook_url:
            return

        payload = {
            'alert': name,
            'severity': config['severity'],
            'summary': config['summary'],
            'timestamp': datetime.utcnow().isoformat(),
        }

        # Send webhook notification
        async with aiohttp.ClientSession() as session:
            await session.post(self.webhook_url, json=payload)
615
```

## Advanced Monitoring Patterns

### Multi-dimensional Metrics

```python
622
# Resource usage metrics per user and server
# One gauge covers every resource kind; the ``resource_type`` label tells
# the readings apart.
RESOURCE_USAGE = Gauge(
    'jupyterhub_resource_usage',
    'Resource usage by dimension',
    labelnames=['resource_type', 'username', 'server_name'],
)
628
629
def update_resource_metrics(user, server_name=''):
    """
    Record the current CPU and memory readings for one user's server.

    Args:
        user: Object whose ``name`` attribute supplies the ``username`` label
        server_name: Value for the ``server_name`` label
    """
    # Readings come from implementation-specific helpers; both are taken
    # before either gauge is written
    readings = (
        ('cpu', get_cpu_usage(user.name, server_name)),
        ('memory', get_memory_usage(user.name, server_name)),
    )

    for resource_type, reading in readings:
        RESOURCE_USAGE.labels(
            resource_type=resource_type,
            username=user.name,
            server_name=server_name,
        ).set(reading)
647
```

### Event-Based Metrics

```python
652
from jupyterhub.metrics import record_custom_event
653
654
class EventMetricsHub(JupyterHub):
    """Hub that records a custom event around logins and server spawns"""

    async def login_user(self, user):
        """
        Log the user in, then record a ``user_login`` event.

        No event is recorded if the parent call raises.
        """
        outcome = await super().login_user(user)
        record_custom_event('user_login', user.name)
        return outcome

    async def spawn_server(self, user, server_name=''):
        """
        Spawn a server and record start, success and failure events.

        A ``server_spawn_start`` event is always recorded first. It is
        followed by ``server_spawn_success`` when the parent call returns,
        or by ``server_spawn_failure`` when it raises an ``Exception``,
        which is then re-raised.
        """
        owner = user.name
        record_custom_event('server_spawn_start', owner)

        try:
            outcome = await super().spawn_server(user, server_name)
            record_custom_event('server_spawn_success', owner)
            return outcome
        except Exception:
            record_custom_event('server_spawn_failure', owner)
            raise
677
```