Tessl Tile for pypi/jupyterhub@5.3.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

authentication.md configuration-utilities.md core-application.md database-models.md index.md monitoring-metrics.md rbac-permissions.md rest-api.md services-oauth.md singleuser-integration.md spawners.md

docs/monitoring-metrics.md

0
# Monitoring and Metrics
1

2
JupyterHub provides comprehensive monitoring and metrics collection capabilities through Prometheus integration. The system tracks user activity, server performance, resource usage, and system health for operational visibility and capacity planning.
3

4
## Capabilities
5

6
### Prometheus Metrics
7

8
Core Prometheus metrics exposed by JupyterHub for monitoring and alerting.
9

10
```python { .api }
11
from prometheus_client import Counter, Gauge, Histogram

# Gauge metrics: point-in-time values that the periodic collector overwrites
# with .set(). They must be Gauges -- a Counter can only be incremented.
TOTAL_USERS: Gauge = Gauge(
    'jupyterhub_total_users',
    'Total number of users in JupyterHub database'
)

ACTIVE_USERS: Gauge = Gauge(
    'jupyterhub_active_users',
    'Number of users with activity in the last 24 hours'
)

RUNNING_SERVERS: Gauge = Gauge(
    'jupyterhub_running_servers',
    'Number of currently running servers'
)

PENDING_SERVERS: Gauge = Gauge(
    'jupyterhub_pending_servers',
    'Number of servers in pending state'
)

# Request metrics
REQUEST_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_request_duration_seconds',
    'Time spent handling HTTP requests',
    ['method', 'handler', 'code']
)

REQUEST_COUNT: Counter = Counter(
    'jupyterhub_request_count_total',
    'Total number of HTTP requests',
    ['method', 'handler', 'code']
)

# Authentication metrics
LOGIN_SUCCESS: Counter = Counter(
    'jupyterhub_login_success_total',
    'Total number of successful logins'
)

LOGIN_FAILURE: Counter = Counter(
    'jupyterhub_login_failure_total',
    'Total number of failed logins'
)

# Spawner metrics
SPAWN_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_spawn_duration_seconds',
    'Time spent spawning servers',
    ['spawner_class']
)

SPAWN_SUCCESS: Counter = Counter(
    'jupyterhub_spawn_success_total',
    'Total number of successful server spawns',
    ['spawner_class']
)

# NOTE: carries one more label (error_type) than SPAWN_SUCCESS, so queries
# that combine the two must aggregate the labels away first.
SPAWN_FAILURE: Counter = Counter(
    'jupyterhub_spawn_failure_total',
    'Total number of failed server spawns',
    ['spawner_class', 'error_type']
)

# Hub metrics
HUB_RESPONSE_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_hub_response_duration_seconds',
    'Time for Hub to respond to requests'
)

API_REQUEST_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_api_request_duration_seconds',
    'Time spent handling API requests',
    ['method', 'endpoint', 'status']
)

# Resource gauges written by the periodic collector's hub metrics step
HUB_MEMORY_USAGE_BYTES: Gauge = Gauge(
    'jupyterhub_hub_memory_usage_bytes',
    'Memory in use, in bytes'
)

HUB_MEMORY_TOTAL_BYTES: Gauge = Gauge(
    'jupyterhub_hub_memory_total_bytes',
    'Total memory, in bytes'
)

HUB_CPU_USAGE_PERCENT: Gauge = Gauge(
    'jupyterhub_hub_cpu_usage_percent',
    'CPU utilisation as a percentage'
)
81
```
82

83
### Metrics Collection System
84

85
Automated metrics collection and periodic updates.
86

87
```python { .api }
88
class PeriodicMetricsCollector:
    """
    Periodic metrics collector for JupyterHub system statistics.

    Collects and updates metrics at regular intervals to provide
    current system state information.
    """

    def __init__(self, app, interval: int = 60):
        """
        Initialize metrics collector.

        Args:
            app: JupyterHub application instance
            interval: Collection interval in seconds
        """
        self.app = app
        self.interval = interval
        self.running = False

    async def start(self):
        """
        Start periodic metrics collection.

        Loops until ``stop()`` is called. A failing collection is logged
        through ``app.log`` and the loop carries on, so a transient error
        (e.g. a database hiccup) does not silently end metrics collection.
        """
        self.running = True
        while self.running:
            try:
                await self.collect_metrics()
            except Exception:
                self.app.log.exception("Metrics collection failed")
            await asyncio.sleep(self.interval)

    def stop(self):
        """
        Stop metrics collection.

        Only clears the flag; the loop exits after its current sleep ends.
        """
        self.running = False

    async def collect_metrics(self):
        """
        Collect and update all metrics.

        Gathers current system state and updates Prometheus metrics.
        """
        await self.collect_user_metrics()
        await self.collect_server_metrics()
        await self.collect_hub_metrics()

    async def collect_user_metrics(self):
        """Collect user-related metrics (total users, users active in 24h)."""
        # NOTE(review): these queries look synchronous and would block the
        # event loop while they run -- confirm against the session in use.
        total_users = self.app.db.query(User).count()
        TOTAL_USERS.set(total_users)

        # Active users: any recorded activity within the last 24 hours
        cutoff = datetime.utcnow() - timedelta(hours=24)
        active_users = self.app.db.query(User).filter(
            User.last_activity > cutoff
        ).count()
        ACTIVE_USERS.set(active_users)

    async def collect_server_metrics(self):
        """Collect server-related metrics (running and pending servers)."""
        # Running servers: rows with a non-NULL url
        running_servers = self.app.db.query(Server).filter(
            Server.url.isnot(None)
        ).count()
        RUNNING_SERVERS.set(running_servers)

        # Pending servers: count in place instead of building a list
        pending_servers = sum(
            1 for spawner in self.app.spawners.values() if spawner.pending
        )
        PENDING_SERVERS.set(pending_servers)

    async def collect_hub_metrics(self):
        """Collect Hub system metrics (memory and CPU) via psutil."""
        # Imported lazily so psutil is only needed when this step runs
        import psutil

        # NOTE(review): virtual_memory()/cpu_percent() appear to report
        # machine-wide figures, not the Hub process alone -- confirm.
        memory = psutil.virtual_memory()
        HUB_MEMORY_USAGE_BYTES.set(memory.used)
        HUB_MEMORY_TOTAL_BYTES.set(memory.total)

        cpu_percent = psutil.cpu_percent()
        HUB_CPU_USAGE_PERCENT.set(cpu_percent)
172
```
173

174
### Custom Metrics Integration
175

176
Tools for adding custom metrics to JupyterHub applications.
177

178
```python { .api }
179
from prometheus_client import Counter, Gauge, Histogram, Summary
180

181
# Custom metric definitions
182
# NOTE: a 'user' label creates one time series per username; on hubs with
# many users consider dropping it or bucketing users to limit cardinality.
CUSTOM_COUNTER: Counter = Counter(
    'jupyterhub_custom_events_total',
    'Total custom events',
    ['event_type', 'user']
)

CUSTOM_GAUGE: Gauge = Gauge(
    'jupyterhub_custom_resource_usage',
    'Custom resource usage',
    ['resource_type', 'user']
)

CUSTOM_HISTOGRAM: Histogram = Histogram(
    'jupyterhub_custom_operation_duration_seconds',
    'Custom operation duration',
    ['operation', 'status']
)
199

200
def record_custom_event(event_type: str, user: str = None):
    """
    Record a custom event metric.

    Increments CUSTOM_COUNTER by one for the given label combination.

    Args:
        event_type: Type of event to record
        user: Username associated with event (optional); a missing or
            empty value is recorded under the label 'anonymous'
    """
    CUSTOM_COUNTER.labels(
        event_type=event_type,
        user=user or 'anonymous',
    ).inc()
212

213
def update_custom_gauge(resource_type: str, value: float, user: str = None):
    """
    Update a custom gauge metric.

    Overwrites the CUSTOM_GAUGE sample for the given label combination.

    Args:
        resource_type: Type of resource being measured
        value: Current resource value
        user: Username associated with resource (optional); a missing or
            empty value is recorded under the label 'system'
    """
    CUSTOM_GAUGE.labels(
        resource_type=resource_type,
        user=user or 'system',
    ).set(value)
226

227
def time_custom_operation(operation: str, status: str = 'success'):
    """
    Build a timer that records an operation's duration in CUSTOM_HISTOGRAM.

    The ``status`` label is fixed when this function is called, before the
    timed code runs, so it cannot reflect the operation's actual outcome.

    Args:
        operation: Name of the operation
        status: Operation status label (success, error, etc.)

    Returns:
        The object returned by the labelled histogram's ``time()``; the
        prometheus_client timer can be used as a context manager
        (``with time_custom_operation('sync'): ...``) or as a decorator.
    """
    labelled = CUSTOM_HISTOGRAM.labels(operation=operation, status=status)
    return labelled.time()
242
```
243

244
### Health Check System
245

246
Health monitoring and status reporting for JupyterHub components.
247

248
```python { .api }
249
class HealthChecker:
    """
    Health check system for JupyterHub components.

    Provides endpoints and utilities for monitoring system health
    and component status.
    """

    def __init__(self, app):
        """
        Initialize health checker.

        Args:
            app: JupyterHub application instance
        """
        self.app = app
        self.checks = {}

    def register_check(self, name: str, check_func: callable, interval: int = 60):
        """
        Register a health check function.

        Registering an existing name replaces the previous entry.

        Args:
            name: Check name
            check_func: Zero-argument function (sync or async) whose
                truthy/falsy result means healthy/unhealthy
            interval: Check interval in seconds. Stored with the check
                only; ``run_checks`` runs every check on every call.
        """
        self.checks[name] = {
            'func': check_func,
            'interval': interval,
            'last_run': None,
            'status': 'unknown'
        }

    async def run_checks(self) -> Dict[str, Any]:
        """
        Run all registered health checks.

        Checks run one after another. Each outcome is also written back to
        the registered entry's 'status' and 'last_run' fields.

        Returns:
            Dictionary mapping check name to a result with 'status'
            ('healthy', 'unhealthy' or 'error') and 'timestamp', plus
            'duration' on completion or 'error' when the check raised
        """
        results = {}

        for name, check in self.checks.items():
            start_time = time.time()
            try:
                outcome = check['func']()
                # Accept plain functions as well as coroutine functions
                if hasattr(outcome, '__await__'):
                    outcome = await outcome
            except Exception as e:
                result = {
                    'status': 'error',
                    'error': str(e),
                    'timestamp': datetime.utcnow().isoformat()
                }
            else:
                result = {
                    'status': 'healthy' if outcome else 'unhealthy',
                    'duration': time.time() - start_time,
                    'timestamp': datetime.utcnow().isoformat()
                }

            # Keep the registry in sync with the latest outcome
            check['status'] = result['status']
            check['last_run'] = result['timestamp']
            results[name] = result

        return results

    async def database_health_check(self) -> bool:
        """Check database connectivity; False if the count query raises."""
        try:
            user_count = self.app.db.query(User).count()
            return user_count >= 0
        except Exception:
            return False

    async def spawner_health_check(self) -> bool:
        """Check spawner system health; False if the registry is unusable."""
        try:
            # Only verifies that the spawner collection can be accessed
            len(self.app.spawners)
            return True
        except Exception:
            return False

    async def proxy_health_check(self) -> bool:
        """Check proxy connectivity; False if fetching routes raises."""
        try:
            await self.app.proxy.get_routes()
            return True
        except Exception:
            return False
338
```
339

340
## Usage Examples
341

342
### Basic Metrics Integration
343

344
```python
345
from jupyterhub.metrics import SPAWN_SUCCESS, SPAWN_FAILURE, SPAWN_DURATION_SECONDS
346
import time
347

348
class MonitoredSpawner(LocalProcessSpawner):
    """Spawner with metrics collection"""

    async def start(self):
        """
        Start server with metrics collection.

        Counts the spawn in SPAWN_SUCCESS or SPAWN_FAILURE and, on success,
        observes the elapsed time in SPAWN_DURATION_SECONDS. Exceptions
        from the underlying start are re-raised unchanged.
        """
        start_time = time.time()
        spawner_class = self.__class__.__name__

        # Only the spawn itself is guarded, so an error while recording
        # success metrics is not miscounted as a spawn failure.
        try:
            result = await super().start()
        except Exception as e:
            SPAWN_FAILURE.labels(
                spawner_class=spawner_class,
                error_type=type(e).__name__,
            ).inc()
            raise

        SPAWN_SUCCESS.labels(spawner_class=spawner_class).inc()
        duration = time.time() - start_time
        SPAWN_DURATION_SECONDS.labels(spawner_class=spawner_class).observe(duration)
        return result
375
```
376

377
### Custom Metrics for User Activity
378

379
```python
380
from prometheus_client import Counter, Histogram
381

382
# Custom user activity metrics
383
# NOTE: both metrics are labelled by username, which yields one time series
# per user; acceptable for small hubs, costly for large ones.
USER_LOGIN_COUNTER = Counter(
    'jupyterhub_user_login_total',
    'Total user logins',
    ['username', 'authenticator']
)

NOTEBOOK_LAUNCH_DURATION = Histogram(
    'jupyterhub_notebook_launch_duration_seconds',
    'Time to launch notebook server',
    ['username', 'spawner_type']
)
394

395
class MetricsAuthenticator(PAMAuthenticator):
    """Authenticator with login metrics"""

    async def authenticate(self, handler, data):
        """
        Authenticate with metrics collection.

        Delegates to the parent authenticator and increments
        USER_LOGIN_COUNTER when the result is truthy. Failed logins are
        not counted here.

        Args:
            handler: Request handler passed through to the parent
            data: Submitted login form data; 'username' is read for the
                metric label ('unknown' when absent)

        Returns:
            Whatever the parent ``authenticate`` returned
        """
        result = await super().authenticate(handler, data)
        if not result:
            return result

        USER_LOGIN_COUNTER.labels(
            username=data.get('username', 'unknown'),
            authenticator=self.__class__.__name__,
        ).inc()
        return result
413

414
class MetricsSpawner(LocalProcessSpawner):
    """Spawner with launch time metrics"""

    async def start(self):
        """
        Start server with launch time tracking.

        The elapsed time is observed in NOTEBOOK_LAUNCH_DURATION whether
        the launch succeeds or raises; exceptions propagate unchanged.
        """
        start_time = time.time()
        username = self.user.name
        spawner_type = self.__class__.__name__

        try:
            return await super().start()
        finally:
            # One recording path for both successful and failed launches
            NOTEBOOK_LAUNCH_DURATION.labels(
                username=username,
                spawner_type=spawner_type,
            ).observe(time.time() - start_time)
442
```
443

444
### Health Monitoring Setup
445

446
```python
447
from jupyterhub.app import JupyterHub
448
from .monitoring import HealthChecker
449

450
class MonitoredJupyterHub(JupyterHub):
    """JupyterHub with health monitoring"""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.health_checker = HealthChecker(self)
        # Set once the background loop is started in start()
        self._health_task = None
        self.setup_health_checks()

    def setup_health_checks(self):
        """Register the database, proxy and spawner health checks."""
        checker = self.health_checker
        for name, check_func, interval in (
            ('database', checker.database_health_check, 30),
            ('proxy', checker.proxy_health_check, 60),
            ('spawners', checker.spawner_health_check, 120),
        ):
            checker.register_check(name, check_func, interval=interval)

    async def start(self):
        """Start Hub, then launch the health monitoring loop."""
        await super().start()

        # Keep a reference: the event loop holds tasks only weakly, so an
        # unreferenced task may be garbage-collected before it finishes.
        self._health_task = asyncio.create_task(self.periodic_health_checks())

    async def periodic_health_checks(self):
        """
        Run health checks every 60 seconds, forever.

        Any non-healthy result is logged as a warning. If a round itself
        fails, the error is logged and the next round waits 300 seconds.
        """
        while True:
            try:
                health_results = await self.health_checker.run_checks()

                for check_name, result in health_results.items():
                    if result['status'] != 'healthy':
                        self.log.warning(f"Health check {check_name}: {result['status']}")

                await asyncio.sleep(60)
            except Exception as e:
                self.log.error(f"Health check error: {e}")
                await asyncio.sleep(300)  # Wait longer on error
500
```
501

502
### Grafana Dashboard Integration
503

504
```python
505
# Example metrics for Grafana dashboard
506
# Metric names grouped by the dashboard section that displays them
DASHBOARD_METRICS = {
    'user_metrics': [
        'jupyterhub_total_users',
        'jupyterhub_active_users',
        'jupyterhub_user_login_total',
    ],
    'server_metrics': [
        'jupyterhub_running_servers',
        'jupyterhub_pending_servers',
        'jupyterhub_spawn_duration_seconds',
        'jupyterhub_spawn_success_total',
        'jupyterhub_spawn_failure_total',
    ],
    'performance_metrics': [
        'jupyterhub_request_duration_seconds',
        'jupyterhub_api_request_duration_seconds',
        'jupyterhub_hub_response_duration_seconds',
    ],
    'system_metrics': [
        'jupyterhub_hub_memory_usage_bytes',
        'jupyterhub_hub_cpu_usage_percent',
    ],
}
529

530
def generate_grafana_queries():
    """
    Generate Grafana query examples.

    Returns:
        Dictionary mapping a panel name to a PromQL expression string
        (surrounding whitespace included).
    """
    queries = {
        # Counts logins over the window; a user logging in twice counts twice
        'active_users_24h': '''
            increase(jupyterhub_user_login_total[24h])
        ''',
        'average_spawn_time': '''
            rate(jupyterhub_spawn_duration_seconds_sum[5m]) / 
            rate(jupyterhub_spawn_duration_seconds_count[5m])
        ''',
        # sum() drops the labels first: the failure counter has an extra
        # error_type label, so the raw vectors would never match up.
        'server_success_rate': '''
            sum(rate(jupyterhub_spawn_success_total[5m])) / 
            (sum(rate(jupyterhub_spawn_success_total[5m])) + 
             sum(rate(jupyterhub_spawn_failure_total[5m]))) * 100
        ''',
        'api_request_rate': '''
            rate(jupyterhub_api_request_duration_seconds_count[5m])
        '''
    }
    return queries
550
```
551

552
### Alerting Configuration
553

554
```python
555
# Prometheus alerting rules for JupyterHub
556
# Each rule: PromQL 'expr', hold time 'for', 'severity' and a 'summary'.
ALERTING_RULES = {
    'high_spawn_failure_rate': {
        # Failures as a share of ALL spawns (not failures/successes, which
        # is undefined when nothing succeeds). sum() drops the labels,
        # because the failure counter carries an extra error_type label.
        'expr': '''
            sum(rate(jupyterhub_spawn_failure_total[5m])) / 
            (sum(rate(jupyterhub_spawn_success_total[5m])) + 
             sum(rate(jupyterhub_spawn_failure_total[5m]))) > 0.1
        ''',
        'for': '5m',
        'severity': 'warning',
        'summary': 'High spawn failure rate detected'
    },
    'hub_memory_high': {
        'expr': '''
            jupyterhub_hub_memory_usage_bytes / 
            jupyterhub_hub_memory_total_bytes > 0.9
        ''',
        'for': '2m',
        'severity': 'critical',
        'summary': 'Hub memory usage critical'
    },
    'no_running_servers': {
        'expr': 'jupyterhub_running_servers == 0',
        'for': '10m',
        'severity': 'warning',
        'summary': 'No servers currently running'
    }
}
582

583
class AlertManager:
    """Alert management for JupyterHub metrics"""

    def __init__(self, webhook_url=None):
        """
        Args:
            webhook_url: URL that alert notifications are POSTed to; when
                unset, triggered alerts are dropped without notification
        """
        self.webhook_url = webhook_url
        self.alerts = {}

    async def check_alerts(self, metrics):
        """Trigger every rule in ALERTING_RULES whose condition holds."""
        for alert_name, config in ALERTING_RULES.items():
            if self.evaluate_condition(config['expr'], metrics):
                await self.trigger_alert(alert_name, config)

    def evaluate_condition(self, expr, metrics):
        """
        Evaluate alert expression against metrics.

        Placeholder: always returns False, so no alert fires until this is
        overridden with a real evaluation of ``expr``.
        """
        # Implementation depends on metrics evaluation system
        return False

    async def trigger_alert(self, name, config):
        """
        Send an alert notification to the webhook, if one is configured.

        Args:
            name: Alert name
            config: Rule dictionary; 'severity' and 'summary' are read
        """
        if not self.webhook_url:
            return

        alert_data = {
            'alert': name,
            'severity': config['severity'],
            'summary': config['summary'],
            'timestamp': datetime.utcnow().isoformat()
        }

        # The HTTP response status is not inspected
        async with aiohttp.ClientSession() as session:
            await session.post(self.webhook_url, json=alert_data)
615
```
616

617
## Advanced Monitoring Patterns
618

619
### Multi-dimensional Metrics
620

621
```python
622
# Resource usage metrics per user and server
623
# One gauge, split by resource kind, user and named server
RESOURCE_USAGE = Gauge(
    'jupyterhub_resource_usage',
    'Resource usage by dimension',
    ['resource_type', 'username', 'server_name']
)
628

629
def update_resource_metrics(user, server_name=''):
    """
    Update resource usage metrics for user.

    Writes the 'cpu' and 'memory' samples of RESOURCE_USAGE for the given
    user and server.

    Args:
        user: Object whose ``name`` attribute is used as the username label
        server_name: Server name label (default: empty string)
    """
    # Get resource usage (implementation specific)
    usage = (
        ('cpu', get_cpu_usage(user.name, server_name)),
        ('memory', get_memory_usage(user.name, server_name)),
    )

    for resource_type, value in usage:
        RESOURCE_USAGE.labels(
            resource_type=resource_type,
            username=user.name,
            server_name=server_name,
        ).set(value)
647
```
648

649
### Event-Based Metrics
650

651
```python
652
from jupyterhub.metrics import record_custom_event
653

654
class EventMetricsHub(JupyterHub):
    """Hub with event-based metrics collection"""

    async def login_user(self, user):
        """Login user, then record a 'user_login' event for them."""
        result = await super().login_user(user)
        record_custom_event('user_login', user.name)
        return result

    async def spawn_server(self, user, server_name=''):
        """
        Spawn server with event metrics.

        Records 'server_spawn_start' first, then 'server_spawn_success' or
        'server_spawn_failure'; spawn exceptions are re-raised unchanged.
        """
        record_custom_event('server_spawn_start', user.name)

        try:
            result = await super().spawn_server(user, server_name)
        except Exception:
            record_custom_event('server_spawn_failure', user.name)
            raise

        record_custom_event('server_spawn_success', user.name)
        return result
677
```

Version

Tile

Files

docs/monitoring-metrics.md

monitoring-metrics.mddocs/