or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

authentication.md, configuration-utilities.md, core-application.md, database-models.md, index.md, monitoring-metrics.md, rbac-permissions.md, rest-api.md, services-oauth.md, singleuser-integration.md, spawners.md

docs/monitoring-metrics.md

0

# Monitoring and Metrics

1

2

JupyterHub provides comprehensive monitoring and metrics collection capabilities through Prometheus integration. The system tracks user activity, server performance, resource usage, and system health for operational visibility and capacity planning.

3

4

## Capabilities

5

6

### Prometheus Metrics

7

8

Core Prometheus metrics exposed by JupyterHub for monitoring and alerting.

9

10

```python { .api }

11

# User and server state metrics.
# These are gauges because the collector overwrites them with .set() on every
# pass; a Counter can only be incremented.
TOTAL_USERS: Gauge = Gauge(
    'jupyterhub_total_users',
    'Total number of users in JupyterHub database',
)

ACTIVE_USERS: Gauge = Gauge(
    'jupyterhub_active_users',
    'Number of users with activity in the last 24 hours',
)

RUNNING_SERVERS: Gauge = Gauge(
    'jupyterhub_running_servers',
    'Number of currently running servers',
)

PENDING_SERVERS: Gauge = Gauge(
    'jupyterhub_pending_servers',
    'Number of servers in pending state',
)

# Request metrics
REQUEST_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_request_duration_seconds',
    'Time spent handling HTTP requests',
    ['method', 'handler', 'code'],
)

REQUEST_COUNT: Counter = Counter(
    'jupyterhub_request_count_total',
    'Total number of HTTP requests',
    ['method', 'handler', 'code'],
)

# Authentication metrics
LOGIN_SUCCESS: Counter = Counter(
    'jupyterhub_login_success_total',
    'Total number of successful logins',
)

LOGIN_FAILURE: Counter = Counter(
    'jupyterhub_login_failure_total',
    'Total number of failed logins',
)

# Spawner metrics
SPAWN_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_spawn_duration_seconds',
    'Time spent spawning servers',
    ['spawner_class'],
)

SPAWN_SUCCESS: Counter = Counter(
    'jupyterhub_spawn_success_total',
    'Total number of successful server spawns',
    ['spawner_class'],
)

# NOTE: carries an extra 'error_type' label that SPAWN_SUCCESS lacks, so
# queries combining the two must aggregate (e.g. sum(...)) first.
SPAWN_FAILURE: Counter = Counter(
    'jupyterhub_spawn_failure_total',
    'Total number of failed server spawns',
    ['spawner_class', 'error_type'],
)

# Hub metrics
HUB_RESPONSE_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_hub_response_duration_seconds',
    'Time for Hub to respond to requests',
)

API_REQUEST_DURATION_SECONDS: Histogram = Histogram(
    'jupyterhub_api_request_duration_seconds',
    'Time spent handling API requests',
    ['method', 'endpoint', 'status'],
)

# Hub resource gauges updated by the periodic collector
HUB_MEMORY_USAGE_BYTES: Gauge = Gauge(
    'jupyterhub_hub_memory_usage_bytes',
    'Memory in use, in bytes',
)

HUB_MEMORY_TOTAL_BYTES: Gauge = Gauge(
    'jupyterhub_hub_memory_total_bytes',
    'Total memory available, in bytes',
)

HUB_CPU_USAGE_PERCENT: Gauge = Gauge(
    'jupyterhub_hub_cpu_usage_percent',
    'CPU utilisation as a percentage',
)

81

```

82

83

### Metrics Collection System

84

85

Automated metrics collection and periodic updates.

86

87

```python { .api }

88

class PeriodicMetricsCollector:
    """
    Periodic metrics collector for JupyterHub system statistics.

    Collects and updates metrics at regular intervals to provide
    current system state information.
    """

    def __init__(self, app, interval: int = 60):
        """
        Initialize metrics collector.

        Args:
            app: JupyterHub application instance
            interval: Collection interval in seconds
        """
        self.app = app
        self.interval = interval
        self.running = False

    async def start(self):
        """
        Start periodic metrics collection.

        Loops until stop() is called. A collection pass that raises is
        logged and retried after the next interval, so one transient
        failure (e.g. a database hiccup) does not end monitoring.
        """
        import logging

        log = getattr(self.app, 'log', None) or logging.getLogger(__name__)

        self.running = True
        while self.running:
            try:
                await self.collect_metrics()
            except Exception:
                log.exception(
                    "Metrics collection failed; retrying in %s seconds",
                    self.interval,
                )
            await asyncio.sleep(self.interval)

    def stop(self):
        """Stop metrics collection (takes effect after the current sleep)"""
        self.running = False

    async def collect_metrics(self):
        """
        Collect and update all metrics.

        Runs the user, server and hub collectors in that order.
        """
        await self.collect_user_metrics()
        await self.collect_server_metrics()
        await self.collect_hub_metrics()

    async def collect_user_metrics(self):
        """Collect user-related metrics"""
        # Total users
        total_users = self.app.db.query(User).count()
        TOTAL_USERS.set(total_users)

        # Active users: last_activity within the past 24 hours
        cutoff = datetime.utcnow() - timedelta(hours=24)
        active_users = self.app.db.query(User).filter(
            User.last_activity > cutoff
        ).count()
        ACTIVE_USERS.set(active_users)

    async def collect_server_metrics(self):
        """Collect server-related metrics"""
        # Running servers: Server rows that have a URL
        running_servers = self.app.db.query(Server).filter(
            Server.url.isnot(None)
        ).count()
        RUNNING_SERVERS.set(running_servers)

        # Pending servers: count without building an intermediate list
        pending_servers = sum(
            1 for spawner in self.app.spawners.values() if spawner.pending
        )
        PENDING_SERVERS.set(pending_servers)

    async def collect_hub_metrics(self):
        """Collect Hub system metrics"""
        # Imported lazily so psutil is only needed when this collector runs
        import psutil

        # Memory usage.
        # NOTE(review): psutil.virtual_memory() describes the whole host,
        # not just the Hub process - confirm that is the intended scope.
        memory = psutil.virtual_memory()
        HUB_MEMORY_USAGE_BYTES.set(memory.used)
        HUB_MEMORY_TOTAL_BYTES.set(memory.total)

        # CPU usage. Called without an interval, so the value is measured
        # relative to the previous call (see psutil documentation).
        cpu_percent = psutil.cpu_percent()
        HUB_CPU_USAGE_PERCENT.set(cpu_percent)

172

```

173

174

### Custom Metrics Integration

175

176

Tools for adding custom metrics to JupyterHub applications.

177

178

```python { .api }

179

from prometheus_client import Counter, Gauge, Histogram, Summary

180

181

# Custom metric definitions: one counter, one gauge and one histogram,
# each partitioned by the label names listed below.
CUSTOM_COUNTER: Counter = Counter(
    name='jupyterhub_custom_events_total',
    documentation='Total custom events',
    labelnames=['event_type', 'user'],
)

CUSTOM_GAUGE: Gauge = Gauge(
    name='jupyterhub_custom_resource_usage',
    documentation='Custom resource usage',
    labelnames=['resource_type', 'user'],
)

CUSTOM_HISTOGRAM: Histogram = Histogram(
    name='jupyterhub_custom_operation_duration_seconds',
    documentation='Custom operation duration',
    labelnames=['operation', 'status'],
)

199

200

def record_custom_event(event_type: str, user: str = None):
    """
    Increment the custom event counter by one.

    Args:
        event_type: Value for the 'event_type' label
        user: Value for the 'user' label; a missing or empty value is
            recorded as 'anonymous'
    """
    user_label = user if user else 'anonymous'
    series = CUSTOM_COUNTER.labels(event_type=event_type, user=user_label)
    series.inc()

212

213

def update_custom_gauge(resource_type: str, value: float, user: str = None):
    """
    Set the custom resource gauge to a new value.

    Args:
        resource_type: Value for the 'resource_type' label
        value: Value to store in the gauge
        user: Value for the 'user' label; a missing or empty value is
            recorded as 'system'
    """
    user_label = user if user else 'system'
    series = CUSTOM_GAUGE.labels(resource_type=resource_type, user=user_label)
    series.set(value)

226

227

def time_custom_operation(operation: str, status: str = 'success'):
    """
    Create a timer that records into the custom operation histogram.

    The 'status' label is fixed when this function is called, i.e. before
    the timed operation has run.

    Args:
        operation: Value for the 'operation' label
        status: Value for the 'status' label (success, error, etc.)

    Returns:
        The timer object produced by the histogram's time() method; per
        the prometheus_client documentation it can be used as a context
        manager or as a decorator.
    """
    series = CUSTOM_HISTOGRAM.labels(operation=operation, status=status)
    return series.time()

242

```

243

244

### Health Check System

245

246

Health monitoring and status reporting for JupyterHub components.

247

248

```python { .api }

249

class HealthChecker:
    """
    Health check system for JupyterHub components.

    Keeps a registry of named check functions and runs them on demand,
    reporting a status, duration and timestamp for each one.
    """

    def __init__(self, app):
        """
        Initialize health checker.

        Args:
            app: JupyterHub application instance
        """
        self.app = app
        self.checks = {}

    def register_check(self, name: str, check_func: "Callable[[], Any]", interval: int = 60):
        """
        Register a health check function.

        Registering an existing name replaces the previous entry.

        Args:
            name: Check name
            check_func: Zero-argument function (sync or async) whose
                truthy result means healthy
            interval: Check interval in seconds (stored in the registry;
                run_checks() itself does not schedule by it)
        """
        self.checks[name] = {
            'func': check_func,
            'interval': interval,
            'last_run': None,
            'status': 'unknown',
        }

    async def run_checks(self) -> Dict[str, Any]:
        """
        Run all registered health checks.

        Each check's latest status and timestamp are also written back to
        its registry entry ('status' / 'last_run').

        Returns:
            Dictionary mapping check name to a result dict with 'status'
            ('healthy', 'unhealthy' or 'error') and 'timestamp', plus
            'duration' on completion or 'error' when the check raised.
        """
        import inspect

        results = {}

        for name, check in self.checks.items():
            # perf_counter is monotonic, so durations are immune to
            # wall-clock adjustments.
            started = time.perf_counter()
            try:
                status = check['func']()
                # Accept plain functions as well as coroutine functions
                if inspect.isawaitable(status):
                    status = await status
                outcome = {
                    'status': 'healthy' if status else 'unhealthy',
                    'duration': time.perf_counter() - started,
                    'timestamp': datetime.utcnow().isoformat(),
                }
            except Exception as e:
                outcome = {
                    'status': 'error',
                    'error': str(e),
                    'timestamp': datetime.utcnow().isoformat(),
                }

            check['status'] = outcome['status']
            check['last_run'] = outcome['timestamp']
            results[name] = outcome

        return results

    async def database_health_check(self) -> bool:
        """Check database connectivity by counting User rows"""
        try:
            user_count = self.app.db.query(User).count()
            return user_count >= 0
        except Exception:
            return False

    async def spawner_health_check(self) -> bool:
        """Check that the application's spawner collection is accessible"""
        try:
            # Only accessibility is probed; the count itself is not used
            len(self.app.spawners)
            return True  # Spawner system is operational
        except Exception:
            return False

    async def proxy_health_check(self) -> bool:
        """Check proxy connectivity by fetching its routes"""
        try:
            await self.app.proxy.get_routes()
            return True
        except Exception:
            return False

338

```

339

340

## Usage Examples

341

342

### Basic Metrics Integration

343

344

```python

345

from jupyterhub.metrics import SPAWN_SUCCESS, SPAWN_FAILURE, SPAWN_DURATION_SECONDS

346

import time

347

348

class MonitoredSpawner(LocalProcessSpawner):
    """Spawner with metrics collection"""

    async def start(self):
        """
        Start server with metrics collection.

        On success, increments SPAWN_SUCCESS and observes the elapsed time
        in SPAWN_DURATION_SECONDS. On failure, increments SPAWN_FAILURE
        labelled with the exception class name and re-raises.
        """
        # Monotonic clock: the measured duration cannot be skewed by
        # wall-clock adjustments while the spawn is in progress.
        started = time.perf_counter()
        spawner_class = self.__class__.__name__

        try:
            # Start the server
            result = await super().start()

            # Record success metrics
            SPAWN_SUCCESS.labels(spawner_class=spawner_class).inc()
            duration = time.perf_counter() - started
            SPAWN_DURATION_SECONDS.labels(spawner_class=spawner_class).observe(duration)

            return result

        except Exception as e:
            # Record failure metrics, then let the caller see the error
            SPAWN_FAILURE.labels(
                spawner_class=spawner_class,
                error_type=type(e).__name__,
            ).inc()
            raise

375

```

376

377

### Custom Metrics for User Activity

378

379

```python

380

from prometheus_client import Counter, Histogram

381

382

# Custom user activity metrics.
# Both are labelled per username, so each distinct user adds a time series.
USER_LOGIN_COUNTER = Counter(
    name='jupyterhub_user_login_total',
    documentation='Total user logins',
    labelnames=['username', 'authenticator'],
)

NOTEBOOK_LAUNCH_DURATION = Histogram(
    name='jupyterhub_notebook_launch_duration_seconds',
    documentation='Time to launch notebook server',
    labelnames=['username', 'spawner_type'],
)

394

395

class MetricsAuthenticator(PAMAuthenticator):
    """PAM authenticator that counts successful logins"""

    async def authenticate(self, handler, data):
        """
        Delegate to the parent authenticator and count successes.

        The counter is labelled with the username taken from the submitted
        data ('unknown' when absent) and this class's name. The parent's
        result is returned unchanged.
        """
        login_name = data.get('username', 'unknown')
        auth_label = self.__class__.__name__

        outcome = await super().authenticate(handler, data)
        if not outcome:
            # Falsy result: nothing to count
            return outcome

        series = USER_LOGIN_COUNTER.labels(
            username=login_name,
            authenticator=auth_label,
        )
        series.inc()
        return outcome

413

414

class MetricsSpawner(LocalProcessSpawner):
    """Spawner with launch time metrics"""

    async def start(self):
        """
        Start server with launch time tracking.

        The elapsed time is observed in NOTEBOOK_LAUNCH_DURATION whether
        the launch succeeds or fails (including when it is cancelled);
        any exception from the parent start() propagates unchanged.
        """
        # Local import: this example's import lines do not bring in time
        import time

        username = self.user.name
        spawner_type = self.__class__.__name__
        started = time.perf_counter()

        try:
            return await super().start()
        finally:
            # Single recording path for both successful and failed launches
            NOTEBOOK_LAUNCH_DURATION.labels(
                username=username,
                spawner_type=spawner_type,
            ).observe(time.perf_counter() - started)

442

```

443

444

### Health Monitoring Setup

445

446

```python

447

from jupyterhub.app import JupyterHub

448

from .monitoring import HealthChecker

449

450

class MonitoredJupyterHub(JupyterHub):
    """JupyterHub with health monitoring"""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.health_checker = HealthChecker(self)
        # Handle to the background monitoring task; populated by start()
        self._health_check_task = None
        self.setup_health_checks()

    def setup_health_checks(self):
        """Register the database, proxy and spawner health checks"""
        checker = self.health_checker
        for name, check_func, interval in (
            ('database', checker.database_health_check, 30),
            ('proxy', checker.proxy_health_check, 60),
            ('spawners', checker.spawner_health_check, 120),
        ):
            checker.register_check(name, check_func, interval=interval)

    async def start(self):
        """Start Hub with health monitoring"""
        # Local import: this example's import lines do not bring in asyncio
        import asyncio

        await super().start()

        # Keep a reference to the task: the event loop only holds weak
        # references, so an unreferenced task may be garbage-collected
        # before it finishes.
        self._health_check_task = asyncio.create_task(
            self.periodic_health_checks()
        )

    async def periodic_health_checks(self):
        """
        Run all health checks every 60 seconds, indefinitely.

        Any check whose status is not 'healthy' is logged as a warning.
        If a pass itself raises, the error is logged and the next pass
        is delayed by 300 seconds.
        """
        import asyncio

        while True:
            try:
                health_results = await self.health_checker.run_checks()

                # Log health status
                for check_name, result in health_results.items():
                    if result['status'] != 'healthy':
                        self.log.warning(f"Health check {check_name}: {result['status']}")

                await asyncio.sleep(60)
            except Exception as e:
                self.log.error(f"Health check error: {e}")
                await asyncio.sleep(300)  # Wait longer on error

500

```

501

502

### Grafana Dashboard Integration

503

504

```python

505

# Example metrics for Grafana dashboard, grouped by panel category
DASHBOARD_METRICS = {
    'user_metrics': [
        'jupyterhub_total_users',
        'jupyterhub_active_users',
        'jupyterhub_user_login_total',
    ],
    'server_metrics': [
        'jupyterhub_running_servers',
        'jupyterhub_pending_servers',
        'jupyterhub_spawn_duration_seconds',
        'jupyterhub_spawn_success_total',
        'jupyterhub_spawn_failure_total',
    ],
    'performance_metrics': [
        'jupyterhub_request_duration_seconds',
        'jupyterhub_api_request_duration_seconds',
        'jupyterhub_hub_response_duration_seconds',
    ],
    'system_metrics': [
        'jupyterhub_hub_memory_usage_bytes',
        # Denominator for memory-utilisation panels and alerts
        'jupyterhub_hub_memory_total_bytes',
        'jupyterhub_hub_cpu_usage_percent',
    ],
}

529

530

def generate_grafana_queries():
    """
    Generate Grafana query examples.

    Returns:
        Dictionary mapping a panel name to a PromQL expression string.
    """
    return {
        # Number of distinct usernames with at least one login in 24h
        # (a bare increase() would return login counts per series instead).
        'active_users_24h': (
            'count(sum by (username) '
            '(increase(jupyterhub_user_login_total[24h])) > 0)'
        ),
        'average_spawn_time': (
            'rate(jupyterhub_spawn_duration_seconds_sum[5m]) / '
            'rate(jupyterhub_spawn_duration_seconds_count[5m])'
        ),
        # The failure counter has an extra 'error_type' label, so both sides
        # are aggregated with sum(); without it the label sets differ and
        # the binary operators match no series.
        'server_success_rate': (
            'sum(rate(jupyterhub_spawn_success_total[5m])) / '
            '(sum(rate(jupyterhub_spawn_success_total[5m])) + '
            'sum(rate(jupyterhub_spawn_failure_total[5m]))) * 100'
        ),
        'api_request_rate': (
            'rate(jupyterhub_api_request_duration_seconds_count[5m])'
        ),
    }

550

```

551

552

### Alerting Configuration

553

554

```python

555

# Prometheus alerting rules for JupyterHub.
# Each rule has: 'expr' (PromQL condition), 'for' (how long the condition
# must hold), 'severity' and a human-readable 'summary'.
ALERTING_RULES = {
    'high_spawn_failure_rate': {
        # Failures carry an 'error_type' label that successes do not, so
        # both sides are summed first; otherwise the division matches no
        # series and the alert can never fire.
        'expr': (
            'sum(rate(jupyterhub_spawn_failure_total[5m])) / '
            'sum(rate(jupyterhub_spawn_success_total[5m])) > 0.1'
        ),
        'for': '5m',
        'severity': 'warning',
        'summary': 'High spawn failure rate detected',
    },
    'hub_memory_high': {
        'expr': (
            'jupyterhub_hub_memory_usage_bytes / '
            'jupyterhub_hub_memory_total_bytes > 0.9'
        ),
        'for': '2m',
        'severity': 'critical',
        'summary': 'Hub memory usage critical',
    },
    'no_running_servers': {
        'expr': 'jupyterhub_running_servers == 0',
        'for': '10m',
        'severity': 'warning',
        'summary': 'No servers currently running',
    },
}

582

583

class AlertManager:
    """Alert management for JupyterHub metrics"""

    def __init__(self, webhook_url=None):
        """
        Args:
            webhook_url: URL that alert notifications are POSTed to;
                when unset, trigger_alert() sends nothing
        """
        self.webhook_url = webhook_url
        self.alerts = {}

    async def check_alerts(self, metrics):
        """Trigger every rule in ALERTING_RULES whose condition evaluates true"""
        for alert_name, config in ALERTING_RULES.items():
            if self.evaluate_condition(config['expr'], metrics):
                await self.trigger_alert(alert_name, config)

    def evaluate_condition(self, expr, metrics):
        """
        Evaluate alert expression against metrics.

        Placeholder: always returns False. The real implementation depends
        on the metrics evaluation system in use.
        """
        return False

    async def trigger_alert(self, name, config):
        """
        Send an alert notification to the configured webhook.

        Does nothing when no webhook URL is configured. A response with an
        HTTP error status is logged as a warning rather than raised.

        Args:
            name: Alert name
            config: Rule dict providing 'severity' and 'summary'
        """
        if not self.webhook_url:
            return

        # Local imports: this example has no import lines of its own, and
        # aiohttp is only required when a webhook is actually configured.
        import logging
        from datetime import datetime

        import aiohttp

        alert_data = {
            'alert': name,
            'severity': config['severity'],
            'summary': config['summary'],
            'timestamp': datetime.utcnow().isoformat(),
        }

        # Send webhook notification; the inner context manager releases
        # the response/connection as soon as the status has been checked.
        async with aiohttp.ClientSession() as session:
            async with session.post(self.webhook_url, json=alert_data) as response:
                if response.status >= 400:
                    logging.getLogger(__name__).warning(
                        "Alert webhook for %s returned HTTP %s",
                        name,
                        response.status,
                    )

615

```

616

617

## Advanced Monitoring Patterns

618

619

### Multi-dimensional Metrics

620

621

```python

622

# One gauge for all resource kinds; the labels select the resource type,
# the user and the named server.
RESOURCE_USAGE = Gauge(
    name='jupyterhub_resource_usage',
    documentation='Resource usage by dimension',
    labelnames=['resource_type', 'username', 'server_name'],
)

628

629

def update_resource_metrics(user, server_name=''):
    """
    Refresh the CPU and memory gauges for one user's server.

    Args:
        user: Object whose 'name' attribute is used as the username label
        server_name: Server name label (empty string by default)
    """
    # Read both values first (implementation-specific helpers), then publish
    readings = (
        ('cpu', get_cpu_usage(user.name, server_name)),
        ('memory', get_memory_usage(user.name, server_name)),
    )

    for resource_type, value in readings:
        series = RESOURCE_USAGE.labels(
            resource_type=resource_type,
            username=user.name,
            server_name=server_name,
        )
        series.set(value)

647

```

648

649

### Event-Based Metrics

650

651

```python

652

from jupyterhub.metrics import record_custom_event

653

654

class EventMetricsHub(JupyterHub):
    """JupyterHub subclass that records custom events around key actions"""

    async def login_user(self, user):
        """Log the user in via the parent class, then record 'user_login'"""
        outcome = await super().login_user(user)
        record_custom_event('user_login', user.name)
        return outcome

    async def spawn_server(self, user, server_name=''):
        """
        Spawn a server via the parent class, recording start, success and
        failure events. Exceptions are re-raised after being recorded.
        """
        username = user.name
        record_custom_event('server_spawn_start', username)

        try:
            outcome = await super().spawn_server(user, server_name)
            record_custom_event('server_spawn_success', username)
            return outcome
        except Exception:
            record_custom_event('server_spawn_failure', username)
            raise

677

```