# Router & Load Balancing

Advanced routing system for intelligent load balancing, automatic fallbacks, and retry logic across multiple model deployments. The Router class provides enterprise-grade reliability features including health monitoring, cost optimization, and performance tracking.

## Capabilities

### Router Class

Main router class that manages multiple model deployments with intelligent routing strategies and automatic failover capabilities.

```python { .api }
class Router:
    def __init__(
        self,
        model_list: Optional[List[DeploymentTypedDict]] = None,
        # Caching configuration
        redis_url: Optional[str] = None,
        redis_host: Optional[str] = None,
        redis_port: Optional[int] = None,
        redis_password: Optional[str] = None,
        cache_responses: Optional[bool] = False,
        cache_kwargs: dict = {},
        caching_groups: Optional[List[tuple]] = None,
        client_ttl: int = 3600,
        # Reliability settings
        num_retries: Optional[int] = None,
        max_fallbacks: Optional[int] = None,
        timeout: Optional[float] = None,
        stream_timeout: Optional[float] = None,
        default_litellm_params: Optional[dict] = None,
        default_max_parallel_requests: Optional[int] = None,
        set_verbose: bool = False,
        debug_level: Literal["DEBUG", "INFO"] = "INFO",
        # Fallback configuration
        default_fallbacks: Optional[List[str]] = None,
        fallbacks: List = [],
        context_window_fallbacks: List = [],
        content_policy_fallbacks: List = [],
        # Routing strategy
        routing_strategy: Literal[
            "simple-shuffle",
            "least-busy",
            "usage-based-routing",
            "latency-based-routing",
            "cost-based-routing"
        ] = "simple-shuffle",
        # Authentication and validation
        enable_pre_call_checks: bool = False,
        allowed_fails: int = 3,
        cooldown_time: float = 1,
        retry_policy: Optional[Dict[str, Any]] = None,
        **kwargs
    )
    """
    Initialize Router with multiple model deployments and routing configuration.

    Args:
        model_list (Optional[List[DeploymentTypedDict]]): List of model deployment configurations
        routing_strategy (str): Strategy for selecting deployments ("simple-shuffle", "least-busy", etc.)
        num_retries (Optional[int]): Number of retries per deployment
        max_fallbacks (Optional[int]): Maximum fallback deployments to try
        timeout (Optional[float]): Request timeout in seconds
        cache_responses (Optional[bool]): Enable response caching
        fallbacks (List): Global fallback model list
        enable_pre_call_checks (bool): Validate deployments before requests
    """
```

### Router Completion Methods

Router provides the same completion interfaces as global functions but with intelligent routing and fallback capabilities.

```python { .api }
def completion(
    self,
    model: str,
    messages: List[Dict[str, Any]],
    # All standard completion parameters
    **kwargs
) -> Union[ModelResponse, Iterator[ModelResponseStream]]
    """
    Route completion request through configured deployments with fallbacks.

    Args:
        Same as litellm.completion() but routes through multiple deployments

    Returns:
        Union[ModelResponse, Iterator[ModelResponseStream]]: Routed completion response
    """

async def acompletion(
    self,
    model: str,
    messages: List[Dict[str, Any]],
    **kwargs
) -> Union[ModelResponse, AsyncIterator[ModelResponseStream]]
    """
    Async version of router completion with intelligent routing.
    """

def text_completion(
    self,
    model: str,
    prompt: str,
    **kwargs
) -> Union[TextCompletionResponse, Iterator[TextCompletionResponse]]
    """
    Route text completion request through configured deployments.
    """

async def atext_completion(
    self,
    model: str,
    prompt: str,
    **kwargs
) -> Union[TextCompletionResponse, AsyncIterator[TextCompletionResponse]]
    """
    Async text completion with routing.
    """

def embedding(
    self,
    model: str,
    input: Union[str, List[str], List[int], List[List[int]]],
    **kwargs
) -> EmbeddingResponse
    """
    Route embedding request through configured deployments.
    """

async def aembedding(
    self,
    model: str,
    input: Union[str, List[str], List[int], List[List[int]]],
    **kwargs
) -> EmbeddingResponse
    """
    Async embedding with routing.
    """

def image_generation(
    self,
    prompt: str,
    **kwargs
) -> ImageResponse
    """
    Route image generation through configured deployments.
    """

def transcription(
    self,
    model: str,
    file: Union[str, bytes, IO],
    **kwargs
) -> TranscriptionResponse
    """
    Route transcription through configured deployments.
    """

def speech(
    self,
    model: str,
    input: str,
    voice: str,
    **kwargs
) -> bytes
    """
    Route speech synthesis through configured deployments.
    """

def moderation(
    self,
    input: Union[str, List[str]],
    **kwargs
) -> ModerationCreateResponse
    """
    Route moderation through configured deployments.
    """
```

### Deployment Management

Methods for managing model deployments dynamically during runtime.

```python { .api }
def add_deployment(self, deployment: DeploymentTypedDict) -> None:
    """
    Add a new model deployment to the router.

    Args:
        deployment (DeploymentTypedDict): Deployment configuration
    """

def delete_deployment(self, deployment_id: str) -> None:
    """
    Remove a deployment from the router.

    Args:
        deployment_id (str): ID of deployment to remove
    """

def get_deployments(self) -> List[DeploymentTypedDict]:
    """
    Get all configured deployments.

    Returns:
        List[DeploymentTypedDict]: List of all deployments
    """

def set_model_list(self, model_list: List[DeploymentTypedDict]) -> None:
    """
    Replace entire model list with new deployments.

    Args:
        model_list (List[DeploymentTypedDict]): New list of deployments
    """

def update_deployment(
    self,
    deployment_id: str,
    **kwargs
) -> None:
    """
    Update configuration of existing deployment.

    Args:
        deployment_id (str): ID of deployment to update
        **kwargs: Updated configuration parameters
    """
```

### Health Monitoring

Health check and monitoring capabilities for deployment status and performance.

```python { .api }
def health_check(
    self,
    model: Optional[str] = None
) -> Dict[str, Any]:
    """
    Check health status of deployments.

    Args:
        model (Optional[str]): Specific model to check, or all if None

    Returns:
        Dict[str, Any]: Health status report with deployment statuses
    """

async def ahealth_check(
    self,
    model: Optional[str] = None
) -> Dict[str, Any]:
    """
    Async health check of deployments.

    Args:
        model (Optional[str]): Specific model to check

    Returns:
        Dict[str, Any]: Health status report
    """
```

### Analytics & Metrics

Cost tracking, usage analytics, and performance metrics for router deployments.

```python { .api }
def get_model_cost_map(self) -> Dict[str, Any]:
    """
    Get cost information for all configured models.

    Returns:
        Dict[str, Any]: Model cost mapping with pricing details
    """

def print_deployment_metrics(self) -> None:
    """
    Print detailed metrics for all deployments including:
    - Request counts and success rates
    - Average latency and throughput
    - Cost tracking and token usage
    - Error rates and failure types
    """

def reset_cost(self) -> None:
    """
    Reset accumulated cost tracking data.
    """

def get_usage_stats(self) -> Dict[str, Any]:
    """
    Get comprehensive usage statistics.

    Returns:
        Dict[str, Any]: Usage statistics including tokens, costs, latencies
    """
```

## Configuration Types

```python { .api }
class DeploymentTypedDict(TypedDict):
    """Model deployment configuration"""
    model_name: str
    litellm_params: Dict[str, Any]
    model_info: Optional[Dict[str, Any]]

class LiteLLMParams(TypedDict):
    """Parameters for LiteLLM model configuration"""
    model: str
    api_key: Optional[str]
    api_base: Optional[str]
    api_version: Optional[str]
    timeout: Optional[float]
    max_retries: Optional[int]
    custom_llm_provider: Optional[str]

class ModelInfo(TypedDict):
    """Model metadata and capabilities"""
    id: Optional[str]
    mode: Optional[Literal["chat", "completion", "embedding"]]
    input_cost_per_token: Optional[float]
    output_cost_per_token: Optional[float]
    max_tokens: Optional[int]
    supports_function_calling: Optional[bool]
    supports_vision: Optional[bool]
```

## Usage Examples

### Basic Router Setup

```python
from litellm import Router

# Configure multiple OpenAI deployments
model_list = [
    {
        "model_name": "gpt-4",
        "litellm_params": {
            "model": "gpt-4",
            "api_key": "sk-key1",
            "api_base": "https://api.openai.com/v1"
        }
    },
    {
        "model_name": "gpt-4",
        "litellm_params": {
            "model": "azure/gpt-4",
            "api_key": "azure-key",
            "api_base": "https://my-azure.openai.azure.com/",
            "api_version": "2024-02-01"
        }
    }
]

router = Router(model_list=model_list)

# Use the router like a normal completion call
response = router.completion(
    model="gpt-4",
    messages=[{"role": "user", "content": "Hello!"}]
)
```

### Advanced Router Configuration

```python
from litellm import Router

model_list = [
    {
        "model_name": "gpt-4-primary",
        "litellm_params": {
            "model": "gpt-4",
            "api_key": "primary-key"
        },
        "model_info": {
            "id": "primary-deployment"
        }
    },
    {
        "model_name": "gpt-4-fallback",
        "litellm_params": {
            "model": "azure/gpt-4",
            "api_key": "azure-key",
            "api_base": "https://backup.openai.azure.com/",
            "api_version": "2024-02-01"
        },
        "model_info": {
            "id": "backup-deployment"
        }
    }
]

router = Router(
    model_list=model_list,
    routing_strategy="least-busy",
    num_retries=3,
    max_fallbacks=2,
    timeout=30,
    enable_pre_call_checks=True,
    fallbacks=["gpt-3.5-turbo", "claude-3-haiku-20240307"]
)
```

### Router with Redis Caching

```python
router = Router(
    model_list=model_list,
    redis_url="redis://localhost:6379",
    cache_responses=True,
    client_ttl=3600,  # 1 hour cache TTL
    cache_kwargs={
        "ttl": 600,  # 10 minute default TTL
        "namespace": "litellm_cache"
    }
)

# Cached responses for identical requests
response1 = router.completion(
    model="gpt-4",
    messages=[{"role": "user", "content": "What is 2+2?"}]
)

# This will return the cached response
response2 = router.completion(
    model="gpt-4",
    messages=[{"role": "user", "content": "What is 2+2?"}]
)
```

### Cost-Based Routing

```python
model_list = [
    {
        "model_name": "gpt-4",
        "litellm_params": {"model": "gpt-4"},
        "model_info": {
            "input_cost_per_token": 0.00003,
            "output_cost_per_token": 0.00006
        }
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo"},
        "model_info": {
            "input_cost_per_token": 0.000001,
            "output_cost_per_token": 0.000002
        }
    }
]

router = Router(
    model_list=model_list,
    routing_strategy="cost-based-routing"
)

# Router will prefer cheaper models when possible
response = router.completion(
    model="gpt-4",  # Will route to gpt-3.5-turbo if suitable
    messages=[{"role": "user", "content": "Simple question"}]
)
```
469
470
### Health Monitoring
471
472
```python
473
# Check overall health
474
health = router.health_check()
475
print("Router Health:", health)
476
477
# Check specific model
478
gpt4_health = router.health_check(model="gpt-4")
479
print("GPT-4 Health:", gpt4_health)
480
481
# Print detailed metrics
482
router.print_deployment_metrics()
483
484
# Get cost information
485
costs = router.get_model_cost_map()
486
print("Cost Map:", costs)
487
```

### Dynamic Deployment Management

```python
# Add a new deployment at runtime
new_deployment = {
    "model_name": "claude-3",
    "litellm_params": {
        "model": "claude-3-sonnet-20240229",
        "api_key": "anthropic-key"
    },
    "model_info": {
        "id": "claude-deployment"
    }
}

router.add_deployment(new_deployment)

# Update an existing deployment
router.update_deployment(
    deployment_id="primary-deployment",
    api_key="new-primary-key"
)

# Remove a deployment
router.delete_deployment("backup-deployment")

# Get current deployments
deployments = router.get_deployments()
print(f"Active deployments: {len(deployments)}")
```

### Fallback Configuration

```python
router = Router(
    model_list=model_list,
    # Global fallbacks for any model
    fallbacks=["gpt-3.5-turbo", "claude-3-haiku-20240307"],
    # Context window fallbacks
    context_window_fallbacks=[
        {"gpt-4": ["claude-3-sonnet-20240229"]},  # If gpt-4 context exceeded
        {"claude-3-opus-20240229": ["gpt-4"]}  # If claude opus context exceeded
    ],
    # Content policy fallbacks
    content_policy_fallbacks=[
        {"gpt-4": ["claude-3-sonnet-20240229"]}  # If content policy violation
    ]
)

try:
    response = router.completion(
        model="gpt-4",
        messages=[{"role": "user", "content": "Very long prompt..."}]
    )
except Exception as e:
    print(f"All fallbacks exhausted: {e}")
```

### Async Router Usage

```python
import asyncio

async def concurrent_requests():
    router = Router(model_list=model_list)

    tasks = []
    for i in range(10):
        task = router.acompletion(
            model="gpt-4",
            messages=[{"role": "user", "content": f"Request {i}"}]
        )
        tasks.append(task)

    responses = await asyncio.gather(*tasks)
    return responses

responses = asyncio.run(concurrent_requests())
```

### Custom Retry Policy

```python
retry_policy = {
    "max_retries": 5,
    "base_delay": 1.0,  # Base delay between retries
    "max_delay": 60.0,  # Maximum delay between retries
    "backoff_factor": 2.0,  # Exponential backoff multiplier
    "jitter": True  # Add random jitter to prevent thundering herd
}

router = Router(
    model_list=model_list,
    retry_policy=retry_policy,
    allowed_fails=2,  # Deployments marked unhealthy after 2 failures
    cooldown_time=300  # 5 minute cooldown for unhealthy deployments
)
```