# Router & Load Balancing

Advanced routing system for intelligent load balancing, automatic fallbacks, and retry logic across multiple model deployments. The Router class provides enterprise-grade reliability features including health monitoring, cost optimization, and performance tracking.

## Capabilities

### Router Class

Main router class that manages multiple model deployments with intelligent routing strategies and automatic failover capabilities.

```python { .api }
class Router:
    def __init__(
        self,
        model_list: Optional[List[DeploymentTypedDict]] = None,
        # Caching configuration
        redis_url: Optional[str] = None,
        redis_host: Optional[str] = None,
        redis_port: Optional[int] = None,
        redis_password: Optional[str] = None,
        cache_responses: Optional[bool] = False,
        cache_kwargs: dict = {},
        caching_groups: Optional[List[tuple]] = None,
        client_ttl: int = 3600,
        # Reliability settings
        num_retries: Optional[int] = None,
        max_fallbacks: Optional[int] = None,
        timeout: Optional[float] = None,
        stream_timeout: Optional[float] = None,
        default_litellm_params: Optional[dict] = None,
        default_max_parallel_requests: Optional[int] = None,
        set_verbose: bool = False,
        debug_level: Literal["DEBUG", "INFO"] = "INFO",
        # Fallback configuration
        default_fallbacks: Optional[List[str]] = None,
        fallbacks: List = [],
        context_window_fallbacks: List = [],
        content_policy_fallbacks: List = [],
        # Routing strategy
        routing_strategy: Literal[
            "simple-shuffle",
            "least-busy",
            "usage-based-routing",
            "latency-based-routing",
            "cost-based-routing"
        ] = "simple-shuffle",
        # Authentication and validation
        enable_pre_call_checks: bool = False,
        allowed_fails: int = 3,
        cooldown_time: float = 1,
        retry_policy: Optional[Dict[str, Any]] = None,
        **kwargs
    )
    """
    Initialize Router with multiple model deployments and routing configuration.

    Args:
        model_list (Optional[List[DeploymentTypedDict]]): List of model deployment configurations
        routing_strategy (str): Strategy for selecting deployments ("simple-shuffle", "least-busy", etc.)
        num_retries (Optional[int]): Number of retries per deployment
        max_fallbacks (Optional[int]): Maximum fallback deployments to try
        timeout (Optional[float]): Request timeout in seconds
        cache_responses (Optional[bool]): Enable response caching
        fallbacks (List): Global fallback model list
        enable_pre_call_checks (bool): Validate deployments before requests
    """
```

### Router Completion Methods

Router provides the same completion interfaces as global functions but with intelligent routing and fallback capabilities.

```python { .api }
def completion(
    self,
    model: str,
    messages: List[Dict[str, Any]],
    # All standard completion parameters
    **kwargs
) -> Union[ModelResponse, Iterator[ModelResponseStream]]
    """
    Route completion request through configured deployments with fallbacks.

    Args:
        Same as litellm.completion() but routes through multiple deployments

    Returns:
        Union[ModelResponse, Iterator[ModelResponseStream]]: Routed completion response
    """

async def acompletion(
    self,
    model: str,
    messages: List[Dict[str, Any]],
    **kwargs
) -> Union[ModelResponse, AsyncIterator[ModelResponseStream]]
    """
    Async version of router completion with intelligent routing.
    """

def text_completion(
    self,
    model: str,
    prompt: str,
    **kwargs
) -> Union[TextCompletionResponse, Iterator[TextCompletionResponse]]
    """
    Route text completion request through configured deployments.
    """

async def atext_completion(
    self,
    model: str,
    prompt: str,
    **kwargs
) -> Union[TextCompletionResponse, AsyncIterator[TextCompletionResponse]]
    """
    Async text completion with routing.
    """

def embedding(
    self,
    model: str,
    input: Union[str, List[str], List[int], List[List[int]]],
    **kwargs
) -> EmbeddingResponse
    """
    Route embedding request through configured deployments.
    """

async def aembedding(
    self,
    model: str,
    input: Union[str, List[str], List[int], List[List[int]]],
    **kwargs
) -> EmbeddingResponse
    """
    Async embedding with routing.
    """

def image_generation(
    self,
    prompt: str,
    **kwargs
) -> ImageResponse
    """
    Route image generation through configured deployments.
    """

def transcription(
    self,
    model: str,
    file: Union[str, bytes, IO],
    **kwargs
) -> TranscriptionResponse
    """
    Route transcription through configured deployments.
    """

def speech(
    self,
    model: str,
    input: str,
    voice: str,
    **kwargs
) -> bytes
    """
    Route speech synthesis through configured deployments.
    """

def moderation(
    self,
    input: Union[str, List[str]],
    **kwargs
) -> ModerationCreateResponse
    """
    Route moderation through configured deployments.
    """
```

### Deployment Management

Methods for managing model deployments dynamically during runtime.

```python { .api }
def add_deployment(self, deployment: DeploymentTypedDict) -> None:
    """
    Add a new model deployment to the router.

    Args:
        deployment (DeploymentTypedDict): Deployment configuration
    """

def delete_deployment(self, deployment_id: str) -> None:
    """
    Remove a deployment from the router.

    Args:
        deployment_id (str): ID of deployment to remove
    """

def get_deployments(self) -> List[DeploymentTypedDict]:
    """
    Get all configured deployments.

    Returns:
        List[DeploymentTypedDict]: List of all deployments
    """

def set_model_list(self, model_list: List[DeploymentTypedDict]) -> None:
    """
    Replace entire model list with new deployments.

    Args:
        model_list (List[DeploymentTypedDict]): New list of deployments
    """

def update_deployment(
    self,
    deployment_id: str,
    **kwargs
) -> None:
    """
    Update configuration of existing deployment.

    Args:
        deployment_id (str): ID of deployment to update
        **kwargs: Updated configuration parameters
    """
```

### Health Monitoring

Health check and monitoring capabilities for deployment status and performance.

```python { .api }
def health_check(
    self,
    model: Optional[str] = None
) -> Dict[str, Any]:
    """
    Check health status of deployments.

    Args:
        model (Optional[str]): Specific model to check, or all if None

    Returns:
        Dict[str, Any]: Health status report with deployment statuses
    """

async def ahealth_check(
    self,
    model: Optional[str] = None
) -> Dict[str, Any]:
    """
    Async health check of deployments.

    Args:
        model (Optional[str]): Specific model to check

    Returns:
        Dict[str, Any]: Health status report
    """
```

### Analytics & Metrics

Cost tracking, usage analytics, and performance metrics for router deployments.

```python { .api }
def get_model_cost_map(self) -> Dict[str, Any]:
    """
    Get cost information for all configured models.

    Returns:
        Dict[str, Any]: Model cost mapping with pricing details
    """

def print_deployment_metrics(self) -> None:
    """
    Print detailed metrics for all deployments including:
    - Request counts and success rates
    - Average latency and throughput
    - Cost tracking and token usage
    - Error rates and failure types
    """

def reset_cost(self) -> None:
    """
    Reset accumulated cost tracking data.
    """

def get_usage_stats(self) -> Dict[str, Any]:
    """
    Get comprehensive usage statistics.

    Returns:
        Dict[str, Any]: Usage statistics including tokens, costs, latencies
    """
```

## Configuration Types

```python { .api }
class DeploymentTypedDict(TypedDict):
    """Model deployment configuration"""
    model_name: str
    litellm_params: Dict[str, Any]
    model_info: Optional[Dict[str, Any]]

class LiteLLMParams(TypedDict):
    """Parameters for LiteLLM model configuration"""
    model: str
    api_key: Optional[str]
    api_base: Optional[str]
    api_version: Optional[str]
    timeout: Optional[float]
    max_retries: Optional[int]
    custom_llm_provider: Optional[str]

class ModelInfo(TypedDict):
    """Model metadata and capabilities"""
    id: Optional[str]
    mode: Optional[Literal["chat", "completion", "embedding"]]
    input_cost_per_token: Optional[float]
    output_cost_per_token: Optional[float]
    max_tokens: Optional[int]
    supports_function_calling: Optional[bool]
    supports_vision: Optional[bool]
```

## Usage Examples

### Basic Router Setup

```python
from litellm import Router

# Configure multiple OpenAI deployments
model_list = [
    {
        "model_name": "gpt-4",
        "litellm_params": {
            "model": "gpt-4",
            "api_key": "sk-key1",
            "api_base": "https://api.openai.com/v1"
        }
    },
    {
        "model_name": "gpt-4",
        "litellm_params": {
            "model": "azure/gpt-4",
            "api_key": "azure-key",
            "api_base": "https://my-azure.openai.azure.com/",
            "api_version": "2024-02-01"
        }
    }
]

router = Router(model_list=model_list)

# Use the router like a normal completion call
response = router.completion(
    model="gpt-4",
    messages=[{"role": "user", "content": "Hello!"}]
)
```

### Advanced Router Configuration

```python
from litellm import Router

model_list = [
    {
        "model_name": "gpt-4-primary",
        "litellm_params": {
            "model": "gpt-4",
            "api_key": "primary-key"
        },
        "model_info": {
            "id": "primary-deployment"
        }
    },
    {
        "model_name": "gpt-4-fallback",
        "litellm_params": {
            "model": "azure/gpt-4",
            "api_key": "azure-key",
            "api_base": "https://backup.openai.azure.com/",
            "api_version": "2024-02-01"
        },
        "model_info": {
            "id": "backup-deployment"
        }
    }
]

router = Router(
    model_list=model_list,
    routing_strategy="least-busy",
    num_retries=3,
    max_fallbacks=2,
    timeout=30,
    enable_pre_call_checks=True,
    fallbacks=["gpt-3.5-turbo", "claude-3-haiku-20240307"]
)
```

### Router with Redis Caching

```python
router = Router(
    model_list=model_list,
    redis_url="redis://localhost:6379",
    cache_responses=True,
    client_ttl=3600,  # 1 hour cache TTL
    cache_kwargs={
        "ttl": 600,  # 10 minute default TTL
        "namespace": "litellm_cache"
    }
)

# Cached responses for identical requests
response1 = router.completion(
    model="gpt-4",
    messages=[{"role": "user", "content": "What is 2+2?"}]
)

# This will return the cached response
response2 = router.completion(
    model="gpt-4",
    messages=[{"role": "user", "content": "What is 2+2?"}]
)
```

### Cost-Based Routing

```python
model_list = [
    {
        "model_name": "gpt-4",
        "litellm_params": {"model": "gpt-4"},
        "model_info": {
            "input_cost_per_token": 0.00003,
            "output_cost_per_token": 0.00006
        }
    },
    {
        "model_name": "gpt-3.5-turbo",
        "litellm_params": {"model": "gpt-3.5-turbo"},
        "model_info": {
            "input_cost_per_token": 0.000001,
            "output_cost_per_token": 0.000002
        }
    }
]

router = Router(
    model_list=model_list,
    routing_strategy="cost-based-routing"
)

# Router will prefer cheaper models when possible
response = router.completion(
    model="gpt-4",  # Will route to gpt-3.5-turbo if suitable
    messages=[{"role": "user", "content": "Simple question"}]
)
```
469
470
### Health Monitoring
471
472
```python
473
# Check overall health
474
health = router.health_check()
475
print("Router Health:", health)
476
477
# Check specific model
478
gpt4_health = router.health_check(model="gpt-4")
479
print("GPT-4 Health:", gpt4_health)
480
481
# Print detailed metrics
482
router.print_deployment_metrics()
483
484
# Get cost information
485
costs = router.get_model_cost_map()
486
print("Cost Map:", costs)
487
```

### Dynamic Deployment Management

```python
# Add a new deployment at runtime
new_deployment = {
    "model_name": "claude-3",
    "litellm_params": {
        "model": "claude-3-sonnet-20240229",
        "api_key": "anthropic-key"
    },
    "model_info": {
        "id": "claude-deployment"
    }
}

router.add_deployment(new_deployment)

# Update an existing deployment
router.update_deployment(
    deployment_id="primary-deployment",
    api_key="new-primary-key"
)

# Remove a deployment
router.delete_deployment("backup-deployment")

# Get current deployments
deployments = router.get_deployments()
print(f"Active deployments: {len(deployments)}")
```

### Fallback Configuration

```python
router = Router(
    model_list=model_list,
    # Global fallbacks for any model
    fallbacks=["gpt-3.5-turbo", "claude-3-haiku-20240307"],
    # Context window fallbacks
    context_window_fallbacks=[
        {"gpt-4": ["claude-3-sonnet-20240229"]},  # If gpt-4 context exceeded
        {"claude-3-opus-20240229": ["gpt-4"]}  # If claude opus context exceeded
    ],
    # Content policy fallbacks
    content_policy_fallbacks=[
        {"gpt-4": ["claude-3-sonnet-20240229"]}  # If content policy violation
    ]
)

try:
    response = router.completion(
        model="gpt-4",
        messages=[{"role": "user", "content": "Very long prompt..."}]
    )
except Exception as e:
    print(f"All fallbacks exhausted: {e}")
```

### Async Router Usage

```python
import asyncio

async def concurrent_requests():
    router = Router(model_list=model_list)

    tasks = []
    for i in range(10):
        task = router.acompletion(
            model="gpt-4",
            messages=[{"role": "user", "content": f"Request {i}"}]
        )
        tasks.append(task)

    responses = await asyncio.gather(*tasks)
    return responses

responses = asyncio.run(concurrent_requests())
```

### Custom Retry Policy

```python
retry_policy = {
    "max_retries": 5,
    "base_delay": 1.0,  # Base delay between retries
    "max_delay": 60.0,  # Maximum delay between retries
    "backoff_factor": 2.0,  # Exponential backoff multiplier
    "jitter": True  # Add random jitter to prevent thundering herd
}

router = Router(
    model_list=model_list,
    retry_policy=retry_policy,
    allowed_fails=2,  # Deployments marked unhealthy after 2 failures
    cooldown_time=300  # 5 minute cooldown for unhealthy deployments
)
```