or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-completion.md exceptions.md index.md other-apis.md providers.md router.md utilities.md

docs/router.md

0

# Router & Load Balancing

1

2

Advanced routing system for intelligent load balancing, automatic fallbacks, and retry logic across multiple model deployments. The Router class provides enterprise-grade reliability features including health monitoring, cost optimization, and performance tracking.

3

4

## Capabilities

5

6

### Router Class

7

8

Main router class that manages multiple model deployments with intelligent routing strategies and automatic failover capabilities.

9

10

```python { .api }

11

class Router:

12

def __init__(

13

self,

14

model_list: Optional[List[DeploymentTypedDict]] = None,

15

# Caching configuration

16

redis_url: Optional[str] = None,

17

redis_host: Optional[str] = None,

18

redis_port: Optional[int] = None,

19

redis_password: Optional[str] = None,

20

cache_responses: Optional[bool] = False,

21

cache_kwargs: dict = {},

22

caching_groups: Optional[List[tuple]] = None,

23

client_ttl: int = 3600,

24

# Reliability settings

25

num_retries: Optional[int] = None,

26

max_fallbacks: Optional[int] = None,

27

timeout: Optional[float] = None,

28

stream_timeout: Optional[float] = None,

29

default_litellm_params: Optional[dict] = None,

30

default_max_parallel_requests: Optional[int] = None,

31

set_verbose: bool = False,

32

debug_level: Literal["DEBUG", "INFO"] = "INFO",

33

# Fallback configuration

34

default_fallbacks: Optional[List[str]] = None,

35

fallbacks: List = [],

36

context_window_fallbacks: List = [],

37

content_policy_fallbacks: List = [],

38

# Routing strategy

39

routing_strategy: Literal[

40

"simple-shuffle",

41

"least-busy",

42

"usage-based-routing",

43

"latency-based-routing",

44

"cost-based-routing"

45

] = "simple-shuffle",

46

# Pre-call validation and failure handling

47

enable_pre_call_checks: bool = False,

48

allowed_fails: int = 3,

49

cooldown_time: float = 1,

50

retry_policy: Optional[Dict[str, Any]] = None,

51

**kwargs

52

)

53

"""

54

Initialize Router with multiple model deployments and routing configuration.

55

56

Args:

57

model_list (Optional[List[DeploymentTypedDict]]): List of model deployment configurations

58

routing_strategy (str): Strategy for selecting deployments ("simple-shuffle", "least-busy", etc.)

59

num_retries (Optional[int]): Number of retries per deployment

60

max_fallbacks (Optional[int]): Maximum fallback deployments to try

61

timeout (Optional[float]): Request timeout in seconds

62

cache_responses (Optional[bool]): Enable response caching

63

fallbacks (List): Global fallback model list

64

enable_pre_call_checks (bool): Validate deployments before requests

65

"""

66

```

67

68

### Router Completion Methods

69

70

Router provides the same completion interfaces as global functions but with intelligent routing and fallback capabilities.

71

72

```python { .api }

73

def completion(

74

self,

75

model: str,

76

messages: List[Dict[str, Any]],

77

# All standard completion parameters

78

**kwargs

79

) -> Union[ModelResponse, Iterator[ModelResponseStream]]

80

"""

81

Route completion request through configured deployments with fallbacks.

82

83

Args:

84

Same as litellm.completion() but routes through multiple deployments

85

86

Returns:

87

Union[ModelResponse, Iterator[ModelResponseStream]]: Routed completion response

88

"""

89

90

async def acompletion(

91

self,

92

model: str,

93

messages: List[Dict[str, Any]],

94

**kwargs

95

) -> Union[ModelResponse, AsyncIterator[ModelResponseStream]]

96

"""

97

Async version of router completion with intelligent routing.

98

"""

99

100

def text_completion(

101

self,

102

model: str,

103

prompt: str,

104

**kwargs

105

) -> Union[TextCompletionResponse, Iterator[TextCompletionResponse]]

106

"""

107

Route text completion request through configured deployments.

108

"""

109

110

async def atext_completion(

111

self,

112

model: str,

113

prompt: str,

114

**kwargs

115

) -> Union[TextCompletionResponse, AsyncIterator[TextCompletionResponse]]

116

"""

117

Async text completion with routing.

118

"""

119

120

def embedding(

121

self,

122

model: str,

123

input: Union[str, List[str], List[int], List[List[int]]],

124

**kwargs

125

) -> EmbeddingResponse

126

"""

127

Route embedding request through configured deployments.

128

"""

129

130

async def aembedding(

131

self,

132

model: str,

133

input: Union[str, List[str], List[int], List[List[int]]],

134

**kwargs

135

) -> EmbeddingResponse

136

"""

137

Async embedding with routing.

138

"""

139

140

def image_generation(

141

self,

142

prompt: str,

143

**kwargs

144

) -> ImageResponse

145

"""

146

Route image generation through configured deployments.

147

"""

148

149

def transcription(

150

self,

151

model: str,

152

file: Union[str, bytes, IO],

153

**kwargs

154

) -> TranscriptionResponse

155

"""

156

Route transcription through configured deployments.

157

"""

158

159

def speech(

160

self,

161

model: str,

162

input: str,

163

voice: str,

164

**kwargs

165

) -> bytes

166

"""

167

Route speech synthesis through configured deployments.

168

"""

169

170

def moderation(

171

self,

172

input: Union[str, List[str]],

173

**kwargs

174

) -> ModerationCreateResponse

175

"""

176

Route moderation through configured deployments.

177

"""

178

```

179

180

### Deployment Management

181

182

Methods for managing model deployments dynamically during runtime.

183

184

```python { .api }

185

def add_deployment(self, deployment: DeploymentTypedDict) -> None:

186

"""

187

Add a new model deployment to the router.

188

189

Args:

190

deployment (DeploymentTypedDict): Deployment configuration

191

"""

192

193

def delete_deployment(self, deployment_id: str) -> None:

194

"""

195

Remove a deployment from the router.

196

197

Args:

198

deployment_id (str): ID of deployment to remove

199

"""

200

201

def get_deployments(self) -> List[DeploymentTypedDict]:

202

"""

203

Get all configured deployments.

204

205

Returns:

206

List[DeploymentTypedDict]: List of all deployments

207

"""

208

209

def set_model_list(self, model_list: List[DeploymentTypedDict]) -> None:

210

"""

211

Replace entire model list with new deployments.

212

213

Args:

214

model_list (List[DeploymentTypedDict]): New list of deployments

215

"""

216

217

def update_deployment(

218

self,

219

deployment_id: str,

220

**kwargs

221

) -> None:

222

"""

223

Update configuration of existing deployment.

224

225

Args:

226

deployment_id (str): ID of deployment to update

227

**kwargs: Updated configuration parameters

228

"""

229

```

230

231

### Health Monitoring

232

233

Health check and monitoring capabilities for deployment status and performance.

234

235

```python { .api }

236

def health_check(

237

self,

238

model: Optional[str] = None

239

) -> Dict[str, Any]:

240

"""

241

Check health status of deployments.

242

243

Args:

244

model (Optional[str]): Specific model to check, or all if None

245

246

Returns:

247

Dict[str, Any]: Health status report with deployment statuses

248

"""

249

250

async def ahealth_check(

251

self,

252

model: Optional[str] = None

253

) -> Dict[str, Any]:

254

"""

255

Async health check of deployments.

256

257

Args:

258

model (Optional[str]): Specific model to check

259

260

Returns:

261

Dict[str, Any]: Health status report

262

"""

263

```

264

265

### Analytics & Metrics

266

267

Cost tracking, usage analytics, and performance metrics for router deployments.

268

269

```python { .api }

270

def get_model_cost_map(self) -> Dict[str, Any]:

271

"""

272

Get cost information for all configured models.

273

274

Returns:

275

Dict[str, Any]: Model cost mapping with pricing details

276

"""

277

278

def print_deployment_metrics(self) -> None:

279

"""

280

Print detailed metrics for all deployments including:

281

- Request counts and success rates

282

- Average latency and throughput

283

- Cost tracking and token usage

284

- Error rates and failure types

285

"""

286

287

def reset_cost(self) -> None:

288

"""

289

Reset accumulated cost tracking data.

290

"""

291

292

def get_usage_stats(self) -> Dict[str, Any]:

293

"""

294

Get comprehensive usage statistics.

295

296

Returns:

297

Dict[str, Any]: Usage statistics including tokens, costs, latencies

298

"""

299

```

300

301

## Configuration Types

302

303

```python { .api }

304

class DeploymentTypedDict(TypedDict):

305

"""Model deployment configuration"""

306

model_name: str

307

litellm_params: Dict[str, Any]

308

model_info: Optional[Dict[str, Any]]

309

310

class LiteLLMParams(TypedDict):

311

"""Parameters for LiteLLM model configuration"""

312

model: str

313

api_key: Optional[str]

314

api_base: Optional[str]

315

api_version: Optional[str]

316

timeout: Optional[float]

317

max_retries: Optional[int]

318

custom_llm_provider: Optional[str]

319

320

class ModelInfo(TypedDict):

321

"""Model metadata and capabilities"""

322

id: Optional[str]

323

mode: Optional[Literal["chat", "completion", "embedding"]]

324

input_cost_per_token: Optional[float]

325

output_cost_per_token: Optional[float]

326

max_tokens: Optional[int]

327

supports_function_calling: Optional[bool]

328

supports_vision: Optional[bool]

329

```

330

331

## Usage Examples

332

333

### Basic Router Setup

334

335

```python

336

from litellm import Router

337

338

# Configure multiple OpenAI deployments

339

model_list = [

340

{

341

"model_name": "gpt-4",

342

"litellm_params": {

343

"model": "gpt-4",

344

"api_key": "sk-key1",

345

"api_base": "https://api.openai.com/v1"

346

}

347

},

348

{

349

"model_name": "gpt-4",

350

"litellm_params": {

351

"model": "azure/gpt-4",

352

"api_key": "azure-key",

353

"api_base": "https://my-azure.openai.azure.com/",

354

"api_version": "2024-02-01"

355

}

356

}

357

]

358

359

router = Router(model_list=model_list)

360

361

# Use router like normal completion

362

response = router.completion(

363

model="gpt-4",

364

messages=[{"role": "user", "content": "Hello!"}]

365

)

366

```

367

368

### Advanced Router Configuration

369

370

```python

371

from litellm import Router

372

373

model_list = [

374

{

375

"model_name": "gpt-4-primary",

376

"litellm_params": {

377

"model": "gpt-4",

378

"api_key": "primary-key"

379

},

380

"model_info": {

381

"id": "primary-deployment"

382

}

383

},

384

{

385

"model_name": "gpt-4-fallback",

386

"litellm_params": {

387

"model": "azure/gpt-4",

388

"api_key": "azure-key",

389

"api_base": "https://backup.openai.azure.com/",

390

"api_version": "2024-02-01"

391

},

392

"model_info": {

393

"id": "backup-deployment"

394

}

395

}

396

]

397

398

router = Router(

399

model_list=model_list,

400

routing_strategy="least-busy",

401

num_retries=3,

402

max_fallbacks=2,

403

timeout=30,

404

enable_pre_call_checks=True,

405

fallbacks=["gpt-3.5-turbo", "claude-3-haiku-20240307"]

406

)

407

```

408

409

### Router with Redis Caching

410

411

```python

412

router = Router(

413

model_list=model_list,

414

redis_url="redis://localhost:6379",

415

cache_responses=True,

416

client_ttl=3600, # 1 hour TTL for cached clients (response-cache TTL is set in cache_kwargs below)

417

cache_kwargs={

418

"ttl": 600, # 10 minute default TTL

419

"namespace": "litellm_cache"

420

}

421

)

422

423

# Cached responses for identical requests

424

response1 = router.completion(

425

model="gpt-4",

426

messages=[{"role": "user", "content": "What is 2+2?"}]

427

)

428

429

# This will return cached response

430

response2 = router.completion(

431

model="gpt-4",

432

messages=[{"role": "user", "content": "What is 2+2?"}]

433

)

434

```

435

436

### Cost-Based Routing

437

438

```python

439

model_list = [

440

{

441

"model_name": "gpt-4",

442

"litellm_params": {"model": "gpt-4"},

443

"model_info": {

444

"input_cost_per_token": 0.00003,

445

"output_cost_per_token": 0.00006

446

}

447

},

448

{

449

"model_name": "gpt-3.5-turbo",

450

"litellm_params": {"model": "gpt-3.5-turbo"},

451

"model_info": {

452

"input_cost_per_token": 0.000001,

453

"output_cost_per_token": 0.000002

454

}

455

}

456

]

457

458

router = Router(

459

model_list=model_list,

460

routing_strategy="cost-based-routing"

461

)

462

463

# Router will prefer cheaper models when possible

464

response = router.completion(

465

model="gpt-4", # cost-based routing selects the cheapest deployment available for this request

466

messages=[{"role": "user", "content": "Simple question"}]

467

)

468

```

469

470

### Health Monitoring

471

472

```python

473

# Check overall health

474

health = router.health_check()

475

print("Router Health:", health)

476

477

# Check specific model

478

gpt4_health = router.health_check(model="gpt-4")

479

print("GPT-4 Health:", gpt4_health)

480

481

# Print detailed metrics

482

router.print_deployment_metrics()

483

484

# Get cost information

485

costs = router.get_model_cost_map()

486

print("Cost Map:", costs)

487

```

488

489

### Dynamic Deployment Management

490

491

```python

492

# Add new deployment at runtime

493

new_deployment = {

494

"model_name": "claude-3",

495

"litellm_params": {

496

"model": "claude-3-sonnet-20240229",

497

"api_key": "anthropic-key"

498

},

499

"model_info": {

500

"id": "claude-deployment"

501

}

502

}

503

504

router.add_deployment(new_deployment)

505

506

# Update existing deployment

507

router.update_deployment(

508

deployment_id="primary-deployment",

509

api_key="new-primary-key"

510

)

511

512

# Remove deployment

513

router.delete_deployment("backup-deployment")

514

515

# Get current deployments

516

deployments = router.get_deployments()

517

print(f"Active deployments: {len(deployments)}")

518

```

519

520

### Fallback Configuration

521

522

```python

523

router = Router(

524

model_list=model_list,

525

# Global fallbacks for any model

526

fallbacks=["gpt-3.5-turbo", "claude-3-haiku-20240307"],

527

# Context window fallbacks

528

context_window_fallbacks=[

529

{"gpt-4": ["claude-3-sonnet-20240229"]}, # If gpt-4 context exceeded

530

{"claude-3-opus-20240229": ["gpt-4"]} # If claude opus context exceeded

531

],

532

# Content policy fallbacks

533

content_policy_fallbacks=[

534

{"gpt-4": ["claude-3-sonnet-20240229"]} # If content policy violation

535

]

536

)

537

538

try:

539

response = router.completion(

540

model="gpt-4",

541

messages=[{"role": "user", "content": "Very long prompt..."}]

542

)

543

except Exception as e:

544

print(f"All fallbacks exhausted: {e}")

545

```

546

547

### Async Router Usage

548

549

```python

550

import asyncio

551

552

async def concurrent_requests():

553

router = Router(model_list=model_list)

554

555

tasks = []

556

for i in range(10):

557

task = router.acompletion(

558

model="gpt-4",

559

messages=[{"role": "user", "content": f"Request {i}"}]

560

)

561

tasks.append(task)

562

563

responses = await asyncio.gather(*tasks)

564

return responses

565

566

responses = asyncio.run(concurrent_requests())

567

```

568

569

### Custom Retry Policy

570

571

```python

572

retry_policy = {

573

"max_retries": 5,

574

"base_delay": 1.0, # Base delay between retries

575

"max_delay": 60.0, # Maximum delay between retries

576

"backoff_factor": 2.0, # Exponential backoff multiplier

577

"jitter": True # Add random jitter to prevent thundering herd

578

}

579

580

router = Router(

581

model_list=model_list,

582

retry_policy=retry_policy,

583

allowed_fails=2, # Deployments marked unhealthy after 2 failures

584

cooldown_time=300 # 5 minute cooldown for unhealthy deployments

585

)

586

```