or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-distributed.md · data-processing.md · distributed-training.md · hyperparameter-tuning.md · index.md · model-serving.md · reinforcement-learning.md · utilities-advanced.md

docs/model-serving.md

0

# Model Serving

1

2

Ray Serve provides scalable model serving and application deployment with automatic scaling, batching, and multi-model support. It enables production deployment of ML models and Python applications.

3

4

## Capabilities

5

6

### Core Serving Framework

7

8

Basic serving functionality and deployment management.

9

10

```python { .api }

11

def start(detached=False, http_options=HTTPOptions(), **kwargs):

12

"""

13

Start Ray Serve.

14

15

Args:

16

detached (bool): Whether to run in detached mode

17

http_options (HTTPOptions, optional): HTTP configuration

18

**kwargs: Additional Ray initialization arguments

19

"""

20

21

def shutdown():

22

"""Shutdown Ray Serve."""

23

24

def run(target, *, name=None, route_prefix=None, blocking=True, **kwargs):

25

"""

26

Deploy and run a deployment.

27

28

Args:

29

target: Deployment target (function, class, or Deployment)

30

name (str, optional): Deployment name

31

route_prefix (str, optional): HTTP route prefix

32

blocking (bool): Whether to block until deployment is ready

33

**kwargs: Additional deployment options

34

35

Returns:

36

DeploymentHandle: Handle to deployment

37

"""

38

39

def status():

40

"""

41

Get status of Ray Serve deployments.

42

43

Returns:

44

str: Status information

45

"""

46

47

class HTTPOptions:

48

"""HTTP server configuration options."""

49

50

def __init__(self, *, host="127.0.0.1", port=8000, middlewares=None,

51

location="EveryNode", num_cpus=0):

52

"""

53

Initialize HTTP options.

54

55

Args:

56

host (str): Host to bind to

57

port (int): Port to bind to

58

middlewares (list, optional): ASGI middlewares

59

location (str): Where to run HTTP servers

60

num_cpus (int): CPUs for HTTP servers

61

"""

62

```

63

64

### Deployment Decorator and Configuration

65

66

Create and configure deployments.

67

68

```python { .api }

69

def deployment(func_or_class=None, *, name=None, version=None,

70

num_replicas=None, route_prefix=None, ray_actor_options=None,

71

user_config=None, max_concurrent_queries=None,

72

autoscaling_config=None, graceful_shutdown_wait_loop_s=None,

73

graceful_shutdown_timeout_s=None, health_check_period_s=None,

74

health_check_timeout_s=None, is_driver_deployment=None):

75

"""

76

Decorator to create Ray Serve deployment.

77

78

Args:

79

func_or_class: Function or class to deploy

80

name (str, optional): Deployment name

81

version (str, optional): Deployment version

82

num_replicas (int, optional): Number of replicas

83

route_prefix (str, optional): HTTP route prefix

84

ray_actor_options (dict, optional): Ray actor options

85

user_config: User configuration

86

max_concurrent_queries (int, optional): Max concurrent queries per replica

87

autoscaling_config (AutoscalingConfig, optional): Autoscaling configuration

88

graceful_shutdown_wait_loop_s (float, optional): Graceful shutdown wait

89

graceful_shutdown_timeout_s (float, optional): Graceful shutdown timeout

90

health_check_period_s (float, optional): Health check period

91

health_check_timeout_s (float, optional): Health check timeout

92

is_driver_deployment (bool, optional): Whether this is driver deployment

93

94

Returns:

95

Deployment: Deployment object

96

"""

97

98

class Deployment:

99

"""Ray Serve deployment."""

100

101

def deploy(self, *init_args, _blocking=True, **init_kwargs):

102

"""

103

Deploy this deployment.

104

105

Args:

106

*init_args: Arguments for deployment initialization

107

_blocking (bool): Whether to block until ready

108

**init_kwargs: Keyword arguments for initialization

109

110

Returns:

111

DeploymentHandle: Handle to deployment

112

"""

113

114

def delete(self):

115

"""Delete this deployment."""

116

117

def get_handle(self, sync=None):

118

"""

119

Get handle to this deployment.

120

121

Args:

122

sync (bool, optional): Whether to use sync handle

123

124

Returns:

125

DeploymentHandle: Handle to deployment

126

"""

127

128

def options(self, *, func_or_class=None, **kwargs):

129

"""

130

Create new deployment with modified options.

131

132

Args:

133

func_or_class: New function or class

134

**kwargs: Options to modify

135

136

Returns:

137

Deployment: New deployment with modified options

138

"""

139

140

def multiplexed(max_num_models_per_replica=None, *, buffer_size_bytes=100_000_000,

141

buffer_size_bytes_per_replica=None, max_num_models=None):

142

"""

143

Decorator for multiplexed deployments supporting multiple models.

144

145

Args:

146

max_num_models_per_replica (int, optional): Max models per replica

147

buffer_size_bytes (int): Buffer size in bytes

148

buffer_size_bytes_per_replica (int, optional): Buffer size per replica

149

max_num_models (int, optional): Maximum total models

150

151

Returns:

152

Decorator function for multiplexed deployment

153

"""

154

155

def get_multiplexed_model_id():

156

"""

157

Get current multiplexed model ID within a deployment.

158

159

Returns:

160

str: Current model ID

161

"""

162

163

class AutoscalingConfig:

164

"""Configuration for deployment autoscaling."""

165

166

def __init__(self, *, min_replicas=None, max_replicas=None,

167

target_num_ongoing_requests_per_replica=None,

168

metrics_interval_s=None, look_back_period_s=None,

169

smoothing_factor=None, downscale_delay_s=None,

170

upscale_delay_s=None):

171

"""

172

Initialize autoscaling configuration.

173

174

Args:

175

min_replicas (int, optional): Minimum number of replicas

176

max_replicas (int, optional): Maximum number of replicas

177

target_num_ongoing_requests_per_replica (float, optional): Target requests per replica

178

metrics_interval_s (float, optional): Metrics collection interval

179

look_back_period_s (float, optional): Metrics lookback period

180

smoothing_factor (float, optional): Smoothing factor for metrics

181

downscale_delay_s (float, optional): Delay before downscaling

182

upscale_delay_s (float, optional): Delay before upscaling

183

"""

184

```

185

186

### Deployment Handles

187

188

Interact with deployed models and services.

189

190

```python { .api }

191

class DeploymentHandle:

192

"""Handle for interacting with deployment."""

193

194

def remote(self, *args, **kwargs):

195

"""

196

Make async request to deployment.

197

198

Args:

199

*args: Arguments to pass

200

**kwargs: Keyword arguments to pass

201

202

Returns:

203

DeploymentResponse: Response object

204

"""

205

206

def options(self, *, method_name=None, multiplexed_model_id=None, **kwargs):

207

"""

208

Create handle with modified options.

209

210

Args:

211

method_name (str, optional): Method to call

212

multiplexed_model_id (str, optional): Model ID for multiplexing

213

**kwargs: Additional options

214

215

Returns:

216

DeploymentHandle: Handle with modified options

217

"""

218

219

class DeploymentResponse:

220

"""Response from deployment."""

221

222

def result(self, *, timeout_s=None):

223

"""

224

Get result (blocking).

225

226

Args:

227

timeout_s (float, optional): Timeout in seconds

228

229

Returns:

230

Result of deployment call

231

"""

232

233

class DeploymentResponseGenerator:

234

"""Generator for streaming deployment responses."""

235

236

def __iter__(self):

237

"""Iterate over streaming responses."""

238

239

def __next__(self):

240

"""Get next response."""

241

```

242

243

### Application Framework

244

245

Build complex serving applications.

246

247

```python { .api }

248

class Application:

249

"""Ray Serve application."""

250

251

def __init__(self, import_path, *, args=None, kwargs=None):

252

"""

253

Initialize application.

254

255

Args:

256

import_path (str): Import path to application

257

args (list, optional): Arguments for application

258

kwargs (dict, optional): Keyword arguments for application

259

"""

260

261

def build(app_or_deployment, *args, **kwargs):

262

"""

263

Build application from deployment or function.

264

265

Args:

266

app_or_deployment: Application or deployment to build

267

*args: Arguments for building

268

**kwargs: Keyword arguments for building

269

270

Returns:

271

Application: Built application

272

"""

273

```

274

275

### Batching Support

276

277

Batch requests for improved throughput.

278

279

```python { .api }

280

class Batched:

281

"""Decorator for batched request handling."""

282

283

def __init__(self, *, max_batch_size=None, batch_wait_timeout_s=None):

284

"""

285

Initialize batching decorator.

286

287

Args:

288

max_batch_size (int, optional): Maximum batch size

289

batch_wait_timeout_s (float, optional): Batch wait timeout

290

"""

291

292

def batch(max_batch_size=None, batch_wait_timeout_s=None):

293

"""

294

Decorator for batched request handling.

295

296

Args:

297

max_batch_size (int, optional): Maximum batch size

298

batch_wait_timeout_s (float, optional): Batch wait timeout

299

300

Returns:

301

Batched: Batching decorator

302

"""

303

```

304

305

### Ingress and Routing

306

307

Handle HTTP requests and routing.

308

309

```python { .api }

310

class Ingress:

311

"""Base class for custom HTTP ingress."""

312

313

async def __call__(self, request):

314

"""

315

Handle HTTP request.

316

317

Args:

318

request: HTTP request

319

320

Returns:

321

HTTP response

322

"""

323

324

def ingress(app):

325

"""

326

Mark deployment as HTTP ingress.

327

328

Args:

329

app: Deployment to mark as ingress

330

331

Returns:

332

Deployment with ingress configuration

333

"""

334

```

335

336

### Model Multiplexing

337

338

Serve multiple models from single deployment.

339

340

```python { .api }

341

class MultiplexedReplicaResult:

342

"""Result from multiplexed model call."""

343

344

def __init__(self, result):

345

"""Initialize with result."""

346

347

def get_multiplexed_model_id():

348

"""

349

Get current multiplexed model ID.

350

351

Returns:

352

str: Current model ID

353

"""

354

```

355

356

### Configuration and Context

357

358

Runtime configuration and context access.

359

360

```python { .api }

361

def get_replica_context():

362

"""

363

Get current replica context.

364

365

Returns:

366

ReplicaContext: Current replica context

367

"""

368

369

class ReplicaContext:

370

"""Context for current replica."""

371

372

@property

373

def deployment(self):

374

"""Current deployment name."""

375

376

@property

377

def replica_tag(self):

378

"""Current replica tag."""

379

380

@property

381

def servable_object(self):

382

"""Current servable object."""

383

```

384

385

## Usage Examples

386

387

### Basic Model Serving

388

389

```python

390

import ray

391

from ray import serve

392

import numpy as np

393

394

# Start Ray Serve

395

serve.start()

396

397

# Define a simple model

398

@serve.deployment

399

class SimpleModel:

400

def __init__(self):

401

# Load your model here

402

self.model = self._load_model()

403

404

def _load_model(self):

405

# Placeholder for model loading

406

return lambda x: x * 2

407

408

def __call__(self, request):

409

data = request.json()

410

input_data = np.array(data["input"])

411

prediction = self.model(input_data)

412

return {"prediction": prediction.tolist()}

413

414

# Deploy the model

415

SimpleModel.deploy()

416

417

# Make a request

418

import requests

419

response = requests.post("http://127.0.0.1:8000/SimpleModel",

420

json={"input": [1, 2, 3, 4]})

421

print(response.json()) # {"prediction": [2, 4, 6, 8]}

422

423

serve.shutdown()

424

```

425

426

### Advanced Model with Batching

427

428

```python

429

import ray

430

from ray import serve

431

import torch

432

433

serve.start()

434

435

@serve.deployment(

436

num_replicas=2,

437

ray_actor_options={"num_cpus": 1, "num_gpus": 0.5}

438

)

439

class PyTorchModel:

440

def __init__(self, model_path):

441

self.model = torch.load(model_path)

442

self.model.eval()

443

444

@serve.batch(max_batch_size=32, batch_wait_timeout_s=0.1)

445

async def predict_batch(self, inputs):

446

batch = torch.stack(inputs)

447

with torch.no_grad():

448

predictions = self.model(batch)

449

return predictions.numpy()

450

451

async def __call__(self, request):

452

data = torch.tensor(request.json()["input"])

453

prediction = await self.predict_batch(data)

454

return {"prediction": prediction.tolist()}

455

456

# Deploy with specific configuration

457

PyTorchModel.options(

458

autoscaling_config=serve.AutoscalingConfig(

459

min_replicas=1,

460

max_replicas=5,

461

target_num_ongoing_requests_per_replica=2

462

)

463

).deploy("model.pt")

464

```

465

466

### Multi-Model Deployment

467

468

```python

469

import ray

470

from ray import serve

471

472

serve.start()

473

474

@serve.deployment

475

class ModelRouter:

476

def __init__(self):

477

self.model_a = ModelA.get_handle()

478

self.model_b = ModelB.get_handle()

479

480

async def __call__(self, request):

481

data = request.json()

482

model_type = data.get("model", "a")

483

484

if model_type == "a":

485

result = await self.model_a.remote(data)

486

else:

487

result = await self.model_b.remote(data)

488

489

return result

490

491

@serve.deployment

492

class ModelA:

493

async def __call__(self, data):

494

return {"model": "a", "result": data["input"] * 2}

495

496

@serve.deployment

497

class ModelB:

498

async def __call__(self, data):

499

return {"model": "b", "result": data["input"] + 10}

500

501

# Deploy all models

502

ModelA.deploy()

503

ModelB.deploy()

504

ModelRouter.deploy()

505

```

506

507

### Application with Custom Ingress

508

509

```python

510

import ray

511

from ray import serve

512

from starlette.requests import Request

513

from starlette.responses import JSONResponse

514

515

serve.start()

516

517

@serve.deployment

518

@serve.ingress(app)  # NOTE(review): 'app' is not defined at this point — serve.ingress expects an ASGI app (e.g., FastAPI()) created before the decorated class; defining it afterwards as serve.build(CustomIngress) (see end of example) is circular

519

class CustomIngress:

520

def __init__(self):

521

self.model = MLModel.get_handle()

522

523

async def __call__(self, request: Request):

524

if request.method == "GET":

525

return JSONResponse({"status": "healthy"})

526

527

elif request.method == "POST":

528

data = await request.json()

529

result = await self.model.remote(data)

530

return JSONResponse(result)

531

532

else:

533

return JSONResponse({"error": "Method not allowed"},

534

status_code=405)

535

536

@serve.deployment

537

class MLModel:

538

def __init__(self):

539

# Initialize your model

540

pass

541

542

async def predict(self, data):

543

# Model prediction logic

544

return {"prediction": "result"}

545

546

# Build and run application

547

app = serve.build(CustomIngress)

548

serve.run(app)

549

```

550

551

### Production Configuration

552

553

```python

554

import ray

555

from ray import serve

556

557

# Production serving configuration

558

serve.start(

559

detached=True,

560

http_options=serve.HTTPOptions(

561

host="0.0.0.0",

562

port=8000,

563

location="EveryNode"

564

)

565

)

566

567

@serve.deployment(

568

name="production-model",

569

version="v1.0",

570

num_replicas=4,

571

autoscaling_config=serve.AutoscalingConfig(

572

min_replicas=2,

573

max_replicas=10,

574

target_num_ongoing_requests_per_replica=5

575

),

576

ray_actor_options={

577

"num_cpus": 2,

578

"num_gpus": 1,

579

"memory": 4000 * 1024 * 1024 # 4GB

580

},

581

health_check_period_s=10,

582

health_check_timeout_s=30,

583

graceful_shutdown_timeout_s=60

584

)

585

class ProductionModel:

586

def __init__(self, model_config):

587

self.model = self._load_model(model_config)

588

self.preprocessor = self._load_preprocessor()

589

590

def _load_model(self, config):

591

# Load production model

592

pass

593

594

def _load_preprocessor(self):

595

# Load data preprocessor

596

pass

597

598

@serve.batch(max_batch_size=64, batch_wait_timeout_s=0.05)

599

async def predict_batch(self, inputs):

600

# Batch prediction with preprocessing

601

processed = [self.preprocessor(inp) for inp in inputs]

602

predictions = self.model.predict(processed)

603

return predictions

604

605

async def __call__(self, request):

606

data = request.json()

607

prediction = await self.predict_batch(data["input"])

608

return {"prediction": prediction, "version": "v1.0"}

609

610

# Deploy production model

611

ProductionModel.deploy({"model_path": "s3://models/production-v1.0"})

612

```