# Model Serving

Ray Serve provides scalable model serving and application deployment with automatic scaling, batching, and multi-model support. It enables production deployment of ML models and Python applications.

## Capabilities

### Core Serving Framework

Basic serving functionality and deployment management.
```python { .api }
def start(detached=False, http_options=HTTPOptions(), **kwargs):
    """
    Start Ray Serve.

    Args:
        detached (bool): Whether to run in detached mode
        http_options (HTTPOptions, optional): HTTP configuration
        **kwargs: Additional Ray initialization arguments
    """

def shutdown():
    """Shutdown Ray Serve."""

def run(target, *, name=None, route_prefix=None, blocking=True, **kwargs):
    """
    Deploy and run a deployment.

    Args:
        target: Deployment target (function, class, or Deployment)
        name (str, optional): Deployment name
        route_prefix (str, optional): HTTP route prefix
        blocking (bool): Whether to block until deployment is ready
        **kwargs: Additional deployment options

    Returns:
        DeploymentHandle: Handle to deployment
    """

def status():
    """
    Get status of Ray Serve deployments.

    Returns:
        str: Status information
    """

class HTTPOptions:
    """HTTP server configuration options."""

    def __init__(self, *, host="127.0.0.1", port=8000, middlewares=None,
                 location="EveryNode", num_cpus=0):
        """
        Initialize HTTP options.

        Args:
            host (str): Host to bind to
            port (int): Port to bind to
            middlewares (list, optional): ASGI middlewares
            location (str): Where to run HTTP servers
            num_cpus (int): CPUs for HTTP servers
        """
```
### Deployment Decorator and Configuration

Create and configure deployments.
```python { .api }
def deployment(func_or_class=None, *, name=None, version=None,
               num_replicas=None, route_prefix=None, ray_actor_options=None,
               user_config=None, max_concurrent_queries=None,
               autoscaling_config=None, graceful_shutdown_wait_loop_s=None,
               graceful_shutdown_timeout_s=None, health_check_period_s=None,
               health_check_timeout_s=None, is_driver_deployment=None):
    """
    Decorator to create Ray Serve deployment.

    Args:
        func_or_class: Function or class to deploy
        name (str, optional): Deployment name
        version (str, optional): Deployment version
        num_replicas (int, optional): Number of replicas
        route_prefix (str, optional): HTTP route prefix
        ray_actor_options (dict, optional): Ray actor options
        user_config: User configuration
        max_concurrent_queries (int, optional): Max concurrent queries per replica
        autoscaling_config (AutoscalingConfig, optional): Autoscaling configuration
        graceful_shutdown_wait_loop_s (float, optional): Graceful shutdown wait
        graceful_shutdown_timeout_s (float, optional): Graceful shutdown timeout
        health_check_period_s (float, optional): Health check period
        health_check_timeout_s (float, optional): Health check timeout
        is_driver_deployment (bool, optional): Whether this is driver deployment

    Returns:
        Deployment: Deployment object
    """

class Deployment:
    """Ray Serve deployment."""

    def deploy(self, *init_args, _blocking=True, **init_kwargs):
        """
        Deploy this deployment.

        Args:
            *init_args: Arguments for deployment initialization
            _blocking (bool): Whether to block until ready
            **init_kwargs: Keyword arguments for initialization

        Returns:
            DeploymentHandle: Handle to deployment
        """

    def delete(self):
        """Delete this deployment."""

    def get_handle(self, sync=None):
        """
        Get handle to this deployment.

        Args:
            sync (bool, optional): Whether to use sync handle

        Returns:
            DeploymentHandle: Handle to deployment
        """

    def options(self, *, func_or_class=None, **kwargs):
        """
        Create new deployment with modified options.

        Args:
            func_or_class: New function or class
            **kwargs: Options to modify

        Returns:
            Deployment: New deployment with modified options
        """

def multiplexed(max_num_models_per_replica=None, *, buffer_size_bytes=100_000_000,
                buffer_size_bytes_per_replica=None, max_num_models=None):
    """
    Decorator for multiplexed deployments supporting multiple models.

    Args:
        max_num_models_per_replica (int, optional): Max models per replica
        buffer_size_bytes (int): Buffer size in bytes
        buffer_size_bytes_per_replica (int, optional): Buffer size per replica
        max_num_models (int, optional): Maximum total models

    Returns:
        Decorator function for multiplexed deployment
    """

def get_multiplexed_model_id():
    """
    Get current multiplexed model ID within a deployment.

    Returns:
        str: Current model ID
    """

class AutoscalingConfig:
    """Configuration for deployment autoscaling."""

    def __init__(self, *, min_replicas=None, max_replicas=None,
                 target_num_ongoing_requests_per_replica=None,
                 metrics_interval_s=None, look_back_period_s=None,
                 smoothing_factor=None, downscale_delay_s=None,
                 upscale_delay_s=None):
        """
        Initialize autoscaling configuration.

        Args:
            min_replicas (int, optional): Minimum number of replicas
            max_replicas (int, optional): Maximum number of replicas
            target_num_ongoing_requests_per_replica (float, optional): Target requests per replica
            metrics_interval_s (float, optional): Metrics collection interval
            look_back_period_s (float, optional): Metrics lookback period
            smoothing_factor (float, optional): Smoothing factor for metrics
            downscale_delay_s (float, optional): Delay before downscaling
            upscale_delay_s (float, optional): Delay before upscaling
        """
```
### Deployment Handles

Interact with deployed models and services.
```python { .api }
class DeploymentHandle:
    """Handle for interacting with deployment."""

    def remote(self, *args, **kwargs):
        """
        Make async request to deployment.

        Args:
            *args: Arguments to pass
            **kwargs: Keyword arguments to pass

        Returns:
            DeploymentResponse: Response object
        """

    def options(self, *, method_name=None, multiplexed_model_id=None, **kwargs):
        """
        Create handle with modified options.

        Args:
            method_name (str, optional): Method to call
            multiplexed_model_id (str, optional): Model ID for multiplexing
            **kwargs: Additional options

        Returns:
            DeploymentHandle: Handle with modified options
        """

class DeploymentResponse:
    """Response from deployment."""

    def result(self, *, timeout_s=None):
        """
        Get result (blocking).

        Args:
            timeout_s (float, optional): Timeout in seconds

        Returns:
            Result of deployment call
        """

class DeploymentResponseGenerator:
    """Generator for streaming deployment responses."""

    def __iter__(self):
        """Iterate over streaming responses."""

    def __next__(self):
        """Get next response."""
```
### Application Framework

Build complex serving applications.
```python { .api }
class Application:
    """Ray Serve application."""

    def __init__(self, import_path, *, args=None, kwargs=None):
        """
        Initialize application.

        Args:
            import_path (str): Import path to application
            args (list, optional): Arguments for application
            kwargs (dict, optional): Keyword arguments for application
        """

def build(app_or_deployment, *args, **kwargs):
    """
    Build application from deployment or function.

    Args:
        app_or_deployment: Application or deployment to build
        *args: Arguments for building
        **kwargs: Keyword arguments for building

    Returns:
        Application: Built application
    """
```
### Batching Support

Batch requests for improved throughput.
```python { .api }
class Batched:
    """Decorator for batched request handling."""

    def __init__(self, *, max_batch_size=None, batch_wait_timeout_s=None):
        """
        Initialize batching decorator.

        Args:
            max_batch_size (int, optional): Maximum batch size
            batch_wait_timeout_s (float, optional): Batch wait timeout
        """

def batch(max_batch_size=None, batch_wait_timeout_s=None):
    """
    Decorator for batched request handling.

    Args:
        max_batch_size (int, optional): Maximum batch size
        batch_wait_timeout_s (float, optional): Batch wait timeout

    Returns:
        Batched: Batching decorator
    """
```
### Ingress and Routing

Handle HTTP requests and routing.
```python { .api }
class Ingress:
    """Base class for custom HTTP ingress."""

    async def __call__(self, request):
        """
        Handle HTTP request.

        Args:
            request: HTTP request

        Returns:
            HTTP response
        """

def ingress(app):
    """
    Mark deployment as HTTP ingress.

    Args:
        app: Deployment to mark as ingress

    Returns:
        Deployment with ingress configuration
    """
```
### Model Multiplexing

Serve multiple models from single deployment.
```python { .api }
class MultiplexedReplicaResult:
    """Result from multiplexed model call."""

    def __init__(self, result):
        """Initialize with result."""

def get_multiplexed_model_id():
    """
    Get current multiplexed model ID.

    Returns:
        str: Current model ID
    """
```
### Configuration and Context

Runtime configuration and context access.
```python { .api }
def get_replica_context():
    """
    Get current replica context.

    Returns:
        ReplicaContext: Current replica context
    """

class ReplicaContext:
    """Context for current replica."""

    @property
    def deployment(self):
        """Current deployment name."""

    @property
    def replica_tag(self):
        """Current replica tag."""

    @property
    def servable_object(self):
        """Current servable object."""
```
384
385
## Usage Examples
386
387
### Basic Model Serving
388
389
```python
import ray
from ray import serve
import numpy as np

# Start Ray Serve
serve.start()

# Define a simple model
@serve.deployment
class SimpleModel:
    def __init__(self):
        # Load your model here
        self.model = self._load_model()

    def _load_model(self):
        # Placeholder for model loading
        return lambda x: x * 2

    def __call__(self, request):
        data = request.json()
        input_data = np.array(data["input"])
        prediction = self.model(input_data)
        return {"prediction": prediction.tolist()}

# Deploy the model
SimpleModel.deploy()

# Make a request
import requests
response = requests.post("http://127.0.0.1:8000/SimpleModel",
                         json={"input": [1, 2, 3, 4]})
print(response.json())  # {"prediction": [2, 4, 6, 8]}

serve.shutdown()
```
### Advanced Model with Batching
```python
import ray
from ray import serve
import torch

serve.start()

@serve.deployment(
    num_replicas=2,
    ray_actor_options={"num_cpus": 1, "num_gpus": 0.5}
)
class PyTorchModel:
    def __init__(self, model_path):
        self.model = torch.load(model_path)
        self.model.eval()

    @serve.batch(max_batch_size=32, batch_wait_timeout_s=0.1)
    async def predict_batch(self, inputs):
        batch = torch.stack(inputs)
        with torch.no_grad():
            predictions = self.model(batch)
        return predictions.numpy()

    async def __call__(self, request):
        data = torch.tensor(request.json()["input"])
        prediction = await self.predict_batch(data)
        return {"prediction": prediction.tolist()}

# Deploy with specific configuration
PyTorchModel.options(
    autoscaling_config=serve.AutoscalingConfig(
        min_replicas=1,
        max_replicas=5,
        target_num_ongoing_requests_per_replica=2
    )
).deploy("model.pt")
```
### Multi-Model Deployment
469
import ray
470
from ray import serve
471
472
serve.start()
473
474
@serve.deployment
475
class ModelRouter:
476
def __init__(self):
477
self.model_a = ModelA.get_handle()
478
self.model_b = ModelB.get_handle()
479
480
async def __call__(self, request):
481
data = request.json()
482
model_type = data.get("model", "a")
483
484
if model_type == "a":
485
result = await self.model_a.remote(data)
486
else:
487
result = await self.model_b.remote(data)
488
489
return result
490
491
@serve.deployment
492
class ModelA:
493
async def __call__(self, data):
494
return {"model": "a", "result": data["input"] * 2}
495
496
@serve.deployment
497
class ModelB:
498
async def __call__(self, data):
499
return {"model": "b", "result": data["input"] + 10}
500
501
# Deploy all models
502
ModelA.deploy()
503
ModelB.deploy()
504
ModelRouter.deploy()
505
```
### Application with Custom Ingress
510
import ray
511
from ray import serve
512
from starlette.requests import Request
513
from starlette.responses import JSONResponse
514
515
serve.start()
516
517
@serve.deployment
518
@serve.ingress(app)
519
class CustomIngress:
520
def __init__(self):
521
self.model = MLModel.get_handle()
522
523
async def __call__(self, request: Request):
524
if request.method == "GET":
525
return JSONResponse({"status": "healthy"})
526
527
elif request.method == "POST":
528
data = await request.json()
529
result = await self.model.remote(data)
530
return JSONResponse(result)
531
532
else:
533
return JSONResponse({"error": "Method not allowed"},
534
status_code=405)
535
536
@serve.deployment
537
class MLModel:
538
def __init__(self):
539
# Initialize your model
540
pass
541
542
async def predict(self, data):
543
# Model prediction logic
544
return {"prediction": "result"}
545
546
# Build and run application
547
app = serve.build(CustomIngress)
548
serve.run(app)
549
```
### Production Configuration
554
import ray
555
from ray import serve
556
557
# Production serving configuration
558
serve.start(
559
detached=True,
560
http_options=serve.HTTPOptions(
561
host="0.0.0.0",
562
port=8000,
563
location="EveryNode"
564
)
565
)
566
567
@serve.deployment(
568
name="production-model",
569
version="v1.0",
570
num_replicas=4,
571
autoscaling_config=serve.AutoscalingConfig(
572
min_replicas=2,
573
max_replicas=10,
574
target_num_ongoing_requests_per_replica=5
575
),
576
ray_actor_options={
577
"num_cpus": 2,
578
"num_gpus": 1,
579
"memory": 4000 * 1024 * 1024 # 4GB
580
},
581
health_check_period_s=10,
582
health_check_timeout_s=30,
583
graceful_shutdown_timeout_s=60
584
)
585
class ProductionModel:
586
def __init__(self, model_config):
587
self.model = self._load_model(model_config)
588
self.preprocessor = self._load_preprocessor()
589
590
def _load_model(self, config):
591
# Load production model
592
pass
593
594
def _load_preprocessor(self):
595
# Load data preprocessor
596
pass
597
598
@serve.batch(max_batch_size=64, batch_wait_timeout_s=0.05)
599
async def predict_batch(self, inputs):
600
# Batch prediction with preprocessing
601
processed = [self.preprocessor(inp) for inp in inputs]
602
predictions = self.model.predict(processed)
603
return predictions
604
605
async def __call__(self, request):
606
data = request.json()
607
prediction = await self.predict_batch(data["input"])
608
return {"prediction": prediction, "version": "v1.0"}
609
610
# Deploy production model
611
ProductionModel.deploy({"model_path": "s3://models/production-v1.0"})
612
```