# Model Serving

Ray Serve provides scalable model serving and application deployment with automatic scaling, batching, and multi-model support. It enables production deployment of ML models and Python applications.

## Capabilities

### Core Serving Framework

Basic serving functionality and deployment management.
```python { .api }
def start(detached=False, http_options=HTTPOptions(), **kwargs):
    """
    Start Ray Serve.

    Args:
        detached (bool): Whether to run in detached mode
        http_options (HTTPOptions, optional): HTTP configuration
        **kwargs: Additional Ray initialization arguments
    """

def shutdown():
    """Shutdown Ray Serve."""

def run(target, *, name=None, route_prefix=None, blocking=True, **kwargs):
    """
    Deploy and run a deployment.

    Args:
        target: Deployment target (function, class, or Deployment)
        name (str, optional): Deployment name
        route_prefix (str, optional): HTTP route prefix
        blocking (bool): Whether to block until deployment is ready
        **kwargs: Additional deployment options

    Returns:
        DeploymentHandle: Handle to deployment
    """

def status():
    """
    Get status of Ray Serve deployments.

    Returns:
        str: Status information
    """

class HTTPOptions:
    """HTTP server configuration options."""

    def __init__(self, *, host="127.0.0.1", port=8000, middlewares=None,
                 location="EveryNode", num_cpus=0):
        """
        Initialize HTTP options.

        Args:
            host (str): Host to bind to
            port (int): Port to bind to
            middlewares (list, optional): ASGI middlewares
            location (str): Where to run HTTP servers
            num_cpus (int): CPUs for HTTP servers
        """
```
### Deployment Decorator and Configuration

Create and configure deployments.
```python { .api }
def deployment(func_or_class=None, *, name=None, version=None,
               num_replicas=None, route_prefix=None, ray_actor_options=None,
               user_config=None, max_concurrent_queries=None,
               autoscaling_config=None, graceful_shutdown_wait_loop_s=None,
               graceful_shutdown_timeout_s=None, health_check_period_s=None,
               health_check_timeout_s=None, is_driver_deployment=None):
    """
    Decorator to create Ray Serve deployment.

    Args:
        func_or_class: Function or class to deploy
        name (str, optional): Deployment name
        version (str, optional): Deployment version
        num_replicas (int, optional): Number of replicas
        route_prefix (str, optional): HTTP route prefix
        ray_actor_options (dict, optional): Ray actor options
        user_config: User configuration
        max_concurrent_queries (int, optional): Max concurrent queries per replica
        autoscaling_config (AutoscalingConfig, optional): Autoscaling configuration
        graceful_shutdown_wait_loop_s (float, optional): Graceful shutdown wait
        graceful_shutdown_timeout_s (float, optional): Graceful shutdown timeout
        health_check_period_s (float, optional): Health check period
        health_check_timeout_s (float, optional): Health check timeout
        is_driver_deployment (bool, optional): Whether this is driver deployment

    Returns:
        Deployment: Deployment object
    """

class Deployment:
    """Ray Serve deployment."""

    def deploy(self, *init_args, _blocking=True, **init_kwargs):
        """
        Deploy this deployment.

        Args:
            *init_args: Arguments for deployment initialization
            _blocking (bool): Whether to block until ready
            **init_kwargs: Keyword arguments for initialization

        Returns:
            DeploymentHandle: Handle to deployment
        """

    def delete(self):
        """Delete this deployment."""

    def get_handle(self, sync=None):
        """
        Get handle to this deployment.

        Args:
            sync (bool, optional): Whether to use sync handle

        Returns:
            DeploymentHandle: Handle to deployment
        """

    def options(self, *, func_or_class=None, **kwargs):
        """
        Create new deployment with modified options.

        Args:
            func_or_class: New function or class
            **kwargs: Options to modify

        Returns:
            Deployment: New deployment with modified options
        """

def multiplexed(max_num_models_per_replica=None, *, buffer_size_bytes=100_000_000,
                buffer_size_bytes_per_replica=None, max_num_models=None):
    """
    Decorator for multiplexed deployments supporting multiple models.

    Args:
        max_num_models_per_replica (int, optional): Max models per replica
        buffer_size_bytes (int): Buffer size in bytes
        buffer_size_bytes_per_replica (int, optional): Buffer size per replica
        max_num_models (int, optional): Maximum total models

    Returns:
        Decorator function for multiplexed deployment
    """

def get_multiplexed_model_id():
    """
    Get current multiplexed model ID within a deployment.

    Returns:
        str: Current model ID
    """

class AutoscalingConfig:
    """Configuration for deployment autoscaling."""

    def __init__(self, *, min_replicas=None, max_replicas=None,
                 target_num_ongoing_requests_per_replica=None,
                 metrics_interval_s=None, look_back_period_s=None,
                 smoothing_factor=None, downscale_delay_s=None,
                 upscale_delay_s=None):
        """
        Initialize autoscaling configuration.

        Args:
            min_replicas (int, optional): Minimum number of replicas
            max_replicas (int, optional): Maximum number of replicas
            target_num_ongoing_requests_per_replica (float, optional): Target requests per replica
            metrics_interval_s (float, optional): Metrics collection interval
            look_back_period_s (float, optional): Metrics lookback period
            smoothing_factor (float, optional): Smoothing factor for metrics
            downscale_delay_s (float, optional): Delay before downscaling
            upscale_delay_s (float, optional): Delay before upscaling
        """
```
### Deployment Handles

Interact with deployed models and services.
```python { .api }
class DeploymentHandle:
    """Handle for interacting with deployment."""

    def remote(self, *args, **kwargs):
        """
        Make async request to deployment.

        Args:
            *args: Arguments to pass
            **kwargs: Keyword arguments to pass

        Returns:
            DeploymentResponse: Response object
        """

    def options(self, *, method_name=None, multiplexed_model_id=None, **kwargs):
        """
        Create handle with modified options.

        Args:
            method_name (str, optional): Method to call
            multiplexed_model_id (str, optional): Model ID for multiplexing
            **kwargs: Additional options

        Returns:
            DeploymentHandle: Handle with modified options
        """

class DeploymentResponse:
    """Response from deployment."""

    def result(self, *, timeout_s=None):
        """
        Get result (blocking).

        Args:
            timeout_s (float, optional): Timeout in seconds

        Returns:
            Result of deployment call
        """

class DeploymentResponseGenerator:
    """Generator for streaming deployment responses."""

    def __iter__(self):
        """Iterate over streaming responses."""

    def __next__(self):
        """Get next response."""
```
### Application Framework

Build complex serving applications.
```python { .api }
class Application:
    """Ray Serve application."""

    def __init__(self, import_path, *, args=None, kwargs=None):
        """
        Initialize application.

        Args:
            import_path (str): Import path to application
            args (list, optional): Arguments for application
            kwargs (dict, optional): Keyword arguments for application
        """

def build(app_or_deployment, *args, **kwargs):
    """
    Build application from deployment or function.

    Args:
        app_or_deployment: Application or deployment to build
        *args: Arguments for building
        **kwargs: Keyword arguments for building

    Returns:
        Application: Built application
    """
```
### Batching Support

Batch requests for improved throughput.
```python { .api }
class Batched:
    """Decorator for batched request handling."""

    def __init__(self, *, max_batch_size=None, batch_wait_timeout_s=None):
        """
        Initialize batching decorator.

        Args:
            max_batch_size (int, optional): Maximum batch size
            batch_wait_timeout_s (float, optional): Batch wait timeout
        """

def batch(max_batch_size=None, batch_wait_timeout_s=None):
    """
    Decorator for batched request handling.

    Args:
        max_batch_size (int, optional): Maximum batch size
        batch_wait_timeout_s (float, optional): Batch wait timeout

    Returns:
        Batched: Batching decorator
    """
```
### Ingress and Routing

Handle HTTP requests and routing.
```python { .api }
class Ingress:
    """Base class for custom HTTP ingress."""

    async def __call__(self, request):
        """
        Handle HTTP request.

        Args:
            request: HTTP request

        Returns:
            HTTP response
        """

def ingress(app):
    """
    Mark deployment as HTTP ingress.

    Args:
        app: Deployment to mark as ingress

    Returns:
        Deployment with ingress configuration
    """
```
### Model Multiplexing

Serve multiple models from single deployment.
```python { .api }
class MultiplexedReplicaResult:
    """Result from multiplexed model call."""

    def __init__(self, result):
        """Initialize with result."""

def get_multiplexed_model_id():
    """
    Get current multiplexed model ID.

    Returns:
        str: Current model ID
    """
```
### Configuration and Context

Runtime configuration and context access.
```python { .api }
def get_replica_context():
    """
    Get current replica context.

    Returns:
        ReplicaContext: Current replica context
    """

class ReplicaContext:
    """Context for current replica."""

    @property
    def deployment(self):
        """Current deployment name."""

    @property
    def replica_tag(self):
        """Current replica tag."""

    @property
    def servable_object(self):
        """Current servable object."""
```
384
385
## Usage Examples
386
387
### Basic Model Serving
388
389
```python
import ray
from ray import serve
import numpy as np

# Start Ray Serve
serve.start()

# Define a simple model
@serve.deployment
class SimpleModel:
    def __init__(self):
        # Load your model here
        self.model = self._load_model()

    def _load_model(self):
        # Placeholder for model loading
        return lambda x: x * 2

    def __call__(self, request):
        data = request.json()
        input_data = np.array(data["input"])
        prediction = self.model(input_data)
        return {"prediction": prediction.tolist()}

# Deploy the model
SimpleModel.deploy()

# Make a request
import requests
response = requests.post("http://127.0.0.1:8000/SimpleModel",
                         json={"input": [1, 2, 3, 4]})
print(response.json())  # {"prediction": [2, 4, 6, 8]}

serve.shutdown()
```
### Advanced Model with Batching
```python
import ray
from ray import serve
import torch

serve.start()

@serve.deployment(
    num_replicas=2,
    ray_actor_options={"num_cpus": 1, "num_gpus": 0.5}
)
class PyTorchModel:
    def __init__(self, model_path):
        self.model = torch.load(model_path)
        self.model.eval()

    @serve.batch(max_batch_size=32, batch_wait_timeout_s=0.1)
    async def predict_batch(self, inputs):
        batch = torch.stack(inputs)
        with torch.no_grad():
            predictions = self.model(batch)
        return predictions.numpy()

    async def __call__(self, request):
        data = torch.tensor(request.json()["input"])
        prediction = await self.predict_batch(data)
        return {"prediction": prediction.tolist()}

# Deploy with specific configuration
PyTorchModel.options(
    autoscaling_config=serve.AutoscalingConfig(
        min_replicas=1,
        max_replicas=5,
        target_num_ongoing_requests_per_replica=2
    )
).deploy("model.pt")
```
### Multi-Model Deployment
469
import ray
470
from ray import serve
471
472
serve.start()
473
474
@serve.deployment
475
class ModelRouter:
476
def __init__(self):
477
self.model_a = ModelA.get_handle()
478
self.model_b = ModelB.get_handle()
479
480
async def __call__(self, request):
481
data = request.json()
482
model_type = data.get("model", "a")
483
484
if model_type == "a":
485
result = await self.model_a.remote(data)
486
else:
487
result = await self.model_b.remote(data)
488
489
return result
490
491
@serve.deployment
492
class ModelA:
493
async def __call__(self, data):
494
return {"model": "a", "result": data["input"] * 2}
495
496
@serve.deployment
497
class ModelB:
498
async def __call__(self, data):
499
return {"model": "b", "result": data["input"] + 10}
500
501
# Deploy all models
502
ModelA.deploy()
503
ModelB.deploy()
504
ModelRouter.deploy()
505
```
### Application with Custom Ingress
510
import ray
511
from ray import serve
512
from starlette.requests import Request
513
from starlette.responses import JSONResponse
514
515
serve.start()
516
517
@serve.deployment
518
@serve.ingress(app)
519
class CustomIngress:
520
def __init__(self):
521
self.model = MLModel.get_handle()
522
523
async def __call__(self, request: Request):
524
if request.method == "GET":
525
return JSONResponse({"status": "healthy"})
526
527
elif request.method == "POST":
528
data = await request.json()
529
result = await self.model.remote(data)
530
return JSONResponse(result)
531
532
else:
533
return JSONResponse({"error": "Method not allowed"},
534
status_code=405)
535
536
@serve.deployment
537
class MLModel:
538
def __init__(self):
539
# Initialize your model
540
pass
541
542
async def predict(self, data):
543
# Model prediction logic
544
return {"prediction": "result"}
545
546
# Build and run application
547
app = serve.build(CustomIngress)
548
serve.run(app)
549
```
### Production Configuration
554
import ray
555
from ray import serve
556
557
# Production serving configuration
558
serve.start(
559
detached=True,
560
http_options=serve.HTTPOptions(
561
host="0.0.0.0",
562
port=8000,
563
location="EveryNode"
564
)
565
)
566
567
@serve.deployment(
568
name="production-model",
569
version="v1.0",
570
num_replicas=4,
571
autoscaling_config=serve.AutoscalingConfig(
572
min_replicas=2,
573
max_replicas=10,
574
target_num_ongoing_requests_per_replica=5
575
),
576
ray_actor_options={
577
"num_cpus": 2,
578
"num_gpus": 1,
579
"memory": 4000 * 1024 * 1024 # 4GB
580
},
581
health_check_period_s=10,
582
health_check_timeout_s=30,
583
graceful_shutdown_timeout_s=60
584
)
585
class ProductionModel:
586
def __init__(self, model_config):
587
self.model = self._load_model(model_config)
588
self.preprocessor = self._load_preprocessor()
589
590
def _load_model(self, config):
591
# Load production model
592
pass
593
594
def _load_preprocessor(self):
595
# Load data preprocessor
596
pass
597
598
@serve.batch(max_batch_size=64, batch_wait_timeout_s=0.05)
599
async def predict_batch(self, inputs):
600
# Batch prediction with preprocessing
601
processed = [self.preprocessor(inp) for inp in inputs]
602
predictions = self.model.predict(processed)
603
return predictions
604
605
async def __call__(self, request):
606
data = request.json()
607
prediction = await self.predict_batch(data["input"])
608
return {"prediction": prediction, "version": "v1.0"}
609
610
# Deploy production model
611
ProductionModel.deploy({"model_path": "s3://models/production-v1.0"})
612
```