# Inference Clients

High-level async clients for making inference requests to KServe models with retry logic, SSL support, and protocol conversion. These clients provide a convenient interface for interacting with deployed models.

## Capabilities

### REST Client

Asynchronous REST client for HTTP-based inference requests with configurable retry policies and SSL support.

```python { .api }
class InferenceRESTClient:
    def __init__(self, url: str, config: RESTConfig = None):
        """
        Initialize REST inference client.

        Args:
            url (str): Base URL of the inference service
            config (RESTConfig, optional): Client configuration
        """

    async def infer(self, request: InferRequest) -> InferResponse:
        """
        Send inference request to model.

        Args:
            request (InferRequest): Inference request with inputs

        Returns:
            InferResponse: Model predictions and metadata

        Raises:
            ConnectionError: If unable to connect to service
            TimeoutError: If request times out
        """

    async def explain(self, request: InferRequest) -> InferResponse:
        """
        Request explanation for model predictions.

        Args:
            request (InferRequest): Input data for explanation

        Returns:
            InferResponse: Explanation results
        """

    async def is_server_ready(self) -> bool:
        """
        Check if inference server is ready.

        Returns:
            bool: True if server is ready
        """

    async def is_server_live(self) -> bool:
        """
        Check if inference server is live.

        Returns:
            bool: True if server is live
        """

    async def is_model_ready(self, model_name: str) -> bool:
        """
        Check if specific model is ready.

        Args:
            model_name (str): Name of model to check

        Returns:
            bool: True if model is ready
        """

    # Properties
    host: str              # Server host URL
    config: RESTConfig     # Client configuration
    protocol_version: str  # Protocol version (v1 or v2)
```

### gRPC Client

Asynchronous gRPC client for high-performance inference requests with streaming support and built-in retry policies.

```python { .api }
class InferenceGRPCClient:
    def __init__(self,
                 url: str,
                 verbose: bool = False,
                 use_ssl: bool = False,
                 root_certificates: Optional[str] = None,
                 private_key: Optional[str] = None,
                 certificate_chain: Optional[str] = None,
                 creds: Optional[grpc.ChannelCredentials] = None,
                 channel_args: Optional[List[Tuple[str, Any]]] = None,
                 timeout: Optional[float] = 60,
                 retries: int = 3):
        """
        Initialize gRPC inference client.

        Args:
            url: gRPC server URL (host:port)
            verbose: Enable verbose logging
            use_ssl: Use SSL-enabled channel (ignored if creds provided)
            root_certificates: Path to PEM-encoded root certificates file
            private_key: Path to PEM-encoded private key file
            certificate_chain: Path to PEM-encoded certificate chain file
            creds: ChannelCredentials instance for secure communication
            channel_args: List of key-value pairs for channel configuration
            timeout: Maximum end-to-end time in seconds (default: 60)
            retries: Number of retries if request fails
        """

    async def infer(self, request: InferRequest) -> InferResponse:
        """
        Send inference request via gRPC.

        Args:
            request (InferRequest): Inference request with inputs

        Returns:
            InferResponse: Model predictions and metadata
        """

    async def is_server_ready(self) -> bool:
        """
        Check if gRPC server is ready.

        Returns:
            bool: True if server is ready
        """

    async def is_server_live(self) -> bool:
        """
        Check if gRPC server is live.

        Returns:
            bool: True if server is live
        """

    async def is_model_ready(self, model_name: str) -> bool:
        """
        Check if specific model is ready via gRPC.

        Args:
            model_name (str): Name of model to check

        Returns:
            bool: True if model is ready
        """

    # Properties
    host: str                        # Server host URL
    channel: Any                     # gRPC channel
    metadata: List[Tuple[str, str]]  # Request metadata
```

### Client Configuration

Configuration class for REST client with transport and authentication options.

```python { .api }
class RESTConfig:
    def __init__(self,
                 transport: httpx.AsyncBaseTransport = None,
                 protocol: Union[str, PredictorProtocol] = "v1",
                 retries: int = 3,
                 http2: bool = False,
                 timeout: Union[float, None, tuple, httpx.Timeout] = 60,
                 cert=None,
                 verify: Union[str, bool, ssl.SSLContext] = True,
                 auth=None,
                 verbose: bool = False):
        """
        Configuration for REST inference client.

        Args:
            transport: Asynchronous transport class for sending requests
            protocol: Inference server protocol as string or PredictorProtocol object
            retries: Number of retries for ConnectError or ConnectTimeout (default: 3)
            http2: Enable HTTP/2 support (default: False)
            timeout: Maximum end-to-end time in seconds (default: 60)
            cert: SSL certificate for client authentication (path, tuple, or triple)
            verify: SSL certificates for verifying host identity (True, path, SSLContext, or False)
            auth: Authentication class for inference requests
            verbose: Enable verbose logging (default: False)
        """

    # Properties
    protocol: str        # Inference protocol ("v1" or "v2")
    retries: int         # Retry attempts
    timeout: int         # Request timeout
    http2: bool          # HTTP/2 support
    cert: Optional[str]  # Client certificate path
    verify: bool         # SSL verification
```

### Client Factory

Factory class for creating appropriate client instances based on URL or configuration.

```python { .api }
class InferenceClientFactory:
    @staticmethod
    def get_inference_client(url: str, config: Optional[RESTConfig] = None):
        """
        Create appropriate inference client based on URL.

        Args:
            url (str): Server URL (determines client type)
            config (RESTConfig, optional): Configuration for REST client

        Returns:
            Union[InferenceRESTClient, InferenceGRPCClient]: Client instance
        """
```

## Usage Examples

### Basic REST Client Usage

```python
import asyncio
from kserve import InferenceRESTClient, InferRequest, InferInput

async def main():
    # Create client
    client = InferenceRESTClient("http://localhost:8080")

    # Check if server is ready
    ready = await client.is_server_ready()
    if not ready:
        print("Server not ready")
        return

    # Prepare input data
    input_data = InferInput(
        name="input-0",
        shape=[1, 4],
        datatype="FP32"
    )
    input_data.set_data_from_numpy(numpy_array)

    # Create request
    request = InferRequest(
        model_name="iris-classifier",
        inputs=[input_data]
    )

    # Make inference request
    response = await client.infer(request)

    # Extract predictions
    predictions = response.outputs[0].as_numpy()
    print(f"Predictions: {predictions}")

asyncio.run(main())
```

### Advanced REST Client with Configuration

```python
from kserve import InferenceRESTClient, RESTConfig
import ssl

async def secure_inference():
    # Configure client; `protocol` selects the inference protocol ("v1"/"v2"),
    # not the URL scheme
    config = RESTConfig(
        protocol="v2",
        retries=5,
        timeout=120,
        cert="/path/to/client.crt",
        verify=True
    )

    client = InferenceRESTClient(
        "https://model-server.example.com",
        config=config
    )

    # Check model readiness
    model_ready = await client.is_model_ready("my-model")
    if model_ready:
        response = await client.infer(request)
        return response.outputs
```

### gRPC Client Usage

```python
from kserve import InferenceGRPCClient, InferRequest, InferInput
import numpy as np

async def grpc_inference():
    # Create gRPC client
    client = InferenceGRPCClient("localhost:8081", verbose=True)

    # Prepare batch input (float32 to match the FP32 datatype below)
    batch_data = np.random.rand(32, 224, 224, 3).astype(np.float32)

    input_tensor = InferInput(
        name="images",
        shape=list(batch_data.shape),
        datatype="FP32"
    )
    input_tensor.set_data_from_numpy(batch_data)

    # Create request
    request = InferRequest(
        model_name="resnet-50",
        inputs=[input_tensor]
    )

    # Send gRPC request
    response = await client.infer(request)

    # Process results
    probabilities = response.outputs[0].as_numpy()
    predicted_classes = np.argmax(probabilities, axis=1)

    return predicted_classes
```

### Error Handling and Retries

```python
from kserve import InferenceRESTClient, RESTConfig
import asyncio
import logging

async def robust_inference():
    config = RESTConfig(
        retries=3,
        timeout=30
    )

    client = InferenceRESTClient("http://model-service:8080", config)

    try:
        # Check server health first
        if not await client.is_server_live():
            raise ConnectionError("Server not live")

        if not await client.is_model_ready("my-model"):
            raise RuntimeError("Model not ready")

        # Make request with automatic retries
        response = await client.infer(request)
        return response

    except asyncio.TimeoutError:
        logging.error("Request timed out after retries")
        raise
    except ConnectionError as e:
        logging.error(f"Connection failed: {e}")
        raise
    except Exception as e:
        logging.error(f"Inference failed: {e}")
        raise
```

### Multiple Model Requests

```python
from kserve import InferenceRESTClient
import asyncio

async def multi_model_inference():
    client = InferenceRESTClient("http://localhost:8080")

    # Define models and their requests
    models = ["model-a", "model-b", "model-c"]
    requests = [create_request(model) for model in models]

    # Send requests concurrently
    tasks = [
        client.infer(req) for req in requests
    ]

    responses = await asyncio.gather(*tasks, return_exceptions=True)

    # Process results
    results = {}
    for model, response in zip(models, responses):
        if isinstance(response, Exception):
            results[model] = {"error": str(response)}
        else:
            results[model] = {"predictions": response.outputs[0].as_numpy()}

    return results
```

## Types

```python { .api }
from typing import Optional, List, Tuple, Union, Any
import numpy as np

ClientURL = str
ModelName = str
ServerMetadata = List[Tuple[str, str]]
NumpyArray = np.ndarray
```