or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

framework-servers.md, index.md, inference-clients.md, kubernetes-client.md, model-serving.md, protocol.md, resource-models.md, storage.md

docs/inference-clients.md

# Inference Clients

High-level async clients for making inference requests to KServe models with retry logic, SSL support, and protocol conversion. These clients provide a convenient interface for interacting with deployed models.

## Capabilities

### REST Client

Asynchronous REST client for HTTP-based inference requests with configurable retry policies and SSL support.

```python { .api }
class InferenceRESTClient:
    def __init__(self, url: str, config: RESTConfig = None):
        """
        Initialize REST inference client.

        Args:
            url (str): Base URL of the inference service
            config (RESTConfig, optional): Client configuration
        """

    async def infer(self, request: InferRequest) -> InferResponse:
        """
        Send inference request to model.

        Args:
            request (InferRequest): Inference request with inputs

        Returns:
            InferResponse: Model predictions and metadata

        Raises:
            ConnectionError: If unable to connect to service
            TimeoutError: If request times out
        """

    async def explain(self, request: InferRequest) -> InferResponse:
        """
        Request explanation for model predictions.

        Args:
            request (InferRequest): Input data for explanation

        Returns:
            InferResponse: Explanation results
        """

    async def is_server_ready(self) -> bool:
        """
        Check if inference server is ready.

        Returns:
            bool: True if server is ready
        """

    async def is_server_live(self) -> bool:
        """
        Check if inference server is live.

        Returns:
            bool: True if server is live
        """

    async def is_model_ready(self, model_name: str) -> bool:
        """
        Check if specific model is ready.

        Args:
            model_name (str): Name of model to check

        Returns:
            bool: True if model is ready
        """

    # Properties
    host: str  # Server host URL
    config: RESTConfig  # Client configuration
    protocol_version: str  # Protocol version (v1 or v2)
```

### gRPC Client

Asynchronous gRPC client for high-performance inference requests with streaming support and built-in retry policies.

```python { .api }
class InferenceGRPCClient:
    def __init__(self,
                 url: str,
                 verbose: bool = False,
                 use_ssl: bool = False,
                 root_certificates: Optional[str] = None,
                 private_key: Optional[str] = None,
                 certificate_chain: Optional[str] = None,
                 creds: Optional[grpc.ChannelCredentials] = None,
                 channel_args: Optional[List[Tuple[str, Any]]] = None,
                 timeout: Optional[float] = 60,
                 retries: int = 3):
        """
        Initialize gRPC inference client.

        Args:
            url: gRPC server URL (host:port)
            verbose: Enable verbose logging
            use_ssl: Use SSL-enabled channel (ignored if creds provided)
            root_certificates: Path to PEM-encoded root certificates file
            private_key: Path to PEM-encoded private key file
            certificate_chain: Path to PEM-encoded certificate chain file
            creds: ChannelCredentials instance for secure communication
            channel_args: List of key-value pairs for channel configuration
            timeout: Maximum end-to-end time in seconds (default: 60)
            retries: Number of retries if request fails
        """

    async def infer(self, request: InferRequest) -> InferResponse:
        """
        Send inference request via gRPC.

        Args:
            request (InferRequest): Inference request with inputs

        Returns:
            InferResponse: Model predictions and metadata
        """

    async def is_server_ready(self) -> bool:
        """
        Check if gRPC server is ready.

        Returns:
            bool: True if server is ready
        """

    async def is_server_live(self) -> bool:
        """
        Check if gRPC server is live.

        Returns:
            bool: True if server is live
        """

    async def is_model_ready(self, model_name: str) -> bool:
        """
        Check if specific model is ready via gRPC.

        Args:
            model_name (str): Name of model to check

        Returns:
            bool: True if model is ready
        """

    # Properties
    host: str  # Server host URL
    channel: Any  # gRPC channel
    metadata: List[Tuple[str, str]]  # Request metadata
```

### Client Configuration

Configuration class for REST client with transport and authentication options.

```python { .api }
class RESTConfig:
    def __init__(self,
                 transport: httpx.AsyncBaseTransport = None,
                 protocol: Union[str, PredictorProtocol] = "v1",
                 retries: int = 3,
                 http2: bool = False,
                 timeout: Union[float, None, tuple, httpx.Timeout] = 60,
                 cert=None,
                 verify: Union[str, bool, ssl.SSLContext] = True,
                 auth=None,
                 verbose: bool = False):
        """
        Configuration for REST inference client.

        Args:
            transport: Asynchronous transport class for sending requests
            protocol: Inference server protocol as string or PredictorProtocol object
            retries: Number of retries for ConnectError or ConnectTimeout (default: 3)
            http2: Enable HTTP/2 support (default: False)
            timeout: Maximum end-to-end time in seconds (default: 60)
            cert: SSL certificate for client authentication (path, tuple, or triple)
            verify: SSL certificates for verifying host identity (True, path, SSLContext, or False)
            auth: Authentication class for inference requests
            verbose: Enable verbose logging (default: False)
        """

    # Properties
    protocol: str  # HTTP/HTTPS protocol
    retries: int  # Retry attempts
    timeout: int  # Request timeout
    http2: bool  # HTTP/2 support
    cert: Optional[str]  # Client certificate path
    verify: bool  # SSL verification
```

### Client Factory

Factory class for creating appropriate client instances based on URL or configuration.

```python { .api }
class InferenceClientFactory:
    @staticmethod
    def get_inference_client(url: str, config: Optional[RESTConfig] = None):
        """
        Create appropriate inference client based on URL.

        Args:
            url (str): Server URL (determines client type)
            config (RESTConfig, optional): Configuration for REST client

        Returns:
            Union[InferenceRESTClient, InferenceGRPCClient]: Client instance
        """
```

## Usage Examples

### Basic REST Client Usage

```python
import asyncio
from kserve import InferenceRESTClient, InferRequest, InferInput

async def main():
    # Create client
    client = InferenceRESTClient("http://localhost:8080")

    # Check if server is ready
    ready = await client.is_server_ready()
    if not ready:
        print("Server not ready")
        return

    # Prepare input data
    input_data = InferInput(
        name="input-0",
        shape=[1, 4],
        datatype="FP32"
    )
    input_data.set_data_from_numpy(numpy_array)

    # Create request
    request = InferRequest(
        model_name="iris-classifier",
        inputs=[input_data]
    )

    # Make inference request
    response = await client.infer(request)

    # Extract predictions
    predictions = response.outputs[0].as_numpy()
    print(f"Predictions: {predictions}")

asyncio.run(main())
```

### Advanced REST Client with Configuration

```python
from kserve import InferenceRESTClient, RESTConfig
import ssl

async def secure_inference():
    # Configure SSL client
    config = RESTConfig(
        protocol="https",
        retries=5,
        timeout=120,
        cert="/path/to/client.crt",
        verify=True
    )

    client = InferenceRESTClient(
        "https://model-server.example.com",
        config=config
    )

    # Check model readiness
    model_ready = await client.is_model_ready("my-model")
    if model_ready:
        response = await client.infer(request)
        return response.outputs
```

### gRPC Client Usage

```python
from kserve import InferenceGRPCClient, InferRequest, InferInput
import numpy as np

async def grpc_inference():
    # Create gRPC client
    client = InferenceGRPCClient("localhost:8081", verbose=True)

    # Prepare batch input
    batch_data = np.random.rand(32, 224, 224, 3)

    input_tensor = InferInput(
        name="images",
        shape=list(batch_data.shape),
        datatype="FP32"
    )
    input_tensor.set_data_from_numpy(batch_data)

    # Create request
    request = InferRequest(
        model_name="resnet-50",
        inputs=[input_tensor]
    )

    # Send gRPC request
    response = await client.infer(request)

    # Process results
    probabilities = response.outputs[0].as_numpy()
    predicted_classes = np.argmax(probabilities, axis=1)

    return predicted_classes
```

### Error Handling and Retries

```python
from kserve import InferenceRESTClient, RESTConfig
import asyncio
import logging

async def robust_inference():
    config = RESTConfig(
        retries=3,
        timeout=30
    )

    client = InferenceRESTClient("http://model-service:8080", config)

    try:
        # Check server health first
        if not await client.is_server_live():
            raise ConnectionError("Server not live")

        if not await client.is_model_ready("my-model"):
            raise RuntimeError("Model not ready")

        # Make request with automatic retries
        response = await client.infer(request)
        return response

    except asyncio.TimeoutError:
        logging.error("Request timed out after retries")
        raise
    except ConnectionError as e:
        logging.error(f"Connection failed: {e}")
        raise
    except Exception as e:
        logging.error(f"Inference failed: {e}")
        raise
```

### Multiple Model Requests

```python
from kserve import InferenceRESTClient
import asyncio

async def multi_model_inference():
    client = InferenceRESTClient("http://localhost:8080")

    # Define models and their requests
    models = ["model-a", "model-b", "model-c"]
    requests = [create_request(model) for model in models]

    # Send requests concurrently
    tasks = [
        client.infer(req) for req in requests
    ]

    responses = await asyncio.gather(*tasks, return_exceptions=True)

    # Process results
    results = {}
    for model, response in zip(models, responses):
        if isinstance(response, Exception):
            results[model] = {"error": str(response)}
        else:
            results[model] = {"predictions": response.outputs[0].as_numpy()}

    return results
```

391

392

## Types

393

394

```python { .api }

395

from typing import Optional, List, Tuple, Union, Any

396

import numpy as np

397

398

ClientURL = str

399

ModelName = str

400

ServerMetadata = List[Tuple[str, str]]

401

NumpyArray = np.ndarray

402

```