# Model Deployment

Comprehensive model deployment capabilities for real-time and batch inference with online endpoints, batch endpoints, and various deployment configurations supporting different compute types and scaling options.

## Capabilities

### Online Endpoints

Real-time inference endpoints for serving models with low latency and high availability.

```python { .api }
class OnlineEndpoint:
    def __init__(
        self,
        *,
        name: str,
        description: str = None,
        tags: dict = None,
        properties: dict = None,
        auth_mode: str = "key",
        identity: IdentityConfiguration = None,
        **kwargs
    ):
        """
        Online endpoint for real-time model inference.

        Parameters:
        - name: Endpoint name (must be unique within the workspace)
        - description: Endpoint description
        - tags: Dictionary of tags
        - properties: Custom properties
        - auth_mode: Authentication mode ("key", "aml_token", "aad_token")
        - identity: Managed identity configuration
        """

class ManagedOnlineEndpoint(OnlineEndpoint):
    def __init__(
        self,
        *,
        name: str,
        public_network_access: str = "enabled",
        **kwargs
    ):
        """
        Azure-managed online endpoint with automatic scaling and load balancing.

        Parameters:
        - name: Endpoint name
        - public_network_access: Network access ("enabled", "disabled")
        """

class KubernetesOnlineEndpoint(OnlineEndpoint):
    def __init__(
        self,
        *,
        name: str,
        compute: str,
        **kwargs
    ):
        """
        Kubernetes-based online endpoint for custom compute environments.

        Parameters:
        - name: Endpoint name
        - compute: Kubernetes compute target name
        """
```

#### Usage Example

```python
from azure.ai.ml.entities import ManagedOnlineEndpoint

# Create a managed online endpoint
endpoint = ManagedOnlineEndpoint(
    name="my-model-endpoint",
    description="Endpoint for my ML model",
    auth_mode="key",
    tags={"environment": "production", "version": "1.0"}
)

# Create the endpoint in the workspace (ml_client is an authenticated MLClient)
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
```
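The same operations group also handles Kubernetes endpoints. A minimal sketch, assuming a Kubernetes compute target named "k8s-compute" (hypothetical) is already attached to the workspace:

```python
from azure.ai.ml.entities import KubernetesOnlineEndpoint

# Create an endpoint on an attached Kubernetes compute target
k8s_endpoint = KubernetesOnlineEndpoint(
    name="my-k8s-endpoint",
    compute="k8s-compute",  # assumed: Kubernetes compute attached to the workspace
    auth_mode="key"
)
ml_client.online_endpoints.begin_create_or_update(k8s_endpoint).result()
```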

### Online Deployments

Deploy models to online endpoints with specific resource configurations and scaling settings.

```python { .api }
class OnlineDeployment:
    def __init__(
        self,
        *,
        name: str,
        endpoint_name: str,
        model: Model = None,
        environment: Environment = None,
        code_configuration: CodeConfiguration = None,
        **kwargs
    ):
        """
        Base online deployment class.

        Parameters:
        - name: Deployment name
        - endpoint_name: Target endpoint name
        - model: Model to deploy
        - environment: Runtime environment
        - code_configuration: Scoring script configuration
        """

class ManagedOnlineDeployment(OnlineDeployment):
    def __init__(
        self,
        *,
        name: str,
        endpoint_name: str,
        model: Model,
        environment: Environment = None,
        code_configuration: CodeConfiguration = None,
        instance_type: str = "Standard_DS3_v2",
        instance_count: int = 1,
        scale_settings: OnlineScaleSettings = None,
        request_settings: OnlineRequestSettings = None,
        liveness_probe: ProbeSettings = None,
        readiness_probe: ProbeSettings = None,
        environment_variables: dict = None,
        **kwargs
    ):
        """
        Azure-managed online deployment with automatic scaling.

        Parameters:
        - name: Deployment name
        - endpoint_name: Target endpoint name
        - model: Model to deploy
        - environment: Runtime environment
        - code_configuration: Scoring script configuration
        - instance_type: VM size for deployment
        - instance_count: Number of instances
        - scale_settings: Auto-scaling configuration
        - request_settings: Request handling settings
        - liveness_probe: Health check configuration
        - readiness_probe: Readiness check configuration
        - environment_variables: Environment variables
        """

class KubernetesOnlineDeployment(OnlineDeployment):
    def __init__(
        self,
        *,
        name: str,
        endpoint_name: str,
        model: Model,
        environment: Environment = None,
        code_configuration: CodeConfiguration = None,
        instance_type: str = None,
        instance_count: int = 1,
        resources: ResourceRequirementsSettings = None,
        **kwargs
    ):
        """
        Kubernetes-based online deployment for custom compute.

        Parameters:
        - name: Deployment name
        - endpoint_name: Target endpoint name
        - model: Model to deploy
        - environment: Runtime environment
        - code_configuration: Scoring script configuration
        - instance_type: Instance type (if applicable)
        - instance_count: Number of replicas
        - resources: Resource requirements (CPU, memory, GPU)
        """
```

#### Usage Example

```python
from azure.ai.ml.entities import ManagedOnlineDeployment, CodeConfiguration, Model, Environment

# Define the model, environment, and code configuration
model = Model(path="./model", name="my-model", version="1")
environment = Environment(
    image="mcr.microsoft.com/azureml/sklearn-1.0-ubuntu20.04-py38-cpu-inference:latest"
)
code_config = CodeConfiguration(
    code="./src",
    scoring_script="score.py"
)

# Create the deployment
deployment = ManagedOnlineDeployment(
    name="my-model-deployment",
    endpoint_name="my-model-endpoint",
    model=model,
    environment=environment,
    code_configuration=code_config,
    instance_type="Standard_DS3_v2",
    instance_count=1
)

# Deploy the model
ml_client.online_deployments.begin_create_or_update(deployment).result()
```
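For Kubernetes endpoints there is an analogous deployment class. A sketch using `ResourceRequirementsSettings` as documented under Deployment Configuration Classes below, reusing the `model`, `environment`, and `code_config` objects from the example above; the resource values are illustrative assumptions:

```python
from azure.ai.ml.entities import KubernetesOnlineDeployment, ResourceRequirementsSettings

# Deploy to a Kubernetes endpoint with explicit resource requests
k8s_deployment = KubernetesOnlineDeployment(
    name="my-k8s-deployment",
    endpoint_name="my-k8s-endpoint",
    model=model,
    environment=environment,
    code_configuration=code_config,
    instance_count=2,
    resources=ResourceRequirementsSettings(cpu="500m", memory="1Gi")  # illustrative values
)
ml_client.online_deployments.begin_create_or_update(k8s_deployment).result()
```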

### Batch Endpoints and Deployments

Batch inference for processing large datasets asynchronously.

```python { .api }
class BatchEndpoint:
    def __init__(
        self,
        *,
        name: str,
        description: str = None,
        tags: dict = None,
        properties: dict = None,
        auth_mode: str = "aad_token",
        **kwargs
    ):
        """
        Batch endpoint for asynchronous batch inference.

        Parameters:
        - name: Endpoint name
        - description: Endpoint description
        - tags: Dictionary of tags
        - properties: Custom properties
        - auth_mode: Authentication mode ("aad_token", "key")
        """

class BatchDeployment:
    def __init__(
        self,
        *,
        name: str,
        endpoint_name: str,
        model: Model = None,
        environment: Environment = None,
        code_configuration: CodeConfiguration = None,
        compute: str,
        instance_count: int = 1,
        max_concurrency_per_instance: int = 1,
        mini_batch_size: int = 10,
        retry_settings: BatchRetrySettings = None,
        output_action: str = "append_row",
        output_file_name: str = "predictions.csv",
        logging_level: str = "info",
        environment_variables: dict = None,
        **kwargs
    ):
        """
        Batch deployment for processing large datasets.

        Parameters:
        - name: Deployment name
        - endpoint_name: Target batch endpoint name
        - model: Model to deploy
        - environment: Runtime environment
        - code_configuration: Scoring script configuration
        - compute: Compute cluster for batch processing
        - instance_count: Number of compute instances
        - max_concurrency_per_instance: Max concurrent processes per instance
        - mini_batch_size: Size of mini-batches for processing
        - retry_settings: Retry configuration for failed batches
        - output_action: How to handle outputs ("append_row", "summary_only")
        - output_file_name: Name of output file
        - logging_level: Logging level ("debug", "info", "warning", "error")
        - environment_variables: Environment variables
        """

class ModelBatchDeployment(BatchDeployment):
    def __init__(
        self,
        *,
        name: str,
        endpoint_name: str,
        model: Model,
        settings: ModelBatchDeploymentSettings = None,
        **kwargs
    ):
        """
        Model-specific batch deployment with optimized settings.

        Parameters:
        - name: Deployment name
        - endpoint_name: Target batch endpoint name
        - model: Model to deploy
        - settings: Model-specific deployment settings
        """
```
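#### Usage Example

A minimal sketch of creating and invoking a batch endpoint, assuming an authenticated `ml_client`, a registered `model`, and an existing compute cluster named "cpu-cluster" (the cluster name and data path are assumptions):

```python
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import BatchDeployment, BatchEndpoint, BatchRetrySettings

# Create the batch endpoint
endpoint = BatchEndpoint(
    name="my-batch-endpoint",
    description="Batch scoring endpoint for my ML model"
)
ml_client.batch_endpoints.begin_create_or_update(endpoint).result()

# Create a deployment that runs on an existing compute cluster
deployment = BatchDeployment(
    name="my-batch-deployment",
    endpoint_name="my-batch-endpoint",
    model=model,
    compute="cpu-cluster",  # assumed: existing AmlCompute cluster
    instance_count=2,
    max_concurrency_per_instance=2,
    mini_batch_size=10,
    retry_settings=BatchRetrySettings(max_retries=3, timeout=30),
    output_file_name="predictions.csv"
)
ml_client.batch_deployments.begin_create_or_update(deployment).result()

# Invoke the endpoint against a folder of input data; returns a batch scoring job
job = ml_client.batch_endpoints.invoke(
    endpoint_name="my-batch-endpoint",
    input=Input(type=AssetTypes.URI_FOLDER, path="./data-to-score")
)
```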

### Deployment Configuration Classes

```python { .api }
class CodeConfiguration:
    def __init__(
        self,
        *,
        code: str,
        scoring_script: str
    ):
        """
        Code configuration for deployments.

        Parameters:
        - code: Path to source code directory
        - scoring_script: Name of scoring script file
        """

class OnlineScaleSettings:
    """Base class for online scaling settings."""

class DefaultScaleSettings(OnlineScaleSettings):
    def __init__(self):
        """Default scaling settings (no auto-scaling)."""

class TargetUtilizationScaleSettings(OnlineScaleSettings):
    def __init__(
        self,
        *,
        min_instances: int = 1,
        max_instances: int = 1,
        target_utilization_percentage: int = 70,
        polling_interval: int = 300,
        scale_up_cooldown: int = 300,
        scale_down_cooldown: int = 300
    ):
        """
        Auto-scaling based on CPU/memory utilization.

        Parameters:
        - min_instances: Minimum number of instances
        - max_instances: Maximum number of instances
        - target_utilization_percentage: Target CPU utilization percentage
        - polling_interval: Polling interval in seconds
        - scale_up_cooldown: Cooldown period in seconds before scaling up again
        - scale_down_cooldown: Cooldown period in seconds before scaling down again
        """

class OnlineRequestSettings:
    def __init__(
        self,
        *,
        request_timeout_ms: int = 90000,
        max_concurrent_requests_per_instance: int = 1,
        max_queue_wait_ms: int = 30000
    ):
        """
        Request handling settings for online deployments.

        Parameters:
        - request_timeout_ms: Request timeout in milliseconds
        - max_concurrent_requests_per_instance: Max concurrent requests per instance
        - max_queue_wait_ms: Max queue wait time in milliseconds
        """

class ProbeSettings:
    def __init__(
        self,
        *,
        failure_threshold: int = 30,
        success_threshold: int = 1,
        timeout: int = 2,
        period: int = 10,
        initial_delay: int = 10
    ):
        """
        Health probe settings for deployments.

        Parameters:
        - failure_threshold: Number of failures before marking unhealthy
        - success_threshold: Number of successes required to mark healthy
        - timeout: Probe timeout in seconds
        - period: Probe period in seconds
        - initial_delay: Initial delay in seconds before the first probe
        """

class BatchRetrySettings:
    def __init__(
        self,
        *,
        max_retries: int = 3,
        timeout: int = 30
    ):
        """
        Retry settings for batch deployments.

        Parameters:
        - max_retries: Maximum number of retries
        - timeout: Timeout for each retry in seconds
        """

class ResourceRequirementsSettings:
    def __init__(
        self,
        *,
        cpu: str = None,
        memory: str = None,
        gpu: str = None
    ):
        """
        Resource requirements for Kubernetes deployments.

        Parameters:
        - cpu: CPU requirements (e.g., "1", "500m")
        - memory: Memory requirements (e.g., "2Gi", "512Mi")
        - gpu: GPU requirements (e.g., "1")
        """
```
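#### Usage Example

A sketch of tuning request handling and health probing for a managed deployment using the configuration classes above; the `model`, `environment`, and `code_config` objects are assumed from the earlier example, and the threshold values are illustrative:

```python
from azure.ai.ml.entities import (
    ManagedOnlineDeployment,
    OnlineRequestSettings,
    ProbeSettings,
)

# Allow longer-running requests and modest per-instance concurrency
request_settings = OnlineRequestSettings(
    request_timeout_ms=60000,
    max_concurrent_requests_per_instance=2
)

# Give the container time to load the model before the first liveness probe
liveness_probe = ProbeSettings(
    failure_threshold=3,
    timeout=5,
    period=30,
    initial_delay=30
)

deployment = ManagedOnlineDeployment(
    name="tuned-deployment",
    endpoint_name="my-model-endpoint",
    model=model,
    environment=environment,
    code_configuration=code_config,
    instance_type="Standard_DS3_v2",
    instance_count=2,
    request_settings=request_settings,
    liveness_probe=liveness_probe
)
ml_client.online_deployments.begin_create_or_update(deployment).result()
```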

### Serverless Endpoints

Serverless inference endpoints with automatic scaling and pay-per-use pricing.

```python { .api }
class ServerlessEndpoint:
    def __init__(
        self,
        *,
        name: str,
        model_id: str,
        auth_mode: str = "key",
        content_safety: dict = None,
        **kwargs
    ):
        """
        Serverless endpoint for model inference.

        Parameters:
        - name: Endpoint name
        - model_id: Model identifier from model catalog
        - auth_mode: Authentication mode ("key", "aad_token")
        - content_safety: Content safety configuration
        """
```
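#### Usage Example

A minimal sketch, assuming the workspace supports serverless endpoints and that `ml_client` exposes a `serverless_endpoints` operations group; the model identifier is a placeholder:

```python
from azure.ai.ml.entities import ServerlessEndpoint

# Deploy a model from the model catalog as a serverless endpoint
endpoint = ServerlessEndpoint(
    name="my-serverless-endpoint",
    model_id="azureml://registries/<registry>/models/<model>/versions/<version>"  # placeholder id
)
ml_client.serverless_endpoints.begin_create_or_update(endpoint).result()

# Retrieve keys for calling the endpoint
keys = ml_client.serverless_endpoints.get_keys("my-serverless-endpoint")
```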

### Endpoint Authentication

Authentication methods and credential management for endpoints.

```python { .api }
class EndpointAuthKeys:
    def __init__(
        self,
        *,
        primary_key: str = None,
        secondary_key: str = None
    ):
        """
        API key authentication for endpoints.

        Parameters:
        - primary_key: Primary API key
        - secondary_key: Secondary API key
        """

class EndpointAuthToken:
    def __init__(
        self,
        *,
        access_token: str
    ):
        """
        Token-based authentication for endpoints.

        Parameters:
        - access_token: Access token for authentication
        """

class EndpointAadToken:
    def __init__(
        self,
        *,
        access_token: str
    ):
        """
        Azure AD token authentication for endpoints.

        Parameters:
        - access_token: Azure AD access token
        """
```

#### Usage Example

```python
# Invoke an online endpoint over HTTP
import json

import requests

# Prepare test data
test_data = {
    "data": [
        [1.0, 2.0, 3.0, 4.0],
        [2.0, 3.0, 4.0, 5.0]
    ]
}

# Get the endpoint URI and key
endpoint = ml_client.online_endpoints.get("my-model-endpoint")
keys = ml_client.online_endpoints.get_keys("my-model-endpoint")

# Make a prediction request
response = requests.post(
    endpoint.scoring_uri,
    headers={
        "Authorization": f"Bearer {keys.primary_key}",
        "Content-Type": "application/json"
    },
    data=json.dumps(test_data)
)

predictions = response.json()
print(predictions)
```