or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

absa.md · core-model-training.md · data-utilities.md · index.md · knowledge-distillation.md · model-cards.md · model-export.md

docs/model-export.md

0

# Model Export

1

2

Export functionality for ONNX and OpenVINO formats to enable efficient deployment and inference. These utilities allow SetFit models to be deployed in production environments with optimized inference engines.

3

4

**Note**: Export functions are available in submodules and require explicit imports:

5

- ONNX functions: `from setfit.exporters.onnx import ...`

6

- OpenVINO functions: `from setfit.exporters.openvino import ...`

7

- Utility functions: `from setfit.exporters.utils import ...`

8

9

## Capabilities

10

11

### ONNX Export

12

13

Export SetFit models to ONNX format for cross-platform deployment and hardware optimization.

14

15

```python { .api }

16

def export_onnx(

17

model_body: SentenceTransformer,

18

model_head: Union[torch.nn.Module, LogisticRegression],

19

opset: int,

20

output_path: str = "model.onnx",

21

ignore_ir_version: bool = True,

22

use_hummingbird: bool = False

23

) -> None:

24

"""

25

Export SetFit model to ONNX IR format.

26

27

Parameters:

28

- model_body: Sentence transformer body to export

29

- model_head: Classification head (PyTorch module or sklearn)

30

- opset: ONNX opset version to use (required)

31

- output_path: Path to save the ONNX model

32

- ignore_ir_version: Whether to ignore IR version warnings

33

- use_hummingbird: Whether to use Hummingbird ML for sklearn conversion

34

35

Returns:

36

None (saves model to output_path)

37

"""

38

39

def export_onnx_setfit_model(

40

setfit_model: "OnnxSetFitModel",

41

inputs: Dict[str, torch.Tensor],

42

output_path: str,

43

opset: int = 12

44

) -> None:

45

"""

46

Export SetFit model wrapper to ONNX format.

47

48

Parameters:

49

- setfit_model: ONNX-compatible SetFit model wrapper

50

- inputs: Sample inputs for tracing the model

51

- output_path: Path to save the ONNX model

52

- opset: ONNX opset version

53

"""

54

55

def export_sklearn_head_to_onnx(

56

model_head: LogisticRegression,

57

opset: int = 11

58

) -> "onnx.onnx_ml_pb2.ModelProto":

59

"""

60

Convert sklearn classification head to ONNX format.

61

62

Parameters:

63

- model_head: Trained sklearn LogisticRegression model

64

- opset: ONNX opset version

65

66

Returns:

67

ONNX model proto for the sklearn head

68

"""

69

```

70

71

### OpenVINO Export

72

73

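Export SetFit models to OpenVINO IR format for Intel hardware optimization. The `hummingbird_export` helper listed in this section is not OpenVINO-specific; as the TorchScript usage example shows, it is imported from `setfit.exporters.utils`.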

74

75

```python { .api }

76

def export_to_openvino(

77

model: SetFitModel,

78

output_path: str = "model.xml"

79

) -> None:

80

"""

81

Export SetFit model to OpenVINO IR format.

82

83

Parameters:

84

- model: Trained SetFit model to export

85

- output_path: Path to save the OpenVINO IR file (default: "model.xml")

86

87

Returns:

88

None (saves model to output_path)

89

"""

90

91

def hummingbird_export(

92

model: Union[LogisticRegression, SetFitModel],

93

data_sample: Union[np.ndarray, torch.Tensor]

94

) -> torch.jit.ScriptModule:

95

"""

96

Export model using Hummingbird ML for PyTorch conversion.

97

98

Parameters:

99

- model: Model to export (sklearn or SetFit)

100

- data_sample: Sample data for tracing

101

102

Returns:

103

TorchScript model ready for deployment

104

"""

105

```

106

107

### ONNX Model Wrapper

108

109

Wrapper class that prepares SetFit models for ONNX export with proper input/output handling.

110

111

```python { .api }

112

class OnnxSetFitModel:

113

def __init__(

114

self,

115

model_body: "PreTrainedModel",

116

pooler: Optional[Union[torch.nn.Module, Callable]] = None,

117

model_head: Optional[Union[torch.nn.Module, LogisticRegression]] = None

118

):

119

"""

120

ONNX export wrapper for SetFit models.

121

122

Parameters:

123

- model_body: Pre-trained transformer model body

124

- pooler: Pooling function/module for embeddings

125

- model_head: Classification head for predictions

126

"""

127

128

def forward(

129

self,

130

input_ids: torch.Tensor,

131

attention_mask: torch.Tensor

132

) -> torch.Tensor:

133

"""

134

Forward pass through the model for ONNX export.

135

136

Parameters:

137

- input_ids: Token IDs from tokenizer

138

- attention_mask: Attention mask for input tokens

139

140

Returns:

141

Model predictions or embeddings

142

"""

143

```

144

145

### Utility Functions

146

147

Helper functions for model export and optimization.

148

149

```python { .api }

150

def mean_pooling(

151

token_embeddings: torch.Tensor,

152

attention_mask: torch.Tensor

153

) -> torch.Tensor:

154

"""

155

Perform attention-aware mean pooling on token embeddings.

156

157

Parameters:

158

- token_embeddings: Token-level embeddings [batch, seq_len, hidden_size]

159

- attention_mask: Attention mask [batch, seq_len]

160

161

Returns:

162

Pooled sentence embeddings [batch, hidden_size]

163

"""

164

```

165

166

## Usage Examples

167

168

### Basic ONNX Export

169

170

```python

171

from setfit import SetFitModel

172

from setfit.exporters.onnx import export_onnx

173

from transformers import AutoTokenizer

174

import torch

175

176

# Load trained SetFit model

177

model = SetFitModel.from_pretrained("path/to/your/trained/model")

178

179

# Export to ONNX

180

onnx_path = "./setfit_model.onnx"  # export_onnx returns None, so keep the output path ourselves

181

export_onnx(

182

model_body=model.model_body,

183

model_head=model.model_head,

184

output_path=onnx_path,

185

opset=11

186

)

187

188

print(f"Model exported to: {onnx_path}")

189

190

# Verify ONNX model

191

import onnxruntime as ort

192

193

# Create ONNX runtime session

194

ort_session = ort.InferenceSession(onnx_path)

195

196

# Prepare sample input

197

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

198

sample_text = "This is a test sentence."

199

inputs = tokenizer(

200

sample_text,

201

return_tensors="np",

202

padding=True,

203

truncation=True,

204

max_length=512

205

)

206

207

# Run inference

208

onnx_outputs = ort_session.run(

209

None,

210

{

211

"input_ids": inputs["input_ids"],

212

"attention_mask": inputs["attention_mask"]

213

}

214

)

215

216

print(f"ONNX output shape: {onnx_outputs[0].shape}")

217

print(f"ONNX predictions: {onnx_outputs[0]}")

218

```

219

220

### Advanced ONNX Export with Quantization

221

222

```python

223

from setfit import SetFitModel

224

from setfit.exporters.onnx import OnnxSetFitModel, export_onnx_setfit_model

225

import torch

226

from transformers import AutoTokenizer, AutoModel

227

228

# Load model components

229

model = SetFitModel.from_pretrained("your-model")

230

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

231

232

# Create ONNX-compatible wrapper

233

onnx_model = OnnxSetFitModel(

234

model_body=model.model_body[0].auto_model, # Get the transformer

235

model_head=model.model_head

236

)

237

238

# Prepare sample inputs for tracing

239

sample_inputs = tokenizer(

240

"Sample text for tracing",

241

return_tensors="pt",

242

padding=True,

243

truncation=True,

244

max_length=512

245

)

246

247

# Export with dynamic shapes

248

torch.onnx.export(

249

onnx_model,

250

(sample_inputs["input_ids"], sample_inputs["attention_mask"]),

251

"./setfit_dynamic.onnx",

252

export_params=True,

253

opset_version=11,

254

do_constant_folding=True,

255

input_names=["input_ids", "attention_mask"],

256

output_names=["predictions"],

257

dynamic_axes={

258

"input_ids": {0: "batch_size", 1: "sequence_length"},

259

"attention_mask": {0: "batch_size", 1: "sequence_length"},

260

"predictions": {0: "batch_size"}

261

}

262

)

263

264

# Apply post-export quantization

265

from onnxruntime.quantization import quantize_dynamic, QuantType

266

267

quantize_dynamic(

268

"./setfit_dynamic.onnx",

269

"./setfit_quantized.onnx",

270

weight_type=QuantType.QUInt8

271

)

272

273

print("Quantized ONNX model saved to: ./setfit_quantized.onnx")

274

```

275

276

### OpenVINO Export for Intel Hardware

277

278

```python

279

from setfit import SetFitModel

280

from setfit.exporters.openvino import export_to_openvino

281

import openvino as ov

282

283

# Load trained model

284

model = SetFitModel.from_pretrained("your-trained-setfit-model")

285

286

# Export to OpenVINO IR

287

ir_path = "./setfit_openvino/model.xml"  # export_to_openvino returns None, so keep the output path ourselves

288

export_to_openvino(

289

model=model,

290

output_path=ir_path  # the documented signature accepts only model and output_path

291

)

292

293

294

print(f"OpenVINO IR exported to: {ir_path}")

295

296

# Load and use OpenVINO model

297

core = ov.Core()

298

compiled_model = core.compile_model(ir_path, "CPU")

299

300

# Prepare input

301

from transformers import AutoTokenizer

302

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

303

304

texts = ["This is amazing!", "This is terrible."]

305

inputs = tokenizer(

306

texts,

307

return_tensors="np",

308

padding=True,

309

truncation=True,

310

max_length=512

311

)

312

313

# Run inference

314

infer_request = compiled_model.create_infer_request()

315

infer_request.infer({

316

"input_ids": inputs["input_ids"],

317

"attention_mask": inputs["attention_mask"]

318

})

319

320

predictions = infer_request.get_output_tensor().data

321

print(f"OpenVINO predictions: {predictions}")

322

```

323

324

### TorchScript Export with Hummingbird

325

326

```python

327

from setfit import SetFitModel

328

from setfit.exporters.utils import hummingbird_export

329

import torch

330

import numpy as np

331

332

# Load model

333

model = SetFitModel.from_pretrained("your-model")

334

335

# If using sklearn head, convert with Hummingbird

336

if hasattr(model.model_head, 'predict'): # sklearn model

337

# Create sample data for tracing

338

sample_embeddings = np.random.randn(10, 384).astype(np.float32)

339

340

# Export sklearn head to PyTorch

341

torch_head = hummingbird_export(

342

model=model.model_head,

343

data_sample=sample_embeddings

344

)

345

346

print("Sklearn head converted to TorchScript")

347

else:

348

# Already a PyTorch head

349

torch_head = torch.jit.script(model.model_head)

350

351

# Create complete TorchScript model

352

class TorchScriptSetFit(torch.nn.Module):

353

def __init__(self, sentence_transformer, classification_head):

354

super().__init__()

355

self.sentence_transformer = sentence_transformer

356

self.classification_head = classification_head

357

358

def forward(self, input_ids, attention_mask):

359

# Get embeddings

360

outputs = self.sentence_transformer(

361

input_ids=input_ids,

362

attention_mask=attention_mask

363

)

364

embeddings = outputs.last_hidden_state.mean(dim=1) # Mean pooling

365

366

# Classify

367

predictions = self.classification_head(embeddings)

368

return predictions

369

370

# Create scriptable model

371

scriptable_model = TorchScriptSetFit(

372

sentence_transformer=model.model_body[0].auto_model,

373

classification_head=torch_head

374

)

375

376

# Convert to TorchScript (sample_inputs are tokenized "pt" tensors, prepared as in the Advanced ONNX Export example above)

377

traced_model = torch.jit.trace(

378

scriptable_model,

379

(sample_inputs["input_ids"], sample_inputs["attention_mask"])

380

)

381

382

# Save TorchScript model

383

traced_model.save("./setfit_torchscript.pt")

384

print("TorchScript model saved")

385

386

# Load and use

387

loaded_model = torch.jit.load("./setfit_torchscript.pt")

388

loaded_model.eval()

389

390

with torch.no_grad():

391

ts_predictions = loaded_model(

392

sample_inputs["input_ids"],

393

sample_inputs["attention_mask"]

394

)

395

396

print(f"TorchScript predictions: {ts_predictions}")

397

```

398

399

### Deployment Performance Comparison

400

401

```python

402

import time

403

import numpy as np

404

from setfit import SetFitModel

405

import onnxruntime as ort

406

import openvino as ov

407

import torch

408

409

def benchmark_models(texts, num_runs=100):

410

"""Compare inference speed across different export formats."""

411

412

# Original PyTorch model

413

pytorch_model = SetFitModel.from_pretrained("your-model")

414

415

# ONNX model

416

ort_session = ort.InferenceSession("./setfit_model.onnx")

417

418

# OpenVINO model

419

core = ov.Core()

420

ov_model = core.compile_model("./setfit_openvino/model.xml", "CPU")

421

422

# TorchScript model

423

ts_model = torch.jit.load("./setfit_torchscript.pt")

424

ts_model.eval()

425

426

# Prepare inputs

427

from transformers import AutoTokenizer

428

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

429

430

inputs = tokenizer(

431

texts,

432

return_tensors="pt",

433

padding=True,

434

truncation=True,

435

max_length=512

436

)

437

438

np_inputs = {

439

"input_ids": inputs["input_ids"].numpy(),

440

"attention_mask": inputs["attention_mask"].numpy()

441

}

442

443

results = {}

444

445

# Benchmark PyTorch

446

start_time = time.time()

447

for _ in range(num_runs):

448

with torch.no_grad():

449

_ = pytorch_model.predict(texts)

450

pytorch_time = (time.time() - start_time) / num_runs

451

results["PyTorch"] = pytorch_time

452

453

# Benchmark ONNX

454

start_time = time.time()

455

for _ in range(num_runs):

456

_ = ort_session.run(None, np_inputs)

457

onnx_time = (time.time() - start_time) / num_runs

458

results["ONNX"] = onnx_time

459

460

# Benchmark OpenVINO

461

infer_request = ov_model.create_infer_request()

462

start_time = time.time()

463

for _ in range(num_runs):

464

infer_request.infer(np_inputs)

465

_ = infer_request.get_output_tensor().data

466

openvino_time = (time.time() - start_time) / num_runs

467

results["OpenVINO"] = openvino_time

468

469

# Benchmark TorchScript

470

start_time = time.time()

471

for _ in range(num_runs):

472

with torch.no_grad():

473

_ = ts_model(inputs["input_ids"], inputs["attention_mask"])

474

torchscript_time = (time.time() - start_time) / num_runs

475

results["TorchScript"] = torchscript_time

476

477

return results

478

479

# Run benchmark

480

test_texts = [

481

"This product is amazing!",

482

"I'm not satisfied with this purchase.",

483

"Great value for money.",

484

"Poor quality, would not recommend."

485

]

486

487

benchmark_results = benchmark_models(test_texts, num_runs=50)

488

489

print("Inference Speed Comparison (average per batch):")

490

baseline_time = benchmark_results["PyTorch"]

491

492

for model_type, avg_time in benchmark_results.items():

493

speedup = baseline_time / avg_time

494

print(f"{model_type:12}: {avg_time:.4f}s ({speedup:.1f}x speedup)")

495

```

496

497

### Production Deployment Example

498

499

```python

500

from fastapi import FastAPI

501

import onnxruntime as ort

502

from typing import List

503

from pydantic import BaseModel

504

import numpy as np

505

from transformers import AutoTokenizer

506

507

# Initialize FastAPI app

508

app = FastAPI(title="SetFit ONNX Inference API")

509

510

# Load ONNX model and tokenizer

511

ort_session = ort.InferenceSession("./setfit_model.onnx")

512

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

513

514

# Define request/response models

515

class PredictionRequest(BaseModel):

516

texts: List[str]

517

max_length: int = 512

518

519

class PredictionResponse(BaseModel):

520

predictions: List[int]

521

probabilities: List[List[float]]

522

processing_time: float

523

524

@app.post("/predict", response_model=PredictionResponse)

525

async def predict(request: PredictionRequest):

526

import time

527

start_time = time.time()

528

529

# Tokenize inputs

530

inputs = tokenizer(

531

request.texts,

532

return_tensors="np",

533

padding=True,

534

truncation=True,

535

max_length=request.max_length

536

)

537

538

# Run ONNX inference

539

outputs = ort_session.run(

540

None,

541

{

542

"input_ids": inputs["input_ids"],

543

"attention_mask": inputs["attention_mask"]

544

}

545

)

546

547

# Process outputs

548

logits = outputs[0]

549

probabilities = softmax(logits, axis=1).tolist()

550

predictions = np.argmax(logits, axis=1).tolist()

551

552

processing_time = time.time() - start_time

553

554

return PredictionResponse(

555

predictions=predictions,

556

probabilities=probabilities,

557

processing_time=processing_time

558

)

559

560

def softmax(x, axis=None):

561

"""Compute softmax values."""

562

exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))

563

return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

564

565

@app.get("/health")

566

async def health():

567

return {"status": "healthy", "model": "setfit-onnx"}

568

569

if __name__ == "__main__":

570

import uvicorn

571

uvicorn.run(app, host="0.0.0.0", port=8000)

572

```