0
# Model Export
1
2
Export functionality for ONNX and OpenVINO formats to enable efficient deployment and inference. These utilities allow SetFit models to be deployed in production environments with optimized inference engines.
3
4
**Note**: Export functions are available in submodules and require explicit imports:
5
- ONNX functions: `from setfit.exporters.onnx import ...`
6
- OpenVINO functions: `from setfit.exporters.openvino import ...`
7
- Utility functions: `from setfit.exporters.utils import ...`
8
9
## Capabilities
10
11
### ONNX Export
12
13
Export SetFit models to ONNX format for cross-platform deployment and hardware optimization.
14
15
```python { .api }
16
def export_onnx(
    model_body: SentenceTransformer,
    model_head: Union[torch.nn.Module, LogisticRegression],
    opset: int,
    output_path: str = "model.onnx",
    ignore_ir_version: bool = True,
    use_hummingbird: bool = False
) -> None:
    """
    Export SetFit model to ONNX IR format.

    Parameters:
    - model_body: Sentence transformer body to export
    - model_head: Classification head (PyTorch module or sklearn)
    - opset: ONNX opset version to use (required, no default)
    - output_path: Path to save the ONNX model
    - ignore_ir_version: Whether to ignore IR version warnings
    - use_hummingbird: Whether to use Hummingbird ML for sklearn conversion

    Returns:
    None (the model is written to output_path; nothing is returned)
    """
38
39
def export_onnx_setfit_model(
    setfit_model: "OnnxSetFitModel",
    inputs: Dict[str, torch.Tensor],
    output_path: str,
    opset: int = 12
) -> None:
    """
    Export SetFit model wrapper to ONNX format.

    Parameters:
    - setfit_model: ONNX-compatible SetFit model wrapper
    - inputs: Sample inputs for tracing the model
    - output_path: Path to save the ONNX model
    - opset: ONNX opset version (default: 12)

    Returns:
    None (saves model to output_path)
    """
54
55
def export_sklearn_head_to_onnx(
    model_head: LogisticRegression,
    opset: int = 11
) -> "onnx.onnx_ml_pb2.ModelProto":
    """
    Convert sklearn classification head to ONNX format.

    Parameters:
    - model_head: Trained sklearn LogisticRegression model
    - opset: ONNX opset version (default: 11)

    Returns:
    ONNX model proto for the sklearn head
    """
69
```
70
71
### OpenVINO Export
72
73
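Export SetFit models to OpenVINO IR format for Intel hardware optimization. The `hummingbird_export` helper listed below is a general utility imported from `setfit.exporters.utils`, not from the OpenVINO submodule.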
74
75
```python { .api }
76
def export_to_openvino(
    model: SetFitModel,
    output_path: str = "model.xml"
) -> None:
    """
    Export SetFit model to OpenVINO IR format.

    Parameters:
    - model: Trained SetFit model to export
    - output_path: Path to save the OpenVINO IR file (default: "model.xml")

    Returns:
    None (the IR is written to output_path; nothing is returned)
    """
90
91
def hummingbird_export(
    model: Union[LogisticRegression, SetFitModel],
    data_sample: Union[np.ndarray, torch.Tensor]
) -> torch.jit.ScriptModule:
    """
    Export model using Hummingbird ML for PyTorch conversion.

    Parameters:
    - model: Model to export (sklearn or SetFit)
    - data_sample: Sample data for tracing

    Returns:
    TorchScript model ready for deployment
    """
105
```
106
107
### ONNX Model Wrapper
108
109
Wrapper class that prepares SetFit models for ONNX export with proper input/output handling.
110
111
```python { .api }
112
class OnnxSetFitModel(torch.nn.Module):
    # Declared as a torch.nn.Module because the wrapper exposes forward() and
    # is handed directly to torch.onnx.export in the usage examples.

    def __init__(
        self,
        model_body: "PreTrainedModel",
        pooler: Optional[Union[torch.nn.Module, Callable]] = None,
        model_head: Optional[Union[torch.nn.Module, LogisticRegression]] = None
    ):
        """
        ONNX export wrapper for SetFit models.

        Parameters:
        - model_body: Pre-trained transformer model body
        - pooler: Pooling function/module for embeddings (optional)
        - model_head: Classification head for predictions (optional)
        """

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """
        Forward pass through the model for ONNX export.

        Parameters:
        - input_ids: Token IDs from tokenizer
        - attention_mask: Attention mask for input tokens

        Returns:
        Model predictions or embeddings
        """
143
```
144
145
### Utility Functions
146
147
Helper functions for model export and optimization.
148
149
```python { .api }
150
def mean_pooling(
    token_embeddings: torch.Tensor,
    attention_mask: torch.Tensor
) -> torch.Tensor:
    """
    Perform attention-aware mean pooling on token embeddings.

    Parameters:
    - token_embeddings: Token-level embeddings [batch, seq_len, hidden_size]
    - attention_mask: Attention mask [batch, seq_len]

    Returns:
    Pooled sentence embeddings [batch, hidden_size]
    """
164
```
165
166
## Usage Examples
167
168
### Basic ONNX Export
169
170
```python
171
from setfit import SetFitModel
from setfit.exporters.onnx import export_onnx
from transformers import AutoTokenizer
import torch

# Load trained SetFit model
model = SetFitModel.from_pretrained("path/to/your/trained/model")

# Export to ONNX.
# export_onnx() returns None and writes the model to `output_path`, so the
# path is kept in a variable for the steps below. It has no `device` argument.
onnx_path = "./setfit_model.onnx"
export_onnx(
    model_body=model.model_body,
    model_head=model.model_head,
    opset=11,
    output_path=onnx_path,
)

print(f"Model exported to: {onnx_path}")

# Verify ONNX model
import onnxruntime as ort

# Create ONNX runtime session
ort_session = ort.InferenceSession(onnx_path)

# Prepare sample input
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
sample_text = "This is a test sentence."
inputs = tokenizer(
    sample_text,
    return_tensors="np",
    padding=True,
    truncation=True,
    max_length=512
)

# Run inference.
# Feed exactly the inputs the exported graph declares (for example it may or
# may not expect "token_type_ids") instead of hard-coding the names.
onnx_feed = {
    graph_input.name: inputs[graph_input.name]
    for graph_input in ort_session.get_inputs()
}
onnx_outputs = ort_session.run(None, onnx_feed)

print(f"ONNX output shape: {onnx_outputs[0].shape}")
print(f"ONNX predictions: {onnx_outputs[0]}")
218
```
219
220
### Advanced ONNX Export with Quantization
221
222
```python
223
from setfit import SetFitModel
from setfit.exporters.onnx import OnnxSetFitModel, export_onnx_setfit_model
import torch
from transformers import AutoTokenizer, AutoModel

# Load model components
model = SetFitModel.from_pretrained("your-model")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Create ONNX-compatible wrapper
onnx_model = OnnxSetFitModel(
    model_body=model.model_body[0].auto_model,  # Get the transformer
    model_head=model.model_head
)

# Prepare sample inputs for tracing
sample_inputs = tokenizer(
    "Sample text for tracing",
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
)

# Export with dynamic shapes so batch size and sequence length may vary at
# inference time
torch.onnx.export(
    onnx_model,
    (sample_inputs["input_ids"], sample_inputs["attention_mask"]),
    "./setfit_dynamic.onnx",
    export_params=True,
    opset_version=11,
    do_constant_folding=True,
    input_names=["input_ids", "attention_mask"],
    output_names=["predictions"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "predictions": {0: "batch_size"}
    }
)

# Apply post-export quantization
from onnxruntime.quantization import quantize_dynamic, QuantType

quantize_dynamic(
    "./setfit_dynamic.onnx",
    "./setfit_quantized.onnx",
    weight_type=QuantType.QUInt8
)

print("Quantized ONNX model saved to: ./setfit_quantized.onnx")
274
```
275
276
### OpenVINO Export for Intel Hardware
277
278
```python
279
from setfit import SetFitModel
from setfit.exporters.openvino import export_to_openvino
import openvino as ov

# Load trained model
model = SetFitModel.from_pretrained("your-trained-setfit-model")

# Export to OpenVINO IR.
# export_to_openvino() takes only `model` and `output_path` and returns None,
# so the IR file path is kept in a variable and reused below.
ir_path = "./setfit_openvino/model.xml"
export_to_openvino(
    model=model,
    output_path=ir_path
)

print(f"OpenVINO IR exported to: {ir_path}")

# Load and use OpenVINO model
core = ov.Core()
compiled_model = core.compile_model(ir_path, "CPU")

# Prepare input
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

texts = ["This is amazing!", "This is terrible."]
inputs = tokenizer(
    texts,
    return_tensors="np",
    padding=True,
    truncation=True,
    max_length=512
)

# Run inference
infer_request = compiled_model.create_infer_request()
infer_request.infer({
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"]
})

predictions = infer_request.get_output_tensor().data
print(f"OpenVINO predictions: {predictions}")
322
```
323
324
### TorchScript Export with Hummingbird
325
326
```python
327
from setfit import SetFitModel
from setfit.exporters.utils import hummingbird_export, mean_pooling
from transformers import AutoTokenizer
import torch
import numpy as np

# Load model
model = SetFitModel.from_pretrained("your-model")

# Sample inputs used to trace (and later smoke-test) the TorchScript model
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
sample_inputs = tokenizer(
    "Sample text for tracing",
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
)

# If using sklearn head, convert with Hummingbird.
# The head is either a torch.nn.Module or an sklearn estimator; checking for a
# `predict` attribute is not reliable because a torch head may define one too.
if not isinstance(model.model_head, torch.nn.Module):  # sklearn model
    # Create sample data for tracing; the second dimension must match the
    # embedding size of the model body (384 for all-MiniLM-L6-v2)
    sample_embeddings = np.random.randn(10, 384).astype(np.float32)

    # Export sklearn head to PyTorch
    torch_head = hummingbird_export(
        model=model.model_head,
        data_sample=sample_embeddings
    )

    print("Sklearn head converted to TorchScript")
else:
    # Already a PyTorch head
    torch_head = torch.jit.script(model.model_head)

# Create complete TorchScript model
class TorchScriptSetFit(torch.nn.Module):
    """Transformer body + pooling + classification head in a single module."""

    def __init__(self, sentence_transformer, classification_head):
        super().__init__()
        self.sentence_transformer = sentence_transformer
        self.classification_head = classification_head

    def forward(self, input_ids, attention_mask):
        # Get embeddings
        outputs = self.sentence_transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Attention-aware mean pooling: a plain .mean(dim=1) would average
        # padding tokens into the sentence embedding for padded batches.
        embeddings = mean_pooling(outputs.last_hidden_state, attention_mask)

        # Classify
        predictions = self.classification_head(embeddings)
        return predictions

# Create scriptable model
scriptable_model = TorchScriptSetFit(
    sentence_transformer=model.model_body[0].auto_model,
    classification_head=torch_head
)
# Trace in eval mode so dropout and similar layers are recorded as inference ops
scriptable_model.eval()

# Convert to TorchScript
traced_model = torch.jit.trace(
    scriptable_model,
    (sample_inputs["input_ids"], sample_inputs["attention_mask"])
)

# Save TorchScript model
traced_model.save("./setfit_torchscript.pt")
print("TorchScript model saved")

# Load and use
loaded_model = torch.jit.load("./setfit_torchscript.pt")
loaded_model.eval()

with torch.no_grad():
    ts_predictions = loaded_model(
        sample_inputs["input_ids"],
        sample_inputs["attention_mask"]
    )

print(f"TorchScript predictions: {ts_predictions}")
397
```
398
399
### Deployment Performance Comparison
400
401
```python
402
import time
import numpy as np
from setfit import SetFitModel
import onnxruntime as ort
import openvino as ov
import torch
408
409
def benchmark_models(texts, num_runs=100):
    """Compare inference speed across different export formats.

    Parameters:
    - texts: List of input strings forming one batch
    - num_runs: Number of timed repetitions per backend (must be >= 1)

    Returns:
    Dict mapping backend name ("PyTorch", "ONNX", "OpenVINO", "TorchScript")
    to the average wall-clock seconds per batch.

    Raises:
    ValueError if num_runs is smaller than 1.

    Note: the PyTorch figure times SetFitModel.predict on raw texts, so it
    includes tokenization; the other backends are timed on pre-tokenized
    inputs only.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    # Original PyTorch model
    pytorch_model = SetFitModel.from_pretrained("your-model")

    # ONNX model
    ort_session = ort.InferenceSession("./setfit_model.onnx")

    # OpenVINO model
    core = ov.Core()
    ov_model = core.compile_model("./setfit_openvino/model.xml", "CPU")

    # TorchScript model
    ts_model = torch.jit.load("./setfit_torchscript.pt")
    ts_model.eval()

    # Prepare inputs
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )

    np_inputs = {
        "input_ids": inputs["input_ids"].numpy(),
        "attention_mask": inputs["attention_mask"].numpy()
    }

    # Feed the ONNX session exactly the inputs its graph declares
    onnx_feed = {
        graph_input.name: inputs[graph_input.name].numpy()
        for graph_input in ort_session.get_inputs()
    }

    infer_request = ov_model.create_infer_request()

    def run_pytorch():
        with torch.no_grad():
            pytorch_model.predict(texts)

    def run_onnx():
        ort_session.run(None, onnx_feed)

    def run_openvino():
        infer_request.infer(np_inputs)
        _ = infer_request.get_output_tensor().data

    def run_torchscript():
        with torch.no_grad():
            ts_model(inputs["input_ids"], inputs["attention_mask"])

    results = {}
    results["PyTorch"] = _average_latency(run_pytorch, num_runs)
    results["ONNX"] = _average_latency(run_onnx, num_runs)
    results["OpenVINO"] = _average_latency(run_openvino, num_runs)
    results["TorchScript"] = _average_latency(run_torchscript, num_runs)

    return results


def _average_latency(run_once, num_runs):
    """Return the mean seconds per call of run_once over num_runs timed calls."""
    run_once()  # warm-up: keep one-time initialisation out of the measurement
    start = time.perf_counter()  # monotonic, high-resolution clock
    for _ in range(num_runs):
        run_once()
    return (time.perf_counter() - start) / num_runs
478
479
# Run benchmark
test_texts = [
    "This product is amazing!",
    "I'm not satisfied with this purchase.",
    "Great value for money.",
    "Poor quality, would not recommend."
]

benchmark_results = benchmark_models(test_texts, num_runs=50)

print("Inference Speed Comparison (average per batch):")
# Speedups are reported relative to the unexported PyTorch model
baseline_time = benchmark_results["PyTorch"]

for model_type, avg_time in benchmark_results.items():
    speedup = baseline_time / avg_time
    print(f"{model_type:12}: {avg_time:.4f}s ({speedup:.1f}x speedup)")
495
```
496
497
### Production Deployment Example
498
499
```python
500
from fastapi import FastAPI
import onnxruntime as ort
from typing import List
from pydantic import BaseModel
import numpy as np
from transformers import AutoTokenizer

# Initialize FastAPI app
app = FastAPI(title="SetFit ONNX Inference API")

# Load ONNX model and tokenizer once at import time so every request reuses them
ort_session = ort.InferenceSession("./setfit_model.onnx")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
513
514
# Define request/response models
515
class PredictionRequest(BaseModel):
    """Request body for /predict."""

    # Texts to classify, processed as one batch
    texts: List[str]
    # Maximum token length passed to the tokenizer (longer inputs are truncated)
    max_length: int = 512
518
519
class PredictionResponse(BaseModel):
    """Response body for /predict."""

    # Index of the highest-scoring class for each input text
    predictions: List[int]
    # Per-class probabilities for each input text
    probabilities: List[List[float]]
    # Wall-clock seconds spent handling the request
    processing_time: float
523
524
@app.post("/predict", response_model=PredictionResponse)
def predict(request: PredictionRequest):
    """Classify request.texts with the ONNX model.

    Declared as a plain `def` (not `async def`): tokenization and ONNX
    inference are blocking CPU work, and FastAPI runs sync endpoints in a
    thread pool instead of stalling the event loop.

    Returns a PredictionResponse with the predicted class index and the class
    probabilities per text, plus the processing time in seconds.
    """
    import time
    start_time = time.perf_counter()

    # Nothing to classify: answer immediately instead of calling the tokenizer
    # with an empty batch.
    if not request.texts:
        return PredictionResponse(
            predictions=[],
            probabilities=[],
            processing_time=time.perf_counter() - start_time
        )

    # Tokenize inputs
    inputs = tokenizer(
        request.texts,
        return_tensors="np",
        padding=True,
        truncation=True,
        max_length=request.max_length
    )

    # Run ONNX inference, feeding exactly the inputs the exported graph declares
    onnx_feed = {
        graph_input.name: inputs[graph_input.name]
        for graph_input in ort_session.get_inputs()
    }
    outputs = ort_session.run(None, onnx_feed)

    # Process outputs
    # NOTE(review): assumes the first graph output holds per-class scores of
    # shape (batch, num_classes) - confirm against the exported model.
    logits = outputs[0]
    probabilities = softmax(logits, axis=1).tolist()
    predictions = np.argmax(logits, axis=1).tolist()

    processing_time = time.perf_counter() - start_time

    return PredictionResponse(
        predictions=predictions,
        probabilities=probabilities,
        processing_time=processing_time
    )
559
560
def softmax(x, axis=None):
    """Compute softmax values.

    Parameters:
    - x: Array-like of scores
    - axis: Axis along which to normalize (None normalizes over all elements)

    Returns:
    Array of the same shape as x whose values sum to 1 along `axis`.
    """
    x = np.asarray(x)
    # Subtract the maximum before exponentiating so large scores cannot overflow
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)
564
565
@app.get("/health")
async def health():
    """Liveness probe: returns a static status payload."""
    return {"status": "healthy", "model": "setfit-onnx"}
568
569
if __name__ == "__main__":
    import uvicorn
    # Listens on all interfaces, port 8000
    uvicorn.run(app, host="0.0.0.0", port=8000)
572
```