Efficient few-shot learning with Sentence Transformers
—
Export functionality for ONNX and OpenVINO formats to enable efficient deployment and inference. These utilities allow SetFit models to be deployed in production environments with optimized inference engines.
Note: Export functions are available in submodules and require explicit imports:
`from setfit.exporters.onnx import ...`, `from setfit.exporters.openvino import ...`, and `from setfit.exporters.utils import ...`.

Export SetFit models to ONNX format for cross-platform deployment and hardware optimization.
def export_onnx(
    model_body: SentenceTransformer,
    model_head: Union[torch.nn.Module, LogisticRegression],
    opset: int,
    output_path: str = "model.onnx",
    ignore_ir_version: bool = True,
    use_hummingbird: bool = False
) -> None:
    """Export a SetFit model to the ONNX IR format.

    Args:
        model_body: Sentence transformer body to export.
        model_head: Classification head; a ``torch.nn.Module`` or an
            sklearn ``LogisticRegression``.
        opset: ONNX opset version to use (required; no default).
        output_path: Path the ONNX model is saved to.
        ignore_ir_version: Whether to ignore IR version warnings.
        use_hummingbird: Whether to use Hummingbird ML for the sklearn
            head conversion.

    Returns:
        None. The model is saved to ``output_path``.
    """
def export_onnx_setfit_model(
    setfit_model: "OnnxSetFitModel",
    inputs: Dict[str, torch.Tensor],
    output_path: str,
    opset: int = 12
) -> None:
    """Export a SetFit model wrapper to ONNX format.

    Args:
        setfit_model: ONNX-compatible SetFit model wrapper
            (an ``OnnxSetFitModel`` instance).
        inputs: Sample inputs used to trace the model.
        output_path: Path the ONNX model is saved to.
        opset: ONNX opset version. Defaults to 12.
    """
def export_sklearn_head_to_onnx(
    model_head: LogisticRegression,
    opset: int = 11
) -> "onnx.onnx_ml_pb2.ModelProto":
    """Convert an sklearn classification head to ONNX format.

    Args:
        model_head: Trained sklearn ``LogisticRegression`` model.
        opset: ONNX opset version. Defaults to 11.

    Returns:
        ONNX model proto for the sklearn head.
    """
Export SetFit models to OpenVINO IR format for Intel hardware optimization.
def export_to_openvino(
    model: SetFitModel,
    output_path: str = "model.xml"
) -> None:
    """Export a SetFit model to the OpenVINO IR format.

    Args:
        model: Trained SetFit model to export.
        output_path: Path to save the OpenVINO IR file
            (default: ``"model.xml"``).

    Returns:
        None. The model is saved to ``output_path``.
    """
def hummingbird_export(
    model: Union[LogisticRegression, SetFitModel],
    data_sample: Union[np.ndarray, torch.Tensor]
) -> torch.jit.ScriptModule:
    """Export a model to PyTorch using Hummingbird ML.

    Args:
        model: Model to export (sklearn ``LogisticRegression`` or
            ``SetFitModel``).
        data_sample: Sample data used for tracing.

    Returns:
        TorchScript module ready for deployment.
    """
Wrapper class that prepares SetFit models for ONNX export with proper input/output handling.
class OnnxSetFitModel:
    """Wrapper that prepares SetFit models for ONNX export with proper
    input/output handling."""

    def __init__(
        self,
        model_body: "PreTrainedModel",
        pooler: Optional[Union[torch.nn.Module, Callable]] = None,
        model_head: Optional[Union[torch.nn.Module, LogisticRegression]] = None
    ):
        """ONNX export wrapper for SetFit models.

        Args:
            model_body: Pre-trained transformer model body.
            pooler: Pooling function/module applied to the embeddings.
            model_head: Classification head used for predictions.
        """

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """Forward pass through the model for ONNX export.

        Args:
            input_ids: Token IDs produced by the tokenizer.
            attention_mask: Attention mask for the input tokens.

        Returns:
            Model predictions or embeddings.
        """
Helper functions for model export and optimization.
def mean_pooling(
    token_embeddings: torch.Tensor,
    attention_mask: torch.Tensor
) -> torch.Tensor:
    """Perform attention-aware mean pooling on token embeddings.

    Padding positions (``attention_mask == 0``) are excluded from the
    average, so padded batches pool identically to unpadded ones.

    Args:
        token_embeddings: Token-level embeddings
            ``[batch, seq_len, hidden_size]``.
        attention_mask: Attention mask ``[batch, seq_len]`` with 1 for
            real tokens and 0 for padding.

    Returns:
        Pooled sentence embeddings ``[batch, hidden_size]``.
    """
    # Broadcast the mask over the hidden dimension so padded tokens
    # contribute nothing to the sum.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, dim=1)
    # Clamp avoids division by zero for rows that are entirely padding.
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    return summed / counts
from setfit import SetFitModel
from setfit.exporters.onnx import export_onnx
from transformers import AutoTokenizer
import torch
# Load trained SetFit model
model = SetFitModel.from_pretrained("path/to/your/trained/model")
# Export to ONNX.
# NOTE: export_onnx returns None and writes the model to `output_path`;
# it also takes no `device` argument — keep the target path in a variable.
onnx_path = "./setfit_model.onnx"
export_onnx(
    model_body=model.model_body,
    model_head=model.model_head,
    opset=11,
    output_path=onnx_path,
)
print(f"Model exported to: {onnx_path}")
# Verify ONNX model
import onnxruntime as ort
# Create ONNX runtime session
ort_session = ort.InferenceSession(onnx_path)
# Prepare sample input
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
sample_text = "This is a test sentence."
inputs = tokenizer(
    sample_text,
    return_tensors="np",
    padding=True,
    truncation=True,
    max_length=512
)
# Run inference
onnx_outputs = ort_session.run(
    None,
    {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"]
    }
)
print(f"ONNX output shape: {onnx_outputs[0].shape}")
print(f"ONNX predictions: {onnx_outputs[0]}")
from setfit import SetFitModel
from setfit.exporters.onnx import OnnxSetFitModel, export_onnx_setfit_model
import torch
from transformers import AutoTokenizer, AutoModel
# Load model components
model = SetFitModel.from_pretrained("your-model")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# Wrap the transformer body and head so they can be traced as one module.
onnx_model = OnnxSetFitModel(
    model_body=model.model_body[0].auto_model,  # the underlying transformer
    model_head=model.model_head
)
# Tokenize one short text; torch.onnx.export traces the wrapper on it.
sample_inputs = tokenizer(
    "Sample text for tracing",
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
)
# Export with dynamic batch and sequence-length axes so the graph
# accepts inputs of any shape at inference time.
torch.onnx.export(
    onnx_model,
    (sample_inputs["input_ids"], sample_inputs["attention_mask"]),
    "./setfit_dynamic.onnx",
    export_params=True,
    opset_version=11,
    do_constant_folding=True,
    input_names=["input_ids", "attention_mask"],
    output_names=["predictions"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "predictions": {0: "batch_size"}
    }
)
# Shrink the exported graph with dynamic uint8 weight quantization.
from onnxruntime.quantization import quantize_dynamic, QuantType
quantize_dynamic(
    "./setfit_dynamic.onnx",
    "./setfit_quantized.onnx",
    weight_type=QuantType.QUInt8
)
print("Quantized ONNX model saved to: ./setfit_quantized.onnx")
from setfit import SetFitModel
from setfit.exporters.openvino import export_to_openvino
import openvino as ov
# Load trained model
model = SetFitModel.from_pretrained("your-trained-setfit-model")
# Export to OpenVINO IR.
# NOTE: export_to_openvino returns None and only accepts `model` and
# `output_path` (an .xml file path) — there are no precision/optimize
# options in its signature, so keep the path in a variable instead.
ir_path = "./setfit_openvino/model.xml"
export_to_openvino(
    model=model,
    output_path=ir_path,
)
print(f"OpenVINO IR exported to: {ir_path}")
# Load and use OpenVINO model
core = ov.Core()
compiled_model = core.compile_model(ir_path, "CPU")
# Prepare input
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
texts = ["This is amazing!", "This is terrible."]
inputs = tokenizer(
    texts,
    return_tensors="np",
    padding=True,
    truncation=True,
    max_length=512
)
# Run inference
infer_request = compiled_model.create_infer_request()
infer_request.infer({
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"]
})
predictions = infer_request.get_output_tensor().data
print(f"OpenVINO predictions: {predictions}")
from setfit import SetFitModel
from setfit.exporters.utils import hummingbird_export
import torch
import numpy as np
from transformers import AutoTokenizer
# Load model
model = SetFitModel.from_pretrained("your-model")
# If using sklearn head, convert with Hummingbird
if hasattr(model.model_head, 'predict'):  # sklearn model
    # Create sample data for tracing
    sample_embeddings = np.random.randn(10, 384).astype(np.float32)
    # Export sklearn head to PyTorch
    torch_head = hummingbird_export(
        model=model.model_head,
        data_sample=sample_embeddings
    )
    print("Sklearn head converted to TorchScript")
else:
    # Already a PyTorch head
    torch_head = torch.jit.script(model.model_head)
# Create complete TorchScript model
class TorchScriptSetFit(torch.nn.Module):
    """Transformer body + classification head as one traceable module."""

    def __init__(self, sentence_transformer, classification_head):
        super().__init__()
        self.sentence_transformer = sentence_transformer
        self.classification_head = classification_head

    def forward(self, input_ids, attention_mask):
        # Get embeddings
        outputs = self.sentence_transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
        # Classify
        predictions = self.classification_head(embeddings)
        return predictions
# Create scriptable model
scriptable_model = TorchScriptSetFit(
    sentence_transformer=model.model_body[0].auto_model,
    classification_head=torch_head
)
# BUGFIX: `sample_inputs` was used below without ever being defined in
# this script — tokenize a sample sentence so tracing has concrete inputs.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
sample_inputs = tokenizer(
    "Sample text for tracing",
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
)
# Convert to TorchScript
traced_model = torch.jit.trace(
    scriptable_model,
    (sample_inputs["input_ids"], sample_inputs["attention_mask"])
)
# Save TorchScript model
traced_model.save("./setfit_torchscript.pt")
print("TorchScript model saved")
# Load and use
loaded_model = torch.jit.load("./setfit_torchscript.pt")
loaded_model.eval()
with torch.no_grad():
    ts_predictions = loaded_model(
        sample_inputs["input_ids"],
        sample_inputs["attention_mask"]
    )
print(f"TorchScript predictions: {ts_predictions}")
import time
import numpy as np
from setfit import SetFitModel
import onnxruntime as ort
import openvino as ov
import torch
def benchmark_models(texts, num_runs=100):
    """Compare average inference latency across export formats.

    Args:
        texts: List of input strings, scored as one batch per run.
        num_runs: Number of timed runs to average over.

    Returns:
        Dict mapping format name ("PyTorch", "ONNX", "OpenVINO",
        "TorchScript") to average seconds per batch.
    """
    def _avg_seconds(run_once):
        # time.perf_counter is monotonic and high-resolution, unlike
        # time.time, so it is the correct clock for benchmarking.
        start = time.perf_counter()
        for _ in range(num_runs):
            run_once()
        return (time.perf_counter() - start) / num_runs

    # Original PyTorch model
    pytorch_model = SetFitModel.from_pretrained("your-model")
    # ONNX model
    ort_session = ort.InferenceSession("./setfit_model.onnx")
    # OpenVINO model
    core = ov.Core()
    ov_model = core.compile_model("./setfit_openvino/model.xml", "CPU")
    # TorchScript model
    ts_model = torch.jit.load("./setfit_torchscript.pt")
    ts_model.eval()

    # Tokenize once; every backend reuses the same batch.
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )
    np_inputs = {
        "input_ids": inputs["input_ids"].numpy(),
        "attention_mask": inputs["attention_mask"].numpy()
    }

    def _run_pytorch():
        with torch.no_grad():
            pytorch_model.predict(texts)

    def _run_onnx():
        ort_session.run(None, np_inputs)

    # Create the OpenVINO request once, outside the timed loop.
    infer_request = ov_model.create_infer_request()

    def _run_openvino():
        infer_request.infer(np_inputs)
        _ = infer_request.get_output_tensor().data

    def _run_torchscript():
        with torch.no_grad():
            ts_model(inputs["input_ids"], inputs["attention_mask"])

    results = {
        "PyTorch": _avg_seconds(_run_pytorch),
        "ONNX": _avg_seconds(_run_onnx),
        "OpenVINO": _avg_seconds(_run_openvino),
        "TorchScript": _avg_seconds(_run_torchscript),
    }
    return results
# Benchmark a small batch of review-style sentences.
test_texts = [
    "This product is amazing!",
    "I'm not satisfied with this purchase.",
    "Great value for money.",
    "Poor quality, would not recommend."
]
benchmark_results = benchmark_models(test_texts, num_runs=50)
print("Inference Speed Comparison (average per batch):")
# Speedups are reported relative to the eager PyTorch baseline.
baseline_time = benchmark_results["PyTorch"]
for fmt_name, seconds in benchmark_results.items():
    print(f"{fmt_name:12}: {seconds:.4f}s ({baseline_time / seconds:.1f}x speedup)")
from fastapi import FastAPI
import onnxruntime as ort
from typing import List
from pydantic import BaseModel
import numpy as np
from transformers import AutoTokenizer
# Initialize FastAPI app
app = FastAPI(title="SetFit ONNX Inference API")
# Load ONNX model and tokenizer once at startup; both module-level objects
# are reused by every request handler below.
ort_session = ort.InferenceSession("./setfit_model.onnx")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# Define request/response models
class PredictionRequest(BaseModel):
    """Request body for /predict: texts to classify plus a token cap."""
    texts: List[str]
    max_length: int = 512
class PredictionResponse(BaseModel):
    """Response body for /predict: labels, per-class scores, timing."""
    predictions: List[int]
    probabilities: List[List[float]]
    processing_time: float
@app.post("/predict", response_model=PredictionResponse)
async def predict(request: PredictionRequest):
    """Tokenize the request texts, run ONNX inference, and return class
    predictions with softmax probabilities and wall-clock latency."""
    import time
    start_time = time.time()
    # Tokenize inputs
    encoded = tokenizer(
        request.texts,
        return_tensors="np",
        padding=True,
        truncation=True,
        max_length=request.max_length
    )
    # Run ONNX inference
    ort_outputs = ort_session.run(
        None,
        {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"]
        }
    )
    # Convert raw logits into per-class scores and argmax labels.
    logits = ort_outputs[0]
    probs = softmax(logits, axis=1).tolist()
    labels = np.argmax(logits, axis=1).tolist()
    return PredictionResponse(
        predictions=labels,
        probabilities=probs,
        processing_time=time.time() - start_time
    )
def softmax(x, axis=None):
    """Numerically stable softmax: shift by the max along `axis` before
    exponentiating so large logits cannot overflow."""
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=axis, keepdims=True)
@app.get("/health")
async def health():
    """Liveness probe reporting the served model variant."""
    return {"status": "healthy", "model": "setfit-onnx"}
# Run the API with uvicorn when executed as a script.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
Install with Tessl CLI
npx tessl i tessl/pypi-setfit