CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-setfit

Efficient few-shot learning with Sentence Transformers

Pending
Overview
Eval results
Files

model-export.mddocs/

Model Export

Export functionality for ONNX and OpenVINO formats to enable efficient deployment and inference. These utilities allow SetFit models to be deployed in production environments with optimized inference engines.

Note: Export functions are available in submodules and require explicit imports:

  • ONNX functions: from setfit.exporters.onnx import ...
  • OpenVINO functions: from setfit.exporters.openvino import ...
  • Utility functions: from setfit.exporters.utils import ...

Capabilities

ONNX Export

Export SetFit models to ONNX format for cross-platform deployment and hardware optimization.

def export_onnx(
    model_body: SentenceTransformer,
    model_head: Union[torch.nn.Module, LogisticRegression],
    opset: int,
    output_path: str = "model.onnx",
    ignore_ir_version: bool = True,
    use_hummingbird: bool = False
) -> None:
    """
    Export SetFit model to ONNX IR format.

    Parameters:
    - model_body: Sentence transformer body to export
    - model_head: Classification head (PyTorch module or sklearn)
    - opset: ONNX opset version to use (required)
    - output_path: Path to save the ONNX model
    - ignore_ir_version: Whether to ignore IR version warnings
    - use_hummingbird: Whether to use Hummingbird ML for sklearn conversion

    Returns:
    None (saves model to output_path)
    """

def export_onnx_setfit_model(
    setfit_model: "OnnxSetFitModel",
    inputs: Dict[str, torch.Tensor],
    output_path: str,
    opset: int = 12
) -> None:
    """
    Export SetFit model wrapper to ONNX format.

    Parameters:
    - setfit_model: ONNX-compatible SetFit model wrapper
    - inputs: Sample inputs for tracing the model
    - output_path: Path to save the ONNX model
    - opset: ONNX opset version
    """

def export_sklearn_head_to_onnx(
    model_head: LogisticRegression,
    opset: int = 11
) -> "onnx.onnx_ml_pb2.ModelProto":
    """
    Convert sklearn classification head to ONNX format.

    Parameters:
    - model_head: Trained sklearn LogisticRegression model
    - opset: ONNX opset version

    Returns:
    ONNX model proto for the sklearn head
    """

OpenVINO Export

Export SetFit models to OpenVINO IR format for Intel hardware optimization.

def export_to_openvino(
    model: SetFitModel,
    output_path: str = "model.xml"
) -> None:
    """
    Export SetFit model to OpenVINO IR format.

    Parameters:
    - model: Trained SetFit model to export
    - output_path: Path to save the OpenVINO IR file (default: "model.xml")

    Returns:
    None (saves model to output_path)
    """

def hummingbird_export(
    model: Union[LogisticRegression, SetFitModel],
    data_sample: Union[np.ndarray, torch.Tensor]
) -> torch.jit.ScriptModule:
    """
    Export model using Hummingbird ML for PyTorch conversion.

    Parameters:
    - model: Model to export (sklearn or SetFit)
    - data_sample: Sample data for tracing

    Returns:
    TorchScript model ready for deployment
    """

ONNX Model Wrapper

Wrapper class that prepares SetFit models for ONNX export with proper input/output handling.

class OnnxSetFitModel:
    def __init__(
        self,
        model_body: "PreTrainedModel",
        pooler: Optional[Union[torch.nn.Module, Callable]] = None,
        model_head: Optional[Union[torch.nn.Module, LogisticRegression]] = None
    ):
        """
        ONNX export wrapper for SetFit models.

        Parameters:
        - model_body: Pre-trained transformer model body
        - pooler: Pooling function/module for embeddings
        - model_head: Classification head for predictions
        """

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """
        Forward pass through the model for ONNX export.

        Parameters:
        - input_ids: Token IDs from tokenizer
        - attention_mask: Attention mask for input tokens

        Returns:
        Model predictions or embeddings
        """

Utility Functions

Helper functions for model export and optimization.

def mean_pooling(
    token_embeddings: torch.Tensor,
    attention_mask: torch.Tensor
) -> torch.Tensor:
    """
    Perform attention-aware mean pooling on token embeddings.

    Parameters:
    - token_embeddings: Token-level embeddings [batch, seq_len, hidden_size]
    - attention_mask: Attention mask [batch, seq_len]

    Returns:
    Pooled sentence embeddings [batch, hidden_size]
    """

Usage Examples

Basic ONNX Export

from setfit import SetFitModel
from setfit.exporters.onnx import export_onnx
from transformers import AutoTokenizer
import torch

# Load trained SetFit model
model = SetFitModel.from_pretrained("path/to/your/trained/model")

# Export to ONNX (saves the model to output_path; returns None)
onnx_path = "./setfit_model.onnx"
export_onnx(
    model_body=model.model_body,
    model_head=model.model_head,
    opset=11,
    output_path=onnx_path
)

print(f"Model exported to: {onnx_path}")

# Verify ONNX model
import onnxruntime as ort

# Create ONNX runtime session
ort_session = ort.InferenceSession(onnx_path)

# Prepare sample input
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
sample_text = "This is a test sentence."
inputs = tokenizer(
    sample_text,
    return_tensors="np",
    padding=True,
    truncation=True,
    max_length=512
)

# Run inference
onnx_outputs = ort_session.run(
    None,
    {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"]
    }
)

print(f"ONNX output shape: {onnx_outputs[0].shape}")
print(f"ONNX predictions: {onnx_outputs[0]}")

Advanced ONNX Export with Quantization

from setfit import SetFitModel
from setfit.exporters.onnx import OnnxSetFitModel, export_onnx_setfit_model
import torch
from transformers import AutoTokenizer, AutoModel

# Load model components
model = SetFitModel.from_pretrained("your-model")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Create ONNX-compatible wrapper
onnx_model = OnnxSetFitModel(
    model_body=model.model_body[0].auto_model,  # Get the transformer
    model_head=model.model_head
)

# Prepare sample inputs for tracing
sample_inputs = tokenizer(
    "Sample text for tracing",
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
)

# Export with dynamic shapes
torch.onnx.export(
    onnx_model,
    (sample_inputs["input_ids"], sample_inputs["attention_mask"]),
    "./setfit_dynamic.onnx",
    export_params=True,
    opset_version=11,
    do_constant_folding=True,
    input_names=["input_ids", "attention_mask"],
    output_names=["predictions"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "predictions": {0: "batch_size"}
    }
)

# Apply post-export quantization
from onnxruntime.quantization import quantize_dynamic, QuantType

quantize_dynamic(
    "./setfit_dynamic.onnx",
    "./setfit_quantized.onnx",
    weight_type=QuantType.QUInt8
)

print("Quantized ONNX model saved to: ./setfit_quantized.onnx")

OpenVINO Export for Intel Hardware

from setfit import SetFitModel
from setfit.exporters.openvino import export_to_openvino
import openvino as ov

# Load trained model
model = SetFitModel.from_pretrained("your-trained-setfit-model")

# Export to OpenVINO IR (saves the model to output_path; returns None)
ir_path = "./setfit_openvino"
export_to_openvino(
    model=model,
    output_path=f"{ir_path}/model.xml"
)

print(f"OpenVINO IR exported to: {ir_path}/model.xml")

# Load and use OpenVINO model
core = ov.Core()
compiled_model = core.compile_model(f"{ir_path}/model.xml", "CPU")

# Prepare input
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

texts = ["This is amazing!", "This is terrible."]
inputs = tokenizer(
    texts,
    return_tensors="np",
    padding=True,
    truncation=True,
    max_length=512
)

# Run inference
infer_request = compiled_model.create_infer_request()
infer_request.infer({
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"]
})

predictions = infer_request.get_output_tensor().data
print(f"OpenVINO predictions: {predictions}")

TorchScript Export with Hummingbird

from setfit import SetFitModel
from setfit.exporters.utils import hummingbird_export
import torch
import numpy as np

# Load model
model = SetFitModel.from_pretrained("your-model")

# If using sklearn head, convert with Hummingbird
if hasattr(model.model_head, 'predict'):  # sklearn model
    # Create sample data for tracing
    sample_embeddings = np.random.randn(10, 384).astype(np.float32)
    
    # Export sklearn head to PyTorch
    torch_head = hummingbird_export(
        model=model.model_head,
        data_sample=sample_embeddings
    )
    
    print("Sklearn head converted to TorchScript")
else:
    # Already a PyTorch head
    torch_head = torch.jit.script(model.model_head)

# Create complete TorchScript model
class TorchScriptSetFit(torch.nn.Module):
    def __init__(self, sentence_transformer, classification_head):
        super().__init__()
        self.sentence_transformer = sentence_transformer
        self.classification_head = classification_head
    
    def forward(self, input_ids, attention_mask):
        # Get embeddings
        outputs = self.sentence_transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
        
        # Classify
        predictions = self.classification_head(embeddings)
        return predictions

# Create scriptable model
scriptable_model = TorchScriptSetFit(
    sentence_transformer=model.model_body[0].auto_model,
    classification_head=torch_head
)

# Prepare sample inputs for tracing
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
sample_inputs = tokenizer(
    "Sample text for tracing",
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
)

# Convert to TorchScript
traced_model = torch.jit.trace(
    scriptable_model,
    (sample_inputs["input_ids"], sample_inputs["attention_mask"])
)

# Save TorchScript model
traced_model.save("./setfit_torchscript.pt")
print("TorchScript model saved")

# Load and use
loaded_model = torch.jit.load("./setfit_torchscript.pt")
loaded_model.eval()

with torch.no_grad():
    ts_predictions = loaded_model(
        sample_inputs["input_ids"],
        sample_inputs["attention_mask"]
    )

print(f"TorchScript predictions: {ts_predictions}")

Deployment Performance Comparison

import time
import numpy as np
from setfit import SetFitModel
import onnxruntime as ort
import openvino as ov
import torch

def benchmark_models(texts, num_runs=100):
    """Compare inference speed across different export formats."""
    
    # Original PyTorch model
    pytorch_model = SetFitModel.from_pretrained("your-model")
    
    # ONNX model
    ort_session = ort.InferenceSession("./setfit_model.onnx")
    
    # OpenVINO model
    core = ov.Core()
    ov_model = core.compile_model("./setfit_openvino/model.xml", "CPU")
    
    # TorchScript model
    ts_model = torch.jit.load("./setfit_torchscript.pt")
    ts_model.eval()
    
    # Prepare inputs
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )
    
    np_inputs = {
        "input_ids": inputs["input_ids"].numpy(),
        "attention_mask": inputs["attention_mask"].numpy()
    }
    
    results = {}
    
    # Benchmark PyTorch
    start_time = time.time()
    for _ in range(num_runs):
        with torch.no_grad():
            _ = pytorch_model.predict(texts)
    pytorch_time = (time.time() - start_time) / num_runs
    results["PyTorch"] = pytorch_time
    
    # Benchmark ONNX
    start_time = time.time()
    for _ in range(num_runs):
        _ = ort_session.run(None, np_inputs)
    onnx_time = (time.time() - start_time) / num_runs
    results["ONNX"] = onnx_time
    
    # Benchmark OpenVINO
    infer_request = ov_model.create_infer_request()
    start_time = time.time()
    for _ in range(num_runs):
        infer_request.infer(np_inputs)
        _ = infer_request.get_output_tensor().data
    openvino_time = (time.time() - start_time) / num_runs
    results["OpenVINO"] = openvino_time
    
    # Benchmark TorchScript
    start_time = time.time()
    for _ in range(num_runs):
        with torch.no_grad():
            _ = ts_model(inputs["input_ids"], inputs["attention_mask"])
    torchscript_time = (time.time() - start_time) / num_runs
    results["TorchScript"] = torchscript_time
    
    return results

# Run benchmark
test_texts = [
    "This product is amazing!",
    "I'm not satisfied with this purchase.",
    "Great value for money.",
    "Poor quality, would not recommend."
]

benchmark_results = benchmark_models(test_texts, num_runs=50)

print("Inference Speed Comparison (average per batch):")
baseline_time = benchmark_results["PyTorch"]

for model_type, avg_time in benchmark_results.items():
    speedup = baseline_time / avg_time
    print(f"{model_type:12}: {avg_time:.4f}s ({speedup:.1f}x speedup)")

Production Deployment Example

from fastapi import FastAPI
import onnxruntime as ort
from typing import List
from pydantic import BaseModel
import numpy as np
from transformers import AutoTokenizer

# Initialize FastAPI app
app = FastAPI(title="SetFit ONNX Inference API")

# Load ONNX model and tokenizer
ort_session = ort.InferenceSession("./setfit_model.onnx")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Define request/response models
class PredictionRequest(BaseModel):
    texts: List[str]
    max_length: int = 512

class PredictionResponse(BaseModel):
    predictions: List[int]
    probabilities: List[List[float]]
    processing_time: float

@app.post("/predict", response_model=PredictionResponse)
async def predict(request: PredictionRequest):
    import time
    start_time = time.time()
    
    # Tokenize inputs
    inputs = tokenizer(
        request.texts,
        return_tensors="np",
        padding=True,
        truncation=True,
        max_length=request.max_length
    )
    
    # Run ONNX inference
    outputs = ort_session.run(
        None,
        {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"]
        }
    )
    
    # Process outputs
    logits = outputs[0]
    probabilities = softmax(logits, axis=1).tolist()
    predictions = np.argmax(logits, axis=1).tolist()
    
    processing_time = time.time() - start_time
    
    return PredictionResponse(
        predictions=predictions,
        probabilities=probabilities,
        processing_time=processing_time
    )

def softmax(x, axis=None):
    """Compute softmax values."""
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

@app.get("/health")
async def health():
    return {"status": "healthy", "model": "setfit-onnx"}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

Install with Tessl CLI

npx tessl i tessl/pypi-setfit

docs

absa.md

core-model-training.md

data-utilities.md

index.md

knowledge-distillation.md

model-cards.md

model-export.md

tile.json