Efficient few-shot learning with Sentence Transformers
—
Export functionality for ONNX and OpenVINO formats to enable efficient deployment and inference. These utilities allow SetFit models to be deployed in production environments with optimized inference engines.
Note: Export functions are available in submodules and require explicit imports:
`from setfit.exporters.onnx import ...`, `from setfit.exporters.openvino import ...`, and `from setfit.exporters.utils import ...`.

Export SetFit models to ONNX format for cross-platform deployment and hardware optimization.
def export_onnx(
    model_body: SentenceTransformer,
    model_head: Union[torch.nn.Module, LogisticRegression],
    opset: int,
    output_path: str = "model.onnx",
    ignore_ir_version: bool = True,
    use_hummingbird: bool = False
) -> None:
    """Export a SetFit model to the ONNX IR format.

    Args:
        model_body: Sentence transformer body to export.
        model_head: Classification head; a ``torch.nn.Module`` or an
            sklearn ``LogisticRegression``.
        opset: ONNX opset version to use (required; no default).
        output_path: Path the ONNX model is saved to.
        ignore_ir_version: Whether to ignore IR version warnings.
        use_hummingbird: Whether to use Hummingbird ML for the sklearn
            head conversion.

    Returns:
        None. The model is saved to ``output_path``.
    """
def export_onnx_setfit_model(
    setfit_model: "OnnxSetFitModel",
    inputs: Dict[str, torch.Tensor],
    output_path: str,
    opset: int = 12
) -> None:
    """Export a SetFit model wrapper to ONNX format.

    Args:
        setfit_model: ONNX-compatible SetFit model wrapper
            (an ``OnnxSetFitModel`` instance).
        inputs: Sample inputs used to trace the model.
        output_path: Path the ONNX model is saved to.
        opset: ONNX opset version. Defaults to 12.
    """
def export_sklearn_head_to_onnx(
    model_head: LogisticRegression,
    opset: int = 11
) -> "onnx.onnx_ml_pb2.ModelProto":
    """Convert an sklearn classification head to ONNX format.

    Args:
        model_head: Trained sklearn ``LogisticRegression`` model.
        opset: ONNX opset version. Defaults to 11.

    Returns:
        ONNX model proto for the sklearn head.
    """
Export SetFit models to OpenVINO IR format for Intel hardware optimization.
def export_to_openvino(
    model: SetFitModel,
    output_path: str = "model.xml"
) -> None:
    """Export a SetFit model to the OpenVINO IR format.

    Args:
        model: Trained SetFit model to export.
        output_path: Path to save the OpenVINO IR file
            (default: ``"model.xml"``).

    Returns:
        None. The model is saved to ``output_path``.
    """
def hummingbird_export(
    model: Union[LogisticRegression, SetFitModel],
    data_sample: Union[np.ndarray, torch.Tensor]
) -> torch.jit.ScriptModule:
    """Export a model to PyTorch using Hummingbird ML.

    Args:
        model: Model to export (sklearn ``LogisticRegression`` or
            ``SetFitModel``).
        data_sample: Sample data used for tracing.

    Returns:
        TorchScript module ready for deployment.
    """
Wrapper class that prepares SetFit models for ONNX export with proper input/output handling.
class OnnxSetFitModel:
    """Wrapper that prepares SetFit models for ONNX export with proper
    input/output handling."""

    def __init__(
        self,
        model_body: "PreTrainedModel",
        pooler: Optional[Union[torch.nn.Module, Callable]] = None,
        model_head: Optional[Union[torch.nn.Module, LogisticRegression]] = None
    ):
        """ONNX export wrapper for SetFit models.

        Args:
            model_body: Pre-trained transformer model body.
            pooler: Pooling function/module applied to the embeddings.
            model_head: Classification head used for predictions.
        """

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """Forward pass through the model for ONNX export.

        Args:
            input_ids: Token IDs produced by the tokenizer.
            attention_mask: Attention mask for the input tokens.

        Returns:
            Model predictions or embeddings.
        """
Helper functions for model export and optimization.
def mean_pooling(
    token_embeddings: torch.Tensor,
    attention_mask: torch.Tensor
) -> torch.Tensor:
    """Perform attention-aware mean pooling on token embeddings.

    Padding positions (``attention_mask == 0``) are excluded from the
    average, so padded batches pool identically to unpadded ones.

    Args:
        token_embeddings: Token-level embeddings
            ``[batch, seq_len, hidden_size]``.
        attention_mask: Attention mask ``[batch, seq_len]`` with 1 for
            real tokens and 0 for padding.

    Returns:
        Pooled sentence embeddings ``[batch, hidden_size]``.
    """
    # Broadcast the mask over the hidden dimension so padded tokens
    # contribute nothing to the sum.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, dim=1)
    # Clamp avoids division by zero for rows that are entirely padding.
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    return summed / counts
from setfit import SetFitModel
from setfit.exporters.onnx import export_onnx
from transformers import AutoTokenizer
import torch
# Load trained SetFit model
model = SetFitModel.from_pretrained("path/to/your/trained/model")
# Export to ONNX.
# NOTE: export_onnx returns None and writes the model to `output_path`;
# it also takes no `device` argument — keep the target path in a variable.
onnx_path = "./setfit_model.onnx"
export_onnx(
    model_body=model.model_body,
    model_head=model.model_head,
    opset=11,
    output_path=onnx_path,
)
print(f"Model exported to: {onnx_path}")
# Verify ONNX model
import onnxruntime as ort
# Create ONNX runtime session
ort_session = ort.InferenceSession(onnx_path)
# Prepare sample input
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
sample_text = "This is a test sentence."
inputs = tokenizer(
    sample_text,
    return_tensors="np",
    padding=True,
    truncation=True,
    max_length=512
)
# Run inference
onnx_outputs = ort_session.run(
    None,
    {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"]
    }
)
print(f"ONNX output shape: {onnx_outputs[0].shape}")
print(f"ONNX predictions: {onnx_outputs[0]}")
from setfit import SetFitModel
from setfit.exporters.onnx import OnnxSetFitModel, export_onnx_setfit_model
import torch
from transformers import AutoTokenizer, AutoModel
# Load model components
model = SetFitModel.from_pretrained("your-model")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# Wrap the transformer body and head so they can be traced as one module.
onnx_model = OnnxSetFitModel(
    model_body=model.model_body[0].auto_model,  # the underlying transformer
    model_head=model.model_head
)
# Tokenize one short text; torch.onnx.export traces the wrapper on it.
sample_inputs = tokenizer(
    "Sample text for tracing",
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
)
# Export with dynamic batch and sequence-length axes so the graph
# accepts inputs of any shape at inference time.
torch.onnx.export(
    onnx_model,
    (sample_inputs["input_ids"], sample_inputs["attention_mask"]),
    "./setfit_dynamic.onnx",
    export_params=True,
    opset_version=11,
    do_constant_folding=True,
    input_names=["input_ids", "attention_mask"],
    output_names=["predictions"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "predictions": {0: "batch_size"}
    }
)
# Shrink the exported graph with dynamic uint8 weight quantization.
from onnxruntime.quantization import quantize_dynamic, QuantType
quantize_dynamic(
    "./setfit_dynamic.onnx",
    "./setfit_quantized.onnx",
    weight_type=QuantType.QUInt8
)
print("Quantized ONNX model saved to: ./setfit_quantized.onnx")
from setfit import SetFitModel
from setfit.exporters.openvino import export_to_openvino
import openvino as ov
# Load trained model
model = SetFitModel.from_pretrained("your-trained-setfit-model")
# Export to OpenVINO IR.
# NOTE: export_to_openvino returns None and only accepts `model` and
# `output_path` (an .xml file path) — there are no precision/optimize
# options in its signature, so keep the path in a variable instead.
ir_path = "./setfit_openvino/model.xml"
export_to_openvino(
    model=model,
    output_path=ir_path,
)
print(f"OpenVINO IR exported to: {ir_path}")
# Load and use OpenVINO model
core = ov.Core()
compiled_model = core.compile_model(ir_path, "CPU")
# Prepare input
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
texts = ["This is amazing!", "This is terrible."]
inputs = tokenizer(
    texts,
    return_tensors="np",
    padding=True,
    truncation=True,
    max_length=512
)
# Run inference
infer_request = compiled_model.create_infer_request()
infer_request.infer({
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"]
})
predictions = infer_request.get_output_tensor().data
print(f"OpenVINO predictions: {predictions}")
from setfit import SetFitModel
from setfit.exporters.utils import hummingbird_export
import torch
import numpy as np
from transformers import AutoTokenizer
# Load model
model = SetFitModel.from_pretrained("your-model")
# If using sklearn head, convert with Hummingbird
if hasattr(model.model_head, 'predict'):  # sklearn model
    # Create sample data for tracing
    sample_embeddings = np.random.randn(10, 384).astype(np.float32)
    # Export sklearn head to PyTorch
    torch_head = hummingbird_export(
        model=model.model_head,
        data_sample=sample_embeddings
    )
    print("Sklearn head converted to TorchScript")
else:
    # Already a PyTorch head
    torch_head = torch.jit.script(model.model_head)
# Create complete TorchScript model
class TorchScriptSetFit(torch.nn.Module):
    """Transformer body + classification head as one traceable module."""

    def __init__(self, sentence_transformer, classification_head):
        super().__init__()
        self.sentence_transformer = sentence_transformer
        self.classification_head = classification_head

    def forward(self, input_ids, attention_mask):
        # Get embeddings
        outputs = self.sentence_transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
        # Classify
        predictions = self.classification_head(embeddings)
        return predictions
# Create scriptable model
scriptable_model = TorchScriptSetFit(
    sentence_transformer=model.model_body[0].auto_model,
    classification_head=torch_head
)
# BUGFIX: `sample_inputs` was used below without ever being defined in
# this script — tokenize a sample sentence so tracing has concrete inputs.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
sample_inputs = tokenizer(
    "Sample text for tracing",
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
)
# Convert to TorchScript
traced_model = torch.jit.trace(
    scriptable_model,
    (sample_inputs["input_ids"], sample_inputs["attention_mask"])
)
# Save TorchScript model
traced_model.save("./setfit_torchscript.pt")
print("TorchScript model saved")
# Load and use
loaded_model = torch.jit.load("./setfit_torchscript.pt")
loaded_model.eval()
with torch.no_grad():
    ts_predictions = loaded_model(
        sample_inputs["input_ids"],
        sample_inputs["attention_mask"]
    )
print(f"TorchScript predictions: {ts_predictions}")
import time
import numpy as np
from setfit import SetFitModel
import onnxruntime as ort
import openvino as ov
import torch
def benchmark_models(texts, num_runs=100):
    """Compare average inference latency across export formats.

    Args:
        texts: List of input strings, scored as one batch per run.
        num_runs: Number of timed runs to average over.

    Returns:
        Dict mapping format name ("PyTorch", "ONNX", "OpenVINO",
        "TorchScript") to average seconds per batch.
    """
    def _avg_seconds(run_once):
        # time.perf_counter is monotonic and high-resolution, unlike
        # time.time, so it is the correct clock for benchmarking.
        start = time.perf_counter()
        for _ in range(num_runs):
            run_once()
        return (time.perf_counter() - start) / num_runs

    # Original PyTorch model
    pytorch_model = SetFitModel.from_pretrained("your-model")
    # ONNX model
    ort_session = ort.InferenceSession("./setfit_model.onnx")
    # OpenVINO model
    core = ov.Core()
    ov_model = core.compile_model("./setfit_openvino/model.xml", "CPU")
    # TorchScript model
    ts_model = torch.jit.load("./setfit_torchscript.pt")
    ts_model.eval()

    # Tokenize once; every backend reuses the same batch.
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )
    np_inputs = {
        "input_ids": inputs["input_ids"].numpy(),
        "attention_mask": inputs["attention_mask"].numpy()
    }

    def _run_pytorch():
        with torch.no_grad():
            pytorch_model.predict(texts)

    def _run_onnx():
        ort_session.run(None, np_inputs)

    # Create the OpenVINO request once, outside the timed loop.
    infer_request = ov_model.create_infer_request()

    def _run_openvino():
        infer_request.infer(np_inputs)
        _ = infer_request.get_output_tensor().data

    def _run_torchscript():
        with torch.no_grad():
            ts_model(inputs["input_ids"], inputs["attention_mask"])

    results = {
        "PyTorch": _avg_seconds(_run_pytorch),
        "ONNX": _avg_seconds(_run_onnx),
        "OpenVINO": _avg_seconds(_run_openvino),
        "TorchScript": _avg_seconds(_run_torchscript),
    }
    return results
# Benchmark a small batch of review-style sentences.
test_texts = [
    "This product is amazing!",
    "I'm not satisfied with this purchase.",
    "Great value for money.",
    "Poor quality, would not recommend."
]
benchmark_results = benchmark_models(test_texts, num_runs=50)
print("Inference Speed Comparison (average per batch):")
# Speedups are reported relative to the eager PyTorch baseline.
baseline_time = benchmark_results["PyTorch"]
for fmt_name, seconds in benchmark_results.items():
    print(f"{fmt_name:12}: {seconds:.4f}s ({baseline_time / seconds:.1f}x speedup)")
from fastapi import FastAPI
import onnxruntime as ort
from typing import List
from pydantic import BaseModel
import numpy as np
from transformers import AutoTokenizer
# Initialize FastAPI app
app = FastAPI(title="SetFit ONNX Inference API")
# Load ONNX model and tokenizer once at startup; both module-level objects
# are reused by every request handler below.
ort_session = ort.InferenceSession("./setfit_model.onnx")
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# Define request/response models
class PredictionRequest(BaseModel):
    """Request body for /predict: texts to classify plus a token cap."""
    texts: List[str]
    max_length: int = 512
class PredictionResponse(BaseModel):
    """Response body for /predict: labels, per-class scores, timing."""
    predictions: List[int]
    probabilities: List[List[float]]
    processing_time: float
@app.post("/predict", response_model=PredictionResponse)
async def predict(request: PredictionRequest):
    """Tokenize the request texts, run ONNX inference, and return class
    predictions with softmax probabilities and wall-clock latency."""
    import time
    start_time = time.time()
    # Tokenize inputs
    encoded = tokenizer(
        request.texts,
        return_tensors="np",
        padding=True,
        truncation=True,
        max_length=request.max_length
    )
    # Run ONNX inference
    ort_outputs = ort_session.run(
        None,
        {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"]
        }
    )
    # Convert raw logits into per-class scores and argmax labels.
    logits = ort_outputs[0]
    probs = softmax(logits, axis=1).tolist()
    labels = np.argmax(logits, axis=1).tolist()
    return PredictionResponse(
        predictions=labels,
        probabilities=probs,
        processing_time=time.time() - start_time
    )
def softmax(x, axis=None):
    """Numerically stable softmax: shift by the max along `axis` before
    exponentiating so large logits cannot overflow."""
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=axis, keepdims=True)
@app.get("/health")
async def health():
    """Liveness probe reporting the served model variant."""
    return {"status": "healthy", "model": "setfit-onnx"}
# Run the API with uvicorn when executed as a script.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
Install with Tessl CLI
npx tessl i tessl/pypi-setfit