Embeddings, Retrieval, and Reranking framework for computing dense, sparse, and cross-encoder embeddings using state-of-the-art transformer models
—
The sentence-transformers package provides various utility functions for model optimization, quantization, export to different formats, similarity computation, and training enhancements.
def quantize_embeddings(
embeddings: Tensor | np.ndarray,
precision: Literal["float32", "int8", "uint8", "binary", "ubinary"],
ranges: np.ndarray | None = None,
calibration_embeddings: np.ndarray | None = None
) -> np.ndarray{ .api }
Quantize embeddings to reduce memory usage and improve inference speed.
Parameters:
embeddings: Unquantized (e.g. float) embeddings to quantize to a given precision
precision: The precision to convert to ("float32", "int8", "uint8", "binary", "ubinary")
ranges: Ranges for quantization of embeddings. Used for int8 quantization, where the ranges refer to the minimum and maximum values for each dimension. 2D array with shape (2, embedding_dim)
calibration_embeddings: Embeddings used for calibration during quantization. Used for int8 quantization to compute ranges
Returns: Quantized embeddings with the specified precision
Usage Examples:
import numpy as np
from sentence_transformers import quantize_embeddings, SentenceTransformer
# Generate sample embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = ["Hello world", "How are you?", "Machine learning is great"]
embeddings = model.encode(sentences)
# Float32 quantization (no change, returns same embeddings)
quantized_embs = quantize_embeddings(embeddings, precision="float32")
print(f"Original size: {embeddings.nbytes} bytes")
print(f"Quantized size: {quantized_embs.nbytes} bytes")
# Int8 quantization with calibration
calibration_data = model.encode(["Sample sentence " + str(i) for i in range(100)])
quantized_int8 = quantize_embeddings(
embeddings,
precision="int8",
calibration_embeddings=calibration_data
)
# Binary quantization (extreme compression)
binary_embs = quantize_embeddings(embeddings, precision="binary")
def export_optimized_onnx_model(
model: SentenceTransformer,
onnx_model_path: str,
opset_version: int = 14,
optimization_level: str = "O2"
) -> None{ .api }
Export SentenceTransformer model to optimized ONNX format for deployment.
Parameters:
model: SentenceTransformer model to export
onnx_model_path: Output path for ONNX model
opset_version: ONNX opset version to use
optimization_level: Optimization level ("O1", "O2", "O3")
def export_dynamic_quantized_onnx_model(
model: SentenceTransformer,
onnx_model_path: str,
quantization_mode: str = "IntegerOps"
) -> None{ .api }
Export model to dynamically quantized ONNX format.
Parameters:
model: SentenceTransformer model to export
onnx_model_path: Output path for quantized ONNX model
quantization_mode: Quantization mode ("IntegerOps", "QLinearOps")
def export_static_quantized_openvino_model(
model: SentenceTransformer,
openvino_model_path: str,
calibration_dataset: list[str] | None = None
) -> None{ .api }
Export model to statically quantized OpenVINO format for Intel hardware optimization.
Parameters:
model: SentenceTransformer model to export
openvino_model_path: Output path for OpenVINO model
calibration_dataset: Dataset for static quantization calibration
Usage Examples:
from sentence_transformers.backend import (
export_optimized_onnx_model,
export_dynamic_quantized_onnx_model,
export_static_quantized_openvino_model
)
# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')
# Export to optimized ONNX
export_optimized_onnx_model(
model=model,
onnx_model_path="./optimized_model.onnx",
opset_version=14,
optimization_level="O2"
)
# Export to quantized ONNX for even faster inference
export_dynamic_quantized_onnx_model(
model=model,
onnx_model_path="./quantized_model.onnx",
quantization_mode="IntegerOps"
)
# Export to OpenVINO for Intel hardware
calibration_texts = ["Sample text " + str(i) for i in range(100)]
export_static_quantized_openvino_model(
model=model,
openvino_model_path="./openvino_model",
calibration_dataset=calibration_texts
)
# Use exported ONNX model with ONNX Runtime
import onnxruntime as ort
import numpy as np
# Load ONNX model
ort_session = ort.InferenceSession("./optimized_model.onnx")
# Tokenize input
inputs = model.tokenizer("Hello world", return_tensors="np", padding=True, truncation=True)
# Run inference
onnx_outputs = ort_session.run(None, {
"input_ids": inputs["input_ids"].astype(np.int64),
"attention_mask": inputs["attention_mask"].astype(np.int64)
})
print(f"ONNX embedding shape: {onnx_outputs[0].shape}")
def mine_hard_negatives(
model: SentenceTransformer,
sentences: list[str],
labels: list[int],
batch_size: int = 32,
top_k: int = 10,
margin: float = 0.2
) -> list[dict[str, Any]]{ .api }
Mine hard negative examples for improved contrastive training.
Parameters:
model: SentenceTransformer model for encoding
sentences: List of sentences to mine from
labels: Corresponding labels for sentences
batch_size: Batch size for encoding
top_k: Number of hard negatives to return per positive
margin: Margin for hard negative selection
Returns: List of dictionaries with anchor, positive, and hard negative examples
Usage Examples:
from sentence_transformers import mine_hard_negatives
# Prepare labeled data
sentences = [
"Python is a programming language",
"Java is used for software development",
"Machine learning uses algorithms",
"Deep learning is a subset of ML",
"Cars are vehicles",
"Trucks are large vehicles"
]
labels = [0, 0, 1, 1, 2, 2] # Programming, ML, Vehicles
# Mine hard negatives
hard_negatives = mine_hard_negatives(
model=model,
sentences=sentences,
labels=labels,
top_k=2,
margin=0.3
)
print("Hard negative examples:")
for example in hard_negatives[:3]: # Show first 3
print(f"Anchor: {example['anchor']}")
print(f"Positive: {example['positive']}")
print(f"Hard Negative: {example['negative']}")
print(f"Similarity: {example['similarity']:.4f}")
print()
# Use hard negatives in training
from sentence_transformers.losses import TripletLoss
from datasets import Dataset
# Convert to training format
train_examples = [
{
"anchor": ex["anchor"],
"positive": ex["positive"],
"negative": ex["negative"]
}
for ex in hard_negatives
]
train_dataset = Dataset.from_list(train_examples)
triplet_loss = TripletLoss(model)
# Train with hard negatives (improves model performance)
The SimilarityFunction enum provides standardized similarity computation methods:
from sentence_transformers import SimilarityFunction
class SimilarityFunction(Enum):
COSINE = "cosine"
DOT_PRODUCT = "dot"
DOT = "dot" # Alias for DOT_PRODUCT
EUCLIDEAN = "euclidean"
MANHATTAN = "manhattan"{ .api }
Usage Examples:
# Use with SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2', similarity_fn_name=SimilarityFunction.COSINE)
# Manual similarity computation
import torch
import torch.nn.functional as F
def compute_similarity(embeddings1, embeddings2, similarity_fn):
"""Compute similarity between two sets of embeddings."""
if similarity_fn == SimilarityFunction.COSINE:
return F.cosine_similarity(embeddings1, embeddings2, dim=-1)
elif similarity_fn == SimilarityFunction.DOT_PRODUCT:
return torch.sum(embeddings1 * embeddings2, dim=-1)
elif similarity_fn == SimilarityFunction.EUCLIDEAN:
return -torch.cdist(embeddings1, embeddings2, p=2)
elif similarity_fn == SimilarityFunction.MANHATTAN:
return -torch.cdist(embeddings1, embeddings2, p=1)
# Example usage
emb1 = model.encode(["First sentence"])
emb2 = model.encode(["Second sentence"])
for sim_fn in SimilarityFunction:
if sim_fn != SimilarityFunction.DOT: # Skip alias
sim_score = compute_similarity(
torch.tensor(emb1),
torch.tensor(emb2),
sim_fn
)
print(f"{sim_fn.value}: {sim_score.item():.4f}")
class DefaultBatchSampler:
def __init__(
self,
dataset: Dataset,
batch_size: int,
drop_last: bool = False,
generator: torch.Generator | None = None
){ .api }
Standard batch sampler for single dataset training.
class MultiDatasetDefaultBatchSampler:
def __init__(
self,
datasets: dict[str, Dataset],
batch_sizes: dict[str, int] | int,
sampling_strategy: str = "proportional",
generator: torch.Generator | None = None
){ .api }
Batch sampler for multi-dataset training with different sampling strategies.
Parameters:
datasets: Dictionary of dataset names to Dataset objects
batch_sizes: Batch size per dataset or single batch size
sampling_strategy: "proportional" or "round_robin"
generator: Random generator for reproducibility
Usage Examples:
from sentence_transformers import DefaultBatchSampler, MultiDatasetDefaultBatchSampler
from datasets import Dataset
# Single dataset sampler
dataset = Dataset.from_list([{"text": f"Example {i}"} for i in range(1000)])
sampler = DefaultBatchSampler(
dataset=dataset,
batch_size=32,
drop_last=True
)
# Multi-dataset sampler
dataset1 = Dataset.from_list([{"text": f"Dataset1 {i}"} for i in range(500)])
dataset2 = Dataset.from_list([{"text": f"Dataset2 {i}"} for i in range(300)])
multi_sampler = MultiDatasetDefaultBatchSampler(
datasets={"ds1": dataset1, "ds2": dataset2},
batch_sizes={"ds1": 32, "ds2": 16},
sampling_strategy="proportional"
)
# Use in training
from sentence_transformers import SentenceTransformerTrainer
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset={"ds1": dataset1, "ds2": dataset2},
# Sampler is automatically configured based on datasets
)
The sentence_transformers.models module provides modular components for building custom architectures:
from sentence_transformers.models import (
Transformer, # BERT, RoBERTa, etc.
Pooling, # Mean, max, CLS pooling
Dense, # Linear transformation
Normalize # L2 normalization
)
Usage Examples:
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Transformer, Pooling, Dense, Normalize
# Build custom model architecture
transformer = Transformer('distilbert-base-uncased', max_seq_length=256)
pooling = Pooling(
word_embedding_dimension=transformer.get_word_embedding_dimension(),
pooling_mode='mean'
)
dense = Dense(
in_features=pooling.get_sentence_embedding_dimension(),
out_features=256,
activation_function='tanh'
)
normalize = Normalize()
# Combine components
custom_model = SentenceTransformer(modules=[transformer, pooling, dense, normalize])
# Use custom model
embeddings = custom_model.encode(["Custom architecture example"])
print(f"Custom embedding shape: {embeddings.shape}")
from sentence_transformers.models import (
CNN, # Convolutional layers
LSTM, # LSTM layers
BoW, # Bag of words
WordEmbeddings, # Word embeddings layer
WordWeights, # TF-IDF weighting
StaticEmbedding, # Static embeddings (Word2Vec, GloVe)
WeightedLayerPooling, # Weighted pooling across layers
CLIPModel, # CLIP integration
Router, # Multi-encoder routing
Dropout, # Dropout layer
LayerNorm # Layer normalization
)
def create_memory_efficient_model(base_model_name, target_dim=256):
"""Create memory-efficient model with reduced dimensions."""
from sentence_transformers.models import Transformer, Pooling, Dense, Normalize
transformer = Transformer(base_model_name, max_seq_length=256)
pooling = Pooling(transformer.get_word_embedding_dimension(), pooling_mode='mean')
# Add dimension reduction for memory efficiency
dense = Dense(
in_features=pooling.get_sentence_embedding_dimension(),
out_features=target_dim,
activation_function='tanh'
)
normalize = Normalize()
return SentenceTransformer(modules=[transformer, pooling, dense, normalize])
# Create efficient model
efficient_model = create_memory_efficient_model('bert-base-uncased', target_dim=128)
def optimize_for_inference(model, sentences, batch_size=64):
"""Optimized inference with batching and no gradients."""
import torch
model.eval() # Set to evaluation mode
embeddings = []
with torch.no_grad(): # Disable gradient computation
for i in range(0, len(sentences), batch_size):
batch = sentences[i:i + batch_size]
batch_embeddings = model.encode(
batch,
batch_size=len(batch),
show_progress_bar=False,
convert_to_tensor=False,
normalize_embeddings=True # For cosine similarity
)
embeddings.extend(batch_embeddings)
return embeddings
# Optimized inference
sentences = [f"Sentence {i}" for i in range(1000)]
fast_embeddings = optimize_for_inference(model, sentences)
from sentence_transformers import LoggingHandler
import logging
class LoggingHandler(logging.Handler):
def emit(self, record: logging.LogRecord) -> None:
"""Emit log record without interfering with tqdm progress bars."""
pass{ .api }
Custom logging handler that works seamlessly with tqdm progress bars.
Usage Examples:
import logging
from sentence_transformers import LoggingHandler
# Set up logging
logging.basicConfig(
format='%(asctime)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
level=logging.INFO,
handlers=[LoggingHandler()]
)
logger = logging.getLogger(__name__)
# Use with training
def train_with_logging(model, trainer):
logger.info("Starting training...")
trainer.train()
logger.info("Training completed!")
logger.info(f"Model saved to {trainer.args.output_dir}")
# Note: These are deprecated in favor of HuggingFace Datasets
from sentence_transformers.datasets import SentencesDataset, ParallelSentencesDataset
from sentence_transformers.readers import InputExample
def create_training_dataset(examples, format_type="triplet"):
"""Create training dataset in various formats."""
from datasets import Dataset
if format_type == "triplet":
# Format: anchor, positive, negative
formatted_examples = [
{
"anchor": ex["anchor"],
"positive": ex["positive"],
"negative": ex["negative"]
}
for ex in examples
]
elif format_type == "pairs":
# Format: sentence1, sentence2, label
formatted_examples = [
{
"sentence1": ex["sentence1"],
"sentence2": ex["sentence2"],
"label": ex["label"]
}
for ex in examples
]
return Dataset.from_list(formatted_examples)
# Example usage
examples = [
{
"anchor": "Python programming",
"positive": "Coding in Python",
"negative": "Java development"
}
]
dataset = create_training_dataset(examples, format_type="triplet")
def analyze_model_performance(model, test_sentences):
"""Analyze model performance characteristics."""
import time
import numpy as np
# Encoding speed test
start_time = time.time()
embeddings = model.encode(test_sentences, batch_size=32)
encoding_time = time.time() - start_time
# Embedding analysis
embedding_dim = embeddings.shape[1]
embedding_norms = np.linalg.norm(embeddings, axis=1)
# Similarity analysis
similarities = np.dot(embeddings, embeddings.T)
results = {
"encoding_speed": len(test_sentences) / encoding_time,
"embedding_dimension": embedding_dim,
"avg_embedding_norm": np.mean(embedding_norms),
"std_embedding_norm": np.std(embedding_norms),
"avg_similarity": np.mean(similarities[np.triu_indices_from(similarities, k=1)]),
"similarity_std": np.std(similarities[np.triu_indices_from(similarities, k=1)])
}
return results
# Analyze model
test_texts = ["Sample sentence " + str(i) for i in range(100)]
performance = analyze_model_performance(model, test_texts)
for metric, value in performance.items():
print(f"{metric}: {value:.4f}")
Custom logging handler that integrates with tqdm progress bars for clean output during training and inference.
class LoggingHandler(logging.Handler):
def __init__(self, level=logging.NOTSET) -> None: ...
def emit(self, record) -> None: ...
Usage Example:
import logging
from sentence_transformers import LoggingHandler
# Set up logging with tqdm-compatible handler
logger = logging.getLogger("sentence_transformers")
logger.setLevel(logging.INFO)
logger.addHandler(LoggingHandler())
# Now logging output won't interfere with progress bars
logger.info("Training started")
Default batch sampler used in the SentenceTransformer library, equivalent to PyTorch's BatchSampler with epoch support.
class DefaultBatchSampler(BatchSampler):
def __init__(
self,
sampler,
batch_size: int,
drop_last: bool = False
) -> None: ...
def set_epoch(self, epoch: int) -> None: ...
Batch sampler for training on multiple datasets simultaneously with balanced sampling.
class MultiDatasetDefaultBatchSampler(BatchSampler):
def __init__(
self,
samplers,
batch_sizes: list[int],
drop_last: bool = False
) -> None: ...
def set_epoch(self, epoch: int) -> None: ...
These components are included for backwards compatibility but are deprecated in favor of the modern training framework.
class SentencesDataset:
"""Deprecated: Use SentenceTransformerTrainer instead"""
def __init__(self, examples: list, model) -> None: ...
class ParallelSentencesDataset:
"""Deprecated: Use SentenceTransformerTrainer instead"""
def __init__(self, student_model, teacher_model) -> None: ...
class InputExample:
"""Deprecated: Use standard data formats instead"""
def __init__(
self,
guid: str = "",
texts: list[str] = None,
label: int | float = 0
) -> None: ...
Migration Note: These legacy components exist for compatibility with the old model.fit() training approach. For new projects, use the modern SentenceTransformerTrainer class instead.
Install with Tessl CLI
npx tessl i tessl/pypi-sentence-transformers