Embeddings, Retrieval, and Reranking framework for computing dense and sparse embeddings and cross-encoder (reranker) relevance scores using state-of-the-art transformer models
—
The sentence-transformers package provides a modern training framework built on HuggingFace Trainer, supporting various learning objectives and multi-dataset training for sentence transformer models.
SentenceTransformerTrainer(
model: SentenceTransformer | None = None,
args: SentenceTransformerTrainingArguments | None = None,
train_dataset: Dataset | None = None,
eval_dataset: Dataset | None = None,
tokenizer: PreTrainedTokenizer | None = None,
data_collator: DataCollator | None = None,
compute_metrics: callable | None = None,
callbacks: list[TrainerCallback] | None = None,
optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
preprocess_logits_for_metrics: callable | None = None,
loss: torch.nn.Module | dict[str, torch.nn.Module] | None = None
){ .api }
Modern trainer for sentence transformer models.
Parameters:
model: SentenceTransformer model to train
args: Training configuration arguments
train_dataset: Training dataset(s) - single Dataset or dict of datasets
eval_dataset: Evaluation dataset(s) - single Dataset or dict of datasets
tokenizer: Tokenizer (usually auto-detected from model)
data_collator: Custom data collator for batching
compute_metrics: Function to compute evaluation metrics
callbacks: List of training callbacks
optimizers: Custom optimizer and learning rate scheduler
preprocess_logits_for_metrics: Function to preprocess logits for metrics
loss: Loss function(s) - single loss or dict mapping dataset names to losses
def train(
resume_from_checkpoint: str | bool | None = None,
trial: dict[str, Any] | None = None,
ignore_keys_for_eval: list[str] | None = None,
**kwargs
) -> TrainOutput{ .api }
Train the sentence transformer model.
Parameters:
resume_from_checkpoint: Path to checkpoint or True to resume from latest
trial: Hyperparameter optimization trial object
ignore_keys_for_eval: Keys to ignore during evaluation
**kwargs: Additional arguments passed to base trainer
Returns: Training output with logs and metrics
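A minimal usage sketch, assuming a trainer constructed as in the examples further down this page; the returned TrainOutput (from transformers) exposes the global step, final training loss, and aggregated metrics:
# Fresh training run
train_output = trainer.train()
print(train_output.global_step)    # number of optimization steps performed
print(train_output.training_loss)  # average training loss
print(train_output.metrics)        # runtime, samples per second, etc.
# Resume from the most recent checkpoint in args.output_dir
# train_output = trainer.train(resume_from_checkpoint=True)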
def evaluate(
eval_dataset: Dataset | None = None,
ignore_keys: list[str] | None = None,
metric_key_prefix: str = "eval"
) -> dict[str, float]{ .api }
Evaluate the model on evaluation dataset(s).
def predict(
test_dataset: Dataset,
ignore_keys: list[str] | None = None,
metric_key_prefix: str = "test"
) -> PredictionOutput{ .api }
Make predictions on test dataset.
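A short sketch of evaluate() and predict(), assuming the trainer was given an eval_dataset and that test_dataset is a hypothetical Dataset with the same columns as the training data:
# Evaluate on the trainer's eval_dataset (or pass one explicitly)
eval_metrics = trainer.evaluate()
print(eval_metrics)  # e.g. {"eval_loss": ..., "eval_runtime": ...}
# Run the prediction loop on a held-out dataset
prediction_output = trainer.predict(test_dataset)
print(prediction_output.metrics)  # metrics prefixed with "test_"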
def add_dataset(
train_dataset: Dataset,
eval_dataset: Dataset | None = None,
dataset_name: str | None = None,
loss: torch.nn.Module | None = None
) -> None{ .api }
Add additional dataset to multi-dataset training setup.
Parameters:
train_dataset: Training dataset to add
eval_dataset: Optional evaluation dataset
dataset_name: Name for the dataset (auto-generated if None)
loss: Specific loss function for this dataset
class SentenceTransformerTrainingArguments(TrainingArguments):
def __init__(
self,
output_dir: str,
evaluation_strategy: str | IntervalStrategy = "no",
eval_steps: int | None = None,
eval_delay: float = 0,
logging_dir: str | None = None,
logging_strategy: str | IntervalStrategy = "steps",
logging_steps: int = 500,
save_strategy: str | IntervalStrategy = "steps",
save_steps: int = 500,
save_total_limit: int | None = None,
seed: int = 42,
data_seed: int | None = None,
jit_mode_eval: bool = False,
use_ipex: bool = False,
bf16: bool = False,
fp16: bool = False,
fp16_opt_level: str = "O1",
half_precision_backend: str = "auto",
bf16_full_eval: bool = False,
fp16_full_eval: bool = False,
tf32: bool | None = None,
local_rank: int = -1,
ddp_backend: str | None = None,
tpu_num_cores: int | None = None,
tpu_metrics_debug: bool = False,
debug: str | list[DebugOption] = "",
dataloader_drop_last: bool = False,
dataloader_num_workers: int = 0,
past_index: int = -1,
run_name: str | None = None,
disable_tqdm: bool | None = None,
remove_unused_columns: bool = True,
label_names: list[str] | None = None,
load_best_model_at_end: bool = False,
ignore_data_skip: bool = False,
fsdp: str | list[str] = "",
fsdp_min_num_params: int = 0,
fsdp_config: dict[str, Any] | None = None,
fsdp_transformer_layer_cls_to_wrap: str | None = None,
deepspeed: str | None = None,
label_smoothing_factor: float = 0.0,
optim: str | OptimizerNames = "adamw_torch",
optim_args: str | None = None,
adafactor: bool = False,
group_by_length: bool = False,
length_column_name: str | None = "length",
report_to: str | list[str] | None = None,
ddp_find_unused_parameters: bool | None = None,
ddp_bucket_cap_mb: int | None = None,
ddp_broadcast_buffers: bool | None = None,
dataloader_pin_memory: bool = True,
skip_memory_metrics: bool = True,
use_legacy_prediction_loop: bool = False,
push_to_hub: bool = False,
resume_from_checkpoint: str | None = None,
hub_model_id: str | None = None,
hub_strategy: str | HubStrategy = "every_save",
hub_token: str | None = None,
hub_private_repo: bool = False,
hub_always_push: bool = False,
gradient_checkpointing: bool = False,
include_inputs_for_metrics: bool = False,
auto_find_batch_size: bool = False,
full_determinism: bool = False,
torchdynamo: str | None = None,
ray_scope: str | None = "last",
ddp_timeout: int = 1800,
torch_compile: bool = False,
torch_compile_backend: str | None = None,
torch_compile_mode: str | None = None,
dispatch_batches: bool | None = None,
split_batches: bool | None = None,
include_tokens_per_second: bool = False,
# Sentence Transformers specific arguments
batch_sampler: str = "batch_sampler",
multi_dataset_batch_sampler: str = "proportional",
**kwargs
){ .api }
Training arguments extending HuggingFace TrainingArguments with sentence transformer specific options.
Key Sentence Transformer Parameters:
batch_sampler: Strategy for sampling batches from datasets
multi_dataset_batch_sampler: Strategy for multi-dataset batch sampling ("proportional", "round_robin")
class SentenceTransformerModelCardData:
def __init__(
self,
language: str | list[str] | None = None,
license: str | None = None,
tags: str | list[str] | None = None,
model_name: str | None = None,
model_id: str | None = None,
eval_results: list[EvalResult] | None = None,
train_datasets: str | list[str] | None = None,
eval_datasets: str | list[str] | None = None,
prior_models: str | list[str] | None = None,
base_model: str | None = None,
similarity_fn_name: str | None = None,
model_max_length: int | None = None
){ .api }
Data class for generating comprehensive model cards for sentence transformer models.
Parameters:
language: Supported language(s)
license: Model license
tags: Categorization tags
model_name: Human-readable model name
model_id: Unique model identifier
eval_results: Evaluation results and benchmarks
train_datasets: Datasets used for training
eval_datasets: Datasets used for evaluation
prior_models: Models used as starting points
base_model: Base transformer model
similarity_fn_name: Default similarity function
model_max_length: Maximum input length
class DefaultBatchSampler:
def __init__(
self,
dataset: Dataset,
batch_size: int,
drop_last: bool = False,
generator: torch.Generator | None = None
){ .api }
Standard batch sampler for single dataset training.
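These samplers are usually not constructed by hand; they are selected through the sentence-transformer-specific training arguments. A minimal sketch, assuming the BatchSamplers and MultiDatasetBatchSamplers enums from sentence_transformers.training_args (the plain strings "no_duplicates" and "round_robin" are also accepted):
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers, MultiDatasetBatchSamplers

args = SentenceTransformerTrainingArguments(
    output_dir="./sampler-demo",
    per_device_train_batch_size=32,
    # Avoid duplicate texts within a batch (useful for in-batch negatives losses)
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # Draw batches from each dataset in turn instead of proportionally to dataset size
    multi_dataset_batch_sampler=MultiDatasetBatchSamplers.ROUND_ROBIN,
)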
class MultiDatasetDefaultBatchSampler:
def __init__(
self,
datasets: dict[str, Dataset],
batch_sizes: dict[str, int] | int,
sampling_strategy: str = "proportional",
generator: torch.Generator | None = None
){ .api }
Abstract base class for multi-dataset batch sampling.
Parameters:
datasets: Dictionary mapping dataset names to Dataset objects
batch_sizes: Batch sizes per dataset or single batch size for all
sampling_strategy: How to sample from multiple datasets ("proportional", "round_robin")
generator: Random number generator for reproducibility
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss
from datasets import Dataset
# Prepare training data
train_data = [
{"anchor": "The cat sits on the mat", "positive": "A feline rests on a rug"},
{"anchor": "Python is a programming language", "positive": "Python is used for coding"},
{"anchor": "Machine learning uses data", "positive": "ML algorithms process datasets"}
]
train_dataset = Dataset.from_list(train_data)
# Small held-out split so that evaluation_strategy="steps" and load_best_model_at_end work
eval_dataset = Dataset.from_list([
    {"anchor": "The sun is a star", "positive": "Our sun is classified as a star"},
    {"anchor": "Birds build nests", "positive": "Nests are built by birds"}
])
# Initialize model
model = SentenceTransformer('distilbert-base-uncased')
# Define loss function
loss = MultipleNegativesRankingLoss(model)
# Training arguments
args = SentenceTransformerTrainingArguments(
output_dir='./sentence-transformer-training',
num_train_epochs=3,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
learning_rate=2e-5,
warmup_ratio=0.1,
logging_steps=10,
logging_dir='./logs',
evaluation_strategy="steps",
eval_steps=100,
save_steps=100,
save_total_limit=2,
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
greater_is_better=False,
run_name="sentence-transformer-training"
)
# Create trainer
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss
)
# Train model
trainer.train()
# Save trained model
model.save('./trained-sentence-transformer')
from sentence_transformers.losses import CosineSimilarityLoss, TripletLoss
# Prepare multiple datasets
dataset1 = Dataset.from_list([
{"sentence1": "The cat sits", "sentence2": "A cat is sitting", "label": 1.0},
{"sentence1": "Dogs are pets", "sentence2": "Cats are animals", "label": 0.3}
])
dataset2 = Dataset.from_list([
{"anchor": "Python programming", "positive": "Coding in Python", "negative": "Java development"},
{"anchor": "Machine learning", "positive": "AI algorithms", "negative": "Web design"}
])
# Define different losses for different datasets
loss1 = CosineSimilarityLoss(model)
loss2 = TripletLoss(model)
# Multi-dataset training
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset={"similarity": dataset1, "triplet": dataset2},
loss={"similarity": loss1, "triplet": loss2}
)
trainer.train()
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import numpy as np
# Prepare evaluation data
eval_sentences1 = ["The cat sits on the mat", "I love programming"]
eval_sentences2 = ["A feline rests on a rug", "I enjoy coding"]
eval_scores = [0.9, 0.8] # Similarity scores
evaluator = EmbeddingSimilarityEvaluator(
eval_sentences1,
eval_sentences2,
eval_scores,
name="dev"
)
def compute_metrics(eval_pred):
    """Custom metrics computation."""
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids
    # Compute custom metrics
    mse = np.mean((predictions - labels) ** 2)
    return {"mse": mse}
# Enhanced training arguments
args = SentenceTransformerTrainingArguments(
output_dir='./advanced-training',
num_train_epochs=5,
per_device_train_batch_size=32,
gradient_accumulation_steps=2,
learning_rate=2e-5,
weight_decay=0.01,
warmup_ratio=0.1,
lr_scheduler_type="cosine",
logging_steps=10,
evaluation_strategy="epoch",
save_strategy="epoch",
save_total_limit=3,
load_best_model_at_end=True,
metric_for_best_model="eval_cosine_pearson",
greater_is_better=True,
push_to_hub=False,
report_to=["tensorboard"],
run_name="advanced-sentence-transformer"
)
# The evaluator is passed directly to the trainer; its similarity metrics are
# logged during evaluation alongside the eval loss
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,  # use a proper held-out split in practice
    loss=loss,
    evaluator=evaluator,
    compute_metrics=compute_metrics
)
# Train with evaluation
trainer.train()
from sentence_transformers.losses import MatryoshkaLoss, TripletLoss
# Matryoshka representation learning
base_loss = MultipleNegativesRankingLoss(model)
matryoshka_loss = MatryoshkaLoss(
model=model,
loss=base_loss,
matryoshka_dims=[768, 512, 256, 128, 64] # Progressive dimensions
)
# Training with progressive dimensionality
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset=train_dataset,
loss=matryoshka_loss
)
trainer.train()
# The trained model can now produce embeddings at multiple dimensions
embeddings_768 = model.encode(["Test sentence"], truncate_dim=768)
embeddings_256 = model.encode(["Test sentence"], truncate_dim=256)
embeddings_64 = model.encode(["Test sentence"], truncate_dim=64)
# For multi-GPU training
args = SentenceTransformerTrainingArguments(
output_dir='./distributed-training',
num_train_epochs=3,
per_device_train_batch_size=16, # Per GPU batch size
gradient_accumulation_steps=4, # Effective batch size: 16 * 4 * num_gpus
dataloader_num_workers=4,
ddp_find_unused_parameters=False,
fp16=True, # Mixed precision training
logging_steps=10,
save_steps=500,
evaluation_strategy="steps",
eval_steps=500,
warmup_ratio=0.1,
learning_rate=2e-5,
run_name="distributed-training"
)
# The trainer automatically handles multi-GPU setup
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset=train_dataset,
loss=loss
)
# Launch with: torchrun --nproc_per_node=2 train_script.py
trainer.train()
from transformers import TrainerCallback
from ray import tune
def model_init():
    """Initialize model for hyperparameter search."""
    return SentenceTransformer('distilbert-base-uncased')

def hp_space(trial):
    """Define hyperparameter search space."""
    return {
        "learning_rate": tune.loguniform(1e-6, 1e-4),
        "per_device_train_batch_size": tune.choice([16, 32, 64]),
        "warmup_ratio": tune.uniform(0.0, 0.3),
        "weight_decay": tune.uniform(0.0, 0.3),
    }
# Hyperparameter search
trainer = SentenceTransformerTrainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,  # use a held-out split in practice
    loss=MultipleNegativesRankingLoss,  # a class/callable is instantiated with each new model
    compute_metrics=compute_metrics
)
best_trial = trainer.hyperparameter_search(
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_loss"],
    n_trials=10,
    direction="minimize",
    backend="ray"  # the search space above uses Ray Tune distributions
)
print(f"Best hyperparameters: {best_trial.hyperparameters}")from transformers import DataCollatorWithPadding
from typing import Dict, List, Any
import torch
class CustomDataCollator:
    """Custom data collator for sentence transformer training."""
    def __init__(self, tokenizer, padding=True, max_length=None):
        self.tokenizer = tokenizer
        self.padding = padding
        self.max_length = max_length

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        # Extract texts from each training example
        texts = []
        for feature in features:
            if 'anchor' in feature and 'positive' in feature:
                texts.extend([feature['anchor'], feature['positive']])
                if 'negative' in feature:
                    texts.append(feature['negative'])
            elif 'sentence1' in feature and 'sentence2' in feature:
                texts.extend([feature['sentence1'], feature['sentence2']])
        # Tokenize the collected texts into a single padded batch
        tokenized = self.tokenizer(
            texts,
            padding=self.padding,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return tokenized
# Use custom data collator
custom_collator = CustomDataCollator(model.tokenizer, max_length=512)
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset=train_dataset,
data_collator=custom_collator,
loss=loss
)
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl
import wandb
class CustomCallback(TrainerCallback):
    """Custom training callback for monitoring."""
    def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        print("Training started!")

    def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        # Custom logic at epoch end
        model = kwargs.get('model')
        if model:
            # Evaluate on custom data or log additional metrics
            pass

    def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        print(f"Model saved at step {state.global_step}")
# Add callbacks to trainer
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset=train_dataset,
loss=loss,
callbacks=[CustomCallback()]
)
from sentence_transformers import SentenceTransformerModelCardData
# Create comprehensive model card
model_card_data = SentenceTransformerModelCardData(
language=['en'],
license='apache-2.0',
tags=['sentence-transformers', 'sentence-similarity', 'embeddings'],
model_name='Custom Sentence Transformer',
base_model='distilbert-base-uncased',
train_datasets=['custom-similarity-dataset'],
eval_datasets=['sts-benchmark'],
similarity_fn_name='cosine',
model_max_length=512
)
# Attach the card data to the model; it is normally passed as
# model_card_data= to the SentenceTransformer constructor before training
model.model_card_data = model_card_data
# Save model with card
trainer.save_model('./final-model')
model.save('./final-model')
# Push to hub with model card
model.push_to_hub('my-username/my-sentence-transformer')
Install with Tessl CLI
npx tessl i tessl/pypi-sentence-transformers