Embeddings, Retrieval, and Reranking framework for computing dense, sparse, and cross-encoder embeddings using state-of-the-art transformer models
—
Sparse encoders generate sparse embeddings that combine the efficiency of traditional sparse retrieval methods (like BM25) with neural approaches, providing efficient storage and fast retrieval for large-scale systems.
SparseEncoder(
model_name_or_path: str | None = None,
modules: list[torch.nn.Module] | None = None,
device: str | None = None,
prompts: dict[str, str] | None = None,
default_prompt_name: str | None = None,
similarity_fn_name: str | SimilarityFunction | None = None,
cache_folder: str | None = None,
trust_remote_code: bool = False,
revision: str | None = None,
local_files_only: bool = False,
token: str | bool | None = None,
max_active_dims: int | None = None,
model_kwargs: dict[str, Any] | None = None
){ .api }
Initialize a SparseEncoder model for generating sparse embeddings.
Parameters:
model_name_or_path: Pre-trained model name or path
modules: List of PyTorch modules for custom architecture
device: Device to run the model on ("cuda", "cpu", "mps", "npu")
prompts: Dictionary of prompts for different contexts (e.g., {"query": "query: ", "passage": "passage: "})
default_prompt_name: Default prompt to use if prompts are provided
similarity_fn_name: Similarity function name ("cosine", "dot", "euclidean", "manhattan") or SimilarityFunction
cache_folder: Custom cache directory for models
trust_remote_code: Allow custom code execution from HuggingFace Hub
revision: Model revision/branch/tag to load
local_files_only: Use only cached files, don't download
token: HuggingFace authentication token
max_active_dims: Maximum number of active (non-zero) dimensions in output embeddings
model_kwargs: Additional model arguments (torch_dtype, attn_implementation, etc.)
def encode(
sentences: list[str] | str,
batch_size: int = 32,
show_progress_bar: bool | None = None,
convert_to_numpy: bool = True,
convert_to_tensor: bool = False,
device: str | None = None
) -> list[dict[str, Any]] | dict[str, Any]{ .api }
Encode sentences into sparse embeddings.
Parameters:
sentences: Input text(s) to encode
batch_size: Batch size for processing
show_progress_bar: Display progress bar
convert_to_numpy: Return numpy arrays
convert_to_tensor: Return PyTorch tensors
device: Device for computation
Returns: Sparse embeddings as dictionaries with indices and values
def encode_queries(
queries: list[str] | str,
**kwargs
) -> list[dict[str, Any]] | dict[str, Any]{ .api }
Encode queries with query-specific processing.
def encode_corpus(
corpus: list[str] | str,
**kwargs
) -> list[dict[str, Any]] | dict[str, Any]{ .api }
Encode corpus documents with document-specific processing.
def get_sentence_embedding_dimension() -> int{ .api }
Get the vocabulary size (sparse embedding dimension).
def get_max_seq_length() -> int{ .api }
Get maximum sequence length the model can handle.
def tokenize(
texts: list[str] | str,
**kwargs
) -> dict[str, torch.Tensor]{ .api }
Tokenize input texts using the model's tokenizer.
def save(
path: str,
model_name: str | None = None,
create_model_card: bool = True,
train_datasets: list[str] | None = None,
safe_serialization: bool = True
) -> None{ .api }
Save the sparse encoder model to a directory.
def save_pretrained(
save_directory: str,
**kwargs
) -> None{ .api }
Save using HuggingFace format.
def save_to_hub(
repo_id: str,
**kwargs
) -> None{ .api }
Save and push to HuggingFace Hub.
def push_to_hub(
repo_id: str,
**kwargs
) -> None{ .api }
Push existing model to HuggingFace Hub.
def evaluate(
evaluator: SentenceEvaluator,
output_path: str | None = None
) -> float | dict[str, float]{ .api }
Evaluate the model using provided evaluator.
@property
def device() -> torch.device{ .api }
Current device of the model.
@property
def tokenizer() -> PreTrainedTokenizer{ .api }
Access to the model's tokenizer.
@property
def max_seq_length() -> int{ .api }
Maximum sequence length.
SparseEncoderTrainer(
model: SparseEncoder | None = None,
args: SparseEncoderTrainingArguments | None = None,
train_dataset: Dataset | None = None,
eval_dataset: Dataset | None = None,
tokenizer: PreTrainedTokenizer | None = None,
data_collator: DataCollator | None = None,
compute_metrics: callable | None = None,
callbacks: list[TrainerCallback] | None = None,
optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
preprocess_logits_for_metrics: callable | None = None
){ .api }
Trainer for sparse encoder models.
Parameters:
model: SparseEncoder model to train
args: Training arguments
train_dataset: Training dataset
eval_dataset: Evaluation dataset
tokenizer: Tokenizer (auto-detected from model)
data_collator: Data collator for batching
compute_metrics: Metrics computation function
callbacks: Training callbacks
optimizers: Custom optimizer and scheduler
preprocess_logits_for_metrics: Logits preprocessing
def train(
resume_from_checkpoint: str | bool | None = None,
trial: dict[str, Any] | None = None,
ignore_keys_for_eval: list[str] | None = None,
**kwargs
) -> TrainOutput{ .api }
Train the sparse encoder model.
def evaluate(
eval_dataset: Dataset | None = None,
ignore_keys: list[str] | None = None,
metric_key_prefix: str = "eval"
) -> dict[str, float]{ .api }
Evaluate model performance.
class SparseEncoderTrainingArguments(TrainingArguments):
def __init__(
self,
output_dir: str,
evaluation_strategy: str | IntervalStrategy = "no",
eval_steps: int | None = None,
eval_delay: float = 0,
logging_dir: str | None = None,
logging_strategy: str | IntervalStrategy = "steps",
logging_steps: int = 500,
save_strategy: str | IntervalStrategy = "steps",
save_steps: int = 500,
save_total_limit: int | None = None,
seed: int = 42,
data_seed: int | None = None,
jit_mode_eval: bool = False,
use_ipex: bool = False,
bf16: bool = False,
fp16: bool = False,
fp16_opt_level: str = "O1",
half_precision_backend: str = "auto",
bf16_full_eval: bool = False,
fp16_full_eval: bool = False,
tf32: bool | None = None,
local_rank: int = -1,
ddp_backend: str | None = None,
tpu_num_cores: int | None = None,
tpu_metrics_debug: bool = False,
debug: str | list[DebugOption] = "",
dataloader_drop_last: bool = False,
dataloader_num_workers: int = 0,
past_index: int = -1,
run_name: str | None = None,
disable_tqdm: bool | None = None,
remove_unused_columns: bool = True,
label_names: list[str] | None = None,
load_best_model_at_end: bool = False,
ignore_data_skip: bool = False,
fsdp: str | list[str] = "",
fsdp_min_num_params: int = 0,
fsdp_config: dict[str, Any] | None = None,
fsdp_transformer_layer_cls_to_wrap: str | None = None,
deepspeed: str | None = None,
label_smoothing_factor: float = 0.0,
optim: str | OptimizerNames = "adamw_torch",
optim_args: str | None = None,
adafactor: bool = False,
group_by_length: bool = False,
length_column_name: str | None = "length",
report_to: str | list[str] | None = None,
ddp_find_unused_parameters: bool | None = None,
ddp_bucket_cap_mb: int | None = None,
ddp_broadcast_buffers: bool | None = None,
dataloader_pin_memory: bool = True,
skip_memory_metrics: bool = True,
use_legacy_prediction_loop: bool = False,
push_to_hub: bool = False,
resume_from_checkpoint: str | None = None,
hub_model_id: str | None = None,
hub_strategy: str | HubStrategy = "every_save",
hub_token: str | None = None,
hub_private_repo: bool = False,
hub_always_push: bool = False,
gradient_checkpointing: bool = False,
include_inputs_for_metrics: bool = False,
auto_find_batch_size: bool = False,
full_determinism: bool = False,
torchdynamo: str | None = None,
ray_scope: str | None = "last",
ddp_timeout: int = 1800,
torch_compile: bool = False,
torch_compile_backend: str | None = None,
torch_compile_mode: str | None = None,
dispatch_batches: bool | None = None,
split_batches: bool | None = None,
include_tokens_per_second: bool = False,
**kwargs
){ .api }
Training arguments for sparse encoder training.
class SparseEncoderModelCardData:
def __init__(
self,
language: str | list[str] | None = None,
license: str | None = None,
tags: str | list[str] | None = None,
model_name: str | None = None,
model_id: str | None = None,
eval_results: list[EvalResult] | None = None,
train_datasets: str | list[str] | None = None,
eval_datasets: str | list[str] | None = None
){ .api }
Data class for generating model cards for sparse encoder models.
Parameters:
language: Language(s) supported
license: Model license
tags: Categorization tags
model_name: Human-readable name
model_id: Model identifier
eval_results: Evaluation results
train_datasets: Training datasets used
eval_datasets: Evaluation datasets used
from sentence_transformers import SparseEncoder
# Load a sparse encoder model
sparse_model = SparseEncoder('naver/splade-cocondenser-ensembledistil')
# Encode sentences to sparse embeddings
sentences = [
"Machine learning is transforming technology",
"Artificial intelligence applications are growing",
"Data science requires statistical knowledge"
]
# Get sparse embeddings
sparse_embeddings = sparse_model.encode(sentences)
# Each embedding is a dictionary with 'indices' and 'values'
for i, embedding in enumerate(sparse_embeddings):
    print(f"Sentence {i}:")
    print(f"  Active dimensions: {len(embedding['indices'])}")
    print(f"  Sparsity: {len(embedding['indices']) / sparse_model.get_sentence_embedding_dimension():.4f}")
    print(f"  Max value: {max(embedding['values']):.4f}")
    print()
# For retrieval tasks with different query/document processing
queries = [
"What is machine learning?",
"How does neural networks work?"
]
documents = [
"Machine learning is a subset of artificial intelligence that focuses on algorithms",
"Neural networks are computational models inspired by biological neural networks",
"Data preprocessing is crucial for machine learning success",
"Deep learning uses multiple layers to model complex patterns"
]
# Encode queries and documents separately
query_embeddings = sparse_model.encode_queries(queries)
doc_embeddings = sparse_model.encode_corpus(documents)
print("Query embeddings:")
for i, emb in enumerate(query_embeddings):
    print(f"  Query {i}: {len(emb['indices'])} active dimensions")
print("Document embeddings:")
for i, emb in enumerate(doc_embeddings):
    print(f"  Document {i}: {len(emb['indices'])} active dimensions")
import numpy as np
from collections import Counter
def sparse_dot_product(emb1, emb2):
    """Return the dot product of two sparse embeddings.

    Each embedding is a dict with parallel 'indices' and 'values' sequences;
    only dimensions active in both embeddings contribute to the product.
    """
    # Index -> value maps give O(1) lookup per shared dimension.
    left = dict(zip(emb1['indices'], emb1['values']))
    right = dict(zip(emb2['indices'], emb2['values']))
    shared = left.keys() & right.keys()
    return sum(left[dim] * right[dim] for dim in shared)
def sparse_cosine_similarity(emb1, emb2):
    """Return cosine similarity of two sparse embeddings (0.0 if either norm is zero)."""
    numerator = sparse_dot_product(emb1, emb2)
    norm_left = np.sqrt(sum(v * v for v in emb1['values']))
    norm_right = np.sqrt(sum(v * v for v in emb2['values']))
    denom = norm_left * norm_right
    # Guard against division by zero for embeddings with no active values.
    return numerator / denom if denom > 0 else 0.0
# Example usage
query_emb = query_embeddings[0]
similarities = []
for doc_emb in doc_embeddings:
    sim = sparse_cosine_similarity(query_emb, doc_emb)
    similarities.append(sim)
print("Similarity scores:")
for i, sim in enumerate(similarities):
    print(f"  Query 0 - Document {i}: {sim:.4f}")
from sentence_transformers import SparseEncoder, SparseEncoderTrainer, SparseEncoderTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss
from datasets import Dataset
# Create training dataset
train_data = [
{"query": "python programming", "positive": "Python is a programming language", "negative": "Cats are pets"},
{"query": "machine learning", "positive": "ML algorithms learn patterns", "negative": "Cooking recipes vary"},
{"query": "data science", "positive": "Data analysis and statistics", "negative": "Weather forecast"}
]
# Convert to dataset format expected by trainer
def prepare_dataset(data):
    """Convert a list of {query, positive, negative} dicts into a HuggingFace Dataset.

    Builds one column per field, preserving the row order of *data*.
    """
    columns = {
        field: [row[field] for row in data]
        for field in ("query", "positive", "negative")
    }
    return Dataset.from_dict(columns)
train_dataset = prepare_dataset(train_data)
# Initialize sparse encoder model
model = SparseEncoder('distilbert-base-uncased')
# Training arguments
args = SparseEncoderTrainingArguments(
output_dir='./sparse-encoder-output',
num_train_epochs=3,
per_device_train_batch_size=16,
logging_steps=10,
save_steps=100,
evaluation_strategy="steps",
eval_steps=100,
save_total_limit=2,
load_best_model_at_end=True,
)
# Create trainer
trainer = SparseEncoderTrainer(
model=model,
args=args,
train_dataset=train_dataset,
)
# Train the model
trainer.train()
# Save trained model
model.save('./my-sparse-encoder')
from sentence_transformers.models import Transformer, SparseLinear
from sentence_transformers import SparseEncoder
# Create custom sparse encoder architecture
transformer = Transformer('distilbert-base-uncased')
sparse_linear = SparseLinear(
transformer.get_word_embedding_dimension(),
vocab_size=30522, # BERT vocabulary size
activation='relu'
)
# Combine modules
sparse_model = SparseEncoder(modules=[transformer, sparse_linear])
# Use the custom model
embeddings = sparse_model.encode(["Custom sparse encoder example"])

def analyze_sparsity(embeddings, vocab_size=None):
    """Print summary statistics for one or more sparse embeddings.

    Parameters:
        embeddings: A single sparse embedding dict ({'indices': ..., 'values': ...})
            or a list of such dicts.
        vocab_size: Optional total embedding dimensionality; when given, the
            average fraction of active dimensions is reported as well.
    """
    if not isinstance(embeddings, list):
        embeddings = [embeddings]
    # Guard: min()/max()/np.mean() below would raise on empty input.
    if not embeddings:
        print("No embeddings to analyze.")
        return
    total_active = [len(emb['indices']) for emb in embeddings]
    total_values = [v for emb in embeddings for v in emb['values']]
    if vocab_size:
        avg_sparsity = sum(total_active) / (len(embeddings) * vocab_size)
        print(f"Average sparsity: {avg_sparsity:.6f}")
    print(f"Average active dimensions: {np.mean(total_active):.1f}")
    print(f"Min/Max active dimensions: {min(total_active)}/{max(total_active)}")
    # Guard: embeddings with no active values would make min()/max() raise.
    if total_values:
        print(f"Average value: {np.mean(total_values):.4f}")
        print(f"Value range: {min(total_values):.4f} to {max(total_values):.4f}")
    else:
        print("No active values found.")
# Analyze encodings
analyze_sparsity(sparse_embeddings, vocab_size=sparse_model.get_sentence_embedding_dimension())
from sentence_transformers import SparseEncoderModelCardData
# Create model card
model_card_data = SparseEncoderModelCardData(
language=['en'],
license='apache-2.0',
tags=['sentence-transformers', 'sparse-encoder', 'retrieval'],
model_name='Custom Sparse Encoder',
train_datasets=['ms-marco'],
eval_datasets=['beir']
)
# Save with model card
sparse_model.save('./my-sparse-model', model_card_data=model_card_data)
# Push to hub
sparse_model.push_to_hub('my-username/my-sparse-encoder')

def sparse_to_compressed(sparse_embedding):
    """Pack a sparse embedding into compact typed numpy arrays for storage.

    Indices become uint32 and values float32, cutting memory versus
    plain Python lists of ints/floats.
    """
    indices = np.asarray(sparse_embedding['indices'], dtype=np.uint32)
    values = np.asarray(sparse_embedding['values'], dtype=np.float32)
    return {'indices': indices, 'values': values}
def compressed_to_sparse(compressed_embedding):
    """Expand a compressed embedding (numpy arrays) back into plain Python lists."""
    return {
        key: compressed_embedding[key].tolist()
        for key in ('indices', 'values')
    }
# Compress embeddings for storage
compressed_embeddings = [sparse_to_compressed(emb) for emb in sparse_embeddings]

def encode_large_corpus(sparse_model, texts, batch_size=1000, save_every=10000):
    """Encode a large corpus in slices, reporting progress periodically.

    Parameters:
        sparse_model: Encoder exposing .encode(batch, ...).
        texts: Full list of documents to encode.
        batch_size: Number of documents handed to encode() per outer slice.
        save_every: Report progress roughly every this many documents.

    Returns: List of sparse embeddings, one per input text.
    """
    all_embeddings = []
    next_report = save_every
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        batch_embeddings = sparse_model.encode(
            batch,
            batch_size=32,
            show_progress_bar=True,
            convert_to_numpy=False
        )
        all_embeddings.extend(batch_embeddings)
        # Threshold-based reporting: unlike `(i + batch_size) % save_every == 0`,
        # this fires even when save_every is not a multiple of batch_size, and
        # it reports the true processed count on the final (partial) batch.
        processed = len(all_embeddings)
        if processed >= next_report:
            print(f"Processed {processed} documents...")
            next_report += save_every
    return all_embeddings
# Example with large dataset
large_corpus = [f"Document {i} with content" for i in range(50000)]
corpus_embeddings = encode_large_corpus(sparse_model, large_corpus)
Install with Tessl CLI
npx tessl i tessl/pypi-sentence-transformers