Embeddings, Retrieval, and Reranking framework for computing dense, sparse, and cross-encoder embeddings using state-of-the-art transformer models
—
The sentence-transformers package provides a comprehensive evaluation framework for measuring model performance across various tasks including semantic similarity, information retrieval, classification, and clustering.
from sentence_transformers.evaluation import (
EmbeddingSimilarityEvaluator,
InformationRetrievalEvaluator,
BinaryClassificationEvaluator,
# ... other evaluators
)

class SentenceEvaluator:
def __call__(
self,
model: SentenceTransformer,
output_path: str | None = None,
epoch: int = -1,
steps: int = -1
) -> float{ .api }
Abstract base class for all sentence transformer evaluators.
Parameters:
model: SentenceTransformer model to evaluate
output_path: Directory to save evaluation results
epoch: Current training epoch (for logging)
steps: Current training steps (for logging)
Returns: Primary evaluation metric score
class EmbeddingSimilarityEvaluator(SentenceEvaluator):
def __init__(
self,
sentences1: list[str],
sentences2: list[str],
scores: list[float],
batch_size: int = 16,
main_similarity: SimilarityFunction | None = None,
name: str = "",
show_progress_bar: bool = None,
write_csv: bool = True
){ .api }
Evaluates model performance on semantic textual similarity tasks by computing correlation between predicted and gold similarity scores.
Parameters:
sentences1: First sentences in pairs
sentences2: Second sentences in pairs
scores: Gold similarity scores (typically -1 to 1 or 0 to 1)
batch_size: Batch size for encoding
main_similarity: Similarity function to use (defaults to model's function)
name: Name for evaluation results
show_progress_bar: Display progress during evaluation
write_csv: Save detailed results to CSV file
Returns: Spearman correlation coefficient
class MSEEvaluator(SentenceEvaluator):
def __init__(
self,
sentences1: list[str],
sentences2: list[str],
scores: list[float],
batch_size: int = 16,
name: str = "",
show_progress_bar: bool = None,
write_csv: bool = True
){ .api }
Evaluates model using Mean Squared Error between predicted and gold similarity scores.
Returns: Negative MSE (higher is better)
class MSEEvaluatorFromDataFrame(SentenceEvaluator):
def __init__(
self,
dataframe: pandas.DataFrame,
sentence1_column_name: str = None,
sentence2_column_name: str = None,
score_column_name: str = None,
batch_size: int = 16,
name: str = "",
show_progress_bar: bool = None,
write_csv: bool = True
){ .api }
MSE evaluator that loads data from a pandas DataFrame.
Parameters:
dataframe: DataFrame containing evaluation data
sentence1_column_name: Column name for first sentences
sentence2_column_name: Column name for second sentences
score_column_name: Column name for similarity scores

class BinaryClassificationEvaluator(SentenceEvaluator):
def __init__(
self,
sentences1: list[str],
sentences2: list[str],
labels: list[int],
batch_size: int = 16,
name: str = "",
show_progress_bar: bool = None,
write_csv: bool = True
){ .api }
Evaluates binary classification performance using cosine similarity as classification score.
Parameters:
sentences1: First sentences in pairs
sentences2: Second sentences in pairs
labels: Binary labels (0 or 1)
batch_size: Batch size for encoding
name: Name for evaluation results
show_progress_bar: Display progress bar
write_csv: Save results to CSV
Returns: Average Precision (AP) score
class LabelAccuracyEvaluator(SentenceEvaluator):
def __init__(
self,
sentences: list[str],
labels: list[int],
name: str = "",
batch_size: int = 32,
show_progress_bar: bool = None,
write_csv: bool = True
){ .api }
Evaluates classification accuracy by finding the closest label embedding for each sentence.
Parameters:
sentences: Input sentences to classify
labels: Ground truth labels
name: Name for evaluation
batch_size: Batch size for encoding
show_progress_bar: Display progress bar
write_csv: Save results to CSV
Returns: Classification accuracy
class InformationRetrievalEvaluator(SentenceEvaluator):
def __init__(
self,
queries: dict[str, str],
corpus: dict[str, str],
relevant_docs: dict[str, set[str]],
corpus_chunk_size: int = 50000,
mrr_at_k: list[int] = [10],
ndcg_at_k: list[int] = [10],
accuracy_at_k: list[int] = [1, 3, 5, 10],
precision_recall_at_k: list[int] = [1, 3, 5, 10],
map_at_k: list[int] = [100],
max_corpus_size: int = None,
show_progress_bar: bool = None,
batch_size: int = 32,
name: str = "",
write_csv: bool = True
){ .api }
Comprehensive information retrieval evaluation with multiple metrics.
Parameters:
queries: Dictionary mapping query IDs to query texts
corpus: Dictionary mapping document IDs to document texts
relevant_docs: Dictionary mapping query IDs to sets of relevant document IDs
corpus_chunk_size: Size of corpus chunks for processing
mrr_at_k: Ranks for Mean Reciprocal Rank calculation
ndcg_at_k: Ranks for NDCG calculation
accuracy_at_k: Ranks for accuracy calculation
precision_recall_at_k: Ranks for precision/recall calculation
map_at_k: Ranks for Mean Average Precision calculation
max_corpus_size: Maximum corpus size to use
show_progress_bar: Display progress bar
batch_size: Batch size for encoding
name: Name for evaluation
write_csv: Save results to CSV
Returns: NDCG@10 score
class RerankingEvaluator(SentenceEvaluator):
def __init__(
self,
samples: list[dict],
mrr_at_k: list[int] = [10],
ndcg_at_k: list[int] = [10],
accuracy_at_k: list[int] = [1, 3, 5, 10],
precision_recall_at_k: list[int] = [1, 3, 5, 10],
map_at_k: list[int] = [100],
name: str = "",
write_csv: bool = True,
batch_size: int = 512,
show_progress_bar: bool = None
){ .api }
Evaluates reranking performance on query-document pairs.
Parameters:
samples: List of samples with query, positive, and negative documents
Returns: MRR@10 score
class TripletEvaluator(SentenceEvaluator):
def __init__(
self,
anchors: list[str],
positives: list[str],
negatives: list[str],
main_distance_function: SimilarityFunction | None = None,
name: str = "",
batch_size: int = 16,
show_progress_bar: bool = None,
write_csv: bool = True
){ .api }
Evaluates triplet accuracy: anchor should be closer to positive than negative.
Parameters:
anchors: Anchor sentences
positives: Positive sentences
negatives: Negative sentences
main_distance_function: Distance function to use
name: Name for evaluation
batch_size: Batch size for encoding
show_progress_bar: Display progress bar
write_csv: Save results to CSV
Returns: Triplet accuracy (percentage of correct triplets)
class ParaphraseMiningEvaluator(SentenceEvaluator):
def __init__(
self,
sentences_map: dict[str, str],
duplicates_list: set[tuple[str, str]],
duplicates_dict: dict[str, dict[str, bool]] = None,
query_chunk_size: int = 5000,
corpus_chunk_size: int = 100000,
max_pairs: int = 500000,
top_k: int = 100,
name: str = "",
batch_size: int = 16,
show_progress_bar: bool = None,
write_csv: bool = True
){ .api }
Evaluates paraphrase mining performance by finding duplicate/similar sentences.
Parameters:
sentences_map: Dictionary mapping sentence IDs to texts
duplicates_list: Set of sentence ID pairs that are duplicates
duplicates_dict: Alternative format for duplicates
query_chunk_size: Size of query chunks for processing
corpus_chunk_size: Size of corpus chunks
max_pairs: Maximum pairs to evaluate
top_k: Number of top pairs to consider
name: Name for evaluation
batch_size: Batch size for encoding
show_progress_bar: Display progress bar
write_csv: Save results to CSV
Returns: Average Precision score
class TranslationEvaluator(SentenceEvaluator):
def __init__(
self,
source_sentences: list[str],
target_sentences: list[str],
batch_size: int = 16,
name: str = "",
show_progress_bar: bool = None,
write_csv: bool = True
){ .api }
Evaluates cross-lingual or translation performance by measuring similarity between source and target sentences.
Parameters:
source_sentences: Source language sentences
target_sentences: Target language sentences (translations)
batch_size: Batch size for encoding
name: Name for evaluation
show_progress_bar: Display progress bar
write_csv: Save results to CSV
Returns: Average cosine similarity between translations
class SequentialEvaluator(SentenceEvaluator):
def __init__(
self,
evaluators: list[SentenceEvaluator],
main_score_function: callable = None
){ .api }
Runs multiple evaluators sequentially and combines their results.
Parameters:
evaluators: List of evaluators to run
main_score_function: Function to combine scores into main score
Returns: Combined evaluation score
class NanoBEIREvaluator(SentenceEvaluator):
def __init__(
self,
dataset_name: str | None = None,
dataset_config: str | None = None,
dataset_revision: str | None = None,
corpus_chunk_size: int = 50000,
max_corpus_size: int | None = None,
**kwargs
){ .api }
Evaluator for NanoBEIR benchmark tasks — a lightweight subset of the BEIR (Benchmarking Information Retrieval) benchmark for zero-shot retrieval evaluation.
Parameters:
dataset_name: Name of the NanoBEIR dataset
dataset_config: Dataset configuration
dataset_revision: Dataset revision to use
corpus_chunk_size: Corpus processing chunk size
max_corpus_size: Maximum corpus size to evaluate on
**kwargs: Additional arguments passed to base evaluator
Returns: NDCG@10 score on the NanoBEIR task
from sentence_transformers.evaluation import SimilarityFunction
class SimilarityFunction(Enum):
COSINE = "cosine"
DOT_PRODUCT = "dot"
EUCLIDEAN = "euclidean"
MANHATTAN = "manhattan"{ .api }
Enumeration of similarity functions available for evaluation.
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')
# Prepare evaluation data
sentences1 = ["The cat sits on the mat", "I love programming"]
sentences2 = ["A feline rests on a rug", "I enjoy coding"]
scores = [0.9, 0.8] # Similarity scores
# Create evaluator
evaluator = EmbeddingSimilarityEvaluator(
sentences1=sentences1,
sentences2=sentences2,
scores=scores,
name="dev"
)
# Evaluate model
correlation = evaluator(model, output_path="./evaluation_results/")
print(f"Spearman correlation: {correlation:.4f}")

from sentence_transformers.evaluation import InformationRetrievalEvaluator
# Prepare IR evaluation data
queries = {
"q1": "What is machine learning?",
"q2": "How do neural networks work?"
}
corpus = {
"d1": "Machine learning is a subset of artificial intelligence",
"d2": "Neural networks are computational models inspired by biology",
"d3": "Weather forecasting uses statistical models",
"d4": "Deep learning uses multiple layers of neural networks"
}
relevant_docs = {
"q1": {"d1", "d4"}, # Relevant documents for q1
"q2": {"d2", "d4"} # Relevant documents for q2
}
# Create IR evaluator
ir_evaluator = InformationRetrievalEvaluator(
queries=queries,
corpus=corpus,
relevant_docs=relevant_docs,
name="test_retrieval"
)
# Evaluate
ndcg_score = ir_evaluator(model, output_path="./ir_results/")
print(f"NDCG@10: {ndcg_score:.4f}")

from sentence_transformers.evaluation import BinaryClassificationEvaluator
# Prepare binary classification data
sentences1 = [
"The cat sits on the mat",
"I love programming",
"Dogs are great pets",
"Weather is nice today"
]
sentences2 = [
"A feline rests on a rug", # Similar to first
"Cooking is fun", # Different from second
"Cats are wonderful animals", # Related to third
"It's sunny outside" # Similar to fourth
]
labels = [1, 0, 1, 1] # Binary similarity labels
# Create evaluator
binary_evaluator = BinaryClassificationEvaluator(
sentences1=sentences1,
sentences2=sentences2,
labels=labels,
name="binary_classification"
)
# Evaluate
ap_score = binary_evaluator(model, output_path="./binary_results/")
print(f"Average Precision: {ap_score:.4f}")

from sentence_transformers.evaluation import TripletEvaluator
# Prepare triplet data
anchors = ["The cat sits on the mat", "I love programming"]
positives = ["A feline rests on a rug", "I enjoy coding"]
negatives = ["Dogs are great pets", "Weather is nice"]
# Create triplet evaluator
triplet_evaluator = TripletEvaluator(
anchors=anchors,
positives=positives,
negatives=negatives,
name="triplet_eval"
)
# Evaluate
accuracy = triplet_evaluator(model, output_path="./triplet_results/")
print(f"Triplet accuracy: {accuracy:.4f}")

from sentence_transformers.evaluation import SequentialEvaluator
# Create multiple evaluators
similarity_eval = EmbeddingSimilarityEvaluator(sentences1, sentences2, scores, name="similarity")
binary_eval = BinaryClassificationEvaluator(sentences1, sentences2, labels, name="binary")
triplet_eval = TripletEvaluator(anchors, positives, negatives, name="triplet")
# Combine evaluators
sequential_evaluator = SequentialEvaluator(
evaluators=[similarity_eval, binary_eval, triplet_eval],
main_score_function=lambda scores: sum(scores) / len(scores) # Average score
)
# Run all evaluations
combined_score = sequential_evaluator(model, output_path="./multi_eval_results/")
print(f"Combined evaluation score: {combined_score:.4f}")

from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
# Create evaluator for training
dev_evaluator = EmbeddingSimilarityEvaluator(
sentences1=dev_sentences1,
sentences2=dev_sentences2,
scores=dev_scores,
name="sts-dev"
)
# Training arguments with evaluation
args = SentenceTransformerTrainingArguments(
output_dir='./training_with_eval',
evaluation_strategy="steps",
eval_steps=100,
logging_steps=100,
save_steps=100,
num_train_epochs=3,
per_device_train_batch_size=16,
load_best_model_at_end=True,
metric_for_best_model="eval_spearman_cosine",
greater_is_better=True
)
def compute_metrics(eval_pred):
"""Custom metrics for trainer."""
# This would be called during training evaluation
return dev_evaluator(model, output_path=args.output_dir)
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset=train_dataset,
loss=loss,
compute_metrics=compute_metrics
)
trainer.train()

class CustomEvaluator(SentenceEvaluator):
"""Custom evaluator for specific task."""
def __init__(self, test_data, name="custom"):
self.test_data = test_data
self.name = name
def __call__(self, model, output_path=None, epoch=-1, steps=-1):
# Implement custom evaluation logic
embeddings = model.encode([item['text'] for item in self.test_data])
# Calculate your custom metric
custom_score = self.calculate_custom_metric(embeddings)
# Save results if output_path provided
if output_path:
self.save_results(custom_score, output_path, epoch, steps)
return custom_score
def calculate_custom_metric(self, embeddings):
# Implement your metric calculation
return 0.85 # Placeholder
def save_results(self, score, output_path, epoch, steps):
# Save evaluation results
import os, csv
csv_file = os.path.join(output_path, f"{self.name}_results.csv")
with open(csv_file, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(['epoch', 'steps', 'score'])
writer.writerow([epoch, steps, score])
# Use custom evaluator
custom_eval = CustomEvaluator(test_data)
score = custom_eval(model, output_path="./custom_results/")

def evaluate_on_multiple_datasets(model, datasets_config):
"""Evaluate model on multiple datasets."""
results = {}
for dataset_name, config in datasets_config.items():
if config['type'] == 'similarity':
evaluator = EmbeddingSimilarityEvaluator(
sentences1=config['sentences1'],
sentences2=config['sentences2'],
scores=config['scores'],
name=dataset_name
)
elif config['type'] == 'retrieval':
evaluator = InformationRetrievalEvaluator(
queries=config['queries'],
corpus=config['corpus'],
relevant_docs=config['relevant_docs'],
name=dataset_name
)
score = evaluator(model, output_path=f"./results/{dataset_name}/")
results[dataset_name] = score
print(f"{dataset_name}: {score:.4f}")
return results
# Configuration for multiple datasets
datasets_config = {
"sts_benchmark": {
"type": "similarity",
"sentences1": sts_sentences1,
"sentences2": sts_sentences2,
"scores": sts_scores
},
"msmarco": {
"type": "retrieval",
"queries": msmarco_queries,
"corpus": msmarco_corpus,
"relevant_docs": msmarco_qrels
}
}
# Run evaluations
all_results = evaluate_on_multiple_datasets(model, datasets_config)

Install with Tessl CLI
npx tessl i tessl/pypi-sentence-transformers