A comprehensive Python API for the Terrier information retrieval platform, enabling declarative experimentation with transformer pipelines for indexing, retrieval, and evaluation tasks.
—
PyTerrier's evaluation framework provides comprehensive tools for conducting information retrieval experiments, including statistical significance testing, parameter tuning, and cross-validation. The framework is designed for rigorous experimental evaluation of retrieval systems.
Core experimental framework for evaluating multiple retrieval systems with statistical significance testing.
class Experiment:
    """
    Comprehensive evaluation framework for comparing retrieval systems.

    NOTE(review): the ``-> pd.DataFrame`` annotation on ``__init__`` mirrors
    the upstream API docs, where ``pt.Experiment(...)`` is invoked like a
    function and yields a results DataFrame -- confirm against the installed
    PyTerrier version.

    Parameters:
    - retr_systems: List of transformer systems to evaluate
    - topics: DataFrame with queries ('qid', 'query' columns)
    - qrels: DataFrame with relevance judgments ('qid', 'docno', 'label' columns)
    - eval_metrics: List of evaluation metrics to compute
    - names: Optional list of system names for results
    - perquery: Whether to compute per-query results (default: False)
    - dataframe: Whether to return results as DataFrame (default: True)
    - batch_size: Batch size for processing queries (default: None)
    - filter_by_qrels: Filter results to only qrels topics (default: True)
    - filter_by_topics: Filter qrels to only topic qids (default: True)
    - baseline: Index of baseline system for significance testing
    - test: Statistical test to use ('ttest', 'wilcoxon', 'fisher')
    - correction: Multiple testing correction ('bonferroni', 'holm', 'fdr')
    - highlight: Highlighting mode for significant results ('bold', 'color')
    - round: Number of decimal places for results (default: 4); name matches
      the upstream API even though it shadows the builtin
    - verbose: Enable verbose output (default: False)
    - save_dir: Directory to save detailed results
    """

    def __init__(self, retr_systems: List['Transformer'], topics: pd.DataFrame,
                 qrels: pd.DataFrame, eval_metrics: List[str],
                 names: List[str] = None, perquery: bool = False,
                 dataframe: bool = True, batch_size: int = None,
                 filter_by_qrels: bool = True, filter_by_topics: bool = True,
                 baseline: int = None, test: str = None, correction: str = None,
                 highlight: str = None, round: int = 4, verbose: bool = False,
                 save_dir: str = None, **kwargs) -> pd.DataFrame: ...

# Usage Examples:
# Basic experiment
systems = [bm25_retriever, pl2_retriever, dfr_retriever]
topics = dataset.get_topics()
qrels = dataset.get_qrels()
results = pt.Experiment(
systems,
topics,
qrels,
['map', 'ndcg', 'P_10'],
names=['BM25', 'PL2', 'DFR']
)
print(results)
# Experiment with significance testing
results = pt.Experiment(
systems,
topics,
qrels,
['map', 'ndcg'],
names=['BM25', 'PL2', 'DFR'],
baseline=0, # Use first system as baseline
test='ttest', # Paired t-test
correction='bonferroni' # Multiple testing correction
)
# Per-query results for detailed analysis
perquery_results = pt.Experiment(
systems,
topics,
qrels,
['map', 'ndcg'],
perquery=True, # Return per-query scores
save_dir='/path/to/results' # Save detailed results
)

Advanced parameter optimization classes for systematic hyperparameter tuning.
class GridSearch:
    """
    Grid search optimization for finding the best parameter combination.

    NOTE(review): the ``-> 'Transformer'`` annotation on ``__init__`` mirrors
    the upstream API docs, where ``pt.GridSearch(...)`` is invoked like a
    function and yields the tuned pipeline.

    Parameters:
    - pipeline: Transformer pipeline to optimize
    - params: Dictionary mapping parameter names to value lists
    - topics: Topics DataFrame for evaluation
    - qrels: Qrels DataFrame for evaluation
    - metric: Single metric to optimize (e.g., 'map', 'ndcg')
    - verbose: Enable verbose output
    - jobs: Number of parallel jobs for evaluation
    """

    def __init__(self, pipeline: 'Transformer', params: Dict[str, List[Any]],
                 topics: pd.DataFrame, qrels: pd.DataFrame, metric: str,
                 verbose: bool = False, jobs: int = 1, **kwargs) -> 'Transformer': ...
class GridScan:
    """
    Parameter grid scanning for exploring the parameter space.

    NOTE(review): the ``-> pd.DataFrame`` annotation on ``__init__`` mirrors
    the upstream API docs, where ``pt.GridScan(...)`` is invoked like a
    function and yields a DataFrame of per-setting results.

    Parameters:
    - pipeline: Transformer pipeline to scan
    - params: Dictionary mapping parameter names to value lists
    - topics: Topics DataFrame for evaluation
    - qrels: Qrels DataFrame for evaluation
    - metrics: List of metrics to compute for each parameter combination
    - verbose: Enable verbose output
    - jobs: Number of parallel jobs for evaluation
    """

    def __init__(self, pipeline: 'Transformer', params: Dict[str, List[Any]],
                 topics: pd.DataFrame, qrels: pd.DataFrame, metrics: List[str],
                 verbose: bool = False, jobs: int = 1, **kwargs) -> pd.DataFrame: ...
class KFoldGridSearch:
    """
    K-fold cross-validation grid search for robust parameter optimization.

    NOTE(review): the ``-> 'Transformer'`` annotation on ``__init__`` mirrors
    the upstream API docs, where ``pt.KFoldGridSearch(...)`` is invoked like
    a function and yields the tuned pipeline.

    Parameters:
    - pipeline: Transformer pipeline to optimize
    - params: Dictionary mapping parameter names to value lists
    - topics_list: List of topic DataFrames, one per fold
    - qrels: Qrels DataFrame for evaluation
    - metric: Single metric to optimize
    - verbose: Enable verbose output
    - jobs: Number of parallel jobs for evaluation
    """

    def __init__(self, pipeline: 'Transformer', params: Dict[str, List[Any]],
                 topics_list: List[pd.DataFrame], qrels: pd.DataFrame,
                 metric: str, verbose: bool = False, jobs: int = 1, **kwargs) -> 'Transformer': ...

# Usage Examples:
# Grid search for optimal BM25 parameters
bm25_pipeline = pt.terrier.Retriever(index_ref, wmodel='BM25')
param_grid = {
'k1': [0.9, 1.2, 1.5, 1.8],
'b': [0.3, 0.5, 0.7, 0.9]
}
best_bm25 = pt.GridSearch(
bm25_pipeline,
param_grid,
topics,
qrels,
'map',
verbose=True
)
# Grid scan to explore parameter space
scan_results = pt.GridScan(
bm25_pipeline,
param_grid,
topics,
qrels,
['map', 'ndcg', 'P_10'],
verbose=True
)
print(scan_results)
# K-fold cross-validation for robust optimization
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
topics_folds = [topics.iloc[train_idx] for train_idx, _ in kf.split(topics)]
best_params = pt.KFoldGridSearch(
bm25_pipeline,
param_grid,
topics_folds,
qrels,
'map'
)

Simple evaluation function for single-system assessment.
class Evaluate:
    """
    Single-result evaluation for computing metrics on retrieval results.

    NOTE(review): the ``-> pd.DataFrame`` annotation on ``__init__`` mirrors
    the upstream API docs, where ``pt.Evaluate(...)`` is invoked like a
    function and yields the computed measures.

    Parameters:
    - res: Results DataFrame with 'qid', 'docno', 'score' columns
    - qrels: Qrels DataFrame with 'qid', 'docno', 'label' columns
    - metrics: List of evaluation metrics to compute
    - perquery: Whether to return per-query results (default: False)
    """

    def __init__(self, res: pd.DataFrame, qrels: pd.DataFrame,
                 metrics: List[str], perquery: bool = False) -> pd.DataFrame: ...

# Usage Example:
# Evaluate single system results
results = retriever.transform(topics)
evaluation = pt.Evaluate(results, qrels, ['map', 'ndcg', 'P_10'])
print(evaluation)
# Per-query evaluation
perquery_eval = pt.Evaluate(results, qrels, ['map', 'ndcg'], perquery=True)

Helper transformers for result processing and normalization in evaluation pipelines.
class PerQueryMaxMinScoreTransformer(Transformer):
    """
    Per-query min-max score normalization transformer.

    Normalizes document scores within each query to the [0, 1] range using
    min-max normalization applied independently per query.
    """

    def __init__(self): ...

# Usage Example:
# Normalize scores before fusion
normalized_pipeline = (
retriever >>
pt.pipelines.PerQueryMaxMinScoreTransformer() >>
reranker
)

PyTerrier supports comprehensive evaluation metrics through integration with the ir-measures library:
- P_5, P_10, P_20: Precision at rank cutoffs
- R_5, R_10, R_20: Recall at rank cutoffs
- F_5, F_10, F_20: F1-score at rank cutoffs
- map: Mean Average Precision
- ndcg: Normalized Discounted Cumulative Gain
- ndcg_cut_5, ndcg_cut_10, ndcg_cut_20: NDCG at rank cutoffs
- mrr: Mean Reciprocal Rank
- bpref: Binary preference measure
- rbp: Rank-biased precision
- err: Expected reciprocal rank
- infap: Inferred Average Precision

# Evaluate across multiple datasets
datasets = ['vaswani', 'antique', 'trec-robust-2004']
systems = [bm25, pl2, dfr]
all_results = []
for dataset_name in datasets:
dataset = pt.get_dataset(dataset_name)
topics = dataset.get_topics()
qrels = dataset.get_qrels()
results = pt.Experiment(
systems,
topics,
qrels,
['map', 'ndcg'],
names=['BM25', 'PL2', 'DFR']
)
results['dataset'] = dataset_name
all_results.append(results)
combined = pd.concat(all_results)

# Comprehensive significance testing
results = pt.Experiment(
[bm25, improved_system],
topics,
qrels,
['map', 'ndcg', 'P_10'],
names=['Baseline', 'Improved'],
baseline=0, # First system as baseline
test='ttest', # Paired t-test
correction='bonferroni', # Multiple testing correction
highlight='bold', # Bold significant improvements
perquery=True, # Enable per-query analysis
save_dir='/path/to/results' # Save detailed results
)

# Evaluate with different training set sizes
training_sizes = [0.1, 0.2, 0.5, 0.8, 1.0]
learning_results = []
for size in training_sizes:
# Sample training data
train_sample = training_data.sample(frac=size, random_state=42)
# Train model
trained_model = ltr_model.fit(train_sample)
# Evaluate
pipeline = retriever >> trained_model
result = pt.Experiment([pipeline], test_topics, test_qrels, ['map'])
result['training_size'] = size
learning_results.append(result)
learning_curve = pd.concat(learning_results)

from sklearn.model_selection import KFold
# K-fold cross-validation for robust evaluation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = []
for fold, (train_idx, test_idx) in enumerate(kf.split(topics)):
train_topics = topics.iloc[train_idx]
test_topics = topics.iloc[test_idx]
# Train on fold
trained_system = estimator.fit(train_topics)
# Evaluate on fold
result = pt.Experiment([trained_system], test_topics, qrels, ['map'])
result['fold'] = fold
cv_results.append(result)
cv_summary = pd.concat(cv_results).groupby('name').agg({
'map': ['mean', 'std']
}).round(4)

from typing import Dict, List, Any, Union, Optional
import pandas as pd
# Evaluation-specific types
SystemList = List['Transformer'] # List of systems to evaluate
MetricList = List[str] # List of evaluation metrics
ParameterGrid = Dict[str, List[Any]] # Parameter search space
StatisticalTest = str # Statistical test name ('ttest', 'wilcoxon', 'fisher')
CorrectionMethod = str # Multiple testing correction method
HighlightMode = str # Result highlighting mode ('bold', 'color')
BaselineIndex = int # Index of baseline system
TopicsFold = List[pd.DataFrame]  # List of topic DataFrames for cross-validation

Install with Tessl CLI
npx tessl i tessl/pypi-python-terrier