A comprehensive Python API for the Terrier information retrieval platform, enabling declarative experimentation with transformer pipelines for indexing, retrieval, and evaluation tasks.
—
PyTerrier's dataset management system provides built-in access to standard information retrieval test collections and supports creating custom datasets. The system includes topics (queries), relevance judgments (qrels), document corpora, and pre-built indexes.
Core functions for discovering, accessing, and managing IR test collections.
def get_dataset(name: str) -> 'Dataset':
"""
Retrieve a specific dataset by name.
Parameters:
- name: Dataset name (e.g., 'vaswani', 'msmarco-passage', 'trec-covid')
Returns:
- Dataset object providing access to topics, qrels, corpus, and indexes
"""
def find_datasets(query: str = None, **kwargs) -> List[str]:
"""
Find datasets matching search criteria.
Parameters:
- query: Search query for dataset names or descriptions
- **kwargs: Additional search filters
Returns:
- List of matching dataset names
"""
def list_datasets() -> List[str]:
"""
List all available datasets.
Returns:
- List of all dataset names
"""
def transformer_from_dataset(dataset_name: str, variant: str = None, **kwargs) -> 'Transformer':
"""
Create a transformer (typically retriever) from a dataset.
Parameters:
- dataset_name: Name of the dataset
- variant: Specific variant or index type
- **kwargs: Additional transformer parameters
Returns:
- Configured transformer for the dataset
"""Usage Examples:
# Get specific dataset
vaswani = pt.get_dataset('vaswani')
msmarco = pt.get_dataset('msmarco-passage')
# Find datasets by query
covid_datasets = pt.find_datasets('covid')
passage_datasets = pt.find_datasets('passage')
# List all available datasets
all_datasets = pt.list_datasets()
print(f"Available datasets: {len(all_datasets)}")
# Create retriever from dataset
bm25_retriever = pt.transformer_from_dataset('vaswani', 'terrier_stemmed', wmodel='BM25')

Core dataset class providing access to all components of an IR test collection.
class Dataset:
"""
Represents an information retrieval test collection with topics, qrels, corpus, and indexes.
"""
def get_topics(self, variant: str = None) -> pd.DataFrame:
"""
Get query topics for the dataset.
Parameters:
- variant: Specific topic variant (e.g., 'title', 'description', 'narrative')
Returns:
- DataFrame with 'qid' and 'query' columns
"""
def get_qrels(self, variant: str = None) -> pd.DataFrame:
"""
Get relevance judgments (qrels) for the dataset.
Parameters:
- variant: Specific qrels variant if multiple available
Returns:
- DataFrame with 'qid', 'docno', and 'label' columns
"""
def get_corpus_iter(self, verbose: bool = True) -> Iterator[Dict[str, Any]]:
"""
Get iterator over document corpus.
Parameters:
- verbose: Show progress information
Returns:
- Iterator yielding documents with 'docno' and 'text' fields
"""
def get_corpus(self) -> pd.DataFrame:
"""
Get entire document corpus as DataFrame.
Returns:
- DataFrame with 'docno' and 'text' columns
"""
def get_index(self, variant: str = None) -> Any:
"""
Get pre-built index for the dataset.
Parameters:
- variant: Index variant (e.g., 'terrier_stemmed', 'terrier_unstemmed')
Returns:
- IndexRef object for the dataset index
"""
def info(self) -> Dict[str, Any]:
"""
Get metadata information about the dataset.
Returns:
- Dictionary with dataset metadata
"""Usage Examples:
# Get dataset components
dataset = pt.get_dataset('vaswani')
# Get topics (queries)
topics = dataset.get_topics()
print(f"Number of topics: {len(topics)}")
# Get relevance judgments
qrels = dataset.get_qrels()
print(f"Number of qrels: {len(qrels)}")
# Get document corpus
corpus_iter = dataset.get_corpus_iter()
for doc in corpus_iter:
    print(f"Document {doc['docno']}: {doc['text'][:100]}...")
    break
# Get pre-built index
index_ref = dataset.get_index('terrier_stemmed')
# Get dataset information
info = dataset.info()
print(f"Dataset info: {info}")Extended dataset class for handling remote datasets that are downloaded on demand.
class RemoteDataset(Dataset):
"""
Dataset stored remotely and downloaded on first access.
Inherits all Dataset methods and adds remote download capabilities.
"""
def download(self, force: bool = False) -> None:
"""
Download dataset components.
Parameters:
- force: Force re-download even if already cached
"""
def is_downloaded(self) -> bool:
"""
Check if dataset has been downloaded.
Returns:
- True if dataset is locally available
"""Provider classes for different dataset sources and formats.
class DatasetProvider:
"""
Abstract base class for dataset providers.
"""
def get_dataset(self, name: str) -> Dataset: ...
def list_datasets(self) -> List[str]: ...
def find_datasets(self, query: str) -> List[str]: ...
class BuiltinDatasetProvider(DatasetProvider):
"""
Provider for built-in PyTerrier datasets.
"""
class IRDSDatasetProvider(DatasetProvider):
"""
Provider for ir-datasets integration.
Provides access to datasets from the ir-datasets library.
"""Global registry for managing available datasets across different providers.
DATASET_MAP: Dict[str, Dataset]  # Global dataset registry mapping names to Dataset objects

# Small test collections for development
vaswani = pt.get_dataset('vaswani') # Classic Vaswani collection (11,429 docs)
antique = pt.get_dataset('antique') # ANTIQUE non-factoid QA dataset
# TREC collections
robust04 = pt.get_dataset('trec-robust-2004') # TREC Robust 2004
covid = pt.get_dataset('trec-covid') # TREC-COVID dataset
# Web collections
msmarco_passage = pt.get_dataset('msmarco-passage') # MS MARCO passage ranking
msmarco_document = pt.get_dataset('msmarco-document') # MS MARCO document ranking
# Academic collections
cord19 = pt.get_dataset('cord19')  # CORD-19 COVID-19 research papers

Many datasets provide multiple variants for different use cases:
# Get different topic variants
vaswani = pt.get_dataset('vaswani')
title_topics = vaswani.get_topics('title') # Title-only queries
desc_topics = vaswani.get_topics('description') # Description queries
narrative_topics = vaswani.get_topics('narrative') # Full narrative queries
# Get different index variants
stemmed_index = vaswani.get_index('terrier_stemmed') # Stemmed index
unstemmed_index = vaswani.get_index('terrier_unstemmed')  # Unstemmed index

# Create custom dataset from local files
class CustomDataset(pt.datasets.Dataset):
    def __init__(self, topics_file, qrels_file, corpus_path):
        self.topics_file = topics_file
        self.qrels_file = qrels_file
        self.corpus_path = corpus_path

    def get_topics(self):
        return pd.read_csv(self.topics_file)

    def get_qrels(self):
        return pd.read_csv(self.qrels_file)

    def get_corpus_iter(self):
        # Custom corpus loading logic
        pass
# Register custom dataset
custom_dataset = CustomDataset('/path/to/topics.csv', '/path/to/qrels.csv', '/path/to/corpus/')
pt.datasets.DATASET_MAP['my-custom'] = custom_dataset

# Create retrieval pipeline from dataset
dataset = pt.get_dataset('vaswani')
retriever = pt.terrier.Retriever.from_dataset('vaswani', 'terrier_stemmed')
# Evaluate on dataset
topics = dataset.get_topics()
qrels = dataset.get_qrels()
results = retriever.transform(topics)
evaluation = pt.Experiment([retriever], topics, qrels, ['map', 'ndcg'])

# Process large corpus efficiently
dataset = pt.get_dataset('msmarco-passage')
corpus_iter = dataset.get_corpus_iter()
# Create custom indexer for corpus
indexer = pt.IterDictIndexer('/path/to/custom_index')
index_ref = indexer.index(corpus_iter)

# Compare across multiple datasets
datasets = ['vaswani', 'antique', 'trec-robust-2004']
results = []
for dataset_name in datasets:
    dataset = pt.get_dataset(dataset_name)
    retriever = pt.transformer_from_dataset(dataset_name, wmodel='BM25')
    topics = dataset.get_topics()
    qrels = dataset.get_qrels()
    result = pt.Experiment([retriever], topics, qrels, ['map'])
    result['dataset'] = dataset_name
    results.append(result)
# Combine results
combined_results = pd.concat(results)

from typing import Dict, List, Any, Iterator, Optional, Union
import pandas as pd
# Dataset-specific types
DatasetName = str # Dataset identifier
TopicVariant = str # Topic variant name ('title', 'description', etc.)
QrelsVariant = str # Qrels variant name
IndexVariant = str # Index variant name ('terrier_stemmed', etc.)
DatasetInfo = Dict[str, Any] # Dataset metadata
DocumentIterator = Iterator[Dict[str, Any]] # Document corpus iterator
DatasetProvider = Any  # Dataset provider instance (note: this type alias shares its name with the DatasetProvider class above — prefer distinct names to avoid shadowing)

Install with Tessl CLI
npx tessl i tessl/pypi-python-terrier