Ctrl+K
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-python-terrier

A comprehensive Python API for the Terrier information retrieval platform, enabling declarative experimentation with transformer pipelines for indexing, retrieval, and evaluation tasks.

Pending
Overview
Eval results
Files

docs/datasets.md

Datasets

PyTerrier's dataset management system provides built-in access to standard information retrieval test collections and supports creating custom datasets. The system includes topics (queries), relevance judgments (qrels), document corpora, and pre-built indexes.

Capabilities

Dataset Access Functions

Core functions for discovering, accessing, and managing IR test collections.

def get_dataset(name: str) -> 'Dataset':
    """
    Retrieve a specific dataset by name.
    
    Parameters:
    - name: Dataset name (e.g., 'vaswani', 'msmarco-passage', 'trec-covid')
    
    Returns:
    - Dataset object providing access to topics, qrels, corpus, and indexes
    """

def find_datasets(query: str = None, **kwargs) -> List[str]:
    """
    Find datasets matching search criteria.
    
    Parameters:
    - query: Search query for dataset names or descriptions
    - **kwargs: Additional search filters
    
    Returns:
    - List of matching dataset names
    """

def list_datasets() -> List[str]:
    """
    List all available datasets.
    
    Returns:
    - List of all dataset names
    """

def transformer_from_dataset(dataset_name: str, variant: str = None, **kwargs) -> 'Transformer':
    """
    Create a transformer (typically retriever) from a dataset.
    
    Parameters:
    - dataset_name: Name of the dataset
    - variant: Specific variant or index type
    - **kwargs: Additional transformer parameters
    
    Returns:
    - Configured transformer for the dataset
    """

Usage Examples:

# Get specific dataset
vaswani = pt.get_dataset('vaswani')
msmarco = pt.get_dataset('msmarco-passage')

# Find datasets by query
covid_datasets = pt.find_datasets('covid')
passage_datasets = pt.find_datasets('passage')

# List all available datasets
all_datasets = pt.list_datasets()
print(f"Available datasets: {len(all_datasets)}")

# Create retriever from dataset
bm25_retriever = pt.transformer_from_dataset('vaswani', 'terrier_stemmed', wmodel='BM25')

Dataset Class

Core dataset class providing access to all components of an IR test collection.

class Dataset:
    """
    Represents an information retrieval test collection with topics, qrels, corpus, and indexes.
    """
    
    def get_topics(self, variant: str = None) -> pd.DataFrame:
        """
        Get query topics for the dataset.
        
        Parameters:
        - variant: Specific topic variant (e.g., 'title', 'description', 'narrative')
        
        Returns:
        - DataFrame with 'qid' and 'query' columns
        """
    
    def get_qrels(self, variant: str = None) -> pd.DataFrame:
        """
        Get relevance judgments (qrels) for the dataset.
        
        Parameters:
        - variant: Specific qrels variant if multiple available
        
        Returns:
        - DataFrame with 'qid', 'docno', and 'label' columns
        """
    
    def get_corpus_iter(self, verbose: bool = True) -> Iterator[Dict[str, Any]]:
        """
        Get iterator over document corpus.
        
        Parameters:
        - verbose: Show progress information
        
        Returns:
        - Iterator yielding documents with 'docno' and 'text' fields
        """
        
    def get_corpus(self) -> pd.DataFrame:
        """
        Get entire document corpus as DataFrame.
        
        Returns:
        - DataFrame with 'docno' and 'text' columns
        """
    
    def get_index(self, variant: str = None) -> Any:
        """
        Get pre-built index for the dataset.
        
        Parameters:  
        - variant: Index variant (e.g., 'terrier_stemmed', 'terrier_unstemmed')
        
        Returns:
        - IndexRef object for the dataset index
        """
    
    def info(self) -> Dict[str, Any]:
        """
        Get metadata information about the dataset.
        
        Returns:
        - Dictionary with dataset metadata
        """

Usage Examples:

# Get dataset components
dataset = pt.get_dataset('vaswani')

# Get topics (queries)
topics = dataset.get_topics()
print(f"Number of topics: {len(topics)}")

# Get relevance judgments  
qrels = dataset.get_qrels()
print(f"Number of qrels: {len(qrels)}")

# Get document corpus
corpus_iter = dataset.get_corpus_iter()
for doc in corpus_iter:
    print(f"Document {doc['docno']}: {doc['text'][:100]}...")
    break

# Get pre-built index
index_ref = dataset.get_index('terrier_stemmed')

# Get dataset information
info = dataset.info()
print(f"Dataset info: {info}")

Remote Dataset Support

Extended dataset class for handling remote datasets that are downloaded on demand.

class RemoteDataset(Dataset):
    """
    Dataset stored remotely and downloaded on first access.
    
    Inherits all Dataset methods and adds remote download capabilities.
    """
    
    def download(self, force: bool = False) -> None:
        """
        Download dataset components.
        
        Parameters:
        - force: Force re-download even if already cached
        """
    
    def is_downloaded(self) -> bool:
        """
        Check if dataset has been downloaded.
        
        Returns:
        - True if dataset is locally available
        """

Dataset Providers

Provider classes for different dataset sources and formats.

class DatasetProvider:
    """
    Abstract base class for dataset providers.
    """
    
    def get_dataset(self, name: str) -> Dataset: ...
    def list_datasets(self) -> List[str]: ...
    def find_datasets(self, query: str) -> List[str]: ...

class BuiltinDatasetProvider(DatasetProvider):
    """
    Provider for built-in PyTerrier datasets.
    """

class IRDSDatasetProvider(DatasetProvider):
    """
    Provider for ir-datasets integration.
    Provides access to datasets from the ir-datasets library.
    """

Dataset Registry

Global registry for managing available datasets across different providers.

DATASET_MAP: Dict[str, Dataset]  # Global dataset registry mapping names to Dataset objects

Common Datasets

Built-in Test Collections

# Small test collections for development
vaswani = pt.get_dataset('vaswani')          # Classic Vaswani collection (11,429 docs)
antique = pt.get_dataset('antique')          # ANTIQUE non-factoid QA dataset

# TREC collections  
robust04 = pt.get_dataset('trec-robust-2004')  # TREC Robust 2004
covid = pt.get_dataset('trec-covid')            # TREC-COVID dataset

# Web collections
msmarco_passage = pt.get_dataset('msmarco-passage')    # MS MARCO passage ranking
msmarco_document = pt.get_dataset('msmarco-document')  # MS MARCO document ranking

# Academic collections
cord19 = pt.get_dataset('cord19')            # CORD-19 COVID-19 research papers

Dataset Variants

Many datasets provide multiple variants for different use cases:

# Get different topic variants
vaswani = pt.get_dataset('vaswani')
title_topics = vaswani.get_topics('title')           # Title-only queries
desc_topics = vaswani.get_topics('description')      # Description queries
narrative_topics = vaswani.get_topics('narrative')   # Full narrative queries

# Get different index variants
stemmed_index = vaswani.get_index('terrier_stemmed')     # Stemmed index
unstemmed_index = vaswani.get_index('terrier_unstemmed') # Unstemmed index

Advanced Dataset Usage

Custom Dataset Creation

# Create custom dataset from local files
class CustomDataset(pt.datasets.Dataset):
    def __init__(self, topics_file, qrels_file, corpus_path):
        self.topics_file = topics_file
        self.qrels_file = qrels_file  
        self.corpus_path = corpus_path
    
    def get_topics(self):
        return pd.read_csv(self.topics_file)
    
    def get_qrels(self):
        return pd.read_csv(self.qrels_file)
        
    def get_corpus_iter(self):
        # Custom corpus loading logic
        pass

# Register custom dataset
custom_dataset = CustomDataset('/path/to/topics.csv', '/path/to/qrels.csv', '/path/to/corpus/')
pt.datasets.DATASET_MAP['my-custom'] = custom_dataset

Dataset Integration with Pipelines

# Create retrieval pipeline from dataset
dataset = pt.get_dataset('vaswani')
retriever = pt.terrier.Retriever.from_dataset('vaswani', 'terrier_stemmed')

# Evaluate on dataset
topics = dataset.get_topics()
qrels = dataset.get_qrels()

results = retriever.transform(topics)
evaluation = pt.Experiment([retriever], topics, qrels, ['map', 'ndcg'])

Corpus Processing

# Process large corpus efficiently
dataset = pt.get_dataset('msmarco-passage')
corpus_iter = dataset.get_corpus_iter()

# Create custom indexer for corpus
indexer = pt.IterDictIndexer('/path/to/custom_index')
index_ref = indexer.index(corpus_iter)

Multi-Dataset Experiments

# Compare across multiple datasets
datasets = ['vaswani', 'antique', 'trec-robust-2004']
results = []

for dataset_name in datasets:
    dataset = pt.get_dataset(dataset_name)
    retriever = pt.transformer_from_dataset(dataset_name, wmodel='BM25')
    
    topics = dataset.get_topics()
    qrels = dataset.get_qrels()
    
    result = pt.Experiment([retriever], topics, qrels, ['map'])
    result['dataset'] = dataset_name
    results.append(result)

# Combine results
combined_results = pd.concat(results)

Types

from typing import Dict, List, Any, Iterator, Optional, Union
import pandas as pd

# Dataset-specific types
DatasetName = str  # Dataset identifier
TopicVariant = str  # Topic variant name ('title', 'description', etc.)
QrelsVariant = str  # Qrels variant name
IndexVariant = str  # Index variant name ('terrier_stemmed', etc.)
DatasetInfo = Dict[str, Any]  # Dataset metadata
DocumentIterator = Iterator[Dict[str, Any]]  # Document corpus iterator
ProviderInstance = Any  # Dataset provider instance (see the DatasetProvider class above)

Install with Tessl CLI

npx tessl i tessl/pypi-python-terrier

docs

datasets.md

evaluation.md

index.md

indexing.md

java.md

retrieval.md

text-processing.md

transformers.md

utilities.md

tile.json