# python-terrier

A comprehensive Python API for the Terrier information retrieval platform, enabling declarative experimentation with transformer pipelines for indexing, retrieval, and evaluation tasks.
```bash
npx @tessl/cli install tessl/pypi-python-terrier@0.13.0
```

## Overview

PyTerrier provides a declarative approach to information retrieval research through composable transformer pipelines that can be chained using Python operators.
## Installation

```bash
pip install python-terrier
```

## Basic Usage

```python
import pyterrier as pt
```

Common imports for working with specific components:

```python
from pyterrier import Transformer, Estimator, Indexer
from pyterrier import Experiment, GridSearch
from pyterrier.terrier import Retriever, IndexFactory
```

## Quick Example

```python
import pyterrier as pt
import pandas as pd
# Initialize PyTerrier (sets up Java VM)
if not pt.java.started():
    pt.java.init()
# Create a simple retrieval pipeline
bm25 = pt.terrier.Retriever.from_dataset('vaswani', 'terrier_stemmed', wmodel='BM25')
# Perform retrieval
queries = pd.DataFrame([
    {'qid': '1', 'query': 'information retrieval'},
    {'qid': '2', 'query': 'search engines'}
])
results = bm25.transform(queries)
print(results.head())
# Chain transformers using operators
dataset = pt.get_dataset('vaswani')
text_getter = pt.text.get_text(dataset)
reranker = pt.terrier.Retriever(dataset.get_index(), wmodel='PL2')
pipeline = bm25 >> text_getter >> reranker
results = pipeline.transform(queries)
# Run experiments with evaluation
topics = pt.get_dataset('vaswani').get_topics()
qrels = pt.get_dataset('vaswani').get_qrels()
evaluation = pt.Experiment([bm25], topics, qrels, ['map', 'ndcg'])
print(evaluation)
```

## Architecture

PyTerrier's architecture is built around several key design patterns:
- Pipeline composition via overloaded Python operators (`>>`, `+`, `**`, etc.)
- Dual DataFrame (`transform()`) and iterator (`transform_iter()`) interfaces

## Core Transformer Framework

Base classes and pipeline operators that form the foundation of PyTerrier's transformer architecture, enabling composable information retrieval pipelines.

```python
class Transformer:
    def transform(self, topics_or_res: pd.DataFrame) -> pd.DataFrame: ...
    def transform_iter(self, input_iter) -> Iterator: ...
    def __rshift__(self, other): ...  # >> operator for composition
    def __add__(self, other): ...     # + operator for score addition
    def __pow__(self, other): ...     # ** operator for feature union
    def __or__(self, other): ...      # | operator for set union
    def __and__(self, other): ...     # & operator for set intersection

class Estimator(Transformer):
    def fit(self, topics_and_res: pd.DataFrame) -> 'Estimator': ...

class Indexer(Transformer):
    def index(self, iter_dict) -> IndexRef: ...
```
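As an illustrative sketch (the class name `LowercaseQuery` is hypothetical), a custom transformer only needs to subclass `pt.Transformer` and implement `transform()`; it then participates in the operator algebra like any built-in component:

```python
import pandas as pd
import pyterrier as pt

class LowercaseQuery(pt.Transformer):
    """Toy query rewriter: lowercases the 'query' column."""
    def transform(self, topics: pd.DataFrame) -> pd.DataFrame:
        topics = topics.copy()
        topics['query'] = topics['query'].str.lower()
        return topics

# Assuming the `bm25` retriever from the quick example above:
pipeline = LowercaseQuery() >> bm25   # rewrite queries, then retrieve
combined = bm25 + pipeline            # sum the two systems' scores per document
```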
## Retrieval

Retrieval components for searching indexed collections, including various weighting models, feature extraction, and text scoring capabilities.

```python
class Retriever(Transformer):
    @staticmethod
    def from_dataset(dataset_name: str, variant: str = None, version: str = 'latest', **kwargs) -> 'Retriever': ...

    def __init__(self, index_location: Union[str, Any],
                 controls: Optional[Dict[str, str]] = None,
                 properties: Optional[Dict[str, str]] = None,
                 metadata: List[str] = ["docno"],
                 num_results: Optional[int] = None,
                 wmodel: Optional[Union[str, Callable]] = None,
                 threads: int = 1,
                 verbose: bool = False): ...

class FeaturesRetriever(Transformer):
    def __init__(self, index_location: Union[str, Any], features: List[str],
                 controls: Optional[Dict[str, str]] = None,
                 properties: Optional[Dict[str, str]] = None,
                 threads: int = 1, **kwargs): ...

class TextScorer(Transformer):
    def __init__(self, wmodel: str = 'BM25', background_index: Any = None,
                 takes: str = 'docs', body_attr: str = 'text',
                 verbose: bool = False, **kwargs): ...
```
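For instance, `controls` overrides weighting-model parameters and `num_results` caps the result list. A minimal sketch reusing the `vaswani` dataset from the quick example (the control name `bm25.b` is assumed from Terrier's BM25 configuration):

```python
import pyterrier as pt

dataset = pt.get_dataset('vaswani')

# BM25 with a custom length-normalisation parameter, truncated to
# the top 100 results per query.
bm25_tuned = pt.terrier.Retriever(
    dataset.get_index(),
    wmodel='BM25',
    controls={'bm25.b': '0.4'},
    num_results=100,
)
results = bm25_tuned.search('information retrieval')
```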
## Indexing

Index creation and management functionality for building searchable collections from various document formats.

```python
class IndexFactory:
    @staticmethod
    def from_dataset(dataset_name: str) -> IndexRef: ...

    @staticmethod
    def from_trec(path: str, **kwargs) -> IndexRef: ...

class FilesIndexer(Indexer):
    def __init__(self, index_path: str, **kwargs): ...

class TRECCollectionIndexer(Indexer):
    def __init__(self, index_path: str, **kwargs): ...

class DFIndexer(Indexer):
    def __init__(self, index_path: str, **kwargs): ...
```
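Following the `Indexer.index(iter_dict)` interface above, an index can be built from any iterable of field dicts. This sketch uses `pt.IterDictIndexer`, PyTerrier's iterator-based indexer (not listed in the sketch above), with a placeholder path:

```python
import pyterrier as pt

docs = [
    {'docno': 'd1', 'text': 'chemical reactions and catalysts'},
    {'docno': 'd2', 'text': 'search engines rank documents'},
]

# './my_index' is a placeholder on-disk location for the new index.
indexer = pt.IterDictIndexer('./my_index')
index_ref = indexer.index(docs)

# The returned IndexRef plugs straight into a Retriever.
bm25 = pt.terrier.Retriever(index_ref, wmodel='BM25')
```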
## Java Integration

Java VM initialization, configuration, and integration with the underlying Terrier platform.

```python
def init(version: str = None, **kwargs) -> None: ...
def started() -> bool: ...
def configure(**kwargs) -> None: ...
def set_memory_limit(memory: str) -> None: ...
def extend_classpath(paths: List[str]) -> None: ...
def set_property(key: str, value: str) -> None: ...
```
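These functions live under the `pt.java` namespace, as in the quick example, and JVM settings must be applied before initialization. A minimal sketch (the memory-limit argument format follows the signature sketched above; verify against your installed version):

```python
import pyterrier as pt

if not pt.java.started():
    pt.java.set_memory_limit('4g')  # heap limit; takes effect at JVM startup
    pt.java.init()
```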
## Datasets

Dataset management for accessing standard IR test collections and creating custom datasets.

```python
def get_dataset(name: str) -> Dataset: ...
def find_datasets(query: str = None, **kwargs) -> List[str]: ...
def list_datasets() -> List[str]: ...

class Dataset:
    def get_topics(self, variant: str = None) -> pd.DataFrame: ...
    def get_qrels(self, variant: str = None) -> pd.DataFrame: ...
    def get_corpus_iter(self, verbose: bool = True) -> Iterator: ...
```
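A typical session discovers a collection and pulls its topics, qrels, and corpus:

```python
import pyterrier as pt

print(pt.find_datasets('vaswani'))   # search the dataset catalogue

dataset = pt.get_dataset('vaswani')
topics = dataset.get_topics()        # columns: qid, query
qrels = dataset.get_qrels()          # columns: qid, docno, label

# Stream the corpus, e.g. to feed an Indexer; each item is a dict
# such as {'docno': ..., 'text': ...}.
for doc in dataset.get_corpus_iter(verbose=False):
    break
```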
## Evaluation and Tuning

Comprehensive evaluation and parameter tuning framework with statistical significance testing.

```python
class Experiment:
    def __init__(self, retr_systems: List[Transformer], topics: pd.DataFrame,
                 qrels: pd.DataFrame, eval_metrics: List[str], **kwargs): ...

class GridSearch:
    def __init__(self, pipeline: Transformer, params: Dict, topics: pd.DataFrame,
                 qrels: pd.DataFrame, metric: str, **kwargs): ...

class GridScan:
    def __init__(self, pipeline: Transformer, params: Dict, topics: pd.DataFrame,
                 qrels: pd.DataFrame, metrics: List[str], **kwargs): ...
```
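For significance testing, `pt.Experiment` accepts a `baseline` index (plus `names` labels), producing paired per-query tests against that system; `GridSearch` and `GridScan` follow the same topics/qrels pattern for parameter tuning. A sketch reusing the retrievers from earlier:

```python
import pyterrier as pt

dataset = pt.get_dataset('vaswani')
bm25 = pt.terrier.Retriever(dataset.get_index(), wmodel='BM25')
pl2 = pt.terrier.Retriever(dataset.get_index(), wmodel='PL2')

# Treat BM25 (index 0) as the baseline: the result table then includes
# per-system deltas and p-values from paired significance tests.
results = pt.Experiment(
    [bm25, pl2],
    dataset.get_topics(),
    dataset.get_qrels(),
    eval_metrics=['map', 'ndcg'],
    names=['BM25', 'PL2'],
    baseline=0,
)
print(results)
```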
## Text Processing

Text processing utilities including stemming, tokenization, stopword removal, and text transformation.

```python
class TerrierStemmer(Transformer):
    def __init__(self, stemmer: str = 'porter'): ...

class TerrierTokeniser(Transformer):
    def __init__(self, **kwargs): ...

class TerrierStopwords(Transformer):
    def __init__(self, stopwords: str = 'terrier'): ...
```
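Since these are sketched as `Transformer` subclasses, they should compose into pipelines like any other stage. A hypothetical sketch, with the import location and constructor arguments assumed from the signatures above (check the installed version's API):

```python
import pandas as pd
# Assumed import location for the classes sketched above.
from pyterrier import TerrierStopwords, TerrierStemmer

# Remove stopwords, then stem, before handing queries to a retriever.
preprocess = TerrierStopwords(stopwords='terrier') >> TerrierStemmer(stemmer='porter')

queries = pd.DataFrame([{'qid': '1', 'query': 'Searching weighted indexes'}])
cleaned = preprocess.transform(queries)
```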
## Utilities

Supporting utilities for DataFrame manipulation, progress tracking, I/O operations, and general helper functions.

```python
def set_tqdm(tqdm_type: str = None) -> None: ...
def coerce_dataframe(input_data) -> pd.DataFrame: ...
def add_ranks(df: pd.DataFrame, single_query: bool = False) -> pd.DataFrame: ...
def autoopen(filename: str, mode: str = 'r', **kwargs): ...
```
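For example, `add_ranks` derives a per-query `rank` column from descending `score` values (exposed as `pt.model.add_ranks` in recent releases):

```python
import pandas as pd
import pyterrier as pt

res = pd.DataFrame([
    {'qid': '1', 'docno': 'd1', 'score': 2.5},
    {'qid': '1', 'docno': 'd2', 'score': 3.1},
])

# d2 is ranked above d1 (highest score first within each qid).
ranked = pt.model.add_ranks(res)
```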
## Types

```python
# Core type definitions used across PyTerrier
from typing import Dict, List, Any, Iterator, Union, Optional, Callable, Sequence, Literal
import pandas as pd
import numpy.typing as npt
IterDictRecord = Dict[str, Any]
IterDict = Iterator[IterDictRecord]
IndexRef = Any # Java IndexRef object
Dataset = Any # Dataset object
TransformerLike = Union['Transformer', Callable[[pd.DataFrame], pd.DataFrame]]
QueryInput = Union[str, Dict[str, str], pd.DataFrame]
WeightingModel = str # Weighting model identifier (e.g., 'BM25', 'PL2')
MetricList = List[str]  # List of evaluation metrics
```