A comprehensive Python API for the Terrier information retrieval platform, enabling declarative experimentation with transformer pipelines for indexing, retrieval, and evaluation tasks.
—
PyTerrier's retrieval components provide comprehensive search functionality for indexed collections, supporting various weighting models, feature extraction, and text scoring capabilities. The retrieval system is built around the Transformer interface, enabling seamless integration into complex pipelines.
The primary retrieval class that replaces the deprecated BatchRetrieve and TerrierRetrieve classes, providing access to various weighting models and retrieval configurations.
class Retriever(Transformer):
"""
Main retrieval class supporting various weighting models and configurations.
Parameters:
- index_location: Index reference, path, or dataset name
- controls: Dictionary of Terrier controls/properties (optional)
- properties: Dictionary of Terrier properties (optional)
- metadata: List of metadata fields to include in results (default: ["docno"])
- num_results: Maximum number of results to return (optional)
- wmodel: Weighting model name or callable (optional)
- threads: Number of threads for parallel retrieval (default: 1)
- verbose: Enable verbose output (default: False)
"""
def __init__(self, index_location: Union[str, Any],
controls: Optional[Dict[str, str]] = None,
properties: Optional[Dict[str, str]] = None,
metadata: List[str] = ["docno"],
num_results: Optional[int] = None,
wmodel: Optional[Union[str, Callable]] = None,
threads: int = 1,
verbose: bool = False): ...
@staticmethod
def from_dataset(dataset_name: str, variant: str = None, **kwargs) -> 'Retriever': ...

Supported Weighting Models:
- BM25: Okapi BM25 ranking function
- PL2: Divergence from Randomness PL2 model
- TF_IDF: Classic TF-IDF weighting
- DPH: Divergence from Randomness DPH model
- DFR_BM25: Divergence from Randomness version of BM25
- Hiemstra_LM: Hiemstra Language Model
- DirichletLM: Dirichlet Language Model
- JelinekMercerLM: Jelinek-Mercer Language Model

Usage Examples:
# Create retriever from index path
bm25 = pt.terrier.Retriever('/path/to/index', wmodel='BM25')
# Create retriever from dataset
vaswani_retriever = pt.terrier.Retriever.from_dataset('vaswani', 'terrier_stemmed')
# Configure retrieval parameters
pl2 = pt.terrier.Retriever(index_ref, wmodel='PL2',
controls={'c': '1.0'},
num_results=50)
# Include metadata fields
retriever_with_meta = pt.terrier.Retriever(index_ref,
metadata=['docno', 'title', 'url'])
# Perform retrieval
queries = pd.DataFrame([
{'qid': '1', 'query': 'information retrieval'},
{'qid': '2', 'query': 'search engines'}
])
results = bm25.transform(queries)

Retrieval component specialized for extracting ranking features, useful for learning-to-rank and feature analysis.
class FeaturesRetriever(Transformer):
"""
Feature extraction retriever for learning-to-rank applications.
Parameters:
- index_ref: Reference to the index
- features: List of feature names to extract
- controls: Dictionary of Terrier controls
- properties: Dictionary of Terrier properties
"""
def __init__(self, index_ref: Any, features: List[str],
controls: Dict[str, str] = None,
properties: Dict[str, str] = None, **kwargs): ...

Common Features:
- TF: Term frequency
- IDF: Inverse document frequency
- QTFN: Query term frequency normalized
- WMODEL:BM25: BM25 weighting model score
- WMODEL:PL2: PL2 weighting model score
- DOCLEN: Document length
- QLEN: Query length

Usage Example:
# Extract multiple features for learning-to-rank
features_retriever = pt.terrier.FeaturesRetriever(
index_ref,
features=['TF', 'IDF', 'WMODEL:BM25', 'WMODEL:PL2', 'DOCLEN']
)
# Get features for query-document pairs
topics_and_res = pd.DataFrame([
{'qid': '1', 'query': 'information retrieval', 'docno': 'doc1'},
{'qid': '1', 'query': 'information retrieval', 'docno': 'doc2'}
])
features = features_retriever.transform(topics_and_res)

Component for scoring text passages against queries without requiring a pre-built index.
class TextScorer(Transformer):
"""
Score text passages against queries using specified weighting models.
Parameters:
- wmodel: Weighting model to use for scoring (default: 'BM25')
- background_index: Optional background index for IDF statistics
- takes: Specifies input format ('queries' or 'docs')
- body_attr: Attribute name containing text to score (default: 'text')
- verbose: Enable verbose output
"""
def __init__(self, wmodel: str = 'BM25', background_index: Any = None,
takes: str = 'docs', body_attr: str = 'text',
verbose: bool = False, **kwargs): ...

Usage Example:
# Score documents against queries
scorer = pt.terrier.TextScorer(wmodel='BM25')
# Input: queries and documents to score
input_df = pd.DataFrame([
{'qid': '1', 'query': 'machine learning', 'docno': 'doc1',
'text': 'Machine learning is a subset of artificial intelligence...'},
{'qid': '1', 'query': 'machine learning', 'docno': 'doc2',
'text': 'Deep learning uses neural networks for pattern recognition...'}
])
scored_results = scorer.transform(input_df)

Query transformation and expansion capabilities for improving retrieval effectiveness.
# Query rewriting transformers from pt.terrier.rewrite
class SequentialDependenceModel(Transformer):
"""Sequential Dependence Model query rewriting."""
def __init__(self, index_ref: Any, **kwargs): ...
class DependenceModelPrecomputed(Transformer):
"""Precomputed dependence model rewriting."""
def __init__(self, index_ref: Any, **kwargs): ...
class QueryExpansion(Transformer):
"""Relevance feedback based query expansion."""
def __init__(self, index_ref: Any, fb_terms: int = 10, fb_docs: int = 3, **kwargs): ...

Usage Example:
# Sequential dependence model for phrase matching
sdm = pt.terrier.rewrite.SequentialDependenceModel(index_ref)
sdm_pipeline = sdm >> retriever
# Query expansion with relevance feedback
qe = pt.terrier.rewrite.QueryExpansion(index_ref, fb_terms=20, fb_docs=5)
qe_pipeline = retriever >> qe >> retriever

These components are maintained for backward compatibility but issue deprecation warnings:
# Deprecated - use Retriever instead
class BatchRetrieve(Transformer): ...
class TerrierRetrieve(Transformer): ...
# Deprecated - use FeaturesRetriever instead
class FeaturesBatchRetrieve(Transformer): ...

# Two-stage retrieval with reranking
first_stage = pt.terrier.Retriever(index_ref, wmodel='BM25', num_results=1000)
reranker = pt.terrier.Retriever(index_ref, wmodel='PL2')
pipeline = first_stage >> (reranker % 50)  # Rerank top 1000, return top 50

# Extract features for learning-to-rank
feature_pipeline = (
pt.terrier.Retriever(index_ref, num_results=100) >>
pt.terrier.FeaturesRetriever(index_ref, features=['TF', 'IDF', 'WMODEL:BM25'])
)

# Late fusion of multiple retrieval models
bm25 = pt.terrier.Retriever(index_ref, wmodel='BM25')
pl2 = pt.terrier.Retriever(index_ref, wmodel='PL2')
fused = bm25 + pl2  # Add scores from both models

from typing import Dict, List, Any, Union, Optional
import pandas as pd
# Retrieval-specific types
IndexRef = Any # Java IndexRef object
WeightingModel = str # Weighting model identifier
Controls = Dict[str, str] # Terrier control parameters
Properties = Dict[str, str] # Terrier properties
MetadataFields = List[str]  # Metadata field names

Install with Tessl CLI
npx tessl i tessl/pypi-python-terrier