A comprehensive Python API for the Terrier information retrieval platform, enabling declarative experimentation with transformer pipelines for indexing, retrieval, and evaluation tasks.
—
PyTerrier's utility modules provide supporting functionality for DataFrame manipulation, I/O operations, progress tracking, debugging, and general helper functions that support the core information retrieval capabilities.
pyterrier.utils — Core utility functions for progress tracking, system information, and general helper functionality.
def set_tqdm(tqdm_type: str = None) -> None:
"""
Configure progress bar type for PyTerrier operations.
Parameters:
- tqdm_type: Progress bar type ('tqdm', 'notebook', 'auto', or None to disable)
"""
def entry_points(group: str) -> List[Any]:
"""
Get package entry points for specified group.
Parameters:
- group: Entry point group name
Returns:
- List of entry point objects
"""
def is_windows() -> bool:
"""
Check if running on Windows operating system.
Returns:
- True if Windows, False otherwise
"""
def noop(*args, **kwargs) -> None:
"""
No-operation function that accepts any arguments and does nothing.
"""
def once() -> Callable:
"""
Decorator that ensures a function can only be called once.
Returns:
- Decorator function
"""
def get_class_methods(cls: type) -> List[str]:
"""
Get list of methods defined by a class (not inherited).
Parameters:
- cls: Class to inspect
Returns:
- List of method names
"""
def pre_invocation_decorator(decorator: Callable) -> Callable:
"""
Create decorator that runs before method invocation.
Parameters:
- decorator: Decorator function to apply
Returns:
- Pre-invocation decorator
"""
def byte_count_to_human_readable(byte_count: int) -> str:
"""
Convert byte count to human-readable format.
Parameters:
- byte_count: Number of bytes
Returns:
- Human-readable string (e.g., '1.2 GB', '45.3 MB')
"""
def temp_env(key: str, value: str):
"""
Context manager for temporarily setting environment variable.
Parameters:
- key: Environment variable name
- value: Temporary value
Returns:
- Context manager
"""
class GeneratorLen:
"""
Wrapper for generator that tracks length.
Provides len() support for generators by storing a length supplied at construction, without consuming the generator.
"""
def __init__(self, generator: Iterator, length: int = None): ...
def __len__(self) -> int: ...
def __iter__(self) -> Iterator: ...

Usage Examples:
# Configure progress bars
pt.utils.set_tqdm('notebook') # For Jupyter notebooks
pt.utils.set_tqdm('tqdm') # For command line
pt.utils.set_tqdm(None) # Disable progress bars
# System information
if pt.utils.is_windows():
print("Running on Windows")
# Temporary environment variable
with pt.utils.temp_env('JAVA_HOME', '/custom/java/path'):
pt.java.init() # Uses custom Java path
# Human-readable byte counts
size_str = pt.utils.byte_count_to_human_readable(1073741824) # "1.0 GB"
# Generator with length
def doc_generator():
for i in range(1000):
yield {'docno': f'doc_{i}', 'text': f'Document {i}'}
gen_with_len = pt.utils.GeneratorLen(doc_generator(), 1000)
print(f"Generator length: {len(gen_with_len)}")

pyterrier.model — Utilities for manipulating PyTerrier DataFrames and data structures.
def add_ranks(df: pd.DataFrame, single_query: bool = False) -> pd.DataFrame:
"""
Add rank column to DataFrame based on score values.
Parameters:
- df: DataFrame with score column
- single_query: Whether DataFrame contains single query (default: False)
Returns:
- DataFrame with added 'rank' column
"""
def document_columns(df: pd.DataFrame) -> List[str]:
"""
Get document-related column names from DataFrame.
Parameters:
- df: DataFrame to analyze
Returns:
- List of document-related column names
"""
def query_columns(df: pd.DataFrame, qid: bool = True) -> List[str]:
"""
Get query-related column names from DataFrame.
Parameters:
- df: DataFrame to analyze
- qid: Whether to include 'qid' column (default: True)
Returns:
- List of query-related column names
"""
def push_queries(df: pd.DataFrame) -> pd.DataFrame:
"""
Push query columns (rename query -> query_0, etc.).
Parameters:
- df: DataFrame with query columns
Returns:
- DataFrame with pushed query columns
"""
def push_queries_dict(input_dict: Dict[str, Any]) -> Dict[str, Any]:
"""
Dictionary version of push_queries.
Parameters:
- input_dict: Dictionary with query fields
Returns:
- Dictionary with pushed query fields
"""
def pop_queries(df: pd.DataFrame) -> pd.DataFrame:
"""
Pop query columns (reverse of push_queries).
Parameters:
- df: DataFrame with pushed query columns
Returns:
- DataFrame with restored query columns
"""
def ranked_documents_to_queries(topics_and_res: pd.DataFrame) -> pd.DataFrame:
"""
Extract unique queries from ranked documents DataFrame.
Parameters:
- topics_and_res: DataFrame with queries and results
Returns:
- DataFrame with unique queries
"""
def coerce_queries_dataframe(query: Union[str, Dict, pd.DataFrame]) -> pd.DataFrame:
"""
Convert various input types to standard queries DataFrame.
Parameters:
- query: Query in various formats (string, dict, DataFrame)
Returns:
- Standard queries DataFrame with 'qid' and 'query' columns
"""
def coerce_dataframe_types(dataframe: pd.DataFrame) -> pd.DataFrame:
"""
Ensure proper column data types for PyTerrier DataFrames.
Parameters:
- dataframe: DataFrame to type-check
Returns:
- DataFrame with corrected types
"""
def split_df(df: pd.DataFrame, N: int = None, batch_size: int = None) -> Iterator[pd.DataFrame]:
"""
Split DataFrame into chunks for batch processing.
Parameters:
- df: DataFrame to split
- N: Number of chunks (alternative to batch_size)
- batch_size: Size of each chunk
Returns:
- Iterator of DataFrame chunks
"""
# Constants
FIRST_RANK: int = 0 # Starting rank value
STRICT_SORT: bool = False # Whether to enforce strict sorting

Usage Examples:
# Add ranks to results
results_with_ranks = pt.model.add_ranks(retrieval_results)
# Get column information
doc_cols = pt.model.document_columns(results)
query_cols = pt.model.query_columns(results)
# Query manipulation
pushed_queries = pt.model.push_queries(topics)
restored_queries = pt.model.pop_queries(pushed_queries)
# Convert various query formats
query_df = pt.model.coerce_queries_dataframe("information retrieval")
query_df = pt.model.coerce_queries_dataframe({'qid': '1', 'query': 'search'})
# Batch processing
for batch in pt.model.split_df(large_dataframe, batch_size=1000):
processed_batch = some_transformer.transform(batch)

pyterrier.io — File input/output utilities with support for various formats and compression.
def autoopen(filename: str, mode: str = 'r', **kwargs):
"""
Automatically handle file opening with compression detection.
Parameters:
- filename: File path (supports .gz, .bz2, .xz compression)
- mode: File opening mode ('r', 'w', 'rb', 'wb', etc.)
- **kwargs: Additional arguments for file opening
Returns:
- File handle with appropriate compression handling
"""
def finalized_open(path: str, mode: str):
"""
Atomic file writing context manager.
Parameters:
- path: Target file path
- mode: File opening mode
Returns:
- Context manager for atomic file writing
"""
def find_files(directory: str) -> List[str]:
"""
Recursively find all files in directory.
Parameters:
- directory: Directory path to search
Returns:
- List of file paths
"""
def coerce_dataframe(obj: Any) -> pd.DataFrame:
"""
Convert various object types to DataFrame.
Parameters:
- obj: Object to convert (dict, list, etc.)
Returns:
- Converted DataFrame
"""

Usage Examples:
# Automatic compression handling
with pt.io.autoopen('data.txt.gz', 'r') as f:
content = f.read()
with pt.io.autoopen('results.json.bz2', 'w') as f:
json.dump(data, f)
# Atomic file writing
with pt.io.finalized_open('important_results.txt', 'w') as f:
f.write("Critical data") # Only written if no exceptions
# File discovery
all_files = pt.io.find_files('/path/to/documents')
text_files = [f for f in all_files if f.endswith('.txt')]
# DataFrame conversion
df = pt.io.coerce_dataframe([{'docno': 'doc1', 'text': 'content'}])

pyterrier.debug — Debugging and inspection utilities for PyTerrier pipelines.
def print_columns(by_query: bool = False, message: str = None) -> Transformer:
"""
Debug transformer that prints DataFrame column information.
Parameters:
- by_query: Whether to group output by query (default: False)
- message: Optional message to print with column information
Returns:
- Transformer that prints column info and passes data through
"""

Usage Example:
# Debug pipeline by printing column information
debug_pipeline = (
retriever >>
pt.debug.print_columns(message="After retrieval") >>
reranker >>
pt.debug.print_columns(message="After reranking", by_query=True) >>
(lambda df: df.head(10)) # Final cutoff
)
results = debug_pipeline.transform(topics)

pyterrier.new — Utilities for creating standard PyTerrier DataFrames.
def empty_Q() -> pd.DataFrame:
"""
Create empty queries DataFrame with standard columns.
Returns:
- Empty DataFrame with 'qid' and 'query' columns
"""
def queries(queries: Union[List[str], Dict[str, str]], qid: str = None, **others) -> pd.DataFrame:
"""
Create queries DataFrame from various input formats.
Parameters:
- queries: Query data (list of strings, dict mapping qid->query, etc.)
- qid: Base qid for auto-generated IDs
- **others: Additional columns to include
Returns:
- Standard queries DataFrame
"""
def empty_R() -> pd.DataFrame:
"""
Create empty results DataFrame with standard columns.
Returns:
- Empty DataFrame with 'qid', 'docno', 'score', 'rank' columns
"""
def ranked_documents(topics: pd.DataFrame = None, docnos: List[str] = None,
scores: List[float] = None, **others) -> pd.DataFrame:
"""
Create ranked documents DataFrame.
Parameters:
- topics: Topics DataFrame to associate with documents
- docnos: List of document IDs
- scores: List of relevance scores
- **others: Additional columns to include
Returns:
- Standard ranked documents DataFrame
"""

Usage Examples:
# Create queries DataFrame
topics = pt.new.queries([
"information retrieval",
"search engines",
"natural language processing"
])
topics_with_ids = pt.new.queries({
'q1': 'machine learning',
'q2': 'deep learning',
'q3': 'neural networks'
})
# Create results DataFrame
results = pt.new.ranked_documents(
topics=topics,
docnos=['doc1', 'doc2', 'doc3'],
scores=[0.95, 0.87, 0.76]
)
# Empty DataFrames for initialization
empty_queries = pt.new.empty_Q()
empty_results = pt.new.empty_R()

pyterrier.inspect — Utilities for inspecting PyTerrier objects and artifacts.
def artifact_type_format(artifact: Any) -> Tuple[str, str]:
"""
Get artifact type and format information.
Parameters:
- artifact: Artifact object to inspect
Returns:
- Tuple of (type_name, format_name)
"""

pyterrier.ltr — Utilities for learning-to-rank applications and feature analysis.
class AblateFeatures(Transformer):
"""
Feature ablation transformer for analyzing feature importance.
Systematically removes features to study their impact on ranking performance.
"""
def __init__(self, features_to_ablate: List[str] = None): ...

pyterrier.parallel — Utilities for parallel processing and distributed execution.
class PoolParallelTransformer(Transformer):
"""
Wrapper transformer for parallel execution using process pools.
Parameters:
- transformer: Base transformer to parallelize
- jobs: Number of parallel processes
- backend: Parallel backend ('joblib', 'ray')
"""
def __init__(self, transformer: Transformer, jobs: int = 2, backend: str = 'joblib'): ...

from typing import Dict, List, Any, Union, Optional, Iterator, Callable, Tuple
import pandas as pd
# Utility types
ProgressBarType = str # Progress bar implementation name
EnvironmentVariable = Tuple[str, str] # Environment variable key-value pair
ByteCount = int # Number of bytes
HumanReadableSize = str # Human-readable size string (e.g., '1.2 GB')
ColumnNames = List[str] # List of DataFrame column names
BatchSize = int # Batch processing size
ChunkCount = int # Number of chunks for splitting
QueryInput = Union[str, Dict[str, str], pd.DataFrame] # Various query input formats
DataFrameChunk = pd.DataFrame # DataFrame chunk for batch processing
ArtifactInfo = Tuple[str, str] # Artifact type and format information

Install with Tessl CLI
npx tessl i tessl/pypi-python-terrier