
Utilities

PyTerrier's utility modules provide supporting functionality for DataFrame manipulation, I/O, progress tracking, and debugging, along with general helpers that underpin the core information retrieval capabilities.

Capabilities

General Utilities (pyterrier.utils)

Core utility functions for progress tracking, system information, and general helper functionality.

def set_tqdm(tqdm_type: str = None) -> None:
    """
    Configure progress bar type for PyTerrier operations.
    
    Parameters:
    - tqdm_type: Progress bar type ('tqdm', 'notebook', 'auto', or None to disable)
    """

def entry_points(group: str) -> List[Any]:
    """
    Get package entry points for specified group.
    
    Parameters:
    - group: Entry point group name
    
    Returns:
    - List of entry point objects
    """

def is_windows() -> bool:
    """
    Check if running on Windows operating system.
    
    Returns:
    - True if Windows, False otherwise
    """

def noop(*args, **kwargs) -> None:
    """
    No-operation function that accepts any arguments and does nothing.
    """

def once() -> Callable:
    """
    Decorator that ensures a function can only be called once.
    
    Returns:
    - Decorator function
    """

def get_class_methods(cls: type) -> List[str]:
    """
    Get list of methods defined by a class (not inherited).
    
    Parameters:
    - cls: Class to inspect
    
    Returns:
    - List of method names
    """

def pre_invocation_decorator(decorator: Callable) -> Callable:
    """
    Build a decorator that runs the supplied callable before each
    invocation of the wrapped function.
    
    Parameters:
    - decorator: Callable to run before the wrapped function
    
    Returns:
    - Pre-invocation decorator
    """

def byte_count_to_human_readable(byte_count: int) -> str:
    """
    Convert byte count to human-readable format.
    
    Parameters:
    - byte_count: Number of bytes
    
    Returns:
    - Human-readable string (e.g., '1.2 GB', '45.3 MB')
    """

def temp_env(key: str, value: str):
    """
    Context manager for temporarily setting environment variable.
    
    Parameters:
    - key: Environment variable name
    - value: Temporary value
    
    Returns:
    - Context manager
    """

class GeneratorLen:
    """
    Wrapper for a generator with a known length.
    
    Provides len() support for a generator whose total number of items is
    supplied up front (e.g. so progress bars can report totals), without
    consuming the generator.
    """
    def __init__(self, generator: Iterator, length: int = None): ...
    def __len__(self) -> int: ...
    def __iter__(self) -> Iterator: ...

Usage Examples:

# Configure progress bars
pt.utils.set_tqdm('notebook')  # For Jupyter notebooks
pt.utils.set_tqdm('tqdm')      # For command line
pt.utils.set_tqdm(None)        # Disable progress bars

# System information
if pt.utils.is_windows():
    print("Running on Windows")

# Temporary environment variable
with pt.utils.temp_env('JAVA_HOME', '/custom/java/path'):
    pt.java.init()  # Uses custom Java path

# Human-readable byte counts
size_str = pt.utils.byte_count_to_human_readable(1073741824)  # "1.0 GB"

# Generator with length
def doc_generator():
    for i in range(1000):
        yield {'docno': f'doc_{i}', 'text': f'Document {i}'}

gen_with_len = pt.utils.GeneratorLen(doc_generator(), 1000)
print(f"Generator length: {len(gen_with_len)}")

DataFrame Model Utilities (pyterrier.model)

Utilities for manipulating PyTerrier DataFrames and data structures.

def add_ranks(df: pd.DataFrame, single_query: bool = False) -> pd.DataFrame:
    """
    Add rank column to DataFrame based on score values.
    
    Parameters:
    - df: DataFrame with score column
    - single_query: Whether the DataFrame contains a single query (default: False)
    
    Returns:
    - DataFrame with added 'rank' column
    """

def document_columns(df: pd.DataFrame) -> List[str]:
    """
    Get document-related column names from DataFrame.
    
    Parameters:
    - df: DataFrame to analyze
    
    Returns:
    - List of document-related column names
    """

def query_columns(df: pd.DataFrame, qid: bool = True) -> List[str]:
    """
    Get query-related column names from DataFrame.
    
    Parameters:
    - df: DataFrame to analyze
    - qid: Whether to include 'qid' column (default: True)
    
    Returns:
    - List of query-related column names
    """

def push_queries(df: pd.DataFrame) -> pd.DataFrame:
    """
    Push query columns down one level (rename query -> query_0, query_0 -> query_1, etc.), preserving earlier query formulations.
    
    Parameters:
    - df: DataFrame with query columns
    
    Returns:
    - DataFrame with pushed query columns
    """

def push_queries_dict(input_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Dictionary version of push_queries.
    
    Parameters:
    - input_dict: Dictionary with query fields
    
    Returns:
    - Dictionary with pushed query fields
    """

def pop_queries(df: pd.DataFrame) -> pd.DataFrame:
    """
    Pop query columns (reverse of push_queries).
    
    Parameters:
    - df: DataFrame with pushed query columns
    
    Returns:
    - DataFrame with restored query columns
    """

def ranked_documents_to_queries(topics_and_res: pd.DataFrame) -> pd.DataFrame:
    """
    Extract unique queries from ranked documents DataFrame.
    
    Parameters:
    - topics_and_res: DataFrame with queries and results
    
    Returns:
    - DataFrame with unique queries
    """

def coerce_queries_dataframe(query: Union[str, Dict, pd.DataFrame]) -> pd.DataFrame:
    """
    Convert various input types to standard queries DataFrame.
    
    Parameters:
    - query: Query in various formats (string, dict, DataFrame)
    
    Returns:
    - Standard queries DataFrame with 'qid' and 'query' columns
    """

def coerce_dataframe_types(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Ensure proper column data types for PyTerrier DataFrames.
    
    Parameters:
    - dataframe: DataFrame to type-check
    
    Returns:
    - DataFrame with corrected types
    """

def split_df(df: pd.DataFrame, N: int = None, batch_size: int = None) -> Iterator[pd.DataFrame]:
    """
    Split DataFrame into chunks for batch processing.
    
    Parameters:
    - df: DataFrame to split
    - N: Number of chunks (alternative to batch_size)
    - batch_size: Size of each chunk
    
    Returns:
    - Iterator of DataFrame chunks
    """

# Constants
FIRST_RANK: int = 0  # Starting rank value
STRICT_SORT: bool = False  # Whether to enforce strict sorting

Usage Examples:

# Add ranks to results
results_with_ranks = pt.model.add_ranks(retrieval_results)

# Get column information
doc_cols = pt.model.document_columns(results)
query_cols = pt.model.query_columns(results)

# Query manipulation
pushed_queries = pt.model.push_queries(topics)
restored_queries = pt.model.pop_queries(pushed_queries)

# Convert various query formats
query_df = pt.model.coerce_queries_dataframe("information retrieval")
query_df = pt.model.coerce_queries_dataframe({'qid': '1', 'query': 'search'})

# Batch processing
for batch in pt.model.split_df(large_dataframe, batch_size=1000):
    processed_batch = some_transformer.transform(batch)
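
The remaining helpers can be exercised similarly; a brief sketch, assuming retrieval_results is a standard results DataFrame:

# Recover the unique queries from a results DataFrame
topics_only = pt.model.ranked_documents_to_queries(retrieval_results)

# Normalise column dtypes (e.g. string qids, numeric scores)
typed_results = pt.model.coerce_dataframe_types(retrieval_results)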

I/O Utilities (pyterrier.io)

File input/output utilities with support for various formats and compression.

def autoopen(filename: str, mode: str = 'r', **kwargs):
    """
    Automatically handle file opening with compression detection.
    
    Parameters:
    - filename: File path (supports .gz, .bz2, .xz compression)
    - mode: File opening mode ('r', 'w', 'rb', 'wb', etc.)
    - **kwargs: Additional arguments for file opening
    
    Returns:
    - File handle with appropriate compression handling
    """

def finalized_open(path: str, mode: str):
    """
    Atomic file writing context manager.
    
    Parameters:
    - path: Target file path
    - mode: File opening mode
    
    Returns:
    - Context manager for atomic file writing
    """

def find_files(directory: str) -> List[str]:
    """
    Recursively find all files in directory.
    
    Parameters:
    - directory: Directory path to search
    
    Returns:
    - List of file paths
    """

def coerce_dataframe(obj: Any) -> pd.DataFrame:
    """
    Convert various object types to DataFrame.
    
    Parameters:
    - obj: Object to convert (dict, list, etc.)
    
    Returns:
    - Converted DataFrame
    """

Usage Examples:

import json

# Automatic compression handling (use explicit text modes with compressed
# files, since gzip/bz2 treat plain 'r'/'w' as binary)
with pt.io.autoopen('data.txt.gz', 'rt') as f:
    content = f.read()

with pt.io.autoopen('results.json.bz2', 'wt') as f:
    json.dump(data, f)

# Atomic file writing
with pt.io.finalized_open('important_results.txt', 'w') as f:
    f.write("Critical data")  # Only written if no exceptions

# File discovery
all_files = pt.io.find_files('/path/to/documents')
text_files = [f for f in all_files if f.endswith('.txt')]

# DataFrame conversion
df = pt.io.coerce_dataframe([{'docno': 'doc1', 'text': 'content'}])

Debugging Utilities (pyterrier.debug)

Debugging and inspection utilities for PyTerrier pipelines.

def print_columns(by_query: bool = False, message: str = None) -> Transformer:
    """
    Debug transformer that prints DataFrame column information.
    
    Parameters:
    - by_query: Whether to group output by query (default: False)
    - message: Optional message to print with column information
    
    Returns:
    - Transformer that prints column info and passes data through
    """

Usage Example:

# Debug pipeline by printing column information
debug_pipeline = (
    retriever >>
    pt.debug.print_columns(message="After retrieval") >>
    reranker >>
    pt.debug.print_columns(message="After reranking", by_query=True) >>
    (lambda df: df.head(10))  # Final cutoff
)

results = debug_pipeline.transform(topics)

DataFrame Creation Utilities (pyterrier.new)

Utilities for creating standard PyTerrier DataFrames.

def empty_Q() -> pd.DataFrame:
    """
    Create empty queries DataFrame with standard columns.
    
    Returns:
    - Empty DataFrame with 'qid' and 'query' columns
    """

def queries(queries: Union[List[str], Dict[str, str]], qid: str = None, **others) -> pd.DataFrame:
    """
    Create queries DataFrame from various input formats.
    
    Parameters:
    - queries: Query data (list of strings, dict mapping qid->query, etc.)
    - qid: Base qid for auto-generated IDs
    - **others: Additional columns to include
    
    Returns:
    - Standard queries DataFrame
    """

def empty_R() -> pd.DataFrame:
    """
    Create empty results DataFrame with standard columns.
    
    Returns:
    - Empty DataFrame with 'qid', 'docno', 'score', 'rank' columns
    """

def ranked_documents(topics: pd.DataFrame = None, docnos: List[str] = None, 
                    scores: List[float] = None, **others) -> pd.DataFrame:
    """
    Create ranked documents DataFrame.
    
    Parameters:
    - topics: Topics DataFrame to associate with documents
    - docnos: List of document IDs
    - scores: List of relevance scores
    - **others: Additional columns to include
    
    Returns:
    - Standard ranked documents DataFrame
    """

Usage Examples:

# Create queries DataFrame
topics = pt.new.queries([
    "information retrieval",
    "search engines", 
    "natural language processing"
])

topics_with_ids = pt.new.queries({
    'q1': 'machine learning',
    'q2': 'deep learning',
    'q3': 'neural networks'
})

# Create results DataFrame
results = pt.new.ranked_documents(
    topics=topics,
    docnos=['doc1', 'doc2', 'doc3'],
    scores=[0.95, 0.87, 0.76]
)

# Empty DataFrames for initialization
empty_queries = pt.new.empty_Q()
empty_results = pt.new.empty_R()

Inspection Utilities (pyterrier.inspect)

Utilities for inspecting PyTerrier objects and artifacts.

def artifact_type_format(artifact: Any) -> Tuple[str, str]:
    """
    Get artifact type and format information.
    
    Parameters:
    - artifact: Artifact object to inspect
    
    Returns:
    - Tuple of (type_name, format_name)
    """

Learning-to-Rank Utilities (pyterrier.ltr)

Utilities for learning-to-rank applications and feature analysis.

class AblateFeatures(Transformer):
    """
    Feature ablation transformer for analyzing feature importance.
    
    Systematically removes features to study their impact on ranking performance.
    """
    def __init__(self, features_to_ablate: List[str] = None): ...
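
Usage Example:

A hedged sketch following the constructor above; the feature names and the pipeline stages feature_extractor and ltr_model are illustrative placeholders:

# Drop selected features before scoring to measure their contribution
ablated_pipeline = feature_extractor >> pt.ltr.AblateFeatures(['bm25', 'pagerank']) >> ltr_model
full_pipeline = feature_extractor >> ltr_model

# Comparing both pipelines in pt.Experiment shows the ablated features' impact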

Parallel Processing Utilities (pyterrier.parallel)

Utilities for parallel processing and distributed execution.

class PoolParallelTransformer(Transformer):
    """
    Wrapper transformer for parallel execution using process pools.
    
    Parameters:
    - transformer: Base transformer to parallelize
    - jobs: Number of parallel processes
    - backend: Parallel backend ('joblib', 'ray')
    """
    def __init__(self, transformer: Transformer, jobs: int = 2, backend: str = 'joblib'): ...
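
Usage Example:

A minimal sketch following the constructor above; retriever and topics are assumed to be defined elsewhere:

# Partition incoming queries across 4 worker processes
par_retriever = pt.parallel.PoolParallelTransformer(retriever, jobs=4, backend='joblib')
results = par_retriever.transform(topics)

In practice, PyTerrier transformers also expose a parallel() convenience method that wraps this machinery (e.g. retriever.parallel(4)).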

Types

from typing import Dict, List, Any, Union, Optional, Iterator, Callable, Tuple
import pandas as pd

# Utility types
ProgressBarType = str  # Progress bar implementation name
EnvironmentVariable = Tuple[str, str]  # Environment variable key-value pair
ByteCount = int  # Number of bytes
HumanReadableSize = str  # Human-readable size string (e.g., '1.2 GB')
ColumnNames = List[str]  # List of DataFrame column names
BatchSize = int  # Batch processing size
ChunkCount = int  # Number of chunks for splitting
QueryInput = Union[str, Dict[str, str], pd.DataFrame]  # Various query input formats
DataFrameChunk = pd.DataFrame  # DataFrame chunk for batch processing
ArtifactInfo = Tuple[str, str]  # Artifact type and format information
