A comprehensive Python API for the Terrier information retrieval platform, enabling declarative experimentation with transformer pipelines for indexing, retrieval, and evaluation tasks.
—
PyTerrier's utility modules provide supporting functionality for DataFrame manipulation, I/O operations, progress tracking, debugging, and general helper functions that support the core information retrieval capabilities.
pyterrier.utils — Core utility functions for progress tracking, system information, and general helper functionality.
def set_tqdm(tqdm_type: str = None) -> None:
"""
Configure progress bar type for PyTerrier operations.
Parameters:
- tqdm_type: Progress bar type ('tqdm', 'notebook', 'auto', or None to disable)
"""
def entry_points(group: str) -> List[Any]:
"""
Get package entry points for specified group.
Parameters:
- group: Entry point group name
Returns:
- List of entry point objects
"""
def is_windows() -> bool:
"""
Check if running on Windows operating system.
Returns:
- True if Windows, False otherwise
"""
def noop(*args, **kwargs) -> None:
"""
No-operation function that accepts any arguments and does nothing.
"""
def once() -> Callable:
"""
Decorator that ensures a function can only be called once.
Returns:
- Decorator function
"""
def get_class_methods(cls: type) -> List[str]:
"""
Get list of methods defined by a class (not inherited).
Parameters:
- cls: Class to inspect
Returns:
- List of method names
"""
def pre_invocation_decorator(decorator: Callable) -> Callable:
"""
Create decorator that runs before method invocation.
Parameters:
- decorator: Decorator function to apply
Returns:
- Pre-invocation decorator
"""
def byte_count_to_human_readable(byte_count: int) -> str:
"""
Convert byte count to human-readable format.
Parameters:
- byte_count: Number of bytes
Returns:
- Human-readable string (e.g., '1.2 GB', '45.3 MB')
"""
def temp_env(key: str, value: str):
"""
Context manager for temporarily setting environment variable.
Parameters:
- key: Environment variable name
- value: Temporary value
Returns:
- Context manager
"""
class GeneratorLen:
"""
Wrapper for generator that tracks length.
Provides len() support for generators by storing a length supplied at construction, without consuming the generator.
"""
def __init__(self, generator: Iterator, length: int = None): ...
def __len__(self) -> int: ...
def __iter__(self) -> Iterator: ...

Usage Examples:
# Configure progress bars
pt.utils.set_tqdm('notebook') # For Jupyter notebooks
pt.utils.set_tqdm('tqdm') # For command line
pt.utils.set_tqdm(None) # Disable progress bars
# System information
if pt.utils.is_windows():
print("Running on Windows")
# Temporary environment variable
with pt.utils.temp_env('JAVA_HOME', '/custom/java/path'):
pt.java.init() # Uses custom Java path
# Human-readable byte counts
size_str = pt.utils.byte_count_to_human_readable(1073741824) # "1.0 GB"
# Generator with length
def doc_generator():
for i in range(1000):
yield {'docno': f'doc_{i}', 'text': f'Document {i}'}
gen_with_len = pt.utils.GeneratorLen(doc_generator(), 1000)
print(f"Generator length: {len(gen_with_len)}")

pyterrier.model — Utilities for manipulating PyTerrier DataFrames and data structures.
def add_ranks(df: pd.DataFrame, single_query: bool = False) -> pd.DataFrame:
"""
Add rank column to DataFrame based on score values.
Parameters:
- df: DataFrame with score column
- single_query: Whether DataFrame contains single query (default: False)
Returns:
- DataFrame with added 'rank' column
"""
def document_columns(df: pd.DataFrame) -> List[str]:
"""
Get document-related column names from DataFrame.
Parameters:
- df: DataFrame to analyze
Returns:
- List of document-related column names
"""
def query_columns(df: pd.DataFrame, qid: bool = True) -> List[str]:
"""
Get query-related column names from DataFrame.
Parameters:
- df: DataFrame to analyze
- qid: Whether to include 'qid' column (default: True)
Returns:
- List of query-related column names
"""
def push_queries(df: pd.DataFrame) -> pd.DataFrame:
"""
Push query columns (rename query -> query_0, etc.).
Parameters:
- df: DataFrame with query columns
Returns:
- DataFrame with pushed query columns
"""
def push_queries_dict(input_dict: Dict[str, Any]) -> Dict[str, Any]:
"""
Dictionary version of push_queries.
Parameters:
- input_dict: Dictionary with query fields
Returns:
- Dictionary with pushed query fields
"""
def pop_queries(df: pd.DataFrame) -> pd.DataFrame:
"""
Pop query columns (reverse of push_queries).
Parameters:
- df: DataFrame with pushed query columns
Returns:
- DataFrame with restored query columns
"""
def ranked_documents_to_queries(topics_and_res: pd.DataFrame) -> pd.DataFrame:
"""
Extract unique queries from ranked documents DataFrame.
Parameters:
- topics_and_res: DataFrame with queries and results
Returns:
- DataFrame with unique queries
"""
def coerce_queries_dataframe(query: Union[str, Dict, pd.DataFrame]) -> pd.DataFrame:
"""
Convert various input types to standard queries DataFrame.
Parameters:
- query: Query in various formats (string, dict, DataFrame)
Returns:
- Standard queries DataFrame with 'qid' and 'query' columns
"""
def coerce_dataframe_types(dataframe: pd.DataFrame) -> pd.DataFrame:
"""
Ensure proper column data types for PyTerrier DataFrames.
Parameters:
- dataframe: DataFrame to type-check
Returns:
- DataFrame with corrected types
"""
def split_df(df: pd.DataFrame, N: int = None, batch_size: int = None) -> Iterator[pd.DataFrame]:
"""
Split DataFrame into chunks for batch processing.
Parameters:
- df: DataFrame to split
- N: Number of chunks (alternative to batch_size)
- batch_size: Size of each chunk
Returns:
- Iterator of DataFrame chunks
"""
# Constants
FIRST_RANK: int = 0 # Starting rank value
STRICT_SORT: bool = False # Whether to enforce strict sorting

Usage Examples:
# Add ranks to results
results_with_ranks = pt.model.add_ranks(retrieval_results)
# Get column information
doc_cols = pt.model.document_columns(results)
query_cols = pt.model.query_columns(results)
# Query manipulation
pushed_queries = pt.model.push_queries(topics)
restored_queries = pt.model.pop_queries(pushed_queries)
# Convert various query formats
query_df = pt.model.coerce_queries_dataframe("information retrieval")
query_df = pt.model.coerce_queries_dataframe({'qid': '1', 'query': 'search'})
# Batch processing
for batch in pt.model.split_df(large_dataframe, batch_size=1000):
processed_batch = some_transformer.transform(batch)

pyterrier.io — File input/output utilities with support for various formats and compression.
def autoopen(filename: str, mode: str = 'r', **kwargs):
"""
Automatically handle file opening with compression detection.
Parameters:
- filename: File path (supports .gz, .bz2, .xz compression)
- mode: File opening mode ('r', 'w', 'rb', 'wb', etc.)
- **kwargs: Additional arguments for file opening
Returns:
- File handle with appropriate compression handling
"""
def finalized_open(path: str, mode: str):
"""
Atomic file writing context manager.
Parameters:
- path: Target file path
- mode: File opening mode
Returns:
- Context manager for atomic file writing
"""
def find_files(directory: str) -> List[str]:
"""
Recursively find all files in directory.
Parameters:
- directory: Directory path to search
Returns:
- List of file paths
"""
def coerce_dataframe(obj: Any) -> pd.DataFrame:
"""
Convert various object types to DataFrame.
Parameters:
- obj: Object to convert (dict, list, etc.)
Returns:
- Converted DataFrame
"""

Usage Examples:
# Automatic compression handling
with pt.io.autoopen('data.txt.gz', 'r') as f:
content = f.read()
with pt.io.autoopen('results.json.bz2', 'w') as f:
json.dump(data, f)
# Atomic file writing
with pt.io.finalized_open('important_results.txt', 'w') as f:
f.write("Critical data") # Only written if no exceptions
# File discovery
all_files = pt.io.find_files('/path/to/documents')
text_files = [f for f in all_files if f.endswith('.txt')]
# DataFrame conversion
df = pt.io.coerce_dataframe([{'docno': 'doc1', 'text': 'content'}])

pyterrier.debug — Debugging and inspection utilities for PyTerrier pipelines.
def print_columns(by_query: bool = False, message: str = None) -> Transformer:
"""
Debug transformer that prints DataFrame column information.
Parameters:
- by_query: Whether to group output by query (default: False)
- message: Optional message to print with column information
Returns:
- Transformer that prints column info and passes data through
"""

Usage Example:
# Debug pipeline by printing column information
debug_pipeline = (
retriever >>
pt.debug.print_columns(message="After retrieval") >>
reranker >>
pt.debug.print_columns(message="After reranking", by_query=True) >>
(lambda df: df.head(10)) # Final cutoff
)
results = debug_pipeline.transform(topics)

pyterrier.new — Utilities for creating standard PyTerrier DataFrames.
def empty_Q() -> pd.DataFrame:
"""
Create empty queries DataFrame with standard columns.
Returns:
- Empty DataFrame with 'qid' and 'query' columns
"""
def queries(queries: Union[List[str], Dict[str, str]], qid: str = None, **others) -> pd.DataFrame:
"""
Create queries DataFrame from various input formats.
Parameters:
- queries: Query data (list of strings, dict mapping qid->query, etc.)
- qid: Base qid for auto-generated IDs
- **others: Additional columns to include
Returns:
- Standard queries DataFrame
"""
def empty_R() -> pd.DataFrame:
"""
Create empty results DataFrame with standard columns.
Returns:
- Empty DataFrame with 'qid', 'docno', 'score', 'rank' columns
"""
def ranked_documents(topics: pd.DataFrame = None, docnos: List[str] = None,
scores: List[float] = None, **others) -> pd.DataFrame:
"""
Create ranked documents DataFrame.
Parameters:
- topics: Topics DataFrame to associate with documents
- docnos: List of document IDs
- scores: List of relevance scores
- **others: Additional columns to include
Returns:
- Standard ranked documents DataFrame
"""

Usage Examples:
# Create queries DataFrame
topics = pt.new.queries([
"information retrieval",
"search engines",
"natural language processing"
])
topics_with_ids = pt.new.queries({
'q1': 'machine learning',
'q2': 'deep learning',
'q3': 'neural networks'
})
# Create results DataFrame
results = pt.new.ranked_documents(
topics=topics,
docnos=['doc1', 'doc2', 'doc3'],
scores=[0.95, 0.87, 0.76]
)
# Empty DataFrames for initialization
empty_queries = pt.new.empty_Q()
empty_results = pt.new.empty_R()

pyterrier.inspect — Utilities for inspecting PyTerrier objects and artifacts.
def artifact_type_format(artifact: Any) -> Tuple[str, str]:
"""
Get artifact type and format information.
Parameters:
- artifact: Artifact object to inspect
Returns:
- Tuple of (type_name, format_name)
"""

pyterrier.ltr — Utilities for learning-to-rank applications and feature analysis.
class AblateFeatures(Transformer):
"""
Feature ablation transformer for analyzing feature importance.
Systematically removes features to study their impact on ranking performance.
"""
def __init__(self, features_to_ablate: List[str] = None): ...

pyterrier.parallel — Utilities for parallel processing and distributed execution.
class PoolParallelTransformer(Transformer):
"""
Wrapper transformer for parallel execution using process pools.
Parameters:
- transformer: Base transformer to parallelize
- jobs: Number of parallel processes
- backend: Parallel backend ('joblib', 'ray')
"""
def __init__(self, transformer: Transformer, jobs: int = 2, backend: str = 'joblib'): ...

from typing import Dict, List, Any, Union, Optional, Iterator, Callable, Tuple
import pandas as pd
# Utility types
ProgressBarType = str # Progress bar implementation name
EnvironmentVariable = Tuple[str, str] # Environment variable key-value pair
ByteCount = int # Number of bytes
HumanReadableSize = str # Human-readable size string (e.g., '1.2 GB')
ColumnNames = List[str] # List of DataFrame column names
BatchSize = int # Batch processing size
ChunkCount = int # Number of chunks for splitting
QueryInput = Union[str, Dict[str, str], pd.DataFrame] # Various query input formats
DataFrameChunk = pd.DataFrame # DataFrame chunk for batch processing
ArtifactInfo = Tuple[str, str] # Artifact type and format information

Install with Tessl CLI
npx tessl i tessl/pypi-python-terrier