tessl/pypi-python-terrier

A comprehensive Python API for the Terrier information retrieval platform, enabling declarative experimentation with transformer pipelines for indexing, retrieval, and evaluation tasks.

—

Pending

Overview

Eval results

Files

Indexing

Name: tessl/pypi-python-terrier
Author: tessl

PyTerrier's indexing components provide comprehensive functionality for creating searchable indexes from various document formats. The indexing system supports multiple input formats, customizable text processing pipelines, and flexible index configurations.

Capabilities

Index Factory

Central factory class for creating and managing indexes from various sources including datasets, file collections, and custom iterators.

class IndexFactory:
    """
    Factory class for creating and managing Terrier indexes.

    All methods are static, so they are called on the class itself. The
    signatures are interface stubs: each accepts extra keyword arguments via
    ``**kwargs`` and declares its return type as ``Any``.
    """

    @staticmethod
    def from_dataset(dataset_name: str, variant: Optional[str] = None, **kwargs) -> Any:
        """
        Create an index from a named dataset (e.g. ``'vaswani'``).

        Parameters:
        - dataset_name: Name of the dataset to index
        - variant: Optional dataset variant; defaults to None
          (NOTE(review): how a variant is resolved is not shown here - confirm)
        - **kwargs: Additional options (not specified here)
        """
        ...

    @staticmethod
    def from_trec(path: str, single_file: bool = False, **kwargs) -> Any:
        """
        Create an index from a TREC collection.

        Parameters:
        - path: Location of the TREC collection
        - single_file: Defaults to False (presumably marks ``path`` as one
          file rather than a directory - TODO confirm)
        - **kwargs: Additional options (not specified here)
        """
        ...

    @staticmethod
    def from_xml(path: str, **kwargs) -> Any:
        """
        Create an index from XML input.

        Parameters:
        - path: Location of the XML input
        - **kwargs: Additional options (not specified here)
        """
        ...

    @staticmethod
    def memory(documents: List[Dict[str, Any]], **kwargs) -> Any:
        """
        Create an in-memory index for small collections.

        Parameters:
        - documents: List of document dicts, e.g.
          ``{'docno': 'doc1', 'text': '...'}``
        - **kwargs: Additional options (not specified here)
        """
        ...

Usage Examples:

# Build an index from a named dataset
vaswani_index = pt.terrier.IndexFactory.from_dataset('vaswani')

# Build an index from a TREC collection
trec_index = pt.terrier.IndexFactory.from_trec('/path/to/trec/files')

# Build an in-memory index; intended for small collections
documents = [
    dict(docno='doc1', text='This is the first document'),
    dict(docno='doc2', text='This is the second document'),
]
memory_index = pt.terrier.IndexFactory.memory(documents)

Generic Indexers

Base indexer classes for creating indexes from different input sources and formats.

class TerrierIndexer(Indexer):
    """
    Generic Terrier indexer with configurable text processing pipeline.

    Parameters:
    - index_path: Path where index will be created
    - blocks: Whether to record block information for phrase queries
      (default: False)
    - overwrite: Whether to overwrite existing index (default: False)
    - verbose: Enable verbose output (default: False)
    - meta: Dictionary mapping metadata field names to lengths
      (default: None)
    - stemmer: Stemmer to use ('porter', 'weak_porter', etc.) (default: None)
    - stopwords: Stopword list to use ('terrier', 'smart', etc.)
      (default: None)
    - tokeniser: Tokeniser configuration (default: None)
    - **kwargs: Additional keyword arguments (not specified here)

    NOTE(review): what the indexer does when meta, stemmer, stopwords or
    tokeniser is None is not shown in this stub - confirm against the
    implementation.
    """
    def __init__(self, index_path: str, blocks: bool = False,
                 overwrite: bool = False, verbose: bool = False,
                 meta: Optional[Dict[str, int]] = None,
                 stemmer: Optional[str] = None,
                 stopwords: Optional[str] = None,
                 tokeniser: Optional[str] = None, **kwargs): ...

File-Based Indexers

Specialized indexers for processing file collections and directories.

class FilesIndexer(Indexer):
    """
    Index files from a directory or file list.

    Parameters:
    - index_path: Path where index will be created
    - blocks: Whether to record block information (default: False)
    - verbose: Enable verbose output (default: False)
    - meta: Metadata field configuration (default: None)
    - type: File type ('txt', 'pdf', 'docx', etc.) (default: 'txt')
    - **kwargs: Additional keyword arguments (not specified here)
    """
    # `type` shadows the builtin of the same name; the parameter name is part
    # of the public keyword interface (callers pass type='pdf'), so it is kept.
    def __init__(self, index_path: str, blocks: bool = False,
                 verbose: bool = False,
                 meta: Optional[Dict[str, int]] = None,
                 type: str = 'txt', **kwargs): ...

Usage Example:

# Index the text files found in a directory
files_indexer = pt.FilesIndexer('/path/to/index', verbose=True)
index_ref = files_indexer.index('/path/to/documents/')

# Index PDF files, passing a metadata configuration for title and author
pdf_meta = {'title': 100, 'author': 50}
pdf_indexer = pt.FilesIndexer('/path/to/pdf_index', type='pdf', meta=pdf_meta)
index_ref = pdf_indexer.index('/path/to/pdfs/')

TREC Collection Indexer

Specialized indexer for TREC-formatted document collections with support for various TREC formats.

class TRECCollectionIndexer(Indexer):
    """
    Index TREC-formatted document collections.

    Parameters:
    - index_path: Path where index will be created
    - collection: List of TREC collection files or single file path
      (default: None)
    - blocks: Whether to record block information (default: False)
    - verbose: Enable verbose output (default: False)
    - meta: Metadata field configuration (default: None)
    - **kwargs: Additional keyword arguments (not specified here)
    """
    def __init__(self, index_path: str,
                 collection: Optional[Union[str, List[str]]] = None,
                 blocks: bool = False, verbose: bool = False,
                 meta: Optional[Dict[str, int]] = None, **kwargs): ...

Usage Example:

# Index one TREC file: pass its path as `collection`
single_trec_file = '/path/to/collection.trec'
trec_indexer = pt.TRECCollectionIndexer('/path/to/trec_index', collection=single_trec_file)
index_ref = trec_indexer.index()

# Index several TREC files: pass a list of paths as `collection`
trec_files = ['/path/to/file1.trec', '/path/to/file2.trec']
multi_trec_indexer = pt.TRECCollectionIndexer('/path/to/multi_index', collection=trec_files)
index_ref = multi_trec_indexer.index()

DataFrame Indexer

Indexer for creating indexes directly from pandas DataFrames, enabling in-memory document processing.

class DFIndexer(Indexer):
    """
    Index documents from pandas DataFrame.

    Parameters:
    - index_path: Path where index will be created
    - blocks: Whether to record block information (default: False)
    - verbose: Enable verbose output (default: False)
    - meta: Metadata field configuration (default: None)
    - text_attr: Name of column containing document text (default: 'text')
    - docno_attr: Name of column containing document IDs (default: 'docno')
    - **kwargs: Additional keyword arguments (not specified here)
    """
    def __init__(self, index_path: str, blocks: bool = False,
                 verbose: bool = False,
                 meta: Optional[Dict[str, int]] = None,
                 text_attr: str = 'text', docno_attr: str = 'docno',
                 **kwargs): ...

class DFIndexUtils:
    """Helper routines supporting DataFrame indexing operations."""

    @staticmethod
    def create_df(documents: List[Dict[str, Any]]) -> pd.DataFrame:
        """Stub: accepts a list of document dicts; declared to return a DataFrame."""

    @staticmethod
    def validate_df(df: pd.DataFrame) -> bool:
        """Stub: accepts a DataFrame; declared to return a bool."""

Usage Example:

# Assemble the documents as rows, then wrap them in a DataFrame
document_rows = [
    dict(docno='doc1', text='First document content', title='Document 1'),
    dict(docno='doc2', text='Second document content', title='Document 2'),
]
documents_df = pd.DataFrame(document_rows)

# Index the DataFrame; `meta` keeps the title with a max length of 100
title_meta = {'title': 100}
df_indexer = pt.DFIndexer('/path/to/df_index', meta=title_meta, verbose=True)
index_ref = df_indexer.index(documents_df)

Iterator Dictionary Indexer

Indexer for processing document iterators, useful for streaming large collections without loading everything into memory.

class IterDictIndexer(Indexer):
    """
    Index documents from iterator of dictionaries.

    Parameters:
    - index_path: Path where index will be created
    - blocks: Whether to record block information (default: False)
    - verbose: Enable verbose output (default: False)
    - meta: Metadata field configuration (default: None)
    - text_attr: Name of field containing document text (default: 'text')
    - docno_attr: Name of field containing document IDs (default: 'docno')
    - **kwargs: Additional keyword arguments (not specified here)
    """
    def __init__(self, index_path: str, blocks: bool = False,
                 verbose: bool = False,
                 meta: Optional[Dict[str, int]] = None,
                 text_attr: str = 'text', docno_attr: str = 'docno',
                 **kwargs): ...

Usage Example:

# Define document iterator
def document_iterator():
    """Lazily yield 1000 synthetic documents, one dict per document.

    Each dict carries 'docno', 'text' and 'category' keys; the category
    cycles through ten values (number modulo 10).
    """
    yield from (
        {
            'docno': f'doc_{number}',
            'text': f'This is document number {number} with some content.',
            'category': f'category_{number % 10}',
        }
        for number in range(1000)
    )

# Index the documents produced by the iterator
category_meta = {'category': 20}
iter_indexer = pt.IterDictIndexer('/path/to/iter_index', meta=category_meta, verbose=True)
document_stream = document_iterator()
index_ref = iter_indexer.index(document_stream)

Indexing Configuration

Enumeration and utilities for configuring indexing behavior and text processing pipelines.

class IndexingType:
    """
    Enumeration of indexing types and configurations.

    The members are plain string class attributes (this class does not
    derive from ``enum.Enum``), so each member is equal to its string value.
    """
    # NOTE(review): the names suggest classic, single-pass and in-memory
    # indexing respectively - confirm against the implementation.
    CLASSIC = 'classic'
    SINGLEPASS = 'singlepass'
    MEMORY = 'memory'

# High-level indexing function
def index(iter_dict_or_df, index_path: Optional[str] = None,
          indexer_class: Optional[type] = None, **kwargs) -> Any:
    """
    High-level function for creating indexes from various input types.

    Parameters:
    - iter_dict_or_df: Input data (DataFrame, iterator, or file path)
    - index_path: Where to create the index (default: None)
    - indexer_class: Specific indexer class to use (default: None)
    - **kwargs: Additional indexer parameters

    Returns:
    - IndexRef object for the created index

    NOTE(review): the behaviour when index_path or indexer_class is None is
    not shown in this stub - confirm against the implementation.
    """

Usage Example:

# Input for the high-level indexing function: one dict per document
documents = [
    dict(docno='doc1', text='Document 1 content'),
    dict(docno='doc2', text='Document 2 content'),
]

# Simple indexing: only the documents and the index path are given
index_ref = pt.index(documents, '/path/to/simple_index')

# Indexing with custom parameters, forwarded as keyword arguments
custom_options = {
    'stemmer': 'porter',
    'stopwords': 'smart',
    'blocks': True,
}
index_ref = pt.index(documents, '/path/to/custom_index', **custom_options)

Advanced Indexing Patterns

Custom Text Processing Pipeline

# Collect the text-processing options, then hand them to the indexer
pipeline_options = {
    'stemmer': 'weak_porter',             # weak Porter stemmer
    'stopwords': 'smart',                 # SMART stopword list
    'blocks': True,                       # block information for phrases
    'meta': {'title': 100, 'url': 200},   # metadata fields to include
}
custom_indexer = pt.DFIndexer('/path/to/custom_index', **pipeline_options)

Multi-Field Indexing

# A document carrying several text fields besides the main text
first_document = {
    'docno': 'doc1',
    'text': 'Main document content here',
    'title': 'Document Title',
    'abstract': 'Document abstract or summary',
}
documents = pd.DataFrame([first_document])

# Configure the indexer with a metadata entry for each extra field
field_meta = {'title': 100, 'abstract': 500}
multi_field_indexer = pt.DFIndexer('/path/to/multi_field_index', meta=field_meta, verbose=True)

Incremental Indexing (via Full Rebuild)

# Create base index
# (`initial_documents` is a placeholder that this snippet does not define)
base_indexer = pt.DFIndexer('/path/to/base_index')
base_index = base_indexer.index(initial_documents)

# Add more documents (typically requires rebuilding): a second indexer writes
# to a different index path and indexes `all_documents` (also a placeholder;
# presumably the initial documents plus the new ones)
additional_indexer = pt.DFIndexer('/path/to/updated_index')  
updated_index = additional_indexer.index(all_documents)  # Full rebuild

Types

from typing import Dict, List, Any, Union, Iterator, Optional
import pandas as pd

# Type aliases used by the indexing API

# Opaque handles and locations
IndexRef = Any   # Java IndexRef object
IndexPath = str  # file system path of an index

# Document input
DocumentIterator = Iterator[Dict[str, Any]]  # iterator over document dicts

# Configuration mappings
MetadataConfig = Dict[str, int]        # metadata field name -> max length
TextProcessingConfig = Dict[str, str]  # text processing configuration

Install with Tessl CLI