A comprehensive Python API for the Terrier information retrieval platform, enabling declarative experimentation with transformer pipelines for indexing, retrieval, and evaluation tasks.
—
PyTerrier's indexing components provide comprehensive functionality for creating searchable indexes from various document formats. The indexing system supports multiple input formats, customizable text processing pipelines, and flexible index configurations.
Central factory class for creating and managing indexes from various sources including datasets, file collections, and custom iterators.
class IndexFactory:
    """
    Factory class for creating and managing Terrier indexes.

    Static constructors build an index from a named dataset, a
    TREC-formatted collection, an XML collection, or an in-memory
    list of document dictionaries. All methods return an index
    reference object (opaque here; presumably a Terrier IndexRef —
    confirm against the PyTerrier implementation).
    """

    @staticmethod
    def from_dataset(dataset_name: str, variant: Optional[str] = None, **kwargs) -> Any:
        """Create an index from a registered dataset (e.g. 'vaswani')."""
        ...

    @staticmethod
    def from_trec(path: str, single_file: bool = False, **kwargs) -> Any:
        """Create an index from a TREC collection file or directory."""
        ...

    @staticmethod
    def from_xml(path: str, **kwargs) -> Any:
        """Create an index from an XML collection at *path*."""
        ...

    @staticmethod
    def memory(documents: List[Dict[str, Any]], **kwargs) -> Any:
        """Create an in-memory index from a list of document dicts."""
        ...

# Usage Examples:
# Create index from dataset
vaswani_index = pt.terrier.IndexFactory.from_dataset('vaswani')
# Create index from TREC collection
trec_index = pt.terrier.IndexFactory.from_trec('/path/to/trec/files')
# Create in-memory index for small collections
documents = [
{'docno': 'doc1', 'text': 'This is the first document'},
{'docno': 'doc2', 'text': 'This is the second document'}
]
memory_index = pt.terrier.IndexFactory.memory(documents)

Base indexer classes for creating indexes from different input sources and formats.
class TerrierIndexer(Indexer):
    """
    Generic Terrier indexer with configurable text processing pipeline.

    Parameters:
    - index_path: Path where index will be created
    - blocks: Whether to record block information for phrase queries
    - overwrite: Whether to overwrite existing index
    - verbose: Enable verbose output
    - meta: Dictionary mapping metadata field names to lengths
    - stemmer: Stemmer to use ('porter', 'weak_porter', etc.)
    - stopwords: Stopword list to use ('terrier', 'smart', etc.)
    - tokeniser: Tokeniser configuration
    """

    # NOTE: optional parameters use explicit Optional[...] (PEP 484
    # deprecates implicit Optional for `= None` defaults).
    def __init__(self, index_path: str, blocks: bool = False,
                 overwrite: bool = False, verbose: bool = False,
                 meta: Optional[Dict[str, int]] = None,
                 stemmer: Optional[str] = None,
                 stopwords: Optional[str] = None,
                 tokeniser: Optional[str] = None, **kwargs): ...

# Specialized indexers for processing file collections and directories.
class FilesIndexer(Indexer):
    """
    Index files from a directory or file list.

    Parameters:
    - index_path: Path where index will be created
    - blocks: Whether to record block information
    - verbose: Enable verbose output
    - meta: Metadata field configuration
    - type: File type ('txt', 'pdf', 'docx', etc.)
    """

    # NOTE(review): the `type` parameter shadows the builtin; kept as-is
    # because renaming would break existing keyword-argument callers.
    def __init__(self, index_path: str, blocks: bool = False,
                 verbose: bool = False, meta: Optional[Dict[str, int]] = None,
                 type: str = 'txt', **kwargs): ...

# Usage Example:
# Index text files from directory
files_indexer = pt.FilesIndexer('/path/to/index', verbose=True)
index_ref = files_indexer.index('/path/to/documents/')
# Index PDF files with metadata
pdf_indexer = pt.FilesIndexer(
'/path/to/pdf_index',
type='pdf',
meta={'title': 100, 'author': 50}
)
index_ref = pdf_indexer.index('/path/to/pdfs/')

Specialized indexer for TREC-formatted document collections with support for various TREC formats.
class TRECCollectionIndexer(Indexer):
    """
    Index TREC-formatted document collections.

    Parameters:
    - index_path: Path where index will be created
    - collection: List of TREC collection files or single file path
    - blocks: Whether to record block information
    - verbose: Enable verbose output
    - meta: Metadata field configuration
    """

    # Optional parameters use explicit Optional[...] per PEP 484.
    def __init__(self, index_path: str,
                 collection: Optional[Union[str, List[str]]] = None,
                 blocks: bool = False, verbose: bool = False,
                 meta: Optional[Dict[str, int]] = None, **kwargs): ...

# Usage Example:
# Index single TREC file
trec_indexer = pt.TRECCollectionIndexer(
'/path/to/trec_index',
collection='/path/to/collection.trec'
)
index_ref = trec_indexer.index()
# Index multiple TREC files
multi_trec_indexer = pt.TRECCollectionIndexer(
'/path/to/multi_index',
collection=['/path/to/file1.trec', '/path/to/file2.trec']
)
index_ref = multi_trec_indexer.index()

Indexer for creating indexes directly from pandas DataFrames, enabling in-memory document processing.
class DFIndexer(Indexer):
    """
    Index documents from pandas DataFrame.

    Parameters:
    - index_path: Path where index will be created
    - blocks: Whether to record block information
    - verbose: Enable verbose output
    - meta: Metadata field configuration
    - text_attr: Name of column containing document text (default: 'text')
    - docno_attr: Name of column containing document IDs (default: 'docno')
    """

    # Optional `meta` uses explicit Optional[...] per PEP 484.
    def __init__(self, index_path: str, blocks: bool = False,
                 verbose: bool = False, meta: Optional[Dict[str, int]] = None,
                 text_attr: str = 'text', docno_attr: str = 'docno', **kwargs): ...
class DFIndexUtils:
    """Utilities for DataFrame indexing operations."""

    @staticmethod
    def create_df(documents: List[Dict[str, Any]]) -> pd.DataFrame:
        """Build a DataFrame from a list of document dictionaries."""
        ...

    @staticmethod
    def validate_df(df: pd.DataFrame) -> bool:
        """Check that *df* is suitable for indexing (e.g. required columns)."""
        ...

# Usage Example:
# Create DataFrame with documents
documents_df = pd.DataFrame([
{'docno': 'doc1', 'text': 'First document content', 'title': 'Document 1'},
{'docno': 'doc2', 'text': 'Second document content', 'title': 'Document 2'}
])
# Index DataFrame
df_indexer = pt.DFIndexer(
'/path/to/df_index',
meta={'title': 100}, # Include title metadata with max length 100
verbose=True
)
index_ref = df_indexer.index(documents_df)

Indexer for processing document iterators, useful for streaming large collections without loading everything into memory.
class IterDictIndexer(Indexer):
    """
    Index documents from iterator of dictionaries.

    Useful for streaming large collections without loading everything
    into memory.

    Parameters:
    - index_path: Path where index will be created
    - blocks: Whether to record block information
    - verbose: Enable verbose output
    - meta: Metadata field configuration
    - text_attr: Name of field containing document text (default: 'text')
    - docno_attr: Name of field containing document IDs (default: 'docno')
    """

    # Optional `meta` uses explicit Optional[...] per PEP 484.
    def __init__(self, index_path: str, blocks: bool = False,
                 verbose: bool = False, meta: Optional[Dict[str, int]] = None,
                 text_attr: str = 'text', docno_attr: str = 'docno', **kwargs): ...

# Usage Example:
# Define document iterator
def document_iterator():
for i in range(1000):
yield {
'docno': f'doc_{i}',
'text': f'This is document number {i} with some content.',
'category': f'category_{i % 10}'
}
# Index iterator
iter_indexer = pt.IterDictIndexer(
'/path/to/iter_index',
meta={'category': 20},
verbose=True
)
index_ref = iter_indexer.index(document_iterator())

Enumeration and utilities for configuring indexing behavior and text processing pipelines.
class IndexingType:
    """Enumeration of indexing types and configurations.

    Plain string constants (not an enum.Enum) so values can be passed
    directly wherever a configuration string is expected.
    """

    # Values are the string identifiers accepted by the indexers.
    CLASSIC = 'classic'
    SINGLEPASS = 'singlepass'
    MEMORY = 'memory'
# High-level indexing function
def index(iter_dict_or_df, index_path: Optional[str] = None,
          indexer_class=None, **kwargs) -> Any:
    """
    High-level function for creating indexes from various input types.

    Parameters:
    - iter_dict_or_df: Input data (DataFrame, iterator, or file path)
    - index_path: Where to create the index
    - indexer_class: Specific indexer class to use
    - **kwargs: Additional indexer parameters

    Returns:
    - IndexRef object for the created index
    """

# Usage Example:
# High-level indexing function
documents = [
{'docno': 'doc1', 'text': 'Document 1 content'},
{'docno': 'doc2', 'text': 'Document 2 content'}
]
# Simple indexing
index_ref = pt.index(documents, '/path/to/simple_index')
# Indexing with custom parameters
index_ref = pt.index(
documents,
'/path/to/custom_index',
stemmer='porter',
stopwords='smart',
blocks=True
)

# Configure custom text processing
custom_indexer = pt.DFIndexer(
'/path/to/custom_index',
stemmer='weak_porter', # Use weak Porter stemmer
stopwords='smart', # Use SMART stopword list
blocks=True, # Enable block information for phrases
meta={'title': 100, 'url': 200} # Include metadata fields
)

# Index documents with multiple text fields
documents = pd.DataFrame([
{
'docno': 'doc1',
'text': 'Main document content here',
'title': 'Document Title',
'abstract': 'Document abstract or summary'
}
])
# Configure indexer to handle multiple fields
multi_field_indexer = pt.DFIndexer(
'/path/to/multi_field_index',
meta={'title': 100, 'abstract': 500},
verbose=True
)

# Create base index
base_indexer = pt.DFIndexer('/path/to/base_index')
base_index = base_indexer.index(initial_documents)
# Add more documents (typically requires rebuilding)
additional_indexer = pt.DFIndexer('/path/to/updated_index')
updated_index = additional_indexer.index(all_documents)  # Full rebuild

from typing import Dict, List, Any, Union, Iterator, Optional
# Imports required by the type aliases below. (The original typing
# import was fused into a trailing comment and therefore inert.)
from typing import Any, Dict, Iterator, List, Optional, Union

import pandas as pd

# Indexing-specific type aliases used throughout the indexing API.
IndexRef = Any  # Java IndexRef object
IndexPath = str  # File system path for index
DocumentIterator = Iterator[Dict[str, Any]]  # Document iterator type
MetadataConfig = Dict[str, int]  # Metadata field name to max length mapping
TextProcessingConfig = Dict[str, str]  # Text processing configuration

# Install with Tessl CLI
npx tessl i tessl/pypi-python-terrier