A comprehensive Python API for the Terrier information retrieval platform, enabling declarative experimentation with transformer pipelines for indexing, retrieval, and evaluation tasks.
—
PyTerrier's text processing components provide comprehensive text analysis and transformation capabilities, including stemming, tokenization, stopword removal, and text loading utilities integrated with the Terrier platform.
Text stemming functionality using various stemming algorithms supported by the Terrier platform.
class TerrierStemmer(Transformer):
"""
Stemming transformer using Terrier's stemming implementations.
Parameters:
- stemmer: Stemmer name to use (default: 'porter')
- text_attr: Attribute name containing text to stem (default: 'text')
"""
def __init__(self, stemmer: str = 'porter', text_attr: str = 'text'): ...

Supported Stemmers:
- porter: Porter stemmer (most common)
- weak_porter: Weak Porter stemmer
- snowball: Snowball stemmer
- lovins: Lovins stemmer
- paice: Paice/Husk stemmer

Usage Examples:
# Basic Porter stemming
porter_stemmer = pt.terrier.TerrierStemmer()
# Apply stemming to query text
stemmed_queries = porter_stemmer.transform(topics)
# Use different stemmer
snowball_stemmer = pt.terrier.TerrierStemmer(stemmer='snowball')
# Stem custom text attribute
custom_stemmer = pt.terrier.TerrierStemmer(
stemmer='porter',
text_attr='custom_text'
)
# Pipeline integration
pipeline = retriever >> pt.terrier.TerrierStemmer() >> reranker

Text tokenization functionality for splitting text into tokens using Terrier's tokenization implementations.
class TerrierTokeniser(Transformer):
"""
Tokenization transformer using Terrier's tokenizer implementations.
Parameters:
- tokeniser: Tokenizer configuration or name
- text_attr: Attribute name containing text to tokenize (default: 'text')
- **kwargs: Additional tokenizer configuration options
"""
def __init__(self, tokeniser: str = None, text_attr: str = 'text', **kwargs): ...

Tokenizer Options:
- UTFTokeniser: UTF-8 aware tokenization
- EnglishTokeniser: English-specific tokenization rules

Usage Examples:
# Basic tokenization
tokenizer = pt.terrier.TerrierTokeniser()
tokenized_text = tokenizer.transform(documents)
# UTF-8 tokenization for international text
utf_tokenizer = pt.terrier.TerrierTokeniser(tokeniser='UTFTokeniser')
# English-specific tokenization
english_tokenizer = pt.terrier.TerrierTokeniser(tokeniser='EnglishTokeniser')
# Custom tokenizer configuration
custom_tokenizer = pt.terrier.TerrierTokeniser(
tokeniser='EnglishTokeniser',
lowercase=True,
numbers=False
)

Stopword filtering using various predefined stopword lists or custom stopword sets.
class TerrierStopwords(Transformer):
"""
Stopword removal transformer using Terrier's stopword lists.
Parameters:
- stopwords: Stopword list name or custom list (default: 'terrier')
- text_attr: Attribute name containing text to filter (default: 'text')
"""
def __init__(self, stopwords: Union[str, List[str]] = 'terrier',
             text_attr: str = 'text'): ...

Predefined Stopword Lists:
- terrier: Default Terrier stopword list
- smart: SMART stopword list
- indri: Indri stopword list
- custom: Use custom stopword list

Usage Examples:
# Basic stopword removal
stopword_filter = pt.terrier.TerrierStopwords()
filtered_text = stopword_filter.transform(documents)
# Use SMART stopword list
smart_filter = pt.terrier.TerrierStopwords(stopwords='smart')
# Custom stopword list
custom_stopwords = ['the', 'and', 'or', 'but', 'custom_word']
custom_filter = pt.terrier.TerrierStopwords(stopwords=custom_stopwords)
# Filter custom text attribute
attr_filter = pt.terrier.TerrierStopwords(
stopwords='smart',
text_attr='title'
)

Text loading utilities for reading and processing text from various sources and formats.
class TerrierTextLoader(Transformer):
"""
Text loading transformer for extracting text from documents.
Parameters:
- text_loader: Text loader implementation to use
- **kwargs: Additional text loader configuration options
"""
def __init__(self, text_loader: str = None, **kwargs): ...
def terrier_text_loader(text_loader_spec: str = None, **kwargs) -> 'TerrierTextLoader':
"""
Factory function for creating text loaders.
Parameters:
- text_loader_spec: Text loader specification string
- **kwargs: Additional configuration options
Returns:
- Configured TerrierTextLoader instance
"""

Text Loader Types:
- txt: Plain text files
- pdf: PDF document extraction
- docx: Microsoft Word document extraction
- html: HTML content extraction
- xml: XML content extraction

Usage Examples:
# Basic text loading
text_loader = pt.terrier.TerrierTextLoader()
# PDF text extraction
pdf_loader = pt.terrier.terrier_text_loader('pdf')
pdf_text = pdf_loader.transform(pdf_documents)
# HTML content extraction
html_loader = pt.terrier.terrier_text_loader('html')
html_text = html_loader.transform(html_documents)
# Microsoft Word document extraction
docx_loader = pt.terrier.terrier_text_loader('docx')

Protocol interface for components that support text loading capabilities.
from typing import Protocol
class HasTextLoader(Protocol):
"""
Protocol for components that support text loading functionality.
"""
def get_text_loader(self) -> Any: ...

# Comprehensive text processing pipeline
text_pipeline = (
pt.terrier.TerrierTextLoader() >> # Load text content
pt.terrier.TerrierTokeniser() >> # Tokenize text
pt.terrier.TerrierStopwords(stopwords='smart') >> # Remove stopwords
pt.terrier.TerrierStemmer(stemmer='porter') # Apply stemming
)
processed_documents = text_pipeline.transform(raw_documents)

# Query preprocessing pipeline
query_processor = (
pt.terrier.TerrierTokeniser() >>
pt.terrier.TerrierStopwords() >>
pt.terrier.TerrierStemmer()
)
# Apply to queries before retrieval
processed_queries = query_processor.transform(topics)
retrieval_results = retriever.transform(processed_queries)

# Document preprocessing for indexing
doc_processor = (
pt.terrier.TerrierTextLoader() >>
pt.terrier.TerrierTokeniser(tokeniser='EnglishTokeniser') >>
pt.terrier.TerrierStopwords(stopwords='terrier')
# Note: Stemming typically done during indexing, not preprocessing
)
# Process documents before indexing
processed_docs = doc_processor.transform(document_collection)
indexer = pt.DFIndexer('/path/to/index', stemmer='porter')
index_ref = indexer.index(processed_docs)

# Process different text fields with different settings
title_processor = pt.terrier.TerrierStemmer(
stemmer='weak_porter',
text_attr='title'
)
content_processor = pt.terrier.TerrierStemmer(
stemmer='porter',
text_attr='content'
)
# Apply different processing to different fields
processed_titles = title_processor.transform(documents)
processed_content = content_processor.transform(documents)

# Configure for non-English text
international_tokenizer = pt.terrier.TerrierTokeniser(
tokeniser='UTFTokeniser'
)
# Custom stopwords for specific language
spanish_stopwords = ['el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'es', 'se']
spanish_filter = pt.terrier.TerrierStopwords(stopwords=spanish_stopwords)
# Language-specific pipeline
spanish_pipeline = (
international_tokenizer >>
spanish_filter >>
pt.terrier.TerrierStemmer(stemmer='snowball') # Snowball supports multiple languages
)

# Combine with custom transformers
import re
class CustomTextCleaner(pt.Transformer):
def transform(self, df):
# Custom cleaning logic
df = df.copy()
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
df['text'] = df['text'].str.lower()
return df
# Integrated pipeline
custom_pipeline = (
CustomTextCleaner() >>
pt.terrier.TerrierTokeniser() >>
pt.terrier.TerrierStopwords() >>
pt.terrier.TerrierStemmer()
)

# Optimize text processing for large collections
optimized_pipeline = (
pt.terrier.TerrierTokeniser() >>
pt.terrier.TerrierStopwords(stopwords='smart') >>
pt.terrier.TerrierStemmer(stemmer='porter')
).parallel(jobs=4) # Parallel processing
# Batch processing for memory efficiency
batch_size = 1000
for batch in pt.model.split_df(large_document_collection, batch_size=batch_size):
processed_batch = optimized_pipeline.transform(batch)
    # Process batch results

# Process queries at retrieval time
retrieval_pipeline = (
pt.terrier.TerrierStemmer() >> # Stem queries
pt.terrier.Retriever(index_ref, wmodel='BM25')
)
results = retrieval_pipeline.transform(topics)

# Process retrieved documents
document_pipeline = (
pt.terrier.Retriever(index_ref) >>
pt.text.get_text(dataset) >> # Get full document text
pt.terrier.TerrierStemmer() >> # Process retrieved text
some_reranker
)from typing import Union, List, Any, Protocol
import pandas as pd
# Text processing types
StemmerName = str # Stemmer algorithm name
TokeniserName = str # Tokenizer implementation name
StopwordList = Union[str, List[str]] # Stopword list specification
TextAttribute = str # Column/attribute name containing text
TextLoaderSpec = str # Text loader specification
ProcessingConfig = Dict[str, Any] # Text processing configuration
# Protocol types
class HasTextLoader(Protocol):
    def get_text_loader(self) -> Any: ...

Install with Tessl CLI
npx tessl i tessl/pypi-python-terrier