High-level Python library for Elasticsearch providing an idiomatic way to write and manipulate queries.
Text analysis capabilities for creating custom analyzers, configuring tokenizers, and assembling text processing chains. Supports multilingual and domain-specific search requirements with comprehensive character filtering, tokenization, and token filtering options.
Functions for creating custom analyzers with configurable components.
def analyzer(name, **kwargs):
"""
Create custom analyzer.
Args:
name (str): Analyzer name or type
**kwargs: Analyzer configuration
Returns:
Analyzer: Analyzer object
Parameters:
tokenizer (str or dict): Tokenizer configuration
char_filter (list): Character filters to apply
filter (list): Token filters to apply
position_increment_gap (int): Gap between array elements
Examples:
analyzer('custom_english',
tokenizer='standard',
filter=['lowercase', 'stop', 'stemmer'])
analyzer('my_analyzer',
tokenizer={'type': 'keyword', 'buffer_size': 256},
char_filter=['html_strip'],
filter=['lowercase', 'asciifolding'])
"""Functions for creating custom tokenizers.
def tokenizer(name, **kwargs):
"""
Create custom tokenizer.
Args:
name (str): Tokenizer name or type
**kwargs: Tokenizer configuration
Returns:
Tokenizer: Tokenizer object
Examples:
tokenizer('standard', max_token_length=255)
tokenizer('pattern', pattern=r'\W+', lowercase=True)
tokenizer('ngram', min_gram=3, max_gram=4)
"""Functions for creating character filters.
def char_filter(name, **kwargs):
"""
Create character filter.
Args:
name (str): Character filter name or type
**kwargs: Character filter configuration
Returns:
CharFilter: Character filter object
Examples:
char_filter('html_strip', escaped_tags=['b'])
char_filter('mapping', mappings=['& => and', '| => or'])
char_filter('pattern_replace', pattern='[0-9]', replacement='#')
"""Functions for creating token filters.
def token_filter(name, **kwargs):
"""
Create token filter.
Args:
name (str): Token filter name or type
**kwargs: Token filter configuration
Returns:
TokenFilter: Token filter object
Examples:
token_filter('stop', stopwords=['the', 'is', 'at'])
token_filter('synonym', synonyms=['laptop,notebook', 'car,automobile'])
token_filter('stemmer', language='english')
"""Functions for creating normalizers for keyword fields.
def normalizer(name, **kwargs):
"""
Create normalizer for keyword fields.
Args:
name (str): Normalizer name
**kwargs: Normalizer configuration
Returns:
Normalizer: Normalizer object
Parameters:
char_filter (list): Character filters to apply
filter (list): Token filters to apply
Examples:
normalizer('lowercase_normalizer', filter=['lowercase'])
normalizer('ascii_normalizer',
char_filter=['mapping'],
filter=['lowercase', 'asciifolding'])
"""Pre-configured analyzers for common use cases.

Pre-configured analyzers for common use cases.
class StandardAnalyzer:
"""
Standard analyzer with standard tokenizer and lowercase filter.
"""
def __init__(self, max_token_length=255, stopwords=None, **kwargs):
"""
Args:
max_token_length (int): Maximum token length
stopwords (list or str): Stop words configuration
**kwargs: Additional parameters
"""
class SimpleAnalyzer:
"""
Simple analyzer that splits on non-letter characters and lowercases.
"""
def __init__(self, **kwargs):
"""
Args:
**kwargs: Additional parameters
"""
class WhitespaceAnalyzer:
"""
Whitespace analyzer that splits on whitespace characters.
"""
def __init__(self, **kwargs):
"""
Args:
**kwargs: Additional parameters
"""
class StopAnalyzer:
"""
Stop analyzer with stop word filtering.
"""
def __init__(self, stopwords=None, **kwargs):
"""
Args:
stopwords (list or str): Stop words configuration
**kwargs: Additional parameters
"""
class KeywordAnalyzer:
"""
Keyword analyzer that treats input as single token.
"""
def __init__(self, **kwargs):
"""
Args:
**kwargs: Additional parameters
"""
class PatternAnalyzer:
"""
Pattern analyzer using regular expressions.
"""
def __init__(self, pattern=r'\W+', flags=None, lowercase=True, stopwords=None, **kwargs):
"""
Args:
pattern (str): Regular expression pattern
flags (str): Regular expression flags
lowercase (bool): Convert to lowercase
stopwords (list or str): Stop words configuration
**kwargs: Additional parameters
"""
class LanguageAnalyzer:
"""
Language-specific analyzer.
"""
def __init__(self, language, **kwargs):
"""
Args:
language (str): Language code ('english', 'spanish', 'french', etc.)
**kwargs: Language-specific parameters
Supported languages:
arabic, armenian, basque, bengali, brazilian, bulgarian, catalan,
chinese, cjk, czech, danish, dutch, english, estonian, finnish,
french, galician, german, greek, hindi, hungarian, indonesian,
irish, italian, latvian, lithuanian, norwegian, persian, portuguese,
romanian, russian, sorani, spanish, swedish, turkish, thai
"""
class FingerprintAnalyzer:
"""
Fingerprint analyzer for deduplication.
"""
def __init__(self, separator=' ', max_output_size=255, stopwords=None, **kwargs):
"""
Args:
separator (str): Token separator in output
max_output_size (int): Maximum output size
stopwords (list or str): Stop words configuration
**kwargs: Additional parameters
"""
class CustomAnalyzer:
"""
Custom analyzer builder.
"""
def __init__(self, tokenizer, char_filter=None, filter=None, **kwargs):
"""
Args:
tokenizer (str or dict): Tokenizer configuration
char_filter (list, optional): Character filters
filter (list, optional): Token filters
**kwargs: Additional parameters
"""Pre-configured tokenizers for various text processing needs.
class StandardTokenizer:
"""
Standard tokenizer based on Unicode Text Segmentation.
"""
def __init__(self, max_token_length=255, **kwargs):
"""
Args:
max_token_length (int): Maximum token length
**kwargs: Additional parameters
"""
class KeywordTokenizer:
"""
Keyword tokenizer that outputs entire input as single token.
"""
def __init__(self, buffer_size=256, **kwargs):
"""
Args:
buffer_size (int): Input buffer size
**kwargs: Additional parameters
"""
class WhitespaceTokenizer:
"""
Whitespace tokenizer that splits on whitespace.
"""
def __init__(self, max_token_length=255, **kwargs):
"""
Args:
max_token_length (int): Maximum token length
**kwargs: Additional parameters
"""
class PatternTokenizer:
"""
Pattern tokenizer using regular expressions.
"""
def __init__(self, pattern=r'\W+', flags=None, group=-1, **kwargs):
"""
Args:
pattern (str): Regular expression pattern
flags (str): Regular expression flags
group (int): Capture group to extract (-1 = split on pattern)
**kwargs: Additional parameters
"""
class NGramTokenizer:
"""
N-gram tokenizer for partial matching.
"""
def __init__(self, min_gram=1, max_gram=2, token_chars=None, **kwargs):
"""
Args:
min_gram (int): Minimum n-gram length
max_gram (int): Maximum n-gram length
token_chars (list): Character classes to include in tokens
**kwargs: Additional parameters
Token character classes: letter, digit, whitespace, punctuation, symbol
"""
class EdgeNGramTokenizer:
"""
Edge n-gram tokenizer for prefix matching.
"""
def __init__(self, min_gram=1, max_gram=2, token_chars=None, **kwargs):
"""
Args and parameters same as NGramTokenizer.
"""
class PathHierarchyTokenizer:
"""
Path hierarchy tokenizer for filesystem paths.
"""
def __init__(self, delimiter='/', replacement=None, buffer_size=1024,
reverse=False, skip=0, **kwargs):
"""
Args:
delimiter (str): Path delimiter
replacement (str, optional): Replacement for delimiter in output
buffer_size (int): Input buffer size
reverse (bool): Process path in reverse order
skip (int): Number of initial tokens to skip
**kwargs: Additional parameters
"""
class ClassicTokenizer:
"""
Classic tokenizer based on English grammar.
"""
def __init__(self, max_token_length=255, **kwargs):
"""
Args:
max_token_length (int): Maximum token length
**kwargs: Additional parameters
"""
class LetterTokenizer:
"""
Letter tokenizer that splits on non-letter characters.
"""
def __init__(self, **kwargs):
"""
Args:
**kwargs: Additional parameters
"""
class LowercaseTokenizer:
"""
Lowercase tokenizer that splits on non-letter and lowercases.
"""
def __init__(self, **kwargs):
"""
Args:
**kwargs: Additional parameters
"""Character filters for preprocessing text before tokenization.
class HtmlStripCharFilter:
"""
HTML strip character filter.
"""
def __init__(self, escaped_tags=None, **kwargs):
"""
Args:
escaped_tags (list, optional): HTML tags to escape instead of strip
**kwargs: Additional parameters
"""
class MappingCharFilter:
"""
Mapping character filter for character replacement.
"""
def __init__(self, mappings=None, mappings_path=None, **kwargs):
"""
Args:
mappings (list, optional): List of mappings ('from => to')
mappings_path (str, optional): Path to mappings file
**kwargs: Additional parameters
"""
class PatternReplaceCharFilter:
"""
Pattern replace character filter using regular expressions.
"""
def __init__(self, pattern, replacement='', flags=None, **kwargs):
"""
Args:
pattern (str): Regular expression pattern
replacement (str): Replacement string
flags (str): Regular expression flags
**kwargs: Additional parameters
"""Token filters for processing tokens after tokenization.
class LowercaseTokenFilter:
"""
Lowercase token filter.
"""
def __init__(self, language=None, **kwargs):
"""
Args:
language (str, optional): Language-specific lowercasing
**kwargs: Additional parameters
"""
class UppercaseTokenFilter:
"""
Uppercase token filter.
"""
def __init__(self, **kwargs):
"""
Args:
**kwargs: Additional parameters
"""
class StopTokenFilter:
"""
Stop word token filter.
"""
def __init__(self, stopwords=None, stopwords_path=None, ignore_case=False,
remove_trailing=True, **kwargs):
"""
Args:
stopwords (list or str, optional): Stop words or language name
stopwords_path (str, optional): Path to stop words file
ignore_case (bool): Case insensitive matching
remove_trailing (bool): Remove trailing stop words
**kwargs: Additional parameters
"""
class StemmerTokenFilter:
"""
Stemmer token filter.
"""
def __init__(self, language='english', **kwargs):
"""
Args:
language (str): Stemming language
**kwargs: Additional parameters
Supported languages: Same as LanguageAnalyzer
"""
class SnowballTokenFilter:
"""
Snowball stemmer token filter.
"""
def __init__(self, language='english', **kwargs):
"""
Args:
language (str): Snowball stemming language
**kwargs: Additional parameters
"""
class SynonymTokenFilter:
"""
Synonym token filter.
"""
def __init__(self, synonyms=None, synonyms_path=None, expand=True,
lenient=False, **kwargs):
"""
Args:
synonyms (list, optional): List of synonym rules
synonyms_path (str, optional): Path to synonyms file
expand (bool): Expand synonyms
lenient (bool): Ignore malformed synonym rules
**kwargs: Additional parameters
Synonym formats:
- 'laptop,notebook,computer' (equivalent synonyms)
- 'laptop,notebook => computer' (explicit mapping)
"""
class NGramTokenFilter:
"""
N-gram token filter.
"""
def __init__(self, min_gram=1, max_gram=2, preserve_original=False, **kwargs):
"""
Args:
min_gram (int): Minimum n-gram length
max_gram (int): Maximum n-gram length
preserve_original (bool): Keep original tokens
**kwargs: Additional parameters
"""
class EdgeNGramTokenFilter:
"""
Edge n-gram token filter.
"""
def __init__(self, min_gram=1, max_gram=2, side='front', preserve_original=False, **kwargs):
"""
Args:
min_gram (int): Minimum n-gram length
max_gram (int): Maximum n-gram length
side (str): Side to generate n-grams from ('front' or 'back')
preserve_original (bool): Keep original tokens
**kwargs: Additional parameters
"""
class ShingleTokenFilter:
"""
Shingle token filter for word n-grams.
"""
def __init__(self, min_shingle_size=2, max_shingle_size=2, output_unigrams=True,
output_unigrams_if_no_shingles=False, token_separator=' ',
filler_token='_', **kwargs):
"""
Args:
min_shingle_size (int): Minimum shingle size
max_shingle_size (int): Maximum shingle size
output_unigrams (bool): Output single tokens
output_unigrams_if_no_shingles (bool): Output unigrams when no shingles
token_separator (str): Token separator in shingles
filler_token (str): Filler for missing positions
**kwargs: Additional parameters
"""
class AsciiFoldingTokenFilter:
"""
ASCII folding token filter for removing accents.
"""
def __init__(self, preserve_original=False, **kwargs):
"""
Args:
preserve_original (bool): Keep original tokens
**kwargs: Additional parameters
"""
class LengthTokenFilter:
"""
Length token filter for filtering by token length.
"""
def __init__(self, min_length=0, max_length=None, **kwargs):
"""
Args:
min_length (int): Minimum token length
max_length (int, optional): Maximum token length
**kwargs: Additional parameters
"""
class TruncateTokenFilter:
"""
Truncate token filter for limiting token length.
"""
def __init__(self, length=10, **kwargs):
"""
Args:
length (int): Maximum token length
**kwargs: Additional parameters
"""
class ReverseTokenFilter:
"""
Reverse token filter for reversing token characters.
"""
def __init__(self, **kwargs):
"""
Args:
**kwargs: Additional parameters
"""
class ElisionTokenFilter:
"""
Elision token filter for removing elisions.
"""
def __init__(self, articles=None, articles_path=None, articles_case=False, **kwargs):
"""
Args:
articles (list, optional): List of elision articles
articles_path (str, optional): Path to articles file
articles_case (bool): If True, match articles case-insensitively
**kwargs: Additional parameters
"""
class PhoneticTokenFilter:
"""
Phonetic token filter for phonetic matching.
"""
def __init__(self, encoder='metaphone', replace=True, **kwargs):
"""
Args:
encoder (str): Phonetic encoder algorithm
replace (bool): Replace original token
**kwargs: Additional parameters
Encoders: metaphone, double_metaphone, soundex, refined_soundex,
caverphone1, caverphone2, cologne, nysiis, koelnerphonetik,
haasephonetik, beider_morse, daitch_mokotoff
Note: requires the analysis-phonetic plugin to be installed on the cluster.
"""

from elasticsearch_dsl import Document, Text, analyzer, tokenizer, char_filter, token_filter
# Define custom analyzer
my_analyzer = analyzer(
'my_custom_analyzer',
tokenizer=tokenizer('standard', max_token_length=200),
char_filter=[
char_filter('html_strip'),
char_filter('mapping', mappings=['& => and', '@ => at'])
],
filter=[
token_filter('lowercase'),
token_filter('stop', stopwords=['the', 'is', 'at', 'which', 'on']),
token_filter('stemmer', language='english'),
token_filter('synonym', synonyms=[
'laptop,notebook,computer',
'car,automobile,vehicle'
])
]
)
# Use in document definition
class Article(Document):
title = Text(analyzer=my_analyzer)
content = Text(
analyzer=my_analyzer,
fields={
'raw': Text(analyzer='keyword'),
'stemmed': Text(analyzer='english')  # 'english' is a built-in analyzer; 'stemmer' is only a token filter
}
)
class Index:
name = 'articles'
# elasticsearch-dsl collects the analysis definitions from the analyzer
# objects attached to the fields, so they do not need to be repeated in
# the index settings; Article.init() includes them when creating the index.
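With the analyzer objects attached to the fields, one call creates the index with both the mappings and the analysis chain. A minimal sketch, assuming a reachable cluster and a default connection:

from elasticsearch_dsl import connections

connections.create_connection(hosts=['localhost:9200'])
Article.init()  # creates 'articles' with my_custom_analyzer registered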

# Multi-language document with different analyzers
class MultilingualDocument(Document):
# English content
title_en = Text(analyzer='english')
content_en = Text(analyzer='english')
# Spanish content
title_es = Text(analyzer='spanish')
content_es = Text(analyzer='spanish')
# French content
title_fr = Text(analyzer='french')
content_fr = Text(analyzer='french')
# Auto-detect language field
content_auto = Text(
fields={
'english': Text(analyzer='english'),
'spanish': Text(analyzer='spanish'),
'french': Text(analyzer='french')
}
)
class Index:
name = 'multilingual_docs'
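A hedged sketch of querying the per-language subfields, so each language's analyzer is applied to its own copy of the text:

from elasticsearch_dsl import Search

s = Search(index='multilingual_docs').query(
    'multi_match',
    query='bibliothèque nationale',
    fields=['content_auto.english', 'content_auto.spanish', 'content_auto.french']
)
response = s.execute()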

# Analyzer for search-as-you-type functionality
search_analyzer = analyzer(
'search_as_you_type_analyzer',
tokenizer='standard',
filter=[
'lowercase',
token_filter('edge_ngram', min_gram=1, max_gram=20)
]
)
autocomplete_analyzer = analyzer(
'autocomplete_analyzer',
tokenizer='standard',
filter=[
'lowercase',
token_filter('shingle', min_shingle_size=2, max_shingle_size=3),
token_filter('edge_ngram', min_gram=1, max_gram=20)
]
)
class SearchDocument(Document):
# For prefix matching
title = Text(
analyzer=search_analyzer,
search_analyzer='standard',
fields={
'autocomplete': Text(
analyzer=autocomplete_analyzer,
search_analyzer='standard'
)
}
)
class Index:
name = 'search_docs'
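Because the fields index edge n-grams but search with the plain standard analyzer, a partial word from the user matches the stored prefixes directly. A hedged sketch:

from elasticsearch_dsl import Search

# 'ela' matches documents whose title contains e.g. "elasticsearch"
s = Search(index='search_docs').query('match', title='ela')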

# Analyzer for code/technical content
code_analyzer = analyzer(
'code_analyzer',
tokenizer=tokenizer('pattern', pattern=r'[^\w\.]+'),
char_filter=[
char_filter('pattern_replace', pattern=r'//.*', replacement=''),  # Remove line comments
char_filter('pattern_replace', pattern=r'/\*.*?\*/', replacement='', flags='DOTALL')  # Remove block comments (DOTALL spans newlines)
],
filter=[
'lowercase',
token_filter('stop', stopwords=['the', 'a', 'an', 'and', 'or', 'but']),
token_filter('ngram', min_gram=3, max_gram=8)  # For partial matching; requires index.max_ngram_diff >= 5 on ES 7+
]
)
# Analyzer for email addresses
email_analyzer = analyzer(
'email_analyzer',
tokenizer=tokenizer('uax_url_email'),
filter=[
'lowercase',
token_filter('pattern_replace', pattern=r'@.*', replacement='') # Remove domain
]
)
class TechnicalDocument(Document):
code_snippet = Text(analyzer=code_analyzer)
author_email = Text(analyzer=email_analyzer)
class Index:
name = 'technical_docs'

# Analyzer for name matching with phonetic encoding
name_analyzer = analyzer(
'name_analyzer',
tokenizer='standard',
filter=[
'lowercase',
token_filter('phonetic', encoder='double_metaphone', replace=False),
token_filter('unique') # Remove duplicates
]
)
# Analyzer with ASCII folding for international names
international_name_analyzer = analyzer(
'international_name_analyzer',
tokenizer='standard',
filter=[
'lowercase',
token_filter('asciifolding'), # Remove accents
token_filter('phonetic', encoder='metaphone'),
token_filter('ngram', min_gram=2, max_gram=4)  # For partial matching; requires index.max_ngram_diff >= 2 on ES 7+
]
)
class PersonDocument(Document):
name = Text(
analyzer=name_analyzer,
fields={
'international': Text(analyzer=international_name_analyzer),
'exact': Text(analyzer='keyword')
}
)
class Index:
name = 'people'
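A hedged sketch of a name lookup that combines the phonetic, ASCII-folded, and exact subfields:

from elasticsearch_dsl import Search

s = Search(index='people').query(
    'multi_match',
    query='Stefan Muller',  # also matches "Müller" via the phonetic/folded subfields
    fields=['name', 'name.international', 'name.exact']
)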

from elasticsearch_dsl import connections

# Test analyzer output
def test_analyzer(analyzer_name, text):
"""Test analyzer output on sample text."""
client = connections.get_connection()
response = client.indices.analyze(
body={
'analyzer': analyzer_name,
'text': text
}
)
tokens = [token['token'] for token in response['tokens']]
return tokens
# Test custom analyzer
test_text = "The quick brown fox jumps over the lazy dog's back!"
tokens = test_analyzer('my_custom_analyzer', test_text)
print(f"Tokens: {tokens}")
# Test different analyzers
analyzers = ['standard', 'english', 'keyword', 'simple']
for analyzer_name in analyzers:
tokens = test_analyzer(analyzer_name, test_text)
print(f"{analyzer_name}: {tokens}")Install with Tessl CLI

Install with Tessl CLI
npx tessl i tessl/pypi-elasticsearch-dsl