# Analysis

Text analysis and processing capabilities for creating custom analyzers, configuring tokenizers, and building text-processing pipelines. Supports multilingual and domain-specific search requirements with character filtering, tokenization, and token filtering options.

## Capabilities

### Analyzer Creation

Functions for creating custom analyzers with configurable components.

```python { .api }
def analyzer(name, **kwargs):
    """
    Create custom analyzer.

    Args:
        name (str): Analyzer name or type
        **kwargs: Analyzer configuration

    Returns:
        Analyzer: Analyzer object

    Parameters:
        tokenizer (str or dict): Tokenizer configuration
        char_filter (list): Character filters to apply
        filter (list): Token filters to apply
        position_increment_gap (int): Gap between array elements

    Examples:
        analyzer('custom_english',
                 tokenizer='standard',
                 filter=['lowercase', 'stop', 'stemmer'])

        analyzer('my_analyzer',
                 tokenizer={'keyword': {'buffer_size': 256}},
                 char_filter=['html_strip'],
                 filter=['lowercase', 'asciifolding'])
    """
```
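
As a quick illustration (the analyzer, field, and index names here are hypothetical), an analyzer built with `analyzer()` is typically attached to a `Text` field and registered when the index is created:

```python
from elasticsearch_dsl import Document, Text, analyzer

# Hypothetical analyzer: strip HTML, lowercase, drop stop words, then stem.
html_text = analyzer(
    'html_text',
    tokenizer='standard',
    char_filter=['html_strip'],
    filter=['lowercase', 'stop', 'stemmer']
)

class BlogPost(Document):
    body = Text(analyzer=html_text)  # applied at index time

    class Index:
        name = 'blog_posts'
```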

### Tokenizer Creation

Functions for creating custom tokenizers.

```python { .api }
def tokenizer(name, **kwargs):
    """
    Create custom tokenizer.

    Args:
        name (str): Tokenizer name or type
        **kwargs: Tokenizer configuration

    Returns:
        Tokenizer: Tokenizer object

    Examples:
        tokenizer('standard', max_token_length=255)
        tokenizer('pattern', pattern=r'\W+', lowercase=True)
        tokenizer('ngram', min_gram=3, max_gram=4)
    """
```
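
A brief sketch, following the `tokenizer()` signature above, of wiring a configured tokenizer into an analyzer; the analyzer and field names are illustrative:

```python
from elasticsearch_dsl import Text, analyzer, tokenizer

# N-gram tokenizer for partial matching, as in the docstring examples above.
ngram_tok = tokenizer('ngram', min_gram=3, max_gram=4)

partial_match = analyzer(
    'partial_match',
    tokenizer=ngram_tok,
    filter=['lowercase']
)

# search_analyzer keeps query text from being n-grammed as well.
sku = Text(analyzer=partial_match, search_analyzer='standard')
```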

### Character Filter Creation

Functions for creating character filters.

```python { .api }
def char_filter(name, **kwargs):
    """
    Create character filter.

    Args:
        name (str): Character filter name or type
        **kwargs: Character filter configuration

    Returns:
        CharFilter: Character filter object

    Examples:
        char_filter('html_strip', escaped_tags=['b'])
        char_filter('mapping', mappings=['& => and', '| => or'])
        char_filter('pattern_replace', pattern='[0-9]', replacement='#')
    """
```
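
A minimal sketch of character filters running in front of a tokenizer, using the `char_filter()` calls shown above; the mapping rules and analyzer name are illustrative:

```python
from elasticsearch_dsl import analyzer, char_filter

# Replace a few symbols with words before tokenization.
symbol_map = char_filter('mapping', mappings=['& => and', '% => percent'])

product_text = analyzer(
    'product_text',
    char_filter=[symbol_map, 'html_strip'],  # objects and built-in names can be mixed
    tokenizer='standard',
    filter=['lowercase']
)
```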

### Token Filter Creation

Functions for creating token filters.

```python { .api }
def token_filter(name, **kwargs):
    """
    Create token filter.

    Args:
        name (str): Token filter name or type
        **kwargs: Token filter configuration

    Returns:
        TokenFilter: Token filter object

    Examples:
        token_filter('stop', stopwords=['the', 'is', 'at'])
        token_filter('synonym', synonyms=['laptop,notebook', 'car,automobile'])
        token_filter('stemmer', language='english')
    """
```
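
A sketch of chaining several token filters inside one analyzer, reusing the `token_filter()` calls from the docstring above; the analyzer name is illustrative:

```python
from elasticsearch_dsl import analyzer, token_filter

english_search = analyzer(
    'english_search',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter('stop', stopwords=['the', 'is', 'at']),
        token_filter('synonym', synonyms=['laptop,notebook', 'car,automobile']),
        token_filter('stemmer', language='english')
    ]
)
```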

### Normalizer Creation

Functions for creating normalizers for keyword fields.

```python { .api }
def normalizer(name, **kwargs):
    """
    Create normalizer for keyword fields.

    Args:
        name (str): Normalizer name
        **kwargs: Normalizer configuration

    Returns:
        Normalizer: Normalizer object

    Parameters:
        char_filter (list): Character filters to apply
        filter (list): Token filters to apply

    Examples:
        normalizer('lowercase_normalizer', filter=['lowercase'])
        normalizer('ascii_normalizer',
                   char_filter=['mapping'],
                   filter=['lowercase', 'asciifolding'])
    """
```
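
Since `Keyword` fields are not analyzed, a normalizer is the usual way to get case- and accent-insensitive exact matching; a small sketch with an illustrative document class:

```python
from elasticsearch_dsl import Document, Keyword, normalizer

lowercase_ascii = normalizer(
    'lowercase_ascii',
    filter=['lowercase', 'asciifolding']
)

class Product(Document):
    # Exact-match field that ignores case and accents.
    brand = Keyword(normalizer=lowercase_ascii)

    class Index:
        name = 'products'
```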

### Built-in Analyzers

Pre-configured analyzers for common use cases.

```python { .api }
class StandardAnalyzer:
    """
    Standard analyzer with standard tokenizer and lowercase filter.
    """
    def __init__(self, max_token_length=255, stopwords=None, **kwargs):
        """
        Args:
            max_token_length (int): Maximum token length
            stopwords (list or str): Stop words configuration
            **kwargs: Additional parameters
        """

class SimpleAnalyzer:
    """
    Simple analyzer that splits on non-letter characters and lowercases.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """

class WhitespaceAnalyzer:
    """
    Whitespace analyzer that splits on whitespace characters.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """

class StopAnalyzer:
    """
    Stop analyzer with stop word filtering.
    """
    def __init__(self, stopwords=None, **kwargs):
        """
        Args:
            stopwords (list or str): Stop words configuration
            **kwargs: Additional parameters
        """

class KeywordAnalyzer:
    """
    Keyword analyzer that treats input as single token.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """

class PatternAnalyzer:
    """
    Pattern analyzer using regular expressions.
    """
    def __init__(self, pattern=r'\W+', flags=None, lowercase=True, stopwords=None, **kwargs):
        """
        Args:
            pattern (str): Regular expression pattern
            flags (str): Regular expression flags
            lowercase (bool): Convert to lowercase
            stopwords (list or str): Stop words configuration
            **kwargs: Additional parameters
        """

class LanguageAnalyzer:
    """
    Language-specific analyzer.
    """
    def __init__(self, language, **kwargs):
        """
        Args:
            language (str): Language code ('english', 'spanish', 'french', etc.)
            **kwargs: Language-specific parameters

        Supported languages:
            arabic, armenian, basque, bengali, brazilian, bulgarian, catalan,
            chinese, cjk, czech, danish, dutch, english, estonian, finnish,
            french, galician, german, greek, hindi, hungarian, indonesian,
            irish, italian, latvian, lithuanian, norwegian, persian, portuguese,
            romanian, russian, sorani, spanish, swedish, turkish, thai
        """

class FingerprintAnalyzer:
    """
    Fingerprint analyzer for deduplication.
    """
    def __init__(self, separator=' ', max_output_size=255, stopwords=None, **kwargs):
        """
        Args:
            separator (str): Token separator in output
            max_output_size (int): Maximum output size
            stopwords (list or str): Stop words configuration
            **kwargs: Additional parameters
        """

class CustomAnalyzer:
    """
    Custom analyzer builder.
    """
    def __init__(self, tokenizer, char_filter=None, filter=None, **kwargs):
        """
        Args:
            tokenizer (str or dict): Tokenizer configuration
            char_filter (list, optional): Character filters
            filter (list, optional): Token filters
            **kwargs: Additional parameters
        """
```
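
In field mappings, built-in analyzers are usually referenced by name rather than instantiated; a brief sketch with illustrative field and index names:

```python
from elasticsearch_dsl import Document, Text

class Review(Document):
    title = Text(analyzer='standard')
    body_en = Text(analyzer='english')   # language analyzer: stop words + stemming
    tag_line = Text(analyzer='keyword')  # whole value kept as a single token

    class Index:
        name = 'reviews'
```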

### Built-in Tokenizers

Pre-configured tokenizers for various text processing needs.

```python { .api }
class StandardTokenizer:
    """
    Standard tokenizer based on Unicode Text Segmentation.
    """
    def __init__(self, max_token_length=255, **kwargs):
        """
        Args:
            max_token_length (int): Maximum token length
            **kwargs: Additional parameters
        """

class KeywordTokenizer:
    """
    Keyword tokenizer that outputs entire input as single token.
    """
    def __init__(self, buffer_size=256, **kwargs):
        """
        Args:
            buffer_size (int): Input buffer size
            **kwargs: Additional parameters
        """

class WhitespaceTokenizer:
    """
    Whitespace tokenizer that splits on whitespace.
    """
    def __init__(self, max_token_length=255, **kwargs):
        """
        Args:
            max_token_length (int): Maximum token length
            **kwargs: Additional parameters
        """

class PatternTokenizer:
    """
    Pattern tokenizer using regular expressions.
    """
    def __init__(self, pattern=r'\W+', flags=None, group=-1, **kwargs):
        """
        Args:
            pattern (str): Regular expression pattern
            flags (str): Regular expression flags
            group (int): Capture group to extract (-1 = split on pattern)
            **kwargs: Additional parameters
        """

class NGramTokenizer:
    """
    N-gram tokenizer for partial matching.
    """
    def __init__(self, min_gram=1, max_gram=2, token_chars=None, **kwargs):
        """
        Args:
            min_gram (int): Minimum n-gram length
            max_gram (int): Maximum n-gram length
            token_chars (list): Character classes to include in tokens
            **kwargs: Additional parameters

        Token character classes: letter, digit, whitespace, punctuation, symbol
        """

class EdgeNGramTokenizer:
    """
    Edge n-gram tokenizer for prefix matching.
    """
    def __init__(self, min_gram=1, max_gram=2, token_chars=None, **kwargs):
        """
        Args and parameters same as NGramTokenizer.
        """

class PathHierarchyTokenizer:
    """
    Path hierarchy tokenizer for filesystem paths.
    """
    def __init__(self, delimiter='/', replacement=None, buffer_size=1024,
                 reverse=False, skip=0, **kwargs):
        """
        Args:
            delimiter (str): Path delimiter
            replacement (str, optional): Replacement for delimiter in output
            buffer_size (int): Input buffer size
            reverse (bool): Process path in reverse order
            skip (int): Number of initial tokens to skip
            **kwargs: Additional parameters
        """

class ClassicTokenizer:
    """
    Classic tokenizer based on English grammar.
    """
    def __init__(self, max_token_length=255, **kwargs):
        """
        Args:
            max_token_length (int): Maximum token length
            **kwargs: Additional parameters
        """

class LetterTokenizer:
    """
    Letter tokenizer that splits on non-letter characters.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """

class LowercaseTokenizer:
    """
    Lowercase tokenizer that splits on non-letter characters and lowercases.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """
```
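
For example, a path hierarchy tokenizer lets a document match queries on any ancestor directory of its path; the analyzer, field, and index names below are illustrative:

```python
from elasticsearch_dsl import Document, Text, analyzer, tokenizer

# '/var/log/nginx' is indexed as '/var', '/var/log', '/var/log/nginx'.
path_analyzer = analyzer(
    'path_analyzer',
    tokenizer=tokenizer('path_hierarchy', delimiter='/')
)

class LogFile(Document):
    path = Text(analyzer=path_analyzer)

    class Index:
        name = 'log_files'
```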

### Character Filters

Character filters for preprocessing text before tokenization.

```python { .api }
class HtmlStripCharFilter:
    """
    HTML strip character filter.
    """
    def __init__(self, escaped_tags=None, **kwargs):
        """
        Args:
            escaped_tags (list, optional): HTML tags to escape instead of strip
            **kwargs: Additional parameters
        """

class MappingCharFilter:
    """
    Mapping character filter for character replacement.
    """
    def __init__(self, mappings=None, mappings_path=None, **kwargs):
        """
        Args:
            mappings (list, optional): List of mappings ('from => to')
            mappings_path (str, optional): Path to mappings file
            **kwargs: Additional parameters
        """

class PatternReplaceCharFilter:
    """
    Pattern replace character filter using regular expressions.
    """
    def __init__(self, pattern, replacement='', flags=None, **kwargs):
        """
        Args:
            pattern (str): Regular expression pattern
            replacement (str): Replacement string
            flags (str): Regular expression flags
            **kwargs: Additional parameters
        """
```
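
A short sketch of HTML stripping in front of a standard analyzer, keeping `<b>` tags via `escaped_tags` as in the `char_filter()` docstring; the analyzer and field names are illustrative:

```python
from elasticsearch_dsl import Text, analyzer, char_filter

keep_bold = char_filter('html_strip', escaped_tags=['b'])

rich_text = analyzer(
    'rich_text',
    char_filter=[keep_bold],
    tokenizer='standard',
    filter=['lowercase']
)

comment_body = Text(analyzer=rich_text)
```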

### Token Filters

Token filters for processing tokens after tokenization.

```python { .api }
class LowercaseTokenFilter:
    """
    Lowercase token filter.
    """
    def __init__(self, language=None, **kwargs):
        """
        Args:
            language (str, optional): Language-specific lowercasing
            **kwargs: Additional parameters
        """

class UppercaseTokenFilter:
    """
    Uppercase token filter.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """

class StopTokenFilter:
    """
    Stop word token filter.
    """
    def __init__(self, stopwords=None, stopwords_path=None, ignore_case=False,
                 remove_trailing=True, **kwargs):
        """
        Args:
            stopwords (list or str, optional): Stop words or language name
            stopwords_path (str, optional): Path to stop words file
            ignore_case (bool): Case insensitive matching
            remove_trailing (bool): Remove trailing stop words
            **kwargs: Additional parameters
        """

class StemmerTokenFilter:
    """
    Stemmer token filter.
    """
    def __init__(self, language='english', **kwargs):
        """
        Args:
            language (str): Stemming language
            **kwargs: Additional parameters

        Supported languages: Same as LanguageAnalyzer
        """

class SnowballTokenFilter:
    """
    Snowball stemmer token filter.
    """
    def __init__(self, language='english', **kwargs):
        """
        Args:
            language (str): Snowball stemming language
            **kwargs: Additional parameters
        """

class SynonymTokenFilter:
    """
    Synonym token filter.
    """
    def __init__(self, synonyms=None, synonyms_path=None, expand=True,
                 lenient=False, **kwargs):
        """
        Args:
            synonyms (list, optional): List of synonym rules
            synonyms_path (str, optional): Path to synonyms file
            expand (bool): Expand synonyms
            lenient (bool): Ignore malformed synonym rules
            **kwargs: Additional parameters

        Synonym formats:
            - 'laptop,notebook,computer' (equivalent synonyms)
            - 'laptop,notebook => computer' (explicit mapping)
        """

class NGramTokenFilter:
    """
    N-gram token filter.
    """
    def __init__(self, min_gram=1, max_gram=2, preserve_original=False, **kwargs):
        """
        Args:
            min_gram (int): Minimum n-gram length
            max_gram (int): Maximum n-gram length
            preserve_original (bool): Keep original tokens
            **kwargs: Additional parameters
        """

class EdgeNGramTokenFilter:
    """
    Edge n-gram token filter.
    """
    def __init__(self, min_gram=1, max_gram=2, side='front', preserve_original=False, **kwargs):
        """
        Args:
            min_gram (int): Minimum n-gram length
            max_gram (int): Maximum n-gram length
            side (str): Side to generate n-grams from ('front' or 'back')
            preserve_original (bool): Keep original tokens
            **kwargs: Additional parameters
        """

class ShingleTokenFilter:
    """
    Shingle token filter for word n-grams.
    """
    def __init__(self, min_shingle_size=2, max_shingle_size=2, output_unigrams=True,
                 output_unigrams_if_no_shingles=False, token_separator=' ',
                 filler_token='_', **kwargs):
        """
        Args:
            min_shingle_size (int): Minimum shingle size
            max_shingle_size (int): Maximum shingle size
            output_unigrams (bool): Output single tokens
            output_unigrams_if_no_shingles (bool): Output unigrams when no shingles
            token_separator (str): Token separator in shingles
            filler_token (str): Filler for missing positions
            **kwargs: Additional parameters
        """

class AsciiFoldingTokenFilter:
    """
    ASCII folding token filter for removing accents.
    """
    def __init__(self, preserve_original=False, **kwargs):
        """
        Args:
            preserve_original (bool): Keep original tokens
            **kwargs: Additional parameters
        """

class LengthTokenFilter:
    """
    Length token filter for filtering by token length.
    """
    def __init__(self, min_length=0, max_length=None, **kwargs):
        """
        Args:
            min_length (int): Minimum token length
            max_length (int, optional): Maximum token length
            **kwargs: Additional parameters
        """

class TruncateTokenFilter:
    """
    Truncate token filter for limiting token length.
    """
    def __init__(self, length=10, **kwargs):
        """
        Args:
            length (int): Maximum token length
            **kwargs: Additional parameters
        """

class ReverseTokenFilter:
    """
    Reverse token filter for reversing token characters.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """

class ElisionTokenFilter:
    """
    Elision token filter for removing elisions.
    """
    def __init__(self, articles=None, articles_path=None, articles_case=False, **kwargs):
        """
        Args:
            articles (list, optional): List of elision articles
            articles_path (str, optional): Path to articles file
            articles_case (bool): If True, article matching is case-insensitive
            **kwargs: Additional parameters
        """

class PhoneticTokenFilter:
    """
    Phonetic token filter for phonetic matching.
    """
    def __init__(self, encoder='metaphone', replace=True, **kwargs):
        """
        Args:
            encoder (str): Phonetic encoder algorithm
            replace (bool): Replace original token
            **kwargs: Additional parameters

        Encoders: metaphone, double_metaphone, soundex, refined_soundex,
                  caverphone1, caverphone2, cologne, nysiis, koelnerphonetik,
                  haasephonetik, beider_morse, daitch_mokotoff
        """
```
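
As a small illustration of combining these filters (the analyzer name and article list are illustrative), a light French analyzer might drop elisions and fold accents while keeping the original tokens:

```python
from elasticsearch_dsl import analyzer, token_filter

french_light = analyzer(
    'french_light',
    tokenizer='standard',
    filter=[
        token_filter('elision', articles=['l', 'd', 'j', 'qu']),
        'lowercase',
        token_filter('asciifolding', preserve_original=True)
    ]
)
```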

## Usage Examples

### Custom Analyzer Configuration

```python
from elasticsearch_dsl import Document, Text, analyzer, tokenizer, char_filter, token_filter

# Define custom analyzer
my_analyzer = analyzer(
    'my_custom_analyzer',
    tokenizer=tokenizer('standard', max_token_length=200),
    char_filter=[
        char_filter('html_strip'),
        char_filter('mapping', mappings=['& => and', '@ => at'])
    ],
    filter=[
        token_filter('lowercase'),
        token_filter('stop', stopwords=['the', 'is', 'at', 'which', 'on']),
        token_filter('stemmer', language='english'),
        token_filter('synonym', synonyms=[
            'laptop,notebook,computer',
            'car,automobile,vehicle'
        ])
    ]
)

# Use in document definition
class Article(Document):
    title = Text(analyzer=my_analyzer)
    content = Text(
        analyzer=my_analyzer,
        fields={
            'raw': Text(analyzer='keyword'),
            'stemmed': Text(analyzer='english')  # the 'english' analyzer applies stemming
        }
    )

    class Index:
        name = 'articles'
        # Analyzers attached to the fields above are added to the index's
        # analysis settings automatically when the index is created.
```
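
Creating the index applies the mappings and analysis settings above; a minimal sketch assuming a default connection (the host shown is illustrative):

```python
from elasticsearch_dsl import connections

connections.create_connection(hosts=['http://localhost:9200'])

# Creates the 'articles' index with the fields and analyzers defined above.
Article.init()
```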

### Language-Specific Analysis

```python
# Multi-language document with different analyzers
class MultilingualDocument(Document):
    # English content
    title_en = Text(analyzer='english')
    content_en = Text(analyzer='english')

    # Spanish content
    title_es = Text(analyzer='spanish')
    content_es = Text(analyzer='spanish')

    # French content
    title_fr = Text(analyzer='french')
    content_fr = Text(analyzer='french')

    # Same text indexed with several language analyzers (no language detection)
    content_auto = Text(
        fields={
            'english': Text(analyzer='english'),
            'spanish': Text(analyzer='spanish'),
            'french': Text(analyzer='french')
        }
    )

    class Index:
        name = 'multilingual_docs'
```
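
At query time, the same text can be matched against every language subfield and scored with the best-fitting analyzer; the query string below is illustrative:

```python
from elasticsearch_dsl import Search, Q

s = Search(index='multilingual_docs').query(
    Q('multi_match',
      query='ordenadores portátiles',
      fields=['content_auto.english', 'content_auto.spanish', 'content_auto.french'],
      type='most_fields')
)
response = s.execute()
```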

### Search-as-You-Type Analysis

```python
# Index-time analyzer for search-as-you-type functionality
prefix_analyzer = analyzer(
    'search_as_you_type_analyzer',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter('edge_ngram', min_gram=1, max_gram=20)
    ]
)

autocomplete_analyzer = analyzer(
    'autocomplete_analyzer',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter('shingle', min_shingle_size=2, max_shingle_size=3),
        token_filter('edge_ngram', min_gram=1, max_gram=20)
    ]
)

class SearchDocument(Document):
    # For prefix matching; queries use the plain 'standard' analyzer so the
    # query text is not edge-ngrammed as well
    title = Text(
        analyzer=prefix_analyzer,
        search_analyzer='standard',
        fields={
            'autocomplete': Text(
                analyzer=autocomplete_analyzer,
                search_analyzer='standard'
            )
        }
    )

    class Index:
        name = 'search_docs'
```
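
A plain match query then behaves like prefix search against the edge-ngrammed field; index and field names follow the example above:

```python
from elasticsearch_dsl import Search

s = Search(index='search_docs').query('match', title='elas')
response = s.execute()
for hit in response:
    print(hit.title)
```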
739
740
### Domain-Specific Analysis
741
742
```python
743
# Analyzer for code/technical content
744
code_analyzer = analyzer(
745
'code_analyzer',
746
tokenizer=tokenizer('pattern', pattern=r'[^\w\.]+'),
747
char_filter=[
748
char_filter('pattern_replace', pattern=r'//.*', replacement=''), # Remove comments
749
char_filter('pattern_replace', pattern=r'/\*.*?\*/', replacement='') # Remove block comments
750
],
751
filter=[
752
'lowercase',
753
token_filter('stop', stopwords=['the', 'a', 'an', 'and', 'or', 'but']),
754
token_filter('ngram', min_gram=3, max_gram=8) # For partial matching
755
]
756
)
757
758
# Analyzer for email addresses
759
email_analyzer = analyzer(
760
'email_analyzer',
761
tokenizer=tokenizer('uax_url_email'),
762
filter=[
763
'lowercase',
764
token_filter('pattern_replace', pattern=r'@.*', replacement='') # Remove domain
765
]
766
)
767
768
class TechnicalDocument(Document):
769
code_snippet = Text(analyzer=code_analyzer)
770
author_email = Text(analyzer=email_analyzer)
771
772
class Index:
773
name = 'technical_docs'
774
```

### Phonetic and Fuzzy Matching

```python
# Analyzer for name matching with phonetic encoding
# (the 'phonetic' token filter requires the analysis-phonetic plugin)
name_analyzer = analyzer(
    'name_analyzer',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter('phonetic', encoder='double_metaphone', replace=False),
        token_filter('unique')  # Remove duplicates
    ]
)

# Analyzer with ASCII folding for international names
international_name_analyzer = analyzer(
    'international_name_analyzer',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter('asciifolding'),  # Remove accents
        token_filter('phonetic', encoder='metaphone'),
        token_filter('ngram', min_gram=2, max_gram=4)  # For partial matching (needs index.max_ngram_diff >= 2)
    ]
)

class PersonDocument(Document):
    name = Text(
        analyzer=name_analyzer,
        fields={
            'international': Text(analyzer=international_name_analyzer),
            'exact': Text(analyzer='keyword')
        }
    )

    class Index:
        name = 'people'
```

### Analysis Testing

```python
from elasticsearch_dsl import connections

# Test analyzer output
def test_analyzer(analyzer_name, text, index=None):
    """Test analyzer output on sample text."""
    client = connections.get_connection()

    response = client.indices.analyze(
        index=index,  # required for analyzers defined in a specific index
        body={
            'analyzer': analyzer_name,
            'text': text
        }
    )

    tokens = [token['token'] for token in response['tokens']]
    return tokens

# Test custom analyzer (defined in the 'articles' index above)
test_text = "The quick brown fox jumps over the lazy dog's back!"
tokens = test_analyzer('my_custom_analyzer', test_text, index='articles')
print(f"Tokens: {tokens}")

# Test different analyzers
analyzers = ['standard', 'english', 'keyword', 'simple']
for analyzer_name in analyzers:
    tokens = test_analyzer(analyzer_name, test_text)
    print(f"{analyzer_name}: {tokens}")
```
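
In recent elasticsearch_dsl releases, analyzer objects can also be checked directly via `simulate()`, which builds the `_analyze` request from the analyzer's own definition; a sketch against the `my_analyzer` object defined earlier, assuming a configured default connection:

```python
# Requires a configured default connection; see the testing helper above.
result = my_analyzer.simulate("The Quick Brown Fox")
print([t.token for t in result.tokens])
```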