# Analysis

Text analysis and processing capabilities for custom analyzer creation, tokenizer configuration, and text processing setup. Supports multilingual and domain-specific search requirements with comprehensive character filtering, tokenization, and token filtering options.

## Capabilities

### Analyzer Creation

Functions for creating custom analyzers with configurable components.

```python { .api }
def analyzer(name, **kwargs):
    """
    Create custom analyzer.

    Args:
        name (str): Analyzer name or type
        **kwargs: Analyzer configuration

    Returns:
        Analyzer: Analyzer object

    Parameters:
        tokenizer (str or dict): Tokenizer configuration
        char_filter (list): Character filters to apply
        filter (list): Token filters to apply
        position_increment_gap (int): Gap between array elements

    Examples:
        analyzer('custom_english',
                 tokenizer='standard',
                 filter=['lowercase', 'stop', 'stemmer'])

        analyzer('my_analyzer',
                 tokenizer={'keyword': {'buffer_size': 256}},
                 char_filter=['html_strip'],
                 filter=['lowercase', 'asciifolding'])
    """
```
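
The factory returns an analyzer object that can be attached directly to fields. A minimal sketch of that round trip, using only forms shown in this document; the `blog_analyzer` and `BlogPost` names are illustrative, and `to_dict()` is the same call used for index settings later on.

```python
from elasticsearch_dsl import Document, Text, analyzer

# Illustrative custom analyzer: standard tokenizer plus two common token filters.
blog_analyzer = analyzer(
    'blog_analyzer',
    tokenizer='standard',
    filter=['lowercase', 'stop']
)

class BlogPost(Document):
    # Analyzer objects can be passed straight to Text fields.
    body = Text(analyzer=blog_analyzer)

    class Index:
        name = 'blog_posts'

# Inspect the generated analyzer definition.
print(blog_analyzer.to_dict())
```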

### Tokenizer Creation

Functions for creating custom tokenizers.

```python { .api }
def tokenizer(name, **kwargs):
    """
    Create custom tokenizer.

    Args:
        name (str): Tokenizer name or type
        **kwargs: Tokenizer configuration

    Returns:
        Tokenizer: Tokenizer object

    Examples:
        tokenizer('standard', max_token_length=255)
        tokenizer('pattern', pattern=r'\W+', lowercase=True)
        tokenizer('ngram', min_gram=3, max_gram=4)
    """
```
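
A tokenizer object (rather than a plain name) can be passed as the `tokenizer` argument of `analyzer()`. A brief sketch in the call style used above; the `partial_match_analyzer` name is illustrative.

```python
from elasticsearch_dsl import Text, analyzer, tokenizer

# Sketch: character n-gram tokenization for substring-style matching.
partial_match_analyzer = analyzer(
    'partial_match_analyzer',
    tokenizer=tokenizer('ngram', min_gram=3, max_gram=4),
    filter=['lowercase']
)

# Attach it to a field like any other analyzer.
product_code = Text(analyzer=partial_match_analyzer)
```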

### Character Filter Creation

Functions for creating character filters.

```python { .api }
def char_filter(name, **kwargs):
    """
    Create character filter.

    Args:
        name (str): Character filter name or type
        **kwargs: Character filter configuration

    Returns:
        CharFilter: Character filter object

    Examples:
        char_filter('html_strip', escaped_tags=['b'])
        char_filter('mapping', mappings=['& => and', '| => or'])
        char_filter('pattern_replace', pattern='[0-9]', replacement='#')
    """
```

### Token Filter Creation

Functions for creating token filters.

```python { .api }
def token_filter(name, **kwargs):
    """
    Create token filter.

    Args:
        name (str): Token filter name or type
        **kwargs: Token filter configuration

    Returns:
        TokenFilter: Token filter object

    Examples:
        token_filter('stop', stopwords=['the', 'is', 'at'])
        token_filter('synonym', synonyms=['laptop,notebook', 'car,automobile'])
        token_filter('stemmer', language='english')
    """
```
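
Because `token_filter()` returns an object, a filter can be defined once and shared between several analyzers, keeping its configuration in one place. A short sketch using only call forms shown above; the filter and analyzer names are illustrative.

```python
from elasticsearch_dsl import analyzer, token_filter

# Define the stop-word filter once...
common_stop = token_filter('stop', stopwords=['the', 'is', 'at', 'which', 'on'])

# ...and reuse it in more than one analyzer.
title_analyzer = analyzer('title_analyzer',
                          tokenizer='standard',
                          filter=['lowercase', common_stop])
body_analyzer = analyzer('body_analyzer',
                         tokenizer='standard',
                         filter=['lowercase', common_stop,
                                 token_filter('stemmer', language='english')])
```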

### Normalizer Creation

Functions for creating normalizers for keyword fields.

```python { .api }
def normalizer(name, **kwargs):
    """
    Create normalizer for keyword fields.

    Args:
        name (str): Normalizer name
        **kwargs: Normalizer configuration

    Returns:
        Normalizer: Normalizer object

    Parameters:
        char_filter (list): Character filters to apply
        filter (list): Token filters to apply

    Examples:
        normalizer('lowercase_normalizer', filter=['lowercase'])
        normalizer('ascii_normalizer',
                   char_filter=['mapping'],
                   filter=['lowercase', 'asciifolding'])
    """
```
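
Normalizers apply to keyword fields, which are not tokenized. The sketch below attaches one so that exact matches ignore case; it assumes a `Keyword` field type importable alongside `Text` (as in elasticsearch_dsl), and the field and index names are illustrative.

```python
from elasticsearch_dsl import Document, Keyword, normalizer

# Lowercase keyword values so exact matching ignores case.
lowercase_normalizer = normalizer('lowercase_normalizer', filter=['lowercase'])

class Product(Document):
    # Keyword fields accept a normalizer instead of an analyzer.
    sku = Keyword(normalizer=lowercase_normalizer)

    class Index:
        name = 'products'
```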

### Built-in Analyzers

Pre-configured analyzers for common use cases.

```python { .api }
class StandardAnalyzer:
    """
    Standard analyzer with standard tokenizer and lowercase filter.
    """
    def __init__(self, max_token_length=255, stopwords=None, **kwargs):
        """
        Args:
            max_token_length (int): Maximum token length
            stopwords (list or str): Stop words configuration
            **kwargs: Additional parameters
        """

class SimpleAnalyzer:
    """
    Simple analyzer that splits on non-letter characters and lowercases.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """

class WhitespaceAnalyzer:
    """
    Whitespace analyzer that splits on whitespace characters.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """

class StopAnalyzer:
    """
    Stop analyzer with stop word filtering.
    """
    def __init__(self, stopwords=None, **kwargs):
        """
        Args:
            stopwords (list or str): Stop words configuration
            **kwargs: Additional parameters
        """

class KeywordAnalyzer:
    """
    Keyword analyzer that treats input as single token.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """

class PatternAnalyzer:
    """
    Pattern analyzer using regular expressions.
    """
    def __init__(self, pattern=r'\W+', flags=None, lowercase=True, stopwords=None, **kwargs):
        """
        Args:
            pattern (str): Regular expression pattern
            flags (str): Regular expression flags
            lowercase (bool): Convert to lowercase
            stopwords (list or str): Stop words configuration
            **kwargs: Additional parameters
        """

class LanguageAnalyzer:
    """
    Language-specific analyzer.
    """
    def __init__(self, language, **kwargs):
        """
        Args:
            language (str): Language code ('english', 'spanish', 'french', etc.)
            **kwargs: Language-specific parameters

        Supported languages:
            arabic, armenian, basque, bengali, brazilian, bulgarian, catalan,
            chinese, cjk, czech, danish, dutch, english, estonian, finnish,
            french, galician, german, greek, hindi, hungarian, indonesian,
            irish, italian, latvian, lithuanian, norwegian, persian, portuguese,
            romanian, russian, sorani, spanish, swedish, turkish, thai
        """

class FingerprintAnalyzer:
    """
    Fingerprint analyzer for deduplication.
    """
    def __init__(self, separator=' ', max_output_size=255, stopwords=None, **kwargs):
        """
        Args:
            separator (str): Token separator in output
            max_output_size (int): Maximum output size
            stopwords (list or str): Stop words configuration
            **kwargs: Additional parameters
        """

class CustomAnalyzer:
    """
    Custom analyzer builder.
    """
    def __init__(self, tokenizer, char_filter=None, filter=None, **kwargs):
        """
        Args:
            tokenizer (str or dict): Tokenizer configuration
            char_filter (list, optional): Character filters
            filter (list, optional): Token filters
            **kwargs: Additional parameters
        """
```
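
The built-in analyzers need no entry in index settings; they can be referenced on a field by name, or a configured variant can be assembled from the same building blocks with `analyzer()`. A small sketch; the field and index names are illustrative.

```python
from elasticsearch_dsl import Document, Text, analyzer

class Review(Document):
    # Reference a built-in analyzer directly by name.
    summary = Text(analyzer='english')

    # Or assemble a custom analyzer from built-in components.
    body = Text(analyzer=analyzer('strict_standard',
                                  tokenizer='standard',
                                  filter=['lowercase', 'stop']))

    class Index:
        name = 'reviews'
```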

### Built-in Tokenizers

Pre-configured tokenizers for various text processing needs.

```python { .api }
class StandardTokenizer:
    """
    Standard tokenizer based on Unicode Text Segmentation.
    """
    def __init__(self, max_token_length=255, **kwargs):
        """
        Args:
            max_token_length (int): Maximum token length
            **kwargs: Additional parameters
        """

class KeywordTokenizer:
    """
    Keyword tokenizer that outputs entire input as single token.
    """
    def __init__(self, buffer_size=256, **kwargs):
        """
        Args:
            buffer_size (int): Input buffer size
            **kwargs: Additional parameters
        """

class WhitespaceTokenizer:
    """
    Whitespace tokenizer that splits on whitespace.
    """
    def __init__(self, max_token_length=255, **kwargs):
        """
        Args:
            max_token_length (int): Maximum token length
            **kwargs: Additional parameters
        """

class PatternTokenizer:
    """
    Pattern tokenizer using regular expressions.
    """
    def __init__(self, pattern=r'\W+', flags=None, group=-1, **kwargs):
        """
        Args:
            pattern (str): Regular expression pattern
            flags (str): Regular expression flags
            group (int): Capture group to extract (-1 = split on pattern)
            **kwargs: Additional parameters
        """

class NGramTokenizer:
    """
    N-gram tokenizer for partial matching.
    """
    def __init__(self, min_gram=1, max_gram=2, token_chars=None, **kwargs):
        """
        Args:
            min_gram (int): Minimum n-gram length
            max_gram (int): Maximum n-gram length
            token_chars (list): Character classes to include in tokens
            **kwargs: Additional parameters

        Token character classes: letter, digit, whitespace, punctuation, symbol
        """

class EdgeNGramTokenizer:
    """
    Edge n-gram tokenizer for prefix matching.
    """
    def __init__(self, min_gram=1, max_gram=2, token_chars=None, **kwargs):
        """
        Args and parameters same as NGramTokenizer.
        """

class PathHierarchyTokenizer:
    """
    Path hierarchy tokenizer for filesystem paths.
    """
    def __init__(self, delimiter='/', replacement=None, buffer_size=1024,
                 reverse=False, skip=0, **kwargs):
        """
        Args:
            delimiter (str): Path delimiter
            replacement (str, optional): Replacement for delimiter in output
            buffer_size (int): Input buffer size
            reverse (bool): Process path in reverse order
            skip (int): Number of initial tokens to skip
            **kwargs: Additional parameters
        """

class ClassicTokenizer:
    """
    Classic tokenizer based on English grammar.
    """
    def __init__(self, max_token_length=255, **kwargs):
        """
        Args:
            max_token_length (int): Maximum token length
            **kwargs: Additional parameters
        """

class LetterTokenizer:
    """
    Letter tokenizer that splits on non-letter characters.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """

class LowercaseTokenizer:
    """
    Lowercase tokenizer that splits on non-letter and lowercases.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """
```
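
As a concrete use of one of these tokenizers, a path hierarchy tokenizer lets a query for `/var/log` match documents indexed under `/var/log/nginx/access.log`, because every path prefix is emitted as a token. A minimal sketch in this document's call style; the names are illustrative.

```python
from elasticsearch_dsl import Document, Text, analyzer, tokenizer

# Emits /var, /var/log, /var/log/nginx, ... as separate tokens.
path_analyzer = analyzer(
    'path_analyzer',
    tokenizer=tokenizer('path_hierarchy', delimiter='/')
)

class FileRecord(Document):
    path = Text(analyzer=path_analyzer)

    class Index:
        name = 'files'
```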

### Character Filters

Character filters for preprocessing text before tokenization.

```python { .api }
class HtmlStripCharFilter:
    """
    HTML strip character filter.
    """
    def __init__(self, escaped_tags=None, **kwargs):
        """
        Args:
            escaped_tags (list, optional): HTML tags to escape instead of strip
            **kwargs: Additional parameters
        """

class MappingCharFilter:
    """
    Mapping character filter for character replacement.
    """
    def __init__(self, mappings=None, mappings_path=None, **kwargs):
        """
        Args:
            mappings (list, optional): List of mappings ('from => to')
            mappings_path (str, optional): Path to mappings file
            **kwargs: Additional parameters
        """

class PatternReplaceCharFilter:
    """
    Pattern replace character filter using regular expressions.
    """
    def __init__(self, pattern, replacement='', flags=None, **kwargs):
        """
        Args:
            pattern (str): Regular expression pattern
            replacement (str): Replacement string
            flags (str): Regular expression flags
            **kwargs: Additional parameters
        """
```
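
Character filters run on the raw input before the tokenizer, so they can change what ends up as a token at all. A small sketch reusing the mapping filter form shown above; the analyzer name is illustrative.

```python
from elasticsearch_dsl import analyzer, char_filter

# Without the mapping, the standard tokenizer drops '&' entirely; with it,
# "black & white" is indexed as ['black', 'and', 'white'].
symbol_analyzer = analyzer(
    'symbol_analyzer',
    char_filter=[char_filter('mapping', mappings=['& => and'])],
    tokenizer='standard',
    filter=['lowercase']
)
```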

### Token Filters

Token filters for processing tokens after tokenization.

```python { .api }
class LowercaseTokenFilter:
    """
    Lowercase token filter.
    """
    def __init__(self, language=None, **kwargs):
        """
        Args:
            language (str, optional): Language-specific lowercasing
            **kwargs: Additional parameters
        """

class UppercaseTokenFilter:
    """
    Uppercase token filter.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """

class StopTokenFilter:
    """
    Stop word token filter.
    """
    def __init__(self, stopwords=None, stopwords_path=None, ignore_case=False,
                 remove_trailing=True, **kwargs):
        """
        Args:
            stopwords (list or str, optional): Stop words or language name
            stopwords_path (str, optional): Path to stop words file
            ignore_case (bool): Case insensitive matching
            remove_trailing (bool): Remove trailing stop words
            **kwargs: Additional parameters
        """

class StemmerTokenFilter:
    """
    Stemmer token filter.
    """
    def __init__(self, language='english', **kwargs):
        """
        Args:
            language (str): Stemming language
            **kwargs: Additional parameters

        Supported languages: Same as LanguageAnalyzer
        """

class SnowballTokenFilter:
    """
    Snowball stemmer token filter.
    """
    def __init__(self, language='english', **kwargs):
        """
        Args:
            language (str): Snowball stemming language
            **kwargs: Additional parameters
        """

class SynonymTokenFilter:
    """
    Synonym token filter.
    """
    def __init__(self, synonyms=None, synonyms_path=None, expand=True,
                 lenient=False, **kwargs):
        """
        Args:
            synonyms (list, optional): List of synonym rules
            synonyms_path (str, optional): Path to synonyms file
            expand (bool): Expand synonyms
            lenient (bool): Ignore malformed synonym rules
            **kwargs: Additional parameters

        Synonym formats:
        - 'laptop,notebook,computer' (equivalent synonyms)
        - 'laptop,notebook => computer' (explicit mapping)
        """

class NGramTokenFilter:
    """
    N-gram token filter.
    """
    def __init__(self, min_gram=1, max_gram=2, preserve_original=False, **kwargs):
        """
        Args:
            min_gram (int): Minimum n-gram length
            max_gram (int): Maximum n-gram length
            preserve_original (bool): Keep original tokens
            **kwargs: Additional parameters
        """

class EdgeNGramTokenFilter:
    """
    Edge n-gram token filter.
    """
    def __init__(self, min_gram=1, max_gram=2, side='front', preserve_original=False, **kwargs):
        """
        Args:
            min_gram (int): Minimum n-gram length
            max_gram (int): Maximum n-gram length
            side (str): Side to generate n-grams from ('front' or 'back')
            preserve_original (bool): Keep original tokens
            **kwargs: Additional parameters
        """

class ShingleTokenFilter:
    """
    Shingle token filter for word n-grams.
    """
    def __init__(self, min_shingle_size=2, max_shingle_size=2, output_unigrams=True,
                 output_unigrams_if_no_shingles=False, token_separator=' ',
                 filler_token='_', **kwargs):
        """
        Args:
            min_shingle_size (int): Minimum shingle size
            max_shingle_size (int): Maximum shingle size
            output_unigrams (bool): Output single tokens
            output_unigrams_if_no_shingles (bool): Output unigrams when no shingles
            token_separator (str): Token separator in shingles
            filler_token (str): Filler for missing positions
            **kwargs: Additional parameters
        """

class AsciiFoldingTokenFilter:
    """
    ASCII folding token filter for removing accents.
    """
    def __init__(self, preserve_original=False, **kwargs):
        """
        Args:
            preserve_original (bool): Keep original tokens
            **kwargs: Additional parameters
        """

class LengthTokenFilter:
    """
    Length token filter for filtering by token length.
    """
    def __init__(self, min_length=0, max_length=None, **kwargs):
        """
        Args:
            min_length (int): Minimum token length
            max_length (int, optional): Maximum token length
            **kwargs: Additional parameters
        """

class TruncateTokenFilter:
    """
    Truncate token filter for limiting token length.
    """
    def __init__(self, length=10, **kwargs):
        """
        Args:
            length (int): Maximum token length
            **kwargs: Additional parameters
        """

class ReverseTokenFilter:
    """
    Reverse token filter for reversing token characters.
    """
    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: Additional parameters
        """

class ElisionTokenFilter:
    """
    Elision token filter for removing elisions.
    """
    def __init__(self, articles=None, articles_path=None, articles_case=False, **kwargs):
        """
        Args:
            articles (list, optional): List of elision articles
            articles_path (str, optional): Path to articles file
            articles_case (bool): Case sensitive matching
            **kwargs: Additional parameters
        """

class PhoneticTokenFilter:
    """
    Phonetic token filter for phonetic matching.
    """
    def __init__(self, encoder='metaphone', replace=True, **kwargs):
        """
        Args:
            encoder (str): Phonetic encoder algorithm
            replace (bool): Replace original token
            **kwargs: Additional parameters

        Encoders: metaphone, double_metaphone, soundex, refined_soundex,
        caverphone1, caverphone2, cologne, nysiis, koelnerphonetik,
        haasephonetik, beider_morse, daitch_mokotoff
        """
```

## Usage Examples

### Custom Analyzer Configuration

```python
from elasticsearch_dsl import Document, Text, analyzer, tokenizer, char_filter, token_filter

# Define custom analyzer
my_analyzer = analyzer(
    'my_custom_analyzer',
    tokenizer=tokenizer('standard', max_token_length=200),
    char_filter=[
        char_filter('html_strip'),
        char_filter('mapping', mappings=['& => and', '@ => at'])
    ],
    filter=[
        token_filter('lowercase'),
        token_filter('stop', stopwords=['the', 'is', 'at', 'which', 'on']),
        token_filter('stemmer', language='english'),
        token_filter('synonym', synonyms=[
            'laptop,notebook,computer',
            'car,automobile,vehicle'
        ])
    ]
)

# Use in document definition
class Article(Document):
    title = Text(analyzer=my_analyzer)
    content = Text(
        analyzer=my_analyzer,
        fields={
            'raw': Text(analyzer='keyword'),
            'stemmed': Text(analyzer='stemmer')
        }
    )

    class Index:
        name = 'articles'
        settings = {
            'analysis': {
                'analyzer': {
                    'my_custom_analyzer': my_analyzer.to_dict()
                }
            }
        }
```

### Language-Specific Analysis

```python
# Multi-language document with different analyzers
class MultilingualDocument(Document):
    # English content
    title_en = Text(analyzer='english')
    content_en = Text(analyzer='english')

    # Spanish content
    title_es = Text(analyzer='spanish')
    content_es = Text(analyzer='spanish')

    # French content
    title_fr = Text(analyzer='french')
    content_fr = Text(analyzer='french')

    # Content of unknown language, indexed with several language analyzers as sub-fields
    content_auto = Text(
        fields={
            'english': Text(analyzer='english'),
            'spanish': Text(analyzer='spanish'),
            'french': Text(analyzer='french')
        }
    )

    class Index:
        name = 'multilingual_docs'
```

### Search-as-You-Type Analysis

```python
# Analyzer for search-as-you-type functionality
search_analyzer = analyzer(
    'search_as_you_type_analyzer',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter('edge_ngram', min_gram=1, max_gram=20)
    ]
)

autocomplete_analyzer = analyzer(
    'autocomplete_analyzer',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter('shingle', min_shingle_size=2, max_shingle_size=3),
        token_filter('edge_ngram', min_gram=1, max_gram=20)
    ]
)

class SearchDocument(Document):
    # For prefix matching
    title = Text(
        analyzer=search_analyzer,
        search_analyzer='standard',
        fields={
            'autocomplete': Text(
                analyzer=autocomplete_analyzer,
                search_analyzer='standard'
            )
        }
    )

    class Index:
        name = 'search_docs'
```

### Domain-Specific Analysis

```python
# Analyzer for code/technical content
code_analyzer = analyzer(
    'code_analyzer',
    tokenizer=tokenizer('pattern', pattern=r'[^\w\.]+'),
    char_filter=[
        char_filter('pattern_replace', pattern=r'//.*', replacement=''),  # Remove comments
        char_filter('pattern_replace', pattern=r'/\*.*?\*/', replacement='')  # Remove block comments
    ],
    filter=[
        'lowercase',
        token_filter('stop', stopwords=['the', 'a', 'an', 'and', 'or', 'but']),
        token_filter('ngram', min_gram=3, max_gram=8)  # For partial matching
    ]
)

# Analyzer for email addresses
email_analyzer = analyzer(
    'email_analyzer',
    tokenizer=tokenizer('uax_url_email'),
    filter=[
        'lowercase',
        token_filter('pattern_replace', pattern=r'@.*', replacement='')  # Remove domain
    ]
)

class TechnicalDocument(Document):
    code_snippet = Text(analyzer=code_analyzer)
    author_email = Text(analyzer=email_analyzer)

    class Index:
        name = 'technical_docs'
```

### Phonetic and Fuzzy Matching

```python
# Analyzer for name matching with phonetic encoding
name_analyzer = analyzer(
    'name_analyzer',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter('phonetic', encoder='double_metaphone', replace=False),
        token_filter('unique')  # Remove duplicates
    ]
)

# Analyzer with ASCII folding for international names
international_name_analyzer = analyzer(
    'international_name_analyzer',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter('asciifolding'),  # Remove accents
        token_filter('phonetic', encoder='metaphone'),
        token_filter('ngram', min_gram=2, max_gram=4)  # For partial matching
    ]
)

class PersonDocument(Document):
    name = Text(
        analyzer=name_analyzer,
        fields={
            'international': Text(analyzer=international_name_analyzer),
            'exact': Text(analyzer='keyword')
        }
    )

    class Index:
        name = 'people'
```

### Analysis Testing

```python
from elasticsearch_dsl import connections

# Test analyzer output
def test_analyzer(analyzer_name, text, index=None):
    """Test analyzer output on sample text.

    Custom analyzers are only defined on the index that declares them, so pass
    that index's name via `index`; built-in analyzers need no index.
    """
    client = connections.get_connection()

    response = client.indices.analyze(
        index=index,
        body={
            'analyzer': analyzer_name,
            'text': text
        }
    )

    tokens = [token['token'] for token in response['tokens']]
    return tokens

# Test the custom analyzer registered on the 'articles' index above
test_text = "The quick brown fox jumps over the lazy dog's back!"
tokens = test_analyzer('my_custom_analyzer', test_text, index='articles')
print(f"Tokens: {tokens}")

# Test different built-in analyzers
analyzers = ['standard', 'english', 'keyword', 'simple']
for analyzer_name in analyzers:
    tokens = test_analyzer(analyzer_name, test_text)
    print(f"{analyzer_name}: {tokens}")
```
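
Recent elasticsearch_dsl releases also expose a `simulate()` method on analyzer objects, which wraps the same `_analyze` API; it is not part of the reference above, so treat this as a version-dependent alternative. A hedged sketch, assuming a configured default connection.

```python
from elasticsearch_dsl import analyzer, connections

# A default connection is required before calling simulate().
connections.create_connection(hosts=['http://localhost:9200'])

my_analyzer = analyzer('my_custom_analyzer',
                       tokenizer='standard',
                       filter=['lowercase', 'stop'])

# simulate() sends the analyzer definition to the _analyze API and returns the tokens.
response = my_analyzer.simulate("The quick brown fox jumps over the lazy dog's back!")
print([t.token for t in response.tokens])
```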