or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced-querying.md, index-building.md, index.md, searching.md, text-processing.md, utilities.md

docs/text-processing.md

# Text Processing

Configurable text processing pipeline for tokenization, stemming, and filtering. The pipeline system allows customization of how text is processed during both indexing and searching, with built-in processors for common operations and support for custom pipeline functions.

## Capabilities

### Pipeline Class

The core pipeline system for chaining text processing functions.

```javascript { .api }
/**
 * Configurable text processing pipeline
 */
class Pipeline {
  /**
   * Create a new empty pipeline
   */
  constructor();

  /**
   * Add one or more functions to the end of the pipeline
   * @param {...Function} functions - Processing functions to add
   */
  add(...functions);

  /**
   * Add a function after an existing function
   * @param {Function} existingFn - Existing function in pipeline
   * @param {Function} newFn - New function to add after existing
   */
  after(existingFn, newFn);

  /**
   * Add a function before an existing function
   * @param {Function} existingFn - Existing function in pipeline
   * @param {Function} newFn - New function to add before existing
   */
  before(existingFn, newFn);

  /**
   * Remove a function from the pipeline
   * @param {Function} fn - Function to remove
   */
  remove(fn);

  /**
   * Process an array of tokens through the pipeline
   * @param {Array<lunr.Token>} tokens - Tokens to process
   * @returns {Array<lunr.Token>} - Processed tokens
   */
  run(tokens);

  /**
   * Process a string into tokens and run through pipeline
   * @param {string} str - String to process
   * @param {Object} metadata - Optional metadata to attach to tokens
   * @returns {Array<lunr.Token>} - Processed tokens
   */
  runString(str, metadata);

  /**
   * Clear all functions from the pipeline
   */
  reset();

  /**
   * Serialize the pipeline to JSON
   * @returns {Array<string>} - Array of registered function labels
   */
  toJSON();

  /**
   * Registry of all registered pipeline functions
   * @type {Object<string, Function>}
   */
  static registeredFunctions;

  /**
   * Register a function for use in pipelines
   * @param {Function} fn - Function to register
   * @param {string} label - Unique label for the function
   */
  static registerFunction(fn, label);

  /**
   * Warn if a function is not registered (for serialization)
   * @param {Function} fn - Function to check
   */
  static warnIfFunctionNotRegistered(fn);

  /**
   * Load a pipeline from serialized data
   * @param {Array<string>} serialized - Array of function labels
   * @returns {lunr.Pipeline} - Reconstructed pipeline
   */
  static load(serialized);
}
```

**Usage Examples:**

```javascript
const lunr = require('lunr');

// Create custom pipeline
const customPipeline = new lunr.Pipeline();
customPipeline.add(
  lunr.trimmer,
  lunr.stopWordFilter,
  lunr.stemmer
);

// Process tokens
const tokens = [
  new lunr.Token('running'),
  new lunr.Token('quickly'),
  new lunr.Token('the')
];

const processed = customPipeline.run(tokens);
// Results in stemmed tokens: ['run', 'quickli'] (stop word 'the' removed)

// Process string directly
const stringTokens = customPipeline.runString('The runners are running quickly');
```

### Built-in Pipeline Functions

Core text processing functions provided by Lunr.

```javascript { .api }
/**
 * Removes non-word characters from the beginning and end of tokens
 * @param {lunr.Token} token - Token to trim
 * @returns {lunr.Token|undefined} - Trimmed token, or undefined if nothing remains
 */
lunr.trimmer;

/**
 * Filters out common English stop words
 * @param {lunr.Token} token - Token to check
 * @returns {lunr.Token|undefined} - Token if not a stop word, undefined otherwise
 */
lunr.stopWordFilter;

/**
 * English Porter stemmer - reduces words to their root forms
 * @param {lunr.Token} token - Token to stem
 * @returns {lunr.Token} - Token with stemmed string
 */
lunr.stemmer;

/**
 * Generate a custom stop word filter
 * @param {Array<string>} stopWords - Array of words to filter out
 * @returns {Function} - Stop word filter function
 */
lunr.generateStopWordFilter;
```

**Usage Examples:**

```javascript
// Using built-in functions individually
const token = new lunr.Token('running');

const trimmed = lunr.trimmer(token); // Removes punctuation
const filtered = lunr.stopWordFilter(token); // Keeps non-stop words
const stemmed = lunr.stemmer(token); // 'running' -> Token('run')

// Creating custom stop word filter
const customStopWords = ['custom', 'specific', 'terms'];
const customFilter = lunr.generateStopWordFilter(customStopWords);

// Use in pipeline
const pipeline = new lunr.Pipeline();
pipeline.add(lunr.trimmer, customFilter, lunr.stemmer);
```

### Tokenizer

Breaks text into individual tokens for processing.

```javascript { .api }
/**
 * Default tokenizer for converting strings to tokens
 * @param {string|Object} obj - String or object to tokenize
 * @param {Object} metadata - Optional metadata to attach to tokens
 * @returns {Array<lunr.Token>} - Array of tokens
 */
lunr.tokenizer;

/**
 * Token separation pattern (default: /[\s\-]+/)
 * @type {RegExp}
 */
lunr.tokenizer.separator;
```

**Usage Examples:**

```javascript
// Basic tokenization — note the tokenizer lowercases the input and splits on
// the separator only; punctuation stays attached until lunr.trimmer runs.
const tokens = lunr.tokenizer('Hello world, this is a test!');
// Returns: [Token('hello'), Token('world,'), Token('this'), Token('is'), Token('a'), Token('test!')]

// Tokenization with metadata
const metadata = { source: 'title' };
const titleTokens = lunr.tokenizer('My Document Title', metadata);

// Custom separator
const originalSeparator = lunr.tokenizer.separator;
lunr.tokenizer.separator = /[\s\-_]+/; // Include underscores
const customTokens = lunr.tokenizer('hello_world-test');
lunr.tokenizer.separator = originalSeparator; // Restore default

// Tokenizing objects (extracts string values)
const objTokens = lunr.tokenizer({
  title: 'Document Title',
  content: 'Document content here'
});
```

### Token Class

Individual text tokens with metadata support.

```javascript { .api }
/**
 * Wrapper for text tokens with metadata
 */
class Token {
  /**
   * Create a new token
   * @param {string} str - Token string value
   * @param {Object} metadata - Optional metadata object
   */
  constructor(str, metadata);

  /**
   * Get the string representation of the token
   * @returns {string} - Token string value
   */
  toString();

  /**
   * Apply a function to the token string
   * @param {Function} fn - Function to apply to token string
   * @returns {lunr.Token} - Token with updated string
   */
  update(fn);

  /**
   * Create a copy of the token, optionally applying a function
   * @param {Function} fn - Optional function to apply during cloning
   * @returns {lunr.Token} - Cloned token
   */
  clone(fn);
}
```

**Usage Examples:**

```javascript
// Create token with metadata
const token = new lunr.Token('running', {
  position: [0, 7],
  field: 'content'
});

console.log(token.toString()); // 'running'

// Clone with transformation — clone returns a copy; the original is untouched
const stemmed = token.clone(function (str) {
  return str.replace(/ing$/, '');
});
console.log(stemmed.toString()); // 'runn'
console.log(token.toString()); // 'running' (unchanged by clone)

// Update token string — NOTE: update mutates the token in place and returns it
const uppercased = token.update(function (str) {
  return str.toUpperCase();
});
console.log(uppercased.toString()); // 'RUNNING'
console.log(token.toString()); // 'RUNNING' (same, mutated, token)
```

## Custom Pipeline Functions

### Creating Custom Processors

```javascript { .api }
/**
 * Custom pipeline function template
 * @param {lunr.Token} token - Input token
 * @returns {lunr.Token|undefined|Array<lunr.Token>} - Processed result
 */
function customProcessor(token) {
  // Return undefined to remove token
  // Return token (possibly modified) to keep it
  // Return array of tokens to split into multiple tokens
}
```

**Usage Examples:**

```javascript
// Remove numbers from tokens
function removeNumbers(token) {
  const cleaned = token.toString().replace(/\d+/g, '');
  if (cleaned.length === 0) {
    return undefined; // Remove token entirely
  }
  return token.update(() => cleaned);
}

// Convert to lowercase (alternative to built-in)
function toLowerCase(token) {
  return token.update(str => str.toLowerCase());
}

// Split camelCase into separate tokens
function splitCamelCase(token) {
  const str = token.toString();
  const parts = str.split(/(?=[A-Z])/).filter(part => part.length > 0);

  if (parts.length <= 1) {
    return token;
  }

  return parts.map(part => new lunr.Token(part.toLowerCase(), token.metadata));
}

// Register custom functions for serialization
lunr.Pipeline.registerFunction(removeNumbers, 'removeNumbers');
lunr.Pipeline.registerFunction(splitCamelCase, 'splitCamelCase');

// Use in pipeline
const customPipeline = new lunr.Pipeline();
customPipeline.add(
  lunr.trimmer,
  removeNumbers,
  splitCamelCase,
  lunr.stopWordFilter,
  lunr.stemmer
);
```

### Conditional Processing

```javascript
// Language-aware processor
function languageProcessor(token) {
  const metadata = token.metadata || {};

  if (metadata.language === 'code') {
    // Don't stem code tokens
    return token;
  } else {
    // Apply stemming to natural language
    return lunr.stemmer(token);
  }
}

// Field-specific processing
function fieldSpecificProcessor(token) {
  const metadata = token.metadata || {};

  if (metadata.field === 'title') {
    // Boost title tokens
    return token.update(str => str + '_TITLE');
  }

  return token;
}
```

## Pipeline Configuration Patterns

### Index vs Search Pipeline Configuration

```javascript
const idx = lunr(function () {
  this.ref('id');
  this.field('title');
  this.field('content');

  // Configure index-time pipeline (affects indexing)
  this.pipeline.remove(lunr.stopWordFilter); // Keep stop words in index
  this.pipeline.add(customNormalizer);

  // Configure search-time pipeline (affects queries)
  this.searchPipeline.remove(lunr.stemmer); // No stemming for searches
  this.searchPipeline.add(customQueryProcessor);

  documents.forEach(doc => this.add(doc));
});
```

### Multi-language Pipeline

```javascript
// Language detection function
function detectLanguage(token) {
  const str = token.toString();
  // Simple heuristics (use proper language detection in practice)
  if (/[áéíóúñ]/.test(str)) return 'es';
  if (/[àéèêëîïôùûüÿ]/.test(str)) return 'fr';
  return 'en';
}

// Multi-language stemmer
function multiLangStemmer(token) {
  const lang = detectLanguage(token);

  switch (lang) {
    case 'es': return spanishStemmer(token);
    case 'fr': return frenchStemmer(token);
    default: return lunr.stemmer(token);
  }
}

// Register for serialization
lunr.Pipeline.registerFunction(multiLangStemmer, 'multiLangStemmer');
```

### Debug Pipeline

```javascript
// Debug processor to log pipeline steps
function debugProcessor(label) {
  function processor(token) {
    console.log(`[${label}] Processing:`, token.toString());
    return token;
  }

  // Register unique function
  lunr.Pipeline.registerFunction(processor, `debug_${label}`);
  return processor;
}

// Use in pipeline for debugging
const debugPipeline = new lunr.Pipeline();
debugPipeline.add(
  debugProcessor('start'),
  lunr.trimmer,
  debugProcessor('after_trim'),
  lunr.stopWordFilter,
  debugProcessor('after_stopwords'),
  lunr.stemmer,
  debugProcessor('final')
);
```

## Advanced Text Processing

### Metadata Preservation

```javascript
// Preserve positional information
function positionTracker(token) {
  const metadata = token.metadata || {};

  // Ensure position information is preserved
  return token.update(str => {
    // Processing logic here
    return str.toLowerCase();
  });
}

// Use with tokenizer metadata
const textWithPositions = 'The quick brown fox';
const tokens = lunr.tokenizer(textWithPositions).map((token, index) => {
  return new lunr.Token(token.toString(), {
    position: index,
    original: token.toString()
  });
});
```

### Custom Normalization

```javascript
// Unicode normalization
function unicodeNormalizer(token) {
  return token.update(str => {
    return str.normalize('NFD') // Decompose
      .replace(/[\u0300-\u036f]/g, '') // Remove diacritics
      .normalize('NFC'); // Recompose
  });
}

// Synonym expansion
const synonymMap = {
  'js': 'javascript',
  'ts': 'typescript',
  'node': 'nodejs'
};

function synonymExpander(token) {
  const str = token.toString().toLowerCase();
  const synonym = synonymMap[str];

  if (synonym) {
    // Return both original and synonym
    return [
      token,
      new lunr.Token(synonym, token.metadata)
    ];
  }

  return token;
}
```