# Text Processing

Configurable text processing pipeline for tokenization, stemming, and filtering. The pipeline system allows customization of how text is processed during both indexing and searching, with built-in processors for common operations and support for custom pipeline functions.

## Capabilities

### Pipeline Class

The core pipeline system for chaining text processing functions.

```javascript { .api }
/**
 * Configurable text processing pipeline
 */
class Pipeline {
  /**
   * Create a new empty pipeline
   */
  constructor();

  /**
   * Add one or more functions to the end of the pipeline
   * @param {...Function} functions - Processing functions to add
   */
  add(...functions);

  /**
   * Add a function after an existing function
   * @param {Function} existingFn - Existing function in pipeline
   * @param {Function} newFn - New function to add after existing
   */
  after(existingFn, newFn);

  /**
   * Add a function before an existing function
   * @param {Function} existingFn - Existing function in pipeline
   * @param {Function} newFn - New function to add before existing
   */
  before(existingFn, newFn);

  /**
   * Remove a function from the pipeline
   * @param {Function} fn - Function to remove
   */
  remove(fn);

  /**
   * Process an array of tokens through the pipeline
   * @param {Array<lunr.Token>} tokens - Tokens to process
   * @returns {Array<lunr.Token>} - Processed tokens
   */
  run(tokens);

  /**
   * Process a string into tokens and run through pipeline
   * @param {string} str - String to process
   * @param {Object} metadata - Optional metadata to attach to tokens
   * @returns {Array<lunr.Token>} - Processed tokens
   */
  runString(str, metadata);

  /**
   * Clear all functions from the pipeline
   */
  reset();

  /**
   * Serialize the pipeline to JSON
   * @returns {Array<string>} - Array of registered function labels
   */
  toJSON();

  /**
   * Registry of all registered pipeline functions
   * @type {Object<string, Function>}
   */
  static registeredFunctions;

  /**
   * Register a function for use in pipelines
   * @param {Function} fn - Function to register
   * @param {string} label - Unique label for the function
   */
  static registerFunction(fn, label);

  /**
   * Warn if a function is not registered (for serialization)
   * @param {Function} fn - Function to check
   */
  static warnIfFunctionNotRegistered(fn);

  /**
   * Load a pipeline from serialized data
   * @param {Array<string>} serialized - Array of function labels
   * @returns {lunr.Pipeline} - Reconstructed pipeline
   */
  static load(serialized);
}
```

**Usage Examples:**

```javascript
const lunr = require('lunr');

// Create custom pipeline
const customPipeline = new lunr.Pipeline();
customPipeline.add(
  lunr.trimmer,
  lunr.stopWordFilter,
  lunr.stemmer
);

// Process tokens
const tokens = [
  new lunr.Token('running'),
  new lunr.Token('quickly'),
  new lunr.Token('the')
];

const processed = customPipeline.run(tokens);
// Results in stemmed tokens: ['run', 'quickli'] (stop word 'the' removed)

// Process string directly
const stringTokens = customPipeline.runString('The runners are running quickly');
```

### Built-in Pipeline Functions

Core text processing functions provided by Lunr.

```javascript { .api }
/**
 * Removes non-word characters from the beginning and end of tokens
 * @param {lunr.Token} token - Token to trim
 * @returns {lunr.Token|undefined} - Trimmed token, or undefined if nothing remains
 */
lunr.trimmer;

/**
 * Filters out common English stop words
 * @param {lunr.Token} token - Token to check
 * @returns {lunr.Token|undefined} - Token if not a stop word, undefined otherwise
 */
lunr.stopWordFilter;

/**
 * English Porter stemmer - reduces words to their root forms
 * @param {lunr.Token} token - Token to stem
 * @returns {lunr.Token} - Token with stemmed string
 */
lunr.stemmer;

/**
 * Generate a custom stop word filter
 * @param {Array<string>} stopWords - Array of words to filter out
 * @returns {Function} - Stop word filter function
 */
lunr.generateStopWordFilter;
```

**Usage Examples:**

```javascript
// Using built-in functions individually
const token = new lunr.Token('running');

const trimmed = lunr.trimmer(token); // Removes punctuation
const filtered = lunr.stopWordFilter(token); // Keeps non-stop words
const stemmed = lunr.stemmer(token); // 'running' -> Token('run')

// Creating custom stop word filter
const customStopWords = ['custom', 'specific', 'terms'];
const customFilter = lunr.generateStopWordFilter(customStopWords);

// Use in pipeline
const pipeline = new lunr.Pipeline();
pipeline.add(lunr.trimmer, customFilter, lunr.stemmer);
```

### Tokenizer

Breaks text into individual tokens for processing.

```javascript { .api }
/**
 * Default tokenizer for converting strings to tokens
 * @param {string|Object} obj - String or object to tokenize
 * @param {Object} metadata - Optional metadata to attach to tokens
 * @returns {Array<lunr.Token>} - Array of tokens
 */
lunr.tokenizer;

/**
 * Token separation pattern (default: /[\s\-]+/)
 * @type {RegExp}
 */
lunr.tokenizer.separator;
```

**Usage Examples:**

```javascript
// Basic tokenization
const tokens = lunr.tokenizer('Hello world, this is a test!');
// Returns lowercased tokens split on whitespace/hyphens. Punctuation stays
// attached until lunr.trimmer runs, e.g. Token('world,'), Token('test!')

// Tokenization with metadata
const metadata = { source: 'title' };
const titleTokens = lunr.tokenizer('My Document Title', metadata);

// Custom separator
const originalSeparator = lunr.tokenizer.separator;
lunr.tokenizer.separator = /[\s\-_]+/; // Include underscores
const customTokens = lunr.tokenizer('hello_world-test');
lunr.tokenizer.separator = originalSeparator; // Restore default

// Tokenizing objects (extracts string values)
const objTokens = lunr.tokenizer({
  title: 'Document Title',
  content: 'Document content here'
});
```

### Token Class

Individual text tokens with metadata support.

```javascript { .api }
/**
 * Wrapper for text tokens with metadata
 */
class Token {
  /**
   * Create a new token
   * @param {string} str - Token string value
   * @param {Object} metadata - Optional metadata object
   */
  constructor(str, metadata);

  /**
   * Get the string representation of the token
   * @returns {string} - Token string value
   */
  toString();

  /**
   * Apply a function to the token string
   * @param {Function} fn - Function to apply to token string
   * @returns {lunr.Token} - Token with updated string
   */
  update(fn);

  /**
   * Create a copy of the token, optionally applying a function
   * @param {Function} fn - Optional function to apply during cloning
   * @returns {lunr.Token} - Cloned token
   */
  clone(fn);
}
```

**Usage Examples:**

```javascript
// Create token with metadata
const token = new lunr.Token('running', {
  position: [0, 7],
  field: 'content'
});

console.log(token.toString()); // 'running'

// Update token string (NOTE: update mutates the token in place and
// returns the same token — it does not create a copy)
const uppercased = token.update(function (str) {
  return str.toUpperCase();
});
console.log(uppercased.toString()); // 'RUNNING'
console.log(token.toString()); // 'RUNNING' — same object as `uppercased`

// Clone with transformation — clone creates a copy, leaving the
// original untouched
const fresh = new lunr.Token('running');
const stemmed = fresh.clone(function (str) {
  return str.replace(/ing$/, '');
});
console.log(stemmed.toString()); // 'runn'
console.log(fresh.toString()); // 'running' — original unchanged
```

## Custom Pipeline Functions

### Creating Custom Processors

```javascript { .api }
/**
 * Custom pipeline function template
 * @param {lunr.Token} token - Input token
 * @returns {lunr.Token|undefined|Array<lunr.Token>} - Processed result
 */
function customProcessor(token) {
  // Return undefined to remove token
  // Return token (possibly modified) to keep it
  // Return array of tokens to split into multiple tokens
}
```

**Usage Examples:**

```javascript
// Remove numbers from tokens
/**
 * Pipeline function that strips digit runs from a token.
 * @param {lunr.Token} token - Token to clean
 * @returns {lunr.Token|undefined} - Token with digits removed, or
 *   undefined to drop the token when only digits remained
 */
function removeNumbers(token) {
  const cleaned = token.toString().replace(/\d+/g, '');
  if (cleaned.length === 0) {
    return undefined; // Remove token entirely
  }
  return token.update(() => cleaned);
}

// Convert to lowercase (alternative to built-in)
/**
 * Pipeline function that lowercases a token's string.
 * @param {lunr.Token} token - Token to transform
 * @returns {lunr.Token} - Token with lowercased string
 */
function toLowerCase(token) {
  return token.update(str => str.toLowerCase());
}

// Split camelCase into separate tokens
/**
 * Pipeline function that splits a camelCase token into one lowercased
 * token per hump, preserving the original token's metadata.
 * @param {lunr.Token} token - Token to split
 * @returns {lunr.Token|Array<lunr.Token>} - The original token when
 *   there is nothing to split, otherwise an array of new tokens
 */
function splitCamelCase(token) {
  const str = token.toString();
  // Lookahead split keeps each uppercase letter at the start of its part
  const parts = str.split(/(?=[A-Z])/).filter(part => part.length > 0);

  if (parts.length <= 1) {
    return token;
  }

  return parts.map(part => new lunr.Token(part.toLowerCase(), token.metadata));
}

// Register custom functions for serialization
lunr.Pipeline.registerFunction(removeNumbers, 'removeNumbers');
lunr.Pipeline.registerFunction(splitCamelCase, 'splitCamelCase');

// Use in pipeline
const customPipeline = new lunr.Pipeline();
customPipeline.add(
  lunr.trimmer,
  removeNumbers,
  splitCamelCase,
  lunr.stopWordFilter,
  lunr.stemmer
);
```

### Conditional Processing

```javascript
// Language-aware processor
function languageProcessor(token) {
  const metadata = token.metadata || {};

  if (metadata.language === 'code') {
    // Don't stem code tokens
    return token;
  } else {
    // Apply stemming to natural language
    return lunr.stemmer(token);
  }
}

// Field-specific processing
function fieldSpecificProcessor(token) {
  const metadata = token.metadata || {};

  if (metadata.field === 'title') {
    // Boost title tokens
    return token.update(str => str + '_TITLE');
  }

  return token;
}
```

## Pipeline Configuration Patterns

### Index vs Search Pipeline Configuration

```javascript
const idx = lunr(function () {
  this.ref('id');
  this.field('title');
  this.field('content');

  // Configure index-time pipeline (affects indexing)
  this.pipeline.remove(lunr.stopWordFilter); // Keep stop words in index
  this.pipeline.add(customNormalizer);

  // Configure search-time pipeline (affects queries)
  this.searchPipeline.remove(lunr.stemmer); // No stemming for searches
  this.searchPipeline.add(customQueryProcessor);

  documents.forEach(doc => this.add(doc));
});
```

### Multi-language Pipeline

```javascript
// Language detection function
/**
 * Guess a token's language from accented characters.
 * @param {lunr.Token} token - Token to inspect
 * @returns {string} - ISO code: 'es', 'fr', or 'en' as fallback
 */
function detectLanguage(token) {
  const str = token.toString();
  // Simple heuristics (use proper language detection in practice);
  // note shared accents like 'é' match the Spanish set first
  if (/[áéíóúñ]/.test(str)) return 'es';
  if (/[àéèêëîïôùûüÿ]/.test(str)) return 'fr';
  return 'en';
}

// Multi-language stemmer
function multiLangStemmer(token) {
  const lang = detectLanguage(token);

  switch (lang) {
    case 'es': return spanishStemmer(token);
    case 'fr': return frenchStemmer(token);
    default: return lunr.stemmer(token);
  }
}

// Register for serialization
lunr.Pipeline.registerFunction(multiLangStemmer, 'multiLangStemmer');
```

### Debug Pipeline

```javascript
// Debug processor to log pipeline steps
function debugProcessor(label) {
  function processor(token) {
    console.log(`[${label}] Processing:`, token.toString());
    return token;
  }

  // Register unique function
  lunr.Pipeline.registerFunction(processor, `debug_${label}`);
  return processor;
}

// Use in pipeline for debugging
const debugPipeline = new lunr.Pipeline();
debugPipeline.add(
  debugProcessor('start'),
  lunr.trimmer,
  debugProcessor('after_trim'),
  lunr.stopWordFilter,
  debugProcessor('after_stopwords'),
  lunr.stemmer,
  debugProcessor('final')
);
```

## Advanced Text Processing

### Metadata Preservation

```javascript
// Preserve positional information
function positionTracker(token) {
  const metadata = token.metadata || {};

  // Ensure position information is preserved
  return token.update(str => {
    // Processing logic here
    return str.toLowerCase();
  });
}

// Use with tokenizer metadata
const textWithPositions = 'The quick brown fox';
const tokens = lunr.tokenizer(textWithPositions).map((token, index) => {
  return new lunr.Token(token.toString(), {
    position: index,
    original: token.toString()
  });
});
```

### Custom Normalization

```javascript
// Unicode normalization
/**
 * Pipeline function that strips diacritics via Unicode normalization.
 * @param {lunr.Token} token - Token to normalize
 * @returns {lunr.Token} - Token with combining marks removed
 */
function unicodeNormalizer(token) {
  return token.update(str => {
    return str.normalize('NFD') // Decompose
      .replace(/[\u0300-\u036f]/g, '') // Remove diacritics
      .normalize('NFC'); // Recompose
  });
}
// Synonym expansion: maps common abbreviations to their full forms
const synonymMap = {
  'js': 'javascript',
  'ts': 'typescript',
  'node': 'nodejs'
};

/**
 * Pipeline function that expands known abbreviations, emitting both
 * the original token and a new token carrying the synonym.
 * @param {lunr.Token} token - Token to expand
 * @returns {lunr.Token|Array<lunr.Token>} - The original token, or an
 *   [original, synonym] pair when a synonym is known
 */
function synonymExpander(token) {
  const str = token.toString().toLowerCase();
  const synonym = synonymMap[str];

  if (synonym) {
    // Return both original and synonym
    return [
      token,
      new lunr.Token(synonym, token.metadata)
    ];
  }

  return token;
}
```