or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced-querying.md, index-building.md, index.md, searching.md, text-processing.md, utilities.md

docs/text-processing.md

# Text Processing

Configurable text processing pipeline for tokenization, stemming, and filtering. The pipeline system allows customization of how text is processed during both indexing and searching, with built-in processors for common operations and support for custom pipeline functions.

## Capabilities

### Pipeline Class

The core pipeline system for chaining text processing functions.

```javascript { .api }
/**
 * Configurable text processing pipeline
 */
class Pipeline {
  /**
   * Create a new empty pipeline
   */
  constructor();

  /**
   * Add one or more functions to the end of the pipeline
   * @param {...Function} functions - Processing functions to add
   */
  add(...functions);

  /**
   * Add a function after an existing function
   * @param {Function} existingFn - Existing function in pipeline
   * @param {Function} newFn - New function to add after existing
   */
  after(existingFn, newFn);

  /**
   * Add a function before an existing function
   * @param {Function} existingFn - Existing function in pipeline
   * @param {Function} newFn - New function to add before existing
   */
  before(existingFn, newFn);

  /**
   * Remove a function from the pipeline
   * @param {Function} fn - Function to remove
   */
  remove(fn);

  /**
   * Process an array of tokens through the pipeline
   * @param {Array<lunr.Token>} tokens - Tokens to process
   * @returns {Array<lunr.Token>} - Processed tokens
   */
  run(tokens);

  /**
   * Process a string into tokens and run through pipeline
   * @param {string} str - String to process
   * @param {Object} metadata - Optional metadata to attach to tokens
   * @returns {Array<lunr.Token>} - Processed tokens
   */
  runString(str, metadata);

  /**
   * Clear all functions from the pipeline
   */
  reset();

  /**
   * Serialize the pipeline to JSON
   * @returns {Array<string>} - Array of registered function labels
   */
  toJSON();

  /**
   * Registry of all registered pipeline functions
   * @type {Object<string, Function>}
   */
  static registeredFunctions;

  /**
   * Register a function for use in pipelines
   * @param {Function} fn - Function to register
   * @param {string} label - Unique label for the function
   */
  static registerFunction(fn, label);

  /**
   * Warn if a function is not registered (for serialization)
   * @param {Function} fn - Function to check
   */
  static warnIfFunctionNotRegistered(fn);

  /**
   * Load a pipeline from serialized data
   * @param {Array<string>} serialized - Array of function labels
   * @returns {lunr.Pipeline} - Reconstructed pipeline
   */
  static load(serialized);
}
```

**Usage Examples:**

```javascript
const lunr = require('lunr');

// Create custom pipeline
const customPipeline = new lunr.Pipeline();
customPipeline.add(
  lunr.trimmer,
  lunr.stopWordFilter,
  lunr.stemmer
);

// Process tokens
const tokens = [
  new lunr.Token('running'),
  new lunr.Token('quickly'),
  new lunr.Token('the')
];

const processed = customPipeline.run(tokens);
// Results in stemmed tokens: ['run', 'quickli'] (stop word 'the' removed)

// Process string directly
const stringTokens = customPipeline.runString('The runners are running quickly');
```

### Built-in Pipeline Functions

Core text processing functions provided by Lunr.

```javascript { .api }
/**
 * Removes non-word characters from the beginning and end of tokens
 * @param {lunr.Token} token - Token to trim
 * @returns {lunr.Token|undefined} - Trimmed token, or undefined if nothing remains
 */
lunr.trimmer;

/**
 * Filters out common English stop words
 * @param {lunr.Token} token - Token to check
 * @returns {lunr.Token|undefined} - Token if not a stop word, undefined otherwise
 */
lunr.stopWordFilter;

/**
 * English Porter stemmer - reduces words to their root forms
 * @param {lunr.Token} token - Token to stem
 * @returns {lunr.Token} - Token with stemmed string
 */
lunr.stemmer;

/**
 * Generate a custom stop word filter
 * @param {Array<string>} stopWords - Array of words to filter out
 * @returns {Function} - Stop word filter function
 */
lunr.generateStopWordFilter;
```

**Usage Examples:**

```javascript
// Using built-in functions individually
const token = new lunr.Token('running');

const trimmed = lunr.trimmer(token); // Removes punctuation
const filtered = lunr.stopWordFilter(token); // Keeps non-stop words
const stemmed = lunr.stemmer(token); // 'running' -> Token('run')

// Creating custom stop word filter
const customStopWords = ['custom', 'specific', 'terms'];
const customFilter = lunr.generateStopWordFilter(customStopWords);

// Use in pipeline
const pipeline = new lunr.Pipeline();
pipeline.add(lunr.trimmer, customFilter, lunr.stemmer);
```

### Tokenizer

Breaks text into individual tokens for processing.

```javascript { .api }
/**
 * Default tokenizer for converting strings to tokens
 * @param {string|Object} obj - String or object to tokenize
 * @param {Object} metadata - Optional metadata to attach to tokens
 * @returns {Array<lunr.Token>} - Array of tokens
 */
lunr.tokenizer;

/**
 * Token separation pattern (default: /[\s\-]+/)
 * @type {RegExp}
 */
lunr.tokenizer.separator;
```

**Usage Examples:**

```javascript
// Basic tokenization — note the tokenizer lowercases the input and splits on
// the separator only; punctuation stays attached until lunr.trimmer runs.
const tokens = lunr.tokenizer('Hello world, this is a test!');
// Returns: [Token('hello'), Token('world,'), Token('this'), Token('is'), Token('a'), Token('test!')]

// Tokenization with metadata
const metadata = { source: 'title' };
const titleTokens = lunr.tokenizer('My Document Title', metadata);

// Custom separator
const originalSeparator = lunr.tokenizer.separator;
lunr.tokenizer.separator = /[\s\-_]+/; // Include underscores
const customTokens = lunr.tokenizer('hello_world-test');
lunr.tokenizer.separator = originalSeparator; // Restore default

// Tokenizing objects (extracts string values)
const objTokens = lunr.tokenizer({
  title: 'Document Title',
  content: 'Document content here'
});
```

### Token Class

Individual text tokens with metadata support.

```javascript { .api }
/**
 * Wrapper for text tokens with metadata
 */
class Token {
  /**
   * Create a new token
   * @param {string} str - Token string value
   * @param {Object} metadata - Optional metadata object
   */
  constructor(str, metadata);

  /**
   * Get the string representation of the token
   * @returns {string} - Token string value
   */
  toString();

  /**
   * Apply a function to the token string
   * @param {Function} fn - Function to apply to token string
   * @returns {lunr.Token} - Token with updated string
   */
  update(fn);

  /**
   * Create a copy of the token, optionally applying a function
   * @param {Function} fn - Optional function to apply during cloning
   * @returns {lunr.Token} - Cloned token
   */
  clone(fn);
}
```

**Usage Examples:**

```javascript
// Create token with metadata
const token = new lunr.Token('running', {
  position: [0, 7],
  field: 'content'
});

console.log(token.toString()); // 'running'

// Clone with transformation — clone returns a copy; the original is untouched
const stemmed = token.clone(function (str) {
  return str.replace(/ing$/, '');
});
console.log(stemmed.toString()); // 'runn'
console.log(token.toString()); // 'running' (unchanged by clone)

// Update token string — NOTE: update mutates the token in place and returns it
const uppercased = token.update(function (str) {
  return str.toUpperCase();
});
console.log(uppercased.toString()); // 'RUNNING'
console.log(token.toString()); // 'RUNNING' (same, mutated, token)
```

## Custom Pipeline Functions

### Creating Custom Processors

```javascript { .api }
/**
 * Custom pipeline function template
 * @param {lunr.Token} token - Input token
 * @returns {lunr.Token|undefined|Array<lunr.Token>} - Processed result
 */
function customProcessor(token) {
  // Return undefined to remove token
  // Return token (possibly modified) to keep it
  // Return array of tokens to split into multiple tokens
}
```

**Usage Examples:**

```javascript
// Remove numbers from tokens
function removeNumbers(token) {
  const cleaned = token.toString().replace(/\d+/g, '');
  if (cleaned.length === 0) {
    return undefined; // Remove token entirely
  }
  return token.update(() => cleaned);
}

// Convert to lowercase (alternative to built-in)
function toLowerCase(token) {
  return token.update(str => str.toLowerCase());
}

// Split camelCase into separate tokens
function splitCamelCase(token) {
  const str = token.toString();
  const parts = str.split(/(?=[A-Z])/).filter(part => part.length > 0);

  if (parts.length <= 1) {
    return token;
  }

  return parts.map(part => new lunr.Token(part.toLowerCase(), token.metadata));
}

// Register custom functions for serialization
lunr.Pipeline.registerFunction(removeNumbers, 'removeNumbers');
lunr.Pipeline.registerFunction(splitCamelCase, 'splitCamelCase');

// Use in pipeline
const customPipeline = new lunr.Pipeline();
customPipeline.add(
  lunr.trimmer,
  removeNumbers,
  splitCamelCase,
  lunr.stopWordFilter,
  lunr.stemmer
);
```

### Conditional Processing

```javascript
// Language-aware processor
function languageProcessor(token) {
  const metadata = token.metadata || {};

  if (metadata.language === 'code') {
    // Don't stem code tokens
    return token;
  } else {
    // Apply stemming to natural language
    return lunr.stemmer(token);
  }
}

// Field-specific processing
function fieldSpecificProcessor(token) {
  const metadata = token.metadata || {};

  if (metadata.field === 'title') {
    // Boost title tokens
    return token.update(str => str + '_TITLE');
  }

  return token;
}
```

## Pipeline Configuration Patterns

### Index vs Search Pipeline Configuration

```javascript
const idx = lunr(function () {
  this.ref('id');
  this.field('title');
  this.field('content');

  // Configure index-time pipeline (affects indexing)
  this.pipeline.remove(lunr.stopWordFilter); // Keep stop words in index
  this.pipeline.add(customNormalizer);

  // Configure search-time pipeline (affects queries)
  this.searchPipeline.remove(lunr.stemmer); // No stemming for searches
  this.searchPipeline.add(customQueryProcessor);

  documents.forEach(doc => this.add(doc));
});
```

### Multi-language Pipeline

```javascript
// Language detection function
function detectLanguage(token) {
  const str = token.toString();
  // Simple heuristics (use proper language detection in practice)
  if (/[áéíóúñ]/.test(str)) return 'es';
  if (/[àéèêëîïôùûüÿ]/.test(str)) return 'fr';
  return 'en';
}

// Multi-language stemmer
function multiLangStemmer(token) {
  const lang = detectLanguage(token);

  switch (lang) {
    case 'es': return spanishStemmer(token);
    case 'fr': return frenchStemmer(token);
    default: return lunr.stemmer(token);
  }
}

// Register for serialization
lunr.Pipeline.registerFunction(multiLangStemmer, 'multiLangStemmer');
```

### Debug Pipeline

```javascript
// Debug processor to log pipeline steps
function debugProcessor(label) {
  function processor(token) {
    console.log(`[${label}] Processing:`, token.toString());
    return token;
  }

  // Register unique function
  lunr.Pipeline.registerFunction(processor, `debug_${label}`);
  return processor;
}

// Use in pipeline for debugging
const debugPipeline = new lunr.Pipeline();
debugPipeline.add(
  debugProcessor('start'),
  lunr.trimmer,
  debugProcessor('after_trim'),
  lunr.stopWordFilter,
  debugProcessor('after_stopwords'),
  lunr.stemmer,
  debugProcessor('final')
);
```

## Advanced Text Processing

### Metadata Preservation

```javascript
// Preserve positional information
function positionTracker(token) {
  const metadata = token.metadata || {};

  // Ensure position information is preserved
  return token.update(str => {
    // Processing logic here
    return str.toLowerCase();
  });
}

// Use with tokenizer metadata
const textWithPositions = 'The quick brown fox';
const tokens = lunr.tokenizer(textWithPositions).map((token, index) => {
  return new lunr.Token(token.toString(), {
    position: index,
    original: token.toString()
  });
});
```

### Custom Normalization

```javascript
// Unicode normalization
function unicodeNormalizer(token) {
  return token.update(str => {
    return str.normalize('NFD') // Decompose
      .replace(/[\u0300-\u036f]/g, '') // Remove diacritics
      .normalize('NFC'); // Recompose
  });
}

// Synonym expansion
const synonymMap = {
  'js': 'javascript',
  'ts': 'typescript',
  'node': 'nodejs'
};

function synonymExpander(token) {
  const str = token.toString().toLowerCase();
  const synonym = synonymMap[str];

  if (synonym) {
    // Return both original and synonym
    return [
      token,
      new lunr.Token(synonym, token.metadata)
    ];
  }

  return token;
}
```