CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/npm-lunr

Simple full-text search in your browser.

Pending
Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Security by Snyk

Pending

The risk profile of this skill

Overview
Eval results
Files

docs/text-processing.md

Text Processing

Configurable text processing pipeline for tokenization, stemming, and filtering. The pipeline system allows customization of how text is processed during both indexing and searching, with built-in processors for common operations and support for custom pipeline functions.

Capabilities

Pipeline Class

The core pipeline system for chaining text processing functions.

/**
 * Configurable text processing pipeline
 */
class Pipeline {
  /**
   * Create a new empty pipeline
   */
  constructor();

  /**
   * Add one or more functions to the end of the pipeline
   * @param {...Function} functions - Processing functions to add
   */
  add(...functions);

  /**
   * Add a function after an existing function
   * @param {Function} existingFn - Existing function in pipeline
   * @param {Function} newFn - New function to add after existing
   */
  after(existingFn, newFn);

  /**
   * Add a function before an existing function
   * @param {Function} existingFn - Existing function in pipeline
   * @param {Function} newFn - New function to add before existing
   */
  before(existingFn, newFn);

  /**
   * Remove a function from the pipeline
   * @param {Function} fn - Function to remove
   */
  remove(fn);

  /**
   * Process an array of tokens through the pipeline
   * @param {Array<lunr.Token>} tokens - Tokens to process
   * @returns {Array<lunr.Token>} - Processed tokens
   */
  run(tokens);

  /**
   * Process a string into tokens and run through pipeline
   * @param {string} str - String to process
   * @param {Object} metadata - Optional metadata to attach to tokens
   * @returns {Array<lunr.Token>} - Processed tokens
   */
  runString(str, metadata);

  /**
   * Clear all functions from the pipeline
   */
  reset();

  /**
   * Serialize the pipeline to JSON
   * @returns {Array<string>} - Array of registered function labels
   */
  toJSON();

  /**
   * Registry of all registered pipeline functions
   * @type {Object<string, Function>}
   */
  static registeredFunctions;

  /**
   * Register a function for use in pipelines
   * @param {Function} fn - Function to register
   * @param {string} label - Unique label for the function
   */
  static registerFunction(fn, label);

  /**
   * Warn if a function is not registered (for serialization)
   * @param {Function} fn - Function to check
   */
  static warnIfFunctionNotRegistered(fn);

  /**
   * Load a pipeline from serialized data
   * @param {Array<string>} serialized - Array of function labels
   * @returns {lunr.Pipeline} - Reconstructed pipeline
   */
  static load(serialized);
}

Usage Examples:

const lunr = require('lunr');

// Create custom pipeline
const customPipeline = new lunr.Pipeline();
customPipeline.add(
  lunr.trimmer,
  lunr.stopWordFilter,
  lunr.stemmer
);

// Process tokens
const tokens = [
  new lunr.Token('running'),
  new lunr.Token('quickly'),
  new lunr.Token('the')
];

const processed = customPipeline.run(tokens);
// Results in stemmed tokens: ['run', 'quickli'] (stop word 'the' removed)

// Process string directly
const stringTokens = customPipeline.runString('The runners are running quickly');

Built-in Pipeline Functions

Core text processing functions provided by Lunr.

/**
 * Removes non-word characters from the beginning and end of tokens
 * @param {lunr.Token} token - Token to trim
 * @returns {lunr.Token|undefined} - Trimmed token, or undefined if nothing remains
 */
lunr.trimmer;

/**
 * Filters out common English stop words
 * @param {lunr.Token} token - Token to check
 * @returns {lunr.Token|undefined} - Token if not a stop word, undefined otherwise
 */
lunr.stopWordFilter;

/**
 * English Porter stemmer - reduces words to their root forms
 * @param {lunr.Token} token - Token to stem
 * @returns {lunr.Token} - Token with stemmed string
 */
lunr.stemmer;

/**
 * Generate a custom stop word filter
 * @param {Array<string>} stopWords - Array of words to filter out
 * @returns {Function} - Stop word filter function
 */
lunr.generateStopWordFilter;

Usage Examples:

// Using built-in functions individually
const token = new lunr.Token('running');

const trimmed = lunr.trimmer(token);        // Removes punctuation
const filtered = lunr.stopWordFilter(token); // Keeps non-stop words
const stemmed = lunr.stemmer(token);        // 'running' -> Token('run')

// Creating custom stop word filter
const customStopWords = ['custom', 'specific', 'terms'];
const customFilter = lunr.generateStopWordFilter(customStopWords);

// Use in pipeline
const pipeline = new lunr.Pipeline();
pipeline.add(lunr.trimmer, customFilter, lunr.stemmer);

Tokenizer

Breaks text into individual tokens for processing.

/**
 * Default tokenizer for converting strings to tokens
 * @param {string|Object} obj - String or object to tokenize
 * @param {Object} metadata - Optional metadata to attach to tokens
 * @returns {Array<lunr.Token>} - Array of tokens
 */
lunr.tokenizer;

/**
 * Token separation pattern (default: /[\s\-]+/)
 * @type {RegExp}
 */
lunr.tokenizer.separator;

Usage Examples:

// Basic tokenization
const tokens = lunr.tokenizer('Hello world, this is a test!');
// Returns lowercased tokens: [Token('hello'), Token('world,'), Token('this'), Token('is'), Token('a'), Token('test!')]
// Note: the tokenizer lowercases but does not strip punctuation — lunr.trimmer does that later in the pipeline.

// Tokenization with metadata
const metadata = { source: 'title' };
const titleTokens = lunr.tokenizer('My Document Title', metadata);

// Custom separator
const originalSeparator = lunr.tokenizer.separator;
lunr.tokenizer.separator = /[\s\-_]+/; // Include underscores
const customTokens = lunr.tokenizer('hello_world-test');
lunr.tokenizer.separator = originalSeparator; // Restore default

// Tokenizing objects (extracts string values)
const objTokens = lunr.tokenizer({
  title: 'Document Title',
  content: 'Document content here'
});

Token Class

Individual text tokens with metadata support.

/**
 * Wrapper for text tokens with metadata
 */
class Token {
  /**
   * Create a new token
   * @param {string} str - Token string value
   * @param {Object} metadata - Optional metadata object
   */
  constructor(str, metadata);

  /**
   * Get the string representation of the token
   * @returns {string} - Token string value
   */
  toString();

  /**
   * Apply a function to the token string
   * @param {Function} fn - Function to apply to token string
   * @returns {lunr.Token} - Token with updated string
   */
  update(fn);

  /**
   * Create a copy of the token, optionally applying a function
   * @param {Function} fn - Optional function to apply during cloning
   * @returns {lunr.Token} - Cloned token
   */
  clone(fn);
}

Usage Examples:

// Create token with metadata
const token = new lunr.Token('running', { 
  position: [0, 7],
  field: 'content' 
});

console.log(token.toString()); // 'running'

// Update token string
const uppercased = token.update(function (str) {
  return str.toUpperCase();
});
console.log(uppercased.toString()); // 'RUNNING'

// Clone with transformation
const stemmed = token.clone(function (str) {
  return str.replace(/ing$/, '');
});
console.log(stemmed.toString()); // 'runn'

// Original token unchanged
console.log(token.toString()); // 'running'

Custom Pipeline Functions

Creating Custom Processors

/**
 * Custom pipeline function template
 * @param {lunr.Token} token - Input token
 * @returns {lunr.Token|undefined|Array<lunr.Token>} - Processed result
 */
function customProcessor(token) {
  // Contract for a lunr pipeline function:
  // - Return undefined to remove the token from the stream
  // - Return a token (possibly modified via token.update) to keep it
  // - Return an array of tokens to split it into multiple tokens
  // Falling through (as this template does) returns undefined, which
  // removes the token — a real implementation must return explicitly.
}

Usage Examples:

// Remove numbers from tokens
// Strip every digit run from the token; drop the token when nothing remains.
function removeNumbers(token) {
  const stripped = token.toString().replace(/\d+/g, '');
  // An all-digit token becomes empty — signal removal with undefined.
  return stripped.length === 0 ? undefined : token.update(() => stripped);
}

// Convert to lowercase (alternative to built-in)
// Lowercase the token's text in place via token.update.
function toLowerCase(token) {
  return token.update(function (value) {
    return value.toLowerCase();
  });
}

// Split camelCase into separate tokens
// Break a camelCase token into one lowercased token per hump,
// carrying the original token's metadata onto each new token.
function splitCamelCase(token) {
  const pieces = token
    .toString()
    .split(/(?=[A-Z])/)
    .filter((piece) => piece.length > 0);

  // Nothing to split — hand the original token back untouched.
  if (pieces.length <= 1) {
    return token;
  }

  const result = [];
  for (const piece of pieces) {
    result.push(new lunr.Token(piece.toLowerCase(), token.metadata));
  }
  return result;
}

// Register custom functions for serialization
lunr.Pipeline.registerFunction(removeNumbers, 'removeNumbers');
lunr.Pipeline.registerFunction(splitCamelCase, 'splitCamelCase');

// Use in pipeline
const customPipeline = new lunr.Pipeline();
customPipeline.add(
  lunr.trimmer,
  removeNumbers,
  splitCamelCase,
  lunr.stopWordFilter,
  lunr.stemmer
);

Conditional Processing

// Language-aware processor
// Stem natural-language tokens; leave tokens tagged as code untouched.
function languageProcessor(token) {
  const { language } = token.metadata || {};

  // Identifiers in source code should not be stemmed.
  if (language === 'code') {
    return token;
  }

  return lunr.stemmer(token);
}

// Field-specific processing
// Tag tokens that came from the title field; pass all others through.
function fieldSpecificProcessor(token) {
  const { field } = token.metadata || {};

  return field === 'title'
    ? token.update((str) => str + '_TITLE') // mark title tokens for boosting
    : token;
}

Pipeline Configuration Patterns

Index vs Search Pipeline Configuration

const idx = lunr(function () {
  this.ref('id');
  this.field('title');
  this.field('content');
  
  // Configure index-time pipeline (affects indexing)
  this.pipeline.remove(lunr.stopWordFilter); // Keep stop words in index
  this.pipeline.add(customNormalizer);
  
  // Configure search-time pipeline (affects queries)
  this.searchPipeline.remove(lunr.stemmer);  // No stemming for searches
  this.searchPipeline.add(customQueryProcessor);
  
  documents.forEach(doc => this.add(doc));
});

Multi-language Pipeline

// Language detection function
// Guess a token's language from accented characters.
// Simple heuristics only — use a proper language detector in practice.
// Spanish is checked first, so characters in both sets (e.g. 'é') yield 'es'.
function detectLanguage(token) {
  const text = token.toString();
  if (/[áéíóúñ]/.test(text)) {
    return 'es';
  }
  if (/[àéèêëîïôùûüÿ]/.test(text)) {
    return 'fr';
  }
  return 'en';
}

// Multi-language stemmer
// Route each token to the stemmer for its detected language,
// falling back to the English Porter stemmer.
function multiLangStemmer(token) {
  const language = detectLanguage(token);

  if (language === 'es') {
    return spanishStemmer(token);
  }
  if (language === 'fr') {
    return frenchStemmer(token);
  }
  return lunr.stemmer(token);
}

// Register for serialization
lunr.Pipeline.registerFunction(multiLangStemmer, 'multiLangStemmer');

Debug Pipeline

// Debug processor to log pipeline steps
// Build a pass-through pipeline function that logs every token it sees,
// registering it under a label unique to this debug point.
function debugProcessor(label) {
  const processor = (token) => {
    console.log(`[${label}] Processing:`, token.toString());
    return token; // never alters the token stream
  };

  // Registration makes the pipeline serializable despite the debug hook.
  lunr.Pipeline.registerFunction(processor, `debug_${label}`);
  return processor;
}

// Use in pipeline for debugging
const debugPipeline = new lunr.Pipeline();
debugPipeline.add(
  debugProcessor('start'),
  lunr.trimmer,
  debugProcessor('after_trim'),
  lunr.stopWordFilter,
  debugProcessor('after_stopwords'),
  lunr.stemmer,
  debugProcessor('final')
);

Advanced Text Processing

Metadata Preservation

// Preserve positional information
// Lowercase the token's text. token.update mutates only the string and
// leaves the token's existing metadata (including any position info)
// attached, so positions survive without extra handling.
// Fixes: removed an unused `metadata` local that did nothing.
function positionTracker(token) {
  return token.update((str) => str.toLowerCase());
}

// Use with tokenizer metadata
const textWithPositions = 'The quick brown fox';
const tokens = lunr.tokenizer(textWithPositions).map((token, index) => {
  return new lunr.Token(token.toString(), {
    position: index,
    original: token.toString()
  });
});

Custom Normalization

// Unicode normalization
// Strip diacritics from the token: decompose to NFD, drop the combining
// marks (U+0300–U+036F), then recompose to NFC.
function unicodeNormalizer(token) {
  return token.update((str) =>
    str
      .normalize('NFD')
      .replace(/[\u0300-\u036f]/g, '')
      .normalize('NFC')
  );
}

// Synonym expansion
// Abbreviation -> canonical term table for query/index expansion.
const synonymMap = {
  'js': 'javascript',
  'ts': 'typescript',
  'node': 'nodejs'
};

/**
 * Expand known abbreviations into both the original token and its synonym.
 * @param {lunr.Token} token - Token to check against the synonym table
 * @returns {lunr.Token|Array<lunr.Token>} - Original token, or [original, synonym]
 */
function synonymExpander(token) {
  const str = token.toString().toLowerCase();

  // Guard with Object.hasOwn so inherited Object.prototype keys
  // (e.g. 'constructor', 'toString') are never treated as synonyms.
  if (Object.hasOwn(synonymMap, str)) {
    // Keep the original so exact matches still work, and add the synonym
    // with the same metadata so positions carry over.
    return [
      token,
      new lunr.Token(synonymMap[str], token.metadata)
    ];
  }

  return token;
}

docs

advanced-querying.md

index-building.md

index.md

searching.md

text-processing.md

utilities.md

tile.json