or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

document-search.md index-search.md index.md persistent-storage.md query-resolution.md text-processing.md worker-search.md
tile.json

docs/text-processing.md

Text Processing & Encoding

Comprehensive text processing system with language-specific encoders, character set support, and custom tokenization strategies. FlexSearch's encoding system provides optimal search performance across different languages and character sets.

Capabilities

Encoder Class

Configurable text encoder that processes content into search-optimized terms with support for various languages and character sets.

/**
 * Text processing and encoding utilities for search optimization
 * @param options - Encoder configuration options
 */
class Encoder {
  constructor(options?: EncoderOptions);
}

Text Encoding Operations

Transform text content into search-optimized terms with configurable processing rules.

/**
 * Process and encode text content into search terms
 * @param content - Text content to encode
 * @returns Array of processed search terms
 */
encode(content: string): string[];

/**
 * Update encoder configuration with new options
 * @param options - New encoder configuration
 * @returns Encoder instance for chaining
 */
assign(options: EncoderOptions): this;

Usage Examples:

import { Encoder } from "flexsearch";

// Create encoder with default options
const encoder = new Encoder();

// Encode text content
const terms = encoder.encode("The quick brown fox jumps!");
console.log(terms); // ["the", "quick", "brown", "fox", "jumps"]

// Create encoder with custom options.
// Only keys declared on EncoderOptions are passed: `tokenize` is not a member
// of that interface, so including it would fail excess-property checking.
const customEncoder = new Encoder({
  normalize: true,
  minlength: 2,
  exclude: { punctuation: true }
});

const customTerms = customEncoder.encode("Hello, world!");
console.log(customTerms); // ["hello", "world"]

Character Mapping

Add character-level transformations for normalizing different character representations.

/**
 * Add character mapping for normalization
 * @param char_match - Character or pattern to match
 * @param char_replace - Replacement character
 * @returns Encoder instance for chaining
 */
addMapper(char_match: string, char_replace: string): this;

Usage Examples:

// Map accented characters to base characters
encoder.addMapper("á", "a");
encoder.addMapper("é", "e");
encoder.addMapper("ñ", "n");

// Process text with character mapping
const mapped = encoder.encode("café piñata");
console.log(mapped); // ["cafe", "pinata"]

Pattern Matching

Add regex-based pattern matching for complex text transformations.

/**
 * Add pattern matching for text transformation
 * @param match - String or regex pattern to match
 * @param replace - Replacement string
 * @returns Encoder instance for chaining
 */
addMatcher(match: string, replace: string): this;

Usage Examples:

// Remove URL patterns
encoder.addMatcher("https?://[^\\s]+", "");

// Normalize phone numbers
encoder.addMatcher("\\(\\d{3}\\)\\s?\\d{3}-\\d{4}", "phone");

const processed = encoder.encode("Visit https://example.com or call (555) 123-4567");
console.log(processed); // ["visit", "or", "call", "phone"]

Stemming Rules

Add stemming transformations to reduce words to their root forms.

/**
 * Add stemming rules for root form reduction
 * @param match - Word ending or pattern to match
 * @param replace - Replacement ending or root
 * @returns Encoder instance for chaining
 */
addStemmer(match: string, replace: string): this;

Usage Examples:

// Add common English stemming rules
encoder.addStemmer("ing", "");
encoder.addStemmer("ed", "");
encoder.addStemmer("s", "");

const stemmed = encoder.encode("running jumped cats");
console.log(stemmed); // ["run", "jump", "cat"]

Term Filtering

Add stop word filtering and custom term exclusion rules.

/**
 * Add term to filter (stop word)
 * @param term - Term to exclude from indexing
 * @returns Encoder instance for chaining
 */
addFilter(term: string): this;

Usage Examples:

// Add common stop words
encoder.addFilter("the");
encoder.addFilter("and");
encoder.addFilter("or");
encoder.addFilter("but");

const filtered = encoder.encode("the cat and the dog");
console.log(filtered); // ["cat", "dog"]

Text Replacement

Add complex text replacement rules using strings or regular expressions.

/**
 * Add text replacement rules
 * @param match - String or RegExp pattern to match
 * @param replace - Replacement string
 * @returns Encoder instance for chaining
 */
addReplacer(match: string | RegExp, replace: string): this;

Usage Examples:

// Replace abbreviations
encoder.addReplacer("Dr.", "Doctor");
encoder.addReplacer("St.", "Street");

// Remove HTML tags
encoder.addReplacer(/<[^>]*>/g, "");

const replaced = encoder.encode("Dr. Smith lives on Main St. <b>downtown</b>");
console.log(replaced); // ["doctor", "smith", "lives", "on", "main", "street", "downtown"]

Charset Configurations

Pre-configured character set optimizations for different languages and use cases.

/**
 * Predefined character set configurations
 */
const Charset: {
  /** Exact character matching without normalization */
  Exact: EncoderOptions;
  /** Default balanced configuration */
  Default: EncoderOptions;
  /** Unicode normalization enabled */
  Normalize: EncoderOptions;
  /** Balanced Latin character support */
  LatinBalance: EncoderOptions;
  /** Advanced Latin with diacritics */
  LatinAdvanced: EncoderOptions;
  /** Extended Latin character support */
  LatinExtra: EncoderOptions;
  /** Latin Soundex phonetic matching */
  LatinSoundex: EncoderOptions;
  /** Chinese, Japanese, Korean support */
  CJK: EncoderOptions;
  /** @deprecated Simple Latin configuration */
  LatinSimple: EncoderOptions;
  /** @deprecated Exact Latin matching */
  LatinExact: EncoderOptions;
  /** @deprecated Default Latin configuration */
  LatinDefault: EncoderOptions;
};

Usage Examples:

import { Index, Charset } from "flexsearch";

// Use predefined charset for Latin text
const latinIndex = new Index({
  encoder: Charset.LatinAdvanced
});

// Use CJK charset for Asian languages
const cjkIndex = new Index({
  encoder: Charset.CJK
});

// Use Soundex for phonetic matching
const phoneticIndex = new Index({
  encoder: Charset.LatinSoundex
});

Language Packs

Language-specific encoder configurations optimized for different languages.

// Import language-specific configurations
import EnglishPack from "flexsearch/lang/en";
import GermanPack from "flexsearch/lang/de";
import FrenchPack from "flexsearch/lang/fr";

Usage Examples:

// Use English language pack
const englishIndex = new Index({
  encoder: EnglishPack
});

// Use German language pack
const germanIndex = new Index({
  encoder: GermanPack
});

// Combine language pack with custom options
const customEnglish = new Index({
  encoder: {
    ...EnglishPack,
    minlength: 3,
    cache: true
  }
});

Configuration Options

EncoderOptions Interface

interface EncoderOptions {
  /** Right-to-left language support */
  rtl?: boolean;
  /** Remove duplicate terms */
  dedupe?: boolean;
  /** Character types to include in processing */
  include?: EncoderSplitOptions;
  /** Character types to exclude from processing */
  exclude?: EncoderSplitOptions;
  /** Text splitting configuration */
  split?: string | RegExp | "" | false;
  /** Include numeric characters */
  numeric?: boolean;
  /** Text normalization function or boolean */
  normalize?: boolean | ((str: string) => string);
  /** Text preparation function */
  prepare?: (str: string) => string;
  /** Term finalization function */
  finalize?: (terms: string[]) => string[];
  /** Term filtering configuration */
  filter?: Set<string> | ((term: string) => boolean);
  /** Pattern matching map */
  matcher?: Map<string, string>;
  /** Character mapping configuration */
  mapper?: Map<string, string>;
  /** Stemming rules map */
  stemmer?: Map<string, string>;
  /** Text replacement rules */
  replacer?: [string | RegExp, string];
  /** Minimum term length */
  minlength?: number;
  /** Maximum term length */
  maxlength?: number;
  /** Enable encoder result caching */
  cache?: boolean | number;
}

EncoderSplitOptions Interface

/**
 * Selection of character classes, used as the type of both the `include`
 * and the `exclude` option of EncoderOptions. Whether a flag adds its class
 * to the included or to the excluded set depends on which of those two
 * options the object is passed as.
 */
interface EncoderSplitOptions {
  /** Select letter characters */
  letter?: boolean;
  /** Select number characters */
  number?: boolean;
  /** Select symbol characters */
  symbol?: boolean;
  /** Select punctuation characters */
  punctuation?: boolean;
  /** Select control characters */
  control?: boolean;
  /** Select a custom character, or each character in an array */
  char?: string | string[];
}

Usage Examples:

// Custom encoder with specific character inclusion
const encoder = new Encoder({
  include: {
    letter: true,
    number: true,
    symbol: false,
    punctuation: false
  },
  exclude: {
    char: ["@", "#", "$"]
  },
  minlength: 2,
  maxlength: 50,
  normalize: true,
  dedupe: true
});

// Right-to-left language configuration
const rtlEncoder = new Encoder({
  rtl: true,
  normalize: true,
  split: /[\s\u200B]+/, // Include zero-width space
  prepare: (str) => str.trim(),
  finalize: (terms) => terms.filter(term => term.length > 1)
});

Custom Processing Functions

/**
 * Custom text preparation function
 * @param str - Input text string
 * @returns Prepared text string
 */
type PrepareFunction = (str: string) => string;

/**
 * Custom normalization function
 * @param str - Input text string
 * @returns Normalized text string
 */
type NormalizeFunction = (str: string) => string;

/**
 * Custom term finalization function
 * @param terms - Array of processed terms
 * @returns Final array of terms
 */
type FinalizeFunction = (terms: string[]) => string[];

/**
 * Custom term filter function
 * @param term - Term to evaluate
 * @returns True to include term, false to exclude
 */
type FilterFunction = (term: string) => boolean;

Usage Examples:

// Custom processing pipeline

// Stop words are built once, up front, so the filter callback does not
// allocate a new Set for every term it is asked about.
const stopWords: ReadonlySet<string> = new Set([
  'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to'
]);

const advancedEncoder = new Encoder({
  prepare: (str) => {
    // Custom text cleaning: lowercase, replace every character that is
    // neither a word character nor whitespace with a space, collapse
    // whitespace runs, and trim the ends.
    return str.toLowerCase()
              .replace(/[^\w\s]/g, ' ')
              .replace(/\s+/g, ' ')
              .trim();
  },
  normalize: (str) => {
    // Custom normalization: fold lowercase accented vowels to their base letter.
    return str.replace(/[áàâä]/g, 'a')
              .replace(/[éèêë]/g, 'e')
              .replace(/[íìîï]/g, 'i')
              .replace(/[óòôö]/g, 'o')
              .replace(/[úùûü]/g, 'u');
  },
  finalize: (terms) => {
    // Custom term processing: drop terms shorter than two characters and
    // lowercase the rest. A Set then removes duplicates in linear time while
    // keeping the first occurrence of each term in its original position.
    const kept = terms.filter(term => term.length >= 2)
                      .map(term => term.toLowerCase());
    return [...new Set(kept)];
  },
  // Custom stop word filtering: return true to keep a term, false to drop it.
  filter: (term) => !stopWords.has(term)
});

Types

type Encoders = 
  | "Exact"
  | "Default" 
  | "Normalize"
  | "LatinBalance"
  | "LatinAdvanced"
  | "LatinExtra"
  | "LatinSoundex"
  | "CJK"
  | ((content: string) => string[]);