Comprehensive text processing system with language-specific encoders, character set support, and custom tokenization strategies. FlexSearch's encoding system determines how content is normalized, split, and filtered before it is indexed, and is the main lever for tuning search behavior across different languages and character sets.
Configurable text encoder that processes content into search-optimized terms with support for various languages and character sets.
/**
* Text processing and encoding utilities for search optimization
* @param options - Encoder configuration options
*/
class Encoder {
constructor(options?: EncoderOptions);
}
Transform text content into search-optimized terms with configurable processing rules.
/**
* Process and encode text content into search terms
* @param content - Text content to encode
* @returns Array of processed search terms
*/
encode(content: string): string[];
/**
* Update encoder configuration with new options
* @param options - New encoder configuration
* @returns Encoder instance for chaining
*/
assign(options: EncoderOptions): this;
Usage Examples:
import { Encoder } from "flexsearch";
// Create encoder with default options
const encoder = new Encoder();
// Encode text content
const terms = encoder.encode("The quick brown fox jumps!");
console.log(terms); // ["the", "quick", "brown", "fox", "jumps"]
// Create encoder with custom options
const customEncoder = new Encoder({
dedupe: true,
normalize: true,
minlength: 2,
exclude: { punctuation: true }
});
const customTerms = customEncoder.encode("Hello, world!");
console.log(customTerms); // ["hello", "world"]
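The assign() method shown above can also update an encoder after construction; a minimal sketch, assuming the new options are merged into the existing configuration:
// Reconfigure an existing encoder; assign() returns the instance for chaining
customEncoder.assign({ maxlength: 20, cache: true });
console.log(customEncoder.encode("Hello again, world!")); // expected: ["hello", "again", "world"]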
Add character-level transformations for normalizing different character representations.
/**
* Add character mapping for normalization
* @param char_match - Character or pattern to match
* @param char_replace - Replacement character
* @returns Encoder instance for chaining
*/
addMapper(char_match: string, char_replace: string): this;
Usage Examples:
// Map accented characters to base characters
encoder.addMapper("á", "a");
encoder.addMapper("é", "e");
encoder.addMapper("ñ", "n");
// Process text with character mapping
const mapped = encoder.encode("café piñata");
console.log(mapped); // ["cafe", "pinata"]
Add regex-based pattern matching for complex text transformations.
/**
* Add pattern matching for text transformation
* @param match - String or regex pattern to match
* @param replace - Replacement string
* @returns Encoder instance for chaining
*/
addMatcher(match: string, replace: string): this;
Usage Examples:
// Remove URL patterns
encoder.addMatcher("https?://[^\\s]+", "");
// Normalize phone numbers
encoder.addMatcher("\\(\\d{3}\\)\\s?\\d{3}-\\d{4}", "phone");
const processed = encoder.encode("Visit https://example.com or call (555) 123-4567");
console.log(processed); // ["visit", "or", "call", "phone"]
Add stemming transformations to reduce words to their root forms.
/**
* Add stemming rules for root form reduction
* @param match - Word ending or pattern to match
* @param replace - Replacement ending or root
* @returns Encoder instance for chaining
*/
addStemmer(match: string, replace: string): this;
Usage Examples:
// Add common English stemming rules
encoder.addStemmer("ing", "");
encoder.addStemmer("ed", "");
encoder.addStemmer("s", "");
const stemmed = encoder.encode("running jumped cats");
console.log(stemmed); // ["runn", "jump", "cat"] (plain suffix stripping leaves "runn")
Add stop word filtering and custom term exclusion rules.
/**
* Add term to filter (stop word)
* @param term - Term to exclude from indexing
* @returns Encoder instance for chaining
*/
addFilter(term: string): this;
Usage Examples:
// Add common stop words
encoder.addFilter("the");
encoder.addFilter("and");
encoder.addFilter("or");
encoder.addFilter("but");
const filtered = encoder.encode("the cat and the dog");
console.log(filtered); // ["cat", "dog"]
Add complex text replacement rules using strings or regular expressions.
/**
* Add text replacement rules
* @param match - String or RegExp pattern to match
* @param replace - Replacement string
* @returns Encoder instance for chaining
*/
addReplacer(match: string | RegExp, replace: string): this;
Usage Examples:
// Replace abbreviations
encoder.addReplacer("Dr.", "Doctor");
encoder.addReplacer("St.", "Street");
// Remove HTML tags
encoder.addReplacer(/<[^>]*>/g, "");
const replaced = encoder.encode("Dr. Smith lives on Main St. <b>downtown</b>");
console.log(replaced); // ["doctor", "smith", "lives", "on", "main", "street", "downtown"]
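Because every add* method returns the encoder instance, rules can be registered in a single chain; a short sketch combining the rules shown above (the expected output assumes plain suffix stripping, as in the stemmer example):
// Register several rules in one chain; each add* call returns the encoder
const chained = new Encoder()
  .addMapper("é", "e")     // character normalization
  .addFilter("the")        // stop word removal
  .addStemmer("ing", "");  // suffix stripping
console.log(chained.encode("the running café")); // expected: ["runn", "cafe"]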
Pre-configured character set optimizations for different languages and use cases.
/**
* Predefined character set configurations
*/
const Charset: {
/** Exact character matching without normalization */
Exact: EncoderOptions;
/** Default balanced configuration */
Default: EncoderOptions;
/** Unicode normalization enabled */
Normalize: EncoderOptions;
/** Balanced Latin character support */
LatinBalance: EncoderOptions;
/** Advanced Latin with diacritics */
LatinAdvanced: EncoderOptions;
/** Extended Latin character support */
LatinExtra: EncoderOptions;
/** Latin Soundex phonetic matching */
LatinSoundex: EncoderOptions;
/** Chinese, Japanese, Korean support */
CJK: EncoderOptions;
/** @deprecated Simple Latin configuration */
LatinSimple: EncoderOptions;
/** @deprecated Exact Latin matching */
LatinExact: EncoderOptions;
/** @deprecated Default Latin configuration */
LatinDefault: EncoderOptions;
};
Usage Examples:
import { Index, Charset } from "flexsearch";
// Use predefined charset for Latin text
const latinIndex = new Index({
encoder: Charset.LatinAdvanced
});
// Use CJK charset for Asian languages
const cjkIndex = new Index({
encoder: Charset.CJK
});
// Use Soundex for phonetic matching
const phoneticIndex = new Index({
encoder: Charset.LatinSoundex
});
Encoder configurations optimized for specific languages.
// Import language-specific configurations
import EnglishPack from "flexsearch/lang/en";
import GermanPack from "flexsearch/lang/de";
import FrenchPack from "flexsearch/lang/fr";
Usage Examples:
// Use English language pack
const englishIndex = new Index({
encoder: EnglishPack
});
// Use German language pack
const germanIndex = new Index({
encoder: GermanPack
});
// Combine language pack with custom options
const customEnglish = new Index({
encoder: {
...EnglishPack,
minlength: 3,
cache: true
}
});
interface EncoderOptions {
/** Right-to-left language support */
rtl?: boolean;
/** Remove duplicate terms */
dedupe?: boolean;
/** Character types to include in processing */
include?: EncoderSplitOptions;
/** Character types to exclude from processing */
exclude?: EncoderSplitOptions;
/** Text splitting configuration */
split?: string | RegExp | "" | false;
/** Include numeric characters */
numeric?: boolean;
/** Text normalization function or boolean */
normalize?: boolean | ((str: string) => string);
/** Text preparation function */
prepare?: (str: string) => string;
/** Term finalization function */
finalize?: (terms: string[]) => string[];
/** Term filtering configuration */
filter?: Set<string> | ((term: string) => boolean);
/** Pattern matching map */
matcher?: Map<string, string>;
/** Character mapping configuration */
mapper?: Map<string, string>;
/** Stemming rules map */
stemmer?: Map<string, string>;
/** Text replacement rules */
replacer?: [string | RegExp, string];
/** Minimum term length */
minlength?: number;
/** Maximum term length */
maxlength?: number;
/** Enable encoder result caching */
cache?: boolean | number;
}
interface EncoderSplitOptions {
/** Include letter characters */
letter?: boolean;
/** Include number characters */
number?: boolean;
/** Include symbol characters */
symbol?: boolean;
/** Include punctuation characters */
punctuation?: boolean;
/** Include control characters */
control?: boolean;
/** Custom character or character array */
char?: string | string[];
}
Usage Examples:
// Custom encoder with specific character inclusion
const encoder = new Encoder({
include: {
letter: true,
number: true,
symbol: false,
punctuation: false
},
exclude: {
char: ["@", "#", "$"]
},
minlength: 2,
maxlength: 50,
normalize: true,
dedupe: true
});
// Right-to-left language configuration
const rtlEncoder = new Encoder({
rtl: true,
normalize: true,
split: /[\s\u200B]+/, // Include zero-width space
prepare: (str) => str.trim(),
finalize: (terms) => terms.filter(term => term.length > 1)
});
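The filter option can also be given a Set of terms directly, as declared in EncoderOptions; a minimal sketch, assuming the Set is treated as a stop-word list consistent with addFilter() above:
// Stop words supplied as a Set instead of a filter function
const setFilterEncoder = new Encoder({
  normalize: true,
  filter: new Set(["the", "and", "or"])
});
console.log(setFilterEncoder.encode("the cat and the dog")); // expected: ["cat", "dog"]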
/**
 * Custom text preparation function
* @param str - Input text string
* @returns Prepared text string
*/
type PrepareFunction = (str: string) => string;
/**
* Custom normalization function
* @param str - Input text string
* @returns Normalized text string
*/
type NormalizeFunction = (str: string) => string;
/**
* Custom term finalization function
* @param terms - Array of processed terms
* @returns Final array of terms
*/
type FinalizeFunction = (terms: string[]) => string[];
/**
* Custom term filter function
* @param term - Term to evaluate
* @returns True to include term, false to exclude
*/
type FilterFunction = (term: string) => boolean;
Usage Examples:
// Custom processing pipeline
const advancedEncoder = new Encoder({
prepare: (str) => {
// Custom text cleaning
return str.toLowerCase()
.replace(/[^\w\s]/g, ' ')
.replace(/\s+/g, ' ')
.trim();
},
normalize: (str) => {
// Custom normalization
return str.replace(/[áàâä]/g, 'a')
.replace(/[éèêë]/g, 'e')
.replace(/[íìîï]/g, 'i')
.replace(/[óòôö]/g, 'o')
.replace(/[úùûü]/g, 'u');
},
finalize: (terms) => {
// Custom term processing
return terms.filter(term => term.length >= 2)
.map(term => term.toLowerCase())
.filter((term, index, arr) => arr.indexOf(term) === index);
},
filter: (term) => {
// Custom stop word filtering
const stopWords = new Set(['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to']);
return !stopWords.has(term);
}
});
type Encoders =
| "Exact"
| "Default"
| "Normalize"
| "LatinBalance"
| "LatinAdvanced"
| "LatinExtra"
| "LatinSoundex"
| "CJK"
| ((content: string) => string[]);
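A hedged sketch of how this union can be used, assuming the encoder option of an Index accepts either a preset name or a custom encoding function matching the signature above:
import { Index } from "flexsearch";
// Preset referenced by name
const presetIndex = new Index({ encoder: "LatinBalance" });
// Fully custom encoding function: (content: string) => string[]
const fnIndex = new Index({
  encoder: (content) => content.toLowerCase().split(/\W+/).filter(Boolean)
});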