Unicode-aware character processing utilities for handling grapheme clusters, code points, and text categorization.
Functions for working with Unicode code points and grapheme clusters.
/**
 * Returns the next grapheme cluster break after (not equal to) `pos` when
 * `forward` is true, or the cluster break before `pos` otherwise. Returns
 * `pos` itself if no further cluster break is available.
 * @param str The string to search in
 * @param pos The starting position
 * @param forward Whether to search forward (default: true)
 * @param includeExtending Whether to include extending characters (default: true)
 * @returns The position of the cluster break that was found, or `pos` if there is none
 */
function findClusterBreak(str: string, pos: number, forward?: boolean, includeExtending?: boolean): number;
/**
 * Find the code point at the given position in a string.
 * @param str The string to examine
 * @param pos The position in the string
 * @returns The numeric code point found at `pos`
 */
function codePointAt(str: string, pos: number): number;
/**
 * Given a Unicode code point, return the JavaScript string that represents it.
 * @param code The Unicode code point
 * @returns The string that represents `code`
 */
function fromCodePoint(code: number): string;
/**
* The amount of positions a character takes up in a JavaScript string
* @param code The Unicode code point
* @returns 1 for BMP characters, 2 for supplementary characters
*/
function codePointSize(code: number): 1 | 2;
Usage Examples:
import { findClusterBreak, codePointAt, fromCodePoint, codePointSize } from "@codemirror/state";
// Working with grapheme clusters
// The emoji are astral characters (two UTF-16 code units each), which is what
// makes the offsets in the comments below come out as 6 and 8.
const text = "Hello 👋 world 🌍!";
// Find cluster boundaries
console.log(findClusterBreak(text, 6, true)); // 8 (after the waving hand emoji)
console.log(findClusterBreak(text, 8, false)); // 6 (before the waving hand emoji)
// Handle emoji properly
const emojiPos = text.indexOf("👋");
const nextPos = findClusterBreak(text, emojiPos, true);
console.log(text.slice(emojiPos, nextPos)); // "👋" (complete emoji)
// Working with code points
const surrogatePair = "𝒽𝓮𝓵𝓵𝓸"; // Mathematical script letters
console.log(codePointAt(surrogatePair, 0)); // 119997 (𝒽)
console.log(codePointSize(119997)); // 2 (takes 2 UTF-16 code units)
// Create strings from code points
const heart = fromCodePoint(0x2764); // ❤
console.log(heart); // "❤"
console.log(codePointSize(0x2764)); // 1 (BMP character)
Functions for calculating visual column positions accounting for tabs and grapheme clusters.
/**
 * Count the column position at the given offset into the string,
 * taking extending characters and tab size into account.
 * @param string The string to measure
 * @param tabSize The size of tab characters
 * @param to The offset to measure to (default: string length)
 * @returns The column position at offset `to`
 */
function countColumn(string: string, tabSize: number, to?: number): number;
/**
* Find the offset that corresponds to the given column position in a string,
* taking extending characters and tab size into account
* @param string The string to search in
* @param col The target column position
* @param tabSize The size of tab characters
* @param strict Whether to return -1 if string is too short (default: false)
*/
function findColumn(string: string, col: number, tabSize: number, strict?: boolean): number;
Usage Examples:
import { countColumn, findColumn } from "@codemirror/state";
// Text with tabs and Unicode characters
const line = "Hello\tworld\t🌍🚀!";
const tabSize = 4;
// Count columns (visual position)
console.log(countColumn(line, tabSize)); // Total visual width
console.log(countColumn(line, tabSize, 5)); // Width up to tab character
// Column positions accounting for tabs: a tab advances to the next multiple
// of the tab size, so all three strings below end at column 5
console.log(countColumn("a\tb", 4)); // 5 (a + 3 spaces to tab stop + b)
console.log(countColumn("ab\tc", 4)); // 5 (ab + 2 spaces to tab stop + c)
console.log(countColumn("abc\td", 4)); // 5 (abc + 1 space to tab stop + d)
// Find character offset for visual column
console.log(findColumn("Hello\tworld", 8, 4)); // 6 (position of 'w' in "world")
console.log(findColumn("a\tb\tc", 6, 4)); // 4 (position of 'c')
// Strict mode
console.log(findColumn("short", 10, 4, false)); // 5 (string length)
console.log(findColumn("short", 10, 4, true)); // -1 (string too short)
// Working with complex Unicode
const complexLine = "Café\t🎨\tnaïve";
console.log(countColumn(complexLine, 4)); // Properly handles accented chars and emoji
System for categorizing characters into word characters, whitespace, and other characters.
/**
 * Categories that a character categorizer can assign to a character.
 */
enum CharCategory {
  /** Characters that belong to a word */
  Word = 0,
  /** Whitespace characters */
  Space = 1,
  /** Any character that is neither a word character nor whitespace */
  Other = 2
}
/**
* Create a character categorizer function
* @param wordChars Additional characters to consider as word characters
* @returns Function that categorizes individual characters
*/
function makeCategorizer(wordChars: string): (char: string) => CharCategory;
Usage Examples:
import { CharCategory, makeCategorizer } from "@codemirror/state";
// Create a basic categorizer
const categorize = makeCategorizer("");
// Categorize different characters
console.log(categorize("a")); // CharCategory.Word
console.log(categorize("A")); // CharCategory.Word
console.log(categorize("5")); // CharCategory.Word
console.log(categorize("_")); // CharCategory.Word
console.log(categorize(" ")); // CharCategory.Space
console.log(categorize("\t")); // CharCategory.Space
console.log(categorize("\n")); // CharCategory.Space
console.log(categorize("!")); // CharCategory.Other
console.log(categorize("@")); // CharCategory.Other
// Create categorizer with custom word characters
const customCategorize = makeCategorizer("-.$");
console.log(customCategorize("-")); // CharCategory.Word (custom)
console.log(customCategorize("$")); // CharCategory.Word (custom)
console.log(customCategorize(".")); // CharCategory.Word (custom)
console.log(customCategorize("!")); // CharCategory.Other
// Use with Unicode characters. A single call yields a single category, so
// split the string into characters and categorize each one separately.
console.log([..."café"].map(char => categorize(char))); // [Word, Word, Word, Word]
console.log([..."你好"].map(char => categorize(char))); // [Word, Word] (Unicode letters)
console.log([..."123"].map(char => categorize(char))); // [Word, Word, Word] (digits)
// Practical usage: find word boundaries
/**
 * Compute the offsets in `text` at which the character category changes.
 *
 * The result always starts with 0 and, for non-empty input, ends with
 * `text.length`; every offset in between marks the first character of a run
 * whose category differs from the run before it. Offsets are UTF-16 string
 * positions, so they can be passed directly to `text.slice`.
 *
 * @param text The string to scan
 * @param categorizer Function that assigns a category to a single character
 * @returns The boundary offsets in ascending order (`[0]` for an empty string)
 */
function findWordBoundaries(text: string, categorizer: (char: string) => CharCategory): number[] {
  // An empty string has exactly one boundary; do not report offset 0 twice.
  if (text.length === 0) return [0];
  const boundaries = [0];
  let lastCategory: CharCategory | undefined;
  let offset = 0;
  // Iterate by code point so that a surrogate pair reaches the categorizer
  // as one character instead of two meaningless halves.
  for (const char of text) {
    const category = categorizer(char);
    if (lastCategory !== undefined && category !== lastCategory) {
      boundaries.push(offset);
    }
    lastCategory = category;
    offset += char.length;
  }
  boundaries.push(text.length);
  return boundaries;
}
const text = "hello world! 123";
const boundaries = findWordBoundaries(text, categorize);
console.log(boundaries); // [0, 5, 6, 11, 12, 13, 16] - word/space/other boundaries
Examples of combining character utilities for complex text processing tasks.
// Function to safely slice text at grapheme boundaries
/**
 * Slice `text` without cutting through a grapheme cluster.
 *
 * `start` is moved back to the beginning of the cluster that contains it and
 * `end` is moved forward to the end of the cluster that contains it. Offsets
 * that already sit on a cluster break are left where they are. Offsets
 * outside the string are clamped to `0` / `text.length`.
 *
 * @param text The string to slice
 * @param start Offset at which the slice should begin
 * @param end Offset at which the slice should stop (default: end of string)
 * @returns The slice, widened just enough to contain only whole clusters
 */
function safeSlice(text: string, start: number, end?: number): string {
  // findClusterBreak only reports breaks strictly before/after a position, so
  // asking for the break before an offset that is itself a break would skip a
  // whole cluster. Locate the cluster surrounding the offset instead: `before`
  // is the nearest break below it and `after` is the break that follows.
  const surroundingBreaks = (offset: number): [number, number] => {
    const before = findClusterBreak(text, offset, false);
    const after = findClusterBreak(text, before, true);
    return [before, after];
  };
  const snapToStart = (offset: number): number => {
    if (!(offset > 0)) return 0;
    if (offset >= text.length) return text.length;
    const [before, after] = surroundingBreaks(offset);
    // after <= offset means offset is itself a break (after === offset).
    return after <= offset ? after : before;
  };
  const snapToEnd = (offset: number): number => {
    if (!(offset > 0)) return 0;
    if (offset >= text.length) return text.length;
    const [, after] = surroundingBreaks(offset);
    return after >= offset ? after : findClusterBreak(text, offset, true);
  };
  const actualStart = snapToStart(start);
  const actualEnd = end !== undefined ? snapToEnd(end) : text.length;
  return text.slice(actualStart, actualEnd);
}
// Function to count visual characters (grapheme clusters)
/**
 * Count the grapheme clusters in `text` by stepping from one cluster break to
 * the next until the end of the string is reached.
 * @param text The string to measure
 * @returns The number of cluster-sized steps needed to traverse `text`
 */
function visualLength(text: string): number {
  let clusters = 0;
  for (let offset = 0; offset < text.length; offset = findClusterBreak(text, offset, true)) {
    clusters += 1;
  }
  return clusters;
}
// Function to find word at position
/**
 * Locate the word that covers offset `pos` in `text`.
 *
 * The cluster containing `pos` is categorized first; if it is a word
 * character, the range is extended cluster by cluster in both directions for
 * as long as the categorizer keeps reporting `CharCategory.Word`.
 *
 * @param text The string to search in
 * @param pos Offset of a character inside the word
 * @param categorizer Function that assigns a category to a character
 * @returns The start/end offsets and text of the word, or `null` when `pos`
 *   is outside the string or the cluster at `pos` is not a word character
 */
function wordAt(text: string, pos: number, categorizer: (char: string) => CharCategory): {start: number, end: number, word: string} | null {
  // Written as a negated range check so that NaN is rejected as well.
  if (!(pos >= 0 && pos < text.length)) return null;
  // Move pos back to the start of its cluster, so that an offset in the middle
  // of a surrogate pair or combining sequence categorizes the whole cluster
  // (as the loops below do) rather than a stray code unit.
  let anchor = pos;
  if (pos > 0) {
    const before = findClusterBreak(text, pos, false);
    const after = findClusterBreak(text, before, true);
    if (after > pos) anchor = before;
  }
  const anchorEnd = findClusterBreak(text, anchor, true);
  if (categorizer(text.slice(anchor, anchorEnd)) !== CharCategory.Word) return null;
  // Find start of word
  let start = anchor;
  while (start > 0) {
    const prevPos = findClusterBreak(text, start, false);
    if (prevPos === start) break;
    const prevChar = text.slice(prevPos, start);
    if (categorizer(prevChar) !== CharCategory.Word) break;
    start = prevPos;
  }
  // Find end of word (the anchor cluster is already known to be a word character)
  let end = anchorEnd;
  while (end < text.length) {
    const nextPos = findClusterBreak(text, end, true);
    if (nextPos === end) break;
    const nextChar = text.slice(end, nextPos);
    if (categorizer(nextChar) !== CharCategory.Word) break;
    end = nextPos;
  }
  return {
    start,
    end,
    word: text.slice(start, end)
  };
}
// Usage Examples:
// Safe text slicing with emoji
// Man (U+1F468) + zero-width joiner (U+200D) + laptop (U+1F4BB) form a single
// cluster occupying offsets 6-11; escapes keep the invisible joiner explicit.
const emojiText = "Hello \u{1F468}\u200D\u{1F4BB} world!"; // "Hello 👨‍💻 world!"
console.log(safeSlice(emojiText, 6, 11)); // "👨‍💻" (complete emoji sequence)
// Count visual characters
console.log(visualLength("Hello 👋")); // 7 (not 8, emoji counts as 1)
console.log(visualLength("café")); // 4 (proper character count)
// Find words
const sentence = "The quick-brown fox jumps!";
const categorizer = makeCategorizer("-");
const word = wordAt(sentence, 10, categorizer);
console.log(word); // {start: 4, end: 15, word: "quick-brown"}
// Column-aware text wrapping
/**
 * Break `text` into lines that are at most `maxColumns` columns wide.
 *
 * The text is consumed one grapheme cluster at a time. A tab advances to the
 * next multiple of `tabSize`; every other cluster counts as one column. A
 * cluster that would overflow the current line starts a new one, except that
 * a line always receives at least one cluster, even if that cluster alone is
 * wider than `maxColumns`.
 *
 * @param text The text to wrap
 * @param maxColumns Maximum width of a line, in columns
 * @param tabSize The size of tab characters
 * @returns The wrapped lines, in order (empty for empty input)
 */
function wrapText(text: string, maxColumns: number, tabSize: number): string[] {
  const lines: string[] = [];
  let currentLine = "";
  let currentColumn = 0;
  // Width of a cluster when it is placed at the given column.
  const widthAt = (char: string, column: number): number =>
    char === '\t' ? tabSize - (column % tabSize) : 1;
  for (let i = 0; i < text.length;) {
    const nextBreak = findClusterBreak(text, i, true);
    const char = text.slice(i, nextBreak);
    let charWidth = widthAt(char, currentColumn);
    if (currentColumn + charWidth > maxColumns && currentLine) {
      lines.push(currentLine);
      currentLine = "";
      currentColumn = 0;
      // A tab's width depends on where it starts, so it has to be measured
      // again now that the cluster begins a fresh line at column 0.
      charWidth = widthAt(char, currentColumn);
    }
    currentLine += char;
    currentColumn += charWidth;
    i = nextBreak;
  }
  if (currentLine) lines.push(currentLine);
  return lines;
}
/**
 * Character category enumeration
 */
enum CharCategory {
  /** Word characters */
  Word,
  /** Whitespace */
  Space,
  /** Anything else */
  Other
}
/**
 * Character categorizer function type: takes a string `char` and returns the
 * `CharCategory` assigned to it. This is the same shape as the function type
 * that `makeCategorizer` is declared to return.
 */
type CharCategorizer = (char: string) => CharCategory;
/**
 * Options for character processing functions. Every field is optional.
 *
 * NOTE(review): no function visible in this file accepts this interface; the
 * field descriptions below are presumed from the same-named parameters of the
 * documented functions and should be confirmed against the actual consumer.
 */
interface CharacterProcessingOptions {
/** Presumably whether to include extending characters, as for `findClusterBreak` */
includeExtending?: boolean;
/** Presumably the size of tab characters, as for `countColumn` / `findColumn` */
tabSize?: number;
/** Presumably additional characters to treat as word characters, as for `makeCategorizer` */
wordChars?: string;
}