or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

changes.md · character-utils.md · editor-state.md · extensions.md · index.md · range-sets.md · selection.md · text.md · transactions.md
tile.json

docs/character-utils.md

Character Utilities

Unicode-aware character processing utilities for handling grapheme clusters, code points, and text categorization.

Capabilities

Unicode Character Processing

Functions for working with Unicode code points and grapheme clusters.

/**
 * Locate the nearest grapheme cluster break relative to `pos`.
 *
 * When `forward` is true the returned break lies strictly after `pos` (it is
 * never equal to `pos`); otherwise the break before `pos` is returned. If no
 * further cluster break is available, `pos` itself is returned.
 * @param str The string to search in
 * @param pos The starting position
 * @param forward Whether to search forward (default: true)
 * @param includeExtending Whether to include extending characters (default: true)
 * @returns The position of the cluster break found, or `pos` when there is none
 */
function findClusterBreak(str: string, pos: number, forward?: boolean, includeExtending?: boolean): number;

/**
 * Find the code point at the given position in a string.
 * @param str The string to examine
 * @param pos The position in the string
 * @returns The numeric code point found at `pos`
 */
function codePointAt(str: string, pos: number): number;

/**
 * Given a Unicode code point, return the JavaScript string that represents it.
 * @param code The Unicode code point
 * @returns The string representing `code`
 */
function fromCodePoint(code: number): string;

/**
 * The number of positions (UTF-16 code units) a character takes up in a
 * JavaScript string.
 * @param code The Unicode code point
 * @returns 1 for BMP characters, 2 for supplementary characters
 */
function codePointSize(code: number): 1 | 2;

Usage Examples:

import { findClusterBreak, codePointAt, fromCodePoint, codePointSize } from "@codemirror/state";

// Working with grapheme clusters
// Each emoji below is a supplementary character and occupies two UTF-16 code units.
const text = "Hello 👋 world 🌍!";

// Find cluster boundaries
console.log(findClusterBreak(text, 6, true));  // 8 (after the waving hand emoji)
console.log(findClusterBreak(text, 8, false)); // 6 (before the waving hand emoji)

// Handle emoji properly
const emojiPos = text.indexOf("👋");
const nextPos = findClusterBreak(text, emojiPos, true);
console.log(text.slice(emojiPos, nextPos)); // "👋" (complete emoji)

// Working with code points
const surrogatePair = "𝒽𝑒𝓁𝓁𝑜"; // Mathematical script letters (each one is a surrogate pair)
console.log(codePointAt(surrogatePair, 0)); // 119997 (𝒽, U+1D4BD)
console.log(codePointSize(119997)); // 2 (takes 2 UTF-16 code units)

// Create strings from code points
const heart = fromCodePoint(0x2764); // ❤
console.log(heart); // "❤"
console.log(codePointSize(0x2764)); // 1 (BMP character)

Column Calculation

Functions for calculating visual column positions accounting for tabs and grapheme clusters.

/**
 * Count the column position at the given offset into the string,
 * taking extending characters and tab size into account.
 * @param string The string to measure
 * @param tabSize The size of tab characters
 * @param to The offset to measure to (default: string length)
 * @returns The column position reached at offset `to`
 */
function countColumn(string: string, tabSize: number, to?: number): number;

/**
 * Find the offset that corresponds to the given column position in a string,
 * taking extending characters and tab size into account.
 * @param string The string to search in
 * @param col The target column position
 * @param tabSize The size of tab characters
 * @param strict Whether to return -1 if string is too short (default: false)
 * @returns The offset corresponding to `col`. When the string is too short to
 *   reach `col`, the result is -1 if `strict` is true and the string length
 *   otherwise.
 */
function findColumn(string: string, col: number, tabSize: number, strict?: boolean): number;

Usage Examples:

import { countColumn, findColumn } from "@codemirror/state";

// Text with tabs and Unicode characters
const line = "Hello\tworld\t👋🌍!";
const tabSize = 4;

// Count columns (visual position)
console.log(countColumn(line, tabSize)); // 19 (total visual width; each emoji counts as one column)
console.log(countColumn(line, tabSize, 5)); // 5 (width up to the first tab character)

// Column positions accounting for tabs: a tab advances to the next multiple of
// the tab size, so with tabSize 4 all three strings end at column 5.
console.log(countColumn("a\tb", 4));     // 5 (a, then tab fills columns 1-3, then b)
console.log(countColumn("ab\tc", 4));    // 5 (ab, then tab fills columns 2-3, then c)
console.log(countColumn("abc\td", 4));   // 5 (abc, then tab fills column 3, then d)

// Find character offset for visual column
console.log(findColumn("Hello\tworld", 8, 4)); // 6 (offset of 'w' in "world")
console.log(findColumn("a\tb\tc", 6, 4));      // 4 (offset of 'c', the first offset at or past column 6)

// Strict mode
console.log(findColumn("short", 10, 4, false)); // 5 (string length)
console.log(findColumn("short", 10, 4, true));  // -1 (string too short)

// Working with complex Unicode
const complexLine = "Café\t🎨\tnaïve";
console.log(countColumn(complexLine, 4)); // 17 (accented chars and emoji each count as one column)

Character Categorization

System for categorizing characters into word characters, whitespace, and other characters.

/**
 * Classification assigned to a character by a character categorizer.
 */
enum CharCategory {
  /** Characters that form part of a word */
  Word = 0,
  /** Whitespace characters */
  Space = 1,
  /** Any character that is neither a word character nor whitespace */
  Other = 2
}

/**
 * Create a character categorizer function.
 * @param wordChars Additional characters to consider as word characters;
 *   pass an empty string to add none
 * @returns Function that maps an individual character to its `CharCategory`
 */
function makeCategorizer(wordChars: string): (char: string) => CharCategory;

Usage Examples:

import { CharCategory, makeCategorizer } from "@codemirror/state";

// Create a basic categorizer
const categorize = makeCategorizer("");

// Categorize different characters
console.log(categorize("a"));     // CharCategory.Word
console.log(categorize("A"));     // CharCategory.Word
console.log(categorize("5"));     // CharCategory.Word
console.log(categorize("_"));     // CharCategory.Word
console.log(categorize(" "));     // CharCategory.Space
console.log(categorize("\t"));    // CharCategory.Space
console.log(categorize("\n"));    // CharCategory.Space
console.log(categorize("!"));     // CharCategory.Other
console.log(categorize("@"));     // CharCategory.Other

// Create categorizer with custom word characters
const customCategorize = makeCategorizer("-.$");

console.log(customCategorize("-")); // CharCategory.Word (custom)
console.log(customCategorize("$")); // CharCategory.Word (custom)
console.log(customCategorize(".")); // CharCategory.Word (custom)
console.log(customCategorize("!")); // CharCategory.Other

// Use with Unicode characters
// The categorizer returns ONE category per call, so spread the string into
// individual characters to get a per-character result.
console.log([..."café"].map((ch) => categorize(ch))); // Each char: Word, Word, Word, Word
console.log([..."你好"].map((ch) => categorize(ch)));   // Each char: Word, Word (Unicode letters)
console.log([..."123"].map((ch) => categorize(ch)));  // Each char: Word, Word, Word (digits)

// Practical usage: find word boundaries
/**
 * Split `text` into runs of characters that share the same category and
 * return the offsets at which those runs start, followed by `text.length`.
 *
 * The text is walked by Unicode code point rather than by UTF-16 code unit, so
 * a supplementary character (a surrogate pair) is handed to the categorizer
 * whole and never produces a boundary in the middle of the pair.
 *
 * @param text The text to scan
 * @param categorizer Function that assigns a category to a single character
 * @returns Ascending UTF-16 offsets: 0, every offset where the category
 *   changes, and `text.length`. An empty string yields `[0]`.
 */
function findWordBoundaries(text: string, categorizer: (char: string) => CharCategory): number[] {
  const boundaries = [0];
  // Nothing to scan: 0 is both the first and the last boundary, report it once.
  if (text.length === 0) return boundaries;

  let lastCategory: CharCategory | undefined;
  let offset = 0;
  // String iteration yields whole code points, keeping surrogate pairs intact.
  for (const char of text) {
    const category = categorizer(char);
    if (offset > 0 && category !== lastCategory) {
      boundaries.push(offset);
    }
    lastCategory = category;
    // Advance in UTF-16 units so offsets stay usable with String.prototype.slice.
    offset += char.length;
  }

  boundaries.push(text.length);
  return boundaries;
}

// Sample input: a word, a space, a word, punctuation, a space and digits.
const text = "hello world! 123";
// Uses the `categorize` function created by makeCategorizer("") in the snippet above.
const boundaries = findWordBoundaries(text, categorize);
console.log(boundaries); // [0, 5, 6, 11, 12, 13, 16] - word/space/other boundaries

Advanced Character Processing

Examples of combining character utilities for complex text processing tasks.

// Function to safely slice text at grapheme boundaries
/**
 * Slice `text` without cutting a grapheme cluster in half.
 *
 * An offset that already sits on a cluster boundary is used unchanged. An
 * offset that falls inside a cluster is widened outwards: `start` moves back
 * to the beginning of that cluster and `end` moves forward to its end.
 *
 * @param text The string to slice
 * @param start Start offset (UTF-16 code units)
 * @param end End offset, exclusive (default: end of the string)
 * @returns The slice, extended to whole grapheme clusters where necessary
 */
function safeSlice(text: string, start: number, end?: number): string {
  // findClusterBreak never returns `pos` itself while another break exists, so
  // calling it directly would move an already-valid boundary by a whole
  // cluster. Stepping away and back again tells us whether `pos` is a boundary.
  const breakBeforeStart = findClusterBreak(text, start, false);
  const actualStart =
    findClusterBreak(text, breakBeforeStart, true) === start ? start : breakBeforeStart;

  if (end === undefined) return text.slice(actualStart);

  const breakAfterEnd = findClusterBreak(text, end, true);
  const actualEnd =
    findClusterBreak(text, breakAfterEnd, false) === end ? end : breakAfterEnd;

  return text.slice(actualStart, actualEnd);
}

// Function to count visual characters (grapheme clusters)
/**
 * Count how many grapheme clusters `text` contains by hopping from one
 * cluster break to the next until the end of the string is reached.
 *
 * @param text The string to measure
 * @returns The number of forward cluster-break steps needed to traverse `text`
 */
function visualLength(text: string): number {
  let clusters = 0;
  for (let offset = 0; offset < text.length; offset = findClusterBreak(text, offset, true)) {
    clusters += 1;
  }
  return clusters;
}

// Function to find word at position
/**
 * Find the word that covers offset `pos`.
 *
 * The grapheme cluster containing `pos` is categorized as a whole, so letters
 * outside the BMP (stored as surrogate pairs) and offsets that fall inside a
 * cluster are handled correctly. From that cluster the word is grown in both
 * directions, one cluster at a time, while the categorizer keeps reporting
 * `CharCategory.Word`.
 *
 * @param text The text to search in
 * @param pos Offset (UTF-16 code units) of the position of interest
 * @param categorizer Function that assigns a category to a single character
 * @returns The word's start offset, exclusive end offset and content, or
 *   `null` when `pos` is out of range or not on a word character
 */
function wordAt(text: string, pos: number, categorizer: (char: string) => CharCategory): {start: number, end: number, word: string} | null {
  if (pos < 0 || pos >= text.length) return null;

  // Locate the cluster that contains `pos`: step to the break after it, then
  // back to the break before that one. Clamp to `pos` as a safeguard.
  const clusterEnd = findClusterBreak(text, pos, true);
  const clusterStart = Math.min(pos, findClusterBreak(text, clusterEnd, false));
  if (categorizer(text.slice(clusterStart, clusterEnd)) !== CharCategory.Word) return null;

  // Find start of word
  let start = clusterStart;
  while (start > 0) {
    const prevPos = findClusterBreak(text, start, false);
    if (prevPos === start) break; // no further break available
    if (categorizer(text.slice(prevPos, start)) !== CharCategory.Word) break;
    start = prevPos;
  }

  // Find end of word
  let end = clusterEnd;
  while (end < text.length) {
    const nextPos = findClusterBreak(text, end, true);
    if (nextPos === end) break; // no further break available
    if (categorizer(text.slice(end, nextPos)) !== CharCategory.Word) break;
    end = nextPos;
  }

  return {
    start,
    end,
    word: text.slice(start, end)
  };
}

Usage Examples:

// Safe text slicing with emoji
// The emoji below is a ZWJ sequence (man + zero-width joiner + laptop) that
// occupies offsets 6-11 of the string.
const emojiText = "Hello 👨‍💻 world!";
// Offsets 7 and 10 both fall inside the sequence; safeSlice widens them to
// the enclosing cluster boundaries instead of cutting the emoji apart.
console.log(safeSlice(emojiText, 7, 10)); // "👨‍💻" (complete emoji sequence)

// Count visual characters
console.log(visualLength("Hello 👋")); // 7 (not 8, emoji counts as 1)
console.log(visualLength("café"));     // 4 (proper character count)

// Find words
const sentence = "The quick-brown fox jumps!";
const categorizer = makeCategorizer("-");
const word = wordAt(sentence, 10, categorizer);
console.log(word); // {start: 4, end: 15, word: "quick-brown"}

// Column-aware text wrapping
/**
 * Break `text` into lines that are at most `maxColumns` columns wide.
 *
 * The text is consumed one grapheme cluster at a time. Every cluster counts
 * as one column, except a tab, which advances to the next multiple of
 * `tabSize`. A cluster that would overflow the current line starts a new one;
 * a single cluster wider than `maxColumns` is still placed on a line of its own.
 * Line-break characters in `text` get no special treatment.
 *
 * @param text The text to wrap
 * @param maxColumns Maximum visual width of a line, in columns
 * @param tabSize The size of tab characters
 * @returns The wrapped lines, in order; an empty array for empty input
 */
function wrapText(text: string, maxColumns: number, tabSize: number): string[] {
  const lines: string[] = [];
  let currentLine = "";
  let currentColumn = 0;

  // Width of `char` when it is placed at `column`; only tabs depend on the column.
  const widthAt = (char: string, column: number): number =>
    char === "\t" ? tabSize - (column % tabSize) : 1;

  for (let i = 0; i < text.length;) {
    const nextBreak = findClusterBreak(text, i, true);
    const char = text.slice(i, nextBreak);

    if (currentColumn + widthAt(char, currentColumn) > maxColumns && currentLine) {
      lines.push(currentLine);
      currentLine = "";
      currentColumn = 0;
    }

    currentLine += char;
    // Measure AFTER a possible wrap: a tab moved to column 0 is a full
    // `tabSize` wide, not the remainder it had on the previous line.
    currentColumn += widthAt(char, currentColumn);
    i = nextBreak;
  }

  if (currentLine) lines.push(currentLine);
  return lines;
}

Types

/**
 * Character category enumeration, listing the numeric value of each member.
 */
enum CharCategory {
  /** Word characters */
  Word = 0,
  /** Whitespace */
  Space = 1, 
  /** Anything else */
  Other = 2
}

/**
 * Character categorizer function type: takes a character (as a string) and
 * returns its `CharCategory`. This is the shape of the function returned by
 * `makeCategorizer`.
 */
type CharCategorizer = (char: string) => CharCategory;

/**
 * Options for character processing functions.
 *
 * NOTE(review): none of the function signatures shown on this page accept this
 * interface; the field notes below are inferred from the same-named
 * parameters of those functions — confirm before relying on them.
 */
interface CharacterProcessingOptions {
  // Presumably mirrors the `includeExtending` parameter of findClusterBreak.
  includeExtending?: boolean;
  // Presumably mirrors the `tabSize` parameter of countColumn / findColumn.
  tabSize?: number;
  // Presumably mirrors the `wordChars` parameter of makeCategorizer.
  wordChars?: string;
}