tessl/npm-codemirror--state

Editor state data structures for the CodeMirror code editor

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Securityby

Pending

The risk profile of this skill

Overview

Eval results

Files

Character Utilities

Name: tessl/npm-codemirror--state
Author: tessl

Unicode-aware character processing utilities for handling grapheme clusters, code points, and text categorization.

Capabilities

Unicode Character Processing

Functions for working with Unicode code points and grapheme clusters.

/**
 * Returns the next grapheme cluster break after (not equal to) pos if forward is true,
 * or the closest one before pos otherwise. Returns pos itself if no further cluster
 * break is available in that direction.
 * @param str The string to search in
 * @param pos The starting position (an offset into str)
 * @param forward Whether to search forward (default: true)
 * @param includeExtending Whether to include extending characters (default: true)
 * @returns The offset of the cluster break that was found, or pos when there is none
 */
function findClusterBreak(str: string, pos: number, forward?: boolean, includeExtending?: boolean): number;

/**
 * Find the code point at the given position in a string
 * @param str The string to examine
 * @param pos The position in the string
 * @returns The Unicode code point found at pos (for example 119997 for the
 *   two-code-unit character "𝒽")
 */
function codePointAt(str: string, pos: number): number;

/**
 * Given a Unicode codepoint, return the JavaScript string that represents it
 * @param code The Unicode code point
 * @returns The string for that code point (for example "❤" for 0x2764)
 */
function fromCodePoint(code: number): string;

/**
 * The number of positions (UTF-16 code units) a character takes up in a JavaScript string
 * @param code The Unicode code point
 * @returns 1 for BMP characters, 2 for supplementary characters
 */
function codePointSize(code: number): 1 | 2;

Usage Examples:

import { findClusterBreak, codePointAt, fromCodePoint, codePointSize } from "@codemirror/state";

// Working with grapheme clusters
// ("Hello " fills offsets 0-5, so the waving hand emoji starts at offset 6)
const text = "Hello 👋 world 🌍!";

// Find cluster boundaries (the result is never the position that was passed in)
console.log(findClusterBreak(text, 6, true));  // 8 (after the waving hand emoji)
console.log(findClusterBreak(text, 8, false)); // 6 (before the waving hand emoji)

// Handle emoji properly: slice from the start of the emoji to the next cluster break
const emojiPos = text.indexOf("👋");
const nextPos = findClusterBreak(text, emojiPos, true);
console.log(text.slice(emojiPos, nextPos)); // "👋" (complete emoji)

// Working with code points
const surrogatePair = "𝒽𝑒𝓁𝓁𝑜"; // Mathematical script letters
console.log(codePointAt(surrogatePair, 0)); // 119997 (𝒽)
console.log(codePointSize(119997)); // 2 (takes 2 UTF-16 code units)

// Create strings from code points
const heart = fromCodePoint(0x2764); // ❤
console.log(heart); // "❤"
console.log(codePointSize(0x2764)); // 1 (BMP character)

Column Calculation

Functions for calculating visual column positions accounting for tabs and grapheme clusters.

/**
 * Count the column position at the given offset into the string,
 * taking extending characters and tab size into account
 * @param string The string to measure
 * @param tabSize The size of tab characters
 * @param to The offset to measure to (default: string length)
 */
function countColumn(string: string, tabSize: number, to?: number): number;

/**
 * Find the offset that corresponds to the given column position in a string,
 * taking extending characters and tab size into account
 * @param string The string to search in
 * @param col The target column position
 * @param tabSize The size of tab characters
 * @param strict Whether to return -1 if string is too short (default: false)
 */
function findColumn(string: string, col: number, tabSize: number, strict?: boolean): number;

Usage Examples:

import { countColumn, findColumn } from "@codemirror/state";

// Text with tabs and Unicode characters
const line = "Hello\tworld\t👋🌍!";
const tabSize = 4;

// Count columns (visual position)
console.log(countColumn(line, tabSize)); // Total visual width
console.log(countColumn(line, tabSize, 5)); // 5 (width of "Hello", up to the tab character)

// Column positions accounting for tabs: with tabSize 4 the first tab stop is column 4,
// so in all three cases the tab ends at column 4 and the next character makes it 5
console.log(countColumn("a\tb", 4));     // 5 (a + 3 spaces to tab stop + b)
console.log(countColumn("ab\tc", 4));    // 5 (ab + 2 spaces to tab stop + c)
console.log(countColumn("abc\td", 4));   // 5 (abc + 1 space to tab stop + d)

// Find character offset for visual column
console.log(findColumn("Hello\tworld", 8, 4)); // 6 (offset of 'w' in "world": "Hello" fills 5 columns, the tab pads to column 8)
console.log(findColumn("a\tb\tc", 6, 4));      // 4 (offset of 'c')

// Strict mode
console.log(findColumn("short", 10, 4, false)); // 5 (string length)
console.log(findColumn("short", 10, 4, true));  // -1 (string too short)

// Working with complex Unicode
const complexLine = "Café\t🎨\tnaïve";
console.log(countColumn(complexLine, 4)); // Properly handles accented chars and emoji

Character Categorization

System for categorizing characters into word characters, whitespace, and other characters.

/**
 * The categories produced by a character categorizer
 * (the return type of the function created by `makeCategorizer`)
 */
enum CharCategory {
  /** Word characters (in the examples on this page: letters, digits and "_") */
  Word,
  /** Whitespace (in the examples on this page: space, tab and newline) */
  Space,
  /** Anything else (for example "!" and "@") */
  Other
}

/**
 * Create a character categorizer function
 *
 * NOTE(review): verify that `makeCategorizer` is actually part of the public
 * exports of `@codemirror/state`; the reference manual documents
 * `EditorState.charCategorizer(at)` as the way to obtain a categorizer — confirm
 * before copying the import used in the examples.
 *
 * @param wordChars Additional characters to consider as word characters
 *   (pass "" to add none)
 * @returns Function that categorizes individual characters; it yields one
 *   `CharCategory` per call
 */
function makeCategorizer(wordChars: string): (char: string) => CharCategory;

Usage Examples:

import { CharCategory, makeCategorizer } from "@codemirror/state";

// Create a basic categorizer (no additional word characters)
const categorize = makeCategorizer("");

// Categorize different characters (one character per call)
console.log(categorize("a"));     // CharCategory.Word
console.log(categorize("A"));     // CharCategory.Word
console.log(categorize("5"));     // CharCategory.Word
console.log(categorize("_"));     // CharCategory.Word
console.log(categorize(" "));     // CharCategory.Space
console.log(categorize("\t"));    // CharCategory.Space
console.log(categorize("\n"));    // CharCategory.Space
console.log(categorize("!"));     // CharCategory.Other
console.log(categorize("@"));     // CharCategory.Other

// Create categorizer with custom word characters
const customCategorize = makeCategorizer("-.$");

console.log(customCategorize("-")); // CharCategory.Word (custom)
console.log(customCategorize("$")); // CharCategory.Word (custom)
console.log(customCategorize(".")); // CharCategory.Word (custom)
console.log(customCategorize("!")); // CharCategory.Other

// Use with Unicode characters. A categorizer returns ONE category per call, so a
// longer string has to be split and categorized character by character.
console.log(Array.from("café", (ch) => categorize(ch))); // [Word, Word, Word, Word]
console.log(Array.from("你好", (ch) => categorize(ch)));  // [Word, Word] (Unicode letters)
console.log(Array.from("123", (ch) => categorize(ch)));  // [Word, Word, Word] (digits)

/**
 * Practical usage: find word boundaries.
 *
 * Splits `text` into runs of characters that share the same category and returns
 * the UTF-16 offsets at which those runs start, followed by `text.length`.
 *
 * @param text The string to scan
 * @param categorizer Function that classifies a single character
 * @returns Ascending offsets: 0, every offset where the category changes, and
 *   `text.length`. An empty string yields `[0]`.
 */
function findWordBoundaries(text: string, categorizer: (char: string) => CharCategory): number[] {
  // An empty string has exactly one boundary; do not report offset 0 twice.
  if (text.length === 0) return [0];

  const boundaries = [0];
  let lastCategory: CharCategory | undefined;
  let offset = 0;

  // for..of walks the string by code point, so a surrogate pair reaches the
  // categorizer as one character instead of two lone halves (which would be
  // misclassified and create a boundary in front of the pair).
  for (const char of text) {
    const category = categorizer(char);
    if (lastCategory !== undefined && category !== lastCategory) {
      boundaries.push(offset);
    }
    lastCategory = category;
    // Advance by code units so the reported offsets stay valid for text.slice().
    offset += char.length;
  }

  boundaries.push(text.length);
  return boundaries;
}

// Runs: "hello" | " " | "world" | "!" | " " | "123" — one boundary per category change
const text = "hello world! 123";
const boundaries = findWordBoundaries(text, categorize);
console.log(boundaries); // [0, 5, 6, 11, 12, 13, 16] - word/space/other boundaries

Advanced Character Processing

Examples of combining character utilities for complex text processing tasks.

/**
 * Function to safely slice text at grapheme boundaries.
 *
 * `start` is moved back to the closest cluster break at or before it and `end`
 * is moved forward to the closest cluster break at or after it, so the result
 * never contains half a grapheme cluster. Offsets that already sit on a cluster
 * break are used unchanged.
 *
 * @param text The string to slice
 * @param start Start offset; values below 0 are treated as 0
 * @param end End offset (default: end of the string); values past the end are
 *   treated as `text.length`
 * @returns The slice, widened to whole grapheme clusters where necessary
 */
function safeSlice(text: string, start: number, end?: number): string {
  const length = text.length;

  // findClusterBreak never returns the position it was given while another break
  // exists in that direction, so calling it directly on an offset that already is
  // a boundary would widen the slice by a whole extra cluster. Both helpers
  // therefore look up the cluster that contains the offset instead.

  // Largest cluster break <= offset.
  const snapBackward = (offset: number): number => {
    if (offset <= 0) return 0;
    if (offset >= length) return length;
    const clusterEnd = findClusterBreak(text, offset, true); // first break > offset
    return findClusterBreak(text, clusterEnd, false); // last break before it, i.e. <= offset
  };

  // Smallest cluster break >= offset.
  const snapForward = (offset: number): number => {
    if (offset <= 0) return 0;
    if (offset >= length) return length;
    const clusterStart = findClusterBreak(text, offset, false); // last break < offset
    return findClusterBreak(text, clusterStart, true); // first break after it, i.e. >= offset
  };

  const actualStart = snapBackward(start);
  const actualEnd = end !== undefined ? snapForward(end) : length;
  return text.slice(actualStart, actualEnd);
}

/**
 * Function to count visual characters (grapheme clusters).
 *
 * Hops from one cluster break to the next with `findClusterBreak` and counts the hops.
 *
 * @param text The string to measure
 * @returns The number of grapheme clusters in `text` (0 for an empty string)
 */
function visualLength(text: string): number {
  let clusters = 0;
  for (let offset = 0; offset < text.length; offset = findClusterBreak(text, offset, true)) {
    clusters += 1;
  }
  return clusters;
}

/**
 * Function to find word at position.
 *
 * Looks at the grapheme cluster that contains `pos`; if the categorizer calls it
 * a word character, the range is extended cluster by cluster in both directions
 * for as long as the neighbouring clusters are word characters too.
 *
 * @param text The string to search in
 * @param pos Offset of a character inside the word; it may point into the middle
 *   of a cluster (e.g. at the second half of a surrogate pair)
 * @param categorizer Function that classifies a character
 * @returns The word's range and text, or null when `pos` does not address a
 *   character of `text` or that character is not a word character
 */
function wordAt(text: string, pos: number, categorizer: (char: string) => CharCategory): {start: number, end: number, word: string} | null {
  // Reject offsets that do not address a character (negative, fractional, NaN, past the end).
  if (!Number.isInteger(pos) || pos < 0 || pos >= text.length) return null;

  // Resolve the whole cluster containing pos: the first break after pos ends it,
  // and the break just before that one starts it. Classifying text[pos] alone
  // would hand the categorizer half a surrogate pair or a bare combining mark.
  let end = findClusterBreak(text, pos, true);
  let start = findClusterBreak(text, end, false);
  if (categorizer(text.slice(start, end)) !== CharCategory.Word) return null;

  // Find start of word
  while (start > 0) {
    const prevPos = findClusterBreak(text, start, false);
    if (prevPos === start) break; // no further break available
    if (categorizer(text.slice(prevPos, start)) !== CharCategory.Word) break;
    start = prevPos;
  }

  // Find end of word
  while (end < text.length) {
    const nextPos = findClusterBreak(text, end, true);
    if (nextPos === end) break; // no further break available
    if (categorizer(text.slice(end, nextPos)) !== CharCategory.Word) break;
    end = nextPos;
  }

  return {
    start,
    end,
    word: text.slice(start, end)
  };
}

Usage Examples:

// Safe text slicing with emoji
// (the emoji sequence spans offsets 6-11: man, zero-width joiner, laptop)
const emojiText = "Hello 👨‍💻 world!";
console.log(safeSlice(emojiText, 6, 11)); // "👨‍💻" (complete emoji sequence)

// Count visual characters
console.log(visualLength("Hello 👋")); // 7 (not 8, emoji counts as 1)
console.log(visualLength("café"));     // 4 (proper character count)

// Find words ("-" is passed as an extra word character, so it does not split the word)
const sentence = "The quick-brown fox jumps!";
const categorizer = makeCategorizer("-");
const word = wordAt(sentence, 10, categorizer); // offset 10 is the "b" of "brown"
console.log(word); // {start: 4, end: 15, word: "quick-brown"}

/**
 * Column-aware text wrapping.
 *
 * Walks `text` one grapheme cluster at a time and starts a new line whenever the
 * next cluster would push the current line past `maxColumns`. A tab advances to
 * the next multiple of `tabSize`; every other cluster counts as one column.
 * Line-break characters in `text` get no special treatment.
 *
 * @param text The text to wrap
 * @param maxColumns Maximum visual width of a line
 * @param tabSize The size of tab characters
 * @returns The wrapped lines; a cluster wider than `maxColumns` still gets a
 *   line of its own
 */
function wrapText(text: string, maxColumns: number, tabSize: number): string[] {
  const lines: string[] = [];
  let currentLine = "";
  let currentColumn = 0;

  // Width of one cluster that starts at `column`.
  const widthAt = (char: string, column: number): number =>
    char === "\t" ? tabSize - (column % tabSize) : 1;

  for (let i = 0; i < text.length;) {
    const nextBreak = findClusterBreak(text, i, true);
    const char = text.slice(i, nextBreak);
    let charWidth = widthAt(char, currentColumn);

    if (currentColumn + charWidth > maxColumns && currentLine) {
      lines.push(currentLine);
      currentLine = "";
      currentColumn = 0;
      // A tab's width depends on the column it starts at, so it has to be
      // measured again now that the cluster opens a fresh line.
      charWidth = widthAt(char, currentColumn);
    }

    currentLine += char;
    currentColumn += charWidth;
    i = nextBreak;
  }

  if (currentLine) lines.push(currentLine);
  return lines;
}

Types

/**
 * Character category enumeration, shown with the numeric value of each member
 */
enum CharCategory {
  /** Word characters */
  Word = 0,
  /** Whitespace */
  Space = 1, 
  /** Anything else */
  Other = 2
}

/**
 * Character categorizer function type: takes a character and returns its
 * category. This is the shape of the function returned by `makeCategorizer`.
 */
type CharCategorizer = (char: string) => CharCategory;

/**
 * Options for character processing functions
 *
 * NOTE(review): none of the function signatures documented on this page accepts
 * this options object — they take `includeExtending`, `tabSize` and `wordChars`
 * as positional parameters. Confirm that this type is really exported by the
 * package before relying on it.
 */
interface CharacterProcessingOptions {
  /** Presumably mirrors the `includeExtending` parameter of `findClusterBreak` — TODO confirm */
  includeExtending?: boolean;
  /** Presumably mirrors the `tabSize` parameter of `countColumn` / `findColumn` — TODO confirm */
  tabSize?: number;
  /** Presumably mirrors the `wordChars` parameter of `makeCategorizer` — TODO confirm */
  wordChars?: string;
}