or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

async-programming.md, character-operations.md, collections.md, configuration.md, core-infrastructure.md, data-encoding.md, date-time.md, external-integration.md, index.md, numeric-types.md, reactive-programming.md, string-operations.md, type-system.md
tile.json

docs/character-operations.md

Character Operations

Unicode character classification and manipulation with comprehensive Unicode 9.0.0 support for accurate character processing across all language scripts and symbol sets.

Unicode Support Overview

The Fable library includes comprehensive Unicode 9.0.0 support through the Unicode.9.0.0.ts module, which provides character classification data and functions for accurate text processing across all Unicode categories.

Unicode Categories

The Unicode standard defines multiple categories for character classification, enabling proper text processing for international applications.

Major Category Groups

/**
 * Unicode General Categories (simplified examples).
 *
 * Every member carries an explicit value that follows the numbering of .NET's
 * System.Globalization.UnicodeCategory. The text-analysis example in this
 * document treats 25-28 as the symbol categories, which only holds with this
 * numbering; relying on implicit enum numbering would put MathSymbol at 15.
 */
enum UnicodeCategory {
    // Letters
    UppercaseLetter = 0,          // Lu - e.g., 'A', 'B', 'C'
    LowercaseLetter = 1,          // Ll - e.g., 'a', 'b', 'c'
    TitlecaseLetter = 2,          // Lt - e.g., 'ǅ', 'ǈ', 'ǋ'
    ModifierLetter = 3,           // Lm - e.g., 'ʰ', 'ʲ', 'ʷ'
    OtherLetter = 4,              // Lo - e.g., 'א', 'أ', '中'

    // Marks (combining characters attached to a base character)
    NonSpacingMark = 5,           // Mn - e.g., U+0301 combining acute accent
    SpacingCombiningMark = 6,     // Mc - e.g., U+093E Devanagari vowel sign AA
    EnclosingMark = 7,            // Me - e.g., U+20DD combining enclosing circle

    // Numbers
    DecimalDigitNumber = 8,       // Nd - e.g., '0', '1', '٠', '२'
    LetterNumber = 9,             // Nl - e.g., 'Ⅰ', 'Ⅱ', 'ⅰ', 'ⅱ'
    OtherNumber = 10,             // No - e.g., '½', '¾', '²', '³'

    // Punctuation
    ConnectorPunctuation = 18,    // Pc - e.g., '_', '‿'
    DashPunctuation = 19,         // Pd - e.g., '-', '–', '—'
    OpenPunctuation = 20,         // Ps - e.g., '(', '[', '{'
    ClosePunctuation = 21,        // Pe - e.g., ')', ']', '}'
    InitialQuotePunctuation = 22, // Pi - e.g., '“', '‘', '«'
    FinalQuotePunctuation = 23,   // Pf - e.g., '”', '’', '»'
    OtherPunctuation = 24,        // Po - e.g., '!', '@', '#'

    // Symbols
    MathSymbol = 25,              // Sm - e.g., '+', '=', '∀', '∃'
    CurrencySymbol = 26,          // Sc - e.g., '$', '€', '¥', '£'
    ModifierSymbol = 27,          // Sk - e.g., '^', '`', '¨'
    OtherSymbol = 28,             // So - e.g., '©', '®', '☺', '♠'

    // Separators
    SpaceSeparator = 11,          // Zs - e.g., U+0020 space, U+00A0 no-break space, U+2003 em space
    LineSeparator = 12,           // Zl - e.g., '\u2028'
    ParagraphSeparator = 13,      // Zp - e.g., '\u2029'

    // Control characters
    Control = 14,                 // Cc - e.g., '\0', '\t', '\n'
    Format = 15,                  // Cf - e.g., '\u200E', '\u200F'
    Surrogate = 16,               // Cs - UTF-16 surrogate code units
    PrivateUse = 17,              // Co - Private use characters
    Unassigned = 29               // Cn - Unassigned code points
}

Character Classification Functions

The Char module leverages Unicode data to provide accurate character classification.

Basic Character Type Detection

// From Char.ts - Unicode-aware character classification
import { 
    getUnicodeCategory, isLetter, isDigit, 
    isLetterOrDigit, isWhiteSpace, isPunctuation 
} from "fable-library/Char.js";

// Sample characters covering letters, digits, whitespace, punctuation and symbols
const testChars = [
    'A',        // Latin uppercase letter
    'α',        // Greek lowercase letter  
    'א',        // Hebrew letter
    '中',       // Chinese ideograph
    '0',        // ASCII digit
    '٠',        // Arabic-Indic digit
    '²',        // Superscript digit
    'Ⅴ',        // Roman numeral
    ' ',        // Space
    '\t',       // Tab
    '\u00A0',   // Non-breaking space
    '!',        // Exclamation mark
    '©',        // Copyright symbol
    '€',        // Euro symbol
    '♠',        // Spade symbol
    '😀'        // Emoji (surrogate pair)
];

// Print the category and every basic classification flag for each sample
for (const char of testChars) {
    const category = getUnicodeCategory(char);
    const hexCodePoint = char.codePointAt(0)?.toString(16).toUpperCase();

    console.log(`Character: '${char}' (U+${hexCodePoint})`);
    console.log(`  Unicode Category: ${category}`);
    console.log(`  Is Letter: ${isLetter(char)}`);
    console.log(`  Is Digit: ${isDigit(char)}`);
    console.log(`  Is Letter or Digit: ${isLetterOrDigit(char)}`);
    console.log(`  Is Whitespace: ${isWhiteSpace(char)}`);
    console.log(`  Is Punctuation: ${isPunctuation(char)}`);
    console.log();
}

Advanced Character Properties

import { 
    isUpper, isLower, isNumber, isSymbol, 
    isSeparator, isControl, toUpper, toLower 
} from "fable-library/Char.js";

// Upper/lower-case pairs used to exercise case detection and conversion.
// Every entry is a single character: the German pair uses capital sharp s
// 'ẞ' (U+1E9E) rather than the two-character string 'SS'.
const caseTestChars = [
    'A', 'a',           // Basic Latin
    'Α', 'α',           // Greek
    'А', 'а',           // Cyrillic  
    'İ', 'ı',           // Turkish (special case)
    'ẞ', 'ß',           // German sharp s (capital U+1E9E, small U+00DF)
    'Ω', 'ω'            // Greek omega
];

// Report case flags and both conversions for each sample character
caseTestChars.forEach(char => {
    console.log(`Character: '${char}'`);
    console.log(`  Is Upper: ${isUpper(char)}`);
    console.log(`  Is Lower: ${isLower(char)}`);
    console.log(`  To Upper: '${toUpper(char)}'`);
    console.log(`  To Lower: '${toLower(char)}'`);
    console.log();
});

// Samples for comparing isNumber/isDigit and for checking isSymbol
const numericChars = ['5', '٥', 'Ⅴ', 'ⅴ', '½', '²'];
const symbolChars = ['+', '€', '©', '♠', '∀', '∃'];

console.log("Numeric characters:");
for (const char of numericChars) {
    const asNumber = isNumber(char);
    const asDigit = isDigit(char);
    console.log(`'${char}': Number=${asNumber}, Digit=${asDigit}`);
}

console.log("\nSymbolic characters:");
for (const char of symbolChars) {
    console.log(`'${char}': Symbol=${isSymbol(char)}`);
}

Surrogate Pair Handling

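JavaScript strings are sequences of UTF-16 code units, so characters beyond the Basic Multilingual Plane (code points above U+FFFF) are stored as surrogate pairs, and each pair must be handled as a single unit.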

Surrogate Pair Detection and Processing

import { 
    isHighSurrogate, isLowSurrogate, isSurrogate,
    getUnicodeCategory, isLetter 
} from "fable-library/Char.js";

// Sample strings for the surrogate-pair analysis below. Each entry is a single
// code point above U+FFFF, so each occupies two UTF-16 code units
// (string length 2) in JavaScript.
const emojiAndHighChars = [
    '😀',      // Grinning face (U+1F600)
    '🌍',      // Earth globe Europe-Africa (U+1F30D)
    '💻',      // Personal computer (U+1F4BB)
    '🎉',      // Party popper (U+1F389)
    '𝐀',       // Mathematical bold capital A (U+1D400)
    '𝒜',       // Mathematical script capital A (U+1D49C)
    '𠀀'       // CJK ideograph (U+20000)
];

/**
 * Logs a breakdown of `str`, one entry per code point.
 *
 * A high surrogate immediately followed by a low surrogate is reported as one
 * surrogate pair (with both halves listed); every other UTF-16 code unit is
 * reported as a regular character.
 *
 * @param str - The string to analyze.
 */
function analyzeUnicodeString(str: string): void {
    console.log(`\nAnalyzing: "${str}"`);

    const hex = (value: number): string => value.toString(16).toUpperCase();

    for (let pos = 0; pos < str.length; ) {
        const unit = str[pos];
        const codePoint = str.codePointAt(pos) || 0;
        const next = str[pos + 1];
        const startsPair =
            isHighSurrogate(unit) && pos + 1 < str.length && isLowSurrogate(next);

        if (!startsPair) {
            // Ordinary BMP character (or an unpaired surrogate)
            console.log(`  Character: '${unit}' (U+${hex(codePoint)})`);
            console.log(`    Is Surrogate: ${isSurrogate(unit)}`);
            console.log(`    Category: ${getUnicodeCategory(unit)}`);
            pos += 1;
            continue;
        }

        // Two code units forming one supplementary-plane character
        const pair = str.substring(pos, pos + 2);
        console.log(`  Surrogate pair: '${pair}' (U+${hex(codePoint)})`);
        console.log(`    High: '${unit}' (${hex(unit.charCodeAt(0))})`);
        console.log(`    Low: '${next}' (${hex(next.charCodeAt(0))})`);
        console.log(`    Category: ${getUnicodeCategory(pair)}`);
        pos += 2; // consume both halves
    }
}

// Run the analysis once for every sample string
for (const sample of emojiAndHighChars) {
    analyzeUnicodeString(sample);
}

Safe Unicode Iteration

import { isHighSurrogate, isLowSurrogate } from "fable-library/Char.js";

/**
 * Lazily walks `text` one code point at a time.
 *
 * A high surrogate directly followed by a low surrogate is yielded together as
 * a two-unit string; any other code unit (including an unpaired surrogate) is
 * yielded on its own.
 *
 * @param text - The string to iterate.
 */
function* iterateUnicodeCharacters(text: string): Generator<string, void, unknown> {
    let pos = 0;
    while (pos < text.length) {
        const unit = text[pos];
        const startsPair =
            isHighSurrogate(unit) && pos + 1 < text.length && isLowSurrogate(text[pos + 1]);

        if (startsPair) {
            yield text.substring(pos, pos + 2);
            pos += 2; // step over the low surrogate as well
        } else {
            yield unit;
            pos += 1;
        }
    }
}

/**
 * Counts the items produced by iterateUnicodeCharacters for `text`, so a
 * surrogate pair counts once rather than twice as in `text.length`.
 *
 * @param text - The string to measure.
 * @returns The number of items the iterator yields.
 */
function getUnicodeLength(text: string): number {
    return Array.from(iterateUnicodeCharacters(text)).length;
}

// Sample strings mixing ASCII, accented Latin, CJK, emoji and math alphanumerics
const testStrings = [
    "Hello",                    // ASCII only
    "café",                     // Latin with accents
    "🌍 Hello 世界 🎉",          // Mixed with emoji and CJK
    "𝐀𝐁𝐂 𝒜ℬ𝒞 𝔄𝔅ℭ",            // Mathematical symbols
    "👨‍👩‍👧‍👦"                    // Family emoji (complex sequence)
];

// Compare UTF-16 length with code-point length, then list each code point
for (const str of testStrings) {
    console.log(`\nString: "${str}"`);
    console.log(`JavaScript length: ${str.length}`);
    console.log(`Unicode length: ${getUnicodeLength(str)}`);

    console.log("Characters:");
    Array.from(iterateUnicodeCharacters(str)).forEach((char, index) => {
        const hexCodePoint = (char.codePointAt(0) || 0).toString(16).toUpperCase();
        console.log(`  [${index}] '${char}' (U+${hexCodePoint})`);
    });
}

Language-Specific Processing

Script Detection and Processing

import { getUnicodeCategory, isLetter, isDigit } from "fable-library/Char.js";

// Writing systems that detectScript can report for a piece of text.
enum Script {
    Latin,
    Greek,
    Cyrillic,
    Arabic,
    Hebrew,
    CJK,          // Chinese, Japanese, Korean
    Devanagari,   // Hindi, Sanskrit
    Thai,
    Mixed,        // Letters from more than one script were found
    Unknown       // No letters, or a letter outside the recognized ranges
}

/** Maps the code point of a single letter to the script whose block range contains it. */
function scriptOfCodePoint(code: number): Script {
    const within = (low: number, high: number): boolean => code >= low && code <= high;

    if (within(0x0041, 0x007A) || within(0x00C0, 0x024F)) return Script.Latin;
    if (within(0x0370, 0x03FF)) return Script.Greek;
    if (within(0x0400, 0x04FF)) return Script.Cyrillic;
    if (within(0x0600, 0x06FF)) return Script.Arabic;
    if (within(0x0590, 0x05FF)) return Script.Hebrew;
    if (within(0x4E00, 0x9FFF) ||   // CJK Unified Ideographs
        within(0x3400, 0x4DBF) ||   // CJK Unified Ideographs Extension A
        within(0x3040, 0x309F) ||   // Hiragana
        within(0x30A0, 0x30FF) ||   // Katakana
        within(0xAC00, 0xD7AF) ||   // Hangul Syllables
        within(0x1100, 0x11FF) ||   // Hangul Jamo
        within(0x3130, 0x318F)) {   // Hangul Compatibility Jamo
        return Script.CJK;
    }
    if (within(0x0900, 0x097F)) return Script.Devanagari;
    if (within(0x0E00, 0x0E7F)) return Script.Thai;
    return Script.Unknown;
}

/**
 * Detects the writing system used by the letters of `text`.
 *
 * Non-letter characters (digits, punctuation, emoji, whitespace) are ignored.
 * Korean Hangul is classified as CJK, matching the enum's
 * "Chinese, Japanese, Korean" description.
 *
 * @param text - The text to inspect.
 * @returns The single script found, `Script.Mixed` when letters from more than
 *          one script are present, or `Script.Unknown` when there are no
 *          letters or only letters outside the recognized ranges.
 */
function detectScript(text: string): Script {
    const scripts = new Set<Script>();

    for (const char of text) {
        if (!isLetter(char)) continue;
        scripts.add(scriptOfCodePoint(char.codePointAt(0) ?? 0));
    }

    if (scripts.size > 1) return Script.Mixed;

    // Zero or one entry: `??` (not `||`) so Script.Latin (value 0) is preserved
    const [only] = scripts;
    return only ?? Script.Unknown;
}

// One sample phrase per writing system, plus mixed-script and accented input
const scriptTests = [
    "Hello World",                           // Latin
    "Γεια σας κόσμε",                       // Greek
    "Привет мир",                           // Cyrillic
    "مرحبا بالعالم",                         // Arabic
    "שלום עולם",                            // Hebrew
    "你好世界",                              // Chinese
    "こんにちは世界",                          // Japanese
    "नमस्ते दुनिया",                         // Devanagari (Hindi)
    "สวัสดีชาวโลก",                          // Thai
    "Hello 世界 🌍",                         // Mixed
    "Café naïve résumé"                     // Latin with accents
];

// Print each phrase next to the name of its detected script
for (const text of scriptTests) {
    const scriptName = Script[detectScript(text)];
    console.log(`"${text}" -> ${scriptName}`);
}

Right-to-Left (RTL) Text Processing

import { getUnicodeCategory } from "fable-library/Char.js";

/**
 * Reports whether the first code point of `char` lies in a Unicode block that
 * belongs to a right-to-left script.
 *
 * Covered blocks: U+0590-U+08FF (Hebrew, Arabic, Syriac, Arabic Supplement,
 * Thaana, NKo, Samaritan, Mandaic and the Arabic Extended blocks), the Hebrew
 * and Arabic presentation forms (U+FB1D-U+FDFF) and Arabic Presentation
 * Forms-B (U+FE70-U+FEFC). U+FEFF, the byte-order mark, is deliberately
 * excluded: it is a format character, not Arabic text.
 *
 * @param char - A string whose first code point is tested.
 * @returns `true` for a code point in one of the ranges above; `false`
 *          otherwise, including for the empty string.
 */
function isRTLCharacter(char: string): boolean {
    const code = char.codePointAt(0) ?? 0;

    return (code >= 0x0590 && code <= 0x08FF) ||  // Hebrew through Arabic Extended-A
           (code >= 0xFB1D && code <= 0xFDFF) ||  // Hebrew + Arabic Presentation Forms-A
           (code >= 0xFE70 && code <= 0xFEFC);    // Arabic Presentation Forms-B (without BOM)
}

/**
 * Classifies the overall direction of `text`.
 *
 * Characters accepted by isRTLCharacter count as right-to-left. Any other
 * letter (Latin, Greek, Cyrillic, CJK, ...) counts as left-to-right, so
 * non-Latin LTR scripts mixed with RTL text are reported as 'mixed' rather
 * than 'rtl'. Letters of RTL scripts that isRTLCharacter does not recognize
 * are left uncounted instead of being mistaken for LTR. Digits, punctuation,
 * whitespace and symbols are neutral.
 *
 * @param text - The text to inspect.
 * @returns 'mixed' when both directions occur, 'rtl' when only RTL characters
 *          occur, and 'ltr' otherwise (including for empty or neutral-only text).
 */
function detectTextDirection(text: string): 'ltr' | 'rtl' | 'mixed' {
    // A letter that does not belong to a right-to-left script
    const ltrLetter =
        /(?![\p{Script=Hebrew}\p{Script=Arabic}\p{Script=Syriac}\p{Script=Thaana}\p{Script=Nko}])\p{L}/u;

    let ltrCount = 0;
    let rtlCount = 0;

    for (const char of text) {
        if (isRTLCharacter(char)) {
            rtlCount++;
        } else if (ltrLetter.test(char)) {
            ltrCount++;
        }
    }

    if (rtlCount > 0 && ltrCount > 0) return 'mixed';
    return rtlCount > 0 ? 'rtl' : 'ltr';
}

// Sample phrases: pure LTR, pure RTL, and mixed-direction text
const directionTests = [
    "Hello World",                    // LTR
    "مرحبا بالعالم",                  // RTL (Arabic)
    "שלום עולם",                     // RTL (Hebrew)  
    "Hello مرحبا World",              // Mixed
    "English text",                   // LTR
    "العربية والإنجليزية English"      // Mixed Arabic-English
];

// Print each phrase next to its detected direction in upper case
for (const text of directionTests) {
    const label = detectTextDirection(text).toUpperCase();
    console.log(`"${text}" -> ${label}`);
}

Practical Character Processing Applications

Text Normalization and Cleaning

import { 
    isLetter, isDigit, isWhiteSpace, isPunctuation,
    toUpper, toLower, getUnicodeCategory 
} from "fable-library/Char.js";

/**
 * Static helpers for cleaning, normalizing and analyzing Unicode text.
 */
class TextProcessor {
    /**
     * Replaces every whitespace character with a regular space, collapses runs
     * of whitespace into one space, and trims both ends.
     *
     * @param text - The text to normalize.
     */
    static normalizeWhitespace(text: string): string {
        return Array.from(text)
            .map(char => isWhiteSpace(char) ? ' ' : char)
            .join('')
            .replace(/\s+/g, ' ')  // Collapse multiple spaces
            .trim();
    }

    /**
     * Keeps only the characters classified as letters or digits.
     *
     * @param text - The text to filter.
     */
    static extractAlphanumeric(text: string): string {
        return Array.from(text)
            .filter(char => isLetter(char) || isDigit(char))
            .join('');
    }

    /**
     * Removes accents and diacritics (basic normalization).
     *
     * The text is decomposed to NFD and the combining diacritical marks
     * U+0300-U+036F are dropped. The result stays in decomposed form.
     *
     * @param text - The text to strip.
     */
    static removeAccents(text: string): string {
        return text.normalize('NFD')
            .replace(/[\u0300-\u036f]/g, ''); // Remove combining diacritical marks
    }

    /**
     * Converts to title case: the first letter of each word is upper-cased and
     * the remaining letters are lower-cased. A new word starts at the beginning
     * of the text and after any whitespace or punctuation character.
     *
     * @param text - The text to convert.
     */
    static toTitleCase(text: string): string {
        let result = '';
        let capitalizeNext = true;

        for (const char of text) {
            if (isLetter(char)) {
                if (capitalizeNext) {
                    result += toUpper(char);
                    capitalizeNext = false;
                } else {
                    result += toLower(char);
                }
            } else {
                result += char;
                if (isWhiteSpace(char) || isPunctuation(char)) {
                    capitalizeNext = true;
                }
            }
        }

        return result;
    }

    /**
     * Produces a canonical form for searching: accents removed, lower-cased,
     * everything except letters, marks, numbers and whitespace replaced by a
     * space, whitespace collapsed, ends trimmed.
     *
     * Combining marks (\p{M}) are kept on purpose. In scripts such as
     * Devanagari and Thai, vowel signs are marks, and replacing them with
     * spaces would split words apart.
     *
     * @param text - The text to normalize.
     */
    static normalizeForSearch(text: string): string {
        return this.removeAccents(text)
            .toLowerCase()
            .replace(/[^\p{L}\p{M}\p{N}\s]/gu, ' ')  // Keep letters, marks, numbers, spaces
            .replace(/\s+/g, ' ')
            .trim();
    }

    /**
     * Counts the characters of `text` by class and records which scripts its
     * letters come from (simplified, by code-point range).
     *
     * Each character is counted in the first class that matches, in this
     * order: letter, digit, punctuation, whitespace, symbol, other.
     *
     * @param text - The text to analyze.
     * @returns Per-class counts plus the set of script labels seen.
     */
    static analyzeText(text: string): {
        letters: number;
        digits: number;
        punctuation: number;
        whitespace: number;
        symbols: number;
        other: number;
        scripts: Set<string>;
    } {
        // Numeric range this check uses for the four symbol categories
        // (MathSymbol .. OtherSymbol in .NET's UnicodeCategory numbering).
        // NOTE(review): assumes getUnicodeCategory returns those values - confirm.
        const firstSymbolCategory = 25;
        const lastSymbolCategory = 28;

        const analysis = {
            letters: 0,
            digits: 0,
            punctuation: 0,
            whitespace: 0,
            symbols: 0,
            other: 0,
            scripts: new Set<string>()
        };

        for (const char of text) {
            if (isLetter(char)) {
                analysis.letters++;
                // Detect script (simplified)
                const code = char.codePointAt(0) || 0;
                if (code <= 0x007F) analysis.scripts.add('ASCII');
                else if (code <= 0x024F) analysis.scripts.add('Latin Extended');
                else if (code >= 0x0370 && code <= 0x03FF) analysis.scripts.add('Greek');
                else if (code >= 0x0400 && code <= 0x04FF) analysis.scripts.add('Cyrillic');
                else if (code >= 0x4E00 && code <= 0x9FFF) analysis.scripts.add('CJK');
                else analysis.scripts.add('Other');
            } else if (isDigit(char)) {
                analysis.digits++;
            } else if (isPunctuation(char)) {
                analysis.punctuation++;
            } else if (isWhiteSpace(char)) {
                analysis.whitespace++;
            } else {
                const category = getUnicodeCategory(char);
                if (category >= firstSymbolCategory && category <= lastSymbolCategory) {
                    analysis.symbols++;
                } else {
                    analysis.other++;
                }
            }
        }

        return analysis;
    }
}

// Sample inputs covering punctuation, accents, several scripts, digits and stray whitespace
const testTexts = [
    "Hello, World! 🌍",
    "café naïve résumé",
    "Привет, мир!",
    "こんにちは世界!",
    "Mixed: Hello 世界 مرحبا 🎉",
    "Numbers: 123 ٧٨٩ ১২৩",
    "   Extra    whitespace   text   "
];

// Show the result of every TextProcessor helper for each sample
for (const text of testTexts) {
    const normalizedWs = TextProcessor.normalizeWhitespace(text);
    const alphanumeric = TextProcessor.extractAlphanumeric(text);
    const withoutAccents = TextProcessor.removeAccents(text);
    const titleCased = TextProcessor.toTitleCase(text);
    const searchForm = TextProcessor.normalizeForSearch(text);

    console.log(`\nOriginal: "${text}"`);
    console.log(`Normalized WS: "${normalizedWs}"`);
    console.log(`Alphanumeric: "${alphanumeric}"`);
    console.log(`No accents: "${withoutAccents}"`);
    console.log(`Title case: "${titleCased}"`);
    console.log(`Search form: "${searchForm}"`);

    console.log(`Analysis:`, TextProcessor.analyzeText(text));
}

Input Validation and Sanitization

import { isLetter, isDigit, isPunctuation, isWhiteSpace } from "fable-library/Char.js";

/**
 * Static helpers for validating and sanitizing user-supplied text.
 */
class InputValidator {
    /**
     * Validates a username: 3-20 UTF-16 code units, made only of letters,
     * digits, underscore and hyphen, and starting with a letter.
     *
     * @param username - The candidate username.
     */
    static isValidUsername(username: string): boolean {
        if (username.length < 3 || username.length > 20) return false;

        for (const char of username) {
            if (!isLetter(char) && !isDigit(char) && char !== '_' && char !== '-') {
                return false;
            }
        }

        // Must start with letter
        return isLetter(username[0]);
    }

    /**
     * Validates the local part of an email address (simplified): 1-64 UTF-16
     * code units, made only of letters, digits and the characters `_ - . +`.
     *
     * @param localPart - The text before the `@`.
     */
    static isValidEmailLocalPart(localPart: string): boolean {
        if (localPart.length === 0 || localPart.length > 64) return false;

        for (const char of localPart) {
            if (!isLetter(char) && !isDigit(char) && 
                !['_', '-', '.', '+'].includes(char)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Checks a password against the strength rules and scores it.
     *
     * Rules: at least 8 characters, and at least one uppercase letter, one
     * lowercase letter, one digit and one symbol (any non-whitespace character
     * that is neither a letter nor a digit). The score gets 1 point for a
     * length of 8-11 or 2 points for 12+, plus 1 point per satisfied character
     * rule; below 3 is 'weak', below 5 is 'medium', otherwise 'strong'.
     *
     * @param password - The candidate password.
     * @returns Whether all rules pass, the list of violated rules, and the strength.
     */
    static validatePassword(password: string): {
        isValid: boolean;
        errors: string[];
        strength: 'weak' | 'medium' | 'strong';
    } {
        const errors: string[] = [];
        let score = 0;

        if (password.length < 8) {
            errors.push('Password must be at least 8 characters long');
        } else if (password.length >= 12) {
            score += 2;
        } else {
            score += 1;
        }

        let hasUpper = false, hasLower = false, hasDigit = false, hasSymbol = false;

        for (const char of password) {
            if (isLetter(char)) {
                // Compare against the opposite case so that caseless letters
                // (e.g. CJK ideographs) count as neither upper nor lower.
                if (char !== char.toLowerCase()) hasUpper = true;
                else if (char !== char.toUpperCase()) hasLower = true;
            } else if (isDigit(char)) {
                hasDigit = true;
            } else if (isPunctuation(char) || !isWhiteSpace(char)) {
                hasSymbol = true;
            }
        }

        if (!hasUpper) errors.push('Password must contain uppercase letters');
        else score += 1;

        if (!hasLower) errors.push('Password must contain lowercase letters');
        else score += 1;

        if (!hasDigit) errors.push('Password must contain digits');
        else score += 1;

        if (!hasSymbol) errors.push('Password must contain symbols');
        else score += 1;

        let strength: 'weak' | 'medium' | 'strong';
        if (score < 3) strength = 'weak';
        else if (score < 5) strength = 'medium';
        else strength = 'strong';

        return {
            isValid: errors.length === 0,
            errors,
            strength
        };
    }

    /**
     * Sanitizes a display name: keeps letters, digits, whitespace and the
     * characters `' - .`, collapses whitespace runs into one space, trims, and
     * limits the result to 50 UTF-16 code units.
     *
     * The cut never splits a surrogate pair, and whitespace exposed at the end
     * by the cut is removed, so the result is always trimmed.
     *
     * @param name - The raw display name.
     */
    static sanitizeDisplayName(name: string): string {
        const maxLength = 50;

        const cleaned = Array.from(name)
            .filter(char => 
                isLetter(char) || 
                isDigit(char) || 
                isWhiteSpace(char) ||
                ['\'', '-', '.'].includes(char)
            )
            .join('')
            .replace(/\s+/g, ' ')
            .trim();

        if (cleaned.length <= maxLength) return cleaned;

        // If the last kept unit is a high surrogate, its low half would be cut
        // off; drop the whole pair instead of leaving half a character.
        const lastKeptUnit = cleaned.charCodeAt(maxLength - 1);
        const splitsPair = lastKeptUnit >= 0xD800 && lastKeptUnit <= 0xDBFF;
        const end = splitsPair ? maxLength - 1 : maxLength;

        return cleaned.substring(0, end).trimEnd();
    }
}

// Sample inputs for each validator
const usernameTests = ['john_doe', 'user123', 'a', '123user', 'valid-user', 'invalid@user'];
const passwordTests = ['weak', 'StrongPass123!', 'onlylowercase', 'ONLYUPPERCASE', 'NoSymbols123'];
const nameTests = ['John Doe', 'José María', 'O\'Connor', 'User@#$%Name', '   Multiple   Spaces   '];

console.log('Username validation:');
for (const username of usernameTests) {
    console.log(`"${username}": ${InputValidator.isValidUsername(username)}`);
}

console.log('\nPassword validation:');
for (const password of passwordTests) {
    const { strength, isValid, errors } = InputValidator.validatePassword(password);
    const verdict = isValid ? 'valid' : 'invalid';
    console.log(`"${password}": ${strength} (${verdict})`);
    // List every violated rule beneath the summary line
    for (const error of errors) {
        console.log(`  - ${error}`);
    }
}

console.log('\nName sanitization:');
for (const name of nameTests) {
    console.log(`"${name}" -> "${InputValidator.sanitizeDisplayName(name)}"`);
}