Unicode character classification and manipulation with comprehensive Unicode 9.0.0 support for accurate character processing across all language scripts and symbol sets.
The Fable library includes comprehensive Unicode 9.0.0 support through the Unicode.9.0.0.ts module, which provides character classification data and functions for accurate text processing across all Unicode categories.
The Unicode standard defines multiple categories for character classification, enabling proper text processing for international applications.
// Unicode General Categories (simplified examples)
/**
 * Simplified listing of Unicode general categories, grouped by major class.
 * The trailing comment on each member gives the two-letter category
 * abbreviation followed by a few sample characters.
 *
 * NOTE(review): no explicit values are assigned, so the members are numbered
 * 0..26 in declaration order, and the mark categories (Mn, Mc, Me) are not
 * listed. Elsewhere in this file, analyzeText treats getUnicodeCategory
 * results 25-28 as the symbol categories, which does not line up with this
 * ordering - confirm the real numbering before comparing these members
 * against getUnicodeCategory results.
 */
enum UnicodeCategory {
// Letters
UppercaseLetter, // Lu - e.g., 'A', 'B', 'C'
LowercaseLetter, // Ll - e.g., 'a', 'b', 'c'
TitlecaseLetter, // Lt - e.g., 'ǅ' (U+01C5), 'ǈ' (U+01C8), 'ǋ' (U+01CB) - single code points, not letter pairs
ModifierLetter, // Lm - e.g., 'ʰ', 'ʲ', 'ʷ'
OtherLetter, // Lo - e.g., 'א', 'أ', '中'
// Numbers
DecimalDigitNumber, // Nd - e.g., '0', '1', '٠', '२'
LetterNumber, // Nl - e.g., 'Ⅰ', 'Ⅱ', 'ⅰ', 'ⅱ'
OtherNumber, // No - e.g., '½', '¾', '²', '³'
// Punctuation
ConnectorPunctuation, // Pc - e.g., '_', '‿'
DashPunctuation, // Pd - e.g., '-', '–', '—'
OpenPunctuation, // Ps - e.g., '(', '[', '{'
ClosePunctuation, // Pe - e.g., ')', ']', '}'
InitialQuotePunctuation, // Pi - e.g., '“' (U+201C), '‘' (U+2018), '«' (U+00AB)
FinalQuotePunctuation, // Pf - e.g., '”' (U+201D), '’' (U+2019), '»' (U+00BB)
OtherPunctuation, // Po - e.g., '!', '@', '#' (the ASCII quotes '"' and '\'' also belong here)
// Symbols
MathSymbol, // Sm - e.g., '+', '=', '∀', '∃'
CurrencySymbol, // Sc - e.g., '$', '€', '¥', '£'
ModifierSymbol, // Sk - e.g., '^', '`', '¨'
OtherSymbol, // So - e.g., '©', '®', '☺', '♠'
// Separators
SpaceSeparator, // Zs - e.g., ' ' (U+0020), no-break space (U+00A0), em space (U+2003)
LineSeparator, // Zl - e.g., '\u2028'
ParagraphSeparator, // Zp - e.g., '\u2029'
// Control, format and other non-graphic code points
Control, // Cc - e.g., '\0', '\t', '\n'
Format, // Cf - e.g., '\u200E', '\u200F'
Surrogate, // Cs - UTF-16 surrogate code points (U+D800-U+DFFF)
PrivateUse, // Co - Private use characters
Unassigned // Cn - Unassigned code points
}
The Char module leverages Unicode data to provide accurate character classification.
// From Char.ts - Unicode-aware character classification
import {
getUnicodeCategory, isLetter, isDigit,
isLetterOrDigit, isWhiteSpace, isPunctuation
} from "fable-library/Char.js";
// Test various Unicode characters
// One representative per kind of character the classification helpers distinguish.
const testChars = [
  'A', // Latin uppercase letter
  'α', // Greek lowercase letter
  'א', // Hebrew letter
  '中', // Chinese ideograph
  '0', // ASCII digit
  '٠', // Arabic-Indic digit
  '²', // Superscript digit
  'Ⅴ', // Roman numeral
  ' ', // Space
  '\t', // Tab
  '\u00A0', // Non-breaking space
  '!', // Exclamation mark
  '©', // Copyright symbol
  '€', // Euro symbol
  '♠', // Spade symbol
  '😀' // Emoji (surrogate pair)
];

// Report the category and every classification predicate for each sample,
// followed by a blank separator line.
for (const char of testChars) {
  const category = getUnicodeCategory(char);
  console.log(`Character: '${char}' (U+${char.codePointAt(0)?.toString(16).toUpperCase()})`);
  console.log(` Unicode Category: ${category}`);
  console.log(` Is Letter: ${isLetter(char)}`);
  console.log(` Is Digit: ${isDigit(char)}`);
  console.log(` Is Letter or Digit: ${isLetterOrDigit(char)}`);
  console.log(` Is Whitespace: ${isWhiteSpace(char)}`);
  console.log(` Is Punctuation: ${isPunctuation(char)}`);
  console.log();
}
import {
isUpper, isLower, isNumber, isSymbol,
isSeparator, isControl, toUpper, toLower
} from "fable-library/Char.js";
// Test case conversion and properties
// Upper/lower pairs from several scripts.
// NOTE(review): 'SS' is a two-character string, unlike the other entries,
// which are single characters - confirm that this is intended.
const caseTestChars = [
  'A', 'a', // Basic Latin
  'Α', 'α', // Greek
  'А', 'а', // Cyrillic
  'İ', 'ı', // Turkish (special case)
  'SS', 'ß', // German sharp s
  'Ω', 'ω' // Greek omega
];

// Show the case predicates and both case conversions for every sample.
for (const char of caseTestChars) {
  console.log(`Character: '${char}'`);
  console.log(` Is Upper: ${isUpper(char)}`);
  console.log(` Is Lower: ${isLower(char)}`);
  console.log(` To Upper: '${toUpper(char)}'`);
  console.log(` To Lower: '${toLower(char)}'`);
  console.log();
}

// Samples for the numeric and symbol predicates.
const numericChars = ['5', '٥', 'Ⅴ', 'ⅴ', '½', '²'];
const symbolChars = ['+', '€', '©', '♠', '∀', '∃'];

console.log("Numeric characters:");
for (const char of numericChars) {
  console.log(`'${char}': Number=${isNumber(char)}, Digit=${isDigit(char)}`);
}

console.log("\nSymbolic characters:");
for (const char of symbolChars) {
  console.log(`'${char}': Symbol=${isSymbol(char)}`);
}
Modern Unicode requires proper handling of surrogate pairs for characters beyond the Basic Multilingual Plane.
import {
isHighSurrogate, isLowSurrogate, isSurrogate,
getUnicodeCategory, isLetter
} from "fable-library/Char.js";
// Test emoji and high Unicode characters
// Sample strings, each holding one code point above U+FFFF, so every entry
// occupies two UTF-16 code units (a surrogate pair) in a JavaScript string.
const emojiAndHighChars = [
'😀', // Grinning face (U+1F600)
'🌍', // Earth globe Europe-Africa (U+1F30D)
'💻', // Personal computer (U+1F4BB)
'🎉', // Party popper (U+1F389)
'𝐀', // Mathematical bold capital A (U+1D400)
'𝒜', // Mathematical script capital A (U+1D49C)
'𠀀' // CJK ideograph (U+20000)
];
/**
 * Logs a breakdown of `str` to the console, one entry per code point.
 *
 * A high surrogate immediately followed by a low surrogate is reported as a
 * single surrogate pair (with both halves listed); every other UTF-16 code
 * unit, including an unpaired surrogate, is reported on its own.
 *
 * @param str The string to analyze.
 */
function analyzeUnicodeString(str: string): void {
  console.log(`\nAnalyzing: "${str}"`);

  let pos = 0;
  while (pos < str.length) {
    const unit = str[pos];
    // codePointAt combines a valid pair starting at `pos` into one value.
    const codePoint = str.codePointAt(pos) || 0;
    const startsPair =
      isHighSurrogate(unit) && pos + 1 < str.length && isLowSurrogate(str[pos + 1]);

    if (!startsPair) {
      // Single code unit
      console.log(` Character: '${unit}' (U+${codePoint.toString(16).toUpperCase()})`);
      console.log(` Is Surrogate: ${isSurrogate(unit)}`);
      console.log(` Category: ${getUnicodeCategory(unit)}`);
      pos += 1;
      continue;
    }

    // Two code units forming one code point
    const pair = str.substring(pos, pos + 2);
    console.log(` Surrogate pair: '${pair}' (U+${codePoint.toString(16).toUpperCase()})`);
    console.log(` High: '${unit}' (${unit.charCodeAt(0).toString(16).toUpperCase()})`);
    console.log(` Low: '${str[pos + 1]}' (${str[pos + 1].charCodeAt(0).toString(16).toUpperCase()})`);
    console.log(` Category: ${getUnicodeCategory(pair)}`);
    pos += 2; // step over both halves
  }
}
// Run the analysis on every sample string
for (const sample of emojiAndHighChars) {
  analyzeUnicodeString(sample);
}
import { isHighSurrogate, isLowSurrogate } from "fable-library/Char.js";
// Safe Unicode iteration that handles surrogate pairs correctly
/**
 * Walks `text` and yields one string per character, keeping surrogate pairs
 * together.
 *
 * A high surrogate directly followed by a low surrogate is yielded as a
 * two-unit string; any other code unit (including an unpaired surrogate) is
 * yielded by itself.
 *
 * @param text The string to iterate.
 */
function* iterateUnicodeCharacters(text: string): Generator<string, void, unknown> {
  let pos = 0;
  while (pos < text.length) {
    const unit = text[pos];
    const startsPair =
      isHighSurrogate(unit) && pos + 1 < text.length && isLowSurrogate(text[pos + 1]);

    if (startsPair) {
      yield text.substring(pos, pos + 2);
      pos += 2; // consume both halves of the pair
    } else {
      yield unit;
      pos += 1;
    }
  }
}
// Count Unicode characters correctly
/**
 * Counts the items produced by iterateUnicodeCharacters for `text`, so a
 * surrogate pair contributes 1 rather than 2.
 *
 * @param text The string to measure.
 * @returns The number of yielded characters.
 */
function getUnicodeLength(text: string): number {
  const characters = iterateUnicodeCharacters(text);
  let total = 0;
  // Drain the generator; only the number of steps matters, not the values.
  while (!characters.next().done) {
    total += 1;
  }
  return total;
}
// Test with mixed content
// Strings of increasing complexity for comparing the two length measures.
const testStrings = [
  "Hello", // ASCII only
  "café", // Latin with accents
  "🌍 Hello 世界 🎉", // Mixed with emoji and CJK
  "𝐀𝐁𝐂 𝒜ℬ𝒞 𝔄𝔅ℭ", // Mathematical symbols
  "👨👩👧👦" // Family emoji (complex sequence)
];

// For each string, print both lengths and then list every yielded character.
for (const str of testStrings) {
  console.log(`\nString: "${str}"`);
  console.log(`JavaScript length: ${str.length}`);
  console.log(`Unicode length: ${getUnicodeLength(str)}`);
  console.log("Characters:");

  Array.from(iterateUnicodeCharacters(str)).forEach((char, index) => {
    const codePoint = char.codePointAt(0) || 0;
    console.log(` [${index}] '${char}' (U+${codePoint.toString(16).toUpperCase()})`);
  });
}
import { getUnicodeCategory, isLetter, isDigit } from "fable-library/Char.js";
// Detect script/writing system of text
/**
 * Writing systems reported by script detection. `Mixed` and `Unknown` are
 * result markers rather than scripts. Values are the declaration-order
 * defaults, written out explicitly.
 */
enum Script {
  Latin = 0,
  Greek = 1,
  Cyrillic = 2,
  Arabic = 3,
  Hebrew = 4,
  CJK = 5, // Chinese, Japanese, Korean
  Devanagari = 6, // Hindi, Sanskrit
  Thai = 7,
  Mixed = 8,
  Unknown = 9
}
/**
 * Inclusive code-point ranges and the script each one maps to.
 * The ranges do not overlap, so lookup order does not matter.
 */
const SCRIPT_RANGES: ReadonlyArray<readonly [number, number, Script]> = [
  [0x0041, 0x007a, Script.Latin], // Basic Latin letters (callers pre-filter with isLetter)
  [0x00c0, 0x024f, Script.Latin], // Latin-1 letters through Latin Extended-B
  [0x0370, 0x03ff, Script.Greek],
  [0x0400, 0x04ff, Script.Cyrillic],
  [0x0590, 0x05ff, Script.Hebrew],
  [0x0600, 0x06ff, Script.Arabic],
  [0x0900, 0x097f, Script.Devanagari],
  [0x0e00, 0x0e7f, Script.Thai],
  [0x1100, 0x11ff, Script.CJK], // Hangul Jamo
  [0x3040, 0x309f, Script.CJK], // Hiragana
  [0x30a0, 0x30ff, Script.CJK], // Katakana
  [0x3130, 0x318f, Script.CJK], // Hangul Compatibility Jamo
  [0x4e00, 0x9fff, Script.CJK], // CJK Unified Ideographs
  [0xac00, 0xd7af, Script.CJK] // Hangul Syllables
];

/** Maps a single code point to its script, or Script.Unknown if no range matches. */
function scriptOfCodePoint(code: number): Script {
  for (const [low, high, script] of SCRIPT_RANGES) {
    if (code >= low && code <= high) return script;
  }
  return Script.Unknown;
}

/**
 * Determines the writing system used by the letters of `text`.
 *
 * Only characters accepted by isLetter are considered. Letters outside every
 * known range count as Script.Unknown, which participates in the "mixed"
 * decision like any other script.
 *
 * Korean is now recognized: Script.CJK is documented as covering Korean, but
 * Hangul ranges were previously not matched, so Korean text came back as
 * Unknown.
 *
 * @param text The text to inspect.
 * @returns The single script found, Script.Mixed when letters from more than
 *   one script are present, or Script.Unknown when there are no letters.
 */
function detectScript(text: string): Script {
  let detected: Script | undefined;

  for (const char of text) {
    if (!isLetter(char)) continue;

    const script = scriptOfCodePoint(char.codePointAt(0) ?? 0);
    if (detected === undefined) {
      detected = script;
    } else if (detected !== script) {
      // A second, different script settles the answer; no need to scan further.
      return Script.Mixed;
    }
  }

  return detected ?? Script.Unknown;
}
// Test script detection
// One sample per supported script, plus mixed and accented-Latin cases.
const scriptTests = [
  "Hello World", // Latin
  "Γεια σας κόσμε", // Greek
  "Привет мир", // Cyrillic
  "مرحبا بالعالم", // Arabic
  "שלום עולם", // Hebrew
  "你好世界", // Chinese
  "こんにちは世界", // Japanese
  "नमस्ते दुनिया", // Devanagari (Hindi)
  "สวัสดีชาวโลก", // Thai
  "Hello 世界 🌍", // Mixed
  "Café naïve résumé" // Latin with accents
];

// Print each sample next to the name of its detected script.
for (const text of scriptTests) {
  const script = detectScript(text);
  console.log(`"${text}" -> ${Script[script]}`);
}
import { getUnicodeCategory } from "fable-library/Char.js";
// Detect RTL characters and text direction
/**
 * Reports whether the first code point of `char` lies in a range that Unicode
 * assigns to right-to-left scripts.
 *
 * Coverage is by block range rather than per-character bidi class:
 * - U+0590-U+08FF: Hebrew, Arabic, Syriac, Arabic Supplement, Thaana, NKo,
 *   Samaritan, Mandaic and the Arabic Extended blocks (Syriac, Thaana, NKo
 *   and the later blocks were previously missing)
 * - U+FB1D-U+FDFF: Hebrew presentation forms (previously missing) and
 *   Arabic Presentation Forms-A
 * - U+FE70-U+FEFE: Arabic Presentation Forms-B, excluding U+FEFF, which is
 *   the byte order mark and not a right-to-left character
 * - U+10800-U+10FFF and U+1E800-U+1EFFF: supplementary-plane ranges reserved
 *   for right-to-left scripts (e.g. Phoenician, Adlam)
 *
 * @param char A string whose first code point is tested.
 * @returns true for a code point in one of the ranges above; false otherwise,
 *   including for the empty string.
 */
function isRTLCharacter(char: string): boolean {
  const code = char.codePointAt(0) ?? 0;
  return (
    (code >= 0x0590 && code <= 0x08ff) ||
    (code >= 0xfb1d && code <= 0xfdff) ||
    (code >= 0xfe70 && code <= 0xfefe) ||
    (code >= 0x10800 && code <= 0x10fff) ||
    (code >= 0x1e800 && code <= 0x1efff)
  );
}
/**
 * Classifies the overall direction of `text`.
 *
 * Characters accepted by isRTLCharacter count as right-to-left; ASCII Latin
 * letters (A-Z, a-z) count as left-to-right; everything else is ignored.
 *
 * @param text The text to classify.
 * @returns 'mixed' when both kinds occur, 'rtl' when only right-to-left
 *   characters occur, and 'ltr' otherwise (including for text with neither).
 */
function detectTextDirection(text: string): 'ltr' | 'rtl' | 'mixed' {
  let rtlCount = 0;
  let ltrCount = 0;

  for (const char of text) {
    if (isRTLCharacter(char)) {
      rtlCount += 1;
      continue;
    }

    const code = char.codePointAt(0) || 0;
    const isAsciiUpper = code >= 0x0041 && code <= 0x005A;
    const isAsciiLower = code >= 0x0061 && code <= 0x007A;
    if (isAsciiUpper || isAsciiLower) {
      ltrCount += 1;
    }
  }

  if (rtlCount > 0 && ltrCount > 0) {
    return 'mixed';
  }
  return rtlCount > ltrCount ? 'rtl' : 'ltr';
}
// Test text direction detection
// Left-to-right, right-to-left and mixed-direction samples.
const directionTests = [
  "Hello World", // LTR
  "مرحبا بالعالم", // RTL (Arabic)
  "שלום עולם", // RTL (Hebrew)
  "Hello مرحبا World", // Mixed
  "English text", // LTR
  "العربية والإنجليزية English" // Mixed Arabic-English
];

// Print each sample next to its detected direction in upper case.
for (const text of directionTests) {
  const direction = detectTextDirection(text);
  console.log(`"${text}" -> ${direction.toUpperCase()}`);
}
import {
isLetter, isDigit, isWhiteSpace, isPunctuation,
toUpper, toLower, getUnicodeCategory
} from "fable-library/Char.js";
/**
 * Static text-cleanup helpers built on the Char classification functions and
 * the built-in string normalization API.
 */
class TextProcessor {
  /**
   * Replaces every whitespace character with a regular space, collapses runs
   * of spaces into one, and trims both ends.
   *
   * @param text The text to clean.
   * @returns The text with normalized whitespace.
   */
  static normalizeWhitespace(text: string): string {
    return Array.from(text)
      .map(char => (isWhiteSpace(char) ? ' ' : char))
      .join('')
      .replace(/\s+/g, ' ') // Collapse multiple spaces
      .trim();
  }

  /**
   * Keeps only the characters accepted by isLetter or isDigit.
   *
   * @param text The text to filter.
   * @returns The letters and digits of `text`, in their original order.
   */
  static extractAlphanumeric(text: string): string {
    return Array.from(text)
      .filter(char => isLetter(char) || isDigit(char))
      .join('');
  }

  /**
   * Removes accents and diacritics (basic normalization).
   *
   * The text is decomposed (NFD), combining marks in U+0300-U+036F are
   * dropped, and the result is recomposed (NFC). The final recomposition
   * matters for characters whose decomposition is not an accent - Hangul
   * syllables or kana with voicing marks, for example - which would otherwise
   * be returned split into several code points.
   *
   * @param text The text to strip.
   * @returns The text without combining diacritical marks, in NFC form.
   */
  static removeAccents(text: string): string {
    return text
      .normalize('NFD')
      .replace(/[\u0300-\u036f]/g, '') // Remove combining diacritical marks
      .normalize('NFC');
  }

  /**
   * Converts to title case: the first letter of each word is upper-cased and
   * the remaining letters are lower-cased. A new word starts after any
   * whitespace or punctuation character.
   *
   * @param text The text to convert.
   * @returns The title-cased text.
   */
  static toTitleCase(text: string): string {
    let result = '';
    let capitalizeNext = true;

    for (const char of text) {
      if (isLetter(char)) {
        result += capitalizeNext ? toUpper(char) : toLower(char);
        capitalizeNext = false;
        continue;
      }

      result += char;
      if (isWhiteSpace(char) || isPunctuation(char)) {
        capitalizeNext = true;
      }
    }

    return result;
  }

  /**
   * Produces a search key: accents removed, lower-cased, everything except
   * letters, combining marks, numbers and whitespace replaced by spaces, and
   * whitespace collapsed and trimmed.
   *
   * Combining marks (\p{M}) are kept because in scripts such as Devanagari
   * and Thai they are part of the word; replacing them with spaces would
   * split words apart.
   *
   * @param text The text to normalize.
   * @returns The normalized search key.
   */
  static normalizeForSearch(text: string): string {
    // Refer to the class by name so the method still works when detached
    // from the class (e.g. passed as a callback).
    return TextProcessor.removeAccents(text)
      .toLowerCase()
      .replace(/[^\p{L}\p{M}\p{N}\s]/gu, ' ')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Counts the characters of `text` by kind and records which scripts its
   * letters come from (simplified range check).
   *
   * Each character is counted once, using the first matching test in this
   * order: letter, digit, punctuation, whitespace, symbol, other.
   *
   * @param text The text to analyze.
   * @returns Per-kind counts plus the set of script labels seen.
   */
  static analyzeText(text: string): {
    letters: number;
    digits: number;
    punctuation: number;
    whitespace: number;
    symbols: number;
    other: number;
    scripts: Set<string>;
  } {
    // Category numbers treated as symbols. NOTE(review): presumably the
    // MathSymbol..OtherSymbol values in the numbering getUnicodeCategory
    // returns - confirm against the Char module.
    const firstSymbolCategory = 25;
    const lastSymbolCategory = 28;

    const analysis = {
      letters: 0,
      digits: 0,
      punctuation: 0,
      whitespace: 0,
      symbols: 0,
      other: 0,
      scripts: new Set<string>()
    };

    for (const char of text) {
      if (isLetter(char)) {
        analysis.letters++;
        // Detect script (simplified)
        const code = char.codePointAt(0) || 0;
        if (code <= 0x007F) analysis.scripts.add('ASCII');
        else if (code <= 0x024F) analysis.scripts.add('Latin Extended');
        else if (code >= 0x0370 && code <= 0x03FF) analysis.scripts.add('Greek');
        else if (code >= 0x0400 && code <= 0x04FF) analysis.scripts.add('Cyrillic');
        else if (code >= 0x4E00 && code <= 0x9FFF) analysis.scripts.add('CJK');
        else analysis.scripts.add('Other');
      } else if (isDigit(char)) {
        analysis.digits++;
      } else if (isPunctuation(char)) {
        analysis.punctuation++;
      } else if (isWhiteSpace(char)) {
        analysis.whitespace++;
      } else {
        const category = getUnicodeCategory(char);
        if (category >= firstSymbolCategory && category <= lastSymbolCategory) {
          analysis.symbols++;
        } else {
          analysis.other++;
        }
      }
    }

    return analysis;
  }
}
// Test text processing
// Inputs covering emoji, accents, several scripts, digits and stray whitespace.
const testTexts = [
  "Hello, World! 🌍",
  "café naïve résumé",
  "Привет, мир!",
  "こんにちは世界!",
  "Mixed: Hello 世界 مرحبا 🎉",
  "Numbers: 123 ٧٨٩ ১২৩",
  " Extra whitespace text "
];

// Run every TextProcessor helper on each input and print the results.
for (const text of testTexts) {
  console.log(`\nOriginal: "${text}"`);
  console.log(`Normalized WS: "${TextProcessor.normalizeWhitespace(text)}"`);
  console.log(`Alphanumeric: "${TextProcessor.extractAlphanumeric(text)}"`);
  console.log(`No accents: "${TextProcessor.removeAccents(text)}"`);
  console.log(`Title case: "${TextProcessor.toTitleCase(text)}"`);
  console.log(`Search form: "${TextProcessor.normalizeForSearch(text)}"`);

  const analysis = TextProcessor.analyzeText(text);
  console.log(`Analysis:`, analysis);
}
import { isLetter, isDigit, isPunctuation, isWhiteSpace } from "fable-library/Char.js";
/**
 * Static validation and sanitization helpers for user-supplied text, built on
 * the Char classification functions.
 */
class InputValidator {
  /**
   * Validates a username: 3-20 UTF-16 code units, made up only of letters,
   * digits, underscore and hyphen, and starting with a letter.
   *
   * @param username The candidate username.
   * @returns true when the username satisfies every rule.
   */
  static isValidUsername(username: string): boolean {
    if (username.length < 3 || username.length > 20) return false;

    for (const char of username) {
      if (!isLetter(char) && !isDigit(char) && char !== '_' && char !== '-') {
        return false;
      }
    }

    // Must start with letter
    return isLetter(username[0]);
  }

  /**
   * Validates the local part of an email address (simplified): 1-64 UTF-16
   * code units consisting of letters, digits, '_', '-', '.' and '+'.
   *
   * @param localPart The text before the '@'.
   * @returns true when the local part satisfies the rules.
   */
  static isValidEmailLocalPart(localPart: string): boolean {
    if (localPart.length === 0 || localPart.length > 64) return false;

    const allowedSymbols = ['_', '-', '.', '+'];
    for (const char of localPart) {
      if (!isLetter(char) && !isDigit(char) && !allowedSymbols.includes(char)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Checks a password against the composition rules and rates its strength.
   *
   * Rules: at least 8 characters, and at least one uppercase letter, one
   * lowercase letter, one digit and one symbol (any non-letter, non-digit,
   * non-whitespace character).
   *
   * Score: +1 for length 8-11 or +2 for length 12 and above, plus +1 for
   * each satisfied character class. Below 3 is 'weak', below 5 is 'medium',
   * otherwise 'strong'.
   *
   * Letters without case (e.g. CJK ideographs) count as neither uppercase nor
   * lowercase; they were previously counted as uppercase because such a
   * letter equals its own upper-cased form.
   *
   * @param password The candidate password.
   * @returns Validity, the list of violated rules, and the strength rating.
   */
  static validatePassword(password: string): {
    isValid: boolean;
    errors: string[];
    strength: 'weak' | 'medium' | 'strong';
  } {
    const errors: string[] = [];
    let score = 0;

    if (password.length < 8) {
      errors.push('Password must be at least 8 characters long');
    } else if (password.length >= 12) {
      score += 2;
    } else {
      score += 1;
    }

    let hasUpper = false, hasLower = false, hasDigit = false, hasSymbol = false;
    for (const char of password) {
      if (isLetter(char)) {
        // A cased letter changes under exactly one of the two conversions;
        // a caseless letter changes under neither and is skipped.
        if (char !== char.toLowerCase()) hasUpper = true;
        else if (char !== char.toUpperCase()) hasLower = true;
      } else if (isDigit(char)) {
        hasDigit = true;
      } else if (isPunctuation(char) || !isWhiteSpace(char)) {
        hasSymbol = true;
      }
    }

    if (!hasUpper) errors.push('Password must contain uppercase letters');
    else score += 1;
    if (!hasLower) errors.push('Password must contain lowercase letters');
    else score += 1;
    if (!hasDigit) errors.push('Password must contain digits');
    else score += 1;
    if (!hasSymbol) errors.push('Password must contain symbols');
    else score += 1;

    let strength: 'weak' | 'medium' | 'strong';
    if (score < 3) strength = 'weak';
    else if (score < 5) strength = 'medium';
    else strength = 'strong';

    return {
      isValid: errors.length === 0,
      errors,
      strength
    };
  }

  /**
   * Sanitizes a display name: keeps letters, digits, whitespace and the
   * characters ' - . only, collapses whitespace runs to a single space, trims
   * both ends and limits the result to 50 UTF-16 code units.
   *
   * Truncation never splits a surrogate pair and never leaves a trailing
   * space behind, so the result can be shorter than 50 code units.
   *
   * @param name The raw display name.
   * @returns The sanitized display name.
   */
  static sanitizeDisplayName(name: string): string {
    const maxLength = 50;

    const cleaned = Array.from(name)
      .filter(char =>
        isLetter(char) ||
        isDigit(char) ||
        isWhiteSpace(char) ||
        ['\'', '-', '.'].includes(char)
      )
      .join('')
      .replace(/\s+/g, ' ')
      .trim();

    if (cleaned.length <= maxLength) return cleaned;

    let truncated = cleaned.substring(0, maxLength);
    // If the cut fell between the halves of a surrogate pair, drop the
    // orphaned high surrogate.
    const lastUnit = truncated.charCodeAt(truncated.length - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
      truncated = truncated.slice(0, -1);
    }
    // The cut may land right after a word, exposing a trailing space.
    return truncated.trimEnd();
  }
}
// Test validation functions
// Sample inputs for each validator.
const usernameTests = ['john_doe', 'user123', 'a', '123user', 'valid-user', 'invalid@user'];
const passwordTests = ['weak', 'StrongPass123!', 'onlylowercase', 'ONLYUPPERCASE', 'NoSymbols123'];
const nameTests = ['John Doe', 'José María', 'O\'Connor', 'User@#$%Name', ' Multiple Spaces '];

console.log('Username validation:');
for (const username of usernameTests) {
  console.log(`"${username}": ${InputValidator.isValidUsername(username)}`);
}

console.log('\nPassword validation:');
for (const password of passwordTests) {
  const result = InputValidator.validatePassword(password);
  console.log(`"${password}": ${result.strength} (${result.isValid ? 'valid' : 'invalid'})`);
  // List each violated rule under the password it belongs to.
  for (const error of result.errors) {
    console.log(` - ${error}`);
  }
}

console.log('\nName sanitization:');
for (const name of nameTests) {
  console.log(`"${name}" -> "${InputValidator.sanitizeDisplayName(name)}"`);
}