Extended regular expressions with augmented syntax, named capture groups, Unicode support, and cross-browser compatibility
Comprehensive Unicode property, category, and script matching with astral plane support for international text processing.
XRegExp supports Unicode property matching via \\p{} and \\P{} tokens.
// Unicode token patterns:
// \\p{PropertyName} - Match Unicode property
// \\P{PropertyName} - Match NOT Unicode property (negated)
// \\p{^PropertyName} - Match NOT Unicode property (caret negation)
// \\pL - Single letter shorthand for \\p{Letter}
// \\p{Type=Value} - Match specific property type and value
Usage Examples:
// Basic Unicode property matching
const letters = XRegExp('\\\\p{Letter}+', 'A');
letters.test('Hello世界'); // true - matches Unicode letters
// Negated properties
const nonDigits = XRegExp('\\\\P{Number}+', 'A');
nonDigits.test('abc'); // true - matches non-numeric characters
// Single letter shortcuts
const identifiers = XRegExp('\\\\pL[\\\\pL\\\\pN]*', 'A');
identifiers.test('變數名123'); // true - letter followed by letters/numbers
// Category matching
const punctuation = XRegExp('\\\\p{Punctuation}', 'A');
punctuation.test('!'); // true
punctuation.test('。'); // true - Unicode punctuation
Add custom Unicode character data for specialized matching.
/**
 * Adds to the list of Unicode tokens that XRegExp regexes can match
 * through the \\p{} and \\P{} syntax.
 *
 * @param data - Array of objects, each describing one named character range
 *   (see `UnicodeCharacterRange`).
 * @param typePrefix - Optional type prefix for all provided Unicode tokens.
 *   For example, a token named `Emoji` added with the prefix `Custom` is
 *   referenced as \\p{Custom=Emoji}.
 */
function addUnicodeData(data: UnicodeCharacterRange[], typePrefix?: string): void;
/**
 * Describes one named Unicode token that can be registered with
 * `addUnicodeData`. Only `name` is required; every other field is optional.
 */
interface UnicodeCharacterRange {
  /** The name of the character range. */
  name: string;
  /** An alternate name for the character range. */
  alias?: string;
  /** Needed when the token matches orphan high surrogates and also uses surrogate pairs. */
  isBmpLast?: boolean;
  /** Names another token whose inverse this one is, to avoid duplicating character data. */
  inverseOf?: string;
  /** Character data for the Basic Multilingual Plane (U+0000-U+FFFF). */
  bmp?: string;
  /** Character data for astral code points (U+10000-U+10FFFF). */
  astral?: string;
}

// Usage Examples:
// Add custom Unicode token
XRegExp.addUnicodeData([{
name: 'XDigit',
alias: 'Hexadecimal',
bmp: '0-9A-Fa-f'
}]);
// Use the custom token
XRegExp('\\\\p{XDigit}:\\\\p{Hexadecimal}+').test('0:3D'); // true
// Add token with type prefix.
// Every code point in these ranges lies above U+FFFF, so the data belongs in
// `astral` rather than `bmp`. Astral data is written as surrogate pairs: a high
// surrogate followed by a character class of low surrogates, joined with `|`.
XRegExp.addUnicodeData([{
  name: 'Emoji',
  astral:
    // U+1F300-U+1F3FF
    '\\uD83C[\\uDF00-\\uDFFF]|' +
    // U+1F400-U+1F5FF, U+1F600-U+1F64F, U+1F680-U+1F6FF
    '\\uD83D[\\uDC00-\\uDDFF\\uDE00-\\uDE4F\\uDE80-\\uDEFF]'
}], 'Custom');
// Use with type prefix; a token that has only astral data requires flag A
XRegExp('\\\\p{Custom=Emoji}', 'A').test('😀'); // true
// XRegExp includes comprehensive Unicode general categories:
// All letters
XRegExp('\\\\p{Letter}', 'A').test('A'); // true
XRegExp('\\\\p{Letter}', 'A').test('文'); // true
XRegExp('\\\\p{L}', 'A').test('π'); // true (shorthand)
// Specific letter subcategories
XRegExp('\\\\p{Uppercase_Letter}', 'A').test('A'); // true
XRegExp('\\\\p{Lu}', 'A').test('A'); // true (shorthand)
XRegExp('\\\\p{Lowercase_Letter}', 'A').test('a'); // true
XRegExp('\\\\p{Ll}', 'A').test('a'); // true (shorthand)
XRegExp('\\\\p{Titlecase_Letter}', 'A').test('Dž'); // true
XRegExp('\\\\p{Lt}', 'A').test('Dž'); // true (shorthand)
// All numbers
XRegExp('\\\\p{Number}', 'A').test('5'); // true
XRegExp('\\\\p{Number}', 'A').test('Ⅴ'); // true (Roman numeral)
XRegExp('\\\\p{N}', 'A').test('½'); // true (shorthand)
// Specific number subcategories
XRegExp('\\\\p{Decimal_Number}', 'A').test('9'); // true
XRegExp('\\\\p{Nd}', 'A').test('9'); // true (shorthand)
XRegExp('\\\\p{Letter_Number}', 'A').test('Ⅴ'); // true
XRegExp('\\\\p{Nl}', 'A').test('Ⅴ'); // true (shorthand)
XRegExp('\\\\p{Other_Number}', 'A').test('½'); // true
XRegExp('\\\\p{No}', 'A').test('½'); // true (shorthand)
// All marks (combining characters)
XRegExp('\\\\p{Mark}', 'A').test('́'); // true (combining acute)
XRegExp('\\\\p{M}', 'A').test('̃'); // true (shorthand)
// Specific mark subcategories
XRegExp('\\\\p{Nonspacing_Mark}', 'A').test('́'); // true
XRegExp('\\\\p{Mn}', 'A').test('́'); // true (shorthand)
// All punctuation
XRegExp('\\\\p{Punctuation}', 'A').test('!'); // true
XRegExp('\\\\p{Punctuation}', 'A').test('。'); // true (CJK period)
XRegExp('\\\\p{P}', 'A').test('?'); // true (shorthand)
// Specific punctuation subcategories
XRegExp('\\\\p{Open_Punctuation}', 'A').test('('); // true
XRegExp('\\\\p{Ps}', 'A').test('['); // true (shorthand)
XRegExp('\\\\p{Close_Punctuation}', 'A').test(')'); // true
XRegExp('\\\\p{Pe}', 'A').test(']'); // true (shorthand)
XRegExp supports Unicode script matching:
// Latin script
XRegExp('\\\\p{Latin}', 'A').test('Hello'); // true
XRegExp('\\\\p{Script=Latin}', 'A').test('A'); // true (explicit syntax)
// Chinese/Japanese/Korean scripts
XRegExp('\\\\p{Han}', 'A').test('漢字'); // true (Chinese characters)
XRegExp('\\\\p{Hiragana}', 'A').test('ひらがな'); // true
XRegExp('\\\\p{Katakana}', 'A').test('カタカナ'); // true
XRegExp('\\\\p{Hangul}', 'A').test('한글'); // true (Korean)
// Arabic and Hebrew
XRegExp('\\\\p{Arabic}', 'A').test('العربية'); // true
XRegExp('\\\\p{Hebrew}', 'A').test('עברית'); // true
// Cyrillic
XRegExp('\\\\p{Cyrillic}', 'A').test('Кирилица'); // true
// Greek
XRegExp('\\\\p{Greek}', 'A').test('Ελληνικά'); // true
XRegExp includes Unicode properties for specialized matching:
// Alphabetic property (broader than Letter category)
XRegExp('\\\\p{Alphabetic}', 'A').test('A'); // true
XRegExp('\\\\p{Alpha}', 'A').test('文'); // true
// Whitespace property
XRegExp('\\\\p{White_Space}', 'A').test(' '); // true
XRegExp('\\\\p{Space}', 'A').test('\\t'); // true
// Uppercase and Lowercase properties
XRegExp('\\\\p{Uppercase}', 'A').test('A'); // true
XRegExp('\\\\p{Lowercase}', 'A').test('a'); // true
// Math property
XRegExp('\\\\p{Math}', 'A').test('+'); // true
XRegExp('\\\\p{Math}', 'A').test('∑'); // true (summation)
// Currency symbol property
XRegExp('\\\\p{Currency_Symbol}', 'A').test('$'); // true
XRegExp('\\\\p{Sc}', 'A').test('€'); // true (shorthand)
Flag A enables 21-bit Unicode support for characters beyond the Basic Multilingual Plane:
// Flag A enables astral mode for Unicode tokens
// Required for code points above U+FFFF (outside BMP)
// Automatically added when XRegExp.install('astral') is called
Usage Examples:
// Without flag A - only BMP characters (U+0000-U+FFFF)
const bmpOnly = XRegExp('\\\\p{Letter}');
bmpOnly.test('A'); // true
bmpOnly.test('文'); // true
bmpOnly.test('𝒜'); // false (mathematical script capital A, U+1D49C)
// With flag A - full Unicode range (U+0000-U+10FFFF)
const fullUnicode = XRegExp('\\\\p{Letter}', 'A');
fullUnicode.test('A'); // true
fullUnicode.test('文'); // true
fullUnicode.test('𝒜'); // true (now works with astral support)
// Astral emoji support
const emoji = XRegExp('\\\\p{Emoji}', 'A');
emoji.test('😀'); // true (U+1F600)
emoji.test('🚀'); // true (U+1F680)
Enable astral mode for all new regexes:
// Enable astral mode globally
XRegExp.install('astral');
// Now flag A is automatically added to all XRegExp regexes
const auto = XRegExp('\\\\p{Letter}'); // Automatically gets flag A
auto.test('𝒜'); // true
// Disable astral mode
XRegExp.uninstall('astral');
XRegExp supports extended Unicode escape syntax:
// \\u{N...} - Unicode code point escape with curly braces
// N... is a hexadecimal number of one or more digits, from 0 to 10FFFF
// Can include leading zeros
// Requires flag u for code points > U+FFFF
Usage Examples:
// Basic Multilingual Plane characters
XRegExp('\\\\u{41}').test('A'); // true (U+0041)
XRegExp('\\\\u{3042}').test('あ'); // true (U+3042, Hiragana A)
// Astral characters (requires flag u)
XRegExp('\\\\u{1F600}', 'u').test('😀'); // true (U+1F600, grinning face)
XRegExp('\\\\u{1D49C}', 'u').test('𝒜'); // true (U+1D49C, math script A)
// Leading zeros allowed
XRegExp('\\\\u{0041}').test('A'); // true (same as \\u{41})
XRegExp('\\\\u{00003042}').test('あ'); // true (same as \\u{3042})
// Match programming identifiers with Unicode support
const identifier = XRegExp('^\\\\p{ID_Start}\\\\p{ID_Continue}*$', 'A');
identifier.test('변수명'); // true (Korean)
identifier.test('переменная'); // true (Russian)
identifier.test('変数名'); // true (Japanese)
identifier.test('متغير'); // true (Arabic)
// Extract words from mixed-script text
const words = XRegExp('\\\\p{Letter}+', 'gA');
const text = 'Hello 世界 مرحبا мир';
const matches = XRegExp.match(text, words, 'all');
// Result: ['Hello', '世界', 'مرحبا', 'мир']
// Match all Unicode whitespace characters
const whitespace = XRegExp('\\\\p{White_Space}+', 'gA');
const text = 'word1\\u2003word2\\u2009word3'; // em space and thin space
XRegExp.split(text, whitespace);
// Result: ['word1', 'word2', 'word3']
// Match base letters with any combining marks
const withDiacritics = XRegExp('\\\\p{Letter}\\\\p{Mark}*', 'gA');
const text = 'café naïve résumé';
XRegExp.match(text, withDiacritics, 'all');
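// Result: one entry per base letter, with any combining marks kept attached to it,
// e.g. ['c', 'a', 'f', 'é', 'n', 'a', 'ï', 'v', 'e', ...]. The pattern has no outer
// quantifier, so it does not return whole words; wrap it as
// '(?:\\\\p{Letter}\\\\p{Mark}*)+' to get ['café', 'naïve', 'résumé'].
Install with Tessl CLI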
npx tessl i tessl/npm-xregexp