Tokenizes a string that represents a regular expression.
—
Core tokenization functionality that parses regular expression strings into structured token representations. Handles all regex features including groups, character classes, quantifiers, lookarounds, and special characters.
Parses a regular expression string and returns a structured token tree representing the regex's components.
/**
* Tokenizes a regular expression string into structured tokens
* @param regexpStr - String representation of a regular expression (without delimiters)
* @returns Root token containing the parsed structure
* @throws SyntaxError for invalid regular expressions
*/
function tokenizer(regexpStr: string): Root;
Usage Examples:
import { tokenizer, types } from "ret";
// Simple character sequence
const simple = tokenizer("abc");
// Result: { type: types.ROOT, stack: [
// { type: types.CHAR, value: 97 }, // 'a'
// { type: types.CHAR, value: 98 }, // 'b'
// { type: types.CHAR, value: 99 } // 'c'
// ]}
// Alternation
const alternation = tokenizer("foo|bar");
// Result: { type: types.ROOT, options: [
// [{ type: types.CHAR, value: 102 }, ...], // 'foo'
// [{ type: types.CHAR, value: 98 }, ...] // 'bar'
// ]}
// Groups with quantifiers
const groups = tokenizer("(ab)+");
// Result: { type: types.ROOT, stack: [
// { type: types.REPETITION, min: 1, max: Infinity, value: {
// type: types.GROUP, remember: true, stack: [
// { type: types.CHAR, value: 97 }, // 'a'
// { type: types.CHAR, value: 98 } // 'b'
// ]
// }}
// ]}
The tokenizer throws a SyntaxError for invalid regular expressions. The possible errors are:
import { tokenizer } from "ret";
// Invalid group - '?' followed by invalid character
try {
tokenizer("(?_abc)");
} catch (error) {
// SyntaxError: Invalid regular expression: /(?_abc)/: Invalid group, character '_' after '?' at column X
}
// Nothing to repeat - repetition token used inappropriately
try {
tokenizer("foo|?bar");
} catch (error) {
// SyntaxError: Invalid regular expression: /foo|?bar/: Nothing to repeat at column X
}
try {
tokenizer("{1,3}foo");
} catch (error) {
// SyntaxError: Invalid regular expression: /{1,3}foo/: Nothing to repeat at column X
}
try {
tokenizer("foo(+bar)");
} catch (error) {
// SyntaxError: Invalid regular expression: /foo(+bar)/: Nothing to repeat at column X
}
// Unmatched closing parenthesis
try {
tokenizer("hello)world");
} catch (error) {
// SyntaxError: Invalid regular expression: /hello)world/: Unmatched ) at column X
}
// Unterminated group
try {
tokenizer("(1(23)4");
} catch (error) {
// SyntaxError: Invalid regular expression: /(1(23)4/: Unterminated group
}
// Unterminated character class
try {
tokenizer("[abc");
} catch (error) {
// SyntaxError: Invalid regular expression: /[abc/: Unterminated character class
}
// Backslash at end of pattern
try {
tokenizer("test\\");
} catch (error) {
// SyntaxError: Invalid regular expression: /test\/: \ at end of pattern
}
// Invalid capture group name
try {
tokenizer("(?<123>abc)");
} catch (error) {
// SyntaxError: Invalid regular expression: /(?<123>abc)/: Invalid capture group name, character '1' after '<' at column X
}
// Unclosed capture group name
try {
tokenizer("(?<name abc)");
} catch (error) {
// SyntaxError: Invalid regular expression: /(?<name abc)/: Unclosed capture group name, expected '>', found ' ' at column X
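}
Supported regex features:
- Escaped control characters: \n, \t, \r, \f, \v, \0
- Unicode and hexadecimal escapes: \uXXXX, \xXX
- Control characters: \cX
- Predefined character classes: \d, \D, \w, \W, \s, \S
- Character sets and ranges: [abc], [^abc], [a-z]
- Wildcard: . (any character except newline)
- Anchors: ^ (start), $ (end)
- Word boundaries: \b, \B
- Capturing groups: (pattern)
- Non-capturing groups: (?:pattern)
- Named capture groups: (?<name>pattern)
- Lookaheads: (?=pattern) (positive), (?!pattern) (negative)
- Quantifiers: * (0+), + (1+), ? (0-1)
- Counted repetition: {n}, {n,}, {n,m}
- Alternation: | for alternative patterns
- Backreferences: \1, \2, etc.
The top-level container for the entire regex: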
interface Root {
type: types.ROOT;
stack?: Token[]; // Sequential tokens (no alternation)
options?: Token[][]; // Alternative branches (with alternation)
flags?: string[]; // Optional regex flags
}
Represents parenthesized groups with various modifiers:
interface Group {
type: types.GROUP;
stack?: Token[]; // Sequential tokens in group
options?: Token[][]; // Alternative branches in group
remember: boolean; // Whether group captures (true for capturing groups)
followedBy?: boolean; // Positive lookahead (?=)
notFollowedBy?: boolean; // Negative lookahead (?!)
lookBehind?: boolean; // Lookbehind assertions
name?: string; // Named capture group name
}
Represent individual characters, character sets, and character ranges:
interface Char {
type: types.CHAR;
value: number; // Character code
}
interface Set {
type: types.SET;
set: SetTokens; // Array of characters/ranges in the set
not: boolean; // Whether set is negated ([^...])
}
interface Range {
type: types.RANGE;
from: number; // Start character code
to: number; // End character code
}
Represents repetition patterns:
interface Repetition {
type: types.REPETITION;
min: number; // Minimum repetitions
max: number; // Maximum repetitions (Infinity for unbounded)
value: Token; // Token being repeated
}
Represent anchors and backreferences:
interface Position {
type: types.POSITION;
value: '$' | '^' | 'b' | 'B'; // Anchor/boundary type
}
interface Reference {
type: types.REFERENCE;
value: number; // Reference number
}
Install with the Tessl CLI:
npx tessl i tessl/npm-ret