Lexical analysis functionality that converts JavaScript source code into arrays of tokens. This is useful for syntax highlighting, code formatting, static analysis tools, and any application that needs to understand JavaScript at the lexical level without full parsing.
Converts JavaScript source code into an array of tokens representing all lexical elements.
/**
* Tokenize JavaScript source code into an array of tokens
* @param code - JavaScript source code to tokenize
* @param options - Optional tokenization configuration
* @param delegate - Optional function called for each token
* @returns Array of tokens representing the lexical structure
*/
function tokenize(code: string, options?: TokenizeOptions, delegate?: TokenVisitor): Token[];
interface TokenizeOptions {
/** Include character index ranges in tokens */
range?: boolean;
/** Include line and column location information in tokens */
loc?: boolean;
/** Include comment tokens in the result */
comment?: boolean;
/** Enable error tolerance mode */
tolerant?: boolean;
}
type TokenVisitor = (token: Token) => Token | void;
Usage Examples:
import { tokenize } from "esprima";
// Basic tokenization
const tokens = tokenize("const x = 42;");
console.log(tokens);
// [
// { type: 'Keyword', value: 'const' },
// { type: 'Identifier', value: 'x' },
// { type: 'Punctuator', value: '=' },
// { type: 'Numeric', value: '42' },
// { type: 'Punctuator', value: ';' }
// ]
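// Tokens map directly onto syntax highlighting (one of the use cases above):
// a minimal sketch that wraps each token value in a span. The tok-* class
// names are hypothetical, and joining with spaces ignores original whitespace.
const highlighted = tokenize("const x = 42;")
.map(t => `<span class="tok-${t.type.toLowerCase()}">${t.value}</span>`)
.join(" ");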
// Tokenization with location information
const tokensWithLoc = tokenize("function test() {}", {
range: true,
loc: true
});
// Including comments
const tokensWithComments = tokenize("/* comment */ var x = 1;", {
comment: true
});
// With tolerant mode for error recovery. Note the input must contain a
// *lexical* error (here, a string terminated by a newline); something like
// "var x = ;" is only a syntax error and tokenizes cleanly.
const tolerantTokens = tokenize("var x = 'oops\n var y = 2;", {
tolerant: true
});
Each token represents a lexical element with its type, value, and optional location information.
interface Token {
/** Type of the token (Keyword, Identifier, Punctuator, etc.) */
type: string;
/** String value of the token as it appears in source */
value: string;
/** Character index range in source code */
range?: [number, number];
/** Line and column location information */
loc?: SourceLocation;
/** For regular expression tokens, contains pattern and flags */
regex?: {
pattern: string;
flags: string;
};
}
interface SourceLocation {
start: Position;
end: Position;
}
interface Position {
line: number;
column: number;
}
Esprima recognizes the following token types:
// Primary token types
type TokenType =
| 'Boolean' // true, false
| 'EOF' // End of file
| 'Identifier' // Variable names, function names
| 'Keyword' // Reserved words (var, function, if, etc.)
| 'Null' // null literal
| 'Numeric' // Number literals (123, 3.14, 0x1F)
| 'Punctuator' // Operators and punctuation (+, -, {, }, etc.)
| 'String' // String literals ("hello", 'world')
| 'RegularExpression' // Regular expression literals (/pattern/flags)
| 'Template' // Template literal parts (`template ${expr}`)
| 'LineComment' // Single-line comments (// comment)
| 'BlockComment'; // Multi-line comments (/* comment */)
Token Type Examples:
import { tokenize } from "esprima";
// Different token types
const tokens = tokenize(`
var name = "John"; // Keyword, Identifier, Punctuator, String
const age = 25; // Keyword, Identifier, Punctuator, Numeric
let active = true; // Keyword, Identifier, Punctuator, Boolean
const pattern = /\\d+/g; // Keyword, Identifier, Punctuator, RegularExpression
/* Block comment */ // BlockComment
// Line comment // LineComment
`, { comment: true }); // comment: true is required for the comment tokens to appear
tokens.forEach(token => {
console.log(`${token.type}: "${token.value}"`);
});
Optional function called for each token during tokenization; whatever it returns is stored in the output array, allowing tokens to be transformed as they are produced.
/**
 * Visitor function called for each token during tokenization
 * @param token - Current token being processed
 * @returns Token stored in the output array in place of the original
 */
type TokenVisitor = (token: Token) => Token | void;
Usage Examples:
import { tokenize } from "esprima";
// Collect specific token types
const identifiers: string[] = [];
tokenize("function test(param) { return param; }", {}, (token) => {
if (token.type === 'Identifier') {
identifiers.push(token.value);
}
return token;
});
console.log(identifiers); // ['test', 'param', 'param']
// Transform tokens (e.g., for obfuscation)
const transformedTokens = tokenize("var secret = 123;", {}, (token) => {
if (token.type === 'Identifier' && token.value === 'secret') {
return { ...token, value: 'hidden' };
}
return token;
});
// Drop comments: the delegate's return value is stored as-is, so returning
// undefined leaves undefined entries in the array; filter them out afterwards
const noCommentTokens = tokenize("/* TODO */ var x = 1;", { comment: true }, (token) => {
if (token.type === 'BlockComment' || token.type === 'LineComment') {
return undefined;
}
return token;
}).filter(Boolean);
Regular expression literals include both the pattern and flags.
import { tokenize } from "esprima";
const tokens = tokenize("const regex = /abc/gi;");
const regexToken = tokens.find(t => t.type === 'RegularExpression');
console.log(regexToken.regex);
// { pattern: 'abc', flags: 'gi' }
Template literals are tokenized into multiple parts.
import { tokenize } from "esprima";
const tokens = tokenize("`Hello ${name}!`");
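// A sketch of the expected output (assuming esprima keeps the backtick,
// ${ and } delimiters inside the Template token values):
// { type: 'Template', value: '`Hello ${' }
// { type: 'Identifier', value: 'name' }
// { type: 'Template', value: '}!`' }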
// Template tokens cover the literal chunks; the embedded expression produces ordinary tokens
When enabled, each token includes precise source location information.
import { tokenize } from "esprima";
const tokens = tokenize("function\n test() {}", {
range: true,
loc: true
});
tokens.forEach(token => {
console.log(`${token.value} at line ${token.loc.start.line}, column ${token.loc.start.column}`);
console.log(` Range: [${token.range[0]}, ${token.range[1]}]`);
});
In standard mode, tokenization stops at the first lexical error.
import { tokenize } from "esprima";
try {
const tokens = tokenize("var x = 'unterminated string");
} catch (error) {
console.log(error.message); // e.g. "Line 1: Unexpected token ILLEGAL"
}
In tolerant mode, tokenization continues after errors and includes error information.
import { tokenize } from "esprima";
const tokens = tokenize("var x = 'bad string\n var y = 2;", {
tolerant: true
});
// Check for errors
if (tokens.errors && tokens.errors.length > 0) {
console.log("Tokenization errors:", tokens.errors);
}
// Still get valid tokens for the rest of the code
console.log("Valid tokens:", tokens.length);