A wrapper around the stdlib `tokenize` which roundtrips.

```bash
npx @tessl/cli install tessl/pypi-tokenize-rt@6.2.0
```

The tokenize-rt package provides perfect roundtrip tokenization by introducing two additional token types (`ESCAPED_NL` and `UNIMPORTANT_WS`) that preserve exact source formatting, enabling refactoring tools to modify Python source code while keeping whitespace, comments, and formatting intact.
Install from PyPI:

```bash
pip install tokenize-rt
```

Import the package:

```python
import tokenize_rt
```

Common imports for working with tokens:

```python
from tokenize_rt import src_to_tokens, tokens_to_src, Token
```

For additional utilities:

```python
from tokenize_rt import (
    ESCAPED_NL, UNIMPORTANT_WS, NON_CODING_TOKENS, NAMED_UNICODE_RE,
    Offset, reversed_enumerate, parse_string_literal,
    rfind_string_parts, curly_escape, _re_partition,
)
```
Basic usage:

```python
from tokenize_rt import src_to_tokens, tokens_to_src, Token

# Convert source code to tokens
source = '''
def hello():
    print("Hello, world!")
'''

# Tokenize with perfect roundtrip capability
tokens = src_to_tokens(source)

# Each token has name, src, line, and utf8_byte_offset
for token in tokens:
    if token.name not in {'UNIMPORTANT_WS', 'ESCAPED_NL'}:
        print(f'{token.name}: {token.src!r}')

# Convert back to source (perfect roundtrip)
reconstructed = tokens_to_src(tokens)
assert source == reconstructed

# Working with specific tokens
name_tokens = [t for t in tokens if t.name == 'NAME']
print(f"Found {len(name_tokens)} NAME tokens")

# Using token matching
for token in tokens:
    if token.matches(name='NAME', src='hello'):
        print(f"Found 'hello' at line {token.line}, offset {token.utf8_byte_offset}")
```
Convert between Python source code and token representations with perfect roundtrip capability, preserving all formatting including whitespace and escaped newlines.

```python
def src_to_tokens(src: str) -> list[Token]:
    """
    Convert a Python source code string to a list of tokens.

    Args:
        src (str): Python source code to tokenize

    Returns:
        list[Token]: List of Token objects representing the source
    """

def tokens_to_src(tokens: Iterable[Token]) -> str:
    """
    Convert an iterable of tokens back to a source code string.

    Args:
        tokens (Iterable[Token]): Tokens to convert back to source

    Returns:
        str: Reconstructed source code
    """
```
Data structures for representing tokens and their positions within source code.

```python
class Offset(NamedTuple):
    """
    Represents a token offset with line and byte position information.
    """
    line: int | None = None
    utf8_byte_offset: int | None = None


class Token(NamedTuple):
    """
    Represents a tokenized element with position information.
    """
    name: str                            # Token type name (from token.tok_name or custom types)
    src: str                             # Source text of the token
    line: int | None = None              # Line number where the token appears
    utf8_byte_offset: int | None = None  # UTF-8 byte offset within the line

    @property
    def offset(self) -> Offset:
        """Return an Offset object for this token."""

    def matches(self, *, name: str, src: str) -> bool:
        """
        Check whether the token matches the given name and source.

        Args:
            name (str): Token name to match
            src (str): Token source to match

        Returns:
            bool: True if both name and src match
        """
```
Helper functions for working with token sequences, particularly useful for code refactoring and analysis tools.

```python
def reversed_enumerate(tokens: Sequence[Token]) -> Generator[tuple[int, Token]]:
    """
    Yield (index, token) pairs in reverse order.

    Args:
        tokens (Sequence[Token]): Token sequence to enumerate in reverse

    Yields:
        tuple[int, Token]: (index, token) pairs in reverse order
    """

def rfind_string_parts(tokens: Sequence[Token], i: int) -> tuple[int, ...]:
    """
    Find the indices of the string parts of a (joined) string literal.

    Args:
        tokens (Sequence[Token]): Token sequence to search
        i (int): Starting index (should be at the end of the string literal)

    Returns:
        tuple[int, ...]: Indices of the string parts, or an empty tuple if not a string literal
    """
```
Functions for parsing and processing Python string literals, including prefix extraction and escaping utilities.

```python
def parse_string_literal(src: str) -> tuple[str, str]:
    """
    Parse a string literal's source into (prefix, string) components.

    Args:
        src (str): String literal source code

    Returns:
        tuple[str, str]: (prefix, string) pair

    Example:
        >>> parse_string_literal('f"foo"')
        ('f', '"foo"')
    """

def curly_escape(s: str) -> str:
    """
    Escape curly braces in strings while preserving named unicode escapes.

    Args:
        s (str): String to escape

    Returns:
        str: String with curly braces escaped, except in named unicode escapes
    """
```
Pre-defined constants for token classification and filtering.

```python
# Type imports (for reference in signatures)
from re import Pattern

ESCAPED_NL: str
"""Constant for the escaped-newline token type."""

UNIMPORTANT_WS: str
"""Constant for the unimportant-whitespace token type."""

NON_CODING_TOKENS: frozenset[str]
"""
Set of token names that don't affect control flow or code:
{'COMMENT', ESCAPED_NL, 'NL', UNIMPORTANT_WS}
"""

NAMED_UNICODE_RE: Pattern[str]
"""Regular expression pattern for matching named unicode escapes."""
```
Internal helper functions that are exposed and may be useful for advanced use cases.

```python
def _re_partition(regex: Pattern[str], s: str) -> tuple[str, str, str]:
    """
    Partition a string based on a regex match (internal helper function).

    Args:
        regex (Pattern[str]): Compiled regular expression pattern
        s (str): String to partition

    Returns:
        tuple[str, str, str]: (before_match, match, after_match), or (s, '', '') if no match
    """
```
Command-line tool for tokenizing Python files and inspecting token sequences.

```python
def main(argv: Sequence[str] | None = None) -> int:
    """
    Command-line interface that tokenizes a file and prints tokens with positions.

    Args:
        argv (Sequence[str] | None): Command-line arguments, or None for sys.argv

    Returns:
        int: Exit code (0 for success)
    """
```
Working with comments and non-coding tokens:

```python
from tokenize_rt import src_to_tokens, NON_CODING_TOKENS

source = '''
# This is a comment
def func():  # Another comment
    pass
'''
tokens = src_to_tokens(source)

# Filter out non-coding tokens
code_tokens = [t for t in tokens if t.name not in NON_CODING_TOKENS]
print("Code-only tokens:", [t.src for t in code_tokens])

# Find all comments
comments = [t for t in tokens if t.name == 'COMMENT']
print("Comments found:", [t.src for t in comments])
```
Parsing string literals:

```python
from tokenize_rt import src_to_tokens, parse_string_literal, rfind_string_parts

# Parse string prefixes
prefix, string_part = parse_string_literal('f"Hello {name}!"')
print(f"Prefix: {prefix!r}, String: {string_part!r}")  # Prefix: 'f', String: '"Hello {name}!"'

# Find string parts in implicitly concatenated strings
source = '"first" "second" "third"\n'
tokens = src_to_tokens(source)

# rfind_string_parts expects the index of the end of the string literal,
# so point it at the last STRING token rather than the end of the token list
last_string = max(i for i, t in enumerate(tokens) if t.name == 'STRING')
string_indices = rfind_string_parts(tokens, last_string)
print("String part indices:", string_indices)  # indices of all three STRING tokens
```
A simple token-replacement refactor:

```python
from tokenize_rt import src_to_tokens, tokens_to_src, Token

source = 'old_name = 42'
tokens = src_to_tokens(source)

# Replace 'old_name' with 'new_name'
modified_tokens = []
for token in tokens:
    if token.matches(name='NAME', src='old_name'):
        # Create a new token with the same position but different source
        modified_tokens.append(Token(
            name=token.name,
            src='new_name',
            line=token.line,
            utf8_byte_offset=token.utf8_byte_offset,
        ))
    else:
        modified_tokens.append(token)

result = tokens_to_src(modified_tokens)
print(result)  # new_name = 42
```