Annotate AST trees with source code positions
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Utilities for converting between different position representations (line/column vs character offsets) and working with source code positions. These utilities handle the complexities of Unicode text and provide compatibility across different position systems.
Utility class for converting between character offsets and (line, column) positions in source text.
class LineNumbers:
def __init__(self, text):
"""
Initialize with source text for position calculations.
Parameters:
- text (str): Source text to analyze
"""
def line_to_offset(self, line, column) -> int:
"""
Convert line and column position to character offset.
Parameters:
- line (int): Line number (1-based)
- column (int): Column position (0-based)
Returns:
int: Character offset in source text
"""
def offset_to_line(self, offset) -> Tuple[int, int]:
"""
Convert character offset to line and column position.
Parameters:
- offset (int): Character offset in source text
Returns:
Tuple[int, int]: (line, column) where line is 1-based, column is 0-based
"""
def from_utf8_col(self, line, utf8_column) -> int:
"""
Convert UTF8 byte column to Unicode character column.
Parameters:
- line (int): Line number (1-based)
- utf8_column (int): Column position in UTF8 bytes
Returns:
int: Column position in Unicode characters
"""import asttokens
source = "hello = 'world'\nprint(hello)"
line_numbers = asttokens.LineNumbers(source)
# Convert position to offset
offset = line_numbers.line_to_offset(1, 6)  # Line 1, column 6
print(source[offset])  # '=' (character at that position)
# Convert offset to position
line, col = line_numbers.offset_to_line(16) # Character 16
print(f"Line {line}, Column {col}") # Line 2, Column 0
# Handle UTF8 encoding differences
source_utf8 = "café = 'délicious'"
line_numbers_utf8 = asttokens.LineNumbers(source_utf8)
unicode_col = line_numbers_utf8.from_utf8_col(1, 5) # UTF8 byte 5
print(unicode_col)  # Unicode character position

Function to determine if nodes or Python versions support faster tokenless operations.
def supports_tokenless(node=None) -> bool:
"""
Check if node or Python version supports tokenless operation.
Parameters:
- node (ast.AST, optional): Specific AST node to check
Returns:
bool: True if tokenless operation is supported
"""import asttokens
import ast
source = "x = [1, 2, 3]"
tree = ast.parse(source)
# Check general tokenless support
if asttokens.supports_tokenless():
print("Python version supports tokenless operations")
# Check specific node support
assign_node = tree.body[0]
if asttokens.supports_tokenless(assign_node):
print("This node supports tokenless operations")
# Use ASTText for better performance
astext = asttokens.ASTText(source, tree=tree)
text = astext.get_text(assign_node)
else:
print("Node requires full tokenization")
# Use ASTTokens
atok = asttokens.ASTTokens(source, tree=tree)
text = atok.get_text(assign_node)

Helper functions for working with tokens are available through the asttokens.util module. These provide token matching, type checking, and generation capabilities.
# Available through asttokens.util module
from asttokens.util import (
token_repr, match_token, expect_token, is_non_coding_token,
generate_tokens, patched_generate_tokens
)
def token_repr(tok_type, string) -> str:
"""
Create human-readable representation of token.
Parameters:
- tok_type (int): Token type from token module
- string (str): Token string content
Returns:
str: Human-friendly token representation
"""
def match_token(token, tok_type, tok_str=None) -> bool:
"""
Check if token matches specified type and optionally string.
Parameters:
- token (Token): Token to check
- tok_type (int): Expected token type
- tok_str (str, optional): Expected token string
Returns:
bool: True if token matches criteria
"""
def expect_token(token, tok_type, tok_str=None):
"""
Validate that token matches expected type/string, raise if not.
Parameters:
- token (Token): Token to validate
- tok_type (int): Expected token type
- tok_str (str, optional): Expected token string
Raises:
ValueError: If token doesn't match expectations
"""
def is_non_coding_token(token_type) -> bool:
"""
Check if token type represents non-coding content.
Parameters:
- token_type (int): Token type to check
Returns:
bool: True for comments, newlines, encoding declarations
"""
def generate_tokens(text) -> Iterator[Token]:
"""
Generate enhanced Token objects from source text.
Parameters:
- text (str): Source code to tokenize
Yields:
Token: Enhanced token with position information
"""
def patched_generate_tokens(original_tokens) -> Iterator[Token]:
"""
Fixed tokenizer that handles non-ASCII identifiers correctly.
Parameters:
- original_tokens (Iterator): Original token stream
Yields:
Token: Corrected tokens with proper handling
"""import asttokens
import asttokens.util
import token
source = "name = 'value' # comment"
atok = asttokens.ASTTokens(source, parse=True)
# Get first token
first_token = atok.tokens[0]
# Check token matching
if asttokens.util.match_token(first_token, token.NAME, 'name'):
print("Found 'name' token")
# Create readable representation
repr_str = asttokens.util.token_repr(first_token.type, first_token.string)
print(repr_str) # "NAME:'name'"
# Check for non-coding tokens
for tok in atok.tokens:
if asttokens.util.is_non_coding_token(tok.type):
print(f"Non-coding token: {tok.string}")
# Generate tokens manually
tokens = list(asttokens.util.generate_tokens("x = 1"))
print([f"{t.type}:{t.string}" for t in tokens])The Token class provides rich position information for each token.
class Token:
"""
Enhanced token representation with comprehensive position information.
Attributes:
- type (int): Token type from token module
- string (str): Token text content
- start (Tuple[int, int]): Starting (row, column) position
- end (Tuple[int, int]): Ending (row, column) position
- line (str): Complete line text containing this token
- index (int): Token index in token list
- startpos (int): Starting character offset
- endpos (int): Ending character offset
"""
def __str__(self) -> str:
"""
Human-readable token representation.
Returns:
str: String representation of token
"""import asttokens
source = "def func():\n pass"
atok = asttokens.ASTTokens(source, parse=True)
# Examine token details
def_token = atok.tokens[0]
print(f"Type: {def_token.type}") # Token type number
print(f"String: {def_token.string}") # 'def'
print(f"Start: {def_token.start}") # (1, 0) - line 1, column 0
print(f"End: {def_token.end}") # (1, 3) - line 1, column 3
print(f"Line: {def_token.line}") # 'def func():'
print(f"Index: {def_token.index}") # 0 - first token
print(f"Start pos: {def_token.startpos}") # 0 - character offset 0
print(f"End pos: {def_token.endpos}") # 3 - character offset 3
print(f"Repr: {def_token}") # Human-readable representation
# Use position information
text_slice = source[def_token.startpos:def_token.endpos]
print(text_slice)  # 'def' - exact token text

Install with Tessl CLI
npx tessl i tessl/pypi-asttokens