A Python parser that supports error recovery and round-trip parsing for different Python versions.

Low-level tokenization functions and classes for converting Python source code into tokens. The tokenizer handles encoding detection, f-strings, Python version differences, and maintains precise position information.

Core functions for tokenizing Python source code with version-specific support.
def tokenize(code: str, *, version_info: PythonVersionInfo, start_pos: tuple[int, int] = (1, 0)):
    """
    Tokenize a Python source code string.

    Note that ``version_info`` and ``start_pos`` are keyword-only (they
    follow ``*`` in the signature) and must be passed by name.

    Args:
        code (str): Python source code to tokenize.
        version_info (PythonVersionInfo): Python version for tokenization rules.
        start_pos (tuple[int, int]): Starting (line, column) position; the
            default ``(1, 0)`` indicates 1-based lines and 0-based columns.

    Yields:
        PythonToken: Token objects with type, value, position, and prefix.
    """
def tokenize_lines(lines, *, version_info: PythonVersionInfo, indents=None, start_pos=(1, 0), is_first_token=True):
    """
    Tokenize a sequence of code lines.

    All parameters after ``lines`` are keyword-only.

    Args:
        lines (Iterable[str]): Lines of Python code.
        version_info (PythonVersionInfo): Python version for tokenization.
        indents (list[int], optional): Current indentation stack.
        start_pos (tuple[int, int]): Starting (line, column) position.
        is_first_token (bool): Whether this is the first token in the module.

    Yields:
        PythonToken: Token objects.
    """

# Enumeration of all Python token types with version-specific handling.
class PythonTokenTypes:
    """
    Enumeration of Python token types.

    Token categories:
        - Literals: STRING, NUMBER, FSTRING_START, FSTRING_STRING, FSTRING_END
        - Identifiers: NAME, ERRORTOKEN
        - Structure: NEWLINE, INDENT, DEDENT, ERROR_DEDENT, ENDMARKER
        - Operators: OP (covers all operators and punctuation)
    """

    # Core token types shared by every supported Python version.
    STRING: TokenType
    NUMBER: TokenType
    NAME: TokenType        # identifiers; keywords are also reported as NAME
    OP: TokenType
    NEWLINE: TokenType
    INDENT: TokenType
    DEDENT: TokenType
    ENDMARKER: TokenType
    ERRORTOKEN: TokenType  # emitted for unrecognizable input during error recovery

    # F-string tokens (Python 3.6+).
    FSTRING_START: TokenType
    FSTRING_STRING: TokenType
    FSTRING_END: TokenType

    # Error handling.
    ERROR_DEDENT: TokenType

import parso
from parso.python.tokenize import tokenize
from parso.utils import PythonVersionInfo

# Tokenize simple code.  NOTE: ``version_info`` is keyword-only in the
# tokenize() signature, so it must be passed by name — a positional
# ``tokenize(code, version)`` would raise TypeError.
code = 'x = 42 + y'
version = PythonVersionInfo(3, 9)
tokens = list(tokenize(code, version_info=version))
for token in tokens:
    print(f"{token.type.name}: '{token.string}' at {token.start_pos}")

# Tokenize with f-strings
f_string_code = 'name = "Alice"\ngreeting = f"Hello, {name}!"'
tokens = list(tokenize(f_string_code, version_info=version))
for token in tokens:
    if 'FSTRING' in token.type.name:
        print(f"F-string token: {token.type.name} = '{token.string}'")

# Tokenize with different Python versions
py38_code = 'items := [1, 2, 3]'  # Walrus operator
py38_tokens = list(tokenize(py38_code, version_info=PythonVersionInfo(3, 8)))
print("Python 3.8 tokens:", [(t.type.name, t.string) for t in py38_tokens])

# Handle tokenization errors
invalid_code = 'x = $invalid'  # Invalid character
tokens = list(tokenize(invalid_code, version_info=version))
for token in tokens:
    if token.type.name == 'ERRORTOKEN':
        print(f"Error token: '{token.string}' at {token.start_pos}")

from parso.utils import python_bytes_to_unicode
# Handle different encodings: decode bytes to str before tokenizing,
# honoring any coding declaration in the source.
latin1_bytes = b'# -*- coding: latin-1 -*-\ntext = "caf\xe9"'
unicode_text = python_bytes_to_unicode(latin1_bytes)
# version_info is keyword-only, so it must be passed by name.
tokens = list(tokenize(unicode_text, version_info=PythonVersionInfo(3, 9)))

# UTF-8 with BOM
utf8_bom = b'\xef\xbb\xbfprint("hello")'
unicode_text = python_bytes_to_unicode(utf8_bom)

# Multi-line tokenization with position tracking
multiline_code = '''def function():
    """Docstring here."""
    x = 1 + \\
        2 + 3
    return x'''
# version_info is keyword-only, so it must be passed by name.
tokens = list(tokenize(multiline_code, version_info=PythonVersionInfo(3, 9)))
for token in tokens:
    if token.string.strip():  # Skip whitespace-only tokens
        print(f"'{token.string}' at line {token.start_pos[0]}, col {token.start_pos[1]}")


def analyze_token_stream(code, version_info):
    """Analyze token stream characteristics.

    Args:
        code (str): Python source code to tokenize.
        version_info (PythonVersionInfo): Python version for tokenization.

    Returns:
        dict: Counts of total tokens, names, operators, literals, keywords,
        and indentation changes.
    """
    import keyword  # stdlib; local import keeps this snippet self-contained

    tokens = list(tokenize(code, version_info=version_info))
    stats = {
        'total_tokens': len(tokens),
        'names': 0,
        'operators': 0,
        'literals': 0,
        'keywords': 0,
        'indentation_changes': 0,
    }
    for token in tokens:
        if token.type.name == 'NAME':
            # Keywords arrive as NAME tokens.  Use keyword.iskeyword()
            # rather than a hand-rolled set, which previously missed
            # 'elif', 'try', 'with', 'in', 'and', etc. and miscounted
            # them as plain names.
            if keyword.iskeyword(token.string):
                stats['keywords'] += 1
            else:
                stats['names'] += 1
        elif token.type.name == 'OP':
            stats['operators'] += 1
        elif token.type.name in ('STRING', 'NUMBER'):
            stats['literals'] += 1
        elif token.type.name in ('INDENT', 'DEDENT'):
            stats['indentation_changes'] += 1
    return stats
# Usage
# NOTE(review): indentation inside the example string reconstructed from
# context — the snippet is meant to produce INDENT/DEDENT tokens.
code = '''
def example():
    x = 42
    if x > 0:
        return "positive"
    return "zero or negative"
'''
stats = analyze_token_stream(code, PythonVersionInfo(3, 9))
print("Token analysis:", stats)

# Install with Tessl CLI
npx tessl i tessl/pypi-parso