# Tokenization

Low-level tokenization functions and classes for converting Python source code into tokens. The tokenizer handles encoding detection, f-strings, and Python version differences, and maintains precise position information for every token.

## Capabilities

### Main Tokenization Functions

Core functions for tokenizing Python source code with version-specific support.
```python { .api }
def tokenize(code: str, *, version_info: PythonVersionInfo, start_pos: tuple[int, int] = (1, 0)):
    """
    Tokenize a Python source code string.

    Args:
        code (str): Python source code to tokenize
        version_info (PythonVersionInfo): Python version for tokenization rules
        start_pos (tuple[int, int]): Starting position as (line, column)

    Yields:
        PythonToken: Token objects with type, value, position, and prefix
    """

def tokenize_lines(lines, *, version_info: PythonVersionInfo, indents=None, start_pos=(1, 0), is_first_token=True):
    """
    Tokenize a sequence of code lines.

    Args:
        lines (Iterable[str]): Lines of Python code
        version_info (PythonVersionInfo): Python version for tokenization rules
        indents (list[int], optional): Current indentation stack
        start_pos (tuple[int, int]): Starting position as (line, column)
        is_first_token (bool): Whether this is the first token of the module

    Yields:
        PythonToken: Token objects
    """
```
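
The signatures above note that `version_info` is keyword-only. As a minimal sketch of driving `tokenize_lines` directly with pre-split lines (assuming parso is installed and exposes these functions as documented):

```python
from parso.python.tokenize import tokenize_lines
from parso.utils import PythonVersionInfo

# Feed pre-split lines to the tokenizer; each line keeps its trailing newline
lines = ['x = 1\n', 'y = 2\n']
tokens = list(tokenize_lines(lines, version_info=PythonVersionInfo(3, 9)))

# The stream terminates with an ENDMARKER token
names = [t.type.name for t in tokens]
print(names)
```

This is equivalent to calling `tokenize` on the joined string; `tokenize_lines` is useful when the source is already split, e.g. when re-tokenizing an edited region.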

### Token Type System

Enumeration of all Python token types with version-specific handling.

```python { .api }
class PythonTokenTypes:
    """
    Enumeration of Python token types.

    Token categories:
    - Literals: STRING, NUMBER, FSTRING_START, FSTRING_STRING, FSTRING_END
    - Identifiers: NAME, ERRORTOKEN
    - Structure: NEWLINE, INDENT, DEDENT, ERROR_DEDENT, ENDMARKER
    - Operators: OP (covers all operators and punctuation)
    """

    # Core token types
    STRING: TokenType
    NUMBER: TokenType
    NAME: TokenType
    OP: TokenType
    NEWLINE: TokenType
    INDENT: TokenType
    DEDENT: TokenType
    ENDMARKER: TokenType
    ERRORTOKEN: TokenType

    # F-string tokens (Python 3.6+)
    FSTRING_START: TokenType
    FSTRING_STRING: TokenType
    FSTRING_END: TokenType

    # Error handling
    ERROR_DEDENT: TokenType
```
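
Token types are singleton objects, so they can be compared directly rather than by name string. A short sketch (assuming `PythonTokenTypes` is importable from `parso.python.token`, as in current parso releases):

```python
from parso.python.tokenize import tokenize
from parso.python.token import PythonTokenTypes
from parso.utils import PythonVersionInfo

# Collect the distinct token types produced for a simple assignment
token_types = {t.type for t in tokenize('x = 1', version_info=PythonVersionInfo(3, 9))}

print(PythonTokenTypes.NAME in token_types)    # NAME for 'x'
print(PythonTokenTypes.NUMBER in token_types)  # NUMBER for '1'
```

Comparing against the enumeration members avoids typos that string comparisons like `token.type.name == 'NAME'` would silently accept.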

### Usage Examples

```python
from parso.python.tokenize import tokenize
from parso.utils import PythonVersionInfo

# Tokenize simple code (version_info is keyword-only)
code = 'x = 42 + y'
version = PythonVersionInfo(3, 9)

tokens = list(tokenize(code, version_info=version))
for token in tokens:
    print(f"{token.type.name}: '{token.string}' at {token.start_pos}")

# Tokenize code containing f-strings
f_string_code = 'name = "Alice"\ngreeting = f"Hello, {name}!"'
for token in tokenize(f_string_code, version_info=version):
    if 'FSTRING' in token.type.name:
        print(f"F-string token: {token.type.name} = '{token.string}'")

# Tokenize with a different Python version
py38_code = 'print(items := [1, 2, 3])'  # Walrus operator (Python 3.8+)
py38_tokens = list(tokenize(py38_code, version_info=PythonVersionInfo(3, 8)))
print("Python 3.8 tokens:", [(t.type.name, t.string) for t in py38_tokens])

# Handle tokenization errors
invalid_code = 'x = $invalid'  # '$' is not a valid character in Python source
for token in tokenize(invalid_code, version_info=version):
    if token.type.name == 'ERRORTOKEN':
        print(f"Error token: '{token.string}' at {token.start_pos}")
```

## Advanced Tokenization

### Encoding Detection

```python
from parso.python.tokenize import tokenize
from parso.utils import PythonVersionInfo, python_bytes_to_unicode

# Decode source bytes, honoring PEP 263 coding declarations
latin1_bytes = b'# -*- coding: latin-1 -*-\ntext = "caf\xe9"'
unicode_text = python_bytes_to_unicode(latin1_bytes)
tokens = list(tokenize(unicode_text, version_info=PythonVersionInfo(3, 9)))

# UTF-8 with BOM is handled transparently
utf8_bom = b'\xef\xbb\xbfprint("hello")'
unicode_text = python_bytes_to_unicode(utf8_bom)
```

### Position Tracking

```python
from parso.python.tokenize import tokenize
from parso.utils import PythonVersionInfo

# Multi-line tokenization with position tracking
multiline_code = '''def function():
    """Docstring here."""
    x = 1 + \\
        2 + 3
    return x'''

for token in tokenize(multiline_code, version_info=PythonVersionInfo(3, 9)):
    if token.string.strip():  # Skip tokens with empty or whitespace-only values
        print(f"'{token.string}' at line {token.start_pos[0]}, col {token.start_pos[1]}")
```
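
Positions are complemented by each token's `prefix`, which holds the whitespace and comments preceding it. A sketch of the resulting round-trip property (an assumption about the token-level invariant, consistent with the prefix field documented above):

```python
from parso.python.tokenize import tokenize
from parso.utils import PythonVersionInfo

# Each token carries preceding whitespace/comments in `prefix`, so
# concatenating prefix + string over the stream should rebuild the source
code = 'x = 1  # answer\nif x:\n    y = 2\n'
round_trip = ''.join(
    t.prefix + t.string
    for t in tokenize(code, version_info=PythonVersionInfo(3, 9))
)
print(round_trip == code)
```

This losslessness is what lets parso reproduce source text exactly, including comments and formatting.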

### Token Stream Analysis

```python
from parso.python.tokenize import tokenize
from parso.utils import PythonVersionInfo

def analyze_token_stream(code, version_info):
    """Collect summary statistics for a token stream."""
    tokens = list(tokenize(code, version_info=version_info))

    stats = {
        'total_tokens': len(tokens),
        'names': 0,
        'operators': 0,
        'literals': 0,
        'keywords': 0,
        'indentation_changes': 0,
    }

    # The tokenizer reports keywords as NAME tokens, so classify them here
    keywords = {'def', 'class', 'if', 'else', 'for', 'while', 'import', 'from', 'return'}

    for token in tokens:
        if token.type.name == 'NAME':
            if token.string in keywords:
                stats['keywords'] += 1
            else:
                stats['names'] += 1
        elif token.type.name == 'OP':
            stats['operators'] += 1
        elif token.type.name in ('STRING', 'NUMBER'):
            stats['literals'] += 1
        elif token.type.name in ('INDENT', 'DEDENT'):
            stats['indentation_changes'] += 1

    return stats

# Usage
code = '''
def example():
    x = 42
    if x > 0:
        return "positive"
    return "zero or negative"
'''

stats = analyze_token_stream(code, PythonVersionInfo(3, 9))
print("Token analysis:", stats)
```
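
The `start_pos` parameter documented at the top of this section shifts all reported positions, which is useful when tokenizing a snippet embedded in a larger file. A brief sketch under that assumption:

```python
from parso.python.tokenize import tokenize
from parso.utils import PythonVersionInfo

# Tokenize a snippet as if it started at line 10 of a larger file
snippet = 'x = 1'
tokens = list(tokenize(snippet, version_info=PythonVersionInfo(3, 9), start_pos=(10, 4)))

# The first token's reported position reflects the given offset
print(tokens[0].start_pos)
```

This way, error and position reporting for the embedded snippet lines up with coordinates in the enclosing document.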