or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-parsing.mderror-handling.mdgrammar-system.mdindex.mdpython-elements.mdtokenization.mdtree-navigation.mdutilities.md

tokenization.mddocs/

0

# Tokenization

1

2

Low-level tokenization functions and classes for converting Python source code into tokens. The tokenizer handles encoding detection, f-strings, Python version differences, and maintains precise position information.

3

4

## Capabilities

5

6

### Main Tokenization Functions

7

8

Core functions for tokenizing Python source code with version-specific support.

9

10

```python { .api }

11

def tokenize(code: str, *, version_info: PythonVersionInfo, start_pos: tuple[int, int] = (1, 0)):

12

"""

13

Tokenize Python source code string.

14

15

Args:

16

code (str): Python source code to tokenize

17

version_info (PythonVersionInfo): Python version for tokenization rules

18

start_pos (tuple[int, int]): Starting position (line, column)

19

20

Yields:

21

PythonToken: Token objects with type, value, position, and prefix

22

"""

23

24

def tokenize_lines(lines, *, version_info: PythonVersionInfo, indents=None, start_pos=(1, 0), is_first_token=True):

25

"""

26

Tokenize sequence of code lines.

27

28

Args:

29

lines (Iterable[str]): Lines of Python code

30

version_info (PythonVersionInfo): Python version for tokenization

31

indents (list[int], optional): Current indentation stack

32

start_pos (tuple[int, int]): Starting position (line, column)

33

is_first_token (bool): Whether this is the first token in the module

34

35

Yields:

36

PythonToken: Token objects

37

"""

38

```

39

40

### Token Type System

41

42

Enumeration of all Python token types with version-specific handling.

43

44

```python { .api }

45

class PythonTokenTypes:

46

"""

47

Enumeration of Python token types.

48

49

Token categories:

50

- Literals: STRING, NUMBER, FSTRING_START, FSTRING_STRING, FSTRING_END

51

- Identifiers: NAME, ERRORTOKEN

52

- Structure: NEWLINE, INDENT, DEDENT, ERROR_DEDENT, ENDMARKER

53

- Operators: OP (covers all operators and punctuation)

54

"""

55

56

# Core token types

57

STRING: TokenType

58

NUMBER: TokenType

59

NAME: TokenType

60

OP: TokenType

61

NEWLINE: TokenType

62

INDENT: TokenType

63

DEDENT: TokenType

64

ENDMARKER: TokenType

65

ERRORTOKEN: TokenType

66

67

# F-string tokens (Python 3.6+)

68

FSTRING_START: TokenType

69

FSTRING_STRING: TokenType

70

FSTRING_END: TokenType

71

72

# Error handling

73

ERROR_DEDENT: TokenType

74

```

75

76

### Usage Examples

77

78

```python

79

import parso

80

from parso.python.tokenize import tokenize

81

from parso.utils import PythonVersionInfo

82

83

# Tokenize simple code

84

code = 'x = 42 + y'

85

version = PythonVersionInfo(3, 9)

86

87

tokens = list(tokenize(code, version_info=version))

88

for token in tokens:

89

print(f"{token.type.name}: '{token.string}' at {token.start_pos}")

90

91

# Tokenize with f-strings

92

f_string_code = 'name = "Alice"\ngreeting = f"Hello, {name}!"'

93

tokens = list(tokenize(f_string_code, version_info=version))

94

95

for token in tokens:

96

if 'FSTRING' in token.type.name:

97

print(f"F-string token: {token.type.name} = '{token.string}'")

98

99

# Tokenize with different Python versions

100

py38_code = 'items := [1, 2, 3]' # Walrus operator

101

py38_tokens = list(tokenize(py38_code, version_info=PythonVersionInfo(3, 8)))

102

print("Python 3.8 tokens:", [(t.type.name, t.string) for t in py38_tokens])

103

104

# Handle tokenization errors

105

invalid_code = 'x = $invalid' # Invalid character

106

tokens = list(tokenize(invalid_code, version_info=version))

107

for token in tokens:

108

if token.type.name == 'ERRORTOKEN':

109

print(f"Error token: '{token.string}' at {token.start_pos}")

110

```

111

112

## Advanced Tokenization

113

114

### Encoding Detection

115

116

```python

117

from parso.utils import python_bytes_to_unicode

118

119

# Handle different encodings

120

latin1_bytes = b'# -*- coding: latin-1 -*-\ntext = "caf\xe9"'

121

unicode_text = python_bytes_to_unicode(latin1_bytes)

122

tokens = list(tokenize(unicode_text, version_info=PythonVersionInfo(3, 9)))

123

124

# UTF-8 with BOM

125

utf8_bom = b'\xef\xbb\xbfprint("hello")'

126

unicode_text = python_bytes_to_unicode(utf8_bom)

127

```

128

129

### Position Tracking

130

131

```python

132

# Multi-line tokenization with position tracking

133

multiline_code = '''def function():

134

"""Docstring here."""

135

x = 1 + \\

136

2 + 3

137

return x'''

138

139

tokens = list(tokenize(multiline_code, version_info=PythonVersionInfo(3, 9)))

140

for token in tokens:

141

if token.string.strip(): # Skip whitespace-only tokens

142

print(f"'{token.string}' at line {token.start_pos[0]}, col {token.start_pos[1]}")

143

```

144

145

### Token Stream Analysis

146

147

```python

148

def analyze_token_stream(code, version_info):

149

"""Analyze token stream characteristics."""

150

tokens = list(tokenize(code, version_info=version_info))

151

152

stats = {

153

'total_tokens': len(tokens),

154

'names': 0,

155

'operators': 0,

156

'literals': 0,

157

'keywords': 0,

158

'indentation_changes': 0

159

}

160

161

keywords = {'def', 'class', 'if', 'else', 'for', 'while', 'import', 'from', 'return'}

162

163

for token in tokens:

164

if token.type.name == 'NAME':

165

if token.string in keywords:

166

stats['keywords'] += 1

167

else:

168

stats['names'] += 1

169

elif token.type.name == 'OP':

170

stats['operators'] += 1

171

elif token.type.name in ('STRING', 'NUMBER'):

172

stats['literals'] += 1

173

elif token.type.name in ('INDENT', 'DEDENT'):

174

stats['indentation_changes'] += 1

175

176

return stats

177

178

# Usage

179

code = '''

180

def example():

181

x = 42

182

if x > 0:

183

return "positive"

184

return "zero or negative"

185

'''

186

187

stats = analyze_token_stream(code, PythonVersionInfo(3, 9))

188

print("Token analysis:", stats)

189

```