CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-parso

A Python parser that supports error recovery and round-trip parsing for different Python versions

Pending
Overview
Eval results
Files

tokenization.mddocs/

Tokenization

Low-level tokenization functions and classes for converting Python source code into tokens. The tokenizer handles encoding detection, f-strings, Python version differences, and maintains precise position information.

Capabilities

Main Tokenization Functions

Core functions for tokenizing Python source code with version-specific support.

def tokenize(code: str, *, version_info: PythonVersionInfo, start_pos: tuple[int, int] = (1, 0)):
    """
    Tokenize a Python source code string.

    Note: ``version_info`` is keyword-only (enforced by the bare ``*`` in the
    signature), so callers must pass it by name.

    Args:
        code (str): Python source code to tokenize
        version_info (PythonVersionInfo): Python version for tokenization rules
        start_pos (tuple[int, int]): Starting (line, column) position; defaults
            to (1, 0) — lines are 1-based, columns 0-based

    Yields:
        PythonToken: Token objects with type, value, position, and prefix
    """

def tokenize_lines(lines, *, version_info: PythonVersionInfo, indents=None, start_pos=(1, 0), is_first_token=True):
    """
    Tokenize a sequence of code lines.

    Lower-level variant of ``tokenize`` that operates on pre-split lines and
    allows resuming with an existing indentation stack. All options after
    ``lines`` are keyword-only.

    Args:
        lines (Iterable[str]): Lines of Python code
        version_info (PythonVersionInfo): Python version for tokenization
        indents (list[int], optional): Current indentation stack; ``None``
            starts from an empty stack
        start_pos (tuple[int, int]): Starting (line, column) position
        is_first_token (bool): Whether this is the first token in the module

    Yields:
        PythonToken: Token objects
    """

Token Type System

Enumeration of all Python token types with version-specific handling.

class PythonTokenTypes:
    """
    Enumeration of the token types emitted by the Python tokenizer.

    Grouped by role:
    - Literals: STRING, NUMBER, FSTRING_START, FSTRING_STRING, FSTRING_END
    - Identifiers: NAME, ERRORTOKEN
    - Structure: NEWLINE, INDENT, DEDENT, ERROR_DEDENT, ENDMARKER
    - Operators: OP (covers all operators and punctuation)
    """

    # Literal values
    STRING: TokenType
    NUMBER: TokenType

    # Identifiers (keywords are also emitted as NAME) and unrecognized input
    NAME: TokenType
    ERRORTOKEN: TokenType

    # Single catch-all type for every operator and punctuation character
    OP: TokenType

    # Structural tokens that delimit statements and blocks
    NEWLINE: TokenType
    INDENT: TokenType
    DEDENT: TokenType
    ERROR_DEDENT: TokenType  # error-recovery counterpart of DEDENT
    ENDMARKER: TokenType

    # F-string tokens (Python 3.6+)
    FSTRING_START: TokenType
    FSTRING_STRING: TokenType
    FSTRING_END: TokenType

Usage Examples

import parso
from parso.python.tokenize import tokenize
from parso.utils import PythonVersionInfo

# Tokenize simple code.
# NOTE: version_info is keyword-only in tokenize's signature (the bare ``*``),
# so it must be passed by name — a positional call raises TypeError.
code = 'x = 42 + y'
version = PythonVersionInfo(3, 9)

tokens = list(tokenize(code, version_info=version))
for token in tokens:
    print(f"{token.type.name}: '{token.string}' at {token.start_pos}")

# Tokenize with f-strings
f_string_code = 'name = "Alice"\ngreeting = f"Hello, {name}!"'
tokens = list(tokenize(f_string_code, version_info=version))

for token in tokens:
    if 'FSTRING' in token.type.name:
        print(f"F-string token: {token.type.name} = '{token.string}'")

# Tokenize with different Python versions
py38_code = 'items := [1, 2, 3]'  # Walrus operator
py38_tokens = list(tokenize(py38_code, version_info=PythonVersionInfo(3, 8)))
print("Python 3.8 tokens:", [(t.type.name, t.string) for t in py38_tokens])

# Handle tokenization errors: invalid characters become ERRORTOKEN rather
# than raising, so the loop below can report them.
invalid_code = 'x = $invalid'  # Invalid character
tokens = list(tokenize(invalid_code, version_info=version))
for token in tokens:
    if token.type.name == 'ERRORTOKEN':
        print(f"Error token: '{token.string}' at {token.start_pos}")

Advanced Tokenization

Encoding Detection

from parso.utils import python_bytes_to_unicode

# Handle different encodings — presumably python_bytes_to_unicode honors the
# PEP 263 coding declaration on the first line (verify against parso docs).
latin1_bytes = b'# -*- coding: latin-1 -*-\ntext = "caf\xe9"'
unicode_text = python_bytes_to_unicode(latin1_bytes)
# version_info is keyword-only in tokenize's signature; pass it by name
# (a positional call raises TypeError).
tokens = list(tokenize(unicode_text, version_info=PythonVersionInfo(3, 9)))

# UTF-8 with BOM
utf8_bom = b'\xef\xbb\xbfprint("hello")'
unicode_text = python_bytes_to_unicode(utf8_bom)

Position Tracking

# Multi-line tokenization with position tracking
multiline_code = '''def function():
    """Docstring here."""
    x = 1 + \\
        2 + 3
    return x'''

# version_info is keyword-only in tokenize's signature; pass it by name
# (a positional call raises TypeError).
tokens = list(tokenize(multiline_code, version_info=PythonVersionInfo(3, 9)))
for token in tokens:
    if token.string.strip():  # Skip whitespace-only tokens
        print(f"'{token.string}' at line {token.start_pos[0]}, col {token.start_pos[1]}")

Token Stream Analysis

def analyze_token_stream(code, version_info):
    """
    Analyze token stream characteristics.

    Args:
        code (str): Python source code to tokenize.
        version_info (PythonVersionInfo): Python version for tokenization rules.

    Returns:
        dict: Counts per category — 'total_tokens', 'names', 'operators',
        'literals', 'keywords', 'indentation_changes'.
    """
    # version_info is keyword-only in tokenize's signature (bare ``*``), so it
    # must be forwarded by name; the positional call would raise TypeError.
    tokens = list(tokenize(code, version_info=version_info))

    stats = {
        'total_tokens': len(tokens),
        'names': 0,
        'operators': 0,
        'literals': 0,
        'keywords': 0,
        'indentation_changes': 0
    }

    # The tokenizer emits keywords as NAME tokens (no separate keyword type),
    # so we classify against an explicit — intentionally partial — keyword set.
    keywords = {'def', 'class', 'if', 'else', 'for', 'while', 'import', 'from', 'return'}

    for token in tokens:
        type_name = token.type.name  # hoist the attribute chain once per token
        if type_name == 'NAME':
            if token.string in keywords:
                stats['keywords'] += 1
            else:
                stats['names'] += 1
        elif type_name == 'OP':
            stats['operators'] += 1
        elif type_name in ('STRING', 'NUMBER'):
            stats['literals'] += 1
        elif type_name in ('INDENT', 'DEDENT'):
            stats['indentation_changes'] += 1

    return stats

# Usage: run the analyzer over a small sample module and print the counts.
sample_source = '''
def example():
    x = 42
    if x > 0:
        return "positive"
    return "zero or negative"
'''

token_stats = analyze_token_stream(sample_source, PythonVersionInfo(3, 9))
print("Token analysis:", token_stats)

Install with Tessl CLI

npx tessl i tessl/pypi-parso

docs

core-parsing.md

error-handling.md

grammar-system.md

index.md

python-elements.md

tokenization.md

tree-navigation.md

utilities.md

tile.json