CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-pyupgrade

A tool and pre-commit hook to automatically upgrade Python syntax for newer versions of the language.

Pending
Overview
Eval results
Files

docs/string-processing.md

String Processing

Specialized utilities for processing and transforming string literals and format strings. These functions handle the complex parsing and manipulation of Python string formats.

Capabilities

Format String Parsing

Parse and manipulate format strings with support for named Unicode escapes.

def parse_format(s: str) -> list[DotFormatPart]:
    """
    Parse a format string into its component parts.

    Args:
        s: Format string to parse (e.g., "Hello {name}!")

    Returns:
        List of DotFormatPart tuples. Each tuple holds, in order:
        - Literal text
        - Field name (None for literal parts)
        - Format specification (None if not specified)
        - Conversion specification (None if not specified)

    Notes:
        - Handles named Unicode escape sequences (\\N{...})
        - Compatible with string.Formatter.parse()
        - Preserves all format string information for reconstruction
        - NOTE(review): string.Formatter.parse() reports '' rather than None
          for the format specification of a field written without one, and
          returns the literal text preceding a field in the same tuple as
          that field; confirm the "None if not specified" wording above
          against actual output.
    """

def unparse_parsed_string(parsed: list[DotFormatPart]) -> str:
    """
    Convert parsed format parts back into a format string.

    Args:
        parsed: List of format parts, as produced by parse_format()
            (possibly modified by the caller)

    Returns:
        Reconstructed format string

    Notes:
        - Escapes curly braces in literal parts
        - Rebuilds field specifications with proper syntax
        - Inverse operation of parse_format()
    """

String Encoding Utilities

Utilities for working with string encodings and codecs.

def is_codec(encoding: str, name: str) -> bool:
    """
    Check whether an encoding string refers to the given codec.

    Args:
        encoding: Encoding string to check (e.g., "utf-8", "ascii")
        name: Codec name to match against

    Returns:
        True if encoding resolves to the specified codec name

    Notes:
        - Handles encoding aliases (e.g., "utf8" → "utf-8")
        - Returns False for unknown encodings
        - Used to determine safe string-to-binary conversions
        - NOTE(review): the examples on this page pass the canonical codec
          name as `name` ("utf-8", "ascii", "iso8859-1"); presumably `name`
          itself is not alias-resolved - confirm before passing an alias.
    """

Type Definitions

Format String Components

# One parsed segment of a format string, laid out as
# (literal_text, field_name, format_spec, conversion).
# NOTE(review): parse_format() is described on this page as compatible with
# string.Formatter.parse(), which yields '' (not None) for the format spec of
# a field written without one and keeps the literal text that precedes a
# field in the same tuple as the field. Confirm the examples below against
# real output before relying on their exact shape.
DotFormatPart = tuple[str, Optional[str], Optional[str], Optional[str]]
"""
Format string component tuple.

Elements:
    0: Literal text portion
    1: Field name (None for literal-only parts)
    2: Format specification (None if not specified)  
    3: Conversion specification (None if not specified)

Examples:
    ("Hello ", None, None, None)  # Literal text
    ("", "name", None, None)       # Simple field {name}
    ("", "0", ">10", None)         # Formatted field {0:>10}
    ("", "value", None, "r")       # Conversion field {value!r}
"""

Usage Examples

Format String Analysis

from pyupgrade._string_helpers import parse_format, unparse_parsed_string

# Parse a format string
format_str = "Hello {name}! You have {count:d} messages."
parts = parse_format(format_str)

# Like string.Formatter.parse(), each tuple carries the literal text that
# precedes a field together with that field, and a field written without a
# format spec reports "" for it. parts therefore contains:
# [
#     ("Hello ", "name", "", None),
#     ("! You have ", "count", "d", None),
#     (" messages.", None, None, None),
# ]

# Modify and reconstruct: drop every format specification.
# Literal-only parts already carry None as their spec, so no per-part test is
# needed. (A truthiness test on `field` would wrongly skip auto-numbered
# fields such as "{:d}", whose field name is the empty string.)
simplified_parts = [
    (text, field, None, conv)
    for text, field, _spec, conv in parts
]

simplified_str = unparse_parsed_string(simplified_parts)
# Result: "Hello {name}! You have {count} messages."

Encoding Detection for String Conversion

from pyupgrade._string_helpers import is_codec

# Check if encoding is safe for ASCII conversion
def can_convert_to_ascii(encoding_str: str) -> bool:
    """Return True when *encoding_str* resolves to the ASCII or UTF-8 codec."""
    accepted_codecs = ('ascii', 'utf-8')
    # any() stops at the first match, just like a chained `or`.
    return any(is_codec(encoding_str, codec) for codec in accepted_codecs)

# Usage when rewriting a "...".encode(...) call
encoding = "utf-8"
if can_convert_to_ascii(encoding):
    # The encoding is one of the accepted codecs, so an ASCII-only literal
    # such as "text".encode("utf-8") can be rewritten as b"text"
    pass

# Encoding aliases: the first argument may be an alias, the second is the
# codec name it is expected to resolve to
assert is_codec("utf8", "utf-8")      # True - "utf8" is an alias of utf-8
assert is_codec("ascii", "ascii")     # True - exact match
assert is_codec("latin1", "iso8859-1") # True - "latin1" is an alias of iso8859-1

Format String Simplification

def simplify_format_string(format_str: str) -> str:
    """Remove positional format keys from a format string when that is safe.

    Explicit indices are replaced by automatic numbering ("{0} {1}" becomes
    "{} {}") only if doing so cannot change which argument each field refers
    to. The string is returned unchanged when:

    - there are no purely numeric field names to remove,
    - the numeric field names are not exactly 0, 1, 2, ... in order of
      appearance (e.g. "{1} {0}" or "{0} {0}"),
    - a field combines an index with attribute or item access
      (e.g. "{0.name}", "{0[key]}"), or
    - a format spec contains a nested replacement field (e.g. "{0:{1}}").

    In the last two cases the leftover explicit index would be mixed with
    automatic numbering, which str.format() rejects.

    Args:
        format_str: Format string to simplify.

    Returns:
        The simplified format string, or ``format_str`` itself when no safe
        simplification applies.

    Raises:
        ValueError: Propagated from parse_format() for malformed input.
    """
    parts = parse_format(format_str)

    positions = []
    for _, field, spec, _ in parts:
        if field is None:
            continue
        if spec and "{" in spec:
            # Nested fields inside the spec are not rewritten here.
            return format_str
        if field.isdecimal():
            # isdecimal() (unlike isdigit()) only accepts what int() can parse.
            positions.append(int(field))
        elif field[:1].isdecimal():
            # Index followed by ".attr" or "[key]": must stay explicit.
            return format_str

    # Automatic numbering hands out 0, 1, 2, ... in order of appearance, so
    # the explicit indices must already follow exactly that sequence.
    if not positions or positions != list(range(len(positions))):
        return format_str

    simplified = [
        (text, "" if field is not None and field.isdecimal() else field, spec, conv)
        for text, field, spec, conv in parts
    ]
    return unparse_parsed_string(simplified)

# Example usage: the indices 0, 1, 2 appear in order, so they can be dropped
original = "Item {0}: {1} (price: ${2:.2f})"
simplified = simplify_format_string(original)
# Result: "Item {}: {} (price: ${:.2f})"

Unicode Escape Handling

# parse_format handles named Unicode escapes correctly: the braces of a
# \N{...} escape are kept as literal text instead of being read as a field
unicode_format = "Greek letter: \\N{GREEK SMALL LETTER ALPHA} = {value}"
parts = parse_format(unicode_format)

# The literal part preserves the Unicode escape.
# NOTE(review): the original page showed the literal text and the field as
# two separate tuples with None as the spec. If parse_format() follows
# string.Formatter.parse() here, the literal text shares a tuple with the
# field after it and the missing spec is "" - confirm against real output:
# [("Greek letter: \\N{GREEK SMALL LETTER ALPHA} = ", "value", "", None)]

# Round trip: unparsing the untouched parts gives back the input
reconstructed = unparse_parsed_string(parts)
assert reconstructed == unicode_format

Integration with Token Processing

from pyupgrade._string_helpers import parse_format, unparse_parsed_string
from tokenize_rt import Token

def transform_format_token(token: Token) -> Token:
    """Transform a format string token by removing its positional keys.

    Explicit indices ("{0} {1}") are replaced by automatic numbering
    ("{} {}") only when that cannot change the meaning of the string. The
    token is returned unchanged when:

    - its source cannot be parsed as a format string,
    - it has no purely numeric field names,
    - the numeric field names are not exactly 0, 1, 2, ... in order,
    - a field combines an index with attribute or item access
      (e.g. "{0.name}"), or
    - a format spec contains a nested replacement field (e.g. "{0:{1}}").

    The last two cases would otherwise leave an explicit index next to
    automatically numbered fields, which str.format() rejects.

    Args:
        token: Token whose ``src`` is parsed with parse_format().

    Returns:
        A copy of ``token`` with the rewritten source, or ``token`` itself
        when no safe rewrite applies.
    """
    try:
        parts = parse_format(token.src)
    except ValueError:
        # Malformed format string, skip transformation
        return token

    field_nums = []
    for _, field, spec, _ in parts:
        if field is None:
            continue
        if spec and "{" in spec:
            # Nested fields inside the spec are not rewritten here.
            return token
        if field.isdecimal():
            # isdecimal() (unlike isdigit()) only accepts what int() can
            # parse, so int() cannot raise on input such as "²".
            field_nums.append(int(field))
        elif field[:1].isdecimal():
            # Index followed by ".attr" or "[key]": must stay explicit.
            return token

    # Automatic numbering hands out 0, 1, 2, ... in order of appearance, so
    # the explicit keys must already follow exactly that sequence.
    if not field_nums or field_nums != list(range(len(field_nums))):
        return token

    simplified_parts = [
        (text, "" if field is not None and field.isdecimal() else field, spec, conv)
        for text, field, spec, conv in parts
    ]
    new_src = unparse_parsed_string(simplified_parts)
    return token._replace(src=new_src)

Advanced String Processing

Format String Validation

def validate_format_string(format_str: str) -> bool:
    """Report whether parse_format() accepts *format_str*.

    Returns False when parsing raises ValueError, True otherwise.
    """
    try:
        parse_format(format_str)
    except ValueError:
        return False
    else:
        return True

def count_format_fields(format_str: str) -> int:
    """Return how many parsed parts of *format_str* carry a field.

    A part counts when its field name is not None, so auto-numbered fields
    (empty field name) are included. Returns 0 when parsing raises
    ValueError.
    """
    try:
        field_names = [
            name
            for _, name, _, _ in parse_format(format_str)
            if name is not None
        ]
    except ValueError:
        return 0
    return len(field_names)

Encoding Safety Checks

def is_safe_binary_conversion(text: str, encoding: str) -> bool:
    """Check if a string can be safely converted to a binary literal.

    The check is conservative: it answers True only when writing the same
    literal body with a ``b`` prefix yields the bytes that
    ``text.encode(encoding)`` would produce.

    Args:
        text: Body of the string literal. The escape checks below look for
            backslash sequences, so this is presumably the literal as
            written in source, with escapes unprocessed - confirm against
            the caller.
        encoding: Encoding name passed to ``.encode()``; aliases are
            resolved by is_codec().

    Returns:
        True if the conversion is safe, False otherwise.
    """
    latin1 = is_codec(encoding, 'iso8859-1')

    # Only these codecs map ASCII text to identical bytes.
    if not (latin1 or
            is_codec(encoding, 'ascii') or
            is_codec(encoding, 'utf-8')):
        return False

    # A bytes literal may only contain ASCII characters, whatever the
    # encoding, so non-ASCII text can never be written as b"...".
    if not text.isascii():
        return False

    # \u, \U and \N escapes are not recognised inside bytes literals; they
    # would be kept as a literal backslash sequence and change the value.
    if '\\u' in text or '\\U' in text or '\\N' in text:
        return False

    # "\xe9".encode("utf-8") is b"\xc3\xa9", not b"\xe9". Only latin-1 maps
    # every \xNN escape to the single byte NN, so reject \x for other codecs.
    if '\\x' in text and not latin1:
        return False

    return True

Install with Tessl CLI

npx tessl i tessl/pypi-pyupgrade

docs

ast-utilities.md

cli.md

core-engine.md

index.md

plugin-system.md

string-processing.md

token-manipulation.md

tile.json