Tessl Tile for pypi/pyupgrade@3.20.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

ast-utilities.md cli.md core-engine.md index.md plugin-system.md string-processing.md token-manipulation.md

string-processing.md (in docs/)

0
# String Processing
1

2
Specialized utilities for processing and transforming string literals and format strings. These functions handle the complex parsing and manipulation of Python string formats.
3

4
## Capabilities
5

6
### Format String Parsing
7

8
Parse and manipulate format strings with support for named Unicode escapes.
9

10
```python { .api }
11
def parse_format(s: str) -> list[DotFormatPart]:
12
    """
13
    Parse format string into component parts.
14
    
15
    Args:
16
        s: Format string to parse (e.g., "Hello {name}!")
17
        
18
    Returns:
19
        List of format parts, each containing:
20
        - Literal text
21
        - Field name (None for literal parts)  
22
        - Format specification (None if not specified)
23
        - Conversion specification (None if not specified)
24
        
25
    Notes:
26
        - Handles named Unicode escape sequences (\N{...})
27
        - Compatible with string.Formatter.parse()
28
        - Preserves all format string information for reconstruction
29
    """
30

31
def unparse_parsed_string(parsed: list[DotFormatPart]) -> str:
32
    """
33
    Convert parsed format parts back to format string.
34
    
35
    Args:
36
        parsed: List of format parts from parse_format()
37
        
38
    Returns:
39
        Reconstructed format string
40
        
41
    Notes:
42
        - Escapes curly braces in literal parts
43
        - Rebuilds field specifications with proper syntax
44
        - Inverse operation of parse_format()
45
    """
46
```
47

48
### String Encoding Utilities
49

50
Utilities for working with string encodings and codecs.
51

52
```python { .api }
53
def is_codec(encoding: str, name: str) -> bool:
54
    """
55
    Check if encoding matches codec name.
56
    
57
    Args:
58
        encoding: Encoding string to check (e.g., "utf-8", "ascii")
59
        name: Codec name to match against
60
        
61
    Returns:
62
        True if encoding resolves to the specified codec name
63
        
64
    Notes:
65
        - Handles encoding aliases (e.g., "utf8" → "utf-8")
66
        - Returns False for unknown encodings
67
        - Used to determine safe string-to-binary conversions
68
    """
69
```
70

71
## Type Definitions
72

73
### Format String Components
74

75
```python { .api }
76
DotFormatPart = tuple[str, Optional[str], Optional[str], Optional[str]]
77
"""
78
Format string component tuple.
79

80
Elements:
81
    0: Literal text portion
82
    1: Field name (None for literal-only parts)
83
    2: Format specification (None if not specified)  
84
    3: Conversion specification (None if not specified)
85

86
Examples:
87
    ("Hello ", None, None, None)  # Literal text
88
    ("", "name", None, None)       # Simple field {name}
89
    ("", "0", ">10", None)         # Formatted field {0:>10}
90
    ("", "value", None, "r")       # Conversion field {value!r}
91
"""
92
```
93

94
## Usage Examples
95

96
### Format String Analysis
97

98
```python
99
from pyupgrade._string_helpers import parse_format, unparse_parsed_string
100

101
# Parse a format string
102
format_str = "Hello {name}! You have {count:d} messages."
103
parts = parse_format(format_str)
104

105
# parts contains:
106
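# [
107
#     ("Hello ", "name", "", None),
108
#     ("! You have ", "count", "d", None),
109
#     (" messages.", None, None, None),
110
# ]
111
# As with string.Formatter.parse(), each tuple pairs a literal with the field
112
# that follows it; a field with no format spec has "" (not None) in slot 2.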
113

114
# Modify and reconstruct
115
# Remove format specifications to simplify
116
simplified_parts = [
117
    (text, field, None, conv) if field else (text, field, spec, conv)
118
    for text, field, spec, conv in parts
119
]
120

121
simplified_str = unparse_parsed_string(simplified_parts)
122
# Result: "Hello {name}! You have {count} messages."
123
```
124

125
### Encoding Detection for String Conversion
126

127
```python
128
from pyupgrade._string_helpers import is_codec
129

130
# Check if encoding is safe for ASCII conversion
131
def can_convert_to_ascii(encoding_str: str) -> bool:
132
    """Check if encoding is ASCII-compatible."""
133
    return (is_codec(encoding_str, 'ascii') or 
134
            is_codec(encoding_str, 'utf-8'))
135

136
# Usage in string.encode() conversion
137
encoding = "utf-8"
138
if can_convert_to_ascii(encoding):
139
    # Safe to convert "text".encode("utf-8") → b"text"
140
    pass
141

142
# Handle encoding aliases
143
assert is_codec("utf8", "utf-8")      # True - alias
144
assert is_codec("ascii", "ascii")     # True - exact
145
assert is_codec("latin1", "iso8859-1") # True - standard name
146
```
147

148
### Format String Simplification
149

150
```python
151
def simplify_format_string(format_str: str) -> str:
152
    """Remove positional format keys from format string."""
153
    
154
    parts = parse_format(format_str)
155
    simplified = []
156
    
157
    for text, field, spec, conv in parts:
158
        if field and field.isdigit():
159
            # Remove positional field numbers
160
            simplified.append((text, "", spec, conv))
161
        else:
162
            simplified.append((text, field, spec, conv))
163
    
164
    return unparse_parsed_string(simplified)
165

166
# Example usage
167
original = "Item {0}: {1} (price: ${2:.2f})"
168
simplified = simplify_format_string(original)  
169
# Result: "Item {}: {} (price: ${:.2f})"
170
```
171

172
### Unicode Escape Handling
173

174
```python
175
# parse_format handles named Unicode escapes correctly
176
unicode_format = "Greek letter: \\N{GREEK SMALL LETTER ALPHA} = {value}"
177
parts = parse_format(unicode_format)
178

179
# The literal part preserves the Unicode escape:
180
# [("Greek letter: \\N{GREEK SMALL LETTER ALPHA} = ", "value", "", None)]
181
# (the escape stays in the literal text, which shares one tuple with the field after it)
182

183
reconstructed = unparse_parsed_string(parts)
184
assert reconstructed == unicode_format
185
```
186

187
### Integration with Token Processing
188

189
```python
190
from pyupgrade._string_helpers import parse_format, unparse_parsed_string
191
from tokenize_rt import Token
192

193
def transform_format_token(token: Token) -> Token:
194
    """Transform format string token to remove positional keys."""
195
    
196
    try:
197
        parts = parse_format(token.src)
198
    except ValueError:
199
        # Malformed format string, skip transformation
200
        return token
201
    
202
    # Check if all format keys are positional and sequential
203
    field_nums = []
204
    for _, field, _, _ in parts:
205
        if field and field.isdigit():
206
            field_nums.append(int(field))
207
    
208
    if field_nums == list(range(len(field_nums))):
209
        # Sequential positional keys, safe to remove
210
        simplified_parts = [
211
            (text, "" if field and field.isdigit() else field, spec, conv)
212
            for text, field, spec, conv in parts
213
        ]
214
        new_src = unparse_parsed_string(simplified_parts)
215
        return token._replace(src=new_src)
216
    
217
    return token
218
```
219

220
## Advanced String Processing
221

222
### Format String Validation
223

224
```python
225
def validate_format_string(format_str: str) -> bool:
226
    """Check if format string is valid."""
227
    try:
228
        parse_format(format_str)
229
        return True
230
    except ValueError:
231
        return False
232

233
def count_format_fields(format_str: str) -> int:
234
    """Count number of format fields in string."""
235
    try:
236
        parts = parse_format(format_str)
237
        return sum(1 for _, field, _, _ in parts if field is not None)
238
    except ValueError:
239
        return 0
240
```
241

242
### Encoding Safety Checks
243

244
```python
245
def is_safe_binary_conversion(text: str, encoding: str) -> bool:
246
    """Check if string can be safely converted to binary literal."""
247
    
248
    # Check encoding compatibility
249
    if not (is_codec(encoding, 'ascii') or 
250
            is_codec(encoding, 'utf-8') or 
251
            is_codec(encoding, 'iso8859-1')):
252
        return False
253
    
254
    # Check for non-ASCII characters with restrictive encodings
255
    if not text.isascii() and is_codec(encoding, 'ascii'):
256
        return False
257
    
258
    # Check for Unicode escapes that can't be represented
259
    if '\\u' in text or '\\U' in text or '\\N' in text:
260
        if is_codec(encoding, 'ascii'):
261
            return False
262
    
263
    return True
264
```

Version

Tile

Files

string-processing.md (in docs/)

string-processing.md (in docs/)