0
# String Processing
1
2
Specialized utilities for processing and transforming string literals and format strings. These functions handle the complex parsing and manipulation of Python string formats.
3
4
## Capabilities
5
6
### Format String Parsing
7
8
Parse and manipulate format strings with support for named Unicode escapes.
9
10
```python { .api }
11
def parse_format(s: str) -> list[DotFormatPart]:
    r"""
    Parse a format string into its component parts.

    Args:
        s: Format string to parse (e.g., "Hello {name}!")

    Returns:
        List of 4-tuples laid out like the items yielded by
        string.Formatter.parse():
        - Literal text preceding the field (may be empty)
        - Field name (None when the part is literal text only)
        - Format specification ("" when the field has none; None for
          literal-only parts)
        - Conversion character (None if not specified)

        For example, "Hello {name}!" parses to
        [("Hello ", "name", "", None), ("!", None, None, None)].

    Raises:
        ValueError: presumably for malformed format strings; the usage
            examples in this document catch it around parse_format().

    Notes:
        - Handles named Unicode escape sequences (\N{...})
        - Compatible with string.Formatter.parse()
        - Preserves all format string information for reconstruction
    """
30
31
def unparse_parsed_string(parsed: list[DotFormatPart]) -> str:
    """
    Rebuild a format string from its parsed parts.

    Args:
        parsed: Format parts, as produced by parse_format()

    Returns:
        The format string assembled from those parts

    Notes:
        - Curly braces appearing in literal text are escaped
        - Field specifications are re-emitted with proper syntax
        - Acts as the inverse of parse_format()
    """
46
```
47
48
### String Encoding Utilities
49
50
Utilities for working with string encodings and codecs.
51
52
```python { .api }
53
def is_codec(encoding: str, name: str) -> bool:
    """
    Tell whether an encoding string refers to a given codec.

    Args:
        encoding: Encoding string to check (e.g., "utf-8", "ascii")
        name: Codec name to match against; the examples in this document
            pass canonical names such as "utf-8" and "iso8859-1"

    Returns:
        True if encoding resolves to the specified codec name

    Notes:
        - Encoding aliases are resolved (e.g., "utf8" → "utf-8")
        - Unknown encodings yield False
        - Used to determine safe string-to-binary conversions
    """
69
```
70
71
## Type Definitions
72
73
### Format String Components
74
75
```python { .api }
76
DotFormatPart = tuple[str, Optional[str], Optional[str], Optional[str]]
"""
Format string component tuple, laid out like the tuples yielded by
string.Formatter.parse().

Elements:
    0: Literal text preceding the field (may be empty)
    1: Field name (None when the part is literal text only,
       "" for an auto-numbered {} field)
    2: Format specification ("" when the field has none,
       None for literal-only parts)
    3: Conversion character (None if not specified)

Examples:
    ("Hello ", None, None, None)   # Literal text only
    ("", "name", "", None)         # Simple field {name}
    ("", "0", ">10", None)         # Formatted field {0:>10}
    ("", "value", "", "r")         # Conversion field {value!r}
    ("Hi ", "name", "", None)      # Literal followed by a field: Hi {name}
"""
92
```
93
94
## Usage Examples
95
96
### Format String Analysis
97
98
```python
99
from pyupgrade._string_helpers import parse_format, unparse_parsed_string
100
101
# Parse a format string
102
format_str = "Hello {name}! You have {count:d} messages."
parts = parse_format(format_str)

# parts follows the string.Formatter.parse() layout: each tuple carries the
# literal text that precedes a field together with that field's details.
# [
#     ("Hello ", "name", "", None),
#     ("! You have ", "count", "d", None),
#     (" messages.", None, None, None),
# ]

# Modify and reconstruct
# Remove format specifications to simplify. Literal-only parts already have
# no spec, so clearing it on every part also covers auto-numbered fields
# such as {:d}, whose field name is the empty string.
simplified_parts = [
    (text, field, None, conv)
    for text, field, _spec, conv in parts
]

simplified_str = unparse_parsed_string(simplified_parts)
# Result: "Hello {name}! You have {count} messages."
123
```
124
125
### Encoding Detection for String Conversion
126
127
```python
128
from pyupgrade._string_helpers import is_codec
129
130
# Check if encoding is safe for ASCII conversion
131
def can_convert_to_ascii(encoding_str: str) -> bool:
    """Tell whether *encoding_str* resolves to the ASCII or UTF-8 codec."""
    accepted_codecs = ('ascii', 'utf-8')
    return any(is_codec(encoding_str, codec) for codec in accepted_codecs)

# Usage in string.encode() conversion
encoding = "utf-8"
if can_convert_to_ascii(encoding):
    # Safe to convert "text".encode("utf-8") → b"text"
    pass

# Handle encoding aliases
assert is_codec("utf8", "utf-8")  # True - alias
assert is_codec("ascii", "ascii")  # True - exact
assert is_codec("latin1", "iso8859-1")  # True - standard name
146
```
147
148
### Format String Simplification
149
150
```python
151
def simplify_format_string(format_str: str) -> str:
    """Remove explicit positional keys from a format string when that is safe.

    ``"{0} {1}"`` becomes ``"{} {}"``. Keys are only removed when automatic
    numbering would select the same arguments, i.e. the numeric keys appear
    exactly as 0, 1, 2, ... in order. Otherwise the string is returned
    unchanged, because stripping the keys would either change which argument
    each field uses (``"{1} {0}"``) or produce a string that ``str.format``
    rejects for mixing automatic and manual numbering (``"{0} {1.attr}"``,
    ``"{0:{1}}"``).

    Args:
        format_str: Format string to simplify.

    Returns:
        The simplified format string, or ``format_str`` itself when the
        keys cannot be removed safely.
    """
    parts = parse_format(format_str)

    expected_index = 0
    for _, field, spec, _ in parts:
        if field is None:
            continue  # literal-only part, nothing to check
        if spec and "{" in spec:
            # A nested replacement field takes part in numbering too.
            return format_str
        if field.isdecimal():
            if int(field) != expected_index:
                return format_str  # reordered, repeated or skipped index
            expected_index += 1
        elif not field or field[0].isdecimal() or field[0] in ".[":
            # Already auto-numbered, or a compound positional key such as
            # "0.attr" / "0[1]" that would clash with automatic numbering.
            return format_str

    simplified = [
        (text, "" if field and field.isdecimal() else field, spec, conv)
        for text, field, spec, conv in parts
    ]
    return unparse_parsed_string(simplified)

# Example usage
original = "Item {0}: {1} (price: ${2:.2f})"
simplified = simplify_format_string(original)
# Result: "Item {}: {} (price: ${:.2f})"

# Out-of-order keys are left untouched:
# simplify_format_string("{1} then {0}") returns "{1} then {0}"
170
```
171
172
### Unicode Escape Handling
173
174
```python
175
# parse_format handles named Unicode escapes correctly
176
unicode_format = "Greek letter: \\N{GREEK SMALL LETTER ALPHA} = {value}"
parts = parse_format(unicode_format)

# The \N{...} escape stays inside the literal text instead of being parsed
# as a replacement field. Following the string.Formatter.parse() layout, the
# literal is stored in the same tuple as the field that comes after it:
# [("Greek letter: \\N{GREEK SMALL LETTER ALPHA} = ", "value", "", None)]

reconstructed = unparse_parsed_string(parts)
assert reconstructed == unicode_format
185
```
186
187
### Integration with Token Processing
188
189
```python
190
from pyupgrade._string_helpers import parse_format, unparse_parsed_string
191
from tokenize_rt import Token
192
193
def transform_format_token(token: Token) -> Token:
    """Transform a format string token to remove positional keys.

    The keys are removed only when automatic numbering is guaranteed to pick
    the same arguments: the numeric keys must appear exactly as 0, 1, 2, ...
    in order, and no field may use a compound positional key (``{0.attr}``,
    ``{0[1]}``), automatic numbering (``{}``) or a nested field inside its
    format spec (``{0:{1}}``). In every other case the token is returned
    unchanged, since ``str.format`` refuses strings that mix automatic and
    manual numbering.

    Args:
        token: Token whose ``src`` holds the format string.

    Returns:
        A token with the positional keys removed from ``src``, or the
        original token when nothing can be removed safely.
    """
    try:
        parts = parse_format(token.src)
    except ValueError:
        # Malformed format string, skip transformation
        return token

    next_index = 0
    for _, field, spec, _ in parts:
        if field is None:
            continue  # literal-only part
        if spec and "{" in spec:
            return token  # nested field in the spec takes part in numbering
        if field.isdecimal():
            if int(field) != next_index:
                return token  # reordered, repeated or skipped index
            next_index += 1
        elif not field or field[0].isdecimal() or field[0] in ".[":
            return token  # auto-numbered or compound positional key

    if next_index == 0:
        # No positional keys at all: nothing to rewrite.
        return token

    simplified_parts = [
        (text, "" if field and field.isdecimal() else field, spec, conv)
        for text, field, spec, conv in parts
    ]
    return token._replace(src=unparse_parsed_string(simplified_parts))
218
```
219
220
## Advanced String Processing
221
222
### Format String Validation
223
224
```python
225
def validate_format_string(format_str: str) -> bool:
    """Report whether parse_format() accepts *format_str* without ValueError."""
    try:
        parse_format(format_str)
    except ValueError:
        is_valid = False
    else:
        is_valid = True
    return is_valid
232
233
def count_format_fields(format_str: str) -> int:
    """Return how many parsed parts carry a field; 0 if parsing fails."""
    try:
        total = 0
        for _text, field_name, _spec, _conv in parse_format(format_str):
            if field_name is not None:
                total += 1
    except ValueError:
        # Unparseable format strings are reported as having no fields.
        return 0
    return total
240
```
241
242
### Encoding Safety Checks
243
244
```python
245
def is_safe_binary_conversion(text: str, encoding: str) -> bool:
    """Check if string can be safely converted to binary literal.

    The check is deliberately conservative: it answers False whenever
    rewriting ``"text".encode(encoding)`` as ``b"text"`` could change the
    resulting bytes or produce invalid source.

    Args:
        text: Body of the string literal. The escape checks look for a
            backslash followed by a letter, so this is presumably the
            literal's source text rather than its evaluated value.
        encoding: Encoding name passed to ``.encode()``.

    Returns:
        True only if the encoding is ASCII, UTF-8 or Latin-1 and the text
        contains nothing a bytes literal would reject or read differently.
    """
    # Check encoding compatibility
    if is_codec(encoding, 'ascii') or is_codec(encoding, 'utf-8'):
        latin1 = False
    elif is_codec(encoding, 'iso8859-1'):
        latin1 = True
    else:
        return False

    # Bytes literals may only contain ASCII characters, whatever the codec.
    if not text.isascii():
        return False

    # \u, \U and \N escapes are not recognized inside bytes literals.
    if any(escape in text for escape in ('\\u', '\\U', '\\N')):
        return False

    # A \xNN escape maps to the same single byte only under Latin-1; UTF-8
    # may encode it as two bytes and ASCII may refuse to encode it at all.
    # Octal escapes above \177 have the same problem but are not detected.
    if '\\x' in text and not latin1:
        return False

    return True
264
```