or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

ast-utilities.md, cli.md, core-engine.md, index.md, plugin-system.md, string-processing.md, token-manipulation.md

docs/string-processing.md

0

# String Processing

1

2

Specialized utilities for processing and transforming string literals and format strings. These functions handle the complex parsing and manipulation of Python string formats.

3

4

## Capabilities

5

6

### Format String Parsing

7

8

Parse and manipulate format strings with support for named Unicode escapes.

9

10

```python { .api }

11

def parse_format(s: str) -> list[DotFormatPart]:

12

"""

13

Parse format string into component parts.

14

15

Args:

16

s: Format string to parse (e.g., "Hello {name}!")

17

18

Returns:

19

List of format parts, each containing:

20

- Literal text

21

- Field name (None for literal parts)

22

- Format specification (empty string if the field has none; None for literal-only parts)

23

- Conversion specification (None if not specified)

24

25

Notes:

26

- Handles named Unicode escape sequences (\N{...})

27

- Compatible with string.Formatter.parse()

28

- Preserves all format string information for reconstruction

29

"""

30

31

def unparse_parsed_string(parsed: list[DotFormatPart]) -> str:

32

"""

33

Convert parsed format parts back to format string.

34

35

Args:

36

parsed: List of format parts from parse_format()

37

38

Returns:

39

Reconstructed format string

40

41

Notes:

42

- Escapes curly braces in literal parts

43

- Rebuilds field specifications with proper syntax

44

- Inverse operation of parse_format()

45

"""

46

```

47

48

### String Encoding Utilities

49

50

Utilities for working with string encodings and codecs.

51

52

```python { .api }

53

def is_codec(encoding: str, name: str) -> bool:

54

"""

55

Check if encoding matches codec name.

56

57

Args:

58

encoding: Encoding string to check (e.g., "utf-8", "ascii")

59

name: Codec name to match against

60

61

Returns:

62

True if encoding resolves to the specified codec name

63

64

Notes:

65

- Handles encoding aliases (e.g., "utf8" → "utf-8")

66

- Returns False for unknown encodings

67

- Used to determine safe string-to-binary conversions

68

"""

69

```

70

71

## Type Definitions

72

73

### Format String Components

74

75

```python { .api }

76

DotFormatPart = tuple[str, Optional[str], Optional[str], Optional[str]]

77

"""

78

Format string component tuple.

79

80

Elements:

81

0: Literal text portion

82

1: Field name (None for literal-only parts)

83

2: Format specification (empty string if the field has none; None for literal-only parts)

84

3: Conversion specification (None if not specified)

85

86

Examples:

87

("Hello ", None, None, None) # Literal text

88

("", "name", "", None) # Simple field {name}

89

("", "0", ">10", None) # Formatted field {0:>10}

90

("", "value", "", "r") # Conversion field {value!r}

91

"""

92

```

93

94

## Usage Examples

95

96

### Format String Analysis

97

98

```python

99

from pyupgrade._string_helpers import parse_format, unparse_parsed_string

100

101

# Parse a format string

102

format_str = "Hello {name}! You have {count:d} messages."

103

parts = parse_format(format_str)

104

105

# parts contains:

106

# [

107

# ("Hello ", "name", "", None),

108

# ("! You have ", "count", "d", None),

109

# (" messages.", None, None, None)

112

# ]

113

114

# Modify and reconstruct

115

# Remove format specifications to simplify

116

simplified_parts = [

117

(text, field, None, conv) if field else (text, field, spec, conv)

118

for text, field, spec, conv in parts

119

]

120

121

simplified_str = unparse_parsed_string(simplified_parts)

122

# Result: "Hello {name}! You have {count} messages."

123

```

124

125

### Encoding Detection for String Conversion

126

127

```python

128

from pyupgrade._string_helpers import is_codec

129

130

# Check if encoding is safe for ASCII conversion

131

def can_convert_to_ascii(encoding_str: str) -> bool:

132

"""Check if encoding is ASCII-compatible."""

133

return (is_codec(encoding_str, 'ascii') or

134

is_codec(encoding_str, 'utf-8'))

135

136

# Usage in string.encode() conversion

137

encoding = "utf-8"

138

if can_convert_to_ascii(encoding):

139

# Safe to convert "text".encode("utf-8") → b"text"

140

pass

141

142

# Handle encoding aliases

143

assert is_codec("utf8", "utf-8") # True - alias

144

assert is_codec("ascii", "ascii") # True - exact

145

assert is_codec("latin1", "iso8859-1") # True - standard name

146

```

147

148

### Format String Simplification

149

150

```python

151

def simplify_format_string(format_str: str) -> str:

152

"""Remove positional format keys from format string."""

153

154

parts = parse_format(format_str)

155

simplified = []

156

157

for text, field, spec, conv in parts:

158

if field and field.isdigit():

159

# Remove positional field numbers

160

simplified.append((text, "", spec, conv))

161

else:

162

simplified.append((text, field, spec, conv))

163

164

return unparse_parsed_string(simplified)

165

166

# Example usage

167

original = "Item {0}: {1} (price: ${2:.2f})"

168

simplified = simplify_format_string(original)

169

# Result: "Item {}: {} (price: ${:.2f})"

170

```

171

172

### Unicode Escape Handling

173

174

```python

175

# parse_format handles named Unicode escapes correctly

176

unicode_format = "Greek letter: \\N{GREEK SMALL LETTER ALPHA} = {value}"

177

parts = parse_format(unicode_format)

178

179

# The literal part preserves the Unicode escape:

180

# [("Greek letter: \\N{GREEK SMALL LETTER ALPHA} = ", "value", "", None)]

182

183

reconstructed = unparse_parsed_string(parts)

184

assert reconstructed == unicode_format

185

```

186

187

### Integration with Token Processing

188

189

```python

190

from pyupgrade._string_helpers import parse_format, unparse_parsed_string

191

from tokenize_rt import Token

192

193

def transform_format_token(token: Token) -> Token:

194

"""Transform format string token to remove positional keys."""

195

196

try:

197

parts = parse_format(token.src)

198

except ValueError:

199

# Malformed format string, skip transformation

200

return token

201

202

# Check if all format keys are positional and sequential

203

field_nums = []

204

for _, field, _, _ in parts:

205

if field and field.isdigit():

206

field_nums.append(int(field))

207

208

if field_nums == list(range(len(field_nums))):

209

# Sequential positional keys, safe to remove

210

simplified_parts = [

211

(text, "" if field and field.isdigit() else field, spec, conv)

212

for text, field, spec, conv in parts

213

]

214

new_src = unparse_parsed_string(simplified_parts)

215

return token._replace(src=new_src)

216

217

return token

218

```

219

220

## Advanced String Processing

221

222

### Format String Validation

223

224

```python

225

def validate_format_string(format_str: str) -> bool:

226

"""Check if format string is valid."""

227

try:

228

parse_format(format_str)

229

return True

230

except ValueError:

231

return False

232

233

def count_format_fields(format_str: str) -> int:

234

"""Count number of format fields in string."""

235

try:

236

parts = parse_format(format_str)

237

return sum(1 for _, field, _, _ in parts if field is not None)

238

except ValueError:

239

return 0

240

```

241

242

### Encoding Safety Checks

243

244

```python

245

def is_safe_binary_conversion(text: str, encoding: str) -> bool:

246

"""Check if string can be safely converted to binary literal."""

247

248

# Check encoding compatibility

249

if not (is_codec(encoding, 'ascii') or

250

is_codec(encoding, 'utf-8') or

251

is_codec(encoding, 'iso8859-1')):

252

return False

253

254

# Check for non-ASCII characters with restrictive encodings

255

if not text.isascii() and is_codec(encoding, 'ascii'):

256

return False

257

258

# Check for Unicode escapes that can't be represented

259

if '\\u' in text or '\\U' in text or '\\N' in text:

260

if is_codec(encoding, 'ascii'):

261

return False

262

263

return True

264

```