or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cli.md, configuration.md, file-processing.md, formatting.md, index.md, individual-fixes.md, text-fixing.md, utilities.md

docs/utilities.md

0

# Utilities and Debugging

1

2

Debugging and utility functions for understanding Unicode text and applying transformation plans manually.

3

4

## Capabilities

5

6

### Unicode Text Analysis

7

8

Debugging utility for analyzing Unicode text character by character.

9

10

```python { .api }

11

def explain_unicode(text: str) -> None:

12

"""

13

Debug utility showing detailed Unicode information for each character.

14

15

Prints character-by-character breakdown showing Unicode codepoint,

16

glyph, category, and name for debugging mysterious Unicode text.

17

Output goes to stdout and is intended for interactive debugging.

18

19

Args:

20

text: Unicode string to analyze

21

22

Returns:

23

None (prints to stdout)

24

25

Examples:

26

>>> explain_unicode('café')

27

U+0063 c [Ll] LATIN SMALL LETTER C

28

U+0061 a [Ll] LATIN SMALL LETTER A

29

U+0066 f [Ll] LATIN SMALL LETTER F

30

U+00E9 é [Ll] LATIN SMALL LETTER E WITH ACUTE

31

32

>>> explain_unicode('😀🎉')

33

U+1F600 😀 [So] GRINNING FACE

34

U+1F389 🎉 [So] PARTY POPPER

35

"""

36

```

37

38

### Transformation Plan Application

39

40

Function for manually applying transformation plans generated by ftfy's explanation system.

41

42

```python { .api }

43

# Dictionary mapping fixer names to functions for use with apply_plan

44

FIXERS: dict[str, Callable] = {

45

"unescape_html": fixes.unescape_html,

46

"remove_terminal_escapes": fixes.remove_terminal_escapes,

47

"restore_byte_a0": fixes.restore_byte_a0,

48

"replace_lossy_sequences": fixes.replace_lossy_sequences,

49

"decode_inconsistent_utf8": fixes.decode_inconsistent_utf8,

50

"fix_c1_controls": fixes.fix_c1_controls,

51

"fix_latin_ligatures": fixes.fix_latin_ligatures,

52

"fix_character_width": fixes.fix_character_width,

53

"uncurl_quotes": fixes.uncurl_quotes,

54

"fix_line_breaks": fixes.fix_line_breaks,

55

"fix_surrogates": fixes.fix_surrogates,

56

"remove_control_chars": fixes.remove_control_chars,

57

}

58

59

def apply_plan(text: str, plan: list[tuple[str, str]]) -> str:

60

"""

61

Apply sequence of text transformations from explanation plan.

62

63

Takes transformation plan (list of operation/parameter tuples) and

64

applies each step in sequence. Useful for replaying ftfy fixes or

65

applying custom transformation sequences.

66

67

Args:

68

text: Initial text or bytes to transform

69

plan: List of (operation, parameter) tuples

70

71

Returns:

72

Final transformed text

73

74

Operations:

75

"encode": Convert string to bytes using parameter as encoding

76

"decode": Convert bytes to string using parameter as encoding

77

"transcode": Apply bytes→bytes function named in parameter

78

"apply": Apply string→string function named in parameter

79

80

Examples:

81

>>> plan = [('encode', 'latin-1'), ('decode', 'utf-8')]

82

>>> apply_plan('só', plan)

83

'só'

84

85

>>> plan = [('apply', 'uncurl_quotes'), ('apply', 'fix_line_breaks')]

86

>>> apply_plan('“curly quotes”\\r\\n', plan)

87

'"curly quotes"\\n'

88

"""

89

```

90

91

### Heuristic Text Analysis

92

93

Functions for detecting whether text contains mojibake or other problems.

94

95

```python { .api }

96

def badness(text: str) -> int:

97

"""

98

Count the number of unlikely character sequences in text.

99

100

Returns numerical badness score by counting mojibake patterns.

101

Higher scores indicate more likely encoding problems. Score > 0

102

indicates text likely contains mojibake.

103

104

Args:

105

text: Unicode string to analyze

106

107

Returns:

108

Number of unlikely character sequences found

109

110

Examples:

111

>>> from ftfy.badness import badness

112

>>> badness("normal text")

113

0

114

>>> badness("âœ” broken") # Multiple mojibake patterns

115

2

116

"""

117

118

def is_bad(text: str) -> bool:

119

"""

120

Heuristic detection of likely mojibake in text.

121

122

Uses statistical analysis of Unicode character patterns to detect

123

text that likely contains encoding problems. Designed to minimize

124

false positives while catching common mojibake patterns.

125

126

Args:

127

text: Unicode string to analyze

128

129

Returns:

130

True if text likely contains mojibake, False otherwise

131

132

Examples:

133

>>> from ftfy.badness import is_bad

134

>>> is_bad("normal text")

135

False

136

>>> is_bad("âœ” broken") # Mojibake pattern

137

True

138

"""

139

```

140

141

## Usage Examples

142

143

### Unicode Text Debugging

144

145

```python

146

from ftfy import explain_unicode

147

148

# Debug mysterious characters

149

mysterious_text = "Weird chars: \u00a0\u200b\u2019"

150

print("Analyzing mysterious text:")

151

explain_unicode(mysterious_text)

152

153

# Debug emoji and special characters

154

emoji_text = "🎉🔥💯"

155

print("\nAnalyzing emoji:")

156

explain_unicode(emoji_text)

157

158

# Debug potential mojibake

159

mojibake = "café" # This might be mojibake

160

print("\nAnalyzing potential mojibake:")

161

explain_unicode(mojibake)

162

```

163

164

### Transformation Plan Replay

165

166

```python

167

from ftfy import fix_and_explain, apply_plan

168

169

# Get explanation for a fix

170

broken_text = "só"

171

result = fix_and_explain(broken_text)

172

print(f"Original: {broken_text}")

173

print(f"Fixed: {result.text}")

174

print(f"Plan: {result.explanation}")

175

176

# Convert ExplanationStep objects to tuples for apply_plan

177

plan_tuples = [(step.action, step.parameter) for step in result.explanation]

178

179

# Replay the same transformation on similar text

180

similar_text = "José" # Same type of mojibake

181

replayed = apply_plan(similar_text, plan_tuples)

182

print(f"Replayed fix: {similar_text} → {replayed}")

183

```

184

185

### Custom Transformation Sequences

186

187

```python

188

from ftfy import apply_plan

189

from ftfy import FIXERS

190

191

# Check available transformations

192

print("Available fixers:", list(FIXERS.keys()))

193

194

# Build custom transformation plan

195

custom_plan = [

196

('apply', 'remove_terminal_escapes'),

197

('apply', 'unescape_html'),

198

('apply', 'uncurl_quotes'),

199

('apply', 'fix_character_width')

200

]

201

202

# Apply custom sequence

203

messy_text = '\x1b[31m<"curly">\x1b[0m WIDE'

204

cleaned = apply_plan(messy_text, custom_plan)

205

print(f"Custom clean: {messy_text} → {cleaned}")

206

```

207

208

### Encoding Transformation Plans

209

210

```python

211

from ftfy import apply_plan

212

213

# Manually specify encoding transformations

214

encoding_plan = [

215

('encode', 'latin-1'), # String → bytes as latin-1

216

('decode', 'utf-8') # Bytes → string as utf-8

217

]

218

219

mojibake_texts = ['café', 'naïve', 'résumé']

220

for text in mojibake_texts:

221

try:

222

fixed = apply_plan(text, encoding_plan)

223

print(f"{text} → {fixed}")

224

except UnicodeError as e:

225

print(f"{text} → Error: {e}")

226

```

227

228

### Mojibake Detection

229

230

```python

231

from ftfy.badness import is_bad, badness

232

from ftfy import fix_text

233

234

test_strings = [

235

"Normal English text",

236

"Regular café",

237

"âœ” mojibake pattern",

238

"Broken text™ with weird chars",

239

"Standard Unicode: 你好世界",

240

"Currency symbols: €£¥",

241

"só definite mojibake"

242

]

243

244

print("Mojibake detection results:")

245

for text in test_strings:

246

bad = is_bad(text)

247

score = badness(text)

248

if bad:

249

fixed = fix_text(text)

250

print(f"😱 BAD (score {score}): '{text}' → '{fixed}'")

251

else:

252

print(f"✅ OK (score {score}): '{text}'")

253

```

254

255

### Debugging Text Processing Pipeline

256

257

```python

258

from ftfy import fix_and_explain, apply_plan, explain_unicode

259

from ftfy.badness import is_bad, badness

260

261

def debug_text_processing(text):

262

"""Comprehensive text debugging pipeline."""

263

264

print(f"=== Debugging: '{text}' ===")

265

266

# Check if text looks problematic

267

bad_score = badness(text)

268

print(f"Looks bad: {is_bad(text)} (badness score: {bad_score})")

269

270

# Show character details

271

print("\nCharacter analysis:")

272

explain_unicode(text)

273

274

# Try fixing and get explanation

275

result = fix_and_explain(text)

276

print(f"\nFixed: '{result.text}'")

277

278

if result.explanation:

279

print(f"Transformations applied: {len(result.explanation)}")

280

for i, step in enumerate(result.explanation, 1):

281

print(f" {i}. {step.action}: {step.parameter}")

282

283

# Test plan replay

284

plan_tuples = [(s.action, s.parameter) for s in result.explanation]

285

replayed = apply_plan(text, plan_tuples)

286

print(f"Plan replay result: '{replayed}'")

287

print(f"Replay matches: {replayed == result.text}")

288

else:

289

print("No transformations needed")

290

291

print()

292

293

# Debug various problematic texts

294

debug_texts = [

295

"âœ” Check mark mojibake",

296

"Normal text",

297

"só encoding issue",

298

'\x1b[31mTerminal\x1b[0m escapes'

299

]

300

301

for text in debug_texts:

302

debug_text_processing(text)

303

```

304

305

### Plan Composition and Analysis

306

307

```python

308

from ftfy import apply_plan, fix_and_explain

309

310

def analyze_transformation_effects(text, individual_plans):

311

"""Test individual transformations vs combined effect."""

312

313

print(f"Original: '{text}'")

314

315

# Apply individual transformations

316

print("\nIndividual transformations:")

317

current = text

318

for plan_name, plan in individual_plans.items():

319

try:

320

result = apply_plan(current, plan)

321

if result != current:

322

print(f" {plan_name}: '{current}' → '{result}'")

323

current = result

324

else:

325

print(f" {plan_name}: no change")

326

except Exception as e:

327

print(f" {plan_name}: ERROR {e}")

328

329

print(f"Sequential result: '{current}'")

330

331

# Compare with ftfy's automatic processing

332

auto_result = fix_and_explain(text)

333

print(f"ftfy result: '{auto_result.text}'")

334

print(f"Results match: {current == auto_result.text}")

335

336

# Test transformation composition

337

plans = {

338

'html_unescape': [('apply', 'unescape_html')],

339

'terminal_clean': [('apply', 'remove_terminal_escapes')],

340

'quote_fix': [('apply', 'uncurl_quotes')],

341

'encoding_fix': [('encode', 'latin-1'), ('decode', 'utf-8')]

342

}

343

344

complex_text = '\x1b[32m<"Problematic">\x1b[0m text with sóme issues'

345

analyze_transformation_effects(complex_text, plans)

346

```