# File and Byte Processing

Functions for processing files and handling bytes of unknown encoding, including streaming file processing and encoding detection utilities.

## Capabilities

### File Stream Processing

Process text files with automatic encoding detection and line-by-line text fixing.

```python { .api }
def fix_file(
    input_file: TextIO | BinaryIO,
    encoding: str | None = None,
    config: TextFixerConfig | None = None,
    **kwargs: Any
) -> Iterator[str]:
    """
    Fix text found in a file with streaming processing.

    Processes the file line by line, applying text fixes to each line.
    Handles both text and binary file objects, with encoding detection
    for binary files when encoding is not specified.

    Args:
        input_file: File object opened in text or binary mode
        encoding: Encoding name for binary files, None for detection
        config: Configuration object, or None for defaults
        **kwargs: Individual config options

    Yields:
        Fixed lines of text as strings

    Examples:
        >>> with open('messy.txt', 'r') as f:
        ...     for line in fix_file(f):
        ...         print(line, end='')

        >>> with open('unknown.txt', 'rb') as f:
        ...     for line in fix_file(f, encoding='utf-8'):
        ...         print(line, end='')
    """
```
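
The `**kwargs` form lets you override individual options without constructing a `TextFixerConfig`. A minimal sketch, assuming the keyword names mirror the `TextFixerConfig` fields covered in configuration.md:

```python
from ftfy import fix_file

# Pass individual config options as keyword arguments
# (uncurl_quotes and unescape_html are TextFixerConfig fields)
with open('messy.txt', 'r', encoding='utf-8') as f:
    for line in fix_file(f, uncurl_quotes=False, unescape_html=False):
        print(line, end='')
```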

### Byte Encoding Detection

Attempt to decode bytes of unknown encoding using heuristic detection.

```python { .api }
def guess_bytes(bstring: bytes) -> tuple[str, str]:
    """
    Guess a reasonable strategy for decoding bytes in an unknown encoding.

    WARNING: This is not the recommended way to use ftfy. ftfy is not
    designed as an encoding detector. Use only when the encoding is truly
    unknowable and you need a best-effort decode.

    Tries encodings in order: UTF-16 with BOM, UTF-8, utf-8-variants,
    MacRoman (if CR line breaks), sloppy-windows-1252.

    Args:
        bstring: Bytes to decode

    Returns:
        Tuple of (decoded_string, detected_encoding)

    Raises:
        UnicodeError: If input is already a string

    Examples:
        >>> text, encoding = guess_bytes(b'caf\\xc3\\xa9')
        >>> print(f"Text: {text}, Encoding: {encoding}")
        Text: café, Encoding: utf-8

        >>> text, encoding = guess_bytes('cafe'.encode('utf-16'))  # UTF-16 BOM
        >>> print(f"Encoding: {encoding}")
        Encoding: utf-16
    """
```
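
`guess_bytes` only decodes; it does not repair mojibake that was baked into the text before it was encoded. A common follow-up, sketched here, is to pass the decoded string through ftfy's `fix_text`:

```python
from ftfy import fix_text, guess_bytes

raw = b'caf\xc3\xa9'               # bytes in an unknown encoding
text, detected = guess_bytes(raw)  # best-effort decode
fixed = fix_text(text)             # then fix any remaining mojibake
print(f"{detected}: {fixed}")      # utf-8: café
```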

## Usage Examples

### Processing Text Files

```python
from ftfy import fix_file, TextFixerConfig

# Process a file with default settings
with open('input.txt', 'r', encoding='utf-8') as infile:
    with open('output.txt', 'w', encoding='utf-8') as outfile:
        for line in fix_file(infile):
            outfile.write(line)

# Process with custom configuration
config = TextFixerConfig(uncurl_quotes=False, fix_encoding=True)
with open('input.txt', 'r') as infile:
    for line in fix_file(infile, config=config):
        print(line, end='')
```

### Processing Binary Files

```python
from ftfy import fix_file

# Process binary file with known encoding
with open('data.txt', 'rb') as binfile:
    for line in fix_file(binfile, encoding='latin-1'):
        print(line, end='')

# Process binary file with encoding detection (risky)
with open('unknown.txt', 'rb') as binfile:
    for line in fix_file(binfile, encoding=None):  # will use guess_bytes
        print(line, end='')
```

### Standard Input/Output Processing

```python
import sys
from ftfy import fix_file

# Process stdin to stdout
for line in fix_file(sys.stdin):
    sys.stdout.write(line)

# Process stdin as binary with encoding detection
for line in fix_file(sys.stdin.buffer, encoding=None):
    sys.stdout.write(line)
```
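
If the input encoding is known but differs from Python's default, you can wrap the binary stream explicitly instead of relying on detection. A sketch using the standard library's `io.TextIOWrapper`:

```python
import io
import sys
from ftfy import fix_file

# Decode stdin as latin-1 explicitly, then stream fixes to stdout
wrapped = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1')
for line in fix_file(wrapped):
    sys.stdout.write(line)
```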

### Batch File Processing

```python
import os
from ftfy import fix_file, TextFixerConfig

def process_directory(input_dir, output_dir, config=None):
    """Process all text files in a directory."""
    if config is None:
        config = TextFixerConfig()
    os.makedirs(output_dir, exist_ok=True)  # ensure output directory exists

    for filename in os.listdir(input_dir):
        if filename.endswith('.txt'):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)

            with open(input_path, 'rb') as infile:
                with open(output_path, 'w', encoding='utf-8') as outfile:
                    for line in fix_file(infile, config=config):
                        outfile.write(line)

# Process with conservative settings
conservative = TextFixerConfig(
    fix_encoding=True,
    unescape_html=False,
    restore_byte_a0=False
)
process_directory('input/', 'output/', conservative)
```

### Encoding Detection Examples

```python
from ftfy import guess_bytes

# Detect UTF-8
utf8_bytes = "café".encode('utf-8')
text, encoding = guess_bytes(utf8_bytes)
print(f"Detected: {encoding}, Text: {text}")  # utf-8, café

# Detect UTF-16 with BOM
utf16_bytes = "hello".encode('utf-16')
text, encoding = guess_bytes(utf16_bytes)
print(f"Detected: {encoding}")  # utf-16

# Detect MacRoman (non-UTF-8 bytes plus CR line breaks;
# pure ASCII would be detected as UTF-8 first)
macroman_bytes = "café line1\rline2".encode('macroman')
text, encoding = guess_bytes(macroman_bytes)
print(f"Detected: {encoding}")  # macroman

# Default to sloppy-windows-1252
mystery_bytes = bytes([0x80, 0x81, 0x82])  # C1 controls
text, encoding = guess_bytes(mystery_bytes)
print(f"Detected: {encoding}")  # sloppy-windows-1252
```

### Error Handling

```python
from ftfy import fix_file, guess_bytes

# Handle encoding detection errors
def safe_guess_bytes(data):
    """Safely guess byte encoding with a fallback."""
    try:
        return guess_bytes(data)
    except UnicodeDecodeError:
        # Fall back to sloppy decoding
        return data.decode('sloppy-windows-1252', errors='replace'), 'sloppy-windows-1252'

# Handle file processing errors
def safe_fix_file(filepath, output_path):
    """Process a file with error handling."""
    try:
        # Try UTF-8 first
        with open(filepath, 'r', encoding='utf-8') as infile:
            with open(output_path, 'w', encoding='utf-8') as outfile:
                for line in fix_file(infile):
                    outfile.write(line)
    except UnicodeDecodeError:
        # Fall back to binary mode with detection
        with open(filepath, 'rb') as infile:
            with open(output_path, 'w', encoding='utf-8') as outfile:
                for line in fix_file(infile, encoding=None):
                    outfile.write(line)
```

### Large File Processing

```python
from ftfy import fix_file, TextFixerConfig

# Process large files with memory efficiency
def process_large_file(input_path, output_path, chunk_size=1024*1024):
    """Process a large file line by line, bounding how much is decoded at once."""
    config = TextFixerConfig(max_decode_length=chunk_size)

    with open(input_path, 'rb') as infile:
        with open(output_path, 'w', encoding='utf-8') as outfile:
            for line in fix_file(infile, config=config):
                outfile.write(line)

# Disable explanations for performance on large files
fast_config = TextFixerConfig(explain=False, max_decode_length=500000)

with open('huge_file.txt', 'rb') as infile:
    for line in fix_file(infile, config=fast_config):
        # Process line...
        pass
```

### Custom File Processing Pipeline

```python
from ftfy import fix_file, TextFixerConfig
import gzip
import json

def process_jsonl_gz(input_path, output_path):
    """Process a gzipped JSONL file with text fixing."""
    config = TextFixerConfig(unescape_html=False)  # Preserve JSON

    with gzip.open(input_path, 'rt', encoding='utf-8') as infile:
        with gzip.open(output_path, 'wt', encoding='utf-8') as outfile:
            for line in fix_file(infile, config=config):
                try:
                    # Parse and re-serialize to ensure valid JSON
                    data = json.loads(line.strip())
                    json.dump(data, outfile, ensure_ascii=False)
                    outfile.write('\n')
                except json.JSONDecodeError:
                    # Write problematic line as-is
                    outfile.write(line)

# Process log files with terminal escapes
def clean_log_file(input_path, output_path):
    """Clean a log file by removing terminal escapes."""
    config = TextFixerConfig(
        remove_terminal_escapes=True,
        fix_encoding=True,
        unescape_html=False,  # Logs may contain < >
        uncurl_quotes=False   # Preserve original quotes in logs
    )

    with open(input_path, 'rb') as infile:
        with open(output_path, 'w', encoding='utf-8') as outfile:
            for line in fix_file(infile, config=config):
                outfile.write(line)
```