# File and Byte Processing

Functions for processing files and handling bytes of unknown encoding, including streaming file processing and encoding detection utilities.

## Capabilities

### File Stream Processing

Process text files with automatic encoding detection and line-by-line text fixing.

```python { .api }
def fix_file(
    input_file: TextIO | BinaryIO,
    encoding: str | None = None,
    config: TextFixerConfig | None = None,
    **kwargs: Any
) -> Iterator[str]:
    """
    Fix text found in a file with streaming processing.

    Processes the file line by line, applying text fixes to each line.
    Handles both text and binary file objects, with encoding detection
    for binary files when encoding is not specified.

    Args:
        input_file: File object opened in text or binary mode
        encoding: Encoding name for binary files, or None for detection
        config: Configuration object, or None for defaults
        **kwargs: Individual config options

    Yields:
        Fixed lines of text as strings

    Examples:
        >>> with open('messy.txt', 'r') as f:
        ...     for line in fix_file(f):
        ...         print(line, end='')

        >>> with open('unknown.txt', 'rb') as f:
        ...     for line in fix_file(f, encoding='utf-8'):
        ...         print(line, end='')
    """
```

### Byte Encoding Detection

Attempt to decode bytes of unknown encoding using heuristic detection.

```python { .api }
def guess_bytes(bstring: bytes) -> tuple[str, str]:
    """
    Guess a reasonable strategy for decoding bytes in an unknown encoding.

    WARNING: This is not the recommended way to use ftfy. ftfy is not
    designed as an encoding detector. Use only when the encoding is truly
    unknowable and you need a best-effort decode.

    Tries encodings in order: UTF-16 with BOM, UTF-8, utf-8-variants,
    MacRoman (if CR line breaks), sloppy-windows-1252.

    Args:
        bstring: Bytes to decode

    Returns:
        Tuple of (decoded_string, detected_encoding)

    Raises:
        UnicodeError: If input is already a string

    Examples:
        >>> text, encoding = guess_bytes(b'caf\\xc3\\xa9')
        >>> print(f"Text: {text}, Encoding: {encoding}")
        Text: café, Encoding: utf-8

        >>> text, encoding = guess_bytes(b'\\xff\\xfecafe')  # UTF-16 BOM
        >>> print(f"Encoding: {encoding}")
        Encoding: utf-16
    """
```

## Usage Examples

### Processing Text Files

```python
from ftfy import fix_file, TextFixerConfig

# Process file with default settings
with open('input.txt', 'r', encoding='utf-8') as infile:
    with open('output.txt', 'w', encoding='utf-8') as outfile:
        for line in fix_file(infile):
            outfile.write(line)

# Process with custom configuration
config = TextFixerConfig(uncurl_quotes=False, fix_encoding=True)
with open('input.txt', 'r') as infile:
    for line in fix_file(infile, config=config):
        print(line, end='')
```

### Processing Binary Files

```python
from ftfy import fix_file

# Process binary file with known encoding
with open('data.txt', 'rb') as binfile:
    for line in fix_file(binfile, encoding='latin-1'):
        print(line, end='')

# Process binary file with encoding detection (risky)
with open('unknown.txt', 'rb') as binfile:
    for line in fix_file(binfile, encoding=None):  # Will use guess_bytes
        print(line, end='')
```

### Standard Input/Output Processing

```python
import sys
from ftfy import fix_file

# Process stdin to stdout
for line in fix_file(sys.stdin):
    sys.stdout.write(line)

# Process stdin as binary with encoding detection
for line in fix_file(sys.stdin.buffer, encoding=None):
    sys.stdout.write(line)
```

### Batch File Processing

```python
import os
from ftfy import fix_file, TextFixerConfig

def process_directory(input_dir, output_dir, config=None):
    """Process all text files in a directory."""
    if config is None:
        config = TextFixerConfig()

    for filename in os.listdir(input_dir):
        if filename.endswith('.txt'):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)

            with open(input_path, 'rb') as infile:
                with open(output_path, 'w', encoding='utf-8') as outfile:
                    for line in fix_file(infile, config=config):
                        outfile.write(line)

# Process with conservative settings
conservative = TextFixerConfig(
    fix_encoding=True,
    unescape_html=False,
    restore_byte_a0=False
)
process_directory('input/', 'output/', conservative)
```

### Encoding Detection Examples

```python
from ftfy import guess_bytes

# Detect UTF-8
utf8_bytes = "café".encode('utf-8')
text, encoding = guess_bytes(utf8_bytes)
print(f"Detected: {encoding}, Text: {text}")  # utf-8, café

# Detect UTF-16 with BOM
utf16_bytes = "hello".encode('utf-16')
text, encoding = guess_bytes(utf16_bytes)
print(f"Detected: {encoding}")  # utf-16

# Detect MacRoman (by CR line breaks; the bytes must not also be
# valid UTF-8, since UTF-8 is tried first)
macroman_bytes = "café\rline2".encode('macroman')
text, encoding = guess_bytes(macroman_bytes)
print(f"Detected: {encoding}")  # macroman

# Default to sloppy-windows-1252
mystery_bytes = bytes([0x80, 0x81, 0x82])  # C1 controls
text, encoding = guess_bytes(mystery_bytes)
print(f"Detected: {encoding}")  # sloppy-windows-1252
```

### Error Handling

```python
from ftfy import fix_file, guess_bytes

# Handle encoding detection errors
def safe_guess_bytes(data):
    """Safely guess byte encoding with fallback."""
    try:
        return guess_bytes(data)
    except UnicodeDecodeError:
        # Fallback to sloppy decoding
        return data.decode('sloppy-windows-1252', errors='replace'), 'sloppy-windows-1252'

# Handle file processing errors
def safe_fix_file(filepath, output_path):
    """Process file with error handling."""
    try:
        # Try UTF-8 text mode first; a decode error surfaces while
        # iterating, inside this try block
        with open(filepath, 'r', encoding='utf-8') as infile:
            with open(output_path, 'w', encoding='utf-8') as outfile:
                for line in fix_file(infile):
                    outfile.write(line)
    except UnicodeDecodeError:
        # Fall back to binary mode with encoding detection
        with open(filepath, 'rb') as infile:
            with open(output_path, 'w', encoding='utf-8') as outfile:
                for line in fix_file(infile, encoding=None):
                    outfile.write(line)
```

### Large File Processing

```python
from ftfy import fix_file, TextFixerConfig

# Process large files with memory efficiency
def process_large_file(input_path, output_path, chunk_size=1024*1024):
    """Process large file in chunks to manage memory."""
    config = TextFixerConfig(max_decode_length=chunk_size)

    with open(input_path, 'rb') as infile:
        with open(output_path, 'w', encoding='utf-8') as outfile:
            for line in fix_file(infile, config=config):
                outfile.write(line)

# Disable explanations for performance on large files
fast_config = TextFixerConfig(explain=False, max_decode_length=500000)

with open('huge_file.txt', 'rb') as infile:
    for line in fix_file(infile, config=fast_config):
        # Process line...
        pass
```

### Custom File Processing Pipeline

```python
from ftfy import fix_file, TextFixerConfig
import gzip
import json

def process_jsonl_gz(input_path, output_path):
    """Process gzipped JSONL file with text fixing."""
    config = TextFixerConfig(unescape_html=False)  # Preserve JSON

    with gzip.open(input_path, 'rt', encoding='utf-8') as infile:
        with gzip.open(output_path, 'wt', encoding='utf-8') as outfile:
            for line in fix_file(infile, config=config):
                try:
                    # Parse and re-serialize to ensure valid JSON
                    data = json.loads(line.strip())
                    json.dump(data, outfile, ensure_ascii=False)
                    outfile.write('\n')
                except json.JSONDecodeError:
                    # Write problematic line as-is
                    outfile.write(line)

# Process log files with terminal escapes
def clean_log_file(input_path, output_path):
    """Clean log file by removing terminal escapes."""
    config = TextFixerConfig(
        remove_terminal_escapes=True,
        fix_encoding=True,
        unescape_html=False,  # Logs may contain < >
        uncurl_quotes=False   # Preserve original quotes in logs
    )

    with open(input_path, 'rb') as infile:
        with open(output_path, 'w', encoding='utf-8') as outfile:
            for line in fix_file(infile, config=config):
                outfile.write(line)
```