0
# Command Line Interface
1
2
Command-line tool for batch text processing with configurable options for encoding, normalization, and entity handling.
3
4
## Capabilities
5
6
### Command Line Entry Point
7
8
Main function providing command-line access to ftfy text processing.
9
10
```python { .api }
11
def main() -> None:
    """
    Run ftfy as command-line utility.

    Processes files or standard input with configurable text fixing options.
    Handles encoding detection, normalization settings, and HTML entity processing.

    Command line usage:
        ftfy [filename] [options]

    When no filename is given, text is read from standard input.

    Options:
        -o, --output: Output file (default: stdout)
        -g, --guess: Guess input encoding (risky)
        -e, --encoding: Specify input encoding (default: utf-8)
        -n, --normalization: Unicode normalization (default: NFC; "none" disables it)
        --preserve-entities: Don't decode HTML entities

    Examples:
        ftfy input.txt -o output.txt
        ftfy -g mystery.txt
        cat file.txt | ftfy > cleaned.txt
    """
33
```
34
35
## Command Line Usage
36
37
### Basic File Processing
38
39
```bash
40
# Fix a single file, output to stdout
41
ftfy broken_text.txt
42
43
# Fix file and save to new file
44
ftfy input.txt -o fixed_output.txt
45
46
# Process standard input
47
cat messy_file.txt | ftfy > clean_file.txt
48
echo "âœ” mojibake" | ftfy
49
```
50
51
### Encoding Options
52
53
```bash
54
# Specify input encoding explicitly
55
ftfy --encoding latin-1 oldfile.txt
56
57
# Let ftfy guess the encoding (not recommended)
58
ftfy --guess mystery_encoding.txt
59
60
# Process file with unknown encoding
61
ftfy -g -o output.txt unknown_file.txt
62
```
63
64
### Normalization and Entity Options
65
66
```bash
67
# Disable Unicode normalization
68
ftfy --normalization none input.txt
69
70
# Use NFD normalization instead of default NFC
71
ftfy --normalization NFD input.txt
72
73
# Preserve HTML entities (don't decode them)
74
ftfy --preserve-entities html_file.txt
75
76
# Combine options
77
ftfy -e latin-1 -n NFD --preserve-entities input.txt -o output.txt
78
```
79
80
### Batch Processing Examples
81
82
```bash
83
# Process all .txt files in directory
for file in *.txt; do
    [ -e "$file" ] || continue                  # the glob matched nothing
    case "$file" in fixed_*) continue ;; esac   # skip output of an earlier run
    ftfy "$file" -o "fixed_$file"
done

# Process files preserving directory structure
# (results of an earlier run, *_fixed.txt, are excluded so they are not fixed again)
find . -name "*.txt" ! -name "*_fixed.txt" \
    -exec sh -c 'ftfy "$1" -o "${1%.txt}_fixed.txt"' _ {} \;

# Process with encoding detection for mixed files
# ({} is passed as its own argument; embedding it in "{}.fixed" is not portable)
find . -name "*.txt" -exec sh -c 'ftfy -g -o "$1.fixed" "$1"' _ {} \;
93
```
94
95
## Python API Access
96
97
You can also access CLI functionality programmatically:
98
99
```python
100
from ftfy.cli import main
101
import sys
102
103
# Simulate command line arguments
104
sys.argv = ['ftfy', 'input.txt', '-o', 'output.txt', '--encoding', 'latin-1']
105
main()
106
```
107
108
## Usage Examples from Python
109
110
### Replicating CLI Behavior
111
112
```python
113
from ftfy import fix_file, TextFixerConfig
114
import sys
115
116
def cli_equivalent(input_file, output_file=None, encoding='utf-8',
                   normalization='NFC', preserve_entities=False, guess=False):
    """Replicate CLI behavior in Python.

    Args:
        input_file: Path of the file to fix, or '-' to read standard input.
        output_file: Path to write to; None or '-' writes to standard output.
        encoding: Encoding used to decode the input. Ignored when ``guess``
            is true.
        normalization: Unicode normalization form name. The string 'none'
            (any case) or None disables normalization.
        preserve_entities: When true, HTML entities are not decoded.
        guess: When true, ``encoding=None`` is passed to ``fix_file`` so the
            input encoding is guessed instead of specified.

    Raises:
        ValueError: If ``input_file`` and ``output_file`` name the same file.
    """
    import os  # local import: only needed for the same-file check below

    read_stdin = input_file == '-'
    write_stdout = output_file is None or output_file == '-'

    # Opening the output with mode 'w' truncates it immediately; if it is the
    # input file, the text would be destroyed before it is ever read.
    if (not read_stdin and not write_stdout
            and os.path.realpath(input_file) == os.path.realpath(output_file)):
        raise ValueError(
            "Input and output are the same file; refusing to overwrite "
            f"{input_file!r} while reading it."
        )

    if guess:
        encoding = None

    unescape_html = False if preserve_entities else "auto"
    if normalization is not None and normalization.lower() == 'none':
        normalization = None

    config = TextFixerConfig(
        unescape_html=unescape_html,
        normalization=normalization
    )

    # Input is read as bytes so it can be decoded with the chosen encoding.
    infile = sys.stdin.buffer if read_stdin else open(input_file, 'rb')
    try:
        # Opened inside the try so the input is still closed if this fails.
        if write_stdout:
            outfile = sys.stdout
        else:
            outfile = open(output_file, 'w', encoding='utf-8')
        try:
            for line in fix_file(infile, encoding=encoding, config=config):
                outfile.write(line)
        finally:
            # Never close the process-wide standard streams.
            if not write_stdout:
                outfile.close()
    finally:
        if not read_stdin:
            infile.close()
151
152
# Usage examples
153
cli_equivalent('messy.txt', 'clean.txt')
154
cli_equivalent('latin1.txt', encoding='latin-1', preserve_entities=True)
155
cli_equivalent('unknown.txt', guess=True)
156
```
157
158
### Error Handling
159
160
The CLI handles various error conditions:
161
162
```python
163
import sys
164
from ftfy.cli import main
165
166
# Test error conditions
167
test_cases = [
168
# Same input and output file
169
['ftfy', 'test.txt', '-o', 'test.txt'],
170
171
# Invalid encoding
172
['ftfy', 'test.txt', '-e', 'invalid-encoding'],
173
174
# Non-existent input file
175
['ftfy', 'nonexistent.txt']
176
]
177
178
for args in test_cases:
179
print(f"Testing: {' '.join(args)}")
180
sys.argv = args
181
try:
182
main()
183
print("Success")
184
except SystemExit as e:
185
print(f"Exit code: {e.code}")
186
except Exception as e:
187
print(f"Error: {e}")
188
print()
189
```
190
191
### Integration with Shell Scripts
192
193
```bash
194
#!/bin/bash
195
# Script to clean up text files from various sources
196
197
FTFY_OPTIONS="--encoding utf-8 --normalization NFC"
198
199
# Function to process file with error handling
200
#######################################
# Fix one file with ftfy, retrying with encoding detection if that fails.
# Globals:   FTFY_OPTIONS (read) - whitespace-separated options for ftfy
# Arguments: $1 - input path
#            $2 - output path
# Outputs:   success messages on stdout, failure messages on stderr
# Returns:   0 if either attempt succeeds, 1 if both fail
#######################################
process_file() {
    local input="$1"
    local output="$2"
    local -a options=()

    # Split the option string into words WITHOUT pathname expansion, so a
    # value such as "*" in FTFY_OPTIONS is passed through literally.
    IFS=$' \t\n' read -r -a options <<< "${FTFY_OPTIONS:-}"

    # ftfy's own error text is discarded on purpose; this function reports
    # the outcome itself.
    if ftfy "${options[@]}" "$input" -o "$output" 2>/dev/null; then
        echo "✓ Processed: $input → $output"
    else
        echo "✗ Failed to process: $input" >&2
        # Try with encoding detection as fallback
        if ftfy --guess "$input" -o "$output" 2>/dev/null; then
            echo "✓ Processed with encoding detection: $input → $output"
        else
            echo "✗ Complete failure: $input" >&2
            return 1
        fi
    fi
}
217
218
# Process all text files
219
# NUL-delimited names survive spaces, backslashes and newlines in paths.
# Outputs of earlier runs (*_clean.txt) are skipped so they are not cleaned again.
find . -name "*.txt" ! -name "*_clean.txt" -print0 |
    while IFS= read -r -d '' file; do
        process_file "$file" "${file%.txt}_clean.txt"
    done
222
```
223
224
### Pipeline Integration
225
226
```bash
227
# Integration with common text processing pipelines
228
229
# Clean web scraping results
230
curl -s "https://example.com" | html2text | ftfy > clean_content.txt
231
232
# Process CSV files with text cleaning
233
csvcut -c description messy_data.csv | ftfy > clean_descriptions.txt
234
235
# Clean up log files
236
tail -f application.log | ftfy --preserve-entities > clean.log
237
238
# Database export cleaning
239
pg_dump --data-only --table=mytable mydb | ftfy -g > clean_export.sql
240
241
# Clean and normalize for analysis
242
cat survey_responses.txt | ftfy --normalization NFKC > normalized.txt
243
```
244
245
### Advanced CLI Usage
246
247
```bash
248
# Process files with specific configurations for different use cases
249
250
# Web content: preserve HTML entities, normalize for display
251
ftfy --preserve-entities --normalization NFC web_content.txt
252
253
# Database text: aggressive cleaning, compatibility normalization
254
ftfy --normalization NFKC --encoding utf-8 database_dump.txt
255
256
# Log processing: preserve structure, clean terminal escapes
257
ftfy --preserve-entities log_file.txt | grep -v "^\s*$" > clean_log.txt
258
259
# Scientific text: preserve Unicode, minimal normalization
260
ftfy --normalization NFD scientific_paper.txt
261
262
# Legacy system integration: guess encoding, normalize for compatibility
263
ftfy --guess --normalization NFKC legacy_export.txt
264
```