# Compression and Encoding

Automatic and explicit compression handling for multiple formats, with streaming support. smart_open provides transparent compression and decompression that works consistently across all supported storage systems.

## Capabilities

### Compression Management

Register custom compression handlers and manage compression behavior.

```python { .api }
def register_compressor(ext, callback):
    """Register a compression handler for a file extension.

    Parameters:
        ext: str - File extension with leading dot (e.g., '.gz', '.custom')
        callback: callable - Function accepting (file_obj, mode) and returning a wrapped file object

    Notes:
        The callback should return a file-like object that handles compression/decompression.
        The mode parameter indicates 'rb'/'wb' for binary or text mode intent.
    """

def get_supported_compression_types():
    """Get the list of supported compression type identifiers.

    Returns:
        list[str] - Compression types such as ['gzip', 'bz2', 'zstandard', 'disable', 'infer_from_extension']
    """

def get_supported_extensions():
    """Get the list of supported compressed file extensions.

    Returns:
        list[str] - File extensions such as ['.gz', '.bz2', '.zst', '.xz', '.lzma']
    """

def compression_wrapper(file_obj, mode, compression='infer_from_extension', filename=None):
    """Wrap a file object with the appropriate compression handler.

    Parameters:
        file_obj: file-like object - Base file object to wrap
        mode: str - File mode governing compression behavior
        compression: str - Compression type or 'infer_from_extension'
        filename: str - Filename used for extension-based inference

    Returns:
        file-like object - The wrapped file object, or the original if no compression applies
    """
```

### Compression Constants

```python { .api }
# Compression behavior constants
NO_COMPRESSION = 'disable'
INFER_FROM_EXTENSION = 'infer_from_extension'
```

## Supported Compression Formats

smart_open supports multiple compression formats out of the box:

- **gzip** (`.gz`) - Most common; good balance of compression ratio and speed
- **bzip2** (`.bz2`) - Higher compression ratio, but slower
- **zstandard** (`.zst`) - Modern format with excellent compression and speed
- **xz/lzma** (`.xz`, `.lzma`) - High compression ratio
- **lz4** (`.lz4`) - Very fast compression/decompression; available by registering a custom handler
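The trade-offs above can be checked directly with the standard library, which ships gzip, bzip2, and xz codecs (exact sizes vary with the data and compression level):

```python
import bz2
import gzip
import lzma

# Repetitive, highly compressible payload
payload = b"log line: status=ok\n" * 5_000

sizes = {
    "raw": len(payload),
    "gzip": len(gzip.compress(payload)),
    "bz2": len(bz2.compress(payload)),
    "xz": len(lzma.compress(payload)),
}

for name, size in sizes.items():
    print(f"{name:5s} {size:>8d} bytes")
```

Every codec shrinks this payload dramatically; which one wins depends on the data, so benchmark with your own workload before committing to a format.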

## Usage Examples

### Automatic Compression Detection

```python
import json

from smart_open import open

# Compression automatically detected from file extension
with open('s3://bucket/data.txt.gz') as f:
    uncompressed_text = f.read()  # Automatically decompressed

with open('gs://bucket/logs.txt.bz2') as f:
    for line in f:  # Line-by-line decompression
        process_log_line(line)

# Writing compressed files (automatic compression)
with open('s3://bucket/output.txt.gz', 'w') as f:
    f.write('This will be compressed with gzip')

with open('azure://container/data.json.zst', 'w') as f:
    json.dump(large_data, f)  # Compressed with zstandard
```

### Explicit Compression Control

```python
# Explicitly specify the compression type
with open('s3://bucket/data.txt', compression='gzip') as f:
    content = f.read()

# Disable compression for files with compression extensions
with open('s3://bucket/already-compressed.gz', compression='disable') as f:
    raw_compressed_data = f.read()  # Read as-is, no decompression

# Force compression on write
with open('gs://bucket/output.txt', 'w', compression='bz2') as f:
    f.write('This will be compressed with bzip2')
```

### Binary vs Text Mode with Compression

```python
import csv

# Binary mode with compression
with open('s3://bucket/binary-data.dat.gz', 'rb') as f:
    decompressed_bytes = f.read()

with open('s3://bucket/output.bin.zst', 'wb') as f:
    f.write(binary_data)  # Compressed binary write

# Text mode with compression and encoding
with open('gs://bucket/unicode-text.txt.gz', encoding='utf-8') as f:
    unicode_text = f.read()

with open('azure://container/output.csv.bz2', 'w', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(data)
```

### Custom Compression Handlers

```python
from smart_open import open, register_compressor
import lz4.frame

# Register an LZ4 compression handler
def lz4_handler(file_obj, mode):
    if 'r' in mode:
        return lz4.frame.open(file_obj, mode='rb')
    else:
        return lz4.frame.open(file_obj, mode='wb')

register_compressor('.lz4', lz4_handler)

# Now .lz4 files work automatically
with open('s3://bucket/data.txt.lz4') as f:
    content = f.read()

# Custom compression with parameters; use a single-suffix extension,
# since extension-based inference only looks at the final suffix
def custom_gzip_handler(file_obj, mode):
    import gzip
    if 'r' in mode:
        return gzip.open(file_obj, mode='rt', encoding='utf-8')
    else:
        return gzip.open(file_obj, mode='wt', encoding='utf-8', compresslevel=9)

register_compressor('.cgz', custom_gzip_handler)
```

## Performance Considerations

### Compression Format Selection

```python
# For maximum compression (slower)
with open('s3://bucket/archive.txt.bz2', 'w') as f:
    f.write(large_text_data)

# For the fastest compression/decompression
# (requires the custom .lz4 handler registered above)
with open('s3://bucket/fast-access.txt.lz4', 'w') as f:
    f.write(frequently_accessed_data)

# Good balance of speed and compression
with open('s3://bucket/balanced.txt.zst', 'w') as f:
    f.write(general_purpose_data)

# Traditional web standard
with open('s3://bucket/web-compatible.txt.gz', 'w') as f:
    f.write(web_data)
```

### Streaming Compression

```python
# Stream large files with compression
with open('s3://bucket/huge-file.txt.gz') as f:
    for line in f:  # Memory-efficient line-by-line decompression
        process_line(line)

# Chunked reading with compression
with open('gs://bucket/large-binary.dat.zst', 'rb') as f:
    while True:
        chunk = f.read(64 * 1024)  # 64 KB chunks, decompressed
        if not chunk:
            break
        process_chunk(chunk)

# Streaming write with compression
with open('azure://container/stream-output.txt.gz', 'w') as f:
    for record in generate_large_dataset():
        f.write(f"{record}\n")  # Compressed on the fly
```
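The same chunked pattern can be exercised locally with the standard library alone, which is handy for testing the streaming logic without any remote storage (a stdlib sketch, not smart_open-specific):

```python
import gzip
import io

# Build a gzip stream in memory, record by record
buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode="wb") as f:
    for i in range(1_000):
        f.write(f"record {i}\n".encode())  # compressed on the fly

# Stream it back in fixed-size chunks without loading everything at once
buf.seek(0)
total = 0
with gzip.GzipFile(fileobj=buf, mode="rb") as f:
    while True:
        chunk = f.read(64 * 1024)
        if not chunk:
            break
        total += len(chunk)

print(total)  # total decompressed bytes
```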

### Compression Level Control

```python
# Custom compression levels via registered handlers
import gzip

def high_compression_gzip(file_obj, mode):
    if 'r' in mode:
        return gzip.open(file_obj, mode='rt')
    else:
        return gzip.open(file_obj, mode='wt', compresslevel=9)

register_compressor('.hgz', high_compression_gzip)

# Or use existing libraries with custom parameters
import bz2

def fast_bzip2(file_obj, mode):
    if 'r' in mode:
        return bz2.open(file_obj, mode='rt')
    else:
        return bz2.open(file_obj, mode='wt', compresslevel=1)

register_compressor('.fbz2', fast_bzip2)
```
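The practical effect of `compresslevel` can be measured directly with the standard library; on repetitive data, higher levels trade CPU time for smaller output:

```python
import gzip

data = ("timestamp=2024-01-01 level=INFO msg=request handled\n" * 2_000).encode()

fast = gzip.compress(data, compresslevel=1)   # fastest
small = gzip.compress(data, compresslevel=9)  # smallest

print(len(data), len(fast), len(small))
```

For latency-sensitive writes, level 1 is often good enough; reserve level 9 for cold archival data where write time is irrelevant.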

## Integration Examples

### Data Pipeline Integration

```python
import csv
import json

# ETL pipeline with compression
def process_compressed_data():
    # Extract: read compressed source data
    with open('s3://raw-data/input.csv.gz') as f:
        reader = csv.DictReader(f)
        data = list(reader)

    # Transform: process the data
    processed_data = transform_data(data)

    # Load: write compressed output
    with open('s3://processed-data/output.json.zst', 'w') as f:
        json.dump(processed_data, f)

# Batch processing with different compression formats
input_files = [
    's3://data/file1.txt.gz',
    's3://data/file2.txt.bz2',
    's3://data/file3.txt.zst',
]

for input_file in input_files:
    with open(input_file) as f:  # Automatic decompression
        content = f.read()
    result = process_content(content)

    # Output with consistent compression
    output_file = input_file.replace('s3://data/', 's3://results/').replace('.txt.', '.result.')
    with open(output_file, 'w') as out_f:
        out_f.write(result)
```

### Backup and Archival

```python
# Compress backups with maximum compression
import json
from datetime import datetime

backup_data = collect_backup_data()
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# High compression for long-term storage
with open(f's3://backups/backup_{timestamp}.json.bz2', 'w') as f:
    json.dump(backup_data, f, separators=(',', ':'))

# Fast compression for recent backups (requires a registered .lz4 handler)
with open(f's3://recent-backups/backup_{timestamp}.json.lz4', 'w') as f:
    json.dump(backup_data, f, indent=2)
```

### Log Processing

```python
# Process compressed log files
import json
import re
from collections import defaultdict

log_pattern = re.compile(r'(\d{4}-\d{2}-\d{2}) (\w+): (.+)')
stats = defaultdict(int)

# Read compressed logs from multiple sources
log_files = [
    's3://logs/app.log.gz',
    'gs://logs/app.log.bz2',
    'azure://logs/app.log.zst',
]

for log_file in log_files:
    with open(log_file) as f:
        for line in f:
            match = log_pattern.match(line.strip())
            if match:
                date, level, message = match.groups()
                stats[level] += 1

# Write a compressed summary
with open('s3://reports/log-summary.json.gz', 'w') as f:
    json.dump(dict(stats), f)
```

## Error Handling

### Compression-Specific Errors

```python
import gzip
import lzma
from smart_open import open

try:
    with open('s3://bucket/corrupted-file.txt.gz') as f:
        content = f.read()
except gzip.BadGzipFile:
    print("Corrupted gzip file")
except lzma.LZMAError:
    print("Corrupted LZMA/XZ file")
except OSError:
    # bz2 has no dedicated exception class; it raises OSError on bad data
    print("Corrupted bzip2 file or I/O error")
except Exception as e:
    print(f"Other compression error: {e}")

# Fallback to uncompressed reading
try:
    with open('s3://bucket/maybe-compressed.txt') as f:
        content = f.read()
except Exception:
    # Try without decompression
    with open('s3://bucket/maybe-compressed.txt', compression='disable') as f:
        raw_content = f.read()
```
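When a file's extension is unreliable, sniffing the leading magic bytes is a more robust guard than catching decompression exceptions. A stdlib sketch, using the standard gzip/bzip2/xz signatures:

```python
import gzip

# Standard magic-byte signatures for common compression formats
MAGIC = {
    b"\x1f\x8b": "gzip",
    b"BZh": "bzip2",
    b"\xfd7zXZ\x00": "xz",
}

def sniff_compression(data: bytes):
    """Return the codec name matching the leading magic bytes, or None."""
    for magic, name in MAGIC.items():
        if data.startswith(magic):
            return name
    return None

print(sniff_compression(gzip.compress(b"hello")))  # gzip
print(sniff_compression(b"plain text"))            # None
```

Reading the first few bytes with `compression='disable'` and sniffing them lets you decide whether to reopen the stream with decompression enabled.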

### Validation and Integrity

```python
import hashlib

def verify_compressed_file(uri, expected_hash):
    """Verify the integrity of compressed file content."""
    hasher = hashlib.sha256()

    try:
        with open(uri, 'rb') as f:
            for chunk in iter(lambda: f.read(8192), b''):
                hasher.update(chunk)

        actual_hash = hasher.hexdigest()
        return actual_hash == expected_hash
    except Exception as e:
        print(f"Verification failed: {e}")
        return False

# Usage
if verify_compressed_file('s3://bucket/data.txt.gz', expected_hash):
    print("File integrity verified")
else:
    print("File integrity check failed")
```

## Best Practices

### Format Selection Guidelines

1. **Use gzip (.gz)** for web compatibility and general use
2. **Use zstandard (.zst)** for the best balance of speed and compression
3. **Use bzip2 (.bz2)** for maximum compression when storage space is critical
4. **Use lz4 (.lz4)** for maximum speed when compression ratio is less important (requires a custom handler)
5. **Use xz (.xz)** for archival data requiring maximum compression

### Performance Optimization

```python
# Reusable, configurable compression handlers
import gzip

from smart_open import open, register_compressor

class OptimizedGzipHandler:
    def __init__(self, compresslevel=6):
        self.compresslevel = compresslevel

    def __call__(self, file_obj, mode):
        if 'r' in mode:
            return gzip.open(file_obj, mode='rt')
        else:
            return gzip.open(file_obj, mode='wt',
                             compresslevel=self.compresslevel)

# Register the configured handler
register_compressor('.ogz', OptimizedGzipHandler(compresslevel=9))

# Batch processing with consistent compression settings
files_to_process = ['file1.txt', 'file2.txt', 'file3.txt']
for filename in files_to_process:
    with open(f's3://bucket/{filename}.ogz', 'w') as f:
        f.write(process_file(filename))
```