or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

big-data.mdcloud-storage.mdcompression.mdcore-operations.mdindex.mdnetwork-access.mdutilities.md

compression.mddocs/

0

# Compression and Encoding

1

2

Automatic and explicit compression handling for multiple formats with streaming support. Smart-open provides transparent compression/decompression that works seamlessly across all supported storage systems.

3

4

## Capabilities

5

6

### Compression Management

7

8

Register custom compression handlers and manage compression behavior.

9

10

```python { .api }

11

def register_compressor(ext, callback):

12

"""Register compression handler for file extension.

13

14

Parameters:

15

ext: str - File extension with leading dot (e.g., '.gz', '.custom')

16

callback: callable - Function accepting (file_obj, mode) returning wrapped file object

17

18

Notes:

19

Callback should return file-like object that handles compression/decompression

20

Mode parameter is the binary mode string ('rb' for reading, 'wb' for writing) that the callback should honor

21

"""

22

23

def get_supported_compression_types():

24

"""Get list of supported compression type identifiers.

25

26

Returns:

27

list[str] - Compression types like ['gzip', 'bz2', 'zstandard', 'disable', 'infer_from_extension']

28

"""

29

30

def get_supported_extensions():

31

"""Get list of supported compressed file extensions.

32

33

Returns:

34

list[str] - File extensions like ['.gz', '.bz2', '.zst', '.xz', '.lzma']

35

"""

36

37

def compression_wrapper(file_obj, mode, compression='infer_from_extension', filename=None):

38

"""Wrap file object with appropriate compression handler.

39

40

Parameters:

41

file_obj: file-like object - Base file object to wrap

42

mode: str - File mode for compression behavior

43

compression: str - Compression type or 'infer_from_extension'

44

filename: str - Filename for extension-based inference

45

46

Returns:

47

file-like object - Wrapped or original file object

48

"""

49

```

50

51

### Compression Constants

52

53

```python { .api }

54

# Compression behavior constants

55

NO_COMPRESSION = 'disable'

56

INFER_FROM_EXTENSION = 'infer_from_extension'

57

```

58

59

## Supported Compression Formats

60

61

Smart-open supports multiple compression formats out of the box:

62

63

- **gzip** (`.gz`) - Most common, good compression ratio and speed

64

- **bzip2** (`.bz2`) - Higher compression ratio, slower

65

- **zstandard** (`.zst`) - Modern format, excellent compression and speed

66

- **xz/lzma** (`.xz`, `.lzma`) - High compression ratio

67

- **lz4** (`.lz4`) - Very fast compression/decompression

68

69

## Usage Examples

70

71

### Automatic Compression Detection

72

73

```python

74

from smart_open import open

75

76

# Compression automatically detected from file extension

77

with open('s3://bucket/data.txt.gz') as f:

78

uncompressed_text = f.read() # Automatically decompressed

79

80

with open('gs://bucket/logs.txt.bz2') as f:

81

for line in f: # Line-by-line decompression

82

process_log_line(line)

83

84

# Writing compressed files (automatic compression)

85

with open('s3://bucket/output.txt.gz', 'w') as f:

86

f.write('This will be compressed with gzip')

87

88

with open('azure://container/data.json.zst', 'w') as f:

89

json.dump(large_data, f) # Compressed with zstandard

90

```

91

92

### Explicit Compression Control

93

94

```python

95

# Explicitly specify compression type

96

with open('s3://bucket/data.txt', compression='gzip') as f:

97

content = f.read()

98

99

# Disable compression for files with compression extensions

100

with open('s3://bucket/already-compressed.gz', compression='disable') as f:

101

raw_compressed_data = f.read() # Read as-is, no decompression

102

103

# Force compression on write

104

with open('gs://bucket/output.txt', 'w', compression='bz2') as f:

105

f.write('This will be compressed with bzip2')

106

```

107

108

### Binary vs Text Mode with Compression

109

110

```python

111

# Binary mode with compression

112

with open('s3://bucket/binary-data.dat.gz', 'rb') as f:

113

decompressed_bytes = f.read()

114

115

with open('s3://bucket/output.bin.zst', 'wb') as f:

116

f.write(binary_data) # Compressed binary write

117

118

# Text mode with compression and encoding

119

with open('gs://bucket/unicode-text.txt.gz', encoding='utf-8') as f:

120

unicode_text = f.read()

121

122

with open('azure://container/output.csv.bz2', 'w', encoding='utf-8') as f:

123

writer = csv.writer(f)

124

writer.writerows(data)

125

```

126

127

### Custom Compression Handlers

128

129

```python

130

from smart_open import register_compressor

131

import lz4.frame

132

133

# Register LZ4 compression handler

134

def lz4_handler(file_obj, mode):

135

if 'r' in mode:

136

return lz4.frame.open(file_obj, mode='rb')

137

else:

138

return lz4.frame.open(file_obj, mode='wb')

139

140

register_compressor('.lz4', lz4_handler)

141

142

# Now .lz4 files work automatically

143

with open('s3://bucket/data.txt.lz4') as f:

144

content = f.read()

145

146

# Custom compression with parameters

147

def custom_gzip_handler(file_obj, mode):

148

import gzip

149

if 'r' in mode:

150

return gzip.open(file_obj, mode='rt', encoding='utf-8')

151

else:

152

return gzip.open(file_obj, mode='wt', encoding='utf-8', compresslevel=9)

153

154

register_compressor('.custom.gz', custom_gzip_handler)

155

```

156

157

## Performance Considerations

158

159

### Compression Format Selection

160

161

```python

162

# For maximum compression (slower)

163

with open('s3://bucket/archive.txt.bz2', 'w') as f:

164

f.write(large_text_data)

165

166

# For fastest compression/decompression

167

with open('s3://bucket/fast-access.txt.lz4', 'w') as f:

168

f.write(frequently_accessed_data)

169

170

# Good balance of speed and compression

171

with open('s3://bucket/balanced.txt.zst', 'w') as f:

172

f.write(general_purpose_data)

173

174

# Traditional web standard

175

with open('s3://bucket/web-compatible.txt.gz', 'w') as f:

176

f.write(web_data)

177

```

178

179

### Streaming Compression

180

181

```python

182

# Stream large files with compression

183

with open('s3://bucket/huge-file.txt.gz') as f:

184

for line in f: # Memory-efficient line-by-line decompression

185

process_line(line)

186

187

# Chunked reading with compression

188

with open('gs://bucket/large-binary.dat.zst', 'rb') as f:

189

while True:

190

chunk = f.read(64 * 1024) # 64KB chunks, decompressed

191

if not chunk:

192

break

193

process_chunk(chunk)

194

195

# Streaming write with compression

196

with open('azure://container/stream-output.txt.gz', 'w') as f:

197

for record in generate_large_dataset():

198

f.write(f"{record}\n") # Compressed on-the-fly

199

```

200

201

### Compression Level Control

202

203

```python

204

# Custom compression levels via transport_params

205

import gzip

206

207

def high_compression_gzip(file_obj, mode):

208

if 'r' in mode:

209

return gzip.open(file_obj, mode='rt')

210

else:

211

return gzip.open(file_obj, mode='wt', compresslevel=9)

212

213

register_compressor('.high.gz', high_compression_gzip)

214

215

# Or use existing libraries with custom parameters

216

import bz2

217

218

def fast_bzip2(file_obj, mode):

219

if 'r' in mode:

220

return bz2.open(file_obj, mode='rt')

221

else:

222

return bz2.open(file_obj, mode='wt', compresslevel=1)

223

224

register_compressor('.fast.bz2', fast_bzip2)

225

```

226

227

## Integration Examples

228

229

### Data Pipeline Integration

230

231

```python

232

# ETL pipeline with compression

233

def process_compressed_data():

234

# Extract: Read compressed source data

235

with open('s3://raw-data/input.csv.gz') as f:

236

reader = csv.DictReader(f)

237

data = list(reader)

238

239

# Transform: Process data

240

processed_data = transform_data(data)

241

242

# Load: Write compressed output

243

with open('s3://processed-data/output.json.zst', 'w') as f:

244

json.dump(processed_data, f)

245

246

# Batch processing with different compression formats

247

input_files = [

248

's3://data/file1.txt.gz',

249

's3://data/file2.txt.bz2',

250

's3://data/file3.txt.zst'

251

]

252

253

for input_file in input_files:

254

with open(input_file) as f: # Automatic decompression

255

content = f.read()

256

result = process_content(content)

257

258

# Output with consistent compression

259

output_file = input_file.replace('s3://data/', 's3://results/').replace('.txt.', '.result.')

260

with open(output_file, 'w') as out_f:

261

out_f.write(result)

262

```

263

264

### Backup and Archival

265

266

```python

267

# Compress backups with maximum compression

268

import json

269

from datetime import datetime

270

271

backup_data = collect_backup_data()

272

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

273

274

# High compression for long-term storage

275

with open(f's3://backups/backup_{timestamp}.json.bz2', 'w') as f:

276

json.dump(backup_data, f, separators=(',', ':'))

277

278

# Fast compression for recent backups

279

with open(f's3://recent-backups/backup_{timestamp}.json.lz4', 'w') as f:

280

json.dump(backup_data, f, indent=2)

281

```

282

283

### Log Processing

284

285

```python

286

# Process compressed log files

287

import re

288

from collections import defaultdict

289

290

log_pattern = re.compile(r'(\d{4}-\d{2}-\d{2}) (\w+): (.+)')

291

stats = defaultdict(int)

292

293

# Read compressed logs from multiple sources

294

log_files = [

295

's3://logs/app.log.gz',

296

'gs://logs/app.log.bz2',

297

'azure://logs/app.log.zst'

298

]

299

300

for log_file in log_files:

301

with open(log_file) as f:

302

for line in f:

303

match = log_pattern.match(line.strip())

304

if match:

305

date, level, message = match.groups()

306

stats[level] += 1

307

308

# Write compressed summary

309

with open('s3://reports/log-summary.json.gz', 'w') as f:

310

json.dump(dict(stats), f)

311

```

312

313

## Error Handling

314

315

### Compression-Specific Errors

316

317

```python

318

import gzip

319

import bz2

320

import lzma

321

from smart_open import open

322

323

try:

324

with open('s3://bucket/corrupted-file.txt.gz') as f:

325

content = f.read()

326

except gzip.BadGzipFile:

327

print("Corrupted gzip file")

328

except OSError:  # bz2 has no BadBz2File exception; corrupted bzip2 data raises OSError

329

print("Corrupted bzip2 file")

330

except lzma.LZMAError:

331

print("Corrupted LZMA/XZ file")

332

except Exception as e:

333

print(f"Other compression error: {e}")

334

335

# Fallback to uncompressed reading

336

try:

337

with open('s3://bucket/maybe-compressed.txt') as f:

338

content = f.read()

339

except Exception:

340

# Try without decompression

341

with open('s3://bucket/maybe-compressed.txt', compression='disable') as f:

342

raw_content = f.read()

343

```

344

345

### Validation and Integrity

346

347

```python

348

import hashlib

349

350

def verify_compressed_file(uri, expected_hash):

351

"""Verify a compressed file by hashing its decompressed content (smart-open decompresses on read)."""

352

hasher = hashlib.sha256()

353

354

try:

355

with open(uri, 'rb') as f:

356

for chunk in iter(lambda: f.read(8192), b''):

357

hasher.update(chunk)

358

359

actual_hash = hasher.hexdigest()

360

return actual_hash == expected_hash

361

except Exception as e:

362

print(f"Verification failed: {e}")

363

return False

364

365

# Usage

366

if verify_compressed_file('s3://bucket/data.txt.gz', expected_hash):

367

print("File integrity verified")

368

else:

369

print("File integrity check failed")

370

```

371

372

## Best Practices

373

374

### Format Selection Guidelines

375

376

1. **Use gzip (.gz)** for web compatibility and general use

377

2. **Use zstandard (.zst)** for best balance of speed and compression

378

3. **Use bzip2 (.bz2)** for maximum compression when storage space is critical

379

4. **Use lz4 (.lz4)** for maximum speed when compression ratio is less important

380

5. **Use xz (.xz)** for archival data requiring maximum compression

381

382

### Performance Optimization

383

384

```python

385

# Pre-compile compression handlers for repeated use

386

import gzip

387

import io

388

389

class OptimizedGzipHandler:

390

def __init__(self, compresslevel=6):

391

self.compresslevel = compresslevel

392

393

def __call__(self, file_obj, mode):

394

if 'r' in mode:

395

return gzip.open(file_obj, mode='rt')

396

else:

397

return gzip.open(file_obj, mode='wt',

398

compresslevel=self.compresslevel)

399

400

# Register optimized handler

401

register_compressor('.opt.gz', OptimizedGzipHandler(compresslevel=9))

402

403

# Batch processing with consistent compression settings

404

files_to_process = ['file1.txt', 'file2.txt', 'file3.txt']

405

for filename in files_to_process:

406

with open(f's3://bucket/{filename}.opt.gz', 'w') as f:

407

f.write(process_file(filename))

408

```