# BGZF Compressed Files

Support for reading and writing block gzip (BGZF) compressed files, the standard compression format used in genomics for SAM/BAM files and tabix-indexed files.

## Capabilities

### BGZFile

Interface for reading and writing BGZF compressed files with seeking and block-level access.

```python { .api }
class BGZFile:
    def __init__(self, filepath, mode, compresslevel=6, threads=1):
        """
        Open BGZF compressed file.

        Parameters:
        - filepath: str, path to BGZF file
        - mode: str, file mode ('r', 'w', 'rb', 'wb', 'rt', 'wt')
        - compresslevel: int, compression level (0-9, default 6)
        - threads: int, number of compression threads

        Returns:
        BGZFile object
        """

    def read(self, size=-1):
        """
        Read data from file.

        Parameters:
        - size: int, number of bytes to read (-1 for all)

        Returns:
        bytes or str, data read from file
        """

    def readline(self, size=-1):
        """
        Read single line from file.

        Parameters:
        - size: int, maximum bytes to read

        Returns:
        bytes or str, line data
        """

    def readlines(self, hint=-1):
        """
        Read all lines from file.

        Parameters:
        - hint: int, approximate number of bytes to read

        Returns:
        list, lines from file
        """

    def write(self, data):
        """
        Write data to file.

        Parameters:
        - data: bytes or str, data to write

        Returns:
        int, number of bytes written
        """

    def writelines(self, lines):
        """
        Write multiple lines to file.

        Parameters:
        - lines: iterable, lines to write
        """

    def seek(self, offset, whence=0):
        """
        Seek to position in file.

        Parameters:
        - offset: int, byte offset
        - whence: int, seek reference (0=start, 1=current, 2=end)

        Returns:
        int, new file position
        """

    def tell(self):
        """
        Get current file position.

        Returns:
        int, current byte position
        """

    def flush(self):
        """Flush write buffers."""

    def close(self):
        """Close the file."""

    def truncate(self, size=None):
        """
        Truncate file to specified size.

        Parameters:
        - size: int, size in bytes (current position if None)
        """

    # Properties
    @property
    def mode(self) -> str:
        """File mode."""

    @property
    def name(self) -> str:
        """File name."""

    @property
    def closed(self) -> bool:
        """True if file is closed."""

    @property
    def readable(self) -> bool:
        """True if file is readable."""

    @property
    def writable(self) -> bool:
        """True if file is writable."""

    @property
    def seekable(self) -> bool:
        """True if file supports seeking."""

    # Context manager support
    def __enter__(self):
        """Context manager entry."""

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""

    # Iterator support
    def __iter__(self):
        """Iterate over lines."""

    def __next__(self):
        """Get next line."""
```

## Usage Examples

### Basic File Operations

```python
import pysam

# Reading BGZF files
with pysam.BGZFile("data.txt.gz", "rt") as infile:
    # Read entire file
    content = infile.read()
    print(f"File content: {content}")

# Reading line by line
with pysam.BGZFile("data.txt.gz", "rt") as infile:
    for line_num, line in enumerate(infile, 1):
        print(f"Line {line_num}: {line.strip()}")

# Reading specific amount of data
with pysam.BGZFile("data.txt.gz", "rb") as infile:
    chunk = infile.read(1024)  # Read first 1KB
    print(f"First chunk: {len(chunk)} bytes")
```

### Writing BGZF Files

```python
import pysam

# Writing text data
with pysam.BGZFile("output.txt.gz", "wt", compresslevel=9) as outfile:
    outfile.write("Header line\n")
    for i in range(1000):
        outfile.write(f"Data line {i}\n")

# Writing binary data
with pysam.BGZFile("output.bin.gz", "wb") as outfile:
    data = b"Binary data chunk"
    outfile.write(data)

# Writing with multiple threads for better compression speed
with pysam.BGZFile("large_output.txt.gz", "wt", threads=4) as outfile:
    for i in range(1000000):
        outfile.write(f"Large dataset line {i}\n")
```

### File Seeking and Random Access

```python
import pysam

# Seeking in BGZF files (supports random access)
# NOTE: positions returned by tell() are BGZF virtual file offsets
# (compressed-block offset combined with an in-block offset), not plain
# byte counts; treat them as opaque tokens to pass back to seek().
with pysam.BGZFile("indexed_data.txt.gz", "rt") as infile:
    # Read from beginning
    first_line = infile.readline()
    print(f"First line: {first_line.strip()}")

    # Remember position
    pos = infile.tell()
    print(f"Current position: {pos}")

    # Read more data
    second_line = infile.readline()

    # Seek back to previous position
    infile.seek(pos)

    # Read same line again
    second_line_again = infile.readline()
    assert second_line == second_line_again

    # Seek to end; tell() then yields the end-of-file virtual offset
    infile.seek(0, 2)  # Seek to end
    file_size = infile.tell()
    print(f"End-of-file offset: {file_size}")
```

### Processing Large Files

```python
import pysam

def process_large_bgzf_file(filename, chunk_size=8192):
    """Process large BGZF file in chunks to manage memory usage."""
    with pysam.BGZFile(filename, "rt") as infile:
        processed_lines = 0
        partial_line = ""

        while True:
            chunk = infile.read(chunk_size)
            if not chunk:
                break

            # Prepend any partial line left over from the previous chunk.
            # (Do NOT seek backwards here: BGZF tell()/seek() use virtual
            # file offsets, so byte arithmetic on them is invalid.)
            chunk = partial_line + chunk

            # Process chunk line by line
            lines = chunk.split('\n')

            # The last element is a partial line unless the chunk ended
            # exactly on a newline; carry it over to the next iteration.
            partial_line = lines[-1]
            lines = lines[:-1]

            # Process complete lines
            for line in lines:
                if line.strip():  # Skip empty lines
                    # Process line here
                    processed_lines += 1

                    if processed_lines % 10000 == 0:
                        print(f"Processed {processed_lines} lines")

        # Count a final line that lacks a trailing newline.
        if partial_line.strip():
            processed_lines += 1

        return processed_lines

# Usage
total_lines = process_large_bgzf_file("large_dataset.txt.gz")
print(f"Total processed lines: {total_lines}")
```

### File Compression and Conversion

```python
import pysam

def compress_to_bgzf(input_file, output_file, compression_level=6):
    """Compress regular file to BGZF format."""
    with open(input_file, 'rb') as infile:
        with pysam.BGZFile(output_file, 'wb', compresslevel=compression_level) as outfile:
            # Copy data in chunks
            chunk_size = 64 * 1024  # 64KB chunks
            while True:
                chunk = infile.read(chunk_size)
                if not chunk:
                    break
                outfile.write(chunk)

def decompress_bgzf(input_file, output_file):
    """Decompress BGZF file to regular file."""
    with pysam.BGZFile(input_file, 'rb') as infile:
        with open(output_file, 'wb') as outfile:
            # Copy data in chunks
            chunk_size = 64 * 1024  # 64KB chunks
            while True:
                chunk = infile.read(chunk_size)
                if not chunk:
                    break
                outfile.write(chunk)

# Usage
compress_to_bgzf("large_file.txt", "large_file.txt.gz", compression_level=9)
decompress_bgzf("compressed_file.txt.gz", "decompressed_file.txt")
```

### Advanced File Operations

```python
import pysam
import os

def split_bgzf_file(input_file, output_prefix, lines_per_file=1000000):
    """Split large BGZF file into smaller files."""
    file_count = 0
    current_lines = 0
    current_file = None

    try:
        with pysam.BGZFile(input_file, "rt") as infile:
            for line in infile:
                # Open new output file if needed
                if current_lines == 0:
                    if current_file:
                        current_file.close()

                    file_count += 1
                    output_filename = f"{output_prefix}_{file_count:03d}.txt.gz"
                    current_file = pysam.BGZFile(output_filename, "wt")

                # Write line to current file
                current_file.write(line)
                current_lines += 1

                # Check if we need to start new file
                if current_lines >= lines_per_file:
                    current_lines = 0

    finally:
        if current_file:
            current_file.close()

    return file_count

def merge_bgzf_files(input_files, output_file):
    """Merge multiple BGZF files into one."""
    with pysam.BGZFile(output_file, "wt") as outfile:
        for input_file in input_files:
            with pysam.BGZFile(input_file, "rt") as infile:
                # Copy all lines from input to output
                for line in infile:
                    outfile.write(line)

# Usage
num_files = split_bgzf_file("huge_dataset.txt.gz", "split_part", 500000)
print(f"Split into {num_files} files")

# Merge them back
input_files = [f"split_part_{i:03d}.txt.gz" for i in range(1, num_files + 1)]
merge_bgzf_files(input_files, "merged_dataset.txt.gz")
```

### Integration with Other Pysam Components

```python
import pysam

# Create custom tabix-compatible file
def create_bed_file_with_bgzf(features, output_file):
    """Create sorted, BGZF-compressed BED file suitable for tabix indexing."""

    # Sort features by chromosome and position
    sorted_features = sorted(features, key=lambda x: (x['chrom'], x['start']))

    # Write to BGZF file
    with pysam.BGZFile(output_file, "wt") as outfile:
        for feature in sorted_features:
            line = f"{feature['chrom']}\t{feature['start']}\t{feature['end']}\t{feature['name']}\n"
            outfile.write(line)

    # Create tabix index
    pysam.tabix_index(output_file, preset="bed")

# Example usage
features = [
    {'chrom': 'chr1', 'start': 1000, 'end': 2000, 'name': 'feature1'},
    {'chrom': 'chr1', 'start': 1500, 'end': 2500, 'name': 'feature2'},
    {'chrom': 'chr2', 'start': 500, 'end': 1500, 'name': 'feature3'},
]

create_bed_file_with_bgzf(features, "features.bed.gz")

# Now can use with TabixFile
with pysam.TabixFile("features.bed.gz", parser=pysam.asBed()) as tabixfile:
    for record in tabixfile.fetch("chr1", 1200, 1800):
        print(f"Overlapping feature: {record.name}")
```

## Performance Considerations

- BGZF files support random access, unlike regular gzip files
- Seeking is efficient due to block-based compression structure
- Multi-threaded compression (`threads` parameter) can significantly speed up writing
- Block size is optimized for genomic data access patterns
- Compatible with standard gzip tools for decompression
- Essential format for indexed genomic files (BAM, tabix)