# BGZF Compressed Files

Support for reading and writing block gzip (BGZF) compressed files, the standard compression format used in genomics for SAM/BAM files and tabix-indexed files.

## Capabilities

### BGZFile

Interface for reading and writing BGZF compressed files with seeking and block-level access.

```python { .api }
class BGZFile:
    def __init__(self, filepath, mode, compresslevel=6, threads=1):
        """
        Open BGZF compressed file.

        Parameters:
        - filepath: str, path to BGZF file
        - mode: str, file mode ('r', 'w', 'rb', 'wb', 'rt', 'wt')
        - compresslevel: int, compression level (0-9, default 6)
        - threads: int, number of compression threads

        Returns:
        BGZFile object
        """

    def read(self, size=-1):
        """
        Read data from file.

        Parameters:
        - size: int, number of bytes to read (-1 for all)

        Returns:
        bytes or str, data read from file
        """

    def readline(self, size=-1):
        """
        Read single line from file.

        Parameters:
        - size: int, maximum bytes to read

        Returns:
        bytes or str, line data
        """

    def readlines(self, hint=-1):
        """
        Read all lines from file.

        Parameters:
        - hint: int, approximate number of bytes to read

        Returns:
        list, lines from file
        """

    def write(self, data):
        """
        Write data to file.

        Parameters:
        - data: bytes or str, data to write

        Returns:
        int, number of bytes written
        """

    def writelines(self, lines):
        """
        Write multiple lines to file.

        Parameters:
        - lines: iterable, lines to write
        """

    def seek(self, offset, whence=0):
        """
        Seek to position in file.

        Parameters:
        - offset: int, byte offset
        - whence: int, seek reference (0=start, 1=current, 2=end)

        Returns:
        int, new file position
        """

    def tell(self):
        """
        Get current file position.

        Returns:
        int, current byte position
        """

    def flush(self):
        """Flush write buffers."""

    def close(self):
        """Close the file."""

    def truncate(self, size=None):
        """
        Truncate file to specified size.

        Parameters:
        - size: int, size in bytes (current position if None)
        """

    # Properties
    @property
    def mode(self) -> str:
        """File mode."""

    @property
    def name(self) -> str:
        """File name."""

    @property
    def closed(self) -> bool:
        """True if file is closed."""

    @property
    def readable(self) -> bool:
        """True if file is readable."""

    @property
    def writable(self) -> bool:
        """True if file is writable."""

    @property
    def seekable(self) -> bool:
        """True if file supports seeking."""

    # Context manager support
    def __enter__(self):
        """Context manager entry."""

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""

    # Iterator support
    def __iter__(self):
        """Iterate over lines."""

    def __next__(self):
        """Get next line."""
```

## Usage Examples

### Basic File Operations

```python
import pysam

# Reading BGZF files
with pysam.BGZFile("data.txt.gz", "rt") as infile:
    # Read entire file
    content = infile.read()
    print(f"File content: {content}")

# Reading line by line
with pysam.BGZFile("data.txt.gz", "rt") as infile:
    for line_num, line in enumerate(infile, 1):
        print(f"Line {line_num}: {line.strip()}")

# Reading specific amount of data
with pysam.BGZFile("data.txt.gz", "rb") as infile:
    chunk = infile.read(1024)  # Read first 1KB
    print(f"First chunk: {len(chunk)} bytes")
```

### Writing BGZF Files

```python
import pysam

# Writing text data
with pysam.BGZFile("output.txt.gz", "wt", compresslevel=9) as outfile:
    outfile.write("Header line\n")
    for i in range(1000):
        outfile.write(f"Data line {i}\n")

# Writing binary data
with pysam.BGZFile("output.bin.gz", "wb") as outfile:
    data = b"Binary data chunk"
    outfile.write(data)

# Writing with multiple threads for better compression speed
with pysam.BGZFile("large_output.txt.gz", "wt", threads=4) as outfile:
    for i in range(1000000):
        outfile.write(f"Large dataset line {i}\n")
```

### File Seeking and Random Access

```python
import pysam

# Seeking in BGZF files (supports random access)
# NOTE: positions returned by tell() are BGZF virtual file offsets
# (compressed-block offset combined with an in-block offset), not plain
# byte counts; treat them as opaque tokens to pass back to seek().
with pysam.BGZFile("indexed_data.txt.gz", "rt") as infile:
    # Read from beginning
    first_line = infile.readline()
    print(f"First line: {first_line.strip()}")

    # Remember position
    pos = infile.tell()
    print(f"Current position: {pos}")

    # Read more data
    second_line = infile.readline()

    # Seek back to previous position
    infile.seek(pos)

    # Read same line again
    second_line_again = infile.readline()
    assert second_line == second_line_again

    # Seek to end; tell() then yields the end-of-file virtual offset
    infile.seek(0, 2)  # Seek to end
    file_size = infile.tell()
    print(f"End-of-file offset: {file_size}")
```

### Processing Large Files

```python
import pysam

def process_large_bgzf_file(filename, chunk_size=8192):
    """Process large BGZF file in chunks to manage memory usage."""
    with pysam.BGZFile(filename, "rt") as infile:
        processed_lines = 0
        partial_line = ""

        while True:
            chunk = infile.read(chunk_size)
            if not chunk:
                break

            # Prepend any partial line left over from the previous chunk.
            # (Do NOT seek backwards here: BGZF tell()/seek() use virtual
            # file offsets, so byte arithmetic on them is invalid.)
            chunk = partial_line + chunk

            # Process chunk line by line
            lines = chunk.split('\n')

            # The last element is a partial line unless the chunk ended
            # exactly on a newline; carry it over to the next iteration.
            partial_line = lines[-1]
            lines = lines[:-1]

            # Process complete lines
            for line in lines:
                if line.strip():  # Skip empty lines
                    # Process line here
                    processed_lines += 1

                    if processed_lines % 10000 == 0:
                        print(f"Processed {processed_lines} lines")

        # Count a final line that lacks a trailing newline.
        if partial_line.strip():
            processed_lines += 1

        return processed_lines

# Usage
total_lines = process_large_bgzf_file("large_dataset.txt.gz")
print(f"Total processed lines: {total_lines}")
```

### File Compression and Conversion

```python
import pysam

def compress_to_bgzf(input_file, output_file, compression_level=6):
    """Compress regular file to BGZF format."""
    with open(input_file, 'rb') as infile:
        with pysam.BGZFile(output_file, 'wb', compresslevel=compression_level) as outfile:
            # Copy data in chunks
            chunk_size = 64 * 1024  # 64KB chunks
            while True:
                chunk = infile.read(chunk_size)
                if not chunk:
                    break
                outfile.write(chunk)

def decompress_bgzf(input_file, output_file):
    """Decompress BGZF file to regular file."""
    with pysam.BGZFile(input_file, 'rb') as infile:
        with open(output_file, 'wb') as outfile:
            # Copy data in chunks
            chunk_size = 64 * 1024  # 64KB chunks
            while True:
                chunk = infile.read(chunk_size)
                if not chunk:
                    break
                outfile.write(chunk)

# Usage
compress_to_bgzf("large_file.txt", "large_file.txt.gz", compression_level=9)
decompress_bgzf("compressed_file.txt.gz", "decompressed_file.txt")
```

### Advanced File Operations

```python
import pysam
import os

def split_bgzf_file(input_file, output_prefix, lines_per_file=1000000):
    """Split large BGZF file into smaller files."""
    file_count = 0
    current_lines = 0
    current_file = None

    try:
        with pysam.BGZFile(input_file, "rt") as infile:
            for line in infile:
                # Open new output file if needed
                if current_lines == 0:
                    if current_file:
                        current_file.close()

                    file_count += 1
                    output_filename = f"{output_prefix}_{file_count:03d}.txt.gz"
                    current_file = pysam.BGZFile(output_filename, "wt")

                # Write line to current file
                current_file.write(line)
                current_lines += 1

                # Check if we need to start new file
                if current_lines >= lines_per_file:
                    current_lines = 0

    finally:
        if current_file:
            current_file.close()

    return file_count

def merge_bgzf_files(input_files, output_file):
    """Merge multiple BGZF files into one."""
    with pysam.BGZFile(output_file, "wt") as outfile:
        for input_file in input_files:
            with pysam.BGZFile(input_file, "rt") as infile:
                # Copy all lines from input to output
                for line in infile:
                    outfile.write(line)

# Usage
num_files = split_bgzf_file("huge_dataset.txt.gz", "split_part", 500000)
print(f"Split into {num_files} files")

# Merge them back
input_files = [f"split_part_{i:03d}.txt.gz" for i in range(1, num_files + 1)]
merge_bgzf_files(input_files, "merged_dataset.txt.gz")
```

### Integration with Other Pysam Components

```python
import pysam

# Create custom tabix-compatible file
def create_bed_file_with_bgzf(features, output_file):
    """Create sorted, BGZF-compressed BED file suitable for tabix indexing."""

    # Sort features by chromosome and position
    sorted_features = sorted(features, key=lambda x: (x['chrom'], x['start']))

    # Write to BGZF file
    with pysam.BGZFile(output_file, "wt") as outfile:
        for feature in sorted_features:
            line = f"{feature['chrom']}\t{feature['start']}\t{feature['end']}\t{feature['name']}\n"
            outfile.write(line)

    # Create tabix index
    pysam.tabix_index(output_file, preset="bed")

# Example usage
features = [
    {'chrom': 'chr1', 'start': 1000, 'end': 2000, 'name': 'feature1'},
    {'chrom': 'chr1', 'start': 1500, 'end': 2500, 'name': 'feature2'},
    {'chrom': 'chr2', 'start': 500, 'end': 1500, 'name': 'feature3'},
]

create_bed_file_with_bgzf(features, "features.bed.gz")

# Now can use with TabixFile
with pysam.TabixFile("features.bed.gz", parser=pysam.asBed()) as tabixfile:
    for record in tabixfile.fetch("chr1", 1200, 1800):
        print(f"Overlapping feature: {record.name}")
```

## Performance Considerations

- BGZF files support random access, unlike regular gzip files
- Seeking is efficient due to block-based compression structure
- Multi-threaded compression (`threads` parameter) can significantly speed up writing
- Block size is optimized for genomic data access patterns
- Compatible with standard gzip tools for decompression
- Essential format for indexed genomic files (BAM, tabix)