0
# Tabix-Indexed Files
1
2
Support for compressed, indexed genomic files using tabix indexing. Enables efficient random access to large compressed files in various formats including BED, GFF, GTF, and VCF.
3
4
## Capabilities
5
6
### TabixFile
7
8
Main interface for accessing tabix-indexed compressed files with format-specific parsers.
9
10
```python { .api }
11
class TabixFile:
12
def __init__(self, filename, parser=None, encoding="ascii", duplicate_filehandle=True):
13
"""
14
Open tabix-indexed file.
15
16
Parameters:
17
- filename: str, path to tabix-indexed file (.gz)
18
- parser: class, parser for file format (asTuple, asBed, asGTF, etc.)
19
- encoding: str, text encoding
20
- duplicate_filehandle: bool, allow multiple handles
21
22
Returns:
23
TabixFile object
24
"""
25
26
def fetch(self, reference=None, start=None, end=None, region=None, parser=None, multiple_iterators=False):
27
"""
28
Fetch records from region.
29
30
Parameters:
31
- reference: str, chromosome/contig name
32
- start: int, start position (0-based)
33
- end: int, end position (0-based)
34
- region: str, region string (chr:start-end)
35
- parser: class, override default parser
36
- multiple_iterators: bool, allow concurrent iterators
37
38
Returns:
39
Iterator of parsed records
40
"""
41
42
def close(self):
43
"""Close the file."""
44
45
# Properties
46
@property
47
def filename(self) -> str:
48
"""File name."""
49
50
@property
51
def header(self) -> list:
52
"""Header lines."""
53
54
@property
55
def contigs(self) -> list:
56
"""Available contigs/chromosomes."""
57
58
def __enter__(self):
59
"""Context manager entry."""
60
61
def __exit__(self, exc_type, exc_val, exc_tb):
62
"""Context manager exit."""
63
```
64
65
### Parser Classes
66
67
Format-specific parsers for different genomic file types.
68
69
```python { .api }
70
class asTuple:
71
"""Parse records as tuples of strings."""
72
def __call__(self, line):
73
"""
74
Parse line to tuple.
75
76
Returns:
77
tuple, fields as strings
78
"""
79
80
class asBed:
81
"""Parse BED format records."""
82
def __call__(self, line):
83
"""
84
Parse BED line.
85
86
Returns:
87
BedProxy object
88
"""
89
90
class asGTF:
91
"""Parse GTF format records."""
92
def __call__(self, line):
93
"""
94
Parse GTF line.
95
96
Returns:
97
GTFProxy object
98
"""
99
100
class asGFF3:
101
"""Parse GFF3 format records."""
102
def __call__(self, line):
103
"""
104
Parse GFF3 line.
105
106
Returns:
107
GFF3Proxy object
108
"""
109
110
class asVCF:
111
"""Parse VCF format records."""
112
def __call__(self, line):
113
"""
114
Parse VCF line.
115
116
Returns:
117
VCFProxy object
118
"""
119
```
120
121
### Proxy Classes
122
123
Efficient access to parsed record fields without copying data.
124
125
```python { .api }
126
class TupleProxy:
127
"""Fast read-only access to parsed records as tuples."""
128
129
def __getitem__(self, index):
130
"""
131
Get field by index.
132
133
Parameters:
134
- index: int, field index
135
136
Returns:
137
str, field value
138
"""
139
140
def __len__(self):
141
"""Number of fields."""
142
143
def __str__(self):
144
"""String representation."""
145
146
class BedProxy(TupleProxy):
147
"""BED format record with named field access."""
148
149
@property
150
def contig(self) -> str:
151
"""Chromosome name."""
152
153
@property
154
def start(self) -> int:
155
"""Start position (0-based)."""
156
157
@property
158
def end(self) -> int:
159
"""End position (0-based)."""
160
161
@property
162
def name(self) -> str:
163
"""Feature name."""
164
165
@property
166
def score(self) -> float:
167
"""Score value."""
168
169
@property
170
def strand(self) -> str:
171
"""Strand ('+', '-', or '.')."""
172
173
@property
174
def thickstart(self) -> int:
175
"""Thick start position."""
176
177
@property
178
def thickend(self) -> int:
179
"""Thick end position."""
180
181
@property
182
def itemrgb(self) -> str:
183
"""RGB color value."""
184
185
@property
186
def blockcount(self) -> int:
187
"""Number of blocks."""
188
189
@property
190
def blocksizes(self) -> list:
191
"""Block sizes."""
192
193
@property
194
def blockstarts(self) -> list:
195
"""Block start positions."""
196
197
class GTFProxy(TupleProxy):
198
"""GTF format record with named field access."""
199
200
@property
201
def contig(self) -> str:
202
"""Chromosome name."""
203
204
@property
205
def source(self) -> str:
206
"""Annotation source."""
207
208
@property
209
def feature(self) -> str:
210
"""Feature type."""
211
212
@property
213
def start(self) -> int:
214
"""Start position (1-based)."""
215
216
@property
217
def end(self) -> int:
218
"""End position (1-based)."""
219
220
@property
221
def score(self) -> str:
222
"""Score value."""
223
224
@property
225
def strand(self) -> str:
226
"""Strand ('+', '-', or '.')."""
227
228
@property
229
def frame(self) -> str:
230
"""Reading frame."""
231
232
@property
233
def attributes(self) -> str:
234
"""Attributes string."""
235
236
def asDict(self):
237
"""
238
Parse attributes as dictionary.
239
240
Returns:
241
dict, attribute key-value pairs
242
"""
243
244
class GFF3Proxy(TupleProxy):
245
"""GFF3 format record with named field access."""
246
247
@property
248
def contig(self) -> str:
249
"""Chromosome name."""
250
251
@property
252
def source(self) -> str:
253
"""Annotation source."""
254
255
@property
256
def feature(self) -> str:
257
"""Feature type."""
258
259
@property
260
def start(self) -> int:
261
"""Start position (1-based)."""
262
263
@property
264
def end(self) -> int:
265
"""End position (1-based)."""
266
267
@property
268
def score(self) -> str:
269
"""Score value."""
270
271
@property
272
def strand(self) -> str:
273
"""Strand ('+', '-', or '.')."""
274
275
@property
276
def frame(self) -> str:
277
"""Reading frame."""
278
279
@property
280
def attributes(self) -> str:
281
"""Attributes string."""
282
283
def asDict(self):
284
"""
285
Parse attributes as dictionary.
286
287
Returns:
288
dict, attribute key-value pairs
289
"""
290
291
class VCFProxy(TupleProxy):
292
"""VCF format record with named field access."""
293
294
@property
295
def contig(self) -> str:
296
"""Chromosome name."""
297
298
@property
299
def pos(self) -> int:
300
"""Position (1-based)."""
301
302
@property
303
def id(self) -> str:
304
"""Variant identifier."""
305
306
@property
307
def ref(self) -> str:
308
"""Reference allele."""
309
310
@property
311
def alt(self) -> str:
312
"""Alternate alleles."""
313
314
@property
315
def qual(self) -> str:
316
"""Quality score."""
317
318
@property
319
def filter(self) -> str:
320
"""Filter status."""
321
322
@property
323
def info(self) -> str:
324
"""Info fields."""
325
326
@property
327
def format(self) -> str:
328
"""Format specification."""
329
```
330
331
### Utility Functions
332
333
Functions for creating and manipulating tabix-indexed files.
334
335
```python { .api }
336
def tabix_index(filename, preset=None, seq_col=None, start_col=None, end_col=None, line_skip=0, zerobased=False, **kwargs):
337
"""
338
Create tabix index for a file.
339
340
Parameters:
341
- filename: str, file to index
342
- preset: str, format preset ('gff', 'bed', 'sam', 'vcf')
343
- seq_col: int, sequence/chromosome column (1-based)
344
- start_col: int, start position column (1-based)
345
- end_col: int, end position column (1-based)
346
- line_skip: int, number of header lines to skip
347
- zerobased: bool, coordinates are 0-based
348
349
Returns:
350
str, index filename
351
"""
352
353
def tabix_compress(filename_in, filename_out=None, force=False, **kwargs):
354
"""
355
Compress file with bgzip.
356
357
Parameters:
358
- filename_in: str, input file
359
- filename_out: str, output file (default: filename_in + .gz)
360
- force: bool, overwrite existing file
361
362
Returns:
363
str, compressed filename
364
"""
365
366
def tabix_iterator(infile, parser=None):
367
"""
368
Create iterator for unindexed files.
369
370
Parameters:
371
- infile: file object or str, input file
372
- parser: class, record parser
373
374
Returns:
375
Iterator of parsed records
376
"""
377
378
def tabix_generic_iterator(infile, parser=None):
379
"""
380
Generic file iterator.
381
382
Parameters:
383
- infile: file object, input file
384
- parser: class, record parser
385
386
Returns:
387
Iterator of parsed records
388
"""
389
390
def tabix_file_iterator(infile, parser=None):
391
"""
392
Compressed file iterator.
393
394
Parameters:
395
- infile: str, compressed file path
396
- parser: class, record parser
397
398
Returns:
399
Iterator of parsed records
400
"""
401
```
402
403
### Iterator Classes
404
405
Specialized iterators for different file access patterns.
406
407
```python { .api }
408
class GZIterator:
409
"""Iterator for gzip-compressed files."""
410
411
def __init__(self, filename, parser=None):
412
"""
413
Create gzip iterator.
414
415
Parameters:
416
- filename: str, gzip file path
417
- parser: class, record parser
418
"""
419
420
def __iter__(self):
421
"""Iterate over records."""
422
423
def __next__(self):
424
"""Get next record."""
425
426
class GZIteratorHead:
427
"""Iterator for headers in gzip files."""
428
429
def __init__(self, filename, parser=None):
430
"""
431
Create header iterator.
432
433
Parameters:
434
- filename: str, gzip file path
435
- parser: class, record parser
436
"""
437
438
def __iter__(self):
439
"""Iterate over header lines."""
440
```
441
442
### Legacy Compatibility
443
444
```python { .api }
445
class Tabixfile:
446
"""Legacy alias for TabixFile."""
447
def __init__(self, *args, **kwargs):
448
"""Same as TabixFile.__init__."""
449
```
450
451
## Usage Examples
452
453
### Basic File Access
454
455
```python
456
import pysam
457
458
# Open tabix-indexed BED file
459
with pysam.TabixFile("annotations.bed.gz", parser=pysam.asBed()) as tabixfile:
460
# Fetch features in region
461
for record in tabixfile.fetch("chr1", 1000, 2000):
462
print(f"{record.contig}:{record.start}-{record.end} {record.name}")
463
print(f" Score: {record.score}, Strand: {record.strand}")
464
465
# Open GTF file with GTF parser
466
with pysam.TabixFile("genes.gtf.gz", parser=pysam.asGTF()) as tabixfile:
467
for record in tabixfile.fetch("chr1", 1000000, 2000000):
468
if record.feature == "gene":
469
attrs = record.asDict()
470
gene_name = attrs.get("gene_name", ["Unknown"])[0]
471
print(f"Gene: {gene_name} at {record.start}-{record.end}")
472
```
473
474
### Creating Tabix Indexes
475
476
```python
477
import pysam
478
479
# Index a BED file
480
pysam.tabix_compress("features.bed", "features.bed.gz")
481
pysam.tabix_index("features.bed.gz", preset="bed")
482
483
# Index a custom format file
484
pysam.tabix_compress("custom.txt", "custom.txt.gz")
485
pysam.tabix_index("custom.txt.gz",
486
seq_col=1, # chromosome in column 1
487
start_col=2, # start in column 2
488
end_col=3, # end in column 3
489
line_skip=1) # skip header line
490
491
# Index VCF file
492
pysam.tabix_compress("variants.vcf", "variants.vcf.gz")
493
pysam.tabix_index("variants.vcf.gz", preset="vcf")
494
```
495
496
### Format-Specific Processing
497
498
```python
499
import pysam
500
501
# Process GTF annotations
502
def extract_genes(gtf_file, output_file, gene_type="protein_coding"):
503
with pysam.TabixFile(gtf_file, parser=pysam.asGTF()) as gtffile:
504
with open(output_file, 'w') as outfile:
505
outfile.write("gene_id\tgene_name\tchrom\tstart\tend\tstrand\n")
506
507
for record in gtffile.fetch():
508
if record.feature == "gene":
509
attrs = record.asDict()
510
511
# Check gene type
512
if gene_type in attrs.get("gene_type", []):
513
gene_id = attrs.get("gene_id", [""])[0]
514
gene_name = attrs.get("gene_name", [""])[0]
515
516
outfile.write(f"{gene_id}\t{gene_name}\t{record.contig}\t"
517
f"{record.start}\t{record.end}\t{record.strand}\n")
518
519
# Process BED intervals
520
def merge_overlapping_intervals(bed_file, output_file):
521
"""Merge overlapping BED intervals."""
522
intervals = []
523
524
with pysam.TabixFile(bed_file, parser=pysam.asBed()) as bedfile:
525
for record in bedfile.fetch():
526
intervals.append((record.contig, record.start, record.end, record.name))
527
528
# Sort by chromosome and position
529
intervals.sort(key=lambda x: (x[0], x[1]))
530
531
# Merge overlapping intervals
532
merged = []
533
for chrom, start, end, name in intervals:
534
if merged and merged[-1][0] == chrom and merged[-1][2] >= start:
535
# Overlapping - extend previous interval
536
merged[-1] = (chrom, merged[-1][1], max(merged[-1][2], end),
537
merged[-1][3] + "," + name)
538
else:
539
# Non-overlapping - add new interval
540
merged.append((chrom, start, end, name))
541
542
# Write merged intervals
543
with open(output_file, 'w') as outfile:
544
for chrom, start, end, name in merged:
545
outfile.write(f"{chrom}\t{start}\t{end}\t{name}\n")
546
547
# Usage
548
extract_genes("gencode.gtf.gz", "protein_coding_genes.txt")
549
merge_overlapping_intervals("peaks.bed.gz", "merged_peaks.bed")
550
```
551
552
### Multi-format Analysis
553
554
```python
555
import pysam
556
from collections import defaultdict
557
558
def intersect_features(bed_file, gtf_file, feature_type="exon"):
559
"""Find BED intervals that overlap with GTF features."""
560
561
# Load BED intervals by chromosome
562
bed_intervals = defaultdict(list)
563
with pysam.TabixFile(bed_file, parser=pysam.asBed()) as bedfile:
564
for record in bedfile.fetch():
565
bed_intervals[record.contig].append((record.start, record.end, record.name))
566
567
# Check overlaps with GTF features
568
overlaps = []
569
with pysam.TabixFile(gtf_file, parser=pysam.asGTF()) as gtffile:
570
for chrom in bed_intervals:
571
for record in gtffile.fetch(chrom):
572
if record.feature == feature_type:
573
gtf_start, gtf_end = record.start - 1, record.end # Convert to 0-based
574
575
# Check each BED interval for overlap
576
for bed_start, bed_end, bed_name in bed_intervals[chrom]:
577
if bed_start < gtf_end and bed_end > gtf_start:
578
# Calculate overlap
579
overlap_start = max(bed_start, gtf_start)
580
overlap_end = min(bed_end, gtf_end)
581
overlap_len = overlap_end - overlap_start
582
583
attrs = record.asDict()
584
gene_name = attrs.get("gene_name", ["Unknown"])[0]
585
586
overlaps.append({
587
'bed_name': bed_name,
588
'gene_name': gene_name,
589
'chrom': chrom,
590
'overlap_start': overlap_start,
591
'overlap_end': overlap_end,
592
'overlap_length': overlap_len
593
})
594
595
return overlaps
596
597
# Find peaks overlapping exons
598
overlaps = intersect_features("chip_peaks.bed.gz", "gencode.gtf.gz", "exon")
599
for overlap in overlaps[:10]: # Show first 10
600
print(f"Peak {overlap['bed_name']} overlaps {overlap['gene_name']} "
601
f"by {overlap['overlap_length']} bp")
602
```