Tessl Tile for pypi/pysam@0.23.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

alignment-files.md bgzf-files.md command-tools.md index.md sequence-files.md tabix-files.md utilities.md variant-files.md

tabix-files.mddocs/

0
# Tabix-Indexed Files
1

2
Support for compressed, indexed genomic files using tabix indexing. Enables efficient random access to large compressed files in various formats including BED, GFF, GTF, and VCF.
3

4
## Capabilities
5

6
### TabixFile
7

8
Main interface for accessing tabix-indexed compressed files with format-specific parsers.
9

10
```python { .api }
11
class TabixFile:
12
    def __init__(self, filename, parser=None, encoding="ascii", duplicate_filehandle=True):
13
        """
14
        Open tabix-indexed file.
15
        
16
        Parameters:
17
        - filename: str, path to tabix-indexed file (.gz)
18
        - parser: class, parser for file format (asTuple, asBed, asGTF, etc.)
19
        - encoding: str, text encoding
20
        - duplicate_filehandle: bool, allow multiple handles
21
        
22
        Returns:
23
        TabixFile object
24
        """
25
    
26
    def fetch(self, reference=None, start=None, end=None, region=None, parser=None, multiple_iterators=False):
27
        """
28
        Fetch records from region.
29
        
30
        Parameters:
31
        - reference: str, chromosome/contig name
32
        - start: int, start position (0-based)
33
        - end: int, end position (0-based)
34
        - region: str, region string (chr:start-end)
35
        - parser: class, override default parser
36
        - multiple_iterators: bool, allow concurrent iterators
37
        
38
        Returns:
39
        Iterator of parsed records
40
        """
41
    
42
    def close(self):
43
        """Close the file."""
44
    
45
    # Properties
46
    @property
47
    def filename(self) -> str:
48
        """File name."""
49
    
50
    @property
51
    def header(self) -> list:
52
        """Header lines."""
53
    
54
    @property
55
    def contigs(self) -> list:
56
        """Available contigs/chromosomes."""
57
    
58
    def __enter__(self):
59
        """Context manager entry."""
60
    
61
    def __exit__(self, exc_type, exc_val, exc_tb):
62
        """Context manager exit."""
63
```
64

65
### Parser Classes
66

67
Format-specific parsers for different genomic file types.
68

69
```python { .api }
70
class asTuple:
71
    """Parse records as tuples of strings."""
72
    def __call__(self, line):
73
        """
74
        Parse line to tuple.
75
        
76
        Returns:
77
        tuple, fields as strings
78
        """
79

80
class asBed:
81
    """Parse BED format records."""
82
    def __call__(self, line):
83
        """
84
        Parse BED line.
85
        
86
        Returns:
87
        BedProxy object
88
        """
89

90
class asGTF:
91
    """Parse GTF format records."""
92
    def __call__(self, line):
93
        """
94
        Parse GTF line.
95
        
96
        Returns:
97
        GTFProxy object
98
        """
99

100
class asGFF3:
101
    """Parse GFF3 format records."""
102
    def __call__(self, line):
103
        """
104
        Parse GFF3 line.
105
        
106
        Returns:
107
        GFF3Proxy object
108
        """
109

110
class asVCF:
111
    """Parse VCF format records."""
112
    def __call__(self, line):
113
        """
114
        Parse VCF line.
115
        
116
        Returns:
117
        VCFProxy object
118
        """
119
```
120

121
### Proxy Classes
122

123
Efficient access to parsed record fields without copying data.
124

125
```python { .api }
126
class TupleProxy:
127
    """Fast read-only access to parsed records as tuples."""
128
    
129
    def __getitem__(self, index):
130
        """
131
        Get field by index.
132
        
133
        Parameters:
134
        - index: int, field index
135
        
136
        Returns:
137
        str, field value
138
        """
139
    
140
    def __len__(self):
141
        """Number of fields."""
142
    
143
    def __str__(self):
144
        """String representation."""
145

146
class BedProxy(TupleProxy):
147
    """BED format record with named field access."""
148
    
149
    @property
150
    def contig(self) -> str:
151
        """Chromosome name."""
152
    
153
    @property
154
    def start(self) -> int:
155
        """Start position (0-based)."""
156
    
157
    @property
158
    def end(self) -> int:
159
        """End position (0-based)."""
160
    
161
    @property
162
    def name(self) -> str:
163
        """Feature name."""
164
    
165
    @property
166
    def score(self) -> float:
167
        """Score value."""
168
    
169
    @property
170
    def strand(self) -> str:
171
        """Strand ('+', '-', or '.')."""
172
    
173
    @property
174
    def thickstart(self) -> int:
175
        """Thick start position."""
176
    
177
    @property
178
    def thickend(self) -> int:
179
        """Thick end position."""
180
    
181
    @property
182
    def itemrgb(self) -> str:
183
        """RGB color value."""
184
    
185
    @property
186
    def blockcount(self) -> int:
187
        """Number of blocks."""
188
    
189
    @property
190
    def blocksizes(self) -> list:
191
        """Block sizes."""
192
    
193
    @property
194
    def blockstarts(self) -> list:
195
        """Block start positions."""
196

197
class GTFProxy(TupleProxy):
198
    """GTF format record with named field access."""
199
    
200
    @property
201
    def contig(self) -> str:
202
        """Chromosome name."""
203
    
204
    @property
205
    def source(self) -> str:
206
        """Annotation source."""
207
    
208
    @property
209
    def feature(self) -> str:
210
        """Feature type."""
211
    
212
    @property
213
    def start(self) -> int:
214
        """Start position (1-based)."""
215
    
216
    @property
217
    def end(self) -> int:
218
        """End position (1-based)."""
219
    
220
    @property
221
    def score(self) -> str:
222
        """Score value."""
223
    
224
    @property
225
    def strand(self) -> str:
226
        """Strand ('+', '-', or '.')."""
227
    
228
    @property
229
    def frame(self) -> str:
230
        """Reading frame."""
231
    
232
    @property
233
    def attributes(self) -> str:
234
        """Attributes string."""
235
    
236
    def asDict(self):
237
        """
238
        Parse attributes as dictionary.
239
        
240
        Returns:
241
        dict, attribute key-value pairs
242
        """
243

244
class GFF3Proxy(TupleProxy):
245
    """GFF3 format record with named field access."""
246
    
247
    @property
248
    def contig(self) -> str:
249
        """Chromosome name."""
250
    
251
    @property
252
    def source(self) -> str:
253
        """Annotation source."""
254
    
255
    @property
256
    def feature(self) -> str:
257
        """Feature type."""
258
    
259
    @property
260
    def start(self) -> int:
261
        """Start position (1-based)."""
262
    
263
    @property
264
    def end(self) -> int:
265
        """End position (1-based)."""
266
    
267
    @property
268
    def score(self) -> str:
269
        """Score value."""
270
    
271
    @property
272
    def strand(self) -> str:
273
        """Strand ('+', '-', or '.')."""
274
    
275
    @property
276
    def frame(self) -> str:
277
        """Reading frame."""
278
    
279
    @property
280
    def attributes(self) -> str:
281
        """Attributes string."""
282
    
283
    def asDict(self):
284
        """
285
        Parse attributes as dictionary.
286
        
287
        Returns:
288
        dict, attribute key-value pairs
289
        """
290

291
class VCFProxy(TupleProxy):
292
    """VCF format record with named field access."""
293
    
294
    @property
295
    def contig(self) -> str:
296
        """Chromosome name."""
297
    
298
    @property
299
    def pos(self) -> int:
300
        """Position (1-based)."""
301
    
302
    @property
303
    def id(self) -> str:
304
        """Variant identifier."""
305
    
306
    @property
307
    def ref(self) -> str:
308
        """Reference allele."""
309
    
310
    @property
311
    def alt(self) -> str:
312
        """Alternate alleles."""
313
    
314
    @property
315
    def qual(self) -> str:
316
        """Quality score."""
317
    
318
    @property
319
    def filter(self) -> str:
320
        """Filter status."""
321
    
322
    @property
323
    def info(self) -> str:
324
        """Info fields."""
325
    
326
    @property
327
    def format(self) -> str:
328
        """Format specification."""
329
```
330

331
### Utility Functions
332

333
Functions for creating and manipulating tabix-indexed files.
334

335
```python { .api }
336
def tabix_index(filename, preset=None, seq_col=None, start_col=None, end_col=None, line_skip=0, zerobased=False, **kwargs):
337
    """
338
    Create tabix index for a file.
339
    
340
    Parameters:
341
    - filename: str, file to index
342
    - preset: str, format preset ('gff', 'bed', 'sam', 'vcf')
343
    - seq_col: int, sequence/chromosome column (1-based)
344
    - start_col: int, start position column (1-based)
345
    - end_col: int, end position column (1-based)
346
    - line_skip: int, number of header lines to skip
347
    - zerobased: bool, coordinates are 0-based
348
    
349
    Returns:
350
    str, index filename
351
    """
352

353
def tabix_compress(filename_in, filename_out=None, force=False, **kwargs):
354
    """
355
    Compress file with bgzip.
356
    
357
    Parameters:
358
    - filename_in: str, input file
359
    - filename_out: str, output file (default: filename_in + .gz)
360
    - force: bool, overwrite existing file
361
    
362
    Returns:
363
    str, compressed filename
364
    """
365

366
def tabix_iterator(infile, parser=None):
367
    """
368
    Create iterator for unindexed files.
369
    
370
    Parameters:
371
    - infile: file object or str, input file
372
    - parser: class, record parser
373
    
374
    Returns:
375
    Iterator of parsed records
376
    """
377

378
def tabix_generic_iterator(infile, parser=None):
379
    """
380
    Generic file iterator.
381
    
382
    Parameters:
383
    - infile: file object, input file
384
    - parser: class, record parser
385
    
386
    Returns:
387
    Iterator of parsed records
388
    """
389

390
def tabix_file_iterator(infile, parser=None):
391
    """
392
    Compressed file iterator.
393
    
394
    Parameters:
395
    - infile: str, compressed file path
396
    - parser: class, record parser
397
    
398
    Returns:
399
    Iterator of parsed records
400
    """
401
```
402

403
### Iterator Classes
404

405
Specialized iterators for different file access patterns.
406

407
```python { .api }
408
class GZIterator:
409
    """Iterator for gzip-compressed files."""
410
    
411
    def __init__(self, filename, parser=None):
412
        """
413
        Create gzip iterator.
414
        
415
        Parameters:
416
        - filename: str, gzip file path
417
        - parser: class, record parser
418
        """
419
    
420
    def __iter__(self):
421
        """Iterate over records."""
422
    
423
    def __next__(self):
424
        """Get next record."""
425

426
class GZIteratorHead:
427
    """Iterator for headers in gzip files."""
428
    
429
    def __init__(self, filename, parser=None):
430
        """
431
        Create header iterator.
432
        
433
        Parameters:
434
        - filename: str, gzip file path
435
        - parser: class, record parser
436
        """
437
    
438
    def __iter__(self):
439
        """Iterate over header lines."""
440
```
441

442
### Legacy Compatibility
443

444
```python { .api }
445
class Tabixfile:
446
    """Legacy alias for TabixFile."""
447
    def __init__(self, *args, **kwargs):
448
        """Same as TabixFile.__init__."""
449
```
450

451
## Usage Examples
452

453
### Basic File Access
454

455
```python
456
import pysam
457

458
# Open tabix-indexed BED file
459
with pysam.TabixFile("annotations.bed.gz", parser=pysam.asBed()) as tabixfile:
460
    # Fetch features in region
461
    for record in tabixfile.fetch("chr1", 1000, 2000):
462
        print(f"{record.contig}:{record.start}-{record.end} {record.name}")
463
        print(f"  Score: {record.score}, Strand: {record.strand}")
464

465
# Open GTF file with GTF parser
466
with pysam.TabixFile("genes.gtf.gz", parser=pysam.asGTF()) as tabixfile:
467
    for record in tabixfile.fetch("chr1", 1000000, 2000000):
468
        if record.feature == "gene":
469
            attrs = record.asDict()
470
            gene_name = attrs.get("gene_name", ["Unknown"])[0]
471
            print(f"Gene: {gene_name} at {record.start}-{record.end}")
472
```
473

474
### Creating Tabix Indexes
475

476
```python
477
import pysam
478

479
# Index a BED file
480
pysam.tabix_compress("features.bed", "features.bed.gz")
481
pysam.tabix_index("features.bed.gz", preset="bed")
482

483
# Index a custom format file
484
pysam.tabix_compress("custom.txt", "custom.txt.gz")
485
pysam.tabix_index("custom.txt.gz", 
486
                  seq_col=1,     # chromosome in column 1
487
                  start_col=2,   # start in column 2  
488
                  end_col=3,     # end in column 3
489
                  line_skip=1)   # skip header line
490

491
# Index VCF file
492
pysam.tabix_compress("variants.vcf", "variants.vcf.gz")
493
pysam.tabix_index("variants.vcf.gz", preset="vcf")
494
```
495

496
### Format-Specific Processing
497

498
```python
499
import pysam
500

501
# Process GTF annotations
502
def extract_genes(gtf_file, output_file, gene_type="protein_coding"):
503
    with pysam.TabixFile(gtf_file, parser=pysam.asGTF()) as gtffile:
504
        with open(output_file, 'w') as outfile:
505
            outfile.write("gene_id\tgene_name\tchrom\tstart\tend\tstrand\n")
506
            
507
            for record in gtffile.fetch():
508
                if record.feature == "gene":
509
                    attrs = record.asDict()
510
                    
511
                    # Check gene type
512
                    if gene_type in attrs.get("gene_type", []):
513
                        gene_id = attrs.get("gene_id", [""])[0]
514
                        gene_name = attrs.get("gene_name", [""])[0]
515
                        
516
                        outfile.write(f"{gene_id}\t{gene_name}\t{record.contig}\t"
517
                                    f"{record.start}\t{record.end}\t{record.strand}\n")
518

519
# Process BED intervals
520
def merge_overlapping_intervals(bed_file, output_file):
521
    """Merge overlapping BED intervals."""
522
    intervals = []
523
    
524
    with pysam.TabixFile(bed_file, parser=pysam.asBed()) as bedfile:
525
        for record in bedfile.fetch():
526
            intervals.append((record.contig, record.start, record.end, record.name))
527
    
528
    # Sort by chromosome and position
529
    intervals.sort(key=lambda x: (x[0], x[1]))
530
    
531
    # Merge overlapping intervals
532
    merged = []
533
    for chrom, start, end, name in intervals:
534
        if merged and merged[-1][0] == chrom and merged[-1][2] >= start:
535
            # Overlapping - extend previous interval
536
            merged[-1] = (chrom, merged[-1][1], max(merged[-1][2], end), 
537
                         merged[-1][3] + "," + name)
538
        else:
539
            # Non-overlapping - add new interval
540
            merged.append((chrom, start, end, name))
541
    
542
    # Write merged intervals
543
    with open(output_file, 'w') as outfile:
544
        for chrom, start, end, name in merged:
545
            outfile.write(f"{chrom}\t{start}\t{end}\t{name}\n")
546

547
# Usage
548
extract_genes("gencode.gtf.gz", "protein_coding_genes.txt")
549
merge_overlapping_intervals("peaks.bed.gz", "merged_peaks.bed")
550
```
551

552
### Multi-format Analysis
553

554
```python
555
import pysam
556
from collections import defaultdict
557

558
def intersect_features(bed_file, gtf_file, feature_type="exon"):
559
    """Find BED intervals that overlap with GTF features."""
560
    
561
    # Load BED intervals by chromosome
562
    bed_intervals = defaultdict(list)
563
    with pysam.TabixFile(bed_file, parser=pysam.asBed()) as bedfile:
564
        for record in bedfile.fetch():
565
            bed_intervals[record.contig].append((record.start, record.end, record.name))
566
    
567
    # Check overlaps with GTF features
568
    overlaps = []
569
    with pysam.TabixFile(gtf_file, parser=pysam.asGTF()) as gtffile:
570
        for chrom in bed_intervals:
571
            for record in gtffile.fetch(chrom):
572
                if record.feature == feature_type:
573
                    gtf_start, gtf_end = record.start - 1, record.end  # Convert to 0-based
574
                    
575
                    # Check each BED interval for overlap
576
                    for bed_start, bed_end, bed_name in bed_intervals[chrom]:
577
                        if bed_start < gtf_end and bed_end > gtf_start:
578
                            # Calculate overlap
579
                            overlap_start = max(bed_start, gtf_start)
580
                            overlap_end = min(bed_end, gtf_end)
581
                            overlap_len = overlap_end - overlap_start
582
                            
583
                            attrs = record.asDict()
584
                            gene_name = attrs.get("gene_name", ["Unknown"])[0]
585
                            
586
                            overlaps.append({
587
                                'bed_name': bed_name,
588
                                'gene_name': gene_name,
589
                                'chrom': chrom,
590
                                'overlap_start': overlap_start,
591
                                'overlap_end': overlap_end,
592
                                'overlap_length': overlap_len
593
                            })
594
    
595
    return overlaps
596

597
# Find peaks overlapping exons
598
overlaps = intersect_features("chip_peaks.bed.gz", "gencode.gtf.gz", "exon")
599
for overlap in overlaps[:10]:  # Show first 10
600
    print(f"Peak {overlap['bed_name']} overlaps {overlap['gene_name']} "
601
          f"by {overlap['overlap_length']} bp")
602
```

Version

Tile

Files

tabix-files.md.css-3qkkll{font-size:var(--chakra-font-sizes-sm);font-weight:var(--chakra-font-weights-normal);color:var(--chakra-colors-gray-300);}docs/

tabix-files.mddocs/