or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

alignment-files.mdbgzf-files.mdcommand-tools.mdindex.mdsequence-files.mdtabix-files.mdutilities.mdvariant-files.md

tabix-files.mddocs/

0

# Tabix-Indexed Files

1

2

Support for compressed, indexed genomic files using tabix indexing. Enables efficient random access to large compressed files in various formats including BED, GFF, GTF, and VCF.

3

4

## Capabilities

5

6

### TabixFile

7

8

Main interface for accessing tabix-indexed compressed files with format-specific parsers.

9

10

```python { .api }

11

class TabixFile:

12

def __init__(self, filename, parser=None, encoding="ascii", duplicate_filehandle=True):

13

"""

14

Open tabix-indexed file.

15

16

Parameters:

17

- filename: str, path to tabix-indexed file (.gz)

18

- parser: class, parser for file format (asTuple, asBed, asGTF, etc.)

19

- encoding: str, text encoding

20

- duplicate_filehandle: bool, allow multiple handles

21

22

Returns:

23

TabixFile object

24

"""

25

26

def fetch(self, reference=None, start=None, end=None, region=None, parser=None, multiple_iterators=False):

27

"""

28

Fetch records from region.

29

30

Parameters:

31

- reference: str, chromosome/contig name

32

- start: int, start position (0-based)

33

- end: int, end position (0-based)

34

- region: str, region string (chr:start-end)

35

- parser: class, override default parser

36

- multiple_iterators: bool, allow concurrent iterators

37

38

Returns:

39

Iterator of parsed records

40

"""

41

42

def close(self):

43

"""Close the file."""

44

45

# Properties

46

@property

47

def filename(self) -> str:

48

"""File name."""

49

50

@property

51

def header(self) -> list:

52

"""Header lines."""

53

54

@property

55

def contigs(self) -> list:

56

"""Available contigs/chromosomes."""

57

58

def __enter__(self):

59

"""Context manager entry."""

60

61

def __exit__(self, exc_type, exc_val, exc_tb):

62

"""Context manager exit."""

63

```

64

65

### Parser Classes

66

67

Format-specific parsers for different genomic file types.

68

69

```python { .api }

70

class asTuple:

71

"""Parse records as tuples of strings."""

72

def __call__(self, line):

73

"""

74

Parse line to tuple.

75

76

Returns:

77

tuple, fields as strings

78

"""

79

80

class asBed:

81

"""Parse BED format records."""

82

def __call__(self, line):

83

"""

84

Parse BED line.

85

86

Returns:

87

BedProxy object

88

"""

89

90

class asGTF:

91

"""Parse GTF format records."""

92

def __call__(self, line):

93

"""

94

Parse GTF line.

95

96

Returns:

97

GTFProxy object

98

"""

99

100

class asGFF3:

101

"""Parse GFF3 format records."""

102

def __call__(self, line):

103

"""

104

Parse GFF3 line.

105

106

Returns:

107

GFF3Proxy object

108

"""

109

110

class asVCF:

111

"""Parse VCF format records."""

112

def __call__(self, line):

113

"""

114

Parse VCF line.

115

116

Returns:

117

VCFProxy object

118

"""

119

```

120

121

### Proxy Classes

122

123

Efficient access to parsed record fields without copying data.

124

125

```python { .api }

126

class TupleProxy:

127

"""Fast read-only access to parsed records as tuples."""

128

129

def __getitem__(self, index):

130

"""

131

Get field by index.

132

133

Parameters:

134

- index: int, field index

135

136

Returns:

137

str, field value

138

"""

139

140

def __len__(self):

141

"""Number of fields."""

142

143

def __str__(self):

144

"""String representation."""

145

146

class BedProxy(TupleProxy):

147

"""BED format record with named field access."""

148

149

@property

150

def contig(self) -> str:

151

"""Chromosome name."""

152

153

@property

154

def start(self) -> int:

155

"""Start position (0-based)."""

156

157

@property

158

def end(self) -> int:

159

"""End position (0-based)."""

160

161

@property

162

def name(self) -> str:

163

"""Feature name."""

164

165

@property

166

def score(self) -> float:

167

"""Score value."""

168

169

@property

170

def strand(self) -> str:

171

"""Strand ('+', '-', or '.')."""

172

173

@property

174

def thickstart(self) -> int:

175

"""Thick start position."""

176

177

@property

178

def thickend(self) -> int:

179

"""Thick end position."""

180

181

@property

182

def itemrgb(self) -> str:

183

"""RGB color value."""

184

185

@property

186

def blockcount(self) -> int:

187

"""Number of blocks."""

188

189

@property

190

def blocksizes(self) -> list:

191

"""Block sizes."""

192

193

@property

194

def blockstarts(self) -> list:

195

"""Block start positions."""

196

197

class GTFProxy(TupleProxy):

198

"""GTF format record with named field access."""

199

200

@property

201

def contig(self) -> str:

202

"""Chromosome name."""

203

204

@property

205

def source(self) -> str:

206

"""Annotation source."""

207

208

@property

209

def feature(self) -> str:

210

"""Feature type."""

211

212

@property

213

def start(self) -> int:

214

"""Start position (1-based)."""

215

216

@property

217

def end(self) -> int:

218

"""End position (1-based)."""

219

220

@property

221

def score(self) -> str:

222

"""Score value."""

223

224

@property

225

def strand(self) -> str:

226

"""Strand ('+', '-', or '.')."""

227

228

@property

229

def frame(self) -> str:

230

"""Reading frame."""

231

232

@property

233

def attributes(self) -> str:

234

"""Attributes string."""

235

236

def asDict(self):

237

"""

238

Parse attributes as dictionary.

239

240

Returns:

241

dict, attribute key-value pairs

242

"""

243

244

class GFF3Proxy(TupleProxy):

245

"""GFF3 format record with named field access."""

246

247

@property

248

def contig(self) -> str:

249

"""Chromosome name."""

250

251

@property

252

def source(self) -> str:

253

"""Annotation source."""

254

255

@property

256

def feature(self) -> str:

257

"""Feature type."""

258

259

@property

260

def start(self) -> int:

261

"""Start position (1-based)."""

262

263

@property

264

def end(self) -> int:

265

"""End position (1-based)."""

266

267

@property

268

def score(self) -> str:

269

"""Score value."""

270

271

@property

272

def strand(self) -> str:

273

"""Strand ('+', '-', or '.')."""

274

275

@property

276

def frame(self) -> str:

277

"""Reading frame."""

278

279

@property

280

def attributes(self) -> str:

281

"""Attributes string."""

282

283

def asDict(self):

284

"""

285

Parse attributes as dictionary.

286

287

Returns:

288

dict, attribute key-value pairs

289

"""

290

291

class VCFProxy(TupleProxy):

292

"""VCF format record with named field access."""

293

294

@property

295

def contig(self) -> str:

296

"""Chromosome name."""

297

298

@property

299

def pos(self) -> int:

300

"""Position (1-based)."""

301

302

@property

303

def id(self) -> str:

304

"""Variant identifier."""

305

306

@property

307

def ref(self) -> str:

308

"""Reference allele."""

309

310

@property

311

def alt(self) -> str:

312

"""Alternate alleles."""

313

314

@property

315

def qual(self) -> str:

316

"""Quality score."""

317

318

@property

319

def filter(self) -> str:

320

"""Filter status."""

321

322

@property

323

def info(self) -> str:

324

"""Info fields."""

325

326

@property

327

def format(self) -> str:

328

"""Format specification."""

329

```

330

331

### Utility Functions

332

333

Functions for creating and manipulating tabix-indexed files.

334

335

```python { .api }

336

def tabix_index(filename, preset=None, seq_col=None, start_col=None, end_col=None, line_skip=0, zerobased=False, **kwargs):

337

"""

338

Create tabix index for a file.

339

340

Parameters:

341

- filename: str, file to index

342

- preset: str, format preset ('gff', 'bed', 'sam', 'vcf')

343

- seq_col: int, sequence/chromosome column (1-based)

344

- start_col: int, start position column (1-based)

345

- end_col: int, end position column (1-based)

346

- line_skip: int, number of header lines to skip

347

- zerobased: bool, coordinates are 0-based

348

349

Returns:

350

str, index filename

351

"""

352

353

def tabix_compress(filename_in, filename_out=None, force=False, **kwargs):

354

"""

355

Compress file with bgzip.

356

357

Parameters:

358

- filename_in: str, input file

359

- filename_out: str, output file (default: filename_in + .gz)

360

- force: bool, overwrite existing file

361

362

Returns:

363

str, compressed filename

364

"""

365

366

def tabix_iterator(infile, parser=None):

367

"""

368

Create iterator for unindexed files.

369

370

Parameters:

371

- infile: file object or str, input file

372

- parser: class, record parser

373

374

Returns:

375

Iterator of parsed records

376

"""

377

378

def tabix_generic_iterator(infile, parser=None):

379

"""

380

Generic file iterator.

381

382

Parameters:

383

- infile: file object, input file

384

- parser: class, record parser

385

386

Returns:

387

Iterator of parsed records

388

"""

389

390

def tabix_file_iterator(infile, parser=None):

391

"""

392

Compressed file iterator.

393

394

Parameters:

395

- infile: str, compressed file path

396

- parser: class, record parser

397

398

Returns:

399

Iterator of parsed records

400

"""

401

```

402

403

### Iterator Classes

404

405

Specialized iterators for different file access patterns.

406

407

```python { .api }

408

class GZIterator:

409

"""Iterator for gzip-compressed files."""

410

411

def __init__(self, filename, parser=None):

412

"""

413

Create gzip iterator.

414

415

Parameters:

416

- filename: str, gzip file path

417

- parser: class, record parser

418

"""

419

420

def __iter__(self):

421

"""Iterate over records."""

422

423

def __next__(self):

424

"""Get next record."""

425

426

class GZIteratorHead:

427

"""Iterator for headers in gzip files."""

428

429

def __init__(self, filename, parser=None):

430

"""

431

Create header iterator.

432

433

Parameters:

434

- filename: str, gzip file path

435

- parser: class, record parser

436

"""

437

438

def __iter__(self):

439

"""Iterate over header lines."""

440

```

441

442

### Legacy Compatibility

443

444

```python { .api }

445

class Tabixfile:

446

"""Legacy alias for TabixFile."""

447

def __init__(self, *args, **kwargs):

448

"""Same as TabixFile.__init__."""

449

```

450

451

## Usage Examples

452

453

### Basic File Access

454

455

```python

456

import pysam

457

458

# Open tabix-indexed BED file

459

with pysam.TabixFile("annotations.bed.gz", parser=pysam.asBed()) as tabixfile:

460

# Fetch features in region

461

for record in tabixfile.fetch("chr1", 1000, 2000):

462

print(f"{record.contig}:{record.start}-{record.end} {record.name}")

463

print(f" Score: {record.score}, Strand: {record.strand}")

464

465

# Open GTF file with GTF parser

466

with pysam.TabixFile("genes.gtf.gz", parser=pysam.asGTF()) as tabixfile:

467

for record in tabixfile.fetch("chr1", 1000000, 2000000):

468

if record.feature == "gene":

469

attrs = record.asDict()

470

gene_name = attrs.get("gene_name", ["Unknown"])[0]

471

print(f"Gene: {gene_name} at {record.start}-{record.end}")

472

```

473

474

### Creating Tabix Indexes

475

476

```python

477

import pysam

478

479

# Index a BED file

480

pysam.tabix_compress("features.bed", "features.bed.gz")

481

pysam.tabix_index("features.bed.gz", preset="bed")

482

483

# Index a custom format file

484

pysam.tabix_compress("custom.txt", "custom.txt.gz")

485

pysam.tabix_index("custom.txt.gz",

486

seq_col=1, # chromosome in column 1

487

start_col=2, # start in column 2

488

end_col=3, # end in column 3

489

line_skip=1) # skip header line

490

491

# Index VCF file

492

pysam.tabix_compress("variants.vcf", "variants.vcf.gz")

493

pysam.tabix_index("variants.vcf.gz", preset="vcf")

494

```

495

496

### Format-Specific Processing

497

498

```python

499

import pysam

500

501

# Process GTF annotations

502

def extract_genes(gtf_file, output_file, gene_type="protein_coding"):

503

with pysam.TabixFile(gtf_file, parser=pysam.asGTF()) as gtffile:

504

with open(output_file, 'w') as outfile:

505

outfile.write("gene_id\tgene_name\tchrom\tstart\tend\tstrand\n")

506

507

for record in gtffile.fetch():

508

if record.feature == "gene":

509

attrs = record.asDict()

510

511

# Check gene type

512

if gene_type in attrs.get("gene_type", []):

513

gene_id = attrs.get("gene_id", [""])[0]

514

gene_name = attrs.get("gene_name", [""])[0]

515

516

outfile.write(f"{gene_id}\t{gene_name}\t{record.contig}\t"

517

f"{record.start}\t{record.end}\t{record.strand}\n")

518

519

# Process BED intervals

520

def merge_overlapping_intervals(bed_file, output_file):

521

"""Merge overlapping BED intervals."""

522

intervals = []

523

524

with pysam.TabixFile(bed_file, parser=pysam.asBed()) as bedfile:

525

for record in bedfile.fetch():

526

intervals.append((record.contig, record.start, record.end, record.name))

527

528

# Sort by chromosome and position

529

intervals.sort(key=lambda x: (x[0], x[1]))

530

531

# Merge overlapping intervals

532

merged = []

533

for chrom, start, end, name in intervals:

534

if merged and merged[-1][0] == chrom and merged[-1][2] >= start:

535

# Overlapping - extend previous interval

536

merged[-1] = (chrom, merged[-1][1], max(merged[-1][2], end),

537

merged[-1][3] + "," + name)

538

else:

539

# Non-overlapping - add new interval

540

merged.append((chrom, start, end, name))

541

542

# Write merged intervals

543

with open(output_file, 'w') as outfile:

544

for chrom, start, end, name in merged:

545

outfile.write(f"{chrom}\t{start}\t{end}\t{name}\n")

546

547

# Usage

548

extract_genes("gencode.gtf.gz", "protein_coding_genes.txt")

549

merge_overlapping_intervals("peaks.bed.gz", "merged_peaks.bed")

550

```

551

552

### Multi-format Analysis

553

554

```python

555

import pysam

556

from collections import defaultdict

557

558

def intersect_features(bed_file, gtf_file, feature_type="exon"):

559

"""Find BED intervals that overlap with GTF features."""

560

561

# Load BED intervals by chromosome

562

bed_intervals = defaultdict(list)

563

with pysam.TabixFile(bed_file, parser=pysam.asBed()) as bedfile:

564

for record in bedfile.fetch():

565

bed_intervals[record.contig].append((record.start, record.end, record.name))

566

567

# Check overlaps with GTF features

568

overlaps = []

569

with pysam.TabixFile(gtf_file, parser=pysam.asGTF()) as gtffile:

570

for chrom in bed_intervals:

571

for record in gtffile.fetch(chrom):

572

if record.feature == feature_type:

573

gtf_start, gtf_end = record.start - 1, record.end # Convert to 0-based

574

575

# Check each BED interval for overlap

576

for bed_start, bed_end, bed_name in bed_intervals[chrom]:

577

if bed_start < gtf_end and bed_end > gtf_start:

578

# Calculate overlap

579

overlap_start = max(bed_start, gtf_start)

580

overlap_end = min(bed_end, gtf_end)

581

overlap_len = overlap_end - overlap_start

582

583

attrs = record.asDict()

584

gene_name = attrs.get("gene_name", ["Unknown"])[0]

585

586

overlaps.append({

587

'bed_name': bed_name,

588

'gene_name': gene_name,

589

'chrom': chrom,

590

'overlap_start': overlap_start,

591

'overlap_end': overlap_end,

592

'overlap_length': overlap_len

593

})

594

595

return overlaps

596

597

# Find peaks overlapping exons

598

overlaps = intersect_features("chip_peaks.bed.gz", "gencode.gtf.gz", "exon")

599

for overlap in overlaps[:10]: # Show first 10

600

print(f"Peak {overlap['bed_name']} overlaps {overlap['gene_name']} "

601

f"by {overlap['overlap_length']} bp")

602

```