or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

alignment-files.mdbgzf-files.mdcommand-tools.mdindex.mdsequence-files.mdtabix-files.mdutilities.mdvariant-files.md

alignment-files.mddocs/

0

# SAM/BAM/CRAM Alignment Files

1

2

Comprehensive support for reading and writing sequence alignment files in SAM, BAM, and CRAM formats. Provides random access, indexing, pileup analysis, and full metadata handling.

3

4

## Capabilities

5

6

### AlignmentFile

7

8

Main interface for reading and writing alignment files with support for all SAM/BAM/CRAM formats.

9

10

```python { .api }

11

class AlignmentFile:

12

def __init__(self, filepath_or_object, mode=None, template=None, reference_names=None, reference_lengths=None, text=None, header=None, add_sq_text=False, check_header=True, check_sq=True, reference_filename=None, filename=None, index_filename=None, filepath_index=None, require_index=False, duplicate_filehandle=True, ignore_truncation=False, threads=1):

13

"""

14

Open an alignment file for reading or writing.

15

16

Parameters:

17

- filepath_or_object: str or file-like, path to file or file object

18

- mode: str, file mode ('r', 'w', 'rb', 'wb', 'rc', 'wc')

19

- template: AlignmentFile, template for header when writing

20

- reference_names: list, sequence names for header

21

- reference_lengths: list, sequence lengths for header

22

- text: str, SAM header text

23

- header: AlignmentHeader, header object

24

- check_header: bool, validate file header

25

- check_sq: bool, validate sequence dictionary

26

- reference_filename: str, reference FASTA file

27

- index_filename: str, index file path

28

- require_index: bool, require index file

29

- threads: int, number of threads for compression

30

31

Returns:

32

AlignmentFile object

33

"""

34

35

def fetch(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, multiple_iterators=False, until_eof=False):

36

"""

37

Fetch reads from a region.

38

39

Parameters:

40

- contig: str, chromosome/contig name

41

- start: int, 0-based start position

42

- stop: int, 0-based stop position

43

- region: str, region string (chr:start-stop)

44

- multiple_iterators: bool, allow concurrent iterators

45

- until_eof: bool, read until end of file

46

47

Returns:

48

Iterator of AlignedSegment objects

49

"""

50

51

def pileup(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, **kwargs):

52

"""

53

Create pileup of reads covering each position.

54

55

Parameters:

56

- contig: str, chromosome/contig name

57

- start: int, 0-based start position

58

- stop: int, 0-based stop position

59

- stepper: str, pileup algorithm ('all', 'nofilter', 'samtools')

60

- fastafile: FastaFile, reference sequences

61

- mask: int, ignore reads with these flags

62

- max_depth: int, maximum read depth

63

- truncate: bool, truncate to region boundaries

64

- min_base_quality: int, minimum base quality

65

66

Returns:

67

Iterator of PileupColumn objects

68

"""

69

70

def count(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, read_callback=None, until_eof=False):

71

"""

72

Count reads in a region.

73

74

Returns:

75

int, number of reads

76

"""

77

78

def head(self, n):

79

"""

80

Return first n reads.

81

82

Returns:

83

Iterator of AlignedSegment objects

84

"""

85

86

def mate(self, read):

87

"""

88

Find mate of a paired read.

89

90

Returns:

91

AlignedSegment or None

92

"""

93

94

def write(self, read):

95

"""

96

Write an aligned segment.

97

98

Parameters:

99

- read: AlignedSegment, read to write

100

"""

101

102

def close(self):

103

"""Close the file."""

104

105

# Properties

106

@property

107

def header(self) -> "AlignmentHeader":

108

"""File header information."""

109

110

@property

111

def references(self) -> tuple:

112

"""Reference sequence names."""

113

114

@property

115

def lengths(self) -> tuple:

116

"""Reference sequence lengths."""

117

118

@property

119

def nreferences(self) -> int:

120

"""Number of reference sequences."""

121

122

@property

123

def mapped(self) -> int:

124

"""Number of mapped reads."""

125

126

@property

127

def unmapped(self) -> int:

128

"""Number of unmapped reads."""

129

130

@property

131

def nocoordinate(self) -> int:

132

"""Number of reads without coordinates."""

133

134

@property

135

def filename(self) -> str:

136

"""Filename of the file."""

137

138

@property

139

def is_valid_tid(self) -> bool:

140

"""Check if template ID is valid."""

141

142

def get_index_statistics(self):

143

"""

144

Get index statistics for each reference.

145

146

Returns:

147

List of IndexStats namedtuples

148

"""

149

150

def count_coverage(self, contig=None, start=None, stop=None, region=None, reference=None, end=None, quality_threshold=0, read_callback=None):

151

"""

152

Count coverage per position across A, C, G, T bases.

153

154

Parameters:

155

- contig: str, chromosome/contig name

156

- start: int, 0-based start position

157

- stop: int, 0-based stop position

158

- region: str, region string (chr:start-stop)

159

- quality_threshold: int, minimum base quality

160

- read_callback: callable, filter reads

161

162

Returns:

163

Tuple of 4 arrays (A, C, G, T coverage per position)

164

"""

165

166

def has_index(self):

167

"""

168

Check if file has an index.

169

170

Returns:

171

bool, True if index exists

172

"""

173

174

def check_index(self):

175

"""

176

Check if existing index is valid.

177

178

Returns:

179

bool, True if index is valid

180

"""

181

182

def find_introns(self, read_iterator):

183

"""

184

Find introns from read iterator (optimized version).

185

186

Parameters:

187

- read_iterator: iterator, reads to analyze

188

189

Returns:

190

dict, mapping (start, end) tuples to counts

191

"""

192

193

def find_introns_slow(self, read_iterator):

194

"""

195

Find introns from read iterator (slower but more compatible).

196

197

Parameters:

198

- read_iterator: iterator, reads to analyze

199

200

Returns:

201

dict, mapping (start, end) tuples to counts

202

"""

203

204

@property

205

def reference_filename(self):

206

"""Reference FASTA filename if specified."""

207

```

208

209

### AlignmentHeader

210

211

Header information for alignment files with methods for accessing and manipulating metadata.

212

213

```python { .api }

214

class AlignmentHeader:

215

def __init__(self):

216

"""Create empty alignment header."""

217

218

@classmethod

219

def from_text(cls, text):

220

"""

221

Create header from SAM header text.

222

223

Parameters:

224

- text: str, SAM header text

225

226

Returns:

227

AlignmentHeader object

228

"""

229

230

@classmethod

231

def from_dict(cls, header_dict):

232

"""

233

Create header from dictionary.

234

235

Parameters:

236

- header_dict: dict, header data

237

238

Returns:

239

AlignmentHeader object

240

"""

241

242

@classmethod

243

def from_references(cls, reference_names, reference_lengths, text=None, add_sq_text=False):

244

"""

245

Create header from reference sequences.

246

247

Parameters:

248

- reference_names: list, sequence names

249

- reference_lengths: list, sequence lengths

250

- text: str, additional header text

251

- add_sq_text: bool, add @SQ lines

252

253

Returns:

254

AlignmentHeader object

255

"""

256

257

def copy(self):

258

"""

259

Create a copy of the header.

260

261

Returns:

262

AlignmentHeader object

263

"""

264

265

def to_dict(self):

266

"""

267

Convert header to dictionary.

268

269

Returns:

270

dict, header data

271

"""

272

273

def get_reference_name(self, tid):

274

"""

275

Get reference name by template ID.

276

277

Parameters:

278

- tid: int, template ID

279

280

Returns:

281

str, reference name or None

282

"""

283

284

def get_reference_length(self, reference):

285

"""

286

Get reference sequence length.

287

288

Parameters:

289

- reference: str, reference name

290

291

Returns:

292

int, sequence length

293

"""

294

295

def get_tid(self, reference):

296

"""

297

Get template ID for reference.

298

299

Parameters:

300

- reference: str, reference name

301

302

Returns:

303

int, template ID

304

"""

305

306

def is_valid_tid(self, tid):

307

"""

308

Check if template ID is valid.

309

310

Parameters:

311

- tid: int, template ID

312

313

Returns:

314

bool, True if valid

315

"""

316

317

@property

318

def nreferences(self):

319

"""Number of reference sequences."""

320

321

@property

322

def references(self):

323

"""Tuple of reference sequence names."""

324

325

@property

326

def lengths(self):

327

"""Tuple of reference sequence lengths."""

328

```

329

330

### IndexStats

331

332

Named tuple containing index statistics for a reference sequence.

333

334

```python { .api }

335

class IndexStats(NamedTuple):

336

contig: str # Reference sequence name

337

mapped: int # Number of mapped reads

338

unmapped: int # Number of unmapped reads

339

total: int # Total number of reads

340

```

341

342

### AlignedSegment

343

344

Individual aligned read or segment with all alignment information and optional tags.

345

346

```python { .api }

347

class AlignedSegment:

348

def __init__(self):

349

"""Create a new aligned segment."""

350

351

# Core properties

352

@property

353

def query_name(self) -> str:

354

"""Read name/identifier."""

355

356

@query_name.setter

357

def query_name(self, value: str):

358

"""Set read name."""

359

360

@property

361

def query_sequence(self) -> str:

362

"""Read sequence."""

363

364

@query_sequence.setter

365

def query_sequence(self, value: str):

366

"""Set read sequence."""

367

368

@property

369

def query_qualities(self) -> list:

370

"""Base quality scores as list of integers."""

371

372

@query_qualities.setter

373

def query_qualities(self, value: list):

374

"""Set base quality scores."""

375

376

@property

377

def flag(self) -> int:

378

"""SAM flag as integer."""

379

380

@flag.setter

381

def flag(self, value: int):

382

"""Set SAM flag."""

383

384

@property

385

def reference_id(self) -> int:

386

"""Reference sequence ID (-1 if unmapped)."""

387

388

@reference_id.setter

389

def reference_id(self, value: int):

390

"""Set reference sequence ID."""

391

392

@property

393

def reference_start(self) -> int:

394

"""0-based start position on reference."""

395

396

@reference_start.setter

397

def reference_start(self, value: int):

398

"""Set reference start position."""

399

400

@property

401

def reference_end(self) -> int:

402

"""0-based end position on reference."""

403

404

@property

405

def next_reference_id(self) -> int:

406

"""Reference ID of mate."""

407

408

@next_reference_id.setter

409

def next_reference_id(self, value: int):

410

"""Set mate reference ID."""

411

412

@property

413

def next_reference_start(self) -> int:

414

"""Start position of mate."""

415

416

@next_reference_start.setter

417

def next_reference_start(self, value: int):

418

"""Set mate start position."""

419

420

@property

421

def template_length(self) -> int:

422

"""Template length (insert size)."""

423

424

@template_length.setter

425

def template_length(self, value: int):

426

"""Set template length."""

427

428

@property

429

def mapping_quality(self) -> int:

430

"""Mapping quality score."""

431

432

@mapping_quality.setter

433

def mapping_quality(self, value: int):

434

"""Set mapping quality."""

435

436

@property

437

def cigar(self) -> tuple:

438

"""CIGAR alignment as tuple of (operation, length) pairs."""

439

440

@cigar.setter

441

def cigar(self, value: tuple):

442

"""Set CIGAR alignment."""

443

444

@property

445

def cigarstring(self) -> str:

446

"""CIGAR as string representation."""

447

448

@cigarstring.setter

449

def cigarstring(self, value: str):

450

"""Set CIGAR from string."""

451

452

# Flag properties

453

@property

454

def is_paired(self) -> bool:

455

"""True if read is paired."""

456

457

@property

458

def is_proper_pair(self) -> bool:

459

"""True if read is in proper pair."""

460

461

@property

462

def is_unmapped(self) -> bool:

463

"""True if read is unmapped."""

464

465

@property

466

def mate_is_unmapped(self) -> bool:

467

"""True if mate is unmapped."""

468

469

@property

470

def is_reverse(self) -> bool:

471

"""True if read is reverse complemented."""

472

473

@property

474

def mate_is_reverse(self) -> bool:

475

"""True if mate is reverse complemented."""

476

477

@property

478

def is_read1(self) -> bool:

479

"""True if first read in pair."""

480

481

@property

482

def is_read2(self) -> bool:

483

"""True if second read in pair."""

484

485

@property

486

def is_secondary(self) -> bool:

487

"""True if secondary alignment."""

488

489

@property

490

def is_qcfail(self) -> bool:

491

"""True if read fails quality control."""

492

493

@property

494

def is_duplicate(self) -> bool:

495

"""True if read is duplicate."""

496

497

@property

498

def is_supplementary(self) -> bool:

499

"""True if supplementary alignment."""

500

501

# Methods

502

def get_tag(self, tag, with_value_type=False):

503

"""

504

Get optional alignment tag.

505

506

Parameters:

507

- tag: str, two-character tag name

508

- with_value_type: bool, return (value, type) tuple

509

510

Returns:

511

Tag value or (value, type) tuple

512

"""

513

514

def set_tag(self, tag, value, value_type=None, replace=True):

515

"""

516

Set optional alignment tag.

517

518

Parameters:

519

- tag: str, two-character tag name

520

- value: tag value

521

- value_type: str, value type ('A', 'i', 'f', 'Z', 'H', 'B')

522

- replace: bool, replace existing tag

523

"""

524

525

def has_tag(self, tag):

526

"""

527

Check if tag exists.

528

529

Returns:

530

bool, True if tag exists

531

"""

532

533

def get_tags(self, with_value_type=False):

534

"""

535

Get all tags.

536

537

Returns:

538

List of (tag, value) or (tag, value, type) tuples

539

"""

540

541

def set_tags(self, tags):

542

"""

543

Set multiple tags.

544

545

Parameters:

546

- tags: list of (tag, value) or (tag, value, type) tuples

547

"""

548

549

def get_aligned_pairs(self, matches_only=False, with_seq=False):

550

"""

551

Get alignment between query and reference.

552

553

Parameters:

554

- matches_only: bool, only return matching positions

555

- with_seq: bool, include sequence characters

556

557

Returns:

558

List of (query_pos, ref_pos) or (query_pos, ref_pos, ref_base) tuples

559

"""

560

561

def get_reference_positions(self, full_length=False):

562

"""

563

Get reference positions corresponding to query.

564

565

Returns:

566

List of reference positions

567

"""

568

569

def to_string(self):

570

"""

571

Convert to SAM format string.

572

573

Returns:

574

str, SAM format representation

575

"""

576

577

def compare(self, other):

578

"""

579

Compare two aligned segments.

580

581

Returns:

582

int, comparison result

583

"""

584

```

585

586

### AlignmentHeader

587

588

Header information for alignment files including sequence dictionary and metadata.

589

590

```python { .api }

591

class AlignmentHeader:

592

def __init__(self, dictionary=None):

593

"""

594

Create alignment header.

595

596

Parameters:

597

- dictionary: dict, header information

598

"""

599

600

@property

601

def references(self) -> tuple:

602

"""Reference sequence names."""

603

604

@property

605

def lengths(self) -> tuple:

606

"""Reference sequence lengths."""

607

608

def to_dict(self):

609

"""

610

Convert header to dictionary.

611

612

Returns:

613

dict, header information

614

"""

615

616

def as_dict(self):

617

"""

618

Get header as dictionary.

619

620

Returns:

621

dict, header information

622

"""

623

624

def copy(self):

625

"""

626

Create copy of header.

627

628

Returns:

629

AlignmentHeader, header copy

630

"""

631

```

632

633

### PileupColumn

634

635

Column of reads covering a genomic position for variant calling and depth analysis.

636

637

```python { .api }

638

class PileupColumn:

639

@property

640

def reference_id(self) -> int:

641

"""Reference sequence ID."""

642

643

@property

644

def reference_pos(self) -> int:

645

"""0-based reference position."""

646

647

@property

648

def nsegments(self) -> int:

649

"""Number of reads covering position."""

650

651

@property

652

def pileups(self) -> list:

653

"""List of PileupRead objects."""

654

655

def get_query_sequences(self, mark_matches=False, mark_ends=False, add_indels=False):

656

"""

657

Get query sequences at position.

658

659

Parameters:

660

- mark_matches: bool, mark matches with '.'

661

- mark_ends: bool, mark read ends with '$'

662

- add_indels: bool, include indel information

663

664

Returns:

665

List of sequence strings

666

"""

667

668

def get_query_qualities(self):

669

"""

670

Get base qualities at position.

671

672

Returns:

673

List of quality scores

674

"""

675

676

def get_query_names(self):

677

"""

678

Get read names at position.

679

680

Returns:

681

List of read names

682

"""

683

```

684

685

### PileupRead

686

687

Individual read within a pileup column with position-specific information.

688

689

```python { .api }

690

class PileupRead:

691

@property

692

def alignment(self) -> "AlignedSegment":

693

"""Underlying aligned segment."""

694

695

@property

696

def query_position(self) -> int:

697

"""Position in read sequence (None for indels)."""

698

699

@property

700

def query_position_or_next(self) -> int:

701

"""Query position or next position for indels."""

702

703

@property

704

def is_del(self) -> bool:

705

"""True if position is deletion."""

706

707

@property

708

def is_head(self) -> bool:

709

"""True if start of read."""

710

711

@property

712

def is_tail(self) -> bool:

713

"""True if end of read."""

714

715

@property

716

def is_refskip(self) -> bool:

717

"""True if reference skip (N in CIGAR)."""

718

719

@property

720

def level(self) -> int:

721

"""Level in pileup display."""

722

723

@property

724

def indel(self) -> int:

725

"""Length of indel (+ insertion, - deletion)."""

726

```

727

728

### IndexStats

729

730

Named tuple containing index statistics for reference sequences.

731

732

```python { .api }

733

IndexStats = namedtuple("IndexStats", ["contig", "mapped", "unmapped", "total"])

734

```

735

736

### IndexedReads

737

738

Index alignment file by read name for fast lookup by query name.

739

740

```python { .api }

741

class IndexedReads:

742

def __init__(self, samfile, multiple_iterators=False):

743

"""

744

Create read name index.

745

746

Parameters:

747

- samfile: AlignmentFile, alignment file to index

748

- multiple_iterators: bool, allow concurrent access

749

"""

750

751

def find(self, query_name):

752

"""

753

Find reads by name.

754

755

Parameters:

756

- query_name: str, read name to find

757

758

Returns:

759

Iterator of AlignedSegment objects

760

"""

761

```

762

763

## Usage Examples

764

765

### Basic File Reading

766

767

```python

768

import pysam

769

770

# Read SAM/BAM file

771

with pysam.AlignmentFile("input.bam", "rb") as bamfile:

772

# Iterate over all reads

773

for read in bamfile:

774

print(f"{read.query_name}: {read.reference_start}-{read.reference_end}")

775

776

# Fetch reads in region

777

for read in bamfile.fetch("chr1", 1000, 2000):

778

if not read.is_unmapped:

779

print(f"Mapped read: {read.query_sequence}")

780

781

# Read with index

782

with pysam.AlignmentFile("input.bam", "rb", index_filename="input.bam.bai") as bamfile:

783

count = bamfile.count("chr1", 1000, 2000)

784

print(f"Reads in region: {count}")

785

```

786

787

### Pileup Analysis

788

789

```python

790

import pysam

791

792

with pysam.AlignmentFile("input.bam", "rb") as bamfile:

793

for pileupcolumn in bamfile.pileup("chr1", 1000, 2000):

794

print(f"Position {pileupcolumn.reference_pos}: {pileupcolumn.nsegments} reads")

795

796

# Get base calls at position

797

bases = pileupcolumn.get_query_sequences()

798

qualities = pileupcolumn.get_query_qualities()

799

800

for base, qual in zip(bases, qualities):

801

if qual >= 20: # Filter by quality

802

print(f" Base: {base}, Quality: {qual}")

803

```

804

805

### Writing Files

806

807

```python

808

import pysam

809

810

# Create new BAM file

811

header = {"HD": {"VN": "1.0", "SO": "coordinate"},

812

"SQ": [{"LN": 1575, "SN": "chr1"},

813

{"LN": 1584, "SN": "chr2"}]}

814

815

with pysam.AlignmentFile("output.bam", "wb", header=header) as outfile:

816

# Create aligned segment

817

read = pysam.AlignedSegment()

818

read.query_name = "read1"

819

read.query_sequence = "ACGTACGTACGT"

820

read.flag = 0

821

read.reference_id = 0 # chr1

822

read.reference_start = 100

823

read.mapping_quality = 60

824

read.cigar = [(0, 12)] # 12M

825

826

outfile.write(read)

827

```