Package for reading, manipulating, and writing genomic data
—
Direct access to samtools and bcftools command-line functionality from Python. All subcommands are available as Python functions with the same arguments and behavior as the command-line tools.
Complete samtools toolkit exposed as Python functions for alignment file processing.
def view(*args, **kwargs):
"""
View and convert alignment files.
Parameters:
*args: command-line arguments as strings
**kwargs: keyword arguments (catch_stdout, save_stdout, etc.)
Returns:
int, exit code (0 for success)
"""
def sort(*args, **kwargs):
"""
Sort alignment files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def index(*args, **kwargs):
"""
Index alignment files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def merge(*args, **kwargs):
"""
Merge multiple alignment files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def cat(*args, **kwargs):
"""
Concatenate alignment files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def stats(*args, **kwargs):
"""
Generate alignment statistics.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def flagstat(*args, **kwargs):
"""
Generate flag statistics.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def idxstats(*args, **kwargs):
"""
Generate index statistics.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def depth(*args, **kwargs):
"""
Calculate per-position depth.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def coverage(*args, **kwargs):
"""
Calculate coverage statistics.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def mpileup(*args, **kwargs):
"""
Generate pileup format.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def fixmate(*args, **kwargs):
"""
Fix mate pair information.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def markdup(*args, **kwargs):
"""
Mark or remove duplicates.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def rmdup(*args, **kwargs):
"""
Remove duplicates (legacy).
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def quickcheck(*args, **kwargs):
"""
Quick integrity check.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def calmd(*args, **kwargs):
"""
Calculate MD tag.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def fasta(*args, **kwargs):
"""
Convert to FASTA format.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def fastq(*args, **kwargs):
"""
Convert to FASTQ format.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def bam2fq(*args, **kwargs):
"""
Convert BAM to FASTQ.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def split(*args, **kwargs):
"""
Split alignment files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def collate(*args, **kwargs):
"""
Collate alignment files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def faidx(*args, **kwargs):
"""
Index FASTA file.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def dict(*args, **kwargs):
"""
Create sequence dictionary.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def bedcov(*args, **kwargs):
"""
Calculate BED coverage.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def phase(*args, **kwargs):
"""
Phase variants.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def consensus(*args, **kwargs):
"""
Generate consensus sequence.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def reheader(*args, **kwargs):
"""
Replace header.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def depad(*args, **kwargs):
"""
Remove padding from alignment.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def pad2unpad(*args, **kwargs):
"""
Convert padded to unpadded alignment.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def addreplacerg(*args, **kwargs):
"""
Add or replace read groups.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def ampliconstats(*args, **kwargs):
"""
Generate amplicon statistics.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def ampliconclip(*args, **kwargs):
"""
Clip amplicon primers.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def tview(*args, **kwargs):
"""
Text alignment viewer.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def flags(*args, **kwargs):
"""
Explain SAM flags.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def version(*args, **kwargs):
"""
Show samtools version.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def bamshuf(*args, **kwargs):
"""
Shuffle and group alignments by name.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def cram_size(*args, **kwargs):
"""
List the contents of a CRAM file.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def fqidx(*args, **kwargs):
"""
Index/extract FASTQ.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def fqimport(*args, **kwargs):
"""
Import FASTQ to SAM.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def reference(*args, **kwargs):
"""
Generate reference sequence from MD tags.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def reset(*args, **kwargs):
"""
Remove auxiliary fields.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def samples(*args, **kwargs):
"""
List the samples in a set of SAM/BAM/CRAM files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def targetcut(*args, **kwargs):
"""
Cut out regions with excessive mismatches.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""Complete bcftools toolkit exposed as Python functions for variant file processing.
def call(*args, **kwargs):
"""
SNP/indel calling.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def filter(*args, **kwargs):
"""
Filter VCF/BCF records.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def norm(*args, **kwargs):
"""
Normalize variants.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def annotate(*args, **kwargs):
"""
Annotate variants.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def query(*args, **kwargs):
"""
Extract/query VCF/BCF data.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def convert(*args, **kwargs):
"""
Convert between formats.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def concat(*args, **kwargs):
"""
Concatenate VCF files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def isec(*args, **kwargs):
"""
Create intersections of VCF files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def gtcheck(*args, **kwargs):
"""
Check sample identity.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def roh(*args, **kwargs):
"""
Runs of homozygosity.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def plugin(*args, **kwargs):
"""
Run bcftools plugin.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def cnv(*args, **kwargs):
"""
Copy number variation analysis.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def csq(*args, **kwargs):
"""
Consequence analysis.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def head(*args, **kwargs):
"""
View header.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def consensus(*args, **kwargs):
"""
Create consensus sequence by applying VCF variants.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def index(*args, **kwargs):
"""
Index VCF/BCF files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def merge(*args, **kwargs):
"""
Merge VCF files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def mpileup(*args, **kwargs):
"""
Multi-way pileup producing genotype likelihoods.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def reheader(*args, **kwargs):
"""
Modify header of VCF/BCF files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def sort(*args, **kwargs):
"""
Sort VCF/BCF files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def stats(*args, **kwargs):
"""
Produce VCF/BCF stats.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""
def view(*args, **kwargs):
"""
View, subset and filter VCF or BCF files.
Parameters:
*args: command-line arguments as strings
Returns:
int, exit code
"""class SamtoolsError(Exception):
"""
Exception raised when samtools/bcftools commands fail.
Attributes:
- cmd: str, command that failed
- returncode: int, exit code
- stderr: str, error output
"""
def __init__(self, cmd, returncode, stderr=None):
"""
Initialize SamtoolsError.
Parameters:
- cmd: str, command that failed
- returncode: int, exit code
- stderr: str, error output
"""import pysam
# Sort BAM file
pysam.sort("-o", "sorted.bam", "input.bam")
# Index sorted BAM
pysam.index("sorted.bam")
# Generate statistics
pysam.stats("sorted.bam")
# View specific region
pysam.view("-b", "-o", "region.bam", "sorted.bam", "chr1:1000-2000")
# Convert SAM to BAM
pysam.view("-b", "-o", "output.bam", "input.sam")import pysam
# Capture command output
result = pysam.flagstat("input.bam", catch_stdout=True)
print("Flagstat output:", result)
# Save output to file
pysam.stats("input.bam", save_stdout="stats.txt")
# Capture both stdout and stderr
try:
result = pysam.view("-h", "nonexistent.bam", catch_stdout=True)
except pysam.SamtoolsError as e:
print(f"Command failed: {e.cmd}")
print(f"Exit code: {e.returncode}")
print(f"Error: {e.stderr}")import pysam
import os
def process_alignment_workflow(input_sam, output_prefix):
"""Complete alignment processing workflow."""
# Step 1: Convert SAM to BAM
temp_bam = f"{output_prefix}_temp.bam"
pysam.view("-b", "-o", temp_bam, input_sam)
# Step 2: Sort BAM
sorted_bam = f"{output_prefix}_sorted.bam"
pysam.sort("-o", sorted_bam, temp_bam)
# Step 3: Index sorted BAM
pysam.index(sorted_bam)
# Step 4: Mark duplicates
markdup_bam = f"{output_prefix}_markdup.bam"
pysam.markdup("-o", markdup_bam, sorted_bam)
# Step 5: Index marked BAM
pysam.index(markdup_bam)
# Step 6: Generate statistics
stats_file = f"{output_prefix}_stats.txt"
pysam.stats(markdup_bam, save_stdout=stats_file)
# Step 7: Generate flagstat
flagstat_file = f"{output_prefix}_flagstat.txt"
pysam.flagstat(markdup_bam, save_stdout=flagstat_file)
# Cleanup intermediate files
os.remove(temp_bam)
return {
'final_bam': markdup_bam,
'stats_file': stats_file,
'flagstat_file': flagstat_file
}
# Usage
results = process_alignment_workflow("input.sam", "processed")
print(f"Final BAM: {results['final_bam']}")import pysam
def variant_calling_pipeline(bam_files, reference_fasta, output_vcf):
"""Complete variant calling pipeline using bcftools."""
# Step 1: Generate pileup
pileup_bcf = "pileup.bcf"
cmd_args = ["-f", reference_fasta, "-o", pileup_bcf, "-O", "b"] + bam_files
pysam.mpileup(*cmd_args)
# Step 2: Call variants
calls_vcf = "calls.vcf"
pysam.call("-mv", "-o", calls_vcf, "-O", "v", pileup_bcf)
# Step 3: Filter variants
pysam.filter("-i", "QUAL>=20 && DP>=10", "-o", output_vcf, calls_vcf)
# Step 4: Index final VCF
pysam.tabix_compress(output_vcf, f"{output_vcf}.gz")
pysam.tabix_index(f"{output_vcf}.gz", preset="vcf")
# Cleanup intermediate files
os.remove(pileup_bcf)
os.remove(calls_vcf)
return f"{output_vcf}.gz"
# Usage
bam_files = ["sample1.bam", "sample2.bam", "sample3.bam"]
final_vcf = variant_calling_pipeline(bam_files, "reference.fa", "variants.vcf")
print(f"Final VCF: {final_vcf}")import pysam
import glob
import os
from multiprocessing import Pool
def process_single_bam(bam_file):
"""Process single BAM file."""
base_name = os.path.splitext(bam_file)[0]
try:
# Sort if not already sorted
if not bam_file.endswith('_sorted.bam'):
sorted_bam = f"{base_name}_sorted.bam"
pysam.sort("-o", sorted_bam, bam_file)
bam_file = sorted_bam
# Index
pysam.index(bam_file)
# Generate statistics
stats_file = f"{base_name}_stats.txt"
pysam.stats(bam_file, save_stdout=stats_file)
return f"Processed: {bam_file}"
except pysam.SamtoolsError as e:
return f"Failed: {bam_file} - {e}"
def batch_process_bams(bam_pattern, num_processes=4):
"""Process multiple BAM files in parallel."""
bam_files = glob.glob(bam_pattern)
with Pool(processes=num_processes) as pool:
results = pool.map(process_single_bam, bam_files)
for result in results:
print(result)
# Usage
batch_process_bams("*.bam", num_processes=8)import pysam
import json
def quality_control_analysis(bam_file):
"""Comprehensive quality control analysis."""
base_name = os.path.splitext(bam_file)[0]
# Collect various statistics
qc_results = {}
# Basic statistics
stats_output = pysam.stats(bam_file, catch_stdout=True)
qc_results['basic_stats'] = stats_output
# Flag statistics
flagstat_output = pysam.flagstat(bam_file, catch_stdout=True)
qc_results['flag_stats'] = flagstat_output
# Index statistics (if indexed)
try:
idxstats_output = pysam.idxstats(bam_file, catch_stdout=True)
qc_results['index_stats'] = idxstats_output
except pysam.SamtoolsError:
qc_results['index_stats'] = "No index available"
# Quick integrity check
try:
pysam.quickcheck(bam_file)
qc_results['integrity_check'] = "PASS"
except pysam.SamtoolsError:
qc_results['integrity_check'] = "FAIL"
# Save results
qc_file = f"{base_name}_qc.json"
with open(qc_file, 'w') as f:
json.dump(qc_results, f, indent=2)
return qc_file
# Usage
qc_report = quality_control_analysis("sample.bam")
print(f"QC report saved to: {qc_report}")catch_stdout=True to capture output instead of printing to consoleSamtoolsError exception for failed commandsInstall with Tessl CLI
npx tessl i tessl/pypi-pysam