CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-pysam

Package for reading, manipulating, and writing genomic data

Pending
Overview
Eval results
Files

command-tools.mddocs/

Command-Line Tools Integration

Direct access to samtools and bcftools command-line functionality from Python. All subcommands are available as Python functions with the same arguments and behavior as the command-line tools.

Capabilities

Samtools Functions

Complete samtools toolkit exposed as Python functions for alignment file processing.

def view(*args, **kwargs):
    """
    View and convert alignment files.
    
    Parameters:
    *args: command-line arguments as strings
    **kwargs: keyword arguments (catch_stdout, save_stdout, etc.)
    
    Returns:
    int, exit code (0 for success)
    """

def sort(*args, **kwargs):
    """
    Sort alignment files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def index(*args, **kwargs):
    """
    Index alignment files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def merge(*args, **kwargs):
    """
    Merge multiple alignment files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def cat(*args, **kwargs):
    """
    Concatenate alignment files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def stats(*args, **kwargs):
    """
    Generate alignment statistics.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def flagstat(*args, **kwargs):
    """
    Generate flag statistics.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def idxstats(*args, **kwargs):
    """
    Generate index statistics.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def depth(*args, **kwargs):
    """
    Calculate per-position depth.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def coverage(*args, **kwargs):
    """
    Calculate coverage statistics.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def mpileup(*args, **kwargs):
    """
    Generate pileup format.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def fixmate(*args, **kwargs):
    """
    Fix mate pair information.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def markdup(*args, **kwargs):
    """
    Mark or remove duplicates.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def rmdup(*args, **kwargs):
    """
    Remove duplicates (legacy).
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def quickcheck(*args, **kwargs):
    """
    Quick integrity check.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def calmd(*args, **kwargs):
    """
    Calculate MD tag.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def fasta(*args, **kwargs):
    """
    Convert to FASTA format.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def fastq(*args, **kwargs):
    """
    Convert to FASTQ format.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def bam2fq(*args, **kwargs):
    """
    Convert BAM to FASTQ.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def split(*args, **kwargs):
    """
    Split alignment files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def collate(*args, **kwargs):
    """
    Collate alignment files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def faidx(*args, **kwargs):
    """
    Index FASTA file.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def dict(*args, **kwargs):
    """
    Create sequence dictionary.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def bedcov(*args, **kwargs):
    """
    Calculate BED coverage.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def phase(*args, **kwargs):
    """
    Phase variants.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def consensus(*args, **kwargs):
    """
    Generate consensus sequence.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def reheader(*args, **kwargs):
    """
    Replace header.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def depad(*args, **kwargs):
    """
    Remove padding from alignment.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def pad2unpad(*args, **kwargs):
    """
    Convert padded to unpadded alignment.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def addreplacerg(*args, **kwargs):
    """
    Add or replace read groups.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def ampliconstats(*args, **kwargs):
    """
    Generate amplicon statistics.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def ampliconclip(*args, **kwargs):
    """
    Clip amplicon primers.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def tview(*args, **kwargs):
    """
    Text alignment viewer.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def flags(*args, **kwargs):
    """
    Explain SAM flags.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def version(*args, **kwargs):
    """
    Show samtools version.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def bamshuf(*args, **kwargs):
    """
    Shuffle and group alignments by name.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def cram_size(*args, **kwargs):
    """
    List the contents of a CRAM file.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def fqidx(*args, **kwargs):
    """
    Index/extract FASTQ.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def fqimport(*args, **kwargs):
    """
    Import FASTQ to SAM.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def reference(*args, **kwargs):
    """
    Generate reference sequence from MD tags.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def reset(*args, **kwargs):
    """
    Remove auxiliary fields.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def samples(*args, **kwargs):
    """
    List the samples in a set of SAM/BAM/CRAM files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def targetcut(*args, **kwargs):
    """
    Cut out regions with excessive mismatches.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

BCFtools Functions

Complete bcftools toolkit exposed as Python functions for variant file processing.

def call(*args, **kwargs):
    """
    SNP/indel calling.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def filter(*args, **kwargs):
    """
    Filter VCF/BCF records.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def norm(*args, **kwargs):
    """
    Normalize variants.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def annotate(*args, **kwargs):
    """
    Annotate variants.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def query(*args, **kwargs):
    """
    Extract/query VCF/BCF data.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def convert(*args, **kwargs):
    """
    Convert between formats.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def concat(*args, **kwargs):
    """
    Concatenate VCF files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def isec(*args, **kwargs):
    """
    Create intersections of VCF files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def gtcheck(*args, **kwargs):
    """
    Check sample identity.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def roh(*args, **kwargs):
    """
    Runs of homozygosity.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def plugin(*args, **kwargs):
    """
    Run bcftools plugin.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def cnv(*args, **kwargs):
    """
    Copy number variation analysis.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def csq(*args, **kwargs):
    """
    Consequence analysis.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def head(*args, **kwargs):
    """
    View header.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def consensus(*args, **kwargs):
    """
    Create consensus sequence by applying VCF variants.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def index(*args, **kwargs):
    """
    Index VCF/BCF files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def merge(*args, **kwargs):
    """
    Merge VCF files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def mpileup(*args, **kwargs):
    """
    Multi-way pileup producing genotype likelihoods.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def reheader(*args, **kwargs):
    """
    Modify header of VCF/BCF files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def sort(*args, **kwargs):
    """
    Sort VCF/BCF files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def stats(*args, **kwargs):
    """
    Produce VCF/BCF stats.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

def view(*args, **kwargs):
    """
    View, subset and filter VCF or BCF files.
    
    Parameters:
    *args: command-line arguments as strings
    
    Returns:
    int, exit code
    """

Error Handling

class SamtoolsError(Exception):
    """
    Exception raised when samtools/bcftools commands fail.
    
    Attributes:
    - cmd: str, command that failed
    - returncode: int, exit code
    - stderr: str, error output
    """
    
    def __init__(self, cmd, returncode, stderr=None):
        """
        Initialize SamtoolsError.
        
        Parameters:
        - cmd: str, command that failed
        - returncode: int, exit code  
        - stderr: str, error output
        """

Usage Examples

Basic Command Execution

import pysam

# Sort BAM file
pysam.sort("-o", "sorted.bam", "input.bam")

# Index sorted BAM
pysam.index("sorted.bam")

# Generate statistics
pysam.stats("sorted.bam")

# View specific region
pysam.view("-b", "-o", "region.bam", "sorted.bam", "chr1:1000-2000")

# Convert SAM to BAM
pysam.view("-b", "-o", "output.bam", "input.sam")

Capturing Output

import pysam

# Capture command output
result = pysam.flagstat("input.bam", catch_stdout=True)
print("Flagstat output:", result)

# Save output to file
pysam.stats("input.bam", save_stdout="stats.txt")

# Capture both stdout and stderr
try:
    result = pysam.view("-h", "nonexistent.bam", catch_stdout=True)
except pysam.SamtoolsError as e:
    print(f"Command failed: {e.cmd}")
    print(f"Exit code: {e.returncode}")
    print(f"Error: {e.stderr}")

Complex Workflows

import pysam
import os

def process_alignment_workflow(input_sam, output_prefix):
    """Complete alignment processing workflow."""
    
    # Step 1: Convert SAM to BAM
    temp_bam = f"{output_prefix}_temp.bam"
    pysam.view("-b", "-o", temp_bam, input_sam)
    
    # Step 2: Sort BAM
    sorted_bam = f"{output_prefix}_sorted.bam"
    pysam.sort("-o", sorted_bam, temp_bam)
    
    # Step 3: Index sorted BAM
    pysam.index(sorted_bam)
    
    # Step 4: Mark duplicates
    markdup_bam = f"{output_prefix}_markdup.bam"
    pysam.markdup("-o", markdup_bam, sorted_bam)
    
    # Step 5: Index marked BAM
    pysam.index(markdup_bam)
    
    # Step 6: Generate statistics
    stats_file = f"{output_prefix}_stats.txt"
    pysam.stats(markdup_bam, save_stdout=stats_file)
    
    # Step 7: Generate flagstat
    flagstat_file = f"{output_prefix}_flagstat.txt"
    pysam.flagstat(markdup_bam, save_stdout=flagstat_file)
    
    # Cleanup intermediate files
    os.remove(temp_bam)
    
    return {
        'final_bam': markdup_bam,
        'stats_file': stats_file,
        'flagstat_file': flagstat_file
    }

# Usage
results = process_alignment_workflow("input.sam", "processed")
print(f"Final BAM: {results['final_bam']}")

Variant Calling Pipeline

import pysam

def variant_calling_pipeline(bam_files, reference_fasta, output_vcf):
    """Complete variant calling pipeline using bcftools."""
    
    # Step 1: Generate pileup
    pileup_bcf = "pileup.bcf"
    cmd_args = ["-f", reference_fasta, "-o", pileup_bcf, "-O", "b"] + bam_files
    pysam.mpileup(*cmd_args)
    
    # Step 2: Call variants
    calls_vcf = "calls.vcf"
    pysam.call("-mv", "-o", calls_vcf, "-O", "v", pileup_bcf)
    
    # Step 3: Filter variants
    pysam.filter("-i", "QUAL>=20 && DP>=10", "-o", output_vcf, calls_vcf)
    
    # Step 4: Index final VCF
    pysam.tabix_compress(output_vcf, f"{output_vcf}.gz")
    pysam.tabix_index(f"{output_vcf}.gz", preset="vcf")
    
    # Cleanup intermediate files
    os.remove(pileup_bcf)
    os.remove(calls_vcf)
    
    return f"{output_vcf}.gz"

# Usage
bam_files = ["sample1.bam", "sample2.bam", "sample3.bam"]
final_vcf = variant_calling_pipeline(bam_files, "reference.fa", "variants.vcf")
print(f"Final VCF: {final_vcf}")

Batch Processing

import pysam
import glob
import os
from multiprocessing import Pool

def process_single_bam(bam_file):
    """Process single BAM file."""
    base_name = os.path.splitext(bam_file)[0]
    
    try:
        # Sort if not already sorted
        if not bam_file.endswith('_sorted.bam'):
            sorted_bam = f"{base_name}_sorted.bam"
            pysam.sort("-o", sorted_bam, bam_file)
            bam_file = sorted_bam
        
        # Index
        pysam.index(bam_file)
        
        # Generate statistics
        stats_file = f"{base_name}_stats.txt"
        pysam.stats(bam_file, save_stdout=stats_file)
        
        return f"Processed: {bam_file}"
        
    except pysam.SamtoolsError as e:
        return f"Failed: {bam_file} - {e}"

def batch_process_bams(bam_pattern, num_processes=4):
    """Process multiple BAM files in parallel."""
    bam_files = glob.glob(bam_pattern)
    
    with Pool(processes=num_processes) as pool:
        results = pool.map(process_single_bam, bam_files)
    
    for result in results:
        print(result)

# Usage
batch_process_bams("*.bam", num_processes=8)

Quality Control Pipeline

import pysam
import json

def quality_control_analysis(bam_file):
    """Comprehensive quality control analysis."""
    base_name = os.path.splitext(bam_file)[0]
    
    # Collect various statistics
    qc_results = {}
    
    # Basic statistics
    stats_output = pysam.stats(bam_file, catch_stdout=True)
    qc_results['basic_stats'] = stats_output
    
    # Flag statistics
    flagstat_output = pysam.flagstat(bam_file, catch_stdout=True)
    qc_results['flag_stats'] = flagstat_output
    
    # Index statistics (if indexed)
    try:
        idxstats_output = pysam.idxstats(bam_file, catch_stdout=True)
        qc_results['index_stats'] = idxstats_output
    except pysam.SamtoolsError:
        qc_results['index_stats'] = "No index available"
    
    # Quick integrity check
    try:
        pysam.quickcheck(bam_file)
        qc_results['integrity_check'] = "PASS"
    except pysam.SamtoolsError:
        qc_results['integrity_check'] = "FAIL"
    
    # Save results
    qc_file = f"{base_name}_qc.json"
    with open(qc_file, 'w') as f:
        json.dump(qc_results, f, indent=2)
    
    return qc_file

# Usage
qc_report = quality_control_analysis("sample.bam")
print(f"QC report saved to: {qc_report}")

Performance Tips

  • Commands run in separate processes, so there's some overhead
  • For simple operations, using the API classes directly may be faster
  • Use catch_stdout=True to capture output instead of printing to console
  • Commands support all the same arguments as their command-line equivalents
  • Error handling uses SamtoolsError exception for failed commands
  • Multi-threading in commands can improve performance for compression/decompression

Install with Tessl CLI

npx tessl i tessl/pypi-pysam

docs

alignment-files.md

bgzf-files.md

command-tools.md

index.md

sequence-files.md

tabix-files.md

utilities.md

variant-files.md

tile.json