Package for reading, manipulating, and writing genomic data
—
Helper functions, error handling classes, and constants used throughout the pysam library for quality score conversion, string encoding, and genomic data processing.
Functions for controlling HTSlib logging verbosity.
def get_verbosity():
"""
Get current HTSlib verbosity level.
Returns:
int, current verbosity level
"""
def set_verbosity(verbosity):
"""
Set HTSlib verbosity level.
Parameters:
- verbosity: int, verbosity level (0=silent, 1=errors, 2=warnings, 3=info)
Returns:
int, previous verbosity level
"""Functions for converting between different quality score representations.
def qualitystring_to_array(input_str, offset=33):
"""
Convert quality string to array of integers.
Parameters:
- input_str: str, quality string
- offset: int, ASCII offset (33 for Sanger, 64 for Illumina 1.3+)
Returns:
array, quality scores as integers
"""
def array_to_qualitystring(qualities, offset=33):
"""
Convert array of integers to quality string.
Parameters:
- qualities: array, quality scores as integers
- offset: int, ASCII offset (33 for Sanger, 64 for Illumina 1.3+)
Returns:
str, quality string
"""
def qualities_to_qualitystring(qualities, offset=33):
"""
Convert list of quality scores to quality string.
Parameters:
- qualities: list, quality scores
- offset: int, ASCII offset
Returns:
str, quality string
"""Functions for handling string encoding in genomic data.
def get_encoding_error_handler():
"""
Get current string encoding error handler.
Returns:
str, error handler name ('strict', 'ignore', 'replace')
"""
def set_encoding_error_handler(errors):
"""
Set string encoding error handler.
Parameters:
- errors: str, error handler ('strict', 'ignore', 'replace')
"""Exception classes for pysam operations.
class SamtoolsError(Exception):
"""
Exception raised when samtools/bcftools commands fail.
Attributes:
- cmd: str, command that failed
- returncode: int, exit code
- stderr: str, error output
"""
def __init__(self, cmd, returncode, stderr=None):
"""
Initialize SamtoolsError.
Parameters:
- cmd: str, failed command
- returncode: int, exit code
- stderr: str, error output
"""
class PysamDispatcher:
"""
Command-line dispatcher for samtools/bcftools.
Handles execution of external commands and error reporting.
"""
def __init__(self, name):
"""
Initialize dispatcher.
Parameters:
- name: str, tool name ('samtools' or 'bcftools')
"""
def __call__(self, *args, **kwargs):
"""
Execute command.
Parameters:
*args: command arguments
**kwargs: execution options
Returns:
int, exit code
"""
class unquoted_str(str):
"""
String subclass for unquoted VCF header values.
Used to indicate that a header value should not be quoted
when writing VCF files.
"""Constants for CIGAR string operations in sequence alignments.
# CIGAR operation constants
CMATCH: int # Match/mismatch (0)
CINS: int # Insertion (1)
CDEL: int # Deletion (2)
CREF_SKIP: int # Reference skip (3)
CSOFT_CLIP: int # Soft clipping (4)
CHARD_CLIP: int # Hard clipping (5)
CPAD: int # Padding (6)
CEQUAL: int # Sequence match (7)
CDIFF: int # Sequence mismatch (8)
CBACK: int # Back (9)
# CIGAR operations tuple
CIGAR_OPS: tuple # All CIGAR operation constantsConstants for SAM flag bits indicating read properties.
# SAM flag constants
FPAIRED: int # Template having multiple segments (0x1)
FPROPER_PAIR: int # Each segment properly aligned (0x2)
FUNMAP: int # Segment unmapped (0x4)
FMUNMAP: int # Next segment unmapped (0x8)
FREVERSE: int # SEQ reverse complemented (0x10)
FMREVERSE: int # SEQ of next segment reverse complemented (0x20)
FREAD1: int # First segment in template (0x40)
FREAD2: int # Last segment in template (0x80)
FSECONDARY: int # Secondary alignment (0x100)
FQCFAIL: int # Not passing quality controls (0x200)
FDUP: int # PCR or optical duplicate (0x400)
FSUPPLEMENTARY: int # Supplementary alignment (0x800)
# SAM flags tuple
SAM_FLAGS: tuple # All SAM flag constantsAccess to library version information.
__version__: str # Pysam version string
__samtools_version__: str # Underlying samtools version
__htslib_version__: str # Underlying htslib versionFunctions for getting pysam build configuration.
def get_include():
"""
Get list of include directories for pysam headers.
Returns:
list, include directory paths
"""
def get_defines():
"""
Get list of compilation defines.
Returns:
list, compilation defines
"""
def get_libraries():
"""
Get list of pysam libraries for linking.
Returns:
list, library file paths
"""import pysam
# Convert quality string to numeric scores
quality_string = "###$$%&'()*+,-./0123456789"
scores = pysam.qualitystring_to_array(quality_string)
print(f"Quality scores: {scores}")
# Convert back to string
converted_string = pysam.array_to_qualitystring(scores)
print(f"Converted string: {converted_string}")
assert quality_string == converted_string
# Handle different encoding offsets
illumina_quality = "BCDEFGHIJKLMNOP" # Illumina 1.3+ encoding
sanger_scores = pysam.qualitystring_to_array(illumina_quality, offset=64)
sanger_quality = pysam.array_to_qualitystring(sanger_scores, offset=33)
print(f"Illumina to Sanger: {illumina_quality} -> {sanger_quality}")
# Convert list of qualities
quality_list = [20, 25, 30, 35, 40]
quality_str = pysam.qualities_to_qualitystring(quality_list)
print(f"Quality list to string: {quality_list} -> {quality_str}")import pysam
def analyze_cigar(cigar_tuples):
"""Analyze CIGAR operations in an alignment."""
operations = {
pysam.CMATCH: "Match/Mismatch",
pysam.CINS: "Insertion",
pysam.CDEL: "Deletion",
pysam.CREF_SKIP: "Reference Skip",
pysam.CSOFT_CLIP: "Soft Clip",
pysam.CHARD_CLIP: "Hard Clip",
pysam.CPAD: "Padding",
pysam.CEQUAL: "Sequence Match",
pysam.CDIFF: "Sequence Mismatch",
pysam.CBACK: "Back"
}
print("CIGAR analysis:")
for op, length in cigar_tuples:
op_name = operations.get(op, f"Unknown({op})")
print(f" {op_name}: {length} bases")
# Example usage with aligned segment
with pysam.AlignmentFile("example.bam", "rb") as bamfile:
for read in bamfile.fetch():
if read.cigar:
print(f"Read: {read.query_name}")
analyze_cigar(read.cigar)
breakimport pysam
def analyze_sam_flags(flag):
"""Analyze SAM flags for a read."""
flags = {
pysam.FPAIRED: "Paired",
pysam.FPROPER_PAIR: "Proper pair",
pysam.FUNMAP: "Unmapped",
pysam.FMUNMAP: "Mate unmapped",
pysam.FREVERSE: "Reverse strand",
pysam.FMREVERSE: "Mate reverse strand",
pysam.FREAD1: "First in pair",
pysam.FREAD2: "Second in pair",
pysam.FSECONDARY: "Secondary alignment",
pysam.FQCFAIL: "QC fail",
pysam.FDUP: "Duplicate",
pysam.FSUPPLEMENTARY: "Supplementary"
}
print(f"SAM flag {flag} ({flag:b} binary):")
active_flags = []
for flag_bit, description in flags.items():
if flag & flag_bit:
active_flags.append(description)
if active_flags:
for desc in active_flags:
print(f" - {desc}")
else:
print(" - No flags set")
# Example usage
with pysam.AlignmentFile("example.bam", "rb") as bamfile:
for read in bamfile.fetch():
print(f"Read: {read.query_name}")
analyze_sam_flags(read.flag)
# Check specific flags using properties
if read.is_paired:
print(" This is a paired read")
if read.is_proper_pair:
print(" This is a proper pair")
if read.is_reverse:
print(" This read is reverse complemented")
breakimport pysam
def safe_samtools_operation(command_func, *args, **kwargs):
"""Safely execute samtools command with error handling."""
try:
result = command_func(*args, **kwargs)
print(f"Command succeeded with exit code: {result}")
return result
except pysam.SamtoolsError as e:
print(f"Samtools command failed:")
print(f" Command: {e.cmd}")
print(f" Exit code: {e.returncode}")
if e.stderr:
print(f" Error: {e.stderr}")
return None
# Example usage
result = safe_samtools_operation(pysam.sort, "-o", "output.bam", "input.bam")
if result is not None:
print("Sort operation completed successfully")import pysam
# Check current encoding error handler
current_handler = pysam.get_encoding_error_handler()
print(f"Current encoding error handler: {current_handler}")
# Set more permissive error handling for problematic files
pysam.set_encoding_error_handler('ignore')
try:
# Process file that might have encoding issues
with pysam.AlignmentFile("problematic.bam", "rb") as bamfile:
for read in bamfile.fetch():
# Process reads, ignoring encoding errors in strings
pass
finally:
# Restore original handler
pysam.set_encoding_error_handler(current_handler)import pysam
# Display version information
print(f"Pysam version: {pysam.__version__}")
print(f"Samtools version: {pysam.__samtools_version__}")
# Get build configuration
includes = pysam.get_include()
print(f"Include directories: {includes}")
defines = pysam.get_defines()
print(f"Compilation defines: {defines}")
libraries = pysam.get_libraries()
print(f"Libraries: {libraries}")
# Check available CIGAR operations
print(f"Available CIGAR operations: {pysam.CIGAR_OPS}")
# Check available SAM flags
print(f"Available SAM flags: {pysam.SAM_FLAGS}")import pysam
import numpy as np
def filter_by_average_quality(input_bam, output_bam, min_avg_quality=20):
"""Filter reads by average base quality."""
with pysam.AlignmentFile(input_bam, "rb") as infile:
with pysam.AlignmentFile(output_bam, "wb", template=infile) as outfile:
reads_processed = 0
reads_kept = 0
for read in infile:
reads_processed += 1
if read.query_qualities:
# Calculate average quality
avg_quality = np.mean(read.query_qualities)
if avg_quality >= min_avg_quality:
outfile.write(read)
reads_kept += 1
else:
# Keep reads without quality scores
outfile.write(read)
reads_kept += 1
print(f"Processed {reads_processed} reads, kept {reads_kept} reads")
print(f"Filtering rate: {(reads_processed - reads_kept) / reads_processed * 100:.1f}%")
# Usage
filter_by_average_quality("input.bam", "filtered.bam", min_avg_quality=25)Install with Tessl CLI
npx tessl i tessl/pypi-pysam