A Python package for exploring and analysing genetic variation data.

Scikit-allel provides comprehensive file I/O capabilities for working with standard genetic data formats, including VCF, GFF3, and FASTA files, with support for large datasets and various output formats.
Read VCF (Variant Call Format) files into structured arrays.
import allel
# Read entire VCF file; returns a dict keyed by field name
# (e.g. 'samples', 'variants/POS', 'calldata/GT')
callset = allel.read_vcf('data.vcf')
# Read specific fields
callset = allel.read_vcf(
    'data.vcf',
    fields=['variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT', 'calldata/GT']
)
# Read with region filtering
callset = allel.read_vcf('data.vcf', region='chr1:1000000-2000000')
# Read with sample selection
callset = allel.read_vcf('data.vcf', samples=['sample1', 'sample2', 'sample3'])
Key Functions:
def read_vcf(input, fields=None, exclude_fields=None, rename_fields=None,
             types=None, numbers=None, alt_number=DEFAULT_ALT_NUMBER, fills=None,
             region=None, tabix='tabix', samples=None, transformers=None,
             buffer_size=DEFAULT_BUFFER_SIZE, chunk_length=DEFAULT_CHUNK_LENGTH, log=None):
    """
    Read a VCF file into NumPy arrays.

    Args:
        input (str or file-like): Path to VCF file or file-like object.
        fields (list, optional): Specific fields to read
            (e.g., ['variants/CHROM', 'calldata/GT']).
        exclude_fields (list, optional): Fields to exclude from reading.
        rename_fields (dict, optional): Mapping to rename fields in the output.
        types (dict, optional): Data types for specific fields.
        numbers (dict, optional): Number of values expected per field
            (INFO/FORMAT Number specifications).
        alt_number (int): Number of alternate alleles to allocate space for
            (default: DEFAULT_ALT_NUMBER).
        fills (dict, optional): Fill values for missing data, keyed by field.
        region (str, optional): Genomic region to read (e.g., 'chr1:1000-2000').
        tabix (str): Name/path of the tabix executable used for indexed
            region queries.
        samples (list, optional): Specific samples to include.
        transformers (optional): Custom field transformation functions/objects.
        buffer_size (int): Buffer size (bytes) used while parsing.
        chunk_length (int): Number of variants to parse per internal chunk.
        log (file, optional): File-like destination for progress messages.

    Returns:
        dict: Mapping of field name to numpy array, e.g. 'samples',
        'variants/POS', 'calldata/GT'.
        NOTE(review): some examples in this document unpack a 3-tuple from
        read_vcf; elsewhere (region/sample examples) a single value is
        assigned. The single-dict return is the documented scikit-allel
        behaviour — confirm against the installed version.
    """
def iter_vcf_chunks(input, fields=None, types=None, numbers=None, alt_number=None,
                    chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE, region=None,
                    samples=None, transformers=None, log=None):
    """
    Iterate over a VCF file in chunks for memory-efficient processing.

    Args:
        input (str): Path to VCF file.
        fields (list, optional): Fields to read.
        types (dict, optional): Data types for specific fields.
        numbers (dict, optional): Number of values expected per field.
        alt_number (int, optional): Number of alternate alleles to allocate for.
        chunk_length (int): Variants per chunk.
        buffer_size (int): File buffer size (bytes).
        region (str, optional): Genomic region to restrict reading to.
        samples (list, optional): Sample subset to include.
        transformers (optional): Custom field transformers.
        log (file, optional): File-like destination for progress messages.

    Yields:
        tuple: (variants, samples, calldata) for each chunk.
        NOTE(review): current scikit-allel documents iter_vcf_chunks as
        returning (fields, samples, headers, chunk_iterator) rather than
        yielding 3-tuples directly — confirm against the installed version.
    """
def read_vcf_headers(input):
    """
    Read only the header information from a VCF file.

    Useful for inspecting available INFO/FORMAT fields and sample names
    without parsing variant records.

    Args:
        input (str or file-like): Path to VCF file or file-like object.

    Returns:
        dict: Header information including metadata lines and column names.
    """
Convert VCF files to other formats for analysis or storage.
# Convert to NumPy compressed format
allel.vcf_to_npz('input.vcf', 'output.npz', compression='gzip')
# Convert to HDF5
allel.vcf_to_hdf5('input.vcf', 'output.h5', group='/variants',
compression='gzip', chunks=True)
# Convert to Zarr format
allel.vcf_to_zarr('input.vcf', 'output.zarr', compressor='blosc')
# Convert to pandas DataFrame
df = allel.vcf_to_dataframe('input.vcf', alt_number=1)
# Convert to CSV
allel.vcf_to_csv('input.vcf', 'output.csv', write_header=True){ .api }
def vcf_to_npz(input, output, fields=None, types=None, numbers=None, alt_number=None,
               chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE, region=None,
               samples=None, transformers=None, log=None, compression='gzip'):
    """
    Convert a VCF file to NumPy compressed (.npz) format.

    Args:
        input (str): Input VCF file path.
        output (str): Output NPZ file path.
        fields (list, optional): Fields to include.
        types (dict, optional): Data types for specific fields.
        numbers (dict, optional): Number of values expected per field.
        alt_number (int, optional): Number of alternate alleles to allocate for.
        chunk_length (int): Number of variants parsed per chunk.
        buffer_size (int): Parse buffer size (bytes).
        region (str, optional): Genomic region to read.
        samples (list, optional): Samples to include.
        transformers (optional): Custom field transformers.
        log (file, optional): Destination for progress messages.
        compression (str): Compression method ('gzip', 'bz2', 'lzma').

    Returns:
        None
    """
def vcf_to_hdf5(input, output, group='/', fields=None, types=None, numbers=None,
                alt_number=None, chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE,
                region=None, samples=None, transformers=None, log=None,
                compression='gzip', compression_opts=1, chunks=True, fletcher32=False, shuffle=False):
    """
    Convert a VCF file to HDF5 format.

    Args:
        input (str): Input VCF file path.
        output (str): Output HDF5 file path.
        group (str): HDF5 group to write datasets into (default: root '/').
        fields (list, optional): Fields to include.
        types (dict, optional): Data types for specific fields.
        numbers (dict, optional): Number of values expected per field.
        alt_number (int, optional): Number of alternate alleles to allocate for.
        chunk_length (int): Number of variants parsed per chunk.
        buffer_size (int): Parse buffer size (bytes).
        region (str, optional): Genomic region to read.
        samples (list, optional): Samples to include.
        transformers (optional): Custom field transformers.
        log (file, optional): Destination for progress messages.
        compression (str): HDF5 compression method (e.g. 'gzip').
        compression_opts (int): Compression level passed to HDF5.
        chunks (bool or tuple): HDF5 dataset chunking strategy.
        fletcher32 (bool): Enable Fletcher32 checksum filter.
        shuffle (bool): Enable shuffle filter (can improve compression).

    Returns:
        None
    """
def vcf_to_zarr(input, output, group='', fields=None, types=None, numbers=None,
                alt_number=None, chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE,
                region=None, samples=None, transformers=None, log=None,
                compressor=None, chunks=True):
    """
    Convert a VCF file to Zarr format.

    Args:
        input (str): Input VCF file path.
        output (str): Output Zarr store path.
        group (str): Zarr group name (default: store root).
        fields (list, optional): Fields to include.
        types (dict, optional): Data types for specific fields.
        numbers (dict, optional): Number of values expected per field.
        alt_number (int, optional): Number of alternate alleles to allocate for.
        chunk_length (int): Number of variants parsed per chunk.
        buffer_size (int): Parse buffer size (bytes).
        region (str, optional): Genomic region to read.
        samples (list, optional): Samples to include.
        transformers (optional): Custom field transformers.
        log (file, optional): Destination for progress messages.
        compressor (object, optional): Compression codec object
            (e.g. numcodecs.Blosc()); None selects the default.
        chunks (bool or tuple): Zarr array chunking strategy.

    Returns:
        None
    """
def vcf_to_dataframe(input, fields=None, alt_number=None, chunk_length=1000,
                     buffer_size=DEFAULT_BUFFER_SIZE, region=None, samples=None,
                     transformers=None, log=None):
    """
    Convert a VCF file to a pandas DataFrame (variant-level fields).

    Args:
        input (str): Input VCF file path.
        fields (list, optional): Fields to include.
        alt_number (int, optional): Number of alternate alleles to allocate for
            (multi-valued fields are expanded into one column per value).
        chunk_length (int): Number of variants parsed per chunk.
        buffer_size (int): Parse buffer size (bytes).
        region (str, optional): Genomic region to read.
        samples (list, optional): Samples to include.
        transformers (optional): Custom field transformers.
        log (file, optional): Destination for progress messages.

    Returns:
        pandas.DataFrame: VCF data as a DataFrame, one row per variant.
    """
Write genetic data back to VCF format.
# Write arrays to VCF file
allel.write_vcf('output.vcf', variants, calldata, samples=sample_names,
meta={'fileformat': 'VCFv4.2'})
# Customize field descriptions
descriptions = {
'DP': 'Total read depth',
'GQ': 'Genotype quality'
}
allel.write_vcf('output.vcf', variants, calldata, description=descriptions){ .api }
def write_vcf(path, variants, calldata, samples=None, meta=None, rename=None,
              number=None, description=None, fill=None, write_header=True):
    """
    Write variant data to a VCF-format file.

    Args:
        path (str): Output VCF file path.
        variants (dict): Variant information arrays (per-variant fields).
        calldata (dict): Call data arrays (per-sample information, e.g. 'GT').
        samples (list, optional): Sample names for the header columns.
        meta (dict, optional): Metadata for the VCF header
            (e.g. {'fileformat': 'VCFv4.2'}).
        rename (dict, optional): Field name mappings applied on output.
        number (dict, optional): Number specifications for INFO/FORMAT fields.
        description (dict, optional): Field descriptions for header lines.
        fill (dict, optional): Fill values treated as missing data on output.
        write_header (bool): Whether to write the VCF header block.

    Returns:
        None
    """
Parse GFF3 (General Feature Format) files for genomic annotations.
# Read GFF3 file into record array
features = allel.gff3_to_recarray('annotations.gff3')
# Read specific genomic region
features = allel.gff3_to_recarray('annotations.gff3', region='chr1:1000000-2000000')
# Convert to pandas DataFrame for easier manipulation
df = allel.gff3_to_dataframe('annotations.gff3')
# Iterate over GFF records for memory efficiency
for record in allel.iter_gff3('annotations.gff3'):
print(record['type'], record['start'], record['end']){ .api }
def gff3_to_recarray(path, attributes=None, region=None, score_fill=-1):
    """
    Read a GFF3 file into a structured record array.

    Args:
        path (str): Path to GFF3 file.
        attributes (list, optional): Attribute names to parse out of the
            GFF3 attributes (column 9) into their own fields.
        region (str, optional): Genomic region to read
            (e.g., 'chr1:1000-2000').
        score_fill (float, optional): Fill value used where the score
            column is missing ('.').

    Returns:
        numpy.recarray: Structured array with the standard GFF3 fields
        plus any requested attributes.
    """
def gff3_to_dataframe(input, region=None):
    """
    Read a GFF3 file into a pandas DataFrame.

    Args:
        input (str): Path to GFF3 file.
        region (str, optional): Genomic region to read.

    Returns:
        pandas.DataFrame: GFF3 data, one row per feature record.
    """
def iter_gff3(path, attributes=None, region=None, score_fill=-1):
    """
    Iterate over GFF3 records from a file, one record at a time.

    Memory-efficient alternative to gff3_to_recarray for large
    annotation files.

    Args:
        path (str): Path to GFF3 file.
        attributes (list, optional): Attribute names to parse from the
            attributes (column 9) field.
        region (str, optional): Genomic region filter.
        score_fill (float, optional): Fill value for missing scores ('.').

    Yields:
        dict: GFF3 record as a dictionary with the standard fields.
    """
Write FASTA format sequence files, e.g. for reference genomes and sequence analysis.
# Write sequences to FASTA file
sequences = {'chr1': 'ATCGATCG', 'chr2': 'GCTAGCTA'}
allel.write_fasta('output.fasta', sequences, ['chr1', 'chr2'])
def write_fasta(path, sequences, names, mode='w', width=80):
    """
    Write sequences to a FASTA-format file.

    Args:
        path (str): Output FASTA file path.
        sequences (dict or list): Sequences to write; when a dict,
            presumably keyed by sequence name — confirm against caller.
        names (list): Sequence names/identifiers, in output order.
        mode (str): File open mode ('w' to overwrite, 'a' to append).
        width (int): Line width used to wrap sequence text.

    Returns:
        None
    """
Utilities for working with different file formats and sources.
# Handle compressed files automatically
data = allel.read_vcf('data.vcf.gz') # Automatically handles gzip
data = allel.read_vcf('data.vcf.bz2') # Automatically handles bzip2
# Read from URLs
data = allel.read_vcf('https://example.com/data.vcf.gz')
# Custom buffer sizes for large files
data = allel.read_vcf('large_file.vcf', buffer_size=1048576) # 1MB buffer{ .api }
# Example 1: Basic VCF to analysis workflow
import allel
import numpy as np
# Read VCF file (returns a dict keyed by field name)
callset = allel.read_vcf(
    'my_data.vcf',
    fields=['variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT', 'calldata/GT']
)
# Create genotype array
g = allel.GenotypeArray(callset['calldata/GT'])
# Basic quality control
ac = g.count_alleles()
segregating = ac.is_segregating()
biallelic = ac.allelism() == 2
# Filter to high-quality, biallelic, segregating variants
mask = segregating & biallelic
g_filtered = g[mask]
variants_filtered = {k: v[mask] for k, v in callset.items() if k.startswith('variants/')}
# Example 2: Large file processing with chunking
def process_large_vcf(filename):
    """Compute per-variant mean pairwise difference over a VCF, chunk by chunk.

    Args:
        filename (str): Path to the VCF file.

    Returns:
        numpy.ndarray: Diversity values accumulated across all chunks.
    """
    diversity_values = []
    chunk_iter = allel.iter_vcf_chunks(filename, chunk_length=10000)
    for chunk_variants, chunk_samples, chunk_calldata in chunk_iter:
        genotype_array = allel.GenotypeArray(chunk_calldata['GT'])
        allele_counts = genotype_array.count_alleles()
        diversity_values.extend(allel.mean_pairwise_difference(allele_counts))
    return np.array(diversity_values)
# Example 3: Multi-format workflow
# Read VCF and annotations
callset = allel.read_vcf('variants.vcf')
annotations = allel.gff3_to_dataframe('genes.gff3')
# NOTE: scikit-allel provides write_fasta only; use a dedicated FASTA
# reader (e.g. pyfasta/pysam) to load reference sequences.
# Convert to efficient storage format (compressor takes a numcodecs
# codec object; None selects the default)
allel.vcf_to_zarr('variants.vcf', 'variants.zarr')
Handle large datasets efficiently with selective loading and chunking.
# Read only essential fields to save memory
minimal_data = allel.read_vcf(
'large_file.vcf',
fields=['variants/CHROM', 'variants/POS', 'calldata/GT']
)
# Process in chunks for memory efficiency
def analyze_in_chunks(vcf_file, chunk_size=5000):
    """Collect per-chunk allele counts from a VCF without loading it whole.

    Args:
        vcf_file (str): Path to the VCF file.
        chunk_size (int): Number of variants per chunk.

    Returns:
        list: One allele-counts array per chunk, in file order.
    """
    collected = []
    for variants_chunk, samples_chunk, calldata_chunk in allel.iter_vcf_chunks(
            vcf_file, chunk_length=chunk_size):
        genotype_array = allel.GenotypeArray(calldata_chunk['GT'])
        collected.append(genotype_array.count_alleles())
    return collected
# Use region-based loading for targeted analysis
region_data = allel.read_vcf('genome.vcf', region='chr22:20000000-25000000'){ .api }
Handle common file format and data issues gracefully.
try:
# Attempt to read VCF with error handling
data = allel.read_vcf('data.vcf')
except Exception as e:
print(f"Error reading VCF: {e}")
# Try with different parameters
data = allel.read_vcf('data.vcf', alt_number=1)
# Validate data after reading
def validate_genetic_data(variants, genotypes):
    """Validate basic integrity of variant/genotype arrays read from a VCF.

    Args:
        variants (dict): Variant field arrays; must contain 'POS' and 'CHROM'.
        genotypes (dict): Call data arrays; must contain 'GT' with shape
            (n_variants, n_samples, ploidy).

    Returns:
        bool: True if the data passed validation.

    Raises:
        ValueError: If the number of variants does not match the first
            dimension of the genotype array.
    """
    if len(variants['POS']) != genotypes['GT'].shape[0]:
        raise ValueError("Mismatch between variants and genotypes")
    # Accepted human chromosome names. 'chr'-prefixed aliases (hg38-style
    # 'chr1', 'chrX') are normalized before comparison so they are not
    # spuriously flagged as unusual.
    valid_chroms = {str(i) for i in range(1, 23)} | {'X', 'Y', 'MT'}
    observed = {str(c) for c in variants['CHROM']}
    invalid_chroms = {
        c for c in observed
        if (c[3:] if c.lower().startswith('chr') else c) not in valid_chroms
    }
    if invalid_chroms:
        print(f"Warning: Unusual chromosome names found: {invalid_chroms}")
    return True
Install with Tessl CLI
npx tessl i tessl/pypi-scikit-allel