tessl/pypi-gfftk

Comprehensive Python toolkit for working with genome annotation files in GFF3, GTF, and TBL formats with format conversion and analysis capabilities

Overview

Eval results

Files

Sequence Operations

Name: tessl/pypi-gfftk
Author: tessl

Comprehensive FASTA file parsing and genomic sequence manipulation capabilities, including coordinate-based sequence extraction, translation using multiple genetic codes, reverse complement operations, and efficient sequence access.

Capabilities

FASTA File Handling

Object-oriented and functional interfaces for working with FASTA files and sequence data.

class FASTA:
    """
    FASTA file handler with efficient sequence access.

    Construct it once with the path to a FASTA file, then request the
    sequence of individual contigs by name with ``get_seq``.

    Example:
        fasta = FASTA("genome.fasta")
        chr1_seq = fasta.get_seq("chr1")
    """

    def __init__(self, fasta_file):
        """
        Initialize the handler for one FASTA file.

        Parameters:
        - fasta_file (str): Path to FASTA file
        """

    def get_seq(self, contig):
        """
        Return the sequence stored under the given contig name.

        Parameters:
        - contig (str): Contig/chromosome name

        Returns:
        str: DNA sequence for the contig

        NOTE(review): the behaviour for a contig name that is not present
        in the file is not documented here -- confirm before relying on it.
        """

def fastaparser(handle):
    """
    Parse FASTA records from an already-open file handle.

    This is a generator: it yields one (header, sequence) tuple per record.
    Note that it takes an open handle, not a path; use the path-based
    helpers (e.g. fasta2dict) when you only have a filename.

    Parameters:
    - handle (file-like): Open file handle to FASTA file

    Yields:
    tuple: (header, sequence) pairs

    NOTE(review): whether the yielded header keeps the leading ">" or the
    full description line is not stated here -- confirm before relying on it.
    """

def fasta2dict(fasta, full_header=False):
    """
    Load a FASTA file into a dictionary keyed by sequence header.

    Parameters:
    - fasta (str): Path to FASTA file. The usage examples also pass
      ".gz" and ".bz2" paths and describe them as handled automatically.
    - full_header (bool): If True, use the full header as the key;
      if False (the default), use only the first word of the header.

    Returns:
    dict: {header: sequence} mapping
    """

def fasta2headers(fasta, full_header=False):
    """
    Collect the headers of a FASTA file into a set.

    Parameters:
    - fasta (str): Path to FASTA file
    - full_header (bool): If True, keep the full header; if False (the
      default), keep only the first word of each header.

    Returns:
    set: Set of sequence headers
    """

def fasta2lengths(fasta, full_header=False):
    """
    Map each sequence header in a FASTA file to its sequence length.

    Parameters:
    - fasta (str): Path to FASTA file
    - full_header (bool): If True, use the full header as the key;
      if False (the default), use only the first word of the header.

    Returns:
    dict: {header: length} mapping
    """

Sequence Extraction

Extract specific regions from genomic sequences based on (start, end) coordinates and concatenate them into a single sequence.

def getSeqRegions(seqs, header, coordinates, coords=False):
    """
    Extract one or more coordinate regions of a sequence and concatenate them.

    Parameters:
    - seqs (dict): Dictionary of sequences (the usage example passes the
      result of fasta2dict)
    - header (str): Sequence header/contig name
    - coordinates (list): List of (start, end) coordinate tuples; the Types
      section describes these as 1-based
    - coords (bool): Whether to include coordinate information

    Returns:
    str: Extracted sequence regions concatenated

    NOTE(review): the return value when coords=True, and the order in which
    the regions are joined, are not described here -- confirm before relying
    on either.
    """

DNA Translation and Manipulation

Translate DNA sequences to proteins using standard genetic codes and perform sequence manipulations.

def translate(dna, strand, phase, table=1):
    """
    Translate a DNA sequence to protein using the selected genetic code.

    Parameters:
    - dna (str): DNA sequence to translate
    - strand (str): Strand orientation ("+" or "-")
    - phase (int): Reading frame phase (0, 1, or 2)
    - table (int): Genetic code table (1=standard, 11=bacterial);
      defaults to 1

    Returns:
    str: Translated protein sequence

    NOTE(review): it is not stated here whether strand="-" makes this
    function reverse-complement `dna` itself. The usage examples call
    RevComp() first and then pass "-"; if the function also
    reverse-complements, the sequence is flipped twice -- confirm.

    NOTE(review): `table` is documented as an int, while the codon_table
    sample in this document is keyed by the strings "1" and "11" --
    confirm which key type is correct.
    """

def RevComp(s):
    """
    Return the reverse complement of a DNA sequence.

    Parameters:
    - s (str): Input DNA sequence

    Returns:
    str: Reverse complement sequence

    NOTE(review): handling of lowercase or ambiguous bases is not
    documented here -- confirm before relying on it.
    """

Text Formatting

Format sequences and text for output with proper line wrapping.

def softwrap(string, every=80):
    """
    Soft wrap a string to the given line width.

    Parameters:
    - string (str): Input string to wrap
    - every (int): Line width for wrapping; defaults to 80

    Returns:
    str: Wrapped text with newlines
    """

Genetic Code Tables

Access to the genetic code tables used for translation: table 1 (standard) and table 11 (bacterial, archaeal and plant plastid).

# Outer keys identify a genetic code table; each inner dict maps a
# three-letter codon to a single-letter amino acid code.
# NOTE(review): the outer keys are shown as strings here, whereas translate()
# documents its `table` parameter as an int -- confirm the real key type.
codon_table = {
    "1": {
        # Standard genetic code table (only the first four codons are shown)
        "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
        # ... (complete codon to amino acid mapping elided)
    },
    "11": {
        # Bacterial, archaeal and plant plastid genetic code
        # (only the first four codons are shown)
        "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
        # ... (complete codon to amino acid mapping elided)
    }
}

Usage Examples

Basic FASTA Operations

# All names used by this example are imported once, at the top of the script.
from gfftk.fasta import FASTA, fasta2dict, fasta2headers, fasta2lengths

# Object-oriented approach: open the file once, then ask for contigs by name
fasta = FASTA("genome.fasta")
chr1_seq = fasta.get_seq("chr1")

# Functional approach: load everything into a {header: sequence} dict
genome = fasta2dict("genome.fasta")
chr1_seq = genome["chr1"]

# Get sequence information: the set of headers and a {header: length} dict
headers = fasta2headers("genome.fasta")
lengths = fasta2lengths("genome.fasta")

print(f"Number of sequences: {len(headers)}")
print(f"Sequence lengths: {lengths}")

Sequence Extraction

from gfftk.fasta import fasta2dict, getSeqRegions

# Read the genome into a {header: sequence} dictionary
sequences = fasta2dict("genome.fasta")

# Regions to pull out of chr1, given as (start, end) pairs
regions = [(1000, 2000), (3000, 4000), (5000, 6000)]
extracted = getSeqRegions(sequences, "chr1", regions)

print(f"Extracted sequence: {extracted}")

DNA Translation

from gfftk.fasta import RevComp, translate

# Short example sequence used throughout
forward_dna = "ATGAAGTTTGCCTAG"

# Forward strand, standard genetic code
protein_forward = translate(forward_dna, "+", 0, table=1)
print(f"Forward translation: {protein_forward}")

# Reverse strand
# NOTE(review): the sequence is reverse-complemented here AND strand "-" is
# passed to translate(). If translate() reverse-complements "-" input itself,
# this translates the original forward sequence -- confirm which is intended.
reverse_dna = RevComp(forward_dna)
protein_reverse = translate(reverse_dna, "-", 0, table=1)
print(f"Reverse translation: {protein_reverse}")

# Same sequence with the bacterial genetic code (table 11)
protein_bacterial = translate(forward_dna, "+", 0, table=11)
print(f"Bacterial code translation: {protein_bacterial}")

# Try each of the three reading-frame phases in turn
for phase in range(3):
    protein = translate(forward_dna, "+", phase, table=1)
    print(f"Phase {phase}: {protein}")

Sequence Processing Pipeline

# RevComp is imported here with the other names instead of inside the loop.
from gfftk.fasta import FASTA, RevComp, softwrap, translate

# Initialize genome access
genome = FASTA("genome.fasta")

# Define gene coordinates (from GFF3 parsing); CDS segments are
# (start, end) pairs in 1-based coordinates
gene_coords = {
    "gene1": {
        "contig": "chr1",
        "strand": "+",
        "cds": [(1000, 1200), (1500, 1700), (2000, 2300)]
    }
}

# Extract and translate CDS sequences
for gene_id, gene_info in gene_coords.items():
    # Get contig sequence
    contig_seq = genome.get_seq(gene_info["contig"])

    # Extract CDS regions and join them in one pass. `start - 1` converts the
    # 1-based start to a 0-based slice index; `end` is left as-is because a
    # Python slice excludes its end index.
    cds_sequence = "".join(
        contig_seq[start - 1:end] for start, end in gene_info["cds"]
    )

    # Handle reverse strand
    # NOTE(review): the strand is also passed to translate() below. If
    # translate() reverse-complements "-" input itself, this RevComp() call
    # flips the sequence twice -- confirm which of the two is intended.
    if gene_info["strand"] == "-":
        cds_sequence = RevComp(cds_sequence)

    # Translate to protein
    protein = translate(cds_sequence, gene_info["strand"], 0, table=1)

    # Format output as a FASTA record wrapped at 60 characters per line
    wrapped_protein = softwrap(protein, every=60)
    print(f">{gene_id}\n{wrapped_protein}")

Working with Compressed Files

from gfftk.fasta import fasta2dict

# Compressed FASTA paths are passed exactly like uncompressed ones
genome = fasta2dict("genome.fasta.gz")
genome_bz2 = fasta2dict("genome.fasta.bz2")

print(f"Loaded {len(genome)} sequences from compressed file")

Types

# NOTE: the subscripted builtins used below (dict[...], set[...], tuple[...],
# list[...]) require Python 3.9 or newer when evaluated at runtime.

# Sequence dictionary format (the mapping fasta2dict is documented to return)
SequenceDict = dict[str, str]  # {header: sequence}

# Sequence header set (the set fasta2headers is documented to return)
HeaderSet = set[str]

# Sequence length dictionary (the mapping fasta2lengths is documented to return)
LengthDict = dict[str, int]  # {header: length}

# Coordinate tuple format
CoordinateTuple = tuple[int, int]  # (start, end) in 1-based coordinates

# Coordinate list (the `coordinates` argument of getSeqRegions)
CoordinateList = list[CoordinateTuple]

# Strand orientation
Strand = str  # "+" or "-"

# Reading frame phase
Phase = int  # 0, 1, or 2

# Genetic code table identifier
# NOTE(review): declared as int here, but GeneticCodeTables below is keyed by
# str and the codon_table sample uses "1"/"11" -- confirm the real key type.
GeneticCodeTable = int  # 1 (standard) or 11 (bacterial/archaeal/plant plastid)

# DNA sequence
DNASequence = str  # String containing A, T, G, C, N characters

# Protein sequence
ProteinSequence = str  # String containing single-letter amino acid codes

# Codon table structure
CodonTable = dict[str, str]  # {codon: amino_acid}

# Complete genetic code tables, one CodonTable per table identifier
GeneticCodeTables = dict[str, CodonTable]

Install with Tessl CLI