Comprehensive Python toolkit for working with genome annotation files in GFF3, GTF, and TBL formats, with format conversion and analysis capabilities.
Comprehensive FASTA file parsing and genomic sequence manipulation capabilities, including coordinate-based sequence extraction, translation using multiple genetic codes, reverse complement operations, and efficient sequence access.
Object-oriented and functional interfaces for working with FASTA files and sequence data.
class FASTA:
"""FASTA file handler with efficient sequence access."""
def __init__(self, fasta_file):
"""
Initialize FASTA handler.
Parameters:
- fasta_file (str): Path to FASTA file
"""
def get_seq(self, contig):
"""
Get sequence for specified contig.
Parameters:
- contig (str): Contig/chromosome name
Returns:
str: DNA sequence for the contig
"""
def fastaparser(handle):
"""
Parse FASTA file as generator yielding (header, sequence) tuples.
Parameters:
- handle (file-like): Open file handle to FASTA file
Yields:
tuple: (header, sequence) pairs
"""
def fasta2dict(fasta, full_header=False):
"""
Convert FASTA file to dictionary.
Parameters:
- fasta (str): Path to FASTA file
- full_header (bool): Use full header as key vs first word only
Returns:
dict: {header: sequence} mapping
"""
def fasta2headers(fasta, full_header=False):
"""
Get FASTA headers as set.
Parameters:
- fasta (str): Path to FASTA file
- full_header (bool): Use full header vs first word only
Returns:
set: Set of sequence headers
"""
def fasta2lengths(fasta, full_header=False):
"""
Get sequence lengths as dictionary.
Parameters:
- fasta (str): Path to FASTA file
- full_header (bool): Use full header as key vs first word only
Returns:
dict: {header: length} mapping
"""

Extract specific regions from genomic sequences based on coordinates.

def getSeqRegions(seqs, header, coordinates, coords=False):
"""
Extract sequence regions from coordinates.
Parameters:
- seqs (dict): Dictionary of sequences
- header (str): Sequence header/contig name
- coordinates (list): List of (start, end) coordinate tuples
- coords (bool): Whether to include coordinate information
Returns:
str: Extracted sequence regions concatenated
"""

Translate DNA sequences to proteins using standard genetic codes and perform sequence manipulations.

def translate(dna, strand, phase, table=1):
"""
Translate DNA sequence to protein using genetic code.
Parameters:
- dna (str): DNA sequence to translate
- strand (str): Strand orientation ("+" or "-")
- phase (int): Reading frame phase (0, 1, or 2)
- table (int): Genetic code table (1=standard, 11=bacterial)
Returns:
str: Translated protein sequence
"""
def RevComp(s):
"""
Generate reverse complement of DNA sequence.
Parameters:
- s (str): Input DNA sequence
Returns:
str: Reverse complement sequence
"""

Format sequences and text for output with proper line wrapping.

def softwrap(string, every=80):
"""
Soft wrap text to specified width.
Parameters:
- string (str): Input string to wrap
- every (int): Line width for wrapping
Returns:
str: Wrapped text with newlines
"""

Access to standard genetic code tables for translation.

codon_table = {
"1": {
# Standard genetic code table
"TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
# ... (complete codon to amino acid mapping)
},
"11": {
# Bacterial, archaeal and plant plastid genetic code
"TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
# ... (complete codon to amino acid mapping)
}
}

from gfftk.fasta import FASTA, fasta2dict
# Object-oriented approach
fasta = FASTA("genome.fasta")
chr1_seq = fasta.get_seq("chr1")
# Functional approach
genome = fasta2dict("genome.fasta")
chr1_seq = genome["chr1"]
# Get sequence information
from gfftk.fasta import fasta2headers, fasta2lengths
headers = fasta2headers("genome.fasta")
lengths = fasta2lengths("genome.fasta")
print(f"Number of sequences: {len(headers)}")
print(f"Sequence lengths: {lengths}")

from gfftk.fasta import fasta2dict, getSeqRegions
# Load genome
genome = fasta2dict("genome.fasta")
# Extract specific regions
coordinates = [(1000, 2000), (3000, 4000), (5000, 6000)]
extracted = getSeqRegions(genome, "chr1", coordinates)
print(f"Extracted sequence: {extracted}")

from gfftk.fasta import translate, RevComp
# Example DNA sequence
dna_sequence = "ATGAAGTTTGCCTAG"
# Translate forward strand
protein_forward = translate(dna_sequence, "+", 0, table=1)
print(f"Forward translation: {protein_forward}")
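# Translate reverse strand
# NOTE(review): this reverse-complements the sequence AND passes strand "-".
# If translate() already reverse-complements its input for strand "-", the two
# operations cancel out; confirm against the translate() implementation.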
dna_reverse = RevComp(dna_sequence)
protein_reverse = translate(dna_reverse, "-", 0, table=1)
print(f"Reverse translation: {protein_reverse}")
# Translate with different genetic code (bacterial)
protein_bacterial = translate(dna_sequence, "+", 0, table=11)
print(f"Bacterial code translation: {protein_bacterial}")
# Translate in different reading frames
for phase in [0, 1, 2]:
protein = translate(dna_sequence, "+", phase, table=1)
print(f"Phase {phase}: {protein}")

from gfftk.fasta import FASTA, translate, softwrap
# Initialize genome access
genome = FASTA("genome.fasta")
# Define gene coordinates (from GFF3 parsing)
gene_coords = {
"gene1": {
"contig": "chr1",
"strand": "+",
"cds": [(1000, 1200), (1500, 1700), (2000, 2300)]
}
}
# Extract and translate CDS sequences
for gene_id, gene_info in gene_coords.items():
# Get contig sequence
contig_seq = genome.get_seq(gene_info["contig"])
# Extract CDS regions
cds_sequence = ""
for start, end in gene_info["cds"]:
cds_sequence += contig_seq[start-1:end] # Convert to 0-based
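# Handle reverse strand
# NOTE(review): the sequence is reverse-complemented here and the strand is
# also passed to translate() below. If translate() reverse-complements for
# strand "-" itself, this step is redundant; confirm against the implementation.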
if gene_info["strand"] == "-":
from gfftk.fasta import RevComp
cds_sequence = RevComp(cds_sequence)
# Translate to protein
protein = translate(cds_sequence, gene_info["strand"], 0, table=1)
# Format output
wrapped_protein = softwrap(protein, every=60)
print(f">{gene_id}\n{wrapped_protein}")

from gfftk.fasta import fasta2dict
# Works with compressed FASTA files automatically
genome = fasta2dict("genome.fasta.gz")
genome2 = fasta2dict("genome.fasta.bz2")
print(f"Loaded {len(genome)} sequences from compressed file")

# Sequence dictionary format
SequenceDict = dict[str, str] # {header: sequence}
# Sequence header set
HeaderSet = set[str]
# Sequence length dictionary
LengthDict = dict[str, int] # {header: length}
# Coordinate tuple format
CoordinateTuple = tuple[int, int] # (start, end) in 1-based coordinates
# Coordinate list
CoordinateList = list[CoordinateTuple]
# Strand orientation
Strand = str # "+" or "-"
# Reading frame phase
Phase = int # 0, 1, or 2
# Genetic code table identifier
GeneticCodeTable = int # 1 (standard) or 11 (bacterial/archaeal/plant plastid)
# DNA sequence
DNASequence = str # String containing A, T, G, C, N characters
# Protein sequence
ProteinSequence = str # String containing single-letter amino acid codes
# Codon table structure
CodonTable = dict[str, str] # {codon: amino_acid}
# Complete genetic code tables
GeneticCodeTables = dict[str, CodonTable]

Install with Tessl CLI
npx tessl i tessl/pypi-gfftk