Comprehensive Python toolkit for working with genome annotation files in GFF3, GTF, and TBL formats with format conversion and analysis capabilities
npx @tessl/cli install tessl/pypi-gfftk@25.6.0

A comprehensive Python toolkit for working with genome annotation files in GFF3, GTF, and TBL formats. GFFtk provides powerful format conversion capabilities, allowing users to convert between different genomic file formats including GenBank, extract protein and transcript sequences from annotations, and perform advanced filtering operations using flexible regex patterns on genomic features.
pip install gfftk

gfftk (after installation)

import gfftk

Common imports for format conversion and parsing:
from gfftk.gff import gff2dict, dict2gff3, gtf2dict, dict2gtf
from gfftk.convert import gff2proteins, gff2tbl, tbl2gff3
from gfftk.genbank import tbl2dict, dict2tbl
from gfftk.fasta import fasta2dict, translate, FASTA

from gfftk.gff import gff2dict
from gfftk.convert import gff2proteins
from gfftk.fasta import fasta2dict
# Load genome sequence and annotation
genome = fasta2dict("genome.fasta")
annotation = gff2dict("annotation.gff3", "genome.fasta")
# Convert GFF3 to protein FASTA
gff2proteins("annotation.gff3", "genome.fasta", output="proteins.faa")
# Access annotation data programmatically
for gene_id, gene_data in annotation.items():
    print(f"Gene: {gene_id}")
    print(f"Location: {gene_data['contig']}:{gene_data['location'][0]}-{gene_data['location'][1]}")
    print(f"Products: {gene_data['product']}")

GFFtk is built around a central annotation dictionary format that enables seamless conversion between different genomic annotation formats:
This design allows for reliable conversion between formats while preserving all annotation information and relationships.
Convert between GFF3, GTF, TBL, GenBank, and FASTA formats with full feature preservation and validation. Supports protein and transcript sequence extraction with customizable genetic code tables.
def gff2tbl(gff, fasta, output=False, table=1, debug=False, grep=[], grepv=[]):
    """Convert GFF3 to TBL format"""
def tbl2gff3(tbl, fasta, output=False, table=1, grep=[], grepv=[]):
    """Convert TBL to GFF3 format"""
def gff2proteins(gff, fasta, output=False, table=1, strip_stop=False, debug=False, grep=[], grepv=[]):
    """Convert GFF3 to protein FASTA sequences"""

Parse and validate GFF3 and GTF files with support for multiple annotation sources and formats. Handles complex gene models with alternative splicing and provides robust error checking.
def gff2dict(gff, fasta, table=1, debug=False):
    """Parse GFF3 to annotation dictionary"""
def gtf2dict(gtf, fasta, table=1, debug=False):
    """Parse GTF to annotation dictionary"""
def dict2gff3(infile, output=False, debug=False, source=False, newline=False):
    """Write annotation dictionary to GFF3 format"""

Extract and manipulate genomic sequences with support for coordinate-based extraction, translation using multiple genetic codes, and reverse complement operations.
class FASTA:
    def __init__(self, fasta_file): ...
    def get_seq(self, contig): ...
def fasta2dict(fasta, full_header=False):
    """Convert FASTA file to dictionary"""
def translate(dna, strand, phase, table=1):
    """Translate DNA sequence to protein"""

EvidenceModeler-like consensus gene prediction that combines multiple annotation sources using protein and transcript evidence, with configurable scoring weights and structural validation.
def generate_consensus(fasta, genes, proteins, transcripts, weights, output, ...):
    """Generate consensus predictions from multiple sources"""
def getAED(query, reference):
    """Calculate Annotation Edit Distance"""
def score_by_evidence(locus, weights={}, derived=[]):
    """Score models by evidence overlap"""

Compare two genome annotations to identify differences, calculate similarity metrics, and generate detailed comparison reports with feature-level analysis.
def compareAnnotations(old, new, fasta, output=False):
    """Compare two GFF3 annotations"""
def pairwiseAED(query, reference):
    """Calculate pairwise AED scores"""
def gff2interlap(input, fasta):
    """Convert GFF3 to InterLap structure for overlap analysis"""

Complete support for NCBI GenBank and TBL annotation formats with bidirectional conversion, validation, and NCBI submission integration.
def tbl2dict(inputfile, fasta, annotation=False, table=1, debug=False):
    """Convert NCBI TBL format to annotation dictionary"""
def dict2tbl(annots, seqs, outfile, table=1, debug=False):
    """Convert annotation dictionary to NCBI TBL format"""
def dict2gbff(annots, seqs, outfile, organism=None, circular=False):
    """Convert annotation dictionary to GenBank format"""
def table2asn(sbt, tbl, fasta, out, organism, strain, table=1):
    """Run NCBI table2asn for GenBank submission"""

Direct command-line access to all GFFtk functions with comprehensive parameter support and batch processing capabilities.
def convert(args):
    """CLI interface for format conversion operations"""
def consensus(args):
    """CLI interface for consensus gene prediction"""
def compare(args):
    """CLI interface for annotation comparison"""
def stats(args):
    """CLI interface for statistics calculation"""

Comprehensive file handling utilities with support for compressed formats, data validation, and annotation statistics calculation.
def zopen(filename, mode="r", buff=1024*1024, external=True):
    """Open files with automatic compression support"""
def annotation_stats(Genes):
    """Calculate comprehensive annotation statistics"""
def filter_annotations(annotations, grep=None, grepv=None):
    """Filter annotations using regex patterns"""

# Central annotation dictionary format
AnnotationDict = dict[str, dict]
# Gene annotation structure
GeneAnnotation = {
"name": str, # Gene name/identifier
"type": list[str], # Feature types per transcript
"transcript": list[str], # Full transcript sequences
"cds_transcript": list[str], # CDS-only sequences
"protein": list[str], # Protein translations
"5UTR": list[list[tuple[int, int]]], # 5' UTR coordinates
"3UTR": list[list[tuple[int, int]]], # 3' UTR coordinates
"codon_start": list[int], # Translation start phase
"ids": list[str], # Transcript IDs
"CDS": list[list[tuple[int, int]]], # CDS coordinates
"mRNA": list[list[tuple[int, int]]], # mRNA coordinates
"strand": str, # Strand ("+"/"-")
"gene_synonym": list[str], # Gene synonyms
"location": tuple[int, int], # Gene coordinates
"contig": str, # Contig/chromosome
"product": list[str], # Product descriptions
"source": str, # Annotation source
"phase": list[str], # CDS phase info
"db_xref": list[list[str]], # Database cross-refs
"go_terms": list[list[str]], # GO terms
"EC_number": list[list[str]], # EC numbers
"note": list[list[str]], # Notes
"partialStart": list[bool], # Partial start flags
"partialStop": list[bool], # Partial stop flags
"pseudo": bool, # Pseudogene flag
}