Comprehensive Python toolkit for working with genome annotation files in GFF3, GTF, and TBL formats, with format conversion and analysis capabilities.
Comprehensive parsing and manipulation of GFF3 and GTF format files with support for multiple annotation sources, robust validation, and flexible output options. Handles complex gene models with alternative splicing and provides the foundation for all format conversion operations.
Parse GFF3 files into the central annotation dictionary format with support for multiple annotation sources and validation.
def gff2dict(gff, fasta, annotation=False, table=1, debug=False, gap_filter=False, gff_format="auto", logger=sys.stderr.write):
"""
Parse GFF3 file to annotation dictionary.
Parameters:
- gff (str): Path to input GFF3 file
- fasta (str): Path to genome FASTA file for sequence validation
- annotation (dict|bool): Pre-existing annotation dictionary to extend, or False
- table (int): Genetic code table for translation (1 or 11)
- debug (bool): Enable debug output for parsing errors
- gap_filter (bool): Filter out models with sequence gaps
- gff_format (str): GFF format variant ("auto", "default", "miniprot", etc.)
- logger (function): Logging function for error messages
Returns:
dict: Annotation dictionary with gene_id as keys and gene data as values
"""
def dict2gff3(infile, output=False, debug=False, source=False, newline=False):
"""
Write annotation dictionary to GFF3 format.
Parameters:
- infile (dict): Annotation dictionary to write
- output (str|bool): Output file path, or False for stdout
- debug (bool): Enable debug output
- source (str|bool): Override source field in output
- newline (bool): Add newlines between gene records
Returns:
None
"""
def dict2gff3alignments(infile, output=False, debug=False, alignments=False, source=False, newline=False):
"""
Write annotation dictionary to GFF3 alignments format for EVM evidence.
Parameters:
- infile (dict): Annotation dictionary to write
- output (str|bool): Output file path, or False for stdout
- debug (bool): Enable debug output
- alignments (dict|bool): Alignment data structure for evidence formatting
- source (str|bool): Override source field in output
- newline (bool): Add newlines between records
Returns:
None
"""

Parse GTF files with support for different GTF formats and dialects from various annotation sources.
def gtf2dict(gtf, fasta, annotation=False, table=1, debug=False, gap_filter=False, gtf_format="auto", logger=sys.stderr.write):
"""
Parse GTF file to annotation dictionary.
Parameters:
- gtf (str): Path to input GTF file
- fasta (str): Path to genome FASTA file for sequence validation
- annotation (dict|bool): Pre-existing annotation dictionary to extend, or False
- table (int): Genetic code table for translation (1 or 11)
- debug (bool): Enable debug output for parsing errors
- gap_filter (bool): Filter out models with sequence gaps
- gtf_format (str): GTF format variant ("auto", "default", "genemark", "jgi")
- logger (function): Logging function for error messages
Returns:
dict: Annotation dictionary with gene_id as keys and gene data as values
"""
def dict2gtf(infile, output=False, source=False):
"""
Write annotation dictionary to GTF format.
Parameters:
- infile (dict): Annotation dictionary to write
- output (str|bool): Output file path, or False for stdout
- source (str|bool): Override source field in output
Returns:
None
"""

Validate gene models and generate protein translations with comprehensive error checking.
def validate_models(annotation, fadict, logger=sys.stderr.write, table=1, gap_filter=False):
"""
Validate gene model structure and sequences.
Parameters:
- annotation (dict): Annotation dictionary to validate
- fadict (dict): Genome sequences dictionary
- logger (function): Logging function for error messages
- table (int): Genetic code table for validation
- gap_filter (bool): Filter out models with sequence gaps
Returns:
dict: Validated annotation dictionary
"""
def validate_and_translate_models(annotation, fadict, logger=sys.stderr.write, table=1):
"""
Validate gene models and generate protein translations.
Parameters:
- annotation (dict): Annotation dictionary to process
- fadict (dict): Genome sequences dictionary
- logger (function): Logging function for error messages
- table (int): Genetic code table for translation
Returns:
dict: Annotation dictionary with validated translations
"""

Internal parsers for handling different GFF3 and GTF formats from various annotation sources.
def _gff_default_parser(gff, fasta, Genes):
"""
Default GFF3 parser implementation.
Parameters:
- gff (str): Path to GFF3 file
- fasta (str): Path to genome FASTA file
- Genes (dict): Annotation dictionary to populate
Returns:
dict: Updated annotation dictionary
"""
def _gff_miniprot_parser(gff, fasta, Genes):
"""
Miniprot-specific GFF3 parser for protein alignments.
Parameters:
- gff (str): Path to miniprot GFF3 file
- fasta (str): Path to genome FASTA file
- Genes (dict): Annotation dictionary to populate
Returns:
dict: Updated annotation dictionary
"""
def _gff_alignment_parser(gff, fasta, Genes):
"""
Alignment GFF3 parser for transcript/protein alignments.
Parameters:
- gff (str): Path to alignment GFF3 file
- fasta (str): Path to genome FASTA file
- Genes (dict): Annotation dictionary to populate
Returns:
dict: Updated annotation dictionary
"""
def _gff_ncbi_parser(gff, fasta, Genes):
"""
NCBI GFF3 parser for NCBI-formatted annotations.
Parameters:
- gff (str): Path to NCBI GFF3 file
- fasta (str): Path to genome FASTA file
- Genes (dict): Annotation dictionary to populate
Returns:
dict: Updated annotation dictionary
"""
def _gtf_default_parser(gtf, fasta, Genes, gtf_format="default"):
"""
Default GTF parser implementation.
Parameters:
- gtf (str): Path to GTF file
- fasta (str): Path to genome FASTA file
- Genes (dict): Annotation dictionary to populate
- gtf_format (str): GTF format variant
Returns:
dict: Updated annotation dictionary
"""
def _gtf_genemark_parser(gtf, fasta, Genes, gtf_format="genemark"):
"""
GeneMark GTF parser for GeneMark-specific format.
Parameters:
- gtf (str): Path to GeneMark GTF file
- fasta (str): Path to genome FASTA file
- Genes (dict): Annotation dictionary to populate
- gtf_format (str): GTF format variant
Returns:
dict: Updated annotation dictionary
"""
def _gtf_jgi_parser(gtf, fasta, Genes, gtf_format="jgi"):
"""
JGI GTF parser for JGI-specific format.
Parameters:
- gtf (str): Path to JGI GTF file
- fasta (str): Path to genome FASTA file
- Genes (dict): Annotation dictionary to populate
- gtf_format (str): GTF format variant
Returns:
dict: Updated annotation dictionary
"""

Process and simplify Gene Ontology term lists for cleaner annotation output.
def simplifyGO(inputList):
"""
Simplify Gene Ontology term list format.
Parameters:
- inputList (list): List of GO terms in various formats
Returns:
list: Simplified GO term list
"""

Handle start and end gaps in genomic sequences during parsing and validation.
def start_end_gap(seq, coords):
"""
Handle start/end gaps in genomic sequences.
Parameters:
- seq (str): Genomic sequence
- coords (list): List of coordinate tuples
Returns:
tuple: Adjusted coordinates and gap information
"""

from gfftk.gff import gff2dict, dict2gff3
# Parse GFF3 file to annotation dictionary
annotation = gff2dict("input.gff3", "genome.fasta")
# Access gene information
for gene_id, gene_data in annotation.items():
    print(f"Gene: {gene_id}")
    print(f"Location: {gene_data['location']}")
    print(f"Strand: {gene_data['strand']}")
    print(f"Products: {gene_data['product']}")
# Write back to GFF3 format
dict2gff3(annotation, output="output.gff3")

from gfftk.gff import gtf2dict, dict2gtf
# Parse GTF file
annotation = gtf2dict("input.gtf", "genome.fasta", debug=True)
# Write to GTF format with custom source
dict2gtf(annotation, output="output.gtf", source="custom_pipeline")

from gfftk.gff import gff2dict, validate_and_translate_models
from gfftk.fasta import fasta2dict
# Load data
annotation = gff2dict("annotation.gff3", "genome.fasta")
genome = fasta2dict("genome.fasta")
# Validate and generate translations
validated = validate_and_translate_models(annotation, genome, table=1)
# Access protein translations
for gene_id, gene_data in validated.items():
    for i, protein in enumerate(gene_data['protein']):
        transcript_id = gene_data['ids'][i]
        print(f"{transcript_id}: {protein}")

from gfftk.gff import gff2dict
# Parse different annotation sources with debug output
augustus_annotation = gff2dict("augustus.gff3", "genome.fasta", debug=True)
ncbi_annotation = gff2dict("ncbi.gff3", "genome.fasta", debug=True)
miniprot_annotation = gff2dict("miniprot.gff3", "genome.fasta", debug=True)
# Combine annotations (example workflow)
combined = {}
combined.update(augustus_annotation)
combined.update(ncbi_annotation)
combined.update(miniprot_annotation)

# Annotation dictionary structure (detailed in main index)
AnnotationDict = dict[str, GeneAnnotation]
# Parser function type
ParserFunction = callable[[str, str, dict], dict]
# Logger function type
LoggerFunction = callable[[str], None]
# Coordinate tuple format
CoordinateTuple = tuple[int, int]
# Feature coordinate list
FeatureCoordinates = list[CoordinateTuple]
# Gene Ontology term format
GOTerm = str # Format: "GO:0000000"
# Database cross-reference format
DbXref = str # Format: "database:identifier"

Install with Tessl CLI
npx tessl i tessl/pypi-gfftk