tessl/pypi-gfftk

Comprehensive Python toolkit for working with genome annotation files in GFF3, GTF, and TBL formats, with format conversion and analysis capabilities.

Overview

Eval results

Files

GFF3 and GTF Processing

Name: tessl/pypi-gfftk
Author: tessl

This module provides comprehensive parsing and manipulation of GFF3 and GTF files, with support for multiple annotation sources, robust validation, and flexible output options. It handles complex gene models with alternative splicing and provides the foundation for all format conversion operations.

Capabilities

GFF3 Parsing

Parse GFF3 files into the central annotation dictionary format with support for multiple annotation sources and validation.

def gff2dict(gff, fasta, annotation=False, table=1, debug=False, gap_filter=False, gff_format="auto", logger=sys.stderr.write):
    """
    Parse GFF3 file to annotation dictionary.

    Parameters:
    - gff (str): Path to input GFF3 file
    - fasta (str): Path to genome FASTA file for sequence validation
    - annotation (dict|bool): Pre-existing annotation dictionary to extend, or False
    - table (int): Genetic code table for translation (1 or 11)
    - debug (bool): Enable debug output for parsing errors
    - gap_filter (bool): Filter out models with sequence gaps
    - gff_format (str): GFF format variant ("auto", "default", "miniprot", etc.)
    - logger (function): Logging function for error messages

    Returns:
    dict: Annotation dictionary with gene_id as keys and gene data as values
    """

def dict2gff3(infile, output=False, debug=False, source=False, newline=False):
    """
    Write annotation dictionary to GFF3 format.

    Parameters:
    - infile (dict): Annotation dictionary to write
    - output (str|bool): Output file path, or False for stdout
    - debug (bool): Enable debug output
    - source (str|bool): Override source field in output
    - newline (bool): Add newlines between gene records

    Returns:
    None
    """

def dict2gff3alignments(infile, output=False, debug=False, alignments=False, source=False, newline=False):
    """
    Write annotation dictionary to GFF3 alignments format for EVM evidence.

    Parameters:
    - infile (dict): Annotation dictionary to write
    - output (str|bool): Output file path, or False for stdout
    - debug (bool): Enable debug output
    - alignments (dict|bool): Alignment data structure for evidence formatting
    - source (str|bool): Override source field in output
    - newline (bool): Add newlines between records

    Returns:
    None
    """

GTF Parsing

Parse GTF files with support for different GTF formats and dialects from various annotation sources.

def gtf2dict(gtf, fasta, annotation=False, table=1, debug=False, gap_filter=False, gtf_format="auto", logger=sys.stderr.write):
    """
    Parse GTF file to annotation dictionary.

    Parameters:
    - gtf (str): Path to input GTF file
    - fasta (str): Path to genome FASTA file for sequence validation
    - annotation (dict|bool): Pre-existing annotation dictionary to extend, or False
    - table (int): Genetic code table for translation (1 or 11)
    - debug (bool): Enable debug output for parsing errors
    - gap_filter (bool): Filter out models with sequence gaps
    - gtf_format (str): GTF format variant ("auto", "default", "genemark", "jgi")
    - logger (function): Logging function for error messages

    Returns:
    dict: Annotation dictionary with gene_id as keys and gene data as values
    """

def dict2gtf(infile, output=False, source=False):
    """
    Write annotation dictionary to GTF format.

    Parameters:
    - infile (dict): Annotation dictionary to write
    - output (str|bool): Output file path, or False for stdout
    - source (str|bool): Override source field in output

    Returns:
    None
    """

Validation and Translation

Validate gene models and generate protein translations with comprehensive error checking.

def validate_models(annotation, fadict, logger=sys.stderr.write, table=1, gap_filter=False):
    """
    Validate gene model structure and sequences.

    Parameters:
    - annotation (dict): Annotation dictionary to validate
    - fadict (dict): Genome sequences dictionary
    - logger (function): Logging function for error messages
    - table (int): Genetic code table for validation
    - gap_filter (bool): Filter out models with sequence gaps

    Returns:
    dict: Validated annotation dictionary
    """

def validate_and_translate_models(annotation, fadict, logger=sys.stderr.write, table=1):
    """
    Validate gene models and generate protein translations.

    Parameters:
    - annotation (dict): Annotation dictionary to process
    - fadict (dict): Genome sequences dictionary
    - logger (function): Logging function for error messages
    - table (int): Genetic code table for translation

    Returns:
    dict: Annotation dictionary with validated translations
    """

Specialized Parsers

Internal parsers for handling different GFF3 and GTF formats from various annotation sources.

def _gff_default_parser(gff, fasta, Genes):
    """
    Default GFF3 parser implementation.

    Parameters:
    - gff (str): Path to GFF3 file
    - fasta (str): Path to genome FASTA file
    - Genes (dict): Annotation dictionary to populate

    Returns:
    dict: Updated annotation dictionary
    """

def _gff_miniprot_parser(gff, fasta, Genes):
    """
    Miniprot-specific GFF3 parser for protein alignments.

    Parameters:
    - gff (str): Path to miniprot GFF3 file
    - fasta (str): Path to genome FASTA file
    - Genes (dict): Annotation dictionary to populate

    Returns:
    dict: Updated annotation dictionary
    """

def _gff_alignment_parser(gff, fasta, Genes):
    """
    Alignment GFF3 parser for transcript/protein alignments.

    Parameters:
    - gff (str): Path to alignment GFF3 file
    - fasta (str): Path to genome FASTA file
    - Genes (dict): Annotation dictionary to populate

    Returns:
    dict: Updated annotation dictionary
    """

def _gff_ncbi_parser(gff, fasta, Genes):
    """
    NCBI GFF3 parser for NCBI-formatted annotations.

    Parameters:
    - gff (str): Path to NCBI GFF3 file
    - fasta (str): Path to genome FASTA file
    - Genes (dict): Annotation dictionary to populate

    Returns:
    dict: Updated annotation dictionary
    """

def _gtf_default_parser(gtf, fasta, Genes, gtf_format="default"):
    """
    Default GTF parser implementation.

    Parameters:
    - gtf (str): Path to GTF file
    - fasta (str): Path to genome FASTA file
    - Genes (dict): Annotation dictionary to populate
    - gtf_format (str): GTF format variant

    Returns:
    dict: Updated annotation dictionary
    """

def _gtf_genemark_parser(gtf, fasta, Genes, gtf_format="genemark"):
    """
    GeneMark GTF parser for GeneMark-specific format.

    Parameters:
    - gtf (str): Path to GeneMark GTF file
    - fasta (str): Path to genome FASTA file
    - Genes (dict): Annotation dictionary to populate
    - gtf_format (str): GTF format variant

    Returns:
    dict: Updated annotation dictionary
    """

def _gtf_jgi_parser(gtf, fasta, Genes, gtf_format="jgi"):
    """
    JGI GTF parser for JGI-specific format.

    Parameters:
    - gtf (str): Path to JGI GTF file
    - fasta (str): Path to genome FASTA file
    - Genes (dict): Annotation dictionary to populate
    - gtf_format (str): GTF format variant

    Returns:
    dict: Updated annotation dictionary
    """

GO Term Processing

Process and simplify Gene Ontology term lists for cleaner annotation output.

def simplifyGO(inputList):
    """
    Simplify Gene Ontology term list format.

    Parameters:
    - inputList (list): List of GO terms in various formats

    Returns:
    list: Simplified GO term list
    """

Sequence Gap Handling

Handle start and end gaps in genomic sequences during parsing and validation.

def start_end_gap(seq, coords):
    """
    Handle start/end gaps in genomic sequences.

    Parameters:
    - seq (str): Genomic sequence
    - coords (list): List of coordinate tuples

    Returns:
    tuple: Adjusted coordinates and gap information
    """

Usage Examples

Basic GFF3 Parsing

from gfftk.gff import gff2dict, dict2gff3

# Parse GFF3 file to annotation dictionary
annotation = gff2dict("input.gff3", "genome.fasta")

# Access gene information
for gene_id, gene_data in annotation.items():
    print(f"Gene: {gene_id}")
    print(f"Location: {gene_data['location']}")
    print(f"Strand: {gene_data['strand']}")
    print(f"Products: {gene_data['product']}")

# Write back to GFF3 format
dict2gff3(annotation, output="output.gff3")

GTF Processing

from gfftk.gff import gtf2dict, dict2gtf

# Parse GTF file
annotation = gtf2dict("input.gtf", "genome.fasta", debug=True)

# Write to GTF format with custom source
dict2gtf(annotation, output="output.gtf", source="custom_pipeline")

Validation and Translation

from gfftk.gff import gff2dict, validate_and_translate_models
from gfftk.fasta import fasta2dict

# Load data
annotation = gff2dict("annotation.gff3", "genome.fasta")
genome = fasta2dict("genome.fasta")

# Validate and generate translations
validated = validate_and_translate_models(annotation, genome, table=1)

# Access protein translations
for gene_id, gene_data in validated.items():
    for i, protein in enumerate(gene_data['protein']):
        transcript_id = gene_data['ids'][i]
        print(f"{transcript_id}: {protein}")

Working with Different Sources

from gfftk.gff import gff2dict

# Parse different annotation sources with debug output
# (gff_format is left at its default, "auto", for each file)
augustus_annotation = gff2dict("augustus.gff3", "genome.fasta", debug=True)
ncbi_annotation = gff2dict("ncbi.gff3", "genome.fasta", debug=True)
miniprot_annotation = gff2dict("miniprot.gff3", "genome.fasta", debug=True)

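# Combine annotations (example workflow)
# Note: dict.update() overwrites any entry whose gene ID is already present,
# so when two sources share a gene ID the later source wins.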
combined = {}
combined.update(augustus_annotation)
combined.update(ncbi_annotation)
combined.update(miniprot_annotation)

Types

# Annotation dictionary structure (detailed in main index)
AnnotationDict = dict[str, GeneAnnotation]

# Parser function type (Callable is typing.Callable; the builtin callable() cannot be subscripted)
ParserFunction = Callable[[str, str, dict], dict]

# Logger function type
LoggerFunction = Callable[[str], None]

# Coordinate tuple format
CoordinateTuple = tuple[int, int]

# Feature coordinate list
FeatureCoordinates = list[CoordinateTuple]

# Gene Ontology term format
GOTerm = str  # Format: "GO:0000000"

# Database cross-reference format
DbXref = str  # Format: "database:identifier"

Install with Tessl CLI