tessl/pypi-gfftk

Comprehensive Python toolkit for working with genome annotation files in GFF3, GTF, and TBL formats with format conversion and analysis capabilities

Overview

Eval results

Files

Consensus Gene Prediction

Name: tessl/pypi-gfftk
Author: tessl

EvidenceModeler-like consensus gene prediction system that combines multiple annotation sources using protein and transcript evidence alignment, configurable source weights, and structural validation to generate high-quality consensus gene models.

Capabilities

Main Consensus Generation

Generate consensus gene predictions from multiple annotation sources with evidence-based scoring.

def generate_consensus(fasta, genes, proteins, transcripts, weights, out, debug=False, minscore=False, repeats=False, repeat_overlap=90, tiebreakers="calculated", min_exon=3, min_intron=11, max_intron=-1, max_exon=-1, evidence_derived_models=[], num_processes=None, utrs=True, min_utr_length=10, max_utr_length=2000, log=sys.stderr.write):
    """
    Generate consensus predictions from multiple annotation sources.

    Parameters:
    - fasta (str): Path to genome FASTA file
    - genes (list): List of paths to gene model files in GFF3 format
    - proteins (list): List of paths to protein alignment files in GFF3 format
    - transcripts (list): List of paths to transcript alignment files in GFF3 format
    - weights (dict): User-supplied source weights {source: weight}
    - out (str): Path to output consensus GFF3 file
    - debug (bool): Enable debug output
    - minscore (int|bool): Minimum score to retain gene model, or False for auto
    - repeats (str|bool): Path to repeat annotations, or False
    - repeat_overlap (int): Percent overlap with repeats to filter (default: 90)
    - tiebreakers (str): Tiebreaker method ("calculated" or other)
    - min_exon (int): Minimum exon length (default: 3)
    - min_intron (int): Minimum intron length (default: 11)
    - max_intron (int): Maximum intron length (-1 for no limit)
    - max_exon (int): Maximum exon length (-1 for no limit)
    - evidence_derived_models (list): List of evidence-derived model sources
    - num_processes (int|None): Number of parallel processes
    - utrs (bool): Enable UTR extension (default: True)
    - min_utr_length (int): Minimum UTR length (default: 10)
    - max_utr_length (int): Maximum UTR length (default: 2000)
    - log (function): Logging function

    Returns:
    None
    """

Annotation Edit Distance

Calculate Annotation Edit Distance (AED) between gene models for similarity assessment.

def getAED(query, reference):
    """
    Calculate Annotation Edit Distance between two gene models.

    Parameters:
    - query (dict): Query gene model data structure
    - reference (dict): Reference gene model data structure

    Returns:
    float: AED score (0.0 = perfect match, 1.0 = no overlap)
    """

Evidence-Based Scoring

Score gene models based on overlap with protein and transcript evidence.

def score_by_evidence(locus, weights={}, derived=[]):
    """
    Score gene models based on evidence overlap.

    Parameters:
    - locus (list): List of gene models at a genomic locus
    - weights (dict): User-supplied source weights
    - derived (list): Derived weights from evidence analysis

    Returns:
    dict: Scored gene models with evidence metrics
    """

Model Selection

Select the best gene model from a locus based on scoring criteria.

def best_model_default(locus_name, contig, strand, locus, debug=False, weights={}, order={}, min_exon=3, min_intron=10, max_intron=-1, max_exon=-1, evidence_derived_models=[]):
    """
    Select best gene model from locus using default scoring algorithm.

    Parameters:
    - locus_name (str): Name identifier for the locus
    - contig (str): Contig/chromosome name
    - strand (str): Strand orientation ("+" or "-")
    - locus (list): List of gene models at genomic locus
    - debug (bool): Enable debug output
    - weights (dict): User-supplied source weights
    - order (dict): Derived weights from evidence analysis
    - min_exon (int): Minimum exon length requirement
    - min_intron (int): Minimum intron length requirement
    - max_intron (int): Maximum intron length (-1 for no limit)
    - max_exon (int): Maximum exon length requirement (-1 for no limit)
    - evidence_derived_models (list): List of evidence-derived model sources

    Returns:
    dict: Best scoring gene model or None if no suitable model found
    """

Data Structure Conversion

Convert between different data structures used in consensus prediction.

def gff2interlap(infile, fasta, inter=False):
    """
    Convert GFF3 file to InterLap data structure for overlap analysis.

    Parameters:
    - infile (str): Path to GFF3 file
    - fasta (str): Path to genome FASTA file
    - inter (InterLap|bool): Existing InterLap object to extend, or False

    Returns:
    InterLap: InterLap object with genomic intervals
    """

def bed2interlap(bedfile, inter=False):
    """
    Convert BED file to InterLap data structure.

    Parameters:
    - bedfile (str): Path to BED format file
    - inter (InterLap|bool): Existing InterLap object to extend, or False

    Returns:
    InterLap: InterLap object with genomic intervals
    """

def safe_extract_coordinates(coords):
    """
    Safely extract coordinates from gene model data structure.

    Parameters:
    - coords (list): Coordinate data structure

    Returns:
    list: Validated coordinate tuples
    """

Gene Clustering and Locus Identification

Identify and cluster overlapping gene models into loci for consensus prediction.

def cluster_interlap(obj):
    """
    Cluster overlapping gene models using InterLap data structure.

    Parameters:
    - obj (InterLap): InterLap object containing gene intervals

    Returns:
    list: List of clustered loci with overlapping gene models
    """

def get_loci(annot_dict):
    """
    Group gene models into genomic loci based on overlap.

    Parameters:
    - annot_dict (dict): Annotation dictionary

    Returns:
    dict: Dictionary of loci with overlapping gene models
    """

def sub_cluster(obj):
    """
    Perform sub-clustering within loci for complex overlaps.

    Parameters:
    - obj (InterLap): InterLap object with clustered intervals

    Returns:
    list: List of refined sub-clusters
    """

def contained(a, b):
    """
    Check if interval a is contained within interval b.

    Parameters:
    - a (tuple): Interval coordinates (start, end)
    - b (tuple): Interval coordinates (start, end)

    Returns:
    bool: True if a is contained in b
    """

def get_overlap(a, b):
    """
    Calculate overlap between two genomic intervals.

    Parameters:
    - a (tuple): First interval (start, end)
    - b (tuple): Second interval (start, end)

    Returns:
    int: Number of overlapping base pairs
    """

UTR Processing

Advanced UTR identification and extension algorithms.

def check_intron_compatibility(model_coords, transcript_coords, strand):
    """
    Check if model introns are compatible with transcript evidence.

    Parameters:
    - model_coords (list): Model exon coordinates
    - transcript_coords (list): Transcript exon coordinates
    - strand (str): Strand orientation ("+" or "-")

    Returns:
    bool: True if introns are compatible
    """

def select_best_utrs(utr_exons_list, strand, min_length=10, max_length=2000):
    """
    Select optimal UTR exons from multiple evidence sources.

    Parameters:
    - utr_exons_list (list): List of UTR exon coordinate sets
    - strand (str): Strand orientation ("+" or "-")
    - min_length (int): Minimum UTR length
    - max_length (int): Maximum UTR length

    Returns:
    list: Selected UTR exon coordinates
    """

def extend_utrs(gene_model, transcript_evidence, strand, min_length=10, max_length=2000):
    """
    Extend gene model UTRs using transcript evidence.

    Parameters:
    - gene_model (dict): Gene model data structure
    - transcript_evidence (list): List of transcript alignments
    - strand (str): Strand orientation ("+" or "-")
    - min_length (int): Minimum UTR extension length
    - max_length (int): Maximum UTR extension length

    Returns:
    dict: Gene model with extended UTRs
    """

Repeat Filtering and Quality Control

Filter gene models based on repeat content and structural quality.

def filter_models_repeats(fasta, repeats, gene_models, filter_threshold=90, log=False):
    """
    Filter gene models overlapping repetitive regions.

    Parameters:
    - fasta (str): Path to genome FASTA file
    - repeats (str): Path to repeat annotations (BED format)
    - gene_models (dict): Gene model dictionary
    - filter_threshold (int): Percent overlap threshold for filtering
    - log (function|bool): Logging function or False

    Returns:
    dict: Filtered gene models
    """

def reasonable_model(coords, min_protein=30, min_exon=3, min_intron=10, max_intron=-1, max_exon=-1):
    """
    Validate gene model structure against biological constraints.

    Parameters:
    - coords (dict): Gene coordinates {'CDS': [...], 'protein': [...]}
    - min_protein (int): Minimum protein length (amino acids)
    - min_exon (int): Minimum exon length (nucleotides)
    - min_intron (int): Minimum intron length (nucleotides)
    - max_intron (int): Maximum intron length (-1 for no limit)
    - max_exon (int): Maximum exon length (-1 for no limit)

    Returns:
    bool: True if model passes structural validation
    """

Evidence Integration

Load and integrate protein and transcript evidence into consensus prediction.

def gffevidence2dict(file, Evi):
    """
    Parse evidence alignments from GFF3 into evidence dictionary.

    Parameters:
    - file (str): Path to evidence GFF3 file
    - Evi (dict): Existing evidence dictionary to update

    Returns:
    dict: Updated evidence dictionary
    """

def add_evidence(loci, evidence, source="proteins"):
    """
    Add evidence data to gene model loci.

    Parameters:
    - loci (dict): Dictionary of gene model loci
    - evidence (dict): Evidence alignment dictionary
    - source (str): Evidence type ("proteins" or "transcripts")

    Returns:
    dict: Loci with integrated evidence information
    """

def parse_data(genome, gene, protein, transcript, log=sys.stderr.write):
    """
    Parse all input data files for consensus prediction.

    Parameters:
    - genome (str): Path to genome FASTA file
    - gene (list): List of gene model file paths
    - protein (list): List of protein alignment file paths
    - transcript (list): List of transcript alignment file paths
    - log (function): Logging function

    Returns:
    tuple: (genome_dict, gene_dict, evidence_dict)
    """

def filter4evidence(data, n_genes=3, n_evidence=2):
    """
    Filter loci requiring minimum gene and evidence counts.

    Parameters:
    - data (dict): Loci data dictionary
    - n_genes (int): Minimum number of gene models required
    - n_evidence (int): Minimum number of evidence alignments required

    Returns:
    dict: Filtered loci meeting evidence requirements
    """

Scoring and Ranking Algorithms

Advanced scoring functions for consensus model selection.

def score_aggregator(locus, weights, evidence_weights, min_protein=30):
    """
    Aggregate multiple scoring metrics for final model ranking.

    Parameters:
    - locus (list): List of gene models at locus
    - weights (dict): User-supplied source weights
    - evidence_weights (dict): Evidence-derived weights
    - min_protein (int): Minimum protein length requirement

    Returns:
    list: Ranked gene models with aggregated scores
    """

def cluster_by_aed(locus, score=0.005):
    """
    Cluster highly similar models using AED scores.

    Parameters:
    - locus (list): List of gene models
    - score (float): AED threshold for clustering

    Returns:
    list: Clustered gene models
    """

def map_coords(g_coords, e_coords):
    """
    Map gene coordinates to evidence coordinates.

    Parameters:
    - g_coords (list): Gene model coordinates
    - e_coords (list): Evidence alignment coordinates

    Returns:
    dict: Coordinate mapping information
    """

def score_evidence(g_coords, e_coords, weight=2):
    """
    Score gene model based on evidence overlap.

    Parameters:
    - g_coords (list): Gene model coordinates
    - e_coords (list): Evidence coordinates
    - weight (float): Evidence weight factor

    Returns:
    float: Evidence-based score
    """

Weight Calculation and Source Ranking

Calculate source weights and rankings based on evidence support.

def calculate_source_order(data):
    """
    Calculate source ranking based on evidence support.

    Parameters:
    - data (dict): Gene model and evidence data

    Returns:
    dict: Source rankings and weights
    """

def order_sources(locus):
    """
    Order prediction sources by quality metrics.

    Parameters:
    - locus (list): List of gene models at locus

    Returns:
    dict: Ordered source rankings
    """

def src_scaling_factor(obj):
    """
    Calculate scaling factors for different prediction sources.

    Parameters:
    - obj (dict): Source performance data

    Returns:
    dict: Source-specific scaling factors
    """

def de_novo_distance(locus):
    """
    Calculate distance metrics for de novo source weights.

    Parameters:
    - locus (list): List of gene models

    Returns:
    dict: Distance-based weight metrics
    """

def calculate_gene_distance(locus):
    """
    Calculate structural distance between gene models.

    Parameters:
    - locus (list): List of gene models at locus

    Returns:
    dict: Inter-model distance metrics
    """

Algorithm Configuration

Configure consensus prediction algorithm parameters.

def auto_score_threshold(weights, order, user_weight=6):
    """
    Automatically determine optimal score threshold.

    Parameters:
    - weights (dict): User-supplied weights
    - order (dict): Source order rankings
    - user_weight (int): Weight for user preferences

    Returns:
    float: Calculated score threshold
    """

def ensure_unique_names(genes):
    """
    Ensure all gene models have unique identifiers.

    Parameters:
    - genes (dict): Gene model dictionary

    Returns:
    dict: Gene models with unique names
    """

def fasta_length(fasta):
    """
    Get sequence lengths from FASTA file.

    Parameters:
    - fasta (str): Path to FASTA file

    Returns:
    dict: Dictionary mapping sequence names to lengths
    """

Output and Refinement

Refine consensus predictions and generate output files.

def refine_cluster(locus, derived=[]):
    """
    Refine gene model cluster using derived weights.

    Parameters:
    - locus (list): List of gene models
    - derived (list): Derived weight parameters

    Returns:
    list: Refined gene model cluster
    """

def solve_sub_loci(result):
    """
    Resolve complex multi-gene loci into individual predictions.

    Parameters:
    - result (dict): Clustered locus data

    Returns:
    dict: Resolved individual gene predictions
    """

def gff_writer(input, output):
    """
    Write consensus predictions to GFF3 format.

    Parameters:
    - input (dict): Consensus gene predictions
    - output (str): Output GFF3 file path

    Returns:
    None
    """

CLI Interface

Command-line interface for consensus prediction.

def consensus(args):
    """
    Main consensus prediction CLI entry point.

    Parameters:
    - args (argparse.Namespace): Command line arguments

    Returns:
    None
    """
    - max_exon (int): Maximum exon length (-1 for no limit)
    - evidence_derived_models (list): Evidence-derived model sources

    Returns:
    dict: Best gene model with score information
    """

Repeat Filtering

Filter gene models that significantly overlap with repetitive elements.

def filter_models_repeats(fasta, repeats, gene_models, filter_threshold=90, log=False):
    """
    Filter gene models overlapping with repetitive elements.

    Parameters:
    - fasta (str): Path to genome FASTA file
    - repeats (str): Path to repeat annotations (BED or GFF3)
    - gene_models (dict): Dictionary of gene models to filter
    - filter_threshold (int): Percent overlap threshold for filtering
    - log (function|bool): Logging function or boolean

    Returns:
    dict: Filtered gene models with repeats removed
    """

Structural Validation

Validate gene model structure against biological constraints.

def reasonable_model(coords, min_protein=30, min_exon=3, min_intron=10, max_intron=-1, max_exon=-1):
    """
    Validate gene model structure against biological constraints.

    Parameters:
    - coords (dict): Gene model coordinate information
    - min_protein (int): Minimum protein length in amino acids
    - min_exon (int): Minimum exon length in nucleotides
    - min_intron (int): Minimum intron length in nucleotides
    - max_intron (int): Maximum intron length (-1 for no limit)
    - max_exon (int): Maximum exon length (-1 for no limit)

    Returns:
    bool: True if model passes validation, False otherwise
    """

Usage Examples

Basic Consensus Prediction

from gfftk.consensus import generate_consensus

# Define input files
gene_predictions = ["augustus.gff3", "genemark.gff3", "snap.gff3"]
protein_alignments = ["uniprot.gff3", "pfam.gff3"]
transcript_alignments = ["rnaseq.gff3"]

# User-defined source weights
weights = {
    "augustus": 3,
    "genemark": 2,
    "snap": 1
}

# Generate consensus
generate_consensus(
    fasta="genome.fasta",
    genes=gene_predictions,
    proteins=protein_alignments,
    transcripts=transcript_alignments,
    weights=weights,
    output="consensus.gff3",
    num_processes=4,
    min_protein=50,
    repeat_overlap=80
)

AED Calculation

from gfftk.consensus import getAED
from gfftk.gff import gff2dict

# Load two annotations for comparison
reference = gff2dict("reference.gff3", "genome.fasta")
query = gff2dict("prediction.gff3", "genome.fasta")

# Calculate AED for overlapping genes
for gene_id in reference:
    if gene_id in query:
        aed = getAED(query[gene_id], reference[gene_id])
        print(f"Gene {gene_id}: AED = {aed:.3f}")

Custom Scoring Pipeline

from gfftk.consensus import score_by_evidence, best_model_default
from gfftk.gff import gff2dict

# Load gene models and evidence
genes = gff2dict("predictions.gff3", "genome.fasta")
proteins = gff2dict("protein_alignments.gff3", "genome.fasta")

# Group genes by locus (simplified example)
loci = {}
for gene_id, gene_data in genes.items():
    locus_key = f"{gene_data['contig']}:{gene_data['location'][0]}-{gene_data['location'][1]}"
    if locus_key not in loci:
        loci[locus_key] = []
    loci[locus_key].append({gene_id: gene_data})

# Score each locus
weights = {"augustus": 2, "genemark": 1}
consensus_models = {}

for locus_key, locus_models in loci.items():
    # Score models by evidence
    scored = score_by_evidence(locus_models, weights)

    # Select best model
    best = best_model_default(scored, weights, [], min_protein=30)

    if best:
        consensus_models.update(best)

print(f"Selected {len(consensus_models)} consensus gene models")

Repeat Filtering

from gfftk.consensus import filter_models_repeats
from gfftk.gff import gff2dict

# Load gene models
models = gff2dict("gene_models.gff3", "genome.fasta")

# Filter models overlapping repeats
filtered_models = filter_models_repeats(
    fasta="genome.fasta",
    repeats="repeats.bed",
    gene_models=models,
    filter_threshold=75,  # Remove models with >75% repeat overlap
    log=print
)

print(f"Retained {len(filtered_models)}/{len(models)} models after repeat filtering")

Structural Validation

from gfftk.consensus import reasonable_model
from gfftk.gff import gff2dict

# Load gene models
models = gff2dict("gene_models.gff3", "genome.fasta")

# Validate each model
valid_models = {}
for gene_id, gene_data in models.items():
    coords = {
        'CDS': gene_data['CDS'],
        'protein': gene_data['protein'],
        'mRNA': gene_data['mRNA']
    }

    # Apply structural constraints
    if reasonable_model(
        coords,
        min_protein=50,      # At least 50 amino acids
        min_exon=15,         # At least 15 bp exons
        min_intron=20,       # At least 20 bp introns
        max_intron=50000     # Max 50kb introns
    ):
        valid_models[gene_id] = gene_data

print(f"Validated {len(valid_models)}/{len(models)} gene models")

Types

# Gene model locus structure
Locus = list[dict]  # List of gene models at same genomic location

# Source weights dictionary
SourceWeights = dict[str, float]  # {source_name: weight_value}

# Evidence overlap metrics
EvidenceMetrics = {
    "protein_overlap": float,     # Percent overlap with protein evidence
    "transcript_overlap": float,  # Percent overlap with transcript evidence
    "aed_score": float,          # Annotation Edit Distance
    "combined_score": float      # Final combined score
}

# Scored gene model
ScoredModel = {
    "gene_data": dict,           # Original gene annotation data
    "score": float,              # Final score
    "evidence": EvidenceMetrics  # Evidence support metrics
}

# Consensus parameters
ConsensusParams = {
    "min_protein": int,          # Minimum protein length
    "min_exon": int,            # Minimum exon length
    "max_exon": int,            # Maximum exon length
    "min_intron": int,          # Minimum intron length
    "max_intron": int,          # Maximum intron length
    "repeat_overlap": int,       # Repeat overlap threshold
    "minscore": int,            # Minimum model score
    "num_processes": int        # Number of parallel processes
}

# AED score
AEDScore = float  # 0.0 (perfect) to 1.0 (no overlap)

# Genomic coordinates
GenomicCoordinates = list[tuple[int, int]]

# Repeat annotation format
RepeatAnnotation = {
    "contig": str,
    "start": int,
    "end": int,
    "type": str
}

Install with Tessl CLI