Comprehensive Python toolkit for working with genome annotation files in GFF3, GTF, and TBL formats with format conversion and analysis capabilities
EvidenceModeler-like consensus gene prediction system that combines multiple annotation sources using protein and transcript evidence alignment, configurable source weights, and structural validation to generate high-quality consensus gene models.
Generate consensus gene predictions from multiple annotation sources with evidence-based scoring.
def generate_consensus(fasta, genes, proteins, transcripts, weights, out, debug=False, minscore=False, repeats=False, repeat_overlap=90, tiebreakers="calculated", min_exon=3, min_intron=11, max_intron=-1, max_exon=-1, evidence_derived_models=[], num_processes=None, utrs=True, min_utr_length=10, max_utr_length=2000, log=sys.stderr.write):
"""
Generate consensus predictions from multiple annotation sources.
Parameters:
- fasta (str): Path to genome FASTA file
- genes (list): List of paths to gene model files in GFF3 format
- proteins (list): List of paths to protein alignment files in GFF3 format
- transcripts (list): List of paths to transcript alignment files in GFF3 format
- weights (dict): User-supplied source weights {source: weight}
- out (str): Path to output consensus GFF3 file
- debug (bool): Enable debug output
- minscore (int|bool): Minimum score to retain gene model, or False for auto
- repeats (str|bool): Path to repeat annotations, or False
- repeat_overlap (int): Percent overlap with repeats to filter (default: 90)
- tiebreakers (str): Tiebreaker method ("calculated" or other)
- min_exon (int): Minimum exon length (default: 3)
- min_intron (int): Minimum intron length (default: 11)
- max_intron (int): Maximum intron length (-1 for no limit)
- max_exon (int): Maximum exon length (-1 for no limit)
- evidence_derived_models (list): List of evidence-derived model sources
- num_processes (int|None): Number of parallel processes
- utrs (bool): Enable UTR extension (default: True)
- min_utr_length (int): Minimum UTR length (default: 10)
- max_utr_length (int): Maximum UTR length (default: 2000)
- log (function): Logging function
Returns:
None
"""Calculate Annotation Edit Distance (AED) between gene models for similarity assessment.
def getAED(query, reference):
"""
Calculate Annotation Edit Distance between two gene models.
Parameters:
- query (dict): Query gene model data structure
- reference (dict): Reference gene model data structure
Returns:
float: AED score (0.0 = perfect match, 1.0 = no overlap)
"""Score gene models based on overlap with protein and transcript evidence.
def score_by_evidence(locus, weights={}, derived=[]):
"""
Score gene models based on evidence overlap.
Parameters:
- locus (list): List of gene models at a genomic locus
- weights (dict): User-supplied source weights
- derived (list): Derived weights from evidence analysis
Returns:
dict: Scored gene models with evidence metrics
"""Select the best gene model from a locus based on scoring criteria.
def best_model_default(locus_name, contig, strand, locus, debug=False, weights={}, order={}, min_exon=3, min_intron=10, max_intron=-1, max_exon=-1, evidence_derived_models=[]):
"""
Select best gene model from locus using default scoring algorithm.
Parameters:
- locus_name (str): Name identifier for the locus
- contig (str): Contig/chromosome name
- strand (str): Strand orientation ("+" or "-")
- locus (list): List of gene models at genomic locus
- debug (bool): Enable debug output
- weights (dict): User-supplied source weights
- order (dict): Derived weights from evidence analysis
- min_exon (int): Minimum exon length requirement
- min_intron (int): Minimum intron length requirement
- max_intron (int): Maximum intron length (-1 for no limit)
- max_exon (int): Maximum exon length requirement (-1 for no limit)
- evidence_derived_models (list): List of evidence-derived model sources
Returns:
dict: Best scoring gene model or None if no suitable model found
"""Convert between different data structures used in consensus prediction.
def gff2interlap(infile, fasta, inter=False):
"""
Convert GFF3 file to InterLap data structure for overlap analysis.
Parameters:
- infile (str): Path to GFF3 file
- fasta (str): Path to genome FASTA file
- inter (InterLap|bool): Existing InterLap object to extend, or False
Returns:
InterLap: InterLap object with genomic intervals
"""
def bed2interlap(bedfile, inter=False):
"""
Convert BED file to InterLap data structure.
Parameters:
- bedfile (str): Path to BED format file
- inter (InterLap|bool): Existing InterLap object to extend, or False
Returns:
InterLap: InterLap object with genomic intervals
"""
def safe_extract_coordinates(coords):
"""
Safely extract coordinates from gene model data structure.
Parameters:
- coords (list): Coordinate data structure
Returns:
list: Validated coordinate tuples
"""Identify and cluster overlapping gene models into loci for consensus prediction.
def cluster_interlap(obj):
"""
Cluster overlapping gene models using InterLap data structure.
Parameters:
- obj (InterLap): InterLap object containing gene intervals
Returns:
list: List of clustered loci with overlapping gene models
"""
def get_loci(annot_dict):
"""
Group gene models into genomic loci based on overlap.
Parameters:
- annot_dict (dict): Annotation dictionary
Returns:
dict: Dictionary of loci with overlapping gene models
"""
def sub_cluster(obj):
"""
Perform sub-clustering within loci for complex overlaps.
Parameters:
- obj (InterLap): InterLap object with clustered intervals
Returns:
list: List of refined sub-clusters
"""
def contained(a, b):
"""
Check if interval a is contained within interval b.
Parameters:
- a (tuple): Interval coordinates (start, end)
- b (tuple): Interval coordinates (start, end)
Returns:
bool: True if a is contained in b
"""
def get_overlap(a, b):
"""
Calculate overlap between two genomic intervals.
Parameters:
- a (tuple): First interval (start, end)
- b (tuple): Second interval (start, end)
Returns:
int: Number of overlapping base pairs
"""Advanced UTR identification and extension algorithms.
def check_intron_compatibility(model_coords, transcript_coords, strand):
"""
Check if model introns are compatible with transcript evidence.
Parameters:
- model_coords (list): Model exon coordinates
- transcript_coords (list): Transcript exon coordinates
- strand (str): Strand orientation ("+" or "-")
Returns:
bool: True if introns are compatible
"""
def select_best_utrs(utr_exons_list, strand, min_length=10, max_length=2000):
"""
Select optimal UTR exons from multiple evidence sources.
Parameters:
- utr_exons_list (list): List of UTR exon coordinate sets
- strand (str): Strand orientation ("+" or "-")
- min_length (int): Minimum UTR length
- max_length (int): Maximum UTR length
Returns:
list: Selected UTR exon coordinates
"""
def extend_utrs(gene_model, transcript_evidence, strand, min_length=10, max_length=2000):
"""
Extend gene model UTRs using transcript evidence.
Parameters:
- gene_model (dict): Gene model data structure
- transcript_evidence (list): List of transcript alignments
- strand (str): Strand orientation ("+" or "-")
- min_length (int): Minimum UTR extension length
- max_length (int): Maximum UTR extension length
Returns:
dict: Gene model with extended UTRs
"""Filter gene models based on repeat content and structural quality.
def filter_models_repeats(fasta, repeats, gene_models, filter_threshold=90, log=False):
"""
Filter gene models overlapping repetitive regions.
Parameters:
- fasta (str): Path to genome FASTA file
- repeats (str): Path to repeat annotations (BED format)
- gene_models (dict): Gene model dictionary
- filter_threshold (int): Percent overlap threshold for filtering
- log (function|bool): Logging function or False
Returns:
dict: Filtered gene models
"""
def reasonable_model(coords, min_protein=30, min_exon=3, min_intron=10, max_intron=-1, max_exon=-1):
"""
Validate gene model structure against biological constraints.
Parameters:
- coords (dict): Gene coordinates {'CDS': [...], 'protein': [...]}
- min_protein (int): Minimum protein length (amino acids)
- min_exon (int): Minimum exon length (nucleotides)
- min_intron (int): Minimum intron length (nucleotides)
- max_intron (int): Maximum intron length (-1 for no limit)
- max_exon (int): Maximum exon length (-1 for no limit)
Returns:
bool: True if model passes structural validation
"""Load and integrate protein and transcript evidence into consensus prediction.
def gffevidence2dict(file, Evi):
"""
Parse evidence alignments from GFF3 into evidence dictionary.
Parameters:
- file (str): Path to evidence GFF3 file
- Evi (dict): Existing evidence dictionary to update
Returns:
dict: Updated evidence dictionary
"""
def add_evidence(loci, evidence, source="proteins"):
"""
Add evidence data to gene model loci.
Parameters:
- loci (dict): Dictionary of gene model loci
- evidence (dict): Evidence alignment dictionary
- source (str): Evidence type ("proteins" or "transcripts")
Returns:
dict: Loci with integrated evidence information
"""
def parse_data(genome, gene, protein, transcript, log=sys.stderr.write):
"""
Parse all input data files for consensus prediction.
Parameters:
- genome (str): Path to genome FASTA file
- gene (list): List of gene model file paths
- protein (list): List of protein alignment file paths
- transcript (list): List of transcript alignment file paths
- log (function): Logging function
Returns:
tuple: (genome_dict, gene_dict, evidence_dict)
"""
def filter4evidence(data, n_genes=3, n_evidence=2):
"""
Filter loci requiring minimum gene and evidence counts.
Parameters:
- data (dict): Loci data dictionary
- n_genes (int): Minimum number of gene models required
- n_evidence (int): Minimum number of evidence alignments required
Returns:
dict: Filtered loci meeting evidence requirements
"""Advanced scoring functions for consensus model selection.
def score_aggregator(locus, weights, evidence_weights, min_protein=30):
"""
Aggregate multiple scoring metrics for final model ranking.
Parameters:
- locus (list): List of gene models at locus
- weights (dict): User-supplied source weights
- evidence_weights (dict): Evidence-derived weights
- min_protein (int): Minimum protein length requirement
Returns:
list: Ranked gene models with aggregated scores
"""
def cluster_by_aed(locus, score=0.005):
"""
Cluster highly similar models using AED scores.
Parameters:
- locus (list): List of gene models
- score (float): AED threshold for clustering
Returns:
list: Clustered gene models
"""
def map_coords(g_coords, e_coords):
"""
Map gene coordinates to evidence coordinates.
Parameters:
- g_coords (list): Gene model coordinates
- e_coords (list): Evidence alignment coordinates
Returns:
dict: Coordinate mapping information
"""
def score_evidence(g_coords, e_coords, weight=2):
"""
Score gene model based on evidence overlap.
Parameters:
- g_coords (list): Gene model coordinates
- e_coords (list): Evidence coordinates
- weight (float): Evidence weight factor
Returns:
float: Evidence-based score
"""Calculate source weights and rankings based on evidence support.
def calculate_source_order(data):
"""
Calculate source ranking based on evidence support.
Parameters:
- data (dict): Gene model and evidence data
Returns:
dict: Source rankings and weights
"""
def order_sources(locus):
"""
Order prediction sources by quality metrics.
Parameters:
- locus (list): List of gene models at locus
Returns:
dict: Ordered source rankings
"""
def src_scaling_factor(obj):
"""
Calculate scaling factors for different prediction sources.
Parameters:
- obj (dict): Source performance data
Returns:
dict: Source-specific scaling factors
"""
def de_novo_distance(locus):
"""
Calculate distance metrics for de novo source weights.
Parameters:
- locus (list): List of gene models
Returns:
dict: Distance-based weight metrics
"""
def calculate_gene_distance(locus):
"""
Calculate structural distance between gene models.
Parameters:
- locus (list): List of gene models at locus
Returns:
dict: Inter-model distance metrics
"""Configure consensus prediction algorithm parameters.
def auto_score_threshold(weights, order, user_weight=6):
"""
Automatically determine optimal score threshold.
Parameters:
- weights (dict): User-supplied weights
- order (dict): Source order rankings
- user_weight (int): Weight for user preferences
Returns:
float: Calculated score threshold
"""
def ensure_unique_names(genes):
"""
Ensure all gene models have unique identifiers.
Parameters:
- genes (dict): Gene model dictionary
Returns:
dict: Gene models with unique names
"""
def fasta_length(fasta):
"""
Get sequence lengths from FASTA file.
Parameters:
- fasta (str): Path to FASTA file
Returns:
dict: Dictionary mapping sequence names to lengths
"""Refine consensus predictions and generate output files.
def refine_cluster(locus, derived=[]):
"""
Refine gene model cluster using derived weights.
Parameters:
- locus (list): List of gene models
- derived (list): Derived weight parameters
Returns:
list: Refined gene model cluster
"""
def solve_sub_loci(result):
"""
Resolve complex multi-gene loci into individual predictions.
Parameters:
- result (dict): Clustered locus data
Returns:
dict: Resolved individual gene predictions
"""
def gff_writer(input, output):
"""
Write consensus predictions to GFF3 format.
Parameters:
- input (dict): Consensus gene predictions
- output (str): Output GFF3 file path
Returns:
None
"""Command-line interface for consensus prediction.
def consensus(args):
"""
Main consensus prediction CLI entry point.
Parameters:
- args (argparse.Namespace): Command line arguments
Returns:
None
"""
- max_exon (int): Maximum exon length (-1 for no limit)
- evidence_derived_models (list): Evidence-derived model sources
Returns:
dict: Best gene model with score information
"""Filter gene models that significantly overlap with repetitive elements.
def filter_models_repeats(fasta, repeats, gene_models, filter_threshold=90, log=False):
"""
Filter gene models overlapping with repetitive elements.
Parameters:
- fasta (str): Path to genome FASTA file
- repeats (str): Path to repeat annotations (BED or GFF3)
- gene_models (dict): Dictionary of gene models to filter
- filter_threshold (int): Percent overlap threshold for filtering
- log (function|bool): Logging function or boolean
Returns:
dict: Filtered gene models with repeats removed
"""Validate gene model structure against biological constraints.
def reasonable_model(coords, min_protein=30, min_exon=3, min_intron=10, max_intron=-1, max_exon=-1):
"""
Validate gene model structure against biological constraints.
Parameters:
- coords (dict): Gene model coordinate information
- min_protein (int): Minimum protein length in amino acids
- min_exon (int): Minimum exon length in nucleotides
- min_intron (int): Minimum intron length in nucleotides
- max_intron (int): Maximum intron length (-1 for no limit)
- max_exon (int): Maximum exon length (-1 for no limit)
Returns:
bool: True if model passes validation, False otherwise
"""from gfftk.consensus import generate_consensus
# Define input files
gene_predictions = ["augustus.gff3", "genemark.gff3", "snap.gff3"]
protein_alignments = ["uniprot.gff3", "pfam.gff3"]
transcript_alignments = ["rnaseq.gff3"]
# User-defined source weights
weights = {
"augustus": 3,
"genemark": 2,
"snap": 1
}
# Generate consensus
generate_consensus(
fasta="genome.fasta",
genes=gene_predictions,
proteins=protein_alignments,
transcripts=transcript_alignments,
weights=weights,
output="consensus.gff3",
num_processes=4,
min_protein=50,
repeat_overlap=80
)from gfftk.consensus import getAED
from gfftk.gff import gff2dict
# Load two annotations for comparison
reference = gff2dict("reference.gff3", "genome.fasta")
query = gff2dict("prediction.gff3", "genome.fasta")
# Calculate AED for overlapping genes
for gene_id in reference:
if gene_id in query:
aed = getAED(query[gene_id], reference[gene_id])
print(f"Gene {gene_id}: AED = {aed:.3f}")from gfftk.consensus import score_by_evidence, best_model_default
from gfftk.gff import gff2dict
# Load gene models and evidence
genes = gff2dict("predictions.gff3", "genome.fasta")
proteins = gff2dict("protein_alignments.gff3", "genome.fasta")
# Group genes by locus (simplified example)
loci = {}
for gene_id, gene_data in genes.items():
locus_key = f"{gene_data['contig']}:{gene_data['location'][0]}-{gene_data['location'][1]}"
if locus_key not in loci:
loci[locus_key] = []
loci[locus_key].append({gene_id: gene_data})
# Score each locus
weights = {"augustus": 2, "genemark": 1}
consensus_models = {}
for locus_key, locus_models in loci.items():
# Score models by evidence
scored = score_by_evidence(locus_models, weights)
# Select best model
best = best_model_default(scored, weights, [], min_protein=30)
if best:
consensus_models.update(best)
print(f"Selected {len(consensus_models)} consensus gene models")from gfftk.consensus import filter_models_repeats
from gfftk.gff import gff2dict
# Load gene models
models = gff2dict("gene_models.gff3", "genome.fasta")
# Filter models overlapping repeats
filtered_models = filter_models_repeats(
fasta="genome.fasta",
repeats="repeats.bed",
gene_models=models,
filter_threshold=75, # Remove models with >75% repeat overlap
log=print
)
print(f"Retained {len(filtered_models)}/{len(models)} models after repeat filtering")from gfftk.consensus import reasonable_model
from gfftk.gff import gff2dict
# Load gene models
models = gff2dict("gene_models.gff3", "genome.fasta")
# Validate each model
valid_models = {}
for gene_id, gene_data in models.items():
coords = {
'CDS': gene_data['CDS'],
'protein': gene_data['protein'],
'mRNA': gene_data['mRNA']
}
# Apply structural constraints
if reasonable_model(
coords,
min_protein=50, # At least 50 amino acids
min_exon=15, # At least 15 bp exons
min_intron=20, # At least 20 bp introns
max_intron=50000 # Max 50kb introns
):
valid_models[gene_id] = gene_data
print(f"Validated {len(valid_models)}/{len(models)} gene models")# Gene model locus structure
Locus = list[dict] # List of gene models at same genomic location
# Source weights dictionary
SourceWeights = dict[str, float] # {source_name: weight_value}
# Evidence overlap metrics
EvidenceMetrics = {
"protein_overlap": float, # Percent overlap with protein evidence
"transcript_overlap": float, # Percent overlap with transcript evidence
"aed_score": float, # Annotation Edit Distance
"combined_score": float # Final combined score
}
# Scored gene model
ScoredModel = {
"gene_data": dict, # Original gene annotation data
"score": float, # Final score
"evidence": EvidenceMetrics # Evidence support metrics
}
# Consensus parameters
ConsensusParams = {
"min_protein": int, # Minimum protein length
"min_exon": int, # Minimum exon length
"max_exon": int, # Maximum exon length
"min_intron": int, # Minimum intron length
"max_intron": int, # Maximum intron length
"repeat_overlap": int, # Repeat overlap threshold
"minscore": int, # Minimum model score
"num_processes": int # Number of parallel processes
}
# AED score
AEDScore = float # 0.0 (perfect) to 1.0 (no overlap)
# Genomic coordinates
GenomicCoordinates = list[tuple[int, int]]
# Repeat annotation format
RepeatAnnotation = {
"contig": str,
"start": int,
"end": int,
"type": str
}Install with Tessl CLI
npx tessl i tessl/pypi-gfftk