Comprehensive Python toolkit for working with genome annotation files in GFF3, GTF, and TBL formats with format conversion and analysis capabilities
Compare two genome annotations to identify differences, calculate similarity metrics, and generate detailed comparison reports with feature-level analysis and overlap statistics.
Compare two complete GFF3 annotations and generate comprehensive statistics.
def compareAnnotations(old, new, fasta, output=False):
"""
Compare two GFF3 annotations and generate comparison statistics.
Parameters:
- old (str): Path to reference/old annotation GFF3 file
- new (str): Path to query/new annotation GFF3 file
- fasta (str): Path to genome FASTA file
- output (str|bool): Output file for detailed results, or False for return only
Returns:
list: Comparison statistics and metrics
"""Calculate pairwise Annotation Edit Distance scores between gene model sets.
def pairwiseAED(query, reference):
"""
Calculate pairwise AED scores between query and reference gene sets.
Parameters:
- query (list): List of query gene models
- reference (list): List of reference gene models
Returns:
str: Formatted string with AED statistics
"""Convert GFF3 annotations to InterLap data structure for efficient overlap analysis.
def gff2interlap(input, fasta):
"""
Convert GFF3 annotation to InterLap structure for overlap queries.
Parameters:
- input (str): Path to GFF3 file
- fasta (str): Path to genome FASTA file
Returns:
tuple: (interlap_object, annotation_dict) pair for overlap analysis
"""Count different types of features in annotation dictionaries.
def countFeatures(input):
"""
Count features in annotation dictionary.
Parameters:
- input (dict): Annotation dictionary to analyze
Returns:
tuple: Feature counts (genes, transcripts, exons, cds_features)
"""from gfftk.compare import compareAnnotations
# Compare two GFF3 annotations
comparison_results = compareAnnotations(
old="reference_v1.gff3",
new="reference_v2.gff3",
fasta="genome.fasta",
output="comparison_report.txt"
)
print("Comparison Results:")
for result in comparison_results:
print(result)from gfftk.compare import countFeatures, gff2interlap
from gfftk.gff import gff2dict
# Load annotations
ref_annotation = gff2dict("reference.gff3", "genome.fasta")
query_annotation = gff2dict("query.gff3", "genome.fasta")
# Count features in each annotation
ref_counts = countFeatures(ref_annotation)
query_counts = countFeatures(query_annotation)
print(f"Reference: {ref_counts[0]} genes, {ref_counts[1]} transcripts")
print(f"Query: {query_counts[0]} genes, {query_counts[1]} transcripts")
# Convert to InterLap for overlap analysis
ref_interlap, ref_genes = gff2interlap("reference.gff3", "genome.fasta")
query_interlap, query_genes = gff2interlap("query.gff3", "genome.fasta")from gfftk.compare import pairwiseAED
from gfftk.gff import gff2dict
# Load annotations
reference = gff2dict("reference.gff3", "genome.fasta")
prediction = gff2dict("prediction.gff3", "genome.fasta")
# Convert to lists for pairwise comparison
ref_list = [gene_data for gene_data in reference.values()]
pred_list = [gene_data for gene_data in prediction.values()]
# Calculate pairwise AED statistics
aed_results = pairwiseAED(pred_list, ref_list)
print(f"AED Analysis Results:\n{aed_results}")from gfftk.compare import compareAnnotations, countFeatures, gff2interlap
from gfftk.gff import gff2dict
from gfftk.consensus import getAED
# Input files
reference_gff = "reference.gff3"
query_gff = "prediction.gff3"
genome_fasta = "genome.fasta"
# 1. Load annotations
print("Loading annotations...")
ref_dict = gff2dict(reference_gff, genome_fasta)
query_dict = gff2dict(query_gff, genome_fasta)
# 2. Basic feature counting
ref_counts = countFeatures(ref_dict)
query_counts = countFeatures(query_dict)
print(f"Reference annotation: {ref_counts}")
print(f"Query annotation: {query_counts}")
# 3. Overall comparison
print("Performing comprehensive comparison...")
comparison = compareAnnotations(reference_gff, query_gff, genome_fasta)
# 4. Gene-level AED analysis
print("Calculating gene-level AED scores...")
aed_scores = {}
common_genes = set(ref_dict.keys()) & set(query_dict.keys())
for gene_id in common_genes:
aed = getAED(query_dict[gene_id], ref_dict[gene_id])
aed_scores[gene_id] = aed
# Summary statistics
avg_aed = sum(aed_scores.values()) / len(aed_scores) if aed_scores else 0
perfect_matches = sum(1 for aed in aed_scores.values() if aed == 0.0)
good_matches = sum(1 for aed in aed_scores.values() if aed <= 0.1)
print(f"Average AED: {avg_aed:.3f}")
print(f"Perfect matches: {perfect_matches}/{len(aed_scores)}")
print(f"Good matches (AED ≤ 0.1): {good_matches}/{len(aed_scores)}")
# 5. InterLap-based overlap analysis
print("Setting up overlap analysis...")
ref_interlap, ref_genes = gff2interlap(reference_gff, genome_fasta)
query_interlap, query_genes = gff2interlap(query_gff, genome_fasta)from gfftk.compare import compareAnnotations
import os
def batch_compare_annotations(reference_gff, query_directory, genome_fasta, output_dir):
"""Compare reference against multiple query annotations."""
results = {}
for filename in os.listdir(query_directory):
if filename.endswith('.gff3'):
query_path = os.path.join(query_directory, filename)
output_path = os.path.join(output_dir, f"{filename}_comparison.txt")
print(f"Comparing {filename}...")
comparison = compareAnnotations(
old=reference_gff,
new=query_path,
fasta=genome_fasta,
output=output_path
)
results[filename] = comparison
return results
# Example usage
results = batch_compare_annotations(
reference_gff="gold_standard.gff3",
query_directory="predictions/",
genome_fasta="genome.fasta",
output_dir="comparison_results/"
)# Comparison result structure
ComparisonResult = {
"reference_stats": dict, # Reference annotation statistics
"query_stats": dict, # Query annotation statistics
"overlap_stats": dict, # Overlap analysis results
"aed_distribution": dict, # AED score distribution
"feature_differences": dict # Detailed feature differences
}
# Feature count tuple
FeatureCounts = tuple[int, int, int, int] # (genes, transcripts, exons, cds_features)
# InterLap analysis tuple
InterLapData = tuple[object, dict] # (interlap_object, gene_dictionary)
# AED statistics string format
AEDStats = str # Formatted string with statistical summary
# Overlap query result
OverlapResult = {
"query_gene": str, # Query gene identifier
"reference_matches": list, # List of overlapping reference genes
"overlap_percent": float, # Percent overlap
"aed_score": float # AED score with best match
}
# Batch comparison results
BatchResults = dict[str, ComparisonResult] # {filename: comparison_result}Install with Tessl CLI
npx tessl i tessl/pypi-gfftk