0
# Annotation Comparison
1
2
Compare two genome annotations to identify differences, calculate similarity metrics, and generate detailed comparison reports with feature-level analysis and overlap statistics.
3
4
## Capabilities
5
6
### Annotation Comparison
7
8
Compare two complete GFF3 annotations and generate comprehensive statistics.
9
10
```python { .api }
11
def compareAnnotations(old, new, fasta, output=False):
12
"""
13
Compare two GFF3 annotations and generate comparison statistics.
14
15
Parameters:
16
- old (str): Path to reference/old annotation GFF3 file
17
- new (str): Path to query/new annotation GFF3 file
18
- fasta (str): Path to genome FASTA file
19
- output (str|bool): Output file for detailed results, or False for return only
20
21
Returns:
22
list: Comparison statistics and metrics
23
"""
24
```
25
26
### Pairwise AED Calculation
27
28
Calculate pairwise Annotation Edit Distance scores between gene model sets.
29
30
```python { .api }
31
def pairwiseAED(query, reference):
32
"""
33
Calculate pairwise AED scores between query and reference gene sets.
34
35
Parameters:
36
- query (list): List of query gene models
37
- reference (list): List of reference gene models
38
39
Returns:
40
str: Formatted string with AED statistics
41
"""
42
```
43
44
### InterLap Conversion
45
46
Convert GFF3 annotations to InterLap data structure for efficient overlap analysis.
47
48
```python { .api }
49
def gff2interlap(input, fasta):
50
"""
51
Convert GFF3 annotation to InterLap structure for overlap queries.
52
53
Parameters:
54
- input (str): Path to GFF3 file
55
- fasta (str): Path to genome FASTA file
56
57
Returns:
58
tuple: (interlap_object, annotation_dict) pair for overlap analysis
59
"""
60
```
61
62
### Feature Counting
63
64
Count different types of features in annotation dictionaries.
65
66
```python { .api }
67
def countFeatures(input):
68
"""
69
Count features in annotation dictionary.
70
71
Parameters:
72
- input (dict): Annotation dictionary to analyze
73
74
Returns:
75
tuple: Feature counts (genes, transcripts, exons, cds_features)
76
"""
77
```
78
79
## Usage Examples
80
81
### Basic Annotation Comparison
82
83
```python
84
from gfftk.compare import compareAnnotations
85
86
# Compare two GFF3 annotations
87
comparison_results = compareAnnotations(
88
old="reference_v1.gff3",
89
new="reference_v2.gff3",
90
fasta="genome.fasta",
91
output="comparison_report.txt"
92
)
93
94
print("Comparison Results:")
95
for result in comparison_results:
96
print(result)
97
```
98
99
### Detailed Feature Analysis
100
101
```python
102
from gfftk.compare import countFeatures, gff2interlap
103
from gfftk.gff import gff2dict
104
105
# Load annotations
106
ref_annotation = gff2dict("reference.gff3", "genome.fasta")
107
query_annotation = gff2dict("query.gff3", "genome.fasta")
108
109
# Count features in each annotation
110
ref_counts = countFeatures(ref_annotation)
111
query_counts = countFeatures(query_annotation)
112
113
print(f"Reference: {ref_counts[0]} genes, {ref_counts[1]} transcripts")
114
print(f"Query: {query_counts[0]} genes, {query_counts[1]} transcripts")
115
116
# Convert to InterLap for overlap analysis
117
ref_interlap, ref_genes = gff2interlap("reference.gff3", "genome.fasta")
118
query_interlap, query_genes = gff2interlap("query.gff3", "genome.fasta")
119
```
120
121
### AED-Based Comparison
122
123
```python
124
from gfftk.compare import pairwiseAED
125
from gfftk.gff import gff2dict
126
127
# Load annotations
128
reference = gff2dict("reference.gff3", "genome.fasta")
129
prediction = gff2dict("prediction.gff3", "genome.fasta")
130
131
# Convert to lists for pairwise comparison
132
ref_list = [gene_data for gene_data in reference.values()]
133
pred_list = [gene_data for gene_data in prediction.values()]
134
135
# Calculate pairwise AED statistics
136
aed_results = pairwiseAED(pred_list, ref_list)
137
print(f"AED Analysis Results:\n{aed_results}")
138
```
139
140
### Comprehensive Pipeline
141
142
```python
143
from gfftk.compare import compareAnnotations, countFeatures, gff2interlap
144
from gfftk.gff import gff2dict
145
from gfftk.consensus import getAED
146
147
# Input files
148
reference_gff = "reference.gff3"
149
query_gff = "prediction.gff3"
150
genome_fasta = "genome.fasta"
151
152
# 1. Load annotations
153
print("Loading annotations...")
154
ref_dict = gff2dict(reference_gff, genome_fasta)
155
query_dict = gff2dict(query_gff, genome_fasta)
156
157
# 2. Basic feature counting
158
ref_counts = countFeatures(ref_dict)
159
query_counts = countFeatures(query_dict)
160
161
print(f"Reference annotation: {ref_counts}")
162
print(f"Query annotation: {query_counts}")
163
164
# 3. Overall comparison
165
print("Performing comprehensive comparison...")
166
comparison = compareAnnotations(reference_gff, query_gff, genome_fasta)
167
168
# 4. Gene-level AED analysis
169
print("Calculating gene-level AED scores...")
170
aed_scores = {}
171
common_genes = set(ref_dict.keys()) & set(query_dict.keys())
172
173
for gene_id in common_genes:
174
aed = getAED(query_dict[gene_id], ref_dict[gene_id])
175
aed_scores[gene_id] = aed
176
177
# Summary statistics
178
avg_aed = sum(aed_scores.values()) / len(aed_scores) if aed_scores else 0
179
perfect_matches = sum(1 for aed in aed_scores.values() if aed == 0.0)
180
good_matches = sum(1 for aed in aed_scores.values() if aed <= 0.1)
181
182
print(f"Average AED: {avg_aed:.3f}")
183
print(f"Perfect matches: {perfect_matches}/{len(aed_scores)}")
184
print(f"Good matches (AED ≤ 0.1): {good_matches}/{len(aed_scores)}")
185
186
# 5. InterLap-based overlap analysis
187
print("Setting up overlap analysis...")
188
ref_interlap, ref_genes = gff2interlap(reference_gff, genome_fasta)
189
query_interlap, query_genes = gff2interlap(query_gff, genome_fasta)
190
```
191
192
### Batch Comparison
193
194
```python
195
from gfftk.compare import compareAnnotations
196
import os
197
198
def batch_compare_annotations(reference_gff, query_directory, genome_fasta, output_dir):
199
"""Compare reference against multiple query annotations."""
200
201
results = {}
202
203
for filename in os.listdir(query_directory):
204
if filename.endswith('.gff3'):
205
query_path = os.path.join(query_directory, filename)
206
output_path = os.path.join(output_dir, f"{filename}_comparison.txt")
207
208
print(f"Comparing {filename}...")
209
210
comparison = compareAnnotations(
211
old=reference_gff,
212
new=query_path,
213
fasta=genome_fasta,
214
output=output_path
215
)
216
217
results[filename] = comparison
218
219
return results
220
221
# Example usage
222
results = batch_compare_annotations(
223
reference_gff="gold_standard.gff3",
224
query_directory="predictions/",
225
genome_fasta="genome.fasta",
226
output_dir="comparison_results/"
227
)
228
```
229
230
## Types
231
232
```python { .api }
233
# Comparison result structure
234
ComparisonResult = {
235
"reference_stats": dict, # Reference annotation statistics
236
"query_stats": dict, # Query annotation statistics
237
"overlap_stats": dict, # Overlap analysis results
238
"aed_distribution": dict, # AED score distribution
239
"feature_differences": dict # Detailed feature differences
240
}
241
242
# Feature count tuple
243
FeatureCounts = tuple[int, int, int, int] # (genes, transcripts, exons, cds_features)
244
245
# InterLap analysis tuple
246
InterLapData = tuple[object, dict] # (interlap_object, gene_dictionary)
247
248
# AED statistics string format
249
AEDStats = str # Formatted string with statistical summary
250
251
# Overlap query result
252
OverlapResult = {
253
"query_gene": str, # Query gene identifier
254
"reference_matches": list, # List of overlapping reference genes
255
"overlap_percent": float, # Percent overlap
256
"aed_score": float # AED score with best match
257
}
258
259
# Batch comparison results
260
BatchResults = dict[str, ComparisonResult] # {filename: comparison_result}
261
```