or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cli-commands.mdcomparison.mdconsensus.mdformat-conversion.mdgenbank-tbl.mdgff-processing.mdindex.mdsequence-operations.mdutilities.md

comparison.mddocs/

0

# Annotation Comparison

1

2

Compare two genome annotations to identify differences, calculate similarity metrics, and generate detailed comparison reports with feature-level analysis and overlap statistics.

3

4

## Capabilities

5

6

### Annotation Comparison

7

8

Compare two complete GFF3 annotations and generate comprehensive statistics.

9

10

```python { .api }

11

def compareAnnotations(old, new, fasta, output=False):

12

"""

13

Compare two GFF3 annotations and generate comparison statistics.

14

15

Parameters:

16

- old (str): Path to reference/old annotation GFF3 file

17

- new (str): Path to query/new annotation GFF3 file

18

- fasta (str): Path to genome FASTA file

19

- output (str|bool): Output file for detailed results, or False for return only

20

21

Returns:

22

list: Comparison statistics and metrics

23

"""

24

```

25

26

### Pairwise AED Calculation

27

28

Calculate pairwise Annotation Edit Distance scores between gene model sets.

29

30

```python { .api }

31

def pairwiseAED(query, reference):

32

"""

33

Calculate pairwise AED scores between query and reference gene sets.

34

35

Parameters:

36

- query (list): List of query gene models

37

- reference (list): List of reference gene models

38

39

Returns:

40

str: Formatted string with AED statistics

41

"""

42

```

43

44

### InterLap Conversion

45

46

Convert GFF3 annotations to InterLap data structure for efficient overlap analysis.

47

48

```python { .api }

49

def gff2interlap(input, fasta):

50

"""

51

Convert GFF3 annotation to InterLap structure for overlap queries.

52

53

Parameters:

54

- input (str): Path to GFF3 file

55

- fasta (str): Path to genome FASTA file

56

57

Returns:

58

tuple: (interlap_object, annotation_dict) pair for overlap analysis

59

"""

60

```

61

62

### Feature Counting

63

64

Count different types of features in annotation dictionaries.

65

66

```python { .api }

67

def countFeatures(input):

68

"""

69

Count features in annotation dictionary.

70

71

Parameters:

72

- input (dict): Annotation dictionary to analyze

73

74

Returns:

75

tuple: Feature counts (genes, transcripts, exons, cds_features)

76

"""

77

```

78

79

## Usage Examples

80

81

### Basic Annotation Comparison

82

83

```python

84

from gfftk.compare import compareAnnotations

85

86

# Compare two GFF3 annotations

87

comparison_results = compareAnnotations(

88

old="reference_v1.gff3",

89

new="reference_v2.gff3",

90

fasta="genome.fasta",

91

output="comparison_report.txt"

92

)

93

94

print("Comparison Results:")

95

for result in comparison_results:

96

print(result)

97

```

98

99

### Detailed Feature Analysis

100

101

```python

102

from gfftk.compare import countFeatures, gff2interlap

103

from gfftk.gff import gff2dict

104

105

# Load annotations

106

ref_annotation = gff2dict("reference.gff3", "genome.fasta")

107

query_annotation = gff2dict("query.gff3", "genome.fasta")

108

109

# Count features in each annotation

110

ref_counts = countFeatures(ref_annotation)

111

query_counts = countFeatures(query_annotation)

112

113

print(f"Reference: {ref_counts[0]} genes, {ref_counts[1]} transcripts")

114

print(f"Query: {query_counts[0]} genes, {query_counts[1]} transcripts")

115

116

# Convert to InterLap for overlap analysis

117

ref_interlap, ref_genes = gff2interlap("reference.gff3", "genome.fasta")

118

query_interlap, query_genes = gff2interlap("query.gff3", "genome.fasta")

119

```

120

121

### AED-Based Comparison

122

123

```python

124

from gfftk.compare import pairwiseAED

125

from gfftk.gff import gff2dict

126

127

# Load annotations

128

reference = gff2dict("reference.gff3", "genome.fasta")

129

prediction = gff2dict("prediction.gff3", "genome.fasta")

130

131

# Convert to lists for pairwise comparison

132

ref_list = [gene_data for gene_data in reference.values()]

133

pred_list = [gene_data for gene_data in prediction.values()]

134

135

# Calculate pairwise AED statistics

136

aed_results = pairwiseAED(pred_list, ref_list)

137

print(f"AED Analysis Results:\n{aed_results}")

138

```

139

140

### Comprehensive Pipeline

141

142

```python

143

from gfftk.compare import compareAnnotations, countFeatures, gff2interlap

144

from gfftk.gff import gff2dict

145

from gfftk.consensus import getAED

146

147

# Input files

148

reference_gff = "reference.gff3"

149

query_gff = "prediction.gff3"

150

genome_fasta = "genome.fasta"

151

152

# 1. Load annotations

153

print("Loading annotations...")

154

ref_dict = gff2dict(reference_gff, genome_fasta)

155

query_dict = gff2dict(query_gff, genome_fasta)

156

157

# 2. Basic feature counting

158

ref_counts = countFeatures(ref_dict)

159

query_counts = countFeatures(query_dict)

160

161

print(f"Reference annotation: {ref_counts}")

162

print(f"Query annotation: {query_counts}")

163

164

# 3. Overall comparison

165

print("Performing comprehensive comparison...")

166

comparison = compareAnnotations(reference_gff, query_gff, genome_fasta)

167

168

# 4. Gene-level AED analysis

169

print("Calculating gene-level AED scores...")

170

aed_scores = {}

171

common_genes = set(ref_dict.keys()) & set(query_dict.keys())

172

173

for gene_id in common_genes:

174

aed = getAED(query_dict[gene_id], ref_dict[gene_id])

175

aed_scores[gene_id] = aed

176

177

# Summary statistics

178

avg_aed = sum(aed_scores.values()) / len(aed_scores) if aed_scores else 0

179

perfect_matches = sum(1 for aed in aed_scores.values() if aed == 0.0)

180

good_matches = sum(1 for aed in aed_scores.values() if aed <= 0.1)

181

182

print(f"Average AED: {avg_aed:.3f}")

183

print(f"Perfect matches: {perfect_matches}/{len(aed_scores)}")

184

print(f"Good matches (AED ≤ 0.1): {good_matches}/{len(aed_scores)}")

185

186

# 5. InterLap-based overlap analysis

187

print("Setting up overlap analysis...")

188

ref_interlap, ref_genes = gff2interlap(reference_gff, genome_fasta)

189

query_interlap, query_genes = gff2interlap(query_gff, genome_fasta)

190

```

191

192

### Batch Comparison

193

194

```python

195

from gfftk.compare import compareAnnotations

196

import os

197

198

def batch_compare_annotations(reference_gff, query_directory, genome_fasta, output_dir):

199

"""Compare reference against multiple query annotations."""

200

201

results = {}

202

203

for filename in os.listdir(query_directory):

204

if filename.endswith('.gff3'):

205

query_path = os.path.join(query_directory, filename)

206

output_path = os.path.join(output_dir, f"{filename}_comparison.txt")

207

208

print(f"Comparing {filename}...")

209

210

comparison = compareAnnotations(

211

old=reference_gff,

212

new=query_path,

213

fasta=genome_fasta,

214

output=output_path

215

)

216

217

results[filename] = comparison

218

219

return results

220

221

# Example usage

222

results = batch_compare_annotations(

223

reference_gff="gold_standard.gff3",

224

query_directory="predictions/",

225

genome_fasta="genome.fasta",

226

output_dir="comparison_results/"

227

)

228

```

229

230

## Types

231

232

```python { .api }

233

# Comparison result structure

234

ComparisonResult = {

235

"reference_stats": dict, # Reference annotation statistics

236

"query_stats": dict, # Query annotation statistics

237

"overlap_stats": dict, # Overlap analysis results

238

"aed_distribution": dict, # AED score distribution

239

"feature_differences": dict # Detailed feature differences

240

}

241

242

# Feature count tuple

243

FeatureCounts = tuple[int, int, int, int] # (genes, transcripts, exons, cds_features)

244

245

# InterLap analysis tuple

246

InterLapData = tuple[object, dict] # (interlap_object, gene_dictionary)

247

248

# AED statistics string format

249

AEDStats = str # Formatted string with statistical summary

250

251

# Overlap query result

252

OverlapResult = {

253

"query_gene": str, # Query gene identifier

254

"reference_matches": list, # List of overlapping reference genes

255

"overlap_percent": float, # Percent overlap

256

"aed_score": float # AED score with best match

257

}

258

259

# Batch comparison results

260

BatchResults = dict[str, ComparisonResult] # {filename: comparison_result}

261

```