or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

algorithms.mddata-loading.mddata-types.mdfile-formats.mdindex.mdtransformations.md

algorithms.mddocs/

0

# Genomic Algorithms

1

2

Bioinformatics algorithms including consensus calling, sequence alignment, and variant normalization optimized for distributed processing. ADAM Core provides scalable implementations of standard genomic algorithms while maintaining compatibility with established tools like GATK and Picard.

3

4

## Capabilities

5

6

### Consensus Generation

7

8

Algorithms for generating consensus sequences from aligned reads or incorporating known variants.

9

10

```scala { .api }

11

/**

12

* Base trait for consensus sequence generation algorithms

13

* Provides interface for generating consensus from genomic data

14

*/

15

trait ConsensusGenerator {

16

/**

17

* Generate consensus sequence from aligned reads

18

* @param reads - Iterable collection of alignment records

19

* @return Consensus object containing consensus sequence and quality information

20

*/

21

def findConsensus(reads: Iterable[AlignmentRecord]): Consensus

22

}

23

24

/**

25

* Generate consensus sequences from aligned reads using coverage-based calling

26

* Implements majority-rule consensus with configurable thresholds

27

*/

28

class ConsensusGeneratorFromReads(

29

consensusFrequencyThreshold: Double = 0.4,

30

minReadsAtPosition: Int = 5

31

) extends ConsensusGenerator {

32

33

def findConsensus(reads: Iterable[AlignmentRecord]): Consensus

34

}

35

36

/**

37

* Generate consensus incorporating known variant sites

38

* Uses known variants to guide consensus calling in ambiguous regions

39

*/

40

class ConsensusGeneratorFromKnowns(

41

variants: RDD[Variant],

42

minAlleleFraction: Double = 0.25

43

) extends ConsensusGenerator {

44

45

def findConsensus(reads: Iterable[AlignmentRecord]): Consensus

46

}

47

48

/**

49

* Union consensus generator combining multiple consensus strategies

50

* Merges results from multiple consensus generators with conflict resolution

51

*/

52

class UnionConsensusGenerator(

53

generators: List[ConsensusGenerator]

54

) extends ConsensusGenerator {

55

56

def findConsensus(reads: Iterable[AlignmentRecord]): Consensus

57

}

58

```

59

60

**Usage Examples:**

61

62

```scala

63

import org.bdgenomics.adam.algorithms.consensus._

64

import org.bdgenomics.adam.rdd.ADAMContext._

65

66

// Basic consensus from reads

67

val generator = new ConsensusGeneratorFromReads(

68

consensusFrequencyThreshold = 0.6,

69

minReadsAtPosition = 10

70

)

71

72

val alignments = sc.loadBam("sample.bam")

73

val readsAtPosition = alignments.rdd.filter(_.getContigName == "chr1")

74

val consensus = generator.findConsensus(readsAtPosition.collect())

75

76

// Consensus incorporating known variants

77

val knownVariants = sc.loadVcf("dbsnp.vcf").toVariants().rdd

78

val variantGenerator = new ConsensusGeneratorFromKnowns(

79

knownVariants,

80

minAlleleFraction = 0.3

81

)

82

83

// Combined consensus strategy

84

val unionGenerator = new UnionConsensusGenerator(

85

List(generator, variantGenerator)

86

)

87

```

88

89

### Sequence Alignment

90

91

Local sequence alignment algorithms for comparing genomic sequences.

92

93

```scala { .api }

94

/**

95

* Smith-Waterman local sequence alignment algorithm

96

* Provides optimal local alignment between two sequences

97

*/

98

object SmithWaterman {

99

/**

100

* Perform local sequence alignment between reference and query sequences

101

* @param reference - Reference sequence string

102

* @param read - Query sequence string

103

* @param scoring - Scoring scheme for alignment

104

* @return Alignment object containing optimal local alignment

105

*/

106

def align(reference: String,

107

read: String,

108

scoring: SmithWatermanScoring): Alignment

109

}

110

111

/**

112

* Scoring scheme interface for sequence alignment

113

*/

114

trait SmithWatermanScoring {

115

/** Score for matching bases */

116

def matchScore: Int

117

118

/** Score for mismatching bases */

119

def mismatchScore: Int

120

121

/** Score for opening a gap */

122

def gapOpenScore: Int

123

124

/** Score for extending a gap */

125

def gapExtendScore: Int

126

}

127

128

/**

129

* Constant gap scoring scheme with fixed penalties

130

* @param matchScore - Score for matches

131

* @param mismatchScore - Penalty for mismatches

132

* @param gapScore - Penalty for gaps

133

*/

134

class SmithWatermanConstantGapScoring(

135

matchScore: Int = 10,

136

mismatchScore: Int = -2,

137

gapScore: Int = -8

138

) extends SmithWatermanScoring {

139

140

def gapOpenScore: Int = gapScore

141

def gapExtendScore: Int = gapScore

142

}

143

144

/**

145

* Affine gap scoring with different open and extend penalties

146

* @param matchScore - Score for matches

147

* @param mismatchScore - Penalty for mismatches

148

* @param gapOpenScore - Penalty for opening gaps

149

* @param gapExtendScore - Penalty for extending gaps

150

*/

151

class SmithWatermanGapScoringFromFn(

152

matchScore: Int,

153

mismatchScore: Int,

154

gapOpenScore: Int,

155

gapExtendScore: Int

156

) extends SmithWatermanScoring

157

```

158

159

**Usage Examples:**

160

161

```scala

162

import org.bdgenomics.adam.algorithms.smithwaterman._

163

164

// Set up scoring scheme

165

val scoring = new SmithWatermanConstantGapScoring(

166

matchScore = 10,

167

mismatchScore = -2,

168

gapScore = -5

169

)

170

171

// Perform alignment

172

val reference = "ACGTACGTACGT"

173

val query = "ACGTACGT"

174

val alignment = SmithWaterman.align(reference, query, scoring)

175

176

// Affine gap scoring for more realistic penalties

177

val affineScoring = new SmithWatermanGapScoringFromFn(

178

matchScore = 10,

179

mismatchScore = -4,

180

gapOpenScore = -10,

181

gapExtendScore = -1

182

)

183

184

val affineAlignment = SmithWaterman.align(reference, query, affineScoring)

185

```

186

187

### Variant Normalization

188

189

Algorithms for standardizing variant representations and resolving complex variants.

190

191

```scala { .api }

192

/**

193

* Utilities for normalizing variant representations

194

* Implements variant normalization standards for consistent representation

195

*/

196

object NormalizationUtils {

197

/**

198

* Left-align and trim variant representation

199

* @param variant - Variant to normalize

200

* @param reference - Reference sequence context

201

* @return Normalized variant representation

202

*/

203

def leftAlignAndTrim(variant: Variant, reference: String): Variant

204

205

/**

206

* Decompose multi-nucleotide variants into primitive representations

207

* @param variant - Complex variant to decompose

208

* @return Sequence of primitive variants

209

*/

210

def decomposeMNV(variant: Variant): Seq[Variant]

211

212

/**

213

* Normalize indel representation to canonical form

214

* @param variant - Indel variant to normalize

215

* @param reference - Reference sequence context

216

* @return Canonically represented indel

217

*/

218

def normalizeIndel(variant: Variant, reference: String): Variant

219

}

220

```

221

222

**Usage Examples:**

223

224

```scala

225

import org.bdgenomics.adam.algorithms.consensus.NormalizationUtils

226

import org.bdgenomics.adam.rdd.ADAMContext._

227

228

// Load variants and reference

229

val variants = sc.loadVcf("variants.vcf").toVariants()

230

val reference = sc.loadFasta("reference.fasta")

231

232

// Normalize variants

233

val normalized = variants.rdd.map { variant =>

234

val refContext = reference.extract(

235

ReferenceRegion(variant.getContigName, variant.getStart - 10, variant.getEnd + 10)

236

)

237

238

NormalizationUtils.leftAlignAndTrim(variant, refContext)

239

}

240

241

// Decompose complex variants

242

val decomposed = variants.rdd.flatMap { variant =>

243

if (variant.getAlternateAllele.length > 1) {

244

NormalizationUtils.decomposeMNV(variant)

245

} else {

246

Seq(variant)

247

}

248

}

249

```

250

251

### Quality Score Recalibration

252

253

Base quality score recalibration algorithms for improving sequencing accuracy.

254

255

```scala { .api }

256

/**

257

* Base quality score recalibration using known variant sites

258

* Implements GATK-style base quality score recalibration

259

* @param alignments - AlignmentRecordRDD to recalibrate

260

* @param knownSnps - Known variant sites for recalibration modeling

261

* @return AlignmentRecordRDD with recalibrated quality scores

262

*/

263

def recalibrateBaseQualities(alignments: AlignmentRecordRDD,

264

knownSnps: VariantRDD): AlignmentRecordRDD

265

266

/**

267

* Build quality score recalibration model

268

* @param alignments - Training alignments

269

* @param knownSites - Known variant sites

270

* @return Recalibration model for applying to new data

271

*/

272

def buildRecalibrationModel(alignments: AlignmentRecordRDD,

273

knownSites: VariantRDD): RecalibrationModel

274

```

275

276

### Duplicate Marking

277

278

Algorithms for identifying and marking duplicate reads in sequencing data.

279

280

```scala { .api }

281

/**

282

* Mark duplicate reads based on alignment coordinates

283

* Implements Picard-compatible duplicate marking algorithm

284

* @param alignments - AlignmentRecordRDD to process

285

* @return AlignmentRecordRDD with duplicate reads flagged

286

*/

287

def markDuplicates(alignments: AlignmentRecordRDD): AlignmentRecordRDD

288

289

/**

290

* Mark duplicate fragments for paired-end data

291

* @param fragments - FragmentRDD to process

292

* @return FragmentRDD with duplicate fragments flagged

293

*/

294

def markDuplicates(fragments: FragmentRDD): FragmentRDD

295

```

296

297

**Usage Examples:**

298

299

```scala

300

import org.bdgenomics.adam.rdd.ADAMContext._

301

302

val alignments = sc.loadBam("sample.bam")

303

304

// Mark duplicates

305

val markedDuplicates = alignments.markDuplicates()

306

307

// Remove duplicates for analysis

308

val uniqueReads = markedDuplicates.transform(_.filter(!_.getDuplicateRead))

309

310

// Quality recalibration

311

val knownSites = sc.loadVcf("dbsnp.vcf").toVariants()

312

val recalibrated = alignments.recalibrateBaseQualities(knownSites)

313

```

314

315

### Indel Realignment

316

317

Local realignment algorithms for improving alignment accuracy around indels.

318

319

```scala { .api }

320

/**

321

* Local realignment around indels to improve alignment accuracy

322

* @param alignments - AlignmentRecordRDD to realign

323

* @param knownIndels - Optional known indel sites for guidance

324

* @return AlignmentRecordRDD with improved alignments around indels

325

*/

326

def realignIndels(alignments: AlignmentRecordRDD,

327

knownIndels: Option[VariantRDD] = None): AlignmentRecordRDD

328

329

/**

330

* Identify candidate regions for realignment

331

* @param alignments - Input alignments

332

* @return RDD of genomic regions requiring realignment

333

*/

334

def findRealignmentCandidates(alignments: AlignmentRecordRDD): RDD[ReferenceRegion]

335

```

336

337

### Coverage Analysis Algorithms

338

339

Algorithms for analyzing sequencing depth and coverage patterns.

340

341

```scala { .api }

342

/**

343

* Calculate coverage depth across genomic positions

344

* @param alignments - AlignmentRecordRDD to analyze

345

* @param collapse - Whether to merge adjacent positions with same coverage

346

* @return CoverageRDD with depth information

347

*/

348

def calculateCoverage(alignments: AlignmentRecordRDD,

349

collapse: Boolean = true): CoverageRDD

350

351

/**

352

* Identify regions with coverage above specified threshold

353

* @param coverage - CoverageRDD to analyze

354

* @param threshold - Minimum coverage threshold

355

* @return FeatureRDD of high-coverage regions

356

*/

357

def findHighCoverageRegions(coverage: CoverageRDD,

358

threshold: Double): FeatureRDD

359

360

/**

361

* Calculate coverage statistics across genomic regions

362

* @param coverage - Coverage data

363

* @param regions - Regions of interest

364

* @return RDD of coverage statistics per region

365

*/

366

def coverageStatistics(coverage: CoverageRDD,

367

regions: FeatureRDD): RDD[(Feature, CoverageStatistics)]

368

```

369

370

### K-mer Analysis

371

372

Algorithms for analyzing k-mer composition in genomic sequences.

373

374

```scala { .api }

375

/**

376

* Count k-mers in sequencing reads

377

* @param alignments - Reads to analyze

378

* @param kmerLength - Length of k-mers to count

379

* @return RDD of k-mer sequences with occurrence counts

380

*/

381

def countKmers(alignments: AlignmentRecordRDD,

382

kmerLength: Int): RDD[(String, Long)]

383

384

/**

385

* Find overrepresented k-mers in reads

386

* @param kmers - K-mer counts from countKmers

387

* @param threshold - Minimum count threshold for overrepresentation

388

* @return RDD of overrepresented k-mers

389

*/

390

def findOverrepresentedKmers(kmers: RDD[(String, Long)],

391

threshold: Long): RDD[(String, Long)]

392

```

393

394

**Usage Examples:**

395

396

```scala

397

// K-mer analysis workflow

398

val alignments = sc.loadBam("sample.bam")

399

400

// Count 21-mers

401

val kmers = alignments.countKmers(21)

402

403

// Find most frequent k-mers

404

val topKmers = kmers.top(100)(Ordering.by(_._2))

405

406

// Identify overrepresented sequences

407

val overrepresented = kmers.filter(_._2 > 1000)

408

```

409

410

## Algorithm Performance Considerations

411

412

1. **Use appropriate partitioning** - Algorithms work best with data partitioned by genomic region

413

2. **Consider memory requirements** - Some algorithms require substantial memory for intermediate data structures

414

3. **Leverage broadcast variables** - Use broadcast for small reference datasets accessed by all partitions

415

4. **Pipeline operations** - Chain multiple algorithms to minimize data shuffling

416

5. **Persist intermediate results** - Cache RDDs that will be reused in iterative algorithms

417

6. **Tune parallelism** - Adjust partition count based on cluster size and data volume

418

419

## Algorithm Compatibility

420

421

ADAM's algorithms maintain compatibility with standard bioinformatics tools:

422

423

- **Duplicate marking**: Compatible with Picard MarkDuplicates

424

- **Quality recalibration**: Compatible with GATK BaseRecalibrator

425

- **Variant normalization**: Follows VCF specification standards

426

- **Consensus calling**: Compatible with samtools consensus methods

427

428

This ensures ADAM-processed data integrates seamlessly with existing genomics pipelines.