Java/Python API wrappers for the ADAM genomics analysis library, enabling scalable genomic data processing with Apache Spark
npx @tessl/cli install tessl/maven-org-bdgenomics-adam--adam-apis-2-10@0.23.0

ADAM APIs provides Java- and Python-friendly API wrappers for the ADAM (A Distributed Alignment Mapper) genomics analysis library. This module enables scalable genomic data processing using Apache Spark's distributed computing capabilities, offering convenient wrapper classes and converters that make ADAM's core functionality accessible to Java and Python developers.
<dependency>
  <groupId>org.bdgenomics.adam</groupId>
  <artifactId>adam-apis_2.10</artifactId>
  <version>0.23.0</version>
</dependency>

Core imports:

import org.bdgenomics.adam.api.java.JavaADAMContext;
import org.bdgenomics.adam.rdd.ADAMContext;
import org.apache.spark.api.java.JavaSparkContext;
import htsjdk.samtools.ValidationStringency;

For genomic RDD types:
import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD;
import org.bdgenomics.adam.rdd.contig.NucleotideContigFragmentRDD;
import org.bdgenomics.adam.rdd.fragment.FragmentRDD;
import org.bdgenomics.adam.rdd.feature.FeatureRDD;
import org.bdgenomics.adam.rdd.feature.CoverageRDD;
import org.bdgenomics.adam.rdd.variant.GenotypeRDD;
import org.bdgenomics.adam.rdd.variant.VariantRDD;
import org.bdgenomics.adam.rdd.variant.VariantContextRDD;
import org.bdgenomics.adam.util.ReferenceFile;

For RDD/Dataset conversions:
import org.bdgenomics.adam.api.java.*;

For Python API support:
import org.bdgenomics.adam.api.python.DataFrameConversionWrapper;

Basic usage example:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.bdgenomics.adam.api.java.JavaADAMContext;
import org.bdgenomics.adam.rdd.ADAMContext;
import org.bdgenomics.adam.rdd.read.AlignmentRecordRDD;
// Create Spark context
SparkConf conf = new SparkConf().setAppName("ADAM API Example");
JavaSparkContext jsc = new JavaSparkContext(conf);
// Create ADAM context
ADAMContext ac = new ADAMContext(jsc.sc());
JavaADAMContext jac = new JavaADAMContext(ac);
// Load genomic data
AlignmentRecordRDD alignments = jac.loadAlignments("sample.bam");
System.out.println("Loaded " + alignments.jrdd().count() + " alignment records");
// Load other genomic data types
jac.loadVariants("variants.vcf");
jac.loadFeatures("annotations.bed");
jac.loadContigFragments("reference.fa");

ADAM APIs is built around several key components:
Core functionality for loading genomic data from various file formats into ADAM's specialized RDD types. Supports automatic format detection and validation.
// Main context class
class JavaADAMContext {
  JavaADAMContext(ADAMContext ac);
  JavaSparkContext getSparkContext();

  // Load alignment data (BAM/CRAM/SAM/FASTA/FASTQ)
  AlignmentRecordRDD loadAlignments(String pathName);
  AlignmentRecordRDD loadAlignments(String pathName, ValidationStringency stringency);

  // Load reference sequences
  NucleotideContigFragmentRDD loadContigFragments(String pathName);
  ReferenceFile loadReferenceFile(String pathName);
  ReferenceFile loadReferenceFile(String pathName, Long maximumLength);

  // Load fragments (paired-end sequencing data)
  FragmentRDD loadFragments(String pathName);
  FragmentRDD loadFragments(String pathName, ValidationStringency stringency);

  // Load genomic features (annotations)
  FeatureRDD loadFeatures(String pathName);
  FeatureRDD loadFeatures(String pathName, ValidationStringency stringency);

  // Load coverage data
  CoverageRDD loadCoverage(String pathName);
  CoverageRDD loadCoverage(String pathName, ValidationStringency stringency);

  // Load variant data
  GenotypeRDD loadGenotypes(String pathName);
  GenotypeRDD loadGenotypes(String pathName, ValidationStringency stringency);
  VariantRDD loadVariants(String pathName);
  VariantRDD loadVariants(String pathName, ValidationStringency stringency);
}
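As a quick sketch of the loaders above (file names are hypothetical, and jac is the JavaADAMContext from the earlier example):

// Load paired-end fragments, tolerating minor format issues
FragmentRDD fragments =
    jac.loadFragments("reads.ifq", ValidationStringency.LENIENT);

// Load features from a gzip-compressed BED file (see the note on
// Hadoop compression codecs below)
FeatureRDD features = jac.loadFeatures("annotations.bed.gz");

// Load a broadcastable reference; the Long argument caps the
// fragment length (inferred from the signature above)
ReferenceFile reference = jac.loadReferenceFile("reference.fa", 10000L);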
Comprehensive set of converter classes for transforming between different genomic RDD types. Each converter implements the Spark Function2 interface, so it can be used directly in Spark transformations.

// Base conversion interface
interface SameTypeConversion<T, U extends GenomicRDD<T, U>> extends Function2<U, RDD<T>, U> {
  U call(U v1, RDD<T> v2);
}
// Example converter classes
class ContigsToAlignmentRecordsConverter implements Function2<NucleotideContigFragmentRDD, RDD<AlignmentRecord>, AlignmentRecordRDD>;
class AlignmentRecordsToVariantsConverter implements Function2<AlignmentRecordRDD, RDD<Variant>, VariantRDD>;
class VariantsToGenotypesConverter implements Function2<VariantRDD, RDD<Genotype>, GenotypeRDD>;
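Because each converter is a plain Function2, it can be invoked directly. A minimal sketch, assuming variants is an existing VariantRDD and genotypeRdd is an RDD<Genotype> derived from it (both hypothetical):

// Convert the genotype records while carrying over the metadata
// (e.g., the sequence dictionary) tracked by the source VariantRDD
VariantsToGenotypesConverter converter = new VariantsToGenotypesConverter();
GenotypeRDD genotypes = converter.call(variants, genotypeRdd);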
Spark SQL Dataset-based converters provide functionality similar to the RDD converters, but operate on Datasets for better performance and SQL integration.

// Base dataset conversion traits
interface ToAlignmentRecordDatasetConversion<T extends Product, U extends GenomicDataset<?, T, U>>
    extends GenomicDatasetConversion<T, U, AlignmentRecord, AlignmentRecordRDD>;
// Example dataset converter classes
class ContigsToAlignmentRecordsDatasetConverter implements ToAlignmentRecordDatasetConversion<NucleotideContigFragment, NucleotideContigFragmentRDD>;
class VariantsToGenotypesDatasetConverter implements ToGenotypeDatasetConversion<Variant, VariantRDD>;
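Assuming the dataset converters follow the same call convention as the RDD converters, with a Dataset argument in place of the RDD (an assumption; the exact signature is not shown above), usage would look analogous. Here variants is a hypothetical VariantRDD and genotypeDataset a hypothetical Dataset<Genotype>:

// Hypothetical: rebuild a GenotypeRDD from a Dataset<Genotype>
// while preserving the source VariantRDD's metadata
VariantsToGenotypesDatasetConverter dsConverter =
    new VariantsToGenotypesDatasetConverter();
GenotypeRDD genotypesFromDataset = dsConverter.call(variants, genotypeDataset);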
Wrapper functionality enables Python integration through DataFrame conversion utilities.

class DataFrameConversionWrapper implements JFunction<DataFrame, DataFrame> {
  DataFrameConversionWrapper(DataFrame newDf);
  DataFrame call(DataFrame v1);
}
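Judging by the constructor and call signature, the wrapper captures a replacement DataFrame and returns it from call, letting Python code substitute the DataFrame backing a genomic RDD. A minimal sketch with hypothetical oldDf and newDf:

// call(...) is expected to return the DataFrame passed to the
// constructor, regardless of its input (inferred from the signature)
DataFrameConversionWrapper wrapper = new DataFrameConversionWrapper(newDf);
DataFrame replaced = wrapper.call(oldDf);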
All formats support standard Hadoop compression codecs (.gz, .bz2) where applicable.

// Genomic RDD wrapper types with metadata preservation
interface GenomicRDD<T, U extends GenomicRDD<T, U>> {
  JavaRDD<T> jrdd();
  // Additional metadata methods...
}
class AlignmentRecordRDD implements GenomicRDD<AlignmentRecord, AlignmentRecordRDD> {}
class NucleotideContigFragmentRDD implements GenomicRDD<NucleotideContigFragment, NucleotideContigFragmentRDD> {}
class FragmentRDD implements GenomicRDD<Fragment, FragmentRDD> {}
class FeatureRDD implements GenomicRDD<Feature, FeatureRDD> {}
class CoverageRDD implements GenomicRDD<Coverage, CoverageRDD> {}
class GenotypeRDD implements GenomicRDD<Genotype, GenotypeRDD> {}
class VariantRDD implements GenomicRDD<Variant, VariantRDD> {}
class VariantContextRDD implements GenomicRDD<VariantContext, VariantContextRDD> {}
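Every wrapper exposes its records via jrdd() for standard Spark processing. A short sketch, where alignments is the AlignmentRecordRDD from the usage example; getReadMapped() comes from the bdg-formats AlignmentRecord schema and is an assumption not listed above:

// Count mapped reads through the plain JavaRDD view
long mappedReads = alignments.jrdd()
    .filter(r -> r.getReadMapped())
    .count();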
// HTSJDK validation strictness control
enum ValidationStringency {
  STRICT,  // Fail on any format violations
  LENIENT, // Warn on format issues but continue processing
  SILENT   // Ignore format violations silently
}
// Broadcastable reference sequences
class ReferenceFile {
  // Methods for efficient reference lookups across the cluster
}
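A sketch of a reference lookup; extract(...) together with the ReferenceRegion and Strand types (org.bdgenomics.adam.models.ReferenceRegion, org.bdgenomics.formats.avro.Strand) are drawn from ADAM's Scala API and are assumptions, as they are not part of the listing above:

// Load the reference once, then extract bases for a region;
// the ReferenceFile can be broadcast for cluster-wide lookups
ReferenceFile ref = jac.loadReferenceFile("reference.fa");
String bases = ref.extract(
    new ReferenceRegion("chr1", 100L, 200L, Strand.INDEPENDENT));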
// Spark integration types
class JavaSparkContext {
  // Standard Spark Java API context
}

class DataFrame {
  // Spark SQL DataFrame used for Python integration (Dataset<Row> in Spark 2.x)
}