Core library for distributed genomics data processing built on Apache Spark with support for major genomic file formats
—
Core functionality for loading genomic data from various file formats and saving transformed results. ADAM Core provides a unified interface for accessing genomic data regardless of the underlying storage format, with support for both local files and distributed storage systems.
ADAMContext is the main entry point for all data loading operations; its loading methods are made available on SparkContext via an implicit conversion.
/**
* Implicit conversion that adds ADAM data loading methods to SparkContext
* @param sc - Spark context to extend
* @return ADAMContext with genomic data loading capabilities
*/
implicit def sparkContextToADAMContext(sc: SparkContext): ADAMContext
Load aligned and unaligned sequencing reads from SAM, BAM, and CRAM formats.
/**
* Load alignment records from SAM/BAM/CRAM files.
*
* @param pathName - Path to alignment file or directory of files
* @param stringency - Validation stringency for format compliance;
*                     defaults to ValidationStringency.STRICT when omitted
* @return AlignmentRecordRDD containing sequencing reads
*/
def loadBam(pathName: String,
stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordRDD
/**
* Load alignment records from indexed BAM/CRAM file for specific genomic regions
* @param pathName - Path to indexed alignment file
* @param viewRegions - Genomic regions to query
* @return AlignmentRecordRDD containing reads overlapping the regions
*/
def loadIndexedBam(pathName: String, viewRegions: Iterable[ReferenceRegion]): AlignmentRecordRDD
Usage Examples:
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.adam.models.ReferenceRegion
// Load entire BAM file
val allReads = sc.loadBam("sample.bam")
// Load with lenient validation for malformed files
val reads = sc.loadBam("sample.bam", ValidationStringency.LENIENT)
// Load specific region from indexed BAM
val region = ReferenceRegion("chr1", 1000000, 2000000)
val regionReads = sc.loadIndexedBam("sample.bam", region)
Load genetic variants and genotype information from VCF files.
/**
* Load variant contexts from VCF files with full metadata.
*
* @param pathName - Path to VCF file or directory of files
* @param stringency - Validation stringency for VCF format compliance;
*                     defaults to ValidationStringency.STRICT when omitted
* @return VariantContextRDD containing variants with genotype information
*/
def loadVcf(pathName: String,
stringency: ValidationStringency = ValidationStringency.STRICT): VariantContextRDD
/**
* Load variants from indexed VCF file for specific genomic regions
* @param pathName - Path to indexed VCF file (with .tbi or .csi index)
* @param viewRegions - Genomic regions to query
* @return VariantContextRDD containing variants in the specified regions
*/
def loadIndexedVcf(pathName: String, viewRegions: Iterable[ReferenceRegion]): VariantContextRDD
Usage Examples:
// Load complete VCF file
val variants = sc.loadVcf("variants.vcf")
// Load from compressed VCF with strict validation
val compressedVariants = sc.loadVcf("variants.vcf.gz", ValidationStringency.STRICT)
// Load specific chromosomal region
val chrRegion = ReferenceRegion("chr22", 0, 51304566)
val chr22Variants = sc.loadIndexedVcf("variants.vcf.gz", chrRegion)
Load raw sequencing data and reference sequences.
/**
* Load FASTQ sequencing reads (single-end or paired-end).
*
* Note that optPathName2 has no default value, so it must always be passed
* explicitly: None for single-end data, Some(path) for paired-end data.
*
* @param pathName1 - Path to first FASTQ file (or single-end file)
* @param optPathName2 - Optional path to second FASTQ file for paired-end reads
*                       (required argument; no default)
* @param optRecordGroup - Optional read group identifier for the reads; defaults to None
* @param stringency - Validation stringency for FASTQ format compliance;
*                     defaults to ValidationStringency.STRICT
* @return AlignmentRecordRDD containing unaligned sequencing reads
*/
def loadFastq(pathName1: String,
optPathName2: Option[String],
optRecordGroup: Option[String] = None,
stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordRDD
/**
* Load an interleaved FASTQ file, i.e. a single file in which paired reads
* are arranged alternately.
*
* @param pathName - Path to interleaved FASTQ file
* @return AlignmentRecordRDD containing paired sequencing reads
*/
def loadInterleavedFastq(pathName: String): AlignmentRecordRDD
/**
* Load reference genome sequences from FASTA files
* @param pathName - Path to FASTA file
* @param maximumLength - Maximum length of individual sequences to load
* @return NucleotideContigFragmentRDD containing reference sequences
*/
def loadFasta(pathName: String, maximumLength: Long = 10000L): NucleotideContigFragmentRDD
Usage Examples:
// Load single-end FASTQ. The second-file argument has no default in the
// loadFastq signature, so None must be passed explicitly.
val singleEndReads = sc.loadFastq("reads.fastq", None)
// Load paired-end FASTQ files
val pairedReads = sc.loadFastq("reads_R1.fastq", Some("reads_R2.fastq"))
// Load with custom read group
val readsWithRG = sc.loadFastq("reads.fastq", None, Some("sample1.rg1"))
// Load interleaved paired FASTQ
val interleavedReads = sc.loadInterleavedFastq("paired.fastq")
// Load reference genome
val reference = sc.loadFasta("hg38.fasta", maximumLength = 50000L)
Load genomic annotations and features from various formats.
/**
* Load genomic features from BED, GFF3, GTF, or other supported formats
* @param pathName - Path to feature file
* @return FeatureRDD containing genomic annotations
*/
def loadFeatures(pathName: String): FeatureRDD
Usage Examples:
// Load BED file annotations
val bedFeatures = sc.loadFeatures("annotations.bed")
// Load GTF gene annotations
val geneFeatures = sc.loadFeatures("genes.gtf")
// Load GFF3 annotations
val gff3Features = sc.loadFeatures("features.gff3")
Load genomic data without specifying the exact format, with automatic format detection.
/**
* Load alignment records, detecting the input format automatically.
*
* NOTE(review): how the format is detected (file extension vs. content) is
* not shown in this reference - confirm against the implementation.
*
* @param pathName - Path to alignment file (SAM/BAM/CRAM/ADAM)
* @return AlignmentRecordRDD containing sequencing reads
*/
def loadAlignments(pathName: String): AlignmentRecordRDD
/**
* Load variants, detecting the input format automatically.
*
* Returns a VariantRDD, whereas loadVcf returns a VariantContextRDD.
*
* @param pathName - Path to variant file (VCF/ADAM)
* @return VariantRDD containing genetic variants
*/
def loadVariants(pathName: String): VariantRDD
/**
* Load genotypes with automatic format detection
* @param pathName - Path to genotype file (VCF/ADAM)
* @return GenotypeRDD containing genotype calls
*/
def loadGenotypes(pathName: String): GenotypeRDD
Load data from ADAM's native Parquet+Avro format with optional filtering and projection.
/**
* Load data from Parquet files with optional predicate pushdown and column projection
* @param pathName - Path to Parquet file or directory
* @param optPredicate - Optional filter predicate for server-side filtering
* @param optProjection - Optional schema projection to load only specific fields
* @return RDD of the specified type T
*/
def loadParquet[T](pathName: String,
optPredicate: Option[FilterPredicate] = None,
optProjection: Option[Schema] = None): RDD[T]
Usage Examples:
import org.bdgenomics.adam.projections.{AlignmentRecordField, Projection}
import org.apache.parquet.filter2.predicate.FilterApi._
// Load with column projection for efficiency: only the listed fields are read
val projection = Projection(AlignmentRecordField.readName, AlignmentRecordField.readMapped)
val projectedReads = sc.loadParquet[AlignmentRecord]("reads.adam",
optProjection = Some(projection))
// Load with predicate filtering.
// readMapped is compared against a boolean, so it needs booleanColumn rather
// than binaryColumn. Parquet's equality predicate is FilterApi.eq (there is no
// `equal`); it is called fully qualified because a bare `eq` in Scala resolves
// to AnyRef.eq instead of the wildcard-imported method.
val mappedFilter = org.apache.parquet.filter2.predicate.FilterApi.eq(
  booleanColumn("readMapped"), java.lang.Boolean.TRUE)
val mappedReads = sc.loadParquet[AlignmentRecord]("reads.adam",
optPredicate = Some(mappedFilter))
Additional configuration options for specialized loading scenarios.
/**
* Validation stringency options.
*
*  - STRICT: fail on format violations
*  - LENIENT: log warnings for violations
*  - SILENT: ignore format violations
*/
object ValidationStringency extends Enumeration {
  // Declaration order fixes the ids: STRICT = 0, LENIENT = 1, SILENT = 2.
  val STRICT, LENIENT, SILENT = Value
}
// SAM format types for saving
object SAMFormat extends Enumeration {
val SAM = Value // Plain text SAM
val BAM = Value // Binary BAM
val CRAM = Value // Compressed CRAM
}
.cache() method
Install with Tessl CLI
npx tessl i tessl/maven-org-bdgenomics-adam--adam-core