Core library for distributed genomics data processing built on Apache Spark with support for major genomic file formats
—
Comprehensive support for reading and writing genomic file formats, with automatic format detection and validation. ADAM Core bridges legacy genomic file formats with modern distributed processing capabilities, providing efficient I/O operations for all major genomic data types.
Support for sequencing alignment data in standard and modern formats.
/**
* Load alignment data from SAM, BAM, or CRAM files
* Automatically detects format based on file extension and magic bytes
* @param pathName - Path to alignment file or directory
* @param stringency - Validation stringency for format compliance (defaults to STRICT)
* @return AlignmentRecordRDD containing alignment records
*/
def loadBam(pathName: String,
stringency: ValidationStringency = ValidationStringency.STRICT): AlignmentRecordRDD
/**
* Save alignment records as SAM/BAM/CRAM format
* @param pathName - Output path
* @param asType - Output format specification
* @param asSingleFile - Whether to merge output into single file
* @param stringency - Validation stringency for output compliance
*/
def saveAsSam(pathName: String,
asType: SAMFormat = SAMFormat.SAM,
asSingleFile: Boolean = false,
stringency: ValidationStringency = ValidationStringency.STRICT): Unit
Supported Alignment Formats:
Usage Examples:
import org.bdgenomics.adam.rdd.ADAMContext._
// Load various alignment formats
val samData = sc.loadBam("alignments.sam")
val bamData = sc.loadBam("alignments.bam")
val cramData = sc.loadBam("alignments.cram")
// Save in different formats
samData.saveAsSam("output.sam", SAMFormat.SAM)
samData.saveAsSam("output.bam", SAMFormat.BAM)
samData.saveAsSam("output.cram", SAMFormat.CRAM)
// Save as ADAM's native format
samData.saveAsParquet("alignments.adam")
Support for genetic variant data with full VCF specification compliance.
/**
* Load variant data from VCF files (plain text or compressed)
* Supports VCF 4.0+ specifications with full header parsing
* @param pathName - Path to VCF file or directory
* @param stringency - Validation stringency for VCF compliance (defaults to STRICT)
* @return VariantContextRDD containing variants with metadata
*/
def loadVcf(pathName: String,
stringency: ValidationStringency = ValidationStringency.STRICT): VariantContextRDD
/**
* Save variant data as VCF format
* @param pathName - Output path
* @param stringency - Validation stringency for VCF compliance
* @param asSingleFile - Whether to merge output into single file
*/
def saveAsVcf(pathName: String,
stringency: ValidationStringency = ValidationStringency.STRICT,
asSingleFile: Boolean = false): Unit
Supported Variant Formats:
Usage Examples:
// Load VCF files
val variants = sc.loadVcf("variants.vcf")
val compressedVariants = sc.loadVcf("variants.vcf.gz")
// Work with different variant representations
val variantOnly = variants.toVariants() // Just variant sites
val genotypes = variants.toGenotypes() // Genotype calls
// Save in various formats
variants.saveAsVcf("output.vcf")
variantOnly.saveAsParquet("variants.adam")
genotypes.saveAsParquet("genotypes.adam")
Support for raw sequencing data and reference genomes.
/**
* Load FASTQ sequencing data (single-end or paired-end)
* @param pathName1 - First FASTQ file (or single-end file)
* @param optPathName2 - Optional second FASTQ file for paired-end
*                       (defaults to None; omit it for single-end input)
* @return AlignmentRecordRDD containing unaligned reads
*/
def loadFastq(pathName1: String, optPathName2: Option[String] = None): AlignmentRecordRDD
/**
* Load interleaved FASTQ where paired reads alternate
* @param pathName - Path to interleaved FASTQ file
* @return AlignmentRecordRDD containing paired reads
*/
def loadInterleavedFastq(pathName: String): AlignmentRecordRDD
/**
* Load reference genome sequences from FASTA files
* @param pathName - Path to FASTA file
* @param maximumLength - Maximum sequence length to load per record (defaults to 10000)
* @return NucleotideContigFragmentRDD containing reference sequences
*/
def loadFasta(pathName: String, maximumLength: Long = 10000L): NucleotideContigFragmentRDD
/**
* Save reads as FASTQ format
* @param pathName - Output path
* @param outputOriginalBaseQualities - Use original vs. recalibrated qualities (defaults to false)
* @param asSingleFile - Merge output into single file (defaults to false)
*/
def saveAsFastq(pathName: String,
outputOriginalBaseQualities: Boolean = false,
asSingleFile: Boolean = false): Unit
/**
* Save reference sequences as FASTA format
* @param pathName - Output path
* @param lineWidth - Bases per line in output
*/
def saveAsFasta(pathName: String, lineWidth: Int = 60): Unit
Supported Sequence Formats:
Usage Examples:
// Load sequencing data
val singleEnd = sc.loadFastq("reads.fastq")
val pairedEnd = sc.loadFastq("R1.fastq", Some("R2.fastq"))
val interleaved = sc.loadInterleavedFastq("paired.fastq")
// Load reference genome
val reference = sc.loadFasta("hg38.fasta")
// Save processed reads
val processed = singleEnd.transform(_.filter(_.getReadName.startsWith("good")))
processed.saveAsFastq("filtered_reads.fastq")
// Save reference sequences
reference.saveAsFasta("output_reference.fasta", lineWidth = 80)
Support for genomic feature annotations in multiple standard formats.
/**
* Load genomic features with automatic format detection
* Supports BED, GFF3, GTF, IntervalList, and NarrowPeak formats
* @param pathName - Path to feature file
* @return FeatureRDD containing genomic annotations
*/
def loadFeatures(pathName: String): FeatureRDD
/**
* Save features as BED format
* @param pathName - Output path
*/
def saveAsBed(pathName: String): Unit
/**
* Save features as GTF format (Gene Transfer Format)
* @param pathName - Output path
*/
def saveAsGtf(pathName: String): Unit
/**
* Save features as GFF3 format (General Feature Format)
* @param pathName - Output path
*/
def saveAsGff3(pathName: String): Unit
/**
* Save features as Picard IntervalList format
* @param pathName - Output path
*/
def saveAsIntervalList(pathName: String): Unit
/**
* Save features as ENCODE narrowPeak format
* @param pathName - Output path
*/
def saveAsNarrowPeak(pathName: String): Unit
Supported Feature Formats:
Usage Examples:
// Load various annotation formats
val bedFeatures = sc.loadFeatures("regions.bed")
val geneAnnotations = sc.loadFeatures("genes.gtf")
val gff3Features = sc.loadFeatures("annotations.gff3")
// Convert between formats
bedFeatures.saveAsGtf("converted.gtf")
geneAnnotations.saveAsBed("genes.bed")
// Filter and save
val exons = geneAnnotations.transform(_.filter(_.getFeatureType == "exon"))
exons.saveAsIntervalList("exons.interval_list")
Support for sequencing depth and coverage data visualization formats.
/**
* Save coverage data as WIG format for genome browsers
* @param pathName - Output path
*/
def saveAsWig(pathName: String): Unit
/**
* Save coverage data as BigWig format (through conversion)
* @param pathName - Output path
* @param sequenceDictionary - Reference sequence information
*/
def saveAsBigWig(pathName: String, sequenceDictionary: SequenceDictionary): Unit
Usage Examples:
// Generate and save coverage
val alignments = sc.loadBam("sample.bam")
val coverage = alignments.toCoverage()
// Save for genome browser visualization
coverage.saveAsWig("coverage.wig")
// Convert features to coverage
val features = sc.loadFeatures("peaks.bed")
val featureCoverage = features.toCoverage()
featureCoverage.saveAsWig("peak_coverage.wig")
Automatic format detection for seamless data loading regardless of file format.
/**
* Load alignment data with automatic format detection
* Detects SAM, BAM, CRAM, or ADAM formats automatically
* @param pathName - Path to alignment file
* @return AlignmentRecordRDD
*/
def loadAlignments(pathName: String): AlignmentRecordRDD
/**
* Load variant data with automatic format detection
* Detects VCF or ADAM variant formats automatically
* @param pathName - Path to variant file
* @return VariantRDD
*/
def loadVariants(pathName: String): VariantRDD
/**
* Load genotype data with automatic format detection
* Detects VCF or ADAM genotype formats automatically
* @param pathName - Path to genotype file
* @return GenotypeRDD
*/
def loadGenotypes(pathName: String): GenotypeRDD
/**
* Load feature data with automatic format detection
* Detects BED, GTF, GFF3, or ADAM feature formats automatically
* @param pathName - Path to feature file
* @return FeatureRDD
*/
def loadFeatures(pathName: String): FeatureRDD
Usage Examples:
// Load without specifying format
val alignments = sc.loadAlignments("unknown_format_file") // Auto-detects
val variants = sc.loadVariants("variants_file") // Auto-detects
val features = sc.loadFeatures("annotations_file") // Auto-detects
// Particularly useful for processing directories with mixed formats
val mixedAlignments = sc.loadAlignments("alignment_directory/")
ADAM's high-performance columnar storage format with schema evolution support.
/**
* Load data from ADAM Parquet files with optional projection and filtering
* @tparam T - Record type of the returned RDD
* @param pathName - Path to Parquet file or directory
* @param optPredicate - Optional server-side filtering predicate (defaults to None)
* @param optProjection - Optional column projection for efficiency (defaults to None)
* @return RDD of specified type
*/
def loadParquet[T](pathName: String,
optPredicate: Option[FilterPredicate] = None,
optProjection: Option[Schema] = None): RDD[T]
/**
* Save any GenomicRDD as ADAM Parquet format
* @param pathName - Output path
*/
def saveAsParquet(pathName: String): Unit
Usage Examples:
import org.bdgenomics.adam.projections.{AlignmentRecordField, Projection}
import org.apache.parquet.filter2.predicate.FilterApi._
// Save as ADAM format
val alignments = sc.loadBam("input.bam")
alignments.saveAsParquet("alignments.adam")
// Load with column projection for efficiency
val projection = Projection(AlignmentRecordField.readName,
AlignmentRecordField.sequence,
AlignmentRecordField.readMapped)
val projectedAlignments = sc.loadParquet[AlignmentRecord](
"alignments.adam",
optProjection = Some(projection)
)
// Load with server-side filtering
// Parquet's FilterApi has no `equal`; the equality factory is `eq`. It is called
// fully qualified so it cannot be mistaken for AnyRef.eq. readMapped is a boolean
// field, so it needs booleanColumn and a boxed java.lang.Boolean value.
val mappedFilter = org.apache.parquet.filter2.predicate.FilterApi.eq(
  booleanColumn("readMapped"), java.lang.Boolean.TRUE)
val mappedReads = sc.loadParquet[AlignmentRecord](
"alignments.adam",
optPredicate = Some(mappedFilter)
)
Comprehensive validation and error handling across all supported formats.
/**
* Levels of strictness applied when checking format compliance.
*
* STRICT  - fail immediately on any format violations
* LENIENT - log warnings for format violations but continue processing
* SILENT  - ignore format violations silently
*/
object ValidationStringency extends Enumeration {
  // Declaration order fixes the ids: STRICT = 0, LENIENT = 1, SILENT = 2.
  val STRICT, LENIENT, SILENT = Value
}
/**
* SAM format output types
*/
object SAMFormat extends Enumeration {
val SAM = Value // Plain text SAM
val BAM = Value // Binary BAM
val CRAM = Value // Reference compressed CRAM
}
ADAM automatically detects formats based on file extensions:
When file extensions are ambiguous or missing, ADAM examines file content headers for format detection.
Install with Tessl CLI
npx tessl i tessl/maven-org-bdgenomics-adam--adam-core