Core functionality for Apache Spark, providing RDDs, SparkContext, and the fundamental distributed computing engine for big data processing.
npx @tessl/cli install tessl/maven-org-apache-spark--spark-core_2-10@1.6.0

Apache Spark Core provides the foundational distributed computing engine for Apache Spark, implementing core abstractions like Resilient Distributed Datasets (RDDs) that enable fault-tolerant distributed data processing across clusters. It includes the SparkContext for managing distributed applications, schedulers for task execution, serializers for data exchange, broadcast variables for efficient data sharing, accumulators for distributed counters, and comprehensive APIs for data transformations and actions.
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD

For Java applications:
import org.apache.spark.SparkContext;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.Accumulator;

Basic usage in Scala:

import org.apache.spark.{SparkContext, SparkConf}
// Configure Spark application
val conf = new SparkConf()
.setAppName("MySparkApp")
.setMaster("local[*]")
// Create SparkContext
val sc = new SparkContext(conf)
// Create RDD from collection
val data = Array(1, 2, 3, 4, 5)
val rdd = sc.parallelize(data)
// Transform and act on RDD
val result = rdd
.map(_ * 2)
.filter(_ > 5)
.collect()
// Broadcast variable
val broadcastVar = sc.broadcast(Array(1, 2, 3))
// Accumulator
val accum = sc.accumulator(0)
// Clean up
sc.stop()

Apache Spark Core is built around several key components:
Core entry point for Spark applications with configuration management and resource coordination. Essential for creating RDDs, managing cluster connections, and coordinating distributed execution.
class SparkContext(config: SparkConf)
class SparkConf()
// Core RDD creation methods
def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T]
def range(start: Long, end: Long, step: Long = 1, numSlices: Int = defaultParallelism): RDD[Long]
def makeRDD[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T]
def emptyRDD[T: ClassTag]: RDD[T]
// File I/O methods
def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String]
def wholeTextFiles(path: String, minPartitions: Int = defaultMinPartitions): RDD[(String, String)]
def binaryFiles(path: String, minPartitions: Int = defaultMinPartitions): RDD[(String, PortableDataStream)]
def objectFile[T: ClassTag](path: String, minPartitions: Int = defaultMinPartitions): RDD[T]
// Shared variables
def broadcast[T: ClassTag](value: T): Broadcast[T]
def accumulator[T](initialValue: T)(implicit param: AccumulatorParam[T]): Accumulator[T]
// Context management
def stop(): Unit
def setCheckpointDir(directory: String): Unit
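
A minimal sketch that exercises several of these SparkContext methods; it assumes the SparkContext `sc` created in the usage example above, and the HDFS paths are placeholders:

// Create RDDs from files (placeholder paths)
val lines = sc.textFile("hdfs:///data/input.txt")        // RDD[String], one element per line
val docs  = sc.wholeTextFiles("hdfs:///data/docs")       // RDD[(path, fileContents)]

// Shared variables created through the context
val lookup     = sc.broadcast(Map("a" -> 1, "b" -> 2))   // read-only, shipped once per executor
val badRecords = sc.accumulator(0)                       // driver-readable counter

// Checkpointing requires a directory to be configured first
sc.setCheckpointDir("hdfs:///tmp/checkpoints")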
Resilient Distributed Datasets providing the core abstraction for distributed data processing with transformations, actions, and persistence capabilities.
abstract class RDD[T: ClassTag]
// Core transformations (lazy evaluation)
def map[U: ClassTag](f: T => U): RDD[U]
def filter(f: T => Boolean): RDD[T]
def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U]
def union(other: RDD[T]): RDD[T]
def distinct(): RDD[T]
def sample(withReplacement: Boolean, fraction: Double, seed: Long): RDD[T]
// Advanced transformations
def sortBy[K](f: T => K, ascending: Boolean = true)(implicit ord: Ordering[K], ctag: ClassTag[K]): RDD[T]
def keyBy[K](f: T => K): RDD[(K, T)]
def zipWithIndex(): RDD[(T, Long)]
def zipWithUniqueId(): RDD[(T, Long)]
// Core actions
def collect(): Array[T]
def count(): Long
def first(): T
def take(num: Int): Array[T]
def reduce(f: (T, T) => T): T
def fold(zeroValue: T)(op: (T, T) => T): T
def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U
// Persistence methods
def persist(newLevel: StorageLevel): RDD[T]
def cache(): RDD[T]
def unpersist(blocking: Boolean = true): RDD[T]

Specialized operations for RDDs containing key-value pairs, including joins, grouping, and aggregation operations essential for data processing workflows.
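
A brief sketch of the key-value operations listed below, again assuming the SparkContext `sc` from the usage example above:

val lines = sc.parallelize(Seq("spark core rdd", "spark context rdd"))

// Classic word count: build (word, 1) pairs, then aggregate per key
val pairs  = lines.flatMap(_.split(" ")).map(word => (word, 1))
val counts = pairs.reduceByKey(_ + _)                  // RDD[(String, Int)]
val groups = pairs.groupByKey()                        // RDD[(String, Iterable[Int])]

// Join the counts against a small reference dataset
val tags   = sc.parallelize(Seq(("spark", "engine"), ("rdd", "abstraction")))
val joined = counts.join(tags)                         // RDD[(String, (Int, String))]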
class PairRDDFunctions[K, V](self: RDD[(K, V)])
def groupByKey(): RDD[(K, Iterable[V])]
def reduceByKey(func: (V, V) => V): RDD[(K, V)]
def join[W](other: RDD[(K, W)]): RDD[(K, (V, W))]
def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))]

Broadcast variables and accumulators for efficient data sharing and distributed counting across cluster nodes.
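
A small sketch of both shared-variable types, assuming `sc` from the usage example above:

// Broadcast: read-only data shipped once to each executor
val stopWords = sc.broadcast(Set("a", "an", "the"))

// Accumulator: counter updated on executors, read back on the driver
val dropped = sc.accumulator(0)

val words = sc.parallelize(Seq("the", "spark", "a", "core"))
val kept = words.filter { w =>
  val isStopWord = stopWords.value.contains(w)
  if (isStopWord) dropped += 1        // updates in transformations may be re-applied on task retry
  !isStopWord
}
kept.collect()                        // an action must run before the accumulator is populated
println(dropped.value)                // 2 in a single, non-retried run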
abstract class Broadcast[T]
class Accumulator[T]
// SparkContext methods for shared variables
def broadcast[T](value: T): Broadcast[T]
def accumulator[T](initialValue: T): Accumulator[T]

Storage levels and caching mechanisms for optimizing RDD persistence across memory and disk with various replication strategies.
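
A sketch of the persistence API; the input path is a placeholder and `sc` is the context from the usage example above:

import org.apache.spark.storage.StorageLevel

val parsed = sc.textFile("hdfs:///data/input.txt").map(_.toLowerCase)

// Keep partitions in memory, spilling to disk when they do not fit;
// cache() is shorthand for persist(StorageLevel.MEMORY_ONLY)
parsed.persist(StorageLevel.MEMORY_AND_DISK)

parsed.count()        // first action materializes and stores the partitions
parsed.count()        // later actions reuse the persisted data

parsed.unpersist()    // release the storage once the dataset is no longer needed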
object StorageLevel
// Storage level constants
val MEMORY_ONLY: StorageLevel
val MEMORY_AND_DISK: StorageLevel
val DISK_ONLY: StorageLevel
// RDD persistence methods
def persist(storageLevel: StorageLevel): RDD[T]
def cache(): RDD[T]
def unpersist(): RDD[T]

File I/O operations for reading from and writing to various data sources including text files, sequence files, and Hadoop-compatible formats.
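
A sketch of a read-transform-write round trip; the paths are placeholders and `sc` is the context from the usage example above:

// Read newline-delimited text into an RDD[String]
val logs = sc.textFile("hdfs:///data/logs/*.log")

// Derive (host, errorCount) records and write them back out as text
val errorsByHost = logs
  .filter(_.contains("ERROR"))
  .map(line => (line.split(" ")(0), 1))
  .reduceByKey(_ + _)

errorsByHost.saveAsTextFile("hdfs:///data/output/errors-by-host")
// Pair RDDs of Writable-convertible types can also be written with saveAsSequenceFile,
// and object-serialized data can be read back with sc.objectFile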
// SparkContext I/O methods
def textFile(path: String): RDD[String]
def wholeTextFiles(path: String): RDD[(String, String)]
def sequenceFile[K, V](path: String): RDD[(K, V)]
// RDD output methods
def saveAsTextFile(path: String): Unit
def saveAsSequenceFile(path: String): Unit

Partitioning strategies and shuffle operations for controlling data distribution and optimizing performance across cluster nodes.
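
A sketch of explicit partitioning control, assuming `sc` from the usage example above:

import org.apache.spark.HashPartitioner

val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3), ("a", 4)), 8)

// Co-locate records that share a key by hashing keys into 4 partitions
val byKey = pairs.partitionBy(new HashPartitioner(4))

// repartition() triggers a full shuffle to the requested partition count;
// coalesce() narrows to fewer partitions and avoids a full shuffle
val wider    = byKey.repartition(16)
val narrower = byKey.coalesce(2)
println(narrower.partitions.length)    // 2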
abstract class Partitioner
class HashPartitioner(partitions: Int) extends Partitioner
class RangePartitioner[K, V](partitions: Int, rdd: RDD[(K, V)]) extends Partitioner
// Partitioning methods
def partitionBy(partitioner: Partitioner): RDD[(K, V)]
def repartition(numPartitions: Int): RDD[T]
def coalesce(numPartitions: Int): RDD[T]

// Core type aliases and abstractions
type Partition = org.apache.spark.Partition
type TaskContext = org.apache.spark.TaskContext
type SparkFiles = org.apache.spark.SparkFiles.type
// Function type aliases for Java interop
type Function[T, R] = T => R
type Function2[T1, T2, R] = (T1, T2) => R
type VoidFunction[T] = T => Unit
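
A brief sketch touching the TaskContext and SparkFiles abstractions referenced above; `sc` is the context from the usage example, and the shipped file is a placeholder:

import org.apache.spark.{SparkFiles, TaskContext}

// Ship a side file to every executor; SparkFiles.get resolves its local copy
sc.addFile("/tmp/lookup.csv")                       // placeholder local file

val data = sc.parallelize(1 to 100, 4)
val tagged = data.mapPartitionsWithIndex { (partitionId, iter) =>
  // TaskContext exposes per-task metadata for the running task
  val attempt   = TaskContext.get().attemptNumber()
  val localPath = SparkFiles.get("lookup.csv")
  iter.map(n => (partitionId, attempt, localPath, n))
}
tagged.take(3).foreach(println)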