Apache Spark Core provides the foundational execution engine and API for distributed data processing with RDDs, task scheduling, and cluster management.
Installation:

```
npx @tessl/cli install tessl/maven-org-apache-spark--spark-core_2_13@4.0.0
```

Apache Spark Core provides the foundational execution engine and API for distributed data processing across clusters. It implements the core distributed computing primitives, including RDDs (Resilient Distributed Datasets), task scheduling, memory management, fault tolerance, and the base APIs that power all other Spark components.

Or add the Maven dependency directly:

```xml
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-core_2.13</artifactId>
  <version>4.0.0</version>
</dependency>
```

For Scala:

```scala
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD
```

For Java:

```java
import org.apache.spark.SparkContext;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
```

Basic usage:

```scala
import org.apache.spark.{SparkContext, SparkConf}
// Create configuration
val conf = new SparkConf()
  .setAppName("MySparkApp")
  .setMaster("local[*]")
// Create Spark context
val sc = new SparkContext(conf)
// Create RDD from collection
val data = sc.parallelize(Array(1, 2, 3, 4, 5))
// Transform and action
val squared = data.map(x => x * x)
val result = squared.collect()
// Cleanup
sc.stop()
```

Spark Core is built around several key components:

Core application setup and cluster connection management. SparkContext serves as the primary interface for creating RDDs and configuring distributed execution.

```scala
class SparkContext(config: SparkConf) {
  def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T]
  def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String]
  def stop(): Unit
  def broadcast[T: ClassTag](value: T): Broadcast[T]
  def version: String
  def defaultParallelism: Int
}

class SparkConf(loadDefaults: Boolean = true) {
  def set(key: String, value: String): SparkConf
  def setAppName(name: String): SparkConf
  def setMaster(master: String): SparkConf
}
```
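A minimal sketch of reading a text file with `textFile`; the input path is a placeholder:

```scala
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setAppName("TextFileExample").setMaster("local[*]")
val sc = new SparkContext(conf)

// Read a text file into an RDD of lines; "input.txt" is a placeholder path
val lines = sc.textFile("input.txt")
println(s"Read ${lines.count()} lines; default parallelism is ${sc.defaultParallelism}")

sc.stop()
```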
Resilient Distributed Dataset API providing transformations and actions for distributed data processing. RDDs support fault-tolerant parallel operations on large datasets.

```scala
abstract class RDD[T: ClassTag] {
  def map[U: ClassTag](f: T => U): RDD[U]
  def filter(f: T => Boolean): RDD[T]
  def flatMap[U: ClassTag](f: T => IterableOnce[U]): RDD[U]
  def collect(): Array[T]
  def count(): Long
  def reduce(f: (T, T) => T): T
  def persist(newLevel: StorageLevel): RDD[T]
  def persist(): RDD[T]
  def cache(): RDD[T]
}
```
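A short sketch of chaining transformations and actions, assuming an active SparkContext named `sc`:

```scala
val lines = sc.parallelize(Seq("spark core", "rdd api", "spark rdd"))

// Transformations are lazy: flatMap splits lines into words, filter keeps matches
val sparkWords = lines.flatMap(_.split(" ")).filter(_.contains("spark"))

// cache() keeps the result in memory so both actions below reuse it
sparkWords.cache()

println(sparkWords.count())              // action: number of matching words
println(sparkWords.reduce(_ + " " + _))  // action: concatenate them on the driver
```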
Java-friendly wrappers for Spark functionality providing type-safe distributed processing for Java applications.

```java
public class JavaSparkContext {
  public <T> JavaRDD<T> parallelize(java.util.List<T> list)
  public <T> JavaRDD<T> parallelize(java.util.List<T> list, int numSlices)
  public JavaRDD<String> textFile(String path)
  public void stop()
}

public class JavaRDD<T> {
  public <R> JavaRDD<R> map(org.apache.spark.api.java.function.Function<T, R> f)
  public JavaRDD<T> filter(org.apache.spark.api.java.function.Function<T, Boolean> f)
  public java.util.List<T> collect()
}
```

Data storage levels and caching mechanisms for optimizing repeated access to RDDs with configurable memory and disk usage.

```scala
class StorageLevel(
  useDisk: Boolean,
  useMemory: Boolean,
  useOffHeap: Boolean,
  deserialized: Boolean,
  replication: Int
)

object StorageLevel {
  val MEMORY_ONLY: StorageLevel
  val MEMORY_AND_DISK: StorageLevel
  val DISK_ONLY: StorageLevel
}
```
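For example, a sketch of persisting an RDD with spill-to-disk, assuming an active SparkContext named `sc`; the path is a placeholder and `unpersist` is an RDD method not shown in the listing above:

```scala
import org.apache.spark.storage.StorageLevel

val big = sc.textFile("big-input.txt")                    // placeholder input path
val persisted = big.persist(StorageLevel.MEMORY_AND_DISK)

// Both actions reuse the persisted partitions instead of re-reading the file
println(persisted.count())
println(persisted.filter(_.nonEmpty).count())

persisted.unpersist()   // releases the cached blocks when no longer needed
```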
Efficient read-only variable distribution to all cluster nodes for sharing large datasets or lookup tables across tasks.

```scala
abstract class Broadcast[T: ClassTag] {
  def value: T
  def unpersist(blocking: Boolean = false): Unit
  def destroy(): Unit
}
```
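A sketch of broadcasting a small lookup table so each node receives it once instead of once per task, assuming an active SparkContext named `sc`:

```scala
// Small lookup table shipped to every node once
val countryNames = sc.broadcast(Map("DE" -> "Germany", "FR" -> "France"))

val codes = sc.parallelize(Seq("DE", "FR", "DE"))
val resolved = codes.map(code => countryNames.value.getOrElse(code, "unknown"))

println(resolved.collect().mkString(", "))   // Germany, France, Germany

countryNames.destroy()   // free the broadcast data on driver and executors
```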
Shared variables for collecting information from distributed tasks, supporting aggregation patterns like counters and sums.

```scala
abstract class AccumulatorV2[IN, OUT] {
  def add(v: IN): Unit
  def value: OUT
  def reset(): Unit
  def copy(): AccumulatorV2[IN, OUT]
}

class LongAccumulator extends AccumulatorV2[java.lang.Long, java.lang.Long]
class DoubleAccumulator extends AccumulatorV2[java.lang.Double, java.lang.Double]
```
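A sketch of counting malformed records with a `LongAccumulator`, assuming an active SparkContext named `sc` and the `SparkContext.longAccumulator` convenience factory (not shown in the listing above):

```scala
// longAccumulator registers a named LongAccumulator with the context
val badRecords = sc.longAccumulator("badRecords")

val records = sc.parallelize(Seq("1", "2", "oops", "4"))
val parsed = records.flatMap { s =>
  try Some(s.toInt)
  catch { case _: NumberFormatException => badRecords.add(1L); None }
}

parsed.count()              // accumulators are only updated when an action runs
println(badRecords.value)   // 1
```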
Data partitioning strategies for controlling how RDD elements are distributed across cluster nodes to optimize performance.

```scala
abstract class Partitioner extends Serializable {
  def numPartitions: Int
  def getPartition(key: Any): Int
}

class HashPartitioner(partitions: Int) extends Partitioner

class RangePartitioner[K: Ordering: ClassTag, V](
  partitions: Int,
  rdd: RDD[_ <: Product2[K, V]]
) extends Partitioner
```
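A sketch of applying a `HashPartitioner` through `partitionBy`, a pair-RDD operation not shown in the listing above, assuming an active SparkContext named `sc`:

```scala
import org.apache.spark.HashPartitioner

val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

// All values for the same key end up in the same partition
val partitioned = pairs.partitionBy(new HashPartitioner(4))

println(partitioned.partitioner)   // Some(org.apache.spark.HashPartitioner@...)
```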
Serialization frameworks for efficient data transfer and storage with support for Java serialization and Kryo.

```scala
abstract class Serializer {
  def newInstance(): SerializerInstance
}

abstract class SerializerInstance {
  def serialize[T: ClassTag](t: T): ByteBuffer
  def deserialize[T: ClassTag](bytes: ByteBuffer): T
}

class JavaSerializer(conf: SparkConf) extends Serializer
class KryoSerializer(conf: SparkConf) extends Serializer
```
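In practice the serializer is usually selected through configuration rather than instantiated directly; a sketch using the standard `spark.serializer` key and `SparkConf.registerKryoClasses` (not shown in the listing above), with a hypothetical application class:

```scala
import org.apache.spark.SparkConf

case class MyRecord(id: Long, name: String)   // hypothetical application class

// Enable Kryo and register classes so Kryo avoids writing full class names
val conf = new SparkConf()
  .setAppName("KryoExample")
  .setMaster("local[*]")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .registerKryoClasses(Array(classOf[MyRecord]))
```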
Supporting types used throughout the API:

```scala
trait ClassTag[T]    // scala.reflect.ClassTag
trait Ordering[T]    // scala.math.Ordering

abstract class TaskContext {
  def stageId(): Int
  def stageAttemptNumber(): Int
  def partitionId(): Int
  def taskAttemptId(): Long
}
sealed trait TaskEndReason
case object Success extends TaskEndReason
case class ExceptionFailure(
  className: String,
  description: String,
  stackTrace: Array[StackTraceElement]
) extends TaskEndReason
```
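A sketch of inspecting the running task from inside a closure, assuming an active SparkContext named `sc` and the `TaskContext.get()` companion accessor (not shown in the listing above):

```scala
import org.apache.spark.TaskContext

// Tag each element with the id of the partition that processed it
val tagged = sc.parallelize(1 to 8, numSlices = 4).map { x =>
  (TaskContext.get().partitionId(), x)
}

tagged.collect().foreach(println)   // (partitionId, value) pairs
```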