Apache Spark Core - The foundational distributed computing engine for Apache Spark that provides RDD abstractions, task scheduling, memory management, and cluster execution capabilities.
npx @tessl/cli install tessl/maven-org-apache-spark--spark-core_2-11@1.6.0

Apache Spark Core provides the foundational distributed computing engine for Apache Spark. It implements the Resilient Distributed Dataset (RDD) programming model, task scheduling, memory management, and support for multiple cluster managers (standalone, YARN, and Mesos). The core engine enables fault-tolerant parallel operations on large datasets across distributed clusters.
org.apache.spark:spark-core_2.11:1.6.3

Scala:
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

Java:
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.SparkConf;

Scala:
import org.apache.spark.{SparkContext, SparkConf}
// Create Spark configuration
val conf = new SparkConf()
.setAppName("MySparkApp")
.setMaster("local[*]")
// Create Spark context
val sc = new SparkContext(conf)
// Create RDD from collection
val data = sc.parallelize(1 to 10)
// Transform and collect results
val result = data
.map(_ * 2)
.filter(_ > 10)
.collect()
// Stop the context
sc.stop()

Java:
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.SparkConf;
import java.util.Arrays;
import java.util.List;
// Create configuration
SparkConf conf = new SparkConf()
.setAppName("MyJavaSparkApp")
.setMaster("local[*]");
// Create Java Spark context
JavaSparkContext sc = new JavaSparkContext(conf);
// Create RDD and perform operations
JavaRDD<Integer> data = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
JavaRDD<Integer> result = data
.map(x -> x * 2)
.filter(x -> x > 5);
List<Integer> collected = result.collect();
sc.stop();

Apache Spark Core is built around several key components:
Core functionality for creating and managing Spark applications, including cluster connections, resource allocation, and application lifecycle management.
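For example, a driver program can create a context from a SparkConf and load data from the file system as well as from in-memory collections. A minimal sketch in Scala; the paths data/input.txt and data/docs/ are placeholders:

import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("FileIngest").setMaster("local[*]"))

// One RDD element per line of the file (placeholder path)
val lines = sc.textFile("data/input.txt")

// Whole small files as (path, content) pairs (placeholder directory)
val documents = sc.wholeTextFiles("data/docs/")

println(lines.count() + " lines, " + documents.count() + " documents")
sc.stop()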
class SparkContext(config: SparkConf) extends Logging {
def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T]
def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String]
def wholeTextFiles(path: String, minPartitions: Int = defaultMinPartitions): RDD[(String, String)]
def stop(): Unit
def broadcast[T: ClassTag](value: T): Broadcast[T]
def accumulator[T](initialValue: T)(implicit param: AccumulatorParam[T]): Accumulator[T]
}
class SparkConf(loadDefaults: Boolean = true) extends Cloneable {
def set(key: String, value: String): SparkConf
def setMaster(master: String): SparkConf
def setAppName(name: String): SparkConf
def get(key: String): String
def get(key: String, defaultValue: String): String
}

The core RDD API providing transformations and actions for distributed data processing, including map, filter, and reduce operations as well as advanced transformations such as joins and aggregations.
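For example, transformations such as map and filter build a new RDD lazily, while actions such as count and collect trigger a job and return results to the driver. A minimal sketch in Scala, assuming an existing SparkContext sc:

val words = sc.parallelize(Seq("spark", "rdd", "transformation", "action"))
val longWords = words
  .map(_.toUpperCase)                  // transformation: recorded, not yet executed
  .filter(_.length > 5)                // transformation: recorded, not yet executed
println(longWords.count())             // action: runs the job and returns 2
longWords.collect().foreach(println)   // action: brings the results to the driver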
abstract class RDD[T: ClassTag] extends Serializable {
// Transformations
def map[U: ClassTag](f: T => U): RDD[U]
def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U]
def filter(f: T => Boolean): RDD[T]
def distinct(numPartitions: Int = partitions.length): RDD[T]
def union(other: RDD[T]): RDD[T]
// Actions
def collect(): Array[T]
def count(): Long
def first(): T
def take(num: Int): Array[T]
def reduce(f: (T, T) => T): T
def foreach(f: T => Unit): Unit
// Persistence
def cache(): RDD.this.type
def persist(newLevel: StorageLevel): RDD.this.type
}

Advanced operations for key-value pair RDDs, including grouping, joining, and aggregation operations essential for data processing workflows.
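These operations become available on any RDD of key-value tuples through an implicit conversion. A minimal word-count and join sketch in Scala, assuming an existing SparkContext sc:

// Word count: map to (word, 1) pairs, then sum the counts per key
val counts = sc.parallelize(Seq("a", "b", "a", "c", "a"))
  .map(word => (word, 1))
  .reduceByKey(_ + _)

// Join two pair RDDs on their keys
val prices = sc.parallelize(Seq(("apple", 1.0), ("pear", 2.0)))
val stock  = sc.parallelize(Seq(("apple", 10), ("pear", 0)))
val joined = prices.join(stock)   // RDD[(String, (Double, Int))]

counts.sortByKey().collect().foreach(println)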
class PairRDDFunctions[K, V](self: RDD[(K, V)]) {
def groupByKey(): RDD[(K, Iterable[V])]
def reduceByKey(func: (V, V) => V): RDD[(K, V)]
def aggregateByKey[U: ClassTag](zeroValue: U)(seqOp: (U, V) => U, combOp: (U, U) => U): RDD[(K, U)]
def join[W](other: RDD[(K, W)]): RDD[(K, (V, W))]
def leftOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))]
def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))]
def sortByKey(ascending: Boolean = true): RDD[(K, V)]
}

Java-friendly wrappers providing the full Spark Core functionality through Java-compatible interfaces, lambda support, and familiar Java collection types.
public class JavaSparkContext implements Closeable {
public JavaSparkContext(SparkConf conf)
public <T> JavaRDD<T> parallelize(List<T> list)
public <T> JavaRDD<T> parallelize(List<T> list, int numSlices)
public JavaRDD<String> textFile(String path)
public <T> Broadcast<T> broadcast(T value)
public void stop()
}
public class JavaRDD<T> extends AbstractJavaRDDLike<T, JavaRDD<T>> {
public <R> JavaRDD<R> map(Function<T, R> f)
public <R> JavaRDD<R> flatMap(FlatMapFunction<T, R> f)
public JavaRDD<T> filter(Function<T, Boolean> f)
public List<T> collect()
public long count()
public T first()
}

Memory management and persistence strategies for optimizing RDD storage across cluster nodes, including various storage levels and caching mechanisms.
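An RDD that is reused across several actions can be persisted so it is computed once and then served from storage. A minimal sketch in Scala, assuming an existing SparkContext sc and a placeholder input path:

import org.apache.spark.storage.StorageLevel

// cache() is shorthand for persist(StorageLevel.MEMORY_ONLY)
val lines = sc.textFile("data/events.log").cache()

// Keep deserialized partitions in memory, spilling to disk if they do not fit
val parsed = lines.map(_.split(",")).persist(StorageLevel.MEMORY_AND_DISK)

parsed.count()      // first action materializes and stores the partitions
parsed.count()      // later actions reuse the persisted data
parsed.unpersist()  // release the stored blocks when no longer needed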
object StorageLevel {
val NONE: StorageLevel
val DISK_ONLY: StorageLevel
val DISK_ONLY_2: StorageLevel
val MEMORY_ONLY: StorageLevel
val MEMORY_ONLY_2: StorageLevel
val MEMORY_ONLY_SER: StorageLevel
val MEMORY_AND_DISK: StorageLevel
val MEMORY_AND_DISK_2: StorageLevel
val MEMORY_AND_DISK_SER: StorageLevel
}

Distributed variable support for efficiently sharing read-only data across tasks (broadcast variables) and collecting information from executors (accumulators).
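A broadcast variable ships a read-only value to each executor once instead of with every task, while an accumulator aggregates values from tasks back to the driver. A minimal sketch in Scala, assuming an existing SparkContext sc:

// Broadcast a small lookup table so every task reuses the same copy
val lookup = sc.broadcast(Map("a" -> 1, "b" -> 2))

// Accumulator incremented on executors, readable only on the driver
val misses = sc.accumulator(0)

val resolved = sc.parallelize(Seq("a", "b", "c")).map { key =>
  val value = lookup.value.get(key)
  if (value.isEmpty) misses += 1
  value.getOrElse(-1)
}
resolved.collect()      // running the action updates the accumulator
println(misses.value)   // 1 in this run: "c" is not in the broadcast map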
abstract class Broadcast[T: ClassTag] extends Serializable {
def value: T
def unpersist(blocking: Boolean = true): Unit
def destroy(): Unit
def id: Long
}
class Accumulator[T] private[spark] (
@transient private[spark] val initialValue: T,
param: AccumulatorParam[T],
name: Option[String] = None) extends Serializable {
def value: T
def add(term: T): Unit
def += (term: T): Unit
def localValue: T
}

// Core configuration class
class SparkConf(loadDefaults: Boolean = true) extends Cloneable with Logging
// Main entry point for Spark functionality
class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationClient
// Basic distributed dataset abstraction
abstract class RDD[T: ClassTag] extends Serializable with Logging
// Storage levels for RDD persistence
class StorageLevel private(
private var _useDisk: Boolean,
private var _useMemory: Boolean,
private var _useOffHeap: Boolean,
private var _deserialized: Boolean,
private var _replication: Int = 1) extends Externalizable
// Partitioning strategies
abstract class Partitioner extends Serializable {
def numPartitions: Int
def getPartition(key: Any): Int
}
// Task execution context
abstract class TaskContext extends Serializable {
def partitionId(): Int
def stageId(): Int
def taskAttemptId(): Long
def attemptNumber(): Int
}
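As an illustration of the partitioning hook, a custom Partitioner only needs to map keys to partition indices. A minimal sketch in Scala; the first-letter scheme is purely illustrative and assumes an existing SparkContext sc:

import org.apache.spark.Partitioner

// Route keys that start with the same letter to the same partition
class FirstLetterPartitioner(override val numPartitions: Int) extends Partitioner {
  def getPartition(key: Any): Int = {
    val letter = key.toString.headOption.getOrElse('_').toLower
    math.abs(letter.hashCode) % numPartitions
  }
}

val pairs = sc.parallelize(Seq(("alpha", 1), ("beta", 2), ("avocado", 3)))
val partitioned = pairs.partitionBy(new FirstLetterPartitioner(4))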