Apache Spark Core - The foundational distributed computing engine for Apache Spark that provides RDD abstractions, task scheduling, memory management, and cluster execution capabilities.
npx @tessl/cli install tessl/maven-org-apache-spark--spark-core_2-11@1.6.0
# Apache Spark Core
1
2
Apache Spark Core provides the foundational distributed computing engine for Apache Spark. It implements the Resilient Distributed Dataset (RDD) programming model, sophisticated task scheduling, advanced memory management, and comprehensive support for multiple cluster managers. The core engine enables fault-tolerant parallel operations on large datasets across distributed clusters.
3
4
## Package Information
5
6
- **Package Name**: spark-core_2.11
7
- **Package Type**: maven
8
- **Language**: Scala (with Java API)
9
- **Version**: 1.6.3
10
- **Installation**: Add to Maven/SBT dependencies: `org.apache.spark:spark-core_2.11:1.6.3`
11
12
## Core Imports
13
14
**Scala:**
15
```scala
16
import org.apache.spark.{SparkContext, SparkConf}
17
import org.apache.spark.rdd.RDD
18
import org.apache.spark.storage.StorageLevel
19
```
20
21
**Java:**
22
```java
23
import org.apache.spark.api.java.JavaSparkContext;
24
import org.apache.spark.api.java.JavaRDD;
25
import org.apache.spark.api.java.JavaPairRDD;
26
import org.apache.spark.SparkConf;
27
```
28
29
## Basic Usage
30
31
**Scala:**
32
```scala
import org.apache.spark.{SparkContext, SparkConf}

// Create Spark configuration
val conf = new SparkConf()
  .setAppName("MySparkApp")
  .setMaster("local[*]")

// Create Spark context
val sc = new SparkContext(conf)

// Create RDD from collection
val data = sc.parallelize(1 to 10)

// Transform and collect results
val result = data
  .map(_ * 2)
  .filter(_ > 10)
  .collect()
// result: Array(12, 14, 16, 18, 20)

// Stop the context
sc.stop()
```
55
56
**Java:**
57
```java
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.SparkConf;

import java.util.Arrays;
import java.util.List;

// Create configuration
SparkConf conf = new SparkConf()
    .setAppName("MyJavaSparkApp")
    .setMaster("local[*]");

// Create Java Spark context
JavaSparkContext sc = new JavaSparkContext(conf);

// Create RDD and perform operations
JavaRDD<Integer> data = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
JavaRDD<Integer> result = data
    .map(x -> x * 2)
    .filter(x -> x > 5);

List<Integer> collected = result.collect();
// collected: [6, 8, 10]
sc.stop();
```
78
79
## Architecture
80
81
Apache Spark Core is built around several key components:
82
83
- **SparkContext**: The main entry point that coordinates all Spark operations and manages the connection to the cluster
84
- **RDD (Resilient Distributed Dataset)**: The fundamental data abstraction representing an immutable, partitioned collection that can be operated on in parallel
85
- **Task Scheduler**: Sophisticated scheduling system that optimizes job execution across cluster resources with data locality awareness
86
- **Memory Management**: Advanced caching and storage system with configurable storage levels and automatic spill-to-disk capabilities
87
- **Cluster Managers**: Support for multiple cluster managers including Standalone, YARN, and Mesos
88
- **Fault Tolerance**: Automatic recovery from node failures through RDD lineage and checkpointing
89
90
## Capabilities
91
92
### Spark Context Management
93
94
Core functionality for creating and managing Spark applications, including cluster connections, resource allocation, and application lifecycle management.
95
96
```scala { .api }
97
class SparkContext(config: SparkConf) extends Logging {
98
def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T]
99
def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String]
100
def wholeTextFiles(path: String, minPartitions: Int = defaultMinPartitions): RDD[(String, String)]
101
def stop(): Unit
102
def broadcast[T: ClassTag](value: T): Broadcast[T]
103
def accumulator[T](initialValue: T)(implicit param: AccumulatorParam[T]): Accumulator[T]
104
}
105
106
class SparkConf(loadDefaults: Boolean = true) extends Cloneable {
107
def set(key: String, value: String): SparkConf
108
def setMaster(master: String): SparkConf
109
def setAppName(name: String): SparkConf
110
def get(key: String): String
111
def get(key: String, defaultValue: String): String
112
}
113
```
114
115
[Context Management](./context-management.md)
116
117
### RDD Operations
118
119
The core RDD API providing transformations and actions for distributed data processing, including map, filter, reduce operations and advanced transformations like joins and aggregations.
120
121
```scala { .api }
122
abstract class RDD[T: ClassTag] extends Serializable {
123
// Transformations
124
def map[U: ClassTag](f: T => U): RDD[U]
125
def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U]
126
def filter(f: T => Boolean): RDD[T]
127
def distinct(numPartitions: Int = partitions.length): RDD[T]
128
def union(other: RDD[T]): RDD[T]
129
130
// Actions
131
def collect(): Array[T]
132
def count(): Long
133
def first(): T
134
def take(num: Int): Array[T]
135
def reduce(f: (T, T) => T): T
136
def foreach(f: T => Unit): Unit
137
138
// Persistence
139
def cache(): RDD.this.type
140
def persist(newLevel: StorageLevel): RDD.this.type
141
}
142
```
143
144
[RDD Operations](./rdd-operations.md)
145
146
### Pair RDD Operations
147
148
Advanced operations for key-value pair RDDs including grouping, joining, and aggregation operations essential for data processing workflows.
149
150
```scala { .api }
class PairRDDFunctions[K, V](self: RDD[(K, V)]) {
  def groupByKey(): RDD[(K, Iterable[V])]
  def reduceByKey(func: (V, V) => V): RDD[(K, V)]
  def aggregateByKey[U: ClassTag](zeroValue: U)(seqOp: (U, V) => U, combOp: (U, U) => U): RDD[(K, U)]
  def join[W](other: RDD[(K, W)]): RDD[(K, (V, W))]
  def leftOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))]
  def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))]
  // Note: sortByKey is supplied by OrderedRDDFunctions (applied implicitly to pair RDDs)
  // and is only available when an implicit Ordering[K] is in scope.
  def sortByKey(ascending: Boolean = true): RDD[(K, V)]
}
```
161
162
[Pair RDD Operations](./pair-rdd-operations.md)
163
164
### Java API
165
166
Java-friendly wrappers providing the complete Spark functionality through Java-compatible interfaces, lambda support, and familiar Java collection types.
167
168
```java { .api }
169
public class JavaSparkContext implements Closeable {
170
public JavaSparkContext(SparkConf conf)
171
public <T> JavaRDD<T> parallelize(List<T> list)
172
public <T> JavaRDD<T> parallelize(List<T> list, int numSlices)
173
public JavaRDD<String> textFile(String path)
174
public <T> Broadcast<T> broadcast(T value)
175
public void stop()
176
}
177
178
public class JavaRDD<T> extends AbstractJavaRDDLike<T, JavaRDD<T>> {
179
public <R> JavaRDD<R> map(Function<T, R> f)
180
public <R> JavaRDD<R> flatMap(FlatMapFunction<T, R> f)
181
public JavaRDD<T> filter(Function<T, Boolean> f)
182
public List<T> collect()
183
public long count()
184
public T first()
185
}
186
```
187
188
[Java API](./java-api.md)
189
190
### Storage and Persistence
191
192
Memory management and persistence strategies for optimizing RDD storage across cluster nodes, including various storage levels and caching mechanisms.
193
194
```scala { .api }
195
object StorageLevel {
196
val NONE: StorageLevel
197
val DISK_ONLY: StorageLevel
198
val DISK_ONLY_2: StorageLevel
199
val MEMORY_ONLY: StorageLevel
200
val MEMORY_ONLY_2: StorageLevel
201
val MEMORY_ONLY_SER: StorageLevel
202
val MEMORY_AND_DISK: StorageLevel
203
val MEMORY_AND_DISK_2: StorageLevel
204
val MEMORY_AND_DISK_SER: StorageLevel
205
}
206
```
207
208
[Storage and Persistence](./storage-persistence.md)
209
210
### Broadcast Variables and Accumulators
211
212
Distributed variable support for efficiently sharing read-only data across tasks (broadcast variables) and collecting information from executors (accumulators).
213
214
```scala { .api }
abstract class Broadcast[T: ClassTag] extends Serializable {
  def value: T
  def unpersist(): Unit                   // asynchronous: equivalent to unpersist(blocking = false)
  def unpersist(blocking: Boolean): Unit
  def destroy(): Unit
  def id: Long
}

class Accumulator[T] private[spark] (
    @transient private[spark] val initialValue: T,
    param: AccumulatorParam[T],
    name: Option[String] = None) extends Serializable {
  def value: T
  def add(term: T): Unit
  def += (term: T): Unit
  def localValue: T
}
```
232
233
[Broadcast and Accumulators](./broadcast-accumulators.md)
234
235
## Types
236
237
```scala { .api }
238
// Core configuration class
239
class SparkConf(loadDefaults: Boolean = true) extends Cloneable with Logging
240
241
// Main entry point for Spark functionality
242
class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationClient
243
244
// Basic distributed dataset abstraction
245
abstract class RDD[T: ClassTag] extends Serializable with Logging
246
247
// Storage levels for RDD persistence
248
class StorageLevel private(
249
private var _useDisk: Boolean,
250
private var _useMemory: Boolean,
251
private var _useOffHeap: Boolean,
252
private var _deserialized: Boolean,
253
private var _replication: Int = 1) extends Externalizable
254
255
// Partitioning strategies
256
abstract class Partitioner extends Serializable {
257
def numPartitions: Int
258
def getPartition(key: Any): Int
259
}
260
261
// Task execution context
262
abstract class TaskContext extends Serializable {
263
def partitionId(): Int
264
def stageId(): Int
265
def taskAttemptId(): Long
266
def attemptNumber(): Int
267
}
268
```