Apache Spark Core - The foundational component of Apache Spark providing distributed computing capabilities including RDDs, transformations, actions, and cluster management.
npx @tessl/cli install tessl/maven-org-apache-spark--spark-core-2-11@2.4.0
# Apache Spark Core
1
2
Apache Spark Core is the foundational component of the Apache Spark unified analytics engine for large-scale data processing. It provides the core functionality including distributed task scheduling, memory management, fault recovery, and interactions with storage systems. The library implements resilient distributed datasets (RDDs) as the fundamental data abstraction, offering fault-tolerant collections that can be operated on in parallel across a cluster.
3
4
## Package Information
5
6
- **Package Name**: org.apache.spark:spark-core_2.11
7
- **Package Type**: Maven
8
- **Language**: Scala/Java
9
- **Version**: 2.4.8
10
- **Installation**: Add to your Maven `pom.xml`:
11
12
```xml
13
<dependency>
14
<groupId>org.apache.spark</groupId>
15
<artifactId>spark-core_2.11</artifactId>
16
<version>2.4.8</version>
17
</dependency>
18
```
19
20
For SBT (with `scalaVersion` set to a 2.11.x release, so that `%%` resolves to `spark-core_2.11`):
21
```scala
22
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.8"
23
```
24
25
## Core Imports
26
27
### Scala
28
```scala
29
import org.apache.spark.{SparkContext, SparkConf}
30
import org.apache.spark.rdd.RDD
31
import org.apache.spark.broadcast.Broadcast
32
import org.apache.spark.util.AccumulatorV2
33
import org.apache.spark.storage.StorageLevel
34
```
35
36
### Java
37
```java
38
import org.apache.spark.api.java.JavaSparkContext;
39
import org.apache.spark.api.java.JavaRDD;
40
import org.apache.spark.api.java.JavaPairRDD;
41
import org.apache.spark.SparkConf;
42
import org.apache.spark.broadcast.Broadcast;
43
import org.apache.spark.storage.StorageLevel;
44
```
45
46
## Basic Usage
47
48
### Scala Example
49
```scala
50
import org.apache.spark.{SparkContext, SparkConf}
51
52
// Create Spark configuration
53
val conf = new SparkConf()
54
.setAppName("MySparkApp")
55
.setMaster("local[*]")
56
57
// Create Spark context
58
val sc = new SparkContext(conf)
59
60
// Create RDD from a collection
61
val data = sc.parallelize(Seq(1, 2, 3, 4, 5))
62
63
// Transform and action
64
val result = data
65
.map(_ * 2)
66
.filter(_ > 4)
67
.collect()
68
69
// Clean up
70
sc.stop()
71
```
72
73
### Java Example
74
```java
75
import org.apache.spark.api.java.JavaSparkContext;
76
import org.apache.spark.SparkConf;
77
import org.apache.spark.api.java.JavaRDD;
import java.util.Arrays;
import java.util.List;
78
79
// Create configuration
80
SparkConf conf = new SparkConf()
81
.setAppName("MySparkApp")
82
.setMaster("local[*]");
83
84
// Create context
85
JavaSparkContext jsc = new JavaSparkContext(conf);
86
87
// Create RDD
88
JavaRDD<Integer> data = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
89
90
// Transform and collect
91
List<Integer> result = data
92
.map(x -> x * 2)
93
.filter(x -> x > 4)
94
.collect();
95
96
// Clean up
97
jsc.stop();
98
```
99
100
## Architecture
101
102
Spark Core is built around several key abstractions:
103
104
- **SparkContext**: The main entry point that coordinates distributed data processing
105
- **RDD (Resilient Distributed Dataset)**: Immutable, fault-tolerant distributed collections
106
- **Transformations**: Lazy operations that define new RDDs (map, filter, join, etc.)
107
- **Actions**: Operations that trigger computation and return results or write output (collect, count, saveAsTextFile, etc.)
108
- **Broadcast Variables**: Read-only variables cached across all nodes
109
- **Accumulators**: Variables for aggregating information across tasks
110
111
## Capabilities
112
113
### Core Context and Configuration
114
115
The primary entry points for configuring and initializing Spark applications.
116
117
```scala { .api }
118
class SparkConf() {
119
def set(key: String, value: String): SparkConf
120
def setMaster(master: String): SparkConf
121
def setAppName(name: String): SparkConf
122
def get(key: String): String
123
def get(key: String, defaultValue: String): String
124
}
125
126
class SparkContext(config: SparkConf) {
127
def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T]
128
def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String]
129
def stop(): Unit
130
def broadcast[T: ClassTag](value: T): Broadcast[T]
131
def longAccumulator: LongAccumulator
132
def doubleAccumulator: DoubleAccumulator
133
}
134
```
135
136
[Context and Configuration](./context-configuration.md)
137
138
### RDD Operations and Transformations
139
140
The core distributed data abstraction with comprehensive transformation and action operations.
141
142
```scala { .api }
143
abstract class RDD[T: ClassTag] {
144
// Transformations
145
def map[U: ClassTag](f: T => U): RDD[U]
146
def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U]
147
def filter(f: T => Boolean): RDD[T]
148
def distinct(numPartitions: Int = partitions.length): RDD[T]
149
def union(other: RDD[T]): RDD[T]
150
def intersection(other: RDD[T]): RDD[T]
151
152
// Actions
153
def collect(): Array[T]
154
def count(): Long
155
def first(): T
156
def take(num: Int): Array[T]
157
def reduce(f: (T, T) => T): T
158
def foreach(f: T => Unit): Unit
159
160
// Persistence
161
def persist(newLevel: StorageLevel = StorageLevel.MEMORY_ONLY): this.type
162
def cache(): this.type
163
def unpersist(blocking: Boolean = true): this.type
164
}
165
```
166
167
[RDD Operations](./rdd-operations.md)
168
169
### Key-Value Pair Operations
170
171
Specialized operations available on RDDs of key-value pairs for aggregation and joining.
172
173
```scala { .api }
174
class PairRDDFunctions[K, V](self: RDD[(K, V)]) {
175
def keys: RDD[K]
176
def values: RDD[V]
177
def groupByKey(): RDD[(K, Iterable[V])]
178
def reduceByKey(func: (V, V) => V): RDD[(K, V)]
179
def aggregateByKey[U: ClassTag](zeroValue: U)(seqOp: (U, V) => U, combOp: (U, U) => U): RDD[(K, U)]
180
def join[W](other: RDD[(K, W)]): RDD[(K, (V, W))]
181
def leftOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))]
182
def cogroup[W](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))]
183
def sortByKey(ascending: Boolean = true): RDD[(K, V)] // provided by OrderedRDDFunctions; requires an implicit Ordering[K]
184
}
185
```
186
187
[Key-Value Operations](./key-value-operations.md)
188
189
### Java API
190
191
Java-friendly wrappers that provide type-safe operations and integrate with Java collections.
192
193
```java { .api }
194
public class JavaSparkContext {
195
public <T> JavaRDD<T> parallelize(List<T> list);
196
public JavaRDD<String> textFile(String path);
197
public <T> Broadcast<T> broadcast(T value);
198
public void stop();
199
}
200
201
public class JavaRDD<T> {
202
public <R> JavaRDD<R> map(Function<T, R> f);
203
public <U> JavaRDD<U> flatMap(FlatMapFunction<T, U> f);
204
public JavaRDD<T> filter(Function<T, Boolean> f);
205
public List<T> collect();
206
public long count();
207
public T first();
208
}
209
210
public class JavaPairRDD<K, V> {
211
public JavaRDD<K> keys();
212
public JavaRDD<V> values();
213
public JavaPairRDD<K, Iterable<V>> groupByKey();
214
public JavaPairRDD<K, V> reduceByKey(Function2<V, V, V> func);
215
public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other);
216
}
217
```
218
219
[Java API](./java-api.md)
220
221
### Broadcast Variables and Accumulators
222
223
Shared variables for efficient data distribution and aggregation across cluster nodes.
224
225
```scala { .api }
226
abstract class Broadcast[T] {
227
def value: T
228
def unpersist(): Unit
229
def unpersist(blocking: Boolean): Unit
230
def destroy(): Unit
231
def id: Long
232
}
233
234
abstract class AccumulatorV2[IN, OUT] {
235
def isZero: Boolean
236
def copy(): AccumulatorV2[IN, OUT]
237
def reset(): Unit
238
def add(v: IN): Unit
239
def merge(other: AccumulatorV2[IN, OUT]): Unit
240
def value: OUT
241
}
242
243
class LongAccumulator extends AccumulatorV2[java.lang.Long, java.lang.Long] {
244
def add(v: Long): Unit
245
def add(v: java.lang.Long): Unit
246
def sum: Long
247
def count: Long
248
def avg: Double
249
}
250
```
251
252
[Broadcast and Accumulators](./broadcast-accumulators.md)
253
254
### Storage and Persistence
255
256
Fine-grained control over RDD caching and persistence strategies across memory and disk.
257
258
```scala { .api }
259
object StorageLevel {
260
val NONE: StorageLevel
261
val DISK_ONLY: StorageLevel
262
val DISK_ONLY_2: StorageLevel
263
val MEMORY_ONLY: StorageLevel
264
val MEMORY_ONLY_2: StorageLevel
265
val MEMORY_ONLY_SER: StorageLevel
266
val MEMORY_ONLY_SER_2: StorageLevel
267
val MEMORY_AND_DISK: StorageLevel
268
val MEMORY_AND_DISK_2: StorageLevel
269
val MEMORY_AND_DISK_SER: StorageLevel
270
val MEMORY_AND_DISK_SER_2: StorageLevel
271
val OFF_HEAP: StorageLevel
272
}
273
274
class StorageLevel {
275
def useDisk: Boolean
276
def useMemory: Boolean
277
def useOffHeap: Boolean
278
def deserialized: Boolean
279
def replication: Int
280
}
281
```
282
283
[Storage and Persistence](./storage-persistence.md)
284
285
### Task Context and Execution Environment
286
287
Runtime information and control for tasks executing on cluster nodes.
288
289
```scala { .api }
290
abstract class TaskContext {
291
def isCompleted(): Boolean
292
def isInterrupted(): Boolean
293
def stageId(): Int
294
def partitionId(): Int
295
def attemptNumber(): Int
296
def taskAttemptId(): Long
297
def addTaskCompletionListener(listener: TaskCompletionListener): TaskContext
298
def addTaskFailureListener(listener: TaskFailureListener): TaskContext
299
def getLocalProperty(key: String): String
300
}
301
302
object TaskContext {
303
def get(): TaskContext
304
def getPartitionId(): Int
305
}
306
```
307
308
[Task Context](./task-context.md)
309
310
### Status Tracking and Monitoring
311
312
APIs for monitoring job and stage progress, executor status, and application metrics.
313
314
```scala { .api }
315
class SparkStatusTracker {
316
def getJobIdsForGroup(jobGroup: String): Array[Int]
317
def getActiveStageIds(): Array[Int]
318
def getActiveJobIds(): Array[Int]
319
def getJobInfo(jobId: Int): Option[SparkJobInfo]
320
def getStageInfo(stageId: Int): Option[SparkStageInfo]
321
def getExecutorInfos: Array[SparkExecutorInfo]
322
}
323
324
class SparkJobInfo {
325
def jobId(): Int
326
def stageIds(): Array[Int]
327
def status(): JobExecutionStatus
328
}
329
330
class SparkStageInfo {
331
def stageId(): Int
332
def name(): String
333
def numTasks(): Int
334
def numActiveTasks(): Int
335
def numCompleteTasks(): Int
336
def numFailedTasks(): Int
337
}
338
```
339
340
[Status and Monitoring](./status-monitoring.md)
341
342
## Common Types
343
344
```scala { .api }
345
// Core type constraints
346
type ClassTag[T] = scala.reflect.ClassTag[T]
347
348
// Partitioning
349
abstract class Partitioner {
350
def numPartitions: Int
351
def getPartition(key: Any): Int
352
}
353
354
class HashPartitioner(partitions: Int) extends Partitioner
355
class RangePartitioner[K: Ordering: ClassTag, V](
356
partitions: Int,
357
rdd: RDD[_ <: Product2[K, V]]
358
) extends Partitioner
359
360
// Function types for Java API
361
@FunctionalInterface
362
trait Function[T1, R] extends Serializable {
363
def call(v1: T1): R
364
}
365
366
@FunctionalInterface
367
trait Function2[T1, T2, R] extends Serializable {
368
def call(v1: T1, v2: T2): R
369
}
370
371
@FunctionalInterface
372
trait VoidFunction[T] extends Serializable {
373
def call(t: T): Unit
374
}
375
376
@FunctionalInterface
377
trait FlatMapFunction[T, R] extends Serializable {
378
def call(t: T): java.util.Iterator[R]
379
}
380
381
@FunctionalInterface
382
trait PairFunction[T, K, V] extends Serializable {
383
def call(t: T): Tuple2[K, V]
384
}
385
386
// Exception types
387
class SparkException(message: String, cause: Throwable) extends Exception
388
class TaskKilledException(reason: String) extends RuntimeException
389
class TaskNotSerializableException(error: Throwable) extends Exception
390
```