0
# Java API
1
2
Java-friendly wrappers for Spark functionality providing type-safe distributed processing and seamless integration with Java applications.
3
4
## Capabilities
5
6
### JavaSparkContext
7
8
Java-friendly version of SparkContext providing the main entry point for Java Spark applications.
9
10
```java { .api }
11
/**
12
* Java-friendly wrapper for SparkContext
13
*/
14
public class JavaSparkContext {
15
/** Create JavaSparkContext from SparkContext */
16
public JavaSparkContext(SparkContext sc)
17
18
/** Create JavaSparkContext from SparkConf */
19
public JavaSparkContext(SparkConf conf)
20
21
/** Create JavaSparkContext with master URL and app name */
22
public JavaSparkContext(String master, String appName)
23
24
/** Create RDD from Java collection */
25
public <T> JavaRDD<T> parallelize(java.util.List<T> list)
26
public <T> JavaRDD<T> parallelize(java.util.List<T> list, int numSlices)
27
28
/** Create pair RDD from Java collection */
29
public <K, V> JavaPairRDD<K, V> parallelizePairs(java.util.List<scala.Tuple2<K, V>> list)
30
public <K, V> JavaPairRDD<K, V> parallelizePairs(java.util.List<scala.Tuple2<K, V>> list, int numSlices)
31
32
/** Read text file */
33
public JavaRDD<String> textFile(String path)
34
public JavaRDD<String> textFile(String path, int minPartitions)
35
36
/** Read whole text files */
37
public JavaPairRDD<String, String> wholeTextFiles(String path)
38
public JavaPairRDD<String, String> wholeTextFiles(String path, int minPartitions)
39
40
/** Create RDD from Hadoop InputFormat */
41
public <K, V> JavaPairRDD<K, V> hadoopRDD(
42
JobConf conf,
43
Class<? extends InputFormat<K, V>> inputFormatClass,
44
Class<K> keyClass,
45
Class<V> valueClass
46
)
47
48
/** Create RDD from new Hadoop InputFormat */
49
public <K, V> JavaPairRDD<K, V> newAPIHadoopRDD(
50
Configuration conf,
51
Class<? extends NewInputFormat<K, V>> fClass,
52
Class<K> kClass,
53
Class<V> vClass
54
)
55
56
/** Create broadcast variable */
57
public <T> Broadcast<T> broadcast(T value)
58
59
/** Create accumulators (long, double, or collection; optionally named) */
60
public LongAccumulator longAccumulator()
61
public LongAccumulator longAccumulator(String name)
62
public DoubleAccumulator doubleAccumulator()
63
public DoubleAccumulator doubleAccumulator(String name)
64
public <T> CollectionAccumulator<T> collectionAccumulator()
65
public <T> CollectionAccumulator<T> collectionAccumulator(String name)
66
67
/** Add file to Spark job */
68
public void addFile(String path)
69
public void addFile(String path, boolean recursive)
70
71
/** Add JAR file */
72
public void addJar(String path)
73
74
/** Set checkpoint directory */
75
public void setCheckpointDir(String dir)
76
77
/** Get underlying SparkContext */
78
public SparkContext sc()
79
80
/** Get status tracker */
81
public JavaSparkStatusTracker statusTracker()
82
83
/** Stop JavaSparkContext */
84
public void stop()
85
86
/** Close JavaSparkContext (same as stop) */
87
public void close()
88
}
89
```
90
91
### JavaRDD
92
93
Java-friendly wrapper for RDD providing type-safe distributed operations.
94
95
```java { .api }
96
/**
97
* Java-friendly wrapper for RDD
98
*/
99
public class JavaRDD<T> {
100
// Transformations
101
102
/** Transform each element */
103
public <R> JavaRDD<R> map(org.apache.spark.api.java.function.Function<T, R> f)
104
105
/** Transform and flatten */
106
public <R> JavaRDD<R> flatMap(org.apache.spark.api.java.function.FlatMapFunction<T, R> f)
107
108
/** Filter elements */
109
public JavaRDD<T> filter(org.apache.spark.api.java.function.Function<T, Boolean> f)
110
111
/** Map with partition index */
112
public <R> JavaRDD<R> mapPartitionsWithIndex(
113
org.apache.spark.api.java.function.Function2<Integer, java.util.Iterator<T>, java.util.Iterator<R>> f,
114
boolean preservesPartitioning
115
)
116
117
/** Sample elements */
118
public JavaRDD<T> sample(boolean withReplacement, double fraction)
119
public JavaRDD<T> sample(boolean withReplacement, double fraction, long seed)
120
121
/** Union with another RDD */
122
public JavaRDD<T> union(JavaRDD<T> other)
123
124
/** Intersection with another RDD */
125
public JavaRDD<T> intersection(JavaRDD<T> other)
126
127
/** Get distinct elements */
128
public JavaRDD<T> distinct()
129
public JavaRDD<T> distinct(int numPartitions)
130
131
/** Group by key function */
132
public <K> JavaPairRDD<K, Iterable<T>> groupBy(Function<T, K> f)
133
134
/** Coalesce partitions */
135
public JavaRDD<T> coalesce(int numPartitions)
136
public JavaRDD<T> coalesce(int numPartitions, boolean shuffle)
137
138
/** Repartition */
139
public JavaRDD<T> repartition(int numPartitions)
140
141
/** Sort by key function */
142
public <S> JavaRDD<T> sortBy(Function<T, S> f, boolean ascending, int numPartitions)
143
144
/** Zip with another RDD */
145
public <U> JavaPairRDD<T, U> zip(JavaRDD<U> other)
146
147
/** Zip with indices */
148
public JavaPairRDD<T, Long> zipWithIndex()
149
150
/** Zip with unique IDs */
151
public JavaPairRDD<T, Long> zipWithUniqueId()
152
153
/** Map to pair RDD */
154
public <K, V> JavaPairRDD<K, V> mapToPair(PairFunction<T, K, V> f)
155
156
// Actions
157
158
/** Collect all elements */
159
public List<T> collect()
160
161
/** Count elements */
162
public long count()
163
164
/** Get first element */
165
public T first()
166
167
/** Take first n elements */
168
public List<T> take(int num)
169
170
/** Take ordered elements */
171
public List<T> takeOrdered(int num)
172
public List<T> takeOrdered(int num, Comparator<T> comp)
173
174
/** Take random sample */
175
public List<T> takeSample(boolean withReplacement, int num)
176
public List<T> takeSample(boolean withReplacement, int num, long seed)
177
178
/** Reduce elements */
179
public T reduce(org.apache.spark.api.java.function.Function2<T, T, T> f)
180
181
/** Fold with zero value */
182
public T fold(T zeroValue, org.apache.spark.api.java.function.Function2<T, T, T> op)
183
184
/** Aggregate with different types */
185
public <U> U aggregate(U zeroValue, org.apache.spark.api.java.function.Function2<U, T, U> seqOp, org.apache.spark.api.java.function.Function2<U, U, U> combOp)
186
187
/** Tree reduce */
188
public T treeReduce(org.apache.spark.api.java.function.Function2<T, T, T> f)
189
190
/** Tree aggregate */
191
public <U> U treeAggregate(
192
U zeroValue,
193
org.apache.spark.api.java.function.Function2<U, T, U> seqOp,
194
org.apache.spark.api.java.function.Function2<U, U, U> combOp,
195
int depth
196
)
197
198
/** Apply function to each element */
199
public void foreach(VoidFunction<T> f)
200
201
/** Apply function to each partition */
202
public void foreachPartition(VoidFunction<Iterator<T>> f)
203
204
/** Count by value */
205
public Map<T, Long> countByValue()
206
207
/** Save as text file */
208
public void saveAsTextFile(String path)
209
public void saveAsTextFile(String path, Class<? extends CompressionCodec> codec)
210
211
// Persistence
212
213
/** Persist with storage level */
214
public JavaRDD<T> persist(StorageLevel newLevel)
215
216
/** Cache in memory */
217
public JavaRDD<T> cache()
218
219
/** Unpersist */
220
public JavaRDD<T> unpersist()
221
public JavaRDD<T> unpersist(boolean blocking)
222
223
/** Checkpoint */
224
public void checkpoint()
225
226
/** Check if empty */
227
public boolean isEmpty()
228
229
// Metadata
230
231
/** Get partitions */
232
public List<Partition> partitions()
233
234
/** Get storage level */
235
public StorageLevel getStorageLevel()
236
237
/** Convert to Scala RDD */
238
public RDD<T> rdd()
239
}
240
```
241
242
### JavaPairRDD
243
244
Java-friendly wrapper for pair RDDs providing key-value operations.
245
246
```java { .api }
247
/**
248
* Java-friendly wrapper for pair RDD
249
*/
250
public class JavaPairRDD<K, V> {
251
// Transformations
252
253
/** Map values */
254
public <W> JavaPairRDD<K, W> mapValues(Function<V, W> f)
255
256
/** Flat map values */
257
public <W> JavaPairRDD<K, W> flatMapValues(Function<V, Iterable<W>> f)
258
259
/** Map to different key-value pairs */
260
public <K2, V2> JavaPairRDD<K2, V2> mapToPair(PairFunction<Tuple2<K, V>, K2, V2> f)
261
262
/** Group by key */
263
public JavaPairRDD<K, Iterable<V>> groupByKey()
264
public JavaPairRDD<K, Iterable<V>> groupByKey(int numPartitions)
265
public JavaPairRDD<K, Iterable<V>> groupByKey(Partitioner partitioner)
266
267
/** Reduce by key */
268
public JavaPairRDD<K, V> reduceByKey(org.apache.spark.api.java.function.Function2<V, V, V> func)
269
public JavaPairRDD<K, V> reduceByKey(org.apache.spark.api.java.function.Function2<V, V, V> func, int numPartitions)
270
public JavaPairRDD<K, V> reduceByKey(Partitioner partitioner, org.apache.spark.api.java.function.Function2<V, V, V> func)
271
272
/** Aggregate by key */
273
public <U> JavaPairRDD<K, U> aggregateByKey(
274
U zeroValue,
275
org.apache.spark.api.java.function.Function2<U, V, U> seqFunc,
276
org.apache.spark.api.java.function.Function2<U, U, U> combFunc
277
)
278
public <U> JavaPairRDD<K, U> aggregateByKey(
279
U zeroValue,
280
int numPartitions,
281
org.apache.spark.api.java.function.Function2<U, V, U> seqFunc,
282
org.apache.spark.api.java.function.Function2<U, U, U> combFunc
283
)
284
285
/** Fold by key */
286
public JavaPairRDD<K, V> foldByKey(V zeroValue, org.apache.spark.api.java.function.Function2<V, V, V> func)
287
public JavaPairRDD<K, V> foldByKey(V zeroValue, int numPartitions, org.apache.spark.api.java.function.Function2<V, V, V> func)
288
289
/** Combine by key */
290
public <C> JavaPairRDD<K, C> combineByKey(
291
org.apache.spark.api.java.function.Function<V, C> createCombiner,
292
org.apache.spark.api.java.function.Function2<C, V, C> mergeValue,
293
org.apache.spark.api.java.function.Function2<C, C, C> mergeCombiners
294
)
295
296
/** Join operations */
297
public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other)
298
public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other, int numPartitions)
299
public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other)
300
public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other)
301
public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other)
302
303
/** Cogroup operations */
304
public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other)
305
306
/** Sort by key */
307
public JavaPairRDD<K, V> sortByKey()
308
public JavaPairRDD<K, V> sortByKey(boolean ascending)
309
public JavaPairRDD<K, V> sortByKey(boolean ascending, int numPartitions)
310
311
/** Get keys */
312
public JavaRDD<K> keys()
313
314
/** Get values */
315
public JavaRDD<V> values()
316
317
/** Subtract by key */
318
public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other)
319
320
// Actions
321
322
/** Lookup values for key */
323
public List<V> lookup(K key)
324
325
/** Collect as map */
326
public Map<K, V> collectAsMap()
327
328
/** Count by key */
329
public Map<K, Long> countByKey()
330
331
/** Count by key approximately */
332
public PartialResult<Map<K, BoundedDouble>> countByKeyApprox(long timeout)
333
334
/** Save as Hadoop file */
335
public void saveAsHadoopFile(
336
String path,
337
Class<?> keyClass,
338
Class<?> valueClass,
339
Class<? extends OutputFormat> outputFormatClass
340
)
341
342
/** Save as new API Hadoop file */
343
public void saveAsNewAPIHadoopFile(
344
String path,
345
Class<?> keyClass,
346
Class<?> valueClass,
347
Class<? extends NewOutputFormat> outputFormatClass
348
)
349
}
350
```
351
352
### JavaDoubleRDD
353
354
Java-friendly wrapper for RDDs of doubles providing statistical operations.
355
356
```java { .api }
357
/**
358
* Java-friendly wrapper for RDD of doubles
359
*/
360
public class JavaDoubleRDD {
361
/** Compute mean */
362
public double mean()
363
364
/** Compute variance */
365
public double variance()
366
367
/** Compute standard deviation */
368
public double stdev()
369
370
/** Compute sum */
371
public double sum()
372
373
/** Compute statistics */
374
public StatCounter stats()
375
376
/** Compute histogram */
377
public Tuple2<double[], long[]> histogram(int buckets)
378
public long[] histogram(double[] buckets)
379
380
/** Sum approximately */
381
public PartialResult<BoundedDouble> sumApprox(long timeout)
382
383
/** Mean approximately */
384
public PartialResult<BoundedDouble> meanApprox(long timeout)
385
}
386
```
387
388
### Function Interfaces
389
390
Java 8 lambda-compatible functional interfaces used by the transformations and actions above.
391
392
```java { .api }
393
/** Function interface for map operations */
394
@FunctionalInterface
395
public interface Function<T, R> extends Serializable {
396
R call(T t) throws Exception;
397
}
398
399
/** Function interface for pair transformations */
400
@FunctionalInterface
401
public interface PairFunction<T, K, V> extends Serializable {
402
Tuple2<K, V> call(T t) throws Exception;
403
}
404
405
/** Function interface for flat map operations */
406
@FunctionalInterface
407
public interface FlatMapFunction<T, R> extends Serializable {
408
Iterator<R> call(T t) throws Exception;
409
}
410
411
/** Function interface for two-argument operations */
412
@FunctionalInterface
413
public interface Function2<T1, T2, R> extends Serializable {
414
R call(T1 t1, T2 t2) throws Exception;
415
}
416
417
/** Function interface for void operations */
418
@FunctionalInterface
419
public interface VoidFunction<T> extends Serializable {
420
void call(T t) throws Exception;
421
}
422
```
423
424
### Storage Levels for Java
425
426
```java { .api }
427
/**
428
* Storage level constants for Java API
429
*/
430
public class StorageLevels {
431
public static final StorageLevel MEMORY_ONLY = StorageLevel.MEMORY_ONLY();
432
public static final StorageLevel MEMORY_AND_DISK = StorageLevel.MEMORY_AND_DISK();
433
public static final StorageLevel MEMORY_ONLY_SER = StorageLevel.MEMORY_ONLY_SER();
434
public static final StorageLevel MEMORY_AND_DISK_SER = StorageLevel.MEMORY_AND_DISK_SER();
435
public static final StorageLevel DISK_ONLY = StorageLevel.DISK_ONLY();
436
public static final StorageLevel MEMORY_ONLY_2 = StorageLevel.MEMORY_ONLY_2();
437
public static final StorageLevel MEMORY_AND_DISK_2 = StorageLevel.MEMORY_AND_DISK_2();
438
}
439
```
440
441
**Usage Examples:**
442
443
```java
444
import org.apache.spark.SparkConf;
445
import org.apache.spark.api.java.JavaSparkContext;
446
import org.apache.spark.api.java.JavaRDD;
447
import org.apache.spark.api.java.JavaPairRDD;
448
import scala.Tuple2;
449
450
import java.util.Arrays;
451
import java.util.List;
452
453
// Setup
454
SparkConf conf = new SparkConf()
455
.setAppName("Java Spark Example")
456
.setMaster("local[*]");
457
JavaSparkContext sc = new JavaSparkContext(conf);
458
459
// Create RDD
460
List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
461
JavaRDD<Integer> rdd = sc.parallelize(data);
462
463
// Transformations
464
JavaRDD<Integer> squares = rdd.map(x -> x * x);
465
JavaRDD<Integer> evens = rdd.filter(x -> x % 2 == 0);
466
467
// Pair operations
468
JavaPairRDD<String, Integer> pairs = rdd.mapToPair(x -> new Tuple2<>("num", x));
469
JavaPairRDD<String, Integer> sums = pairs.reduceByKey((a, b) -> a + b);
470
471
// Actions
472
List<Integer> result = squares.collect();
473
long count = rdd.count();
474
int sum = rdd.reduce((a, b) -> a + b);
475
476
// Cleanup
477
sc.close();
478
```