Spec Registry

Help your agents use open-source better.

maven-apache-spark

Description: Lightning-fast unified analytics engine for large-scale data processing with high-level APIs in Scala, Java, Python, and R
Author: tessl

How to use

npx @tessl/cli registry install tessl/maven-apache-spark@1.0.0

docs/java-api.md

# Java API

Spark provides comprehensive Java APIs that mirror the Scala functionality while providing Java-friendly interfaces. The Java API includes JavaRDD, JavaPairRDD, and JavaDoubleRDD classes that offer type-safe operations for Java developers.

## JavaSparkContext

The Java-friendly version of SparkContext.

### JavaSparkContext Class

```java { .api }
public class JavaSparkContext {
    // Constructors
    public JavaSparkContext()
    public JavaSparkContext(SparkConf conf)
    public JavaSparkContext(String master, String appName)
    public JavaSparkContext(String master, String appName, SparkConf conf)
    public JavaSparkContext(String master, String appName, String sparkHome, String jarFile)
    public JavaSparkContext(String master, String appName, String sparkHome, String[] jars)
    public JavaSparkContext(String master, String appName, String sparkHome, String[] jars, Map<String, String> environment)

    // Core properties
    public SparkContext sc()
    public String master()
    public String appName()
    public Boolean isLocal()
    public Integer defaultParallelism()
    public Integer defaultMinPartitions()
}
```

### Creating JavaSparkContext

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.HashMap;
import java.util.Map;

// Basic creation with SparkConf
SparkConf conf = new SparkConf()
    .setAppName("Java Spark App")
    .setMaster("local[*]");

JavaSparkContext jsc = new JavaSparkContext(conf);

// Alternative constructors
JavaSparkContext jsc2 = new JavaSparkContext("local[*]", "My Java App");

// With all parameters
String[] jars = {"myapp.jar", "dependencies.jar"};
Map<String, String> env = new HashMap<>();
env.put("SPARK_ENV", "production");

JavaSparkContext jsc3 = new JavaSparkContext(
    "local[*]",          // master
    "My Java App",       // app name
    "/path/to/spark",    // spark home
    jars,                // jar files
    env                  // environment
);
```

## JavaRDD

Java-friendly wrapper for RDD operations.

### JavaRDD Class

```java { .api }
public class JavaRDD<T> extends AbstractJavaRDDLike<T, JavaRDD<T>> {
    // Transformations
    public <U> JavaRDD<U> map(Function<T, U> f)
    public <U> JavaRDD<U> flatMap(FlatMapFunction<T, U> f)
    public JavaRDD<T> filter(Function<T, Boolean> f)
    public JavaRDD<T> distinct()
    public JavaRDD<T> distinct(int numPartitions)
    public JavaRDD<T> sample(boolean withReplacement, double fraction)
    public JavaRDD<T> sample(boolean withReplacement, double fraction, long seed)
    public JavaRDD<T> union(JavaRDD<T> other)
    public JavaRDD<T> intersection(JavaRDD<T> other)
    public JavaRDD<T> subtract(JavaRDD<T> other)
    public <U> JavaPairRDD<T, U> cartesian(JavaRDD<U> other)

    // Partition operations
    public <U> JavaRDD<U> mapPartitions(FlatMapFunction<Iterator<T>, U> f)
    public <U> JavaRDD<U> mapPartitionsWithIndex(Function2<Integer, Iterator<T>, Iterator<U>> f, boolean preservesPartitioning)
    public JavaRDD<T> coalesce(int numPartitions)
    public JavaRDD<T> coalesce(int numPartitions, boolean shuffle)
    public JavaRDD<T> repartition(int numPartitions)

    // Actions
    public List<T> collect()
    public long count()
    public T first()
    public List<T> take(int num)
    public List<T> takeSample(boolean withReplacement, int num)
    public List<T> takeSample(boolean withReplacement, int num, long seed)
    public T reduce(Function2<T, T, T> f)
    public T fold(T zeroValue, Function2<T, T, T> func)
    public <U> U aggregate(U zeroValue, Function2<U, T, U> seqFunc, Function2<U, U, U> combFunc)
    public void foreach(VoidFunction<T> f)
    public void foreachPartition(VoidFunction<Iterator<T>> f)

    // Persistence
    public JavaRDD<T> cache()
    public JavaRDD<T> persist(StorageLevel newLevel)
    public JavaRDD<T> unpersist()
    public JavaRDD<T> unpersist(boolean blocking)
}
```

### Creating and Using JavaRDD

```java
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.FlatMapFunction;
import java.util.Arrays;
import java.util.List;

// Create JavaRDD from collection
List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
JavaRDD<Integer> javaRDD = jsc.parallelize(data);

// Map transformation
JavaRDD<Integer> doubled = javaRDD.map(new Function<Integer, Integer>() {
    public Integer call(Integer x) {
        return x * 2;
    }
});

// Using lambda expressions (Java 8+)
JavaRDD<Integer> doubled2 = javaRDD.map(x -> x * 2);

// FlatMap transformation
JavaRDD<String> lines = jsc.textFile("input.txt");
JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    public Iterable<String> call(String line) {
        return Arrays.asList(line.split(" "));
    }
});

// With lambda
JavaRDD<String> words2 = lines.flatMap(line -> Arrays.asList(line.split(" ")));

// Filter transformation
JavaRDD<Integer> evens = javaRDD.filter(new Function<Integer, Boolean>() {
    public Boolean call(Integer x) {
        return x % 2 == 0;
    }
});

// With lambda
JavaRDD<Integer> evens2 = javaRDD.filter(x -> x % 2 == 0);
```
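
The JavaRDD API above also lists partition-level operations (mapPartitions, repartition, coalesce) that the examples do not touch. The following is a minimal sketch, assuming the same jsc context and the Spark 1.x FlatMapFunction contract that returns an Iterable:

```java
import org.apache.spark.api.java.JavaRDD;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// A small RDD with an explicit partition count for illustration
JavaRDD<Integer> numbers = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6), 3);

// mapPartitions: process one whole partition (an Iterator) at a time,
// useful when per-partition setup (e.g. opening a connection) is expensive
JavaRDD<Integer> doubledPerPartition = numbers.mapPartitions(iter -> {
    List<Integer> out = new ArrayList<>();
    while (iter.hasNext()) {
        out.add(iter.next() * 2);
    }
    return out; // Spark 1.x FlatMapFunction returns an Iterable
});

// repartition: change the number of partitions (always shuffles)
JavaRDD<Integer> widened = doubledPerPartition.repartition(6);

// coalesce: reduce the number of partitions, avoiding a shuffle when possible
JavaRDD<Integer> narrowed = widened.coalesce(2);

System.out.println("Partitions after coalesce: " + narrowed.partitions().size());
```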

### Actions on JavaRDD

```java
List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
JavaRDD<Integer> rdd = jsc.parallelize(data);

// Collect all elements
List<Integer> result = rdd.collect();

// Count elements
long count = rdd.count();

// Get first element
Integer first = rdd.first();

// Take first n elements
List<Integer> firstThree = rdd.take(3);

// Reduce with function
Integer sum = rdd.reduce(new Function2<Integer, Integer, Integer>() {
    public Integer call(Integer a, Integer b) {
        return a + b;
    }
});

// With lambda
Integer sum2 = rdd.reduce((a, b) -> a + b);

// Fold with zero value
Integer foldResult = rdd.fold(0, (a, b) -> a + b);

// Aggregate with different types
class Stats implements Serializable {
    public int sum;
    public int count;

    public Stats(int sum, int count) {
        this.sum = sum;
        this.count = count;
    }
}

Stats stats = rdd.aggregate(
    new Stats(0, 0),                             // Zero value
    new Function2<Stats, Integer, Stats>() {     // Seq function
        public Stats call(Stats s, Integer x) {
            return new Stats(s.sum + x, s.count + 1);
        }
    },
    new Function2<Stats, Stats, Stats>() {       // Combine function
        public Stats call(Stats s1, Stats s2) {
            return new Stats(s1.sum + s2.sum, s1.count + s2.count);
        }
    }
);
```
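
The persistence methods in the JavaRDD API (cache, persist, unpersist) are not exercised above. Here is a minimal sketch, assuming the same jsc context, of caching a reused RDD and choosing an explicit storage level:

```java
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.storage.StorageLevel;
import java.util.Arrays;

JavaRDD<Integer> reused = jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

// cache() keeps the RDD in memory (MEMORY_ONLY) once the first action computes it
reused.cache();
long total = reused.count();   // materializes and caches
long again = reused.count();   // served from the cache

// persist() lets you pick a storage level explicitly
JavaRDD<Integer> spilled = reused.map(x -> x * 10);
spilled.persist(StorageLevel.MEMORY_AND_DISK());

// unpersist() releases the cached blocks when the data is no longer needed
spilled.unpersist();
reused.unpersist(false);  // non-blocking removal
```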

## JavaPairRDD

Java wrapper for key-value pair RDDs.

### JavaPairRDD Class

```java { .api }
public class JavaPairRDD<K, V> extends AbstractJavaRDDLike<Tuple2<K, V>, JavaPairRDD<K, V>> {
    // Key-Value operations
    public JavaRDD<K> keys()
    public JavaRDD<V> values()
    public <U> JavaPairRDD<K, U> mapValues(Function<V, U> f)
    public <U> JavaPairRDD<K, U> flatMapValues(Function<V, Iterable<U>> f)

    // Aggregations
    public JavaPairRDD<K, V> reduceByKey(Function2<V, V, V> func)
    public JavaPairRDD<K, V> reduceByKey(Partitioner partitioner, Function2<V, V, V> func)
    public JavaPairRDD<K, Iterable<V>> groupByKey()
    public JavaPairRDD<K, Iterable<V>> groupByKey(Partitioner partitioner)
    public <C> JavaPairRDD<K, C> combineByKey(Function<V, C> createCombiner, Function2<C, V, C> mergeValue, Function2<C, C, C> mergeCombiners, Partitioner partitioner)
    public <U> JavaPairRDD<K, U> aggregateByKey(U zeroValue, Partitioner partitioner, Function2<U, V, U> seqFunc, Function2<U, U, U> combFunc)
    public JavaPairRDD<K, V> foldByKey(V zeroValue, Function2<V, V, V> func)

    // Joins
    public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other, Partitioner partitioner)
    public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other)

    // Actions
    public Map<K, V> collectAsMap()
    public Map<K, Long> countByKey()
    public List<V> lookup(K key)

    // Save operations
    public void saveAsHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends OutputFormat> outputFormatClass)
    public void saveAsNewAPIHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends org.apache.hadoop.mapreduce.OutputFormat> outputFormatClass)
}
```

### Creating and Using JavaPairRDD

```java
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;

// Create JavaPairRDD from tuples
List<Tuple2<String, Integer>> pairs = Arrays.asList(
    new Tuple2<>("apple", 5),
    new Tuple2<>("banana", 3),
    new Tuple2<>("apple", 2),
    new Tuple2<>("orange", 1)
);

JavaPairRDD<String, Integer> pairRDD = jsc.parallelizePairs(pairs);

// Create from JavaRDD using mapToPair
JavaRDD<String> lines = jsc.textFile("input.txt");
JavaPairRDD<String, Integer> wordCounts = lines
    .flatMap(line -> Arrays.asList(line.split(" ")))
    .mapToPair(new PairFunction<String, String, Integer>() {
        public Tuple2<String, Integer> call(String word) {
            return new Tuple2<>(word, 1);
        }
    });

// With lambda
JavaPairRDD<String, Integer> wordCounts2 = lines
    .flatMap(line -> Arrays.asList(line.split(" ")))
    .mapToPair(word -> new Tuple2<>(word, 1));
```

### Key-Value Transformations

```java
JavaPairRDD<String, Integer> pairs = jsc.parallelizePairs(Arrays.asList(
    new Tuple2<>("a", 1),
    new Tuple2<>("b", 2),
    new Tuple2<>("a", 3)
));

// Get keys and values
JavaRDD<String> keys = pairs.keys();
JavaRDD<Integer> values = pairs.values();

// Transform values while preserving keys
JavaPairRDD<String, Integer> doubled = pairs.mapValues(x -> x * 2);

// FlatMap values: each value expands to the characters of its string form
JavaPairRDD<String, Character> chars = pairs.flatMapValues(value -> {
    List<Character> result = new ArrayList<>();
    for (char c : value.toString().toCharArray()) {
        result.add(c);
    }
    return result;
});

// Reduce by key
JavaPairRDD<String, Integer> sums = pairs.reduceByKey((a, b) -> a + b);

// Group by key
JavaPairRDD<String, Iterable<Integer>> grouped = pairs.groupByKey();

// Aggregate by key
JavaPairRDD<String, Integer> aggregated = pairs.aggregateByKey(
    0,                             // Zero value
    (acc, value) -> acc + value,   // Seq function
    (acc1, acc2) -> acc1 + acc2    // Combine function
);
```
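
The JavaPairRDD API also lists foldByKey and combineByKey, which the examples above skip. A small sketch, reusing the pairs RDD from this section and assuming the three-argument combineByKey overload without a Partitioner, that computes per-key sums and per-key averages:

```java
import scala.Tuple2;

// foldByKey: like reduceByKey, but seeded with a zero value per key
JavaPairRDD<String, Integer> foldedSums = pairs.foldByKey(0, (a, b) -> a + b);

// combineByKey: build a (sum, count) accumulator per key, then derive the average
JavaPairRDD<String, Tuple2<Integer, Integer>> sumCounts = pairs.combineByKey(
    value -> new Tuple2<>(value, 1),                                // createCombiner
    (acc, value) -> new Tuple2<>(acc._1() + value, acc._2() + 1),   // mergeValue
    (a, b) -> new Tuple2<>(a._1() + b._1(), a._2() + b._2())        // mergeCombiners
);

JavaPairRDD<String, Double> averages = sumCounts.mapValues(
    sc -> (double) sc._1() / sc._2()
);
// For the sample data: ("a", 2.0), ("b", 2.0)
```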

### Join Operations

```java
JavaPairRDD<String, String> names = jsc.parallelizePairs(Arrays.asList(
    new Tuple2<>("1", "Alice"),
    new Tuple2<>("2", "Bob"),
    new Tuple2<>("3", "Charlie")
));

JavaPairRDD<String, Integer> ages = jsc.parallelizePairs(Arrays.asList(
    new Tuple2<>("1", 25),
    new Tuple2<>("2", 30),
    new Tuple2<>("4", 35)
));

// Inner join
JavaPairRDD<String, Tuple2<String, Integer>> joined = names.join(ages);
// Result: [("1", ("Alice", 25)), ("2", ("Bob", 30))]

// Left outer join
JavaPairRDD<String, Tuple2<String, Optional<Integer>>> leftJoined = names.leftOuterJoin(ages);
// Result: [("1", ("Alice", Optional.of(25))), ("2", ("Bob", Optional.of(30))), ("3", ("Charlie", Optional.absent()))]

// Full outer join
JavaPairRDD<String, Tuple2<Optional<String>, Optional<Integer>>> fullJoined = names.fullOuterJoin(ages);

// Cogroup
JavaPairRDD<String, Tuple2<Iterable<String>, Iterable<Integer>>> cogrouped = names.cogroup(ages);
```
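
leftOuterJoin and fullOuterJoin wrap the possibly-missing side in an Optional. A brief sketch of unwrapping those values; note that Spark 1.x uses Guava's com.google.common.base.Optional here, while later releases switched to org.apache.spark.api.java.Optional:

```java
import com.google.common.base.Optional;
import scala.Tuple2;
import java.util.List;

// Collect the left-outer-join result and read each Optional explicitly
List<Tuple2<String, Tuple2<String, Optional<Integer>>>> rows = leftJoined.collect();
for (Tuple2<String, Tuple2<String, Optional<Integer>>> row : rows) {
    String id = row._1();
    String name = row._2()._1();
    Optional<Integer> maybeAge = row._2()._2();
    int age = maybeAge.isPresent() ? maybeAge.get() : -1; // -1 marks "no matching key"
    System.out.println(id + ": " + name + ", age " + age);
}
```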

### Actions on JavaPairRDD

```java
JavaPairRDD<String, Integer> pairs = jsc.parallelizePairs(Arrays.asList(
    new Tuple2<>("apple", 5),
    new Tuple2<>("banana", 3),
    new Tuple2<>("apple", 2)
));

// Collect as Map (with duplicate keys, only one value per key is kept)
Map<String, Integer> map = pairs.collectAsMap();

// Count by key
Map<String, Long> counts = pairs.countByKey();

// Lookup values for a key
List<Integer> appleValues = pairs.lookup("apple"); // [5, 2]

// Count all elements
long totalCount = pairs.count();
```

## JavaDoubleRDD

Specialized RDD for double values with statistical operations.

### JavaDoubleRDD Class

```java { .api }
public class JavaDoubleRDD extends AbstractJavaRDDLike<Double, JavaDoubleRDD> {
    // Statistical operations
    public double mean()
    public double sum()
    public StatCounter stats()
    public double variance()
    public double sampleVariance()
    public double stdev()
    public double sampleStdev()
    public long[] histogram(double[] buckets)
    public Tuple2<double[], long[]> histogram(int bucketCount)
}
```

### Using JavaDoubleRDD

```java
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.util.StatCounter;
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;

// Create JavaDoubleRDD
List<Double> numbers = Arrays.asList(1.0, 2.0, 3.0, 4.0, 5.0);
JavaDoubleRDD doubleRDD = jsc.parallelizeDoubles(numbers);

// Convert from JavaRDD<Double>
JavaRDD<Double> rdd = jsc.parallelize(numbers);
JavaDoubleRDD doubleRDD2 = rdd.mapToDouble(x -> x);

// Statistical operations
double mean = doubleRDD.mean();
double sum = doubleRDD.sum();
double variance = doubleRDD.variance();
double stdev = doubleRDD.stdev();

// Get detailed statistics
StatCounter stats = doubleRDD.stats();
System.out.println("Count: " + stats.count());
System.out.println("Mean: " + stats.mean());
System.out.println("Stdev: " + stats.stdev());
System.out.println("Max: " + stats.max());
System.out.println("Min: " + stats.min());

// Histogram
double[] buckets = {0.0, 2.0, 4.0, 6.0};
long[] histogram = doubleRDD.histogram(buckets);

// Or with automatic bucketing
Tuple2<double[], long[]> autoHistogram = doubleRDD.histogram(4);
```

## Function Interfaces

The Java API uses function interfaces for type-safe transformations.

### Function Interfaces

```java { .api }
// Single argument function
public interface Function<T, R> extends Serializable {
    R call(T t) throws Exception;
}

// Two argument function
public interface Function2<T1, T2, R> extends Serializable {
    R call(T1 t1, T2 t2) throws Exception;
}

// Void function (for actions)
public interface VoidFunction<T> extends Serializable {
    void call(T t) throws Exception;
}

// FlatMap function
public interface FlatMapFunction<T, R> extends Serializable {
    Iterable<R> call(T t) throws Exception;
}

// Pair function (for creating key-value pairs)
public interface PairFunction<T, K, V> extends Serializable {
    Tuple2<K, V> call(T t) throws Exception;
}

// PairFlatMap function
public interface PairFlatMapFunction<T, K, V> extends Serializable {
    Iterable<Tuple2<K, V>> call(T t) throws Exception;
}
```

### Function Usage Examples

```java
import org.apache.spark.api.java.function.*;

// Anonymous inner class
JavaRDD<Integer> doubled = rdd.map(new Function<Integer, Integer>() {
    public Integer call(Integer x) {
        return x * 2;
    }
});

// Lambda expression (Java 8+)
JavaRDD<Integer> doubled2 = rdd.map(x -> x * 2);

// Method reference (Java 8+)
JavaRDD<String> strings = rdd.map(Object::toString);

// FlatMap example
JavaRDD<String> words = lines.flatMap(
    new FlatMapFunction<String, String>() {
        public Iterable<String> call(String line) {
            return Arrays.asList(line.split("\\s+"));
        }
    }
);

// Complex transformation with PairFunction
JavaPairRDD<String, Integer> pairs = words.mapToPair(
    new PairFunction<String, String, Integer>() {
        public Tuple2<String, Integer> call(String word) {
            return new Tuple2<>(word.toLowerCase(), word.length());
        }
    }
);

// Void function for actions
rdd.foreach(new VoidFunction<Integer>() {
    public void call(Integer x) {
        System.out.println(x);
    }
});
```

## Shared Variables in Java

### Broadcast Variables

```java
import org.apache.spark.broadcast.Broadcast;
import java.util.Arrays;
import java.util.Map;
import java.util.HashMap;

// Create broadcast variable
Map<String, Integer> lookupTable = new HashMap<>();
lookupTable.put("apple", 1);
lookupTable.put("banana", 2);
lookupTable.put("orange", 3);

Broadcast<Map<String, Integer>> broadcastTable = jsc.broadcast(lookupTable);

// Use in transformations
JavaRDD<String> fruits = jsc.parallelize(Arrays.asList("apple", "banana", "apple"));
JavaRDD<Integer> codes = fruits.map(fruit ->
    broadcastTable.value().getOrDefault(fruit, 0)
);

// Clean up
broadcastTable.unpersist();
```

### Accumulators

```java
import org.apache.spark.Accumulator;

// Create accumulator
Accumulator<Integer> errorCount = jsc.accumulator(0);

// Use in transformations
JavaRDD<String> lines = jsc.textFile("input.txt");
JavaRDD<String> validLines = lines.filter(line -> {
    if (line.trim().isEmpty()) {
        errorCount.add(1);
        return false;
    }
    return true;
});

// Trigger action to update accumulator
validLines.count();

// Get accumulator value
System.out.println("Error count: " + errorCount.value());

// Double accumulator
Accumulator<Double> doubleAcc = jsc.accumulator(0.0);
```

## Complete Example

```java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.*;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Map;

public class SparkWordCount {
    public static void main(String[] args) {
        // Create Spark context
        SparkConf conf = new SparkConf()
            .setAppName("Java Word Count")
            .setMaster("local[*]");

        JavaSparkContext jsc = new JavaSparkContext(conf);

        try {
            // Read input file
            JavaRDD<String> lines = jsc.textFile("input.txt");

            // Split lines into words
            JavaRDD<String> words = lines.flatMap(line ->
                Arrays.asList(line.toLowerCase().split("\\s+"))
            );

            // Filter out empty words
            JavaRDD<String> validWords = words.filter(word -> !word.trim().isEmpty());

            // Create word-count pairs
            JavaPairRDD<String, Integer> wordPairs = validWords.mapToPair(
                word -> new Tuple2<>(word, 1)
            );

            // Sum counts by key
            JavaPairRDD<String, Integer> wordCounts = wordPairs.reduceByKey(
                (a, b) -> a + b
            );

            // Sort by count descending
            JavaPairRDD<String, Integer> sortedCounts = wordCounts.mapToPair(
                pair -> new Tuple2<>(pair._2(), pair._1()) // Swap to (count, word)
            ).sortByKey(false).mapToPair(
                pair -> new Tuple2<>(pair._2(), pair._1()) // Swap back to (word, count)
            );

            // Collect and print results (collectAsMap does not preserve order, so re-sort for display)
            Map<String, Integer> results = sortedCounts.collectAsMap();

            System.out.println("Word Count Results:");
            results.entrySet().stream()
                .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
                .limit(10)
                .forEach(entry ->
                    System.out.println(entry.getKey() + ": " + entry.getValue())
                );

            // Save results
            sortedCounts.saveAsTextFile("output");

        } finally {
            // Stop Spark context
            jsc.stop();
        }
    }
}
```

## Maven Dependencies

```xml
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.10</artifactId>
        <version>1.0.0</version>
    </dependency>
</dependencies>
```

This guide covers the core Java API for Apache Spark, enabling Java developers to build scalable data processing applications with type safety and familiar Java patterns.