0
# Java API Integration
1
2
Spark's Java API defines a family of serializable functional interfaces in `org.apache.spark.api.java.function`, so Java developers can pass type-safe lambda expressions and method references to Spark operations.
3
4
## Capabilities
5
6
### Core Function Interfaces
7
8
Base functional interfaces for common transformation and action operations.
9
10
```java { .api }
11
/**
12
* Base single-argument function interface
13
* @param <T1> - Input type
14
* @param <R> - Return type
15
*/
16
@FunctionalInterface
17
public interface Function<T1, R> extends Serializable {
18
/**
19
* Apply function to input value
20
* @param v1 - Input value
21
* @return Transformed result
22
* @throws Exception - Any exception during function execution
23
*/
24
R call(T1 v1) throws Exception;
25
}
26
27
/**
28
* No-argument function interface
29
* @param <R> - Return type
30
*/
31
@FunctionalInterface
32
public interface Function0<R> extends Serializable {
33
/**
34
* Apply function with no arguments
35
* @return Function result
36
* @throws Exception - Any exception during function execution
37
*/
38
R call() throws Exception;
39
}
40
41
/**
42
* Two-argument function interface
43
* @param <T1> - First input type
44
* @param <T2> - Second input type
45
* @param <R> - Return type
46
*/
47
@FunctionalInterface
48
public interface Function2<T1, T2, R> extends Serializable {
49
/**
50
* Apply function to two input values
51
* @param v1 - First input value
52
* @param v2 - Second input value
53
* @return Transformed result
54
* @throws Exception - Any exception during function execution
55
*/
56
R call(T1 v1, T2 v2) throws Exception;
57
}
58
59
/**
60
* Three-argument function interface
61
* @param <T1> - First input type
62
* @param <T2> - Second input type
63
* @param <T3> - Third input type
64
* @param <R> - Return type
65
*/
66
@FunctionalInterface
67
public interface Function3<T1, T2, T3, R> extends Serializable {
68
/**
69
* Apply function to three input values
70
* @param v1 - First input value
71
* @param v2 - Second input value
72
* @param v3 - Third input value
73
* @return Transformed result
74
* @throws Exception - Any exception during function execution
75
*/
76
R call(T1 v1, T2 v2, T3 v3) throws Exception;
77
}
78
79
/**
80
* Four-argument function interface
81
* @param <T1> - First input type
82
* @param <T2> - Second input type
83
* @param <T3> - Third input type
84
* @param <T4> - Fourth input type
85
* @param <R> - Return type
86
*/
87
@FunctionalInterface
88
public interface Function4<T1, T2, T3, T4, R> extends Serializable {
89
/**
90
* Apply function to four input values
91
* @param v1 - First input value
92
* @param v2 - Second input value
93
* @param v3 - Third input value
94
* @param v4 - Fourth input value
95
* @return Transformed result
96
* @throws Exception - Any exception during function execution
97
*/
98
R call(T1 v1, T2 v2, T3 v3, T4 v4) throws Exception;
99
}
100
```
101
102
### Specialized Function Interfaces
103
104
Advanced functional interfaces for specific Spark operations like pair RDD creation, flatMap operations, and void functions.
105
106
```java { .api }
107
/**
108
* Function that returns key-value pairs for creating PairRDDs
109
* @param <T> - Input type
110
* @param <K> - Key type
111
* @param <V> - Value type
112
*/
113
@FunctionalInterface
114
public interface PairFunction<T, K, V> extends Serializable {
115
/**
116
* Returns a key-value pair from input
117
* @param t - Input value
118
* @return Tuple2 containing key and value
119
* @throws Exception - Any exception during function execution
120
*/
121
Tuple2<K, V> call(T t) throws Exception;
122
}
123
124
/**
125
* Function that returns zero or more output records from each input record
126
* @param <T> - Input type
127
* @param <R> - Output type
128
*/
129
@FunctionalInterface
130
public interface FlatMapFunction<T, R> extends Serializable {
131
/**
132
* Returns iterator of output records
133
* @param t - Input value
134
* @return Iterator over output values
135
* @throws Exception - Any exception during function execution
136
*/
137
Iterator<R> call(T t) throws Exception;
138
}
139
140
/**
141
* Function with no return value for actions
142
* @param <T> - Input type
143
*/
144
@FunctionalInterface
145
public interface VoidFunction<T> extends Serializable {
146
/**
147
* Execute action on input value
148
* @param t - Input value
149
* @throws Exception - Any exception during function execution
150
*/
151
void call(T t) throws Exception;
152
}
153
154
/**
155
* Two-argument void function for actions
156
* @param <T1> - First input type
157
* @param <T2> - Second input type
158
*/
159
@FunctionalInterface
160
public interface VoidFunction2<T1, T2> extends Serializable {
161
/**
162
* Execute action on two input values
163
* @param v1 - First input value
164
* @param v2 - Second input value
165
* @throws Exception - Any exception during function execution
166
*/
167
void call(T1 v1, T2 v2) throws Exception;
168
}
169
170
/**
171
* Function for filtering operations
172
* @param <T> - Input type
173
*/
174
@FunctionalInterface
175
public interface FilterFunction<T> extends Serializable {
176
/**
177
* Test whether input should be included
178
* @param value - Input value to test
179
* @return true if value should be included
180
* @throws Exception - Any exception during function execution
181
*/
182
boolean call(T value) throws Exception;
183
}
184
185
/**
186
* Function for reducing operations
187
* @param <T> - Input and output type
188
*/
189
@FunctionalInterface
190
public interface ReduceFunction<T> extends Serializable {
191
/**
192
* Combine two values into one
193
* @param v1 - First value
194
* @param v2 - Second value
195
* @return Combined result
196
* @throws Exception - Any exception during function execution
197
*/
198
T call(T v1, T v2) throws Exception;
199
}
200
201
/**
202
* Function for operations on each partition
203
* @param <T> - Input type
204
*/
205
@FunctionalInterface
206
public interface ForeachPartitionFunction<T> extends Serializable {
207
/**
208
* Execute action on partition iterator
209
* @param t - Iterator over partition elements
210
* @throws Exception - Any exception during function execution
211
*/
212
void call(Iterator<T> t) throws Exception;
213
}
214
```
215
216
**Usage Examples:**
217
218
```java
219
import org.apache.spark.api.java.function.*;
220
import org.apache.spark.api.java.JavaRDD;
221
222
// Single-argument function for map operations
223
Function<String, Integer> stringLength = s -> s.length();
224
JavaRDD<Integer> lengths = stringRDD.map(stringLength);
225
226
// Two-argument function for reduce operations
227
Function2<Integer, Integer, Integer> sum = (a, b) -> a + b;
228
int total = numberRDD.reduce(sum);
229
230
// No-argument function for suppliers
231
Function0<String> currentTime = () -> java.time.Instant.now().toString();
232
233
// Multi-argument functions for complex operations
234
Function3<String, Integer, Boolean, String> formatter =
235
(str, num, flag) -> flag ? str.toUpperCase() + num : str + num;
236
```
237
238
### Void Functions
239
240
Functions that perform side effects without returning values, commonly used for actions like foreach.
241
242
```java { .api }
243
/**
244
* Function with no return value (void function)
245
* @param <T> - Input type
246
*/
247
@FunctionalInterface
248
public interface VoidFunction<T> extends Serializable {
249
/**
250
* Apply function to input value with no return
251
* @param t - Input value
252
* @throws Exception - Any exception during function execution
253
*/
254
void call(T t) throws Exception;
255
}
256
257
/**
258
* Two-argument function with no return value
259
* @param <T1> - First input type
260
* @param <T2> - Second input type
261
*/
262
@FunctionalInterface
263
public interface VoidFunction2<T1, T2> extends Serializable {
264
/**
265
* Apply function to two input values with no return
266
* @param t1 - First input value
267
* @param t2 - Second input value
268
* @throws Exception - Any exception during function execution
269
*/
270
void call(T1 t1, T2 t2) throws Exception;
271
}
272
```
273
274
**Usage Examples:**
275
276
```java
277
import org.apache.spark.api.java.function.*;
278
279
// Void function for foreach operations
280
VoidFunction<String> printString = s -> System.out.println(s);
281
stringRDD.foreach(printString);
282
283
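// Void function for side effects.
// NOTE: foreach runs on the executors against deserialized copies of captured variables,
// so updates to a driver-side `counter` are not visible on the driver; use a Spark accumulator for real counters.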
284
VoidFunction<Integer> incrementCounter = i -> {
285
counter.addAndGet(i);
286
logger.info("Processed: " + i);
287
};
288
numberRDD.foreach(incrementCounter);
289
290
// Two-argument void function
291
VoidFunction2<String, Integer> logPair = (key, value) -> {
292
System.out.println("Key: " + key + ", Value: " + value);
293
};
294
```
295
296
### Specialized Transformation Functions
297
298
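Functions designed for specific Spark operations like mapping, filtering, and flat mapping. In the examples below, `MapFunction` and `FilterFunction` are used with the Dataset API, while `FlatMapFunction` is passed to an RDD's `flatMap`.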
299
300
```java { .api }
301
/**
302
* Function for mapping transformations
303
* @param <T> - Input type
304
* @param <R> - Output type
305
*/
306
@FunctionalInterface
307
public interface MapFunction<T, R> extends Serializable {
308
/**
309
* Transform input value to output value
310
* @param value - Input value
311
* @return Transformed value
312
* @throws Exception - Any exception during transformation
313
*/
314
R call(T value) throws Exception;
315
}
316
317
/**
318
* Function for filtering operations
319
* @param <T> - Input type
320
*/
321
@FunctionalInterface
322
public interface FilterFunction<T> extends Serializable {
323
/**
324
* Test if value should be included in result
325
* @param value - Input value to test
326
* @return true if value should be included, false otherwise
327
* @throws Exception - Any exception during filtering
328
*/
329
boolean call(T value) throws Exception;
330
}
331
332
/**
333
* Function for flat mapping operations (one-to-many)
334
* @param <T> - Input type
335
* @param <R> - Output element type
336
*/
337
@FunctionalInterface
338
public interface FlatMapFunction<T, R> extends Serializable {
339
/**
340
* Transform single input to iterator of outputs
341
* @param t - Input value
342
* @return Iterator of output values
343
* @throws Exception - Any exception during transformation
344
*/
345
Iterator<R> call(T t) throws Exception;
346
}
347
348
/**
349
* Function for flat mapping operations with two arguments
350
* @param <A> - First input type
351
* @param <B> - Second input type
352
* @param <R> - Output element type
353
*/
354
@FunctionalInterface
355
public interface FlatMapFunction2<A, B, R> extends Serializable {
356
/**
357
* Transform two inputs to iterator of outputs
358
* @param a - First input value
359
* @param b - Second input value
360
* @return Iterator of output values
361
* @throws Exception - Any exception during transformation
362
*/
363
Iterator<R> call(A a, B b) throws Exception;
364
}
365
```
366
367
**Usage Examples:**
368
369
```java
370
import org.apache.spark.api.java.function.*;
371
import java.util.*;
372
373
// Map function for Dataset operations
374
MapFunction<Person, String> getName = person -> person.getName();
375
Dataset<String> names = personDataset.map(getName, Encoders.STRING());
376
377
// Filter function
378
FilterFunction<Integer> isEven = i -> i % 2 == 0;
379
Dataset<Integer> evenNumbers = numberDataset.filter(isEven);
380
381
// Flat map function - split strings into words
382
FlatMapFunction<String, String> splitWords = line -> {
383
return Arrays.asList(line.split(" ")).iterator();
384
};
385
JavaRDD<String> words = linesRDD.flatMap(splitWords);
386
387
// Two-argument flat map
388
FlatMapFunction2<String, String, String> combineAndSplit = (s1, s2) -> {
389
String combined = s1 + " " + s2;
390
return Arrays.asList(combined.split(" ")).iterator();
391
};
392
```
393
394
### Pair Functions
395
396
Functions that turn input records into key-value pairs (`Tuple2`), used by `mapToPair` and `flatMapToPair` to build the pair RDDs that operations like `groupByKey` and `reduceByKey` work on.
397
398
```java { .api }
399
/**
400
* Function that produces key-value pairs (Tuple2)
401
* @param <T> - Input type
402
* @param <K> - Key type
403
* @param <V> - Value type
404
*/
405
@FunctionalInterface
406
public interface PairFunction<T, K, V> extends Serializable {
407
/**
408
* Transform input to key-value pair
409
* @param t - Input value
410
* @return Tuple2 containing key and value
411
* @throws Exception - Any exception during transformation
412
*/
413
Tuple2<K, V> call(T t) throws Exception;
414
}
415
416
/**
417
* Flat map function that produces key-value pairs
418
* @param <T> - Input type
419
* @param <K> - Key type
420
* @param <V> - Value type
421
*/
422
@FunctionalInterface
423
public interface PairFlatMapFunction<T, K, V> extends Serializable {
424
/**
425
* Transform input to iterator of key-value pairs
426
* @param t - Input value
427
* @return Iterator of Tuple2 containing keys and values
428
* @throws Exception - Any exception during transformation
429
*/
430
Iterator<Tuple2<K, V>> call(T t) throws Exception;
431
}
432
```
433
434
**Usage Examples:**
435
436
```java
437
import org.apache.spark.api.java.function.*;
438
import scala.Tuple2;
439
440
// Pair function for creating key-value pairs
441
PairFunction<String, String, Integer> wordToPair =
442
word -> new Tuple2<>(word, 1);
443
JavaPairRDD<String, Integer> wordCounts = wordsRDD.mapToPair(wordToPair);
444
445
// Pair flat map function
446
PairFlatMapFunction<String, String, Integer> lineToPairs = line -> {
447
List<Tuple2<String, Integer>> pairs = new ArrayList<>();
448
for (String word : line.split(" ")) {
449
pairs.add(new Tuple2<>(word, 1));
450
}
451
return pairs.iterator();
452
};
453
JavaPairRDD<String, Integer> wordPairs = linesRDD.flatMapToPair(lineToPairs);
454
```
455
456
### Reduction Functions
457
458
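Functions for aggregation and reduction operations. `ReduceFunction<T>` is the interface accepted by `Dataset.reduce`; `JavaRDD.reduce` takes a `Function2<T, T, T>` instead.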
459
460
```java { .api }
461
/**
462
* Function for reduction operations
463
* @param <T> - Type being reduced
464
*/
465
@FunctionalInterface
466
public interface ReduceFunction<T> extends Serializable {
467
/**
468
* Combine two values into one
469
* @param v1 - First value
470
* @param v2 - Second value
471
* @return Combined result
472
* @throws Exception - Any exception during reduction
473
*/
474
T call(T v1, T v2) throws Exception;
475
}
476
```
477
478
**Usage Examples:**
479
480
```java
481
import org.apache.spark.api.java.function.*;
482
483
// Reduction function for summing integers
484
ReduceFunction<Integer> sum = (a, b) -> a + b;
485
int total = numberDataset.reduce(sum);  // numberDataset is a Dataset<Integer>; JavaRDD.reduce takes a Function2 instead
486
487
// Reduction function for finding maximum
488
ReduceFunction<Double> max = (a, b) -> Math.max(a, b);
489
double maximum = doubleDataset.reduce(max);  // doubleDataset is a Dataset<Double>
490
491
// Reduction function for string concatenation
492
ReduceFunction<String> concat = (s1, s2) -> s1 + " " + s2;
493
String combined = stringDataset.reduce(concat);  // stringDataset is a Dataset<String>
494
```
495
496
### Specialized Numeric Functions
497
498
Functions that produce `double` values, used by `mapToDouble` and `flatMapToDouble` to build a `JavaDoubleRDD`.
499
500
```java { .api }
501
/**
502
* Function that returns double values
503
* @param <T> - Input type
504
*/
505
@FunctionalInterface
506
public interface DoubleFunction<T> extends Serializable {
507
/**
508
* Transform input to double value
509
* @param t - Input value
510
* @return Double result
511
* @throws Exception - Any exception during transformation
512
*/
513
double call(T t) throws Exception;
514
}
515
516
/**
517
* Flat map function that returns double values
518
* @param <T> - Input type
519
*/
520
@FunctionalInterface
521
public interface DoubleFlatMapFunction<T> extends Serializable {
522
/**
523
* Transform input to iterator of double values
524
* @param t - Input value
525
* @return Iterator of double values
526
* @throws Exception - Any exception during transformation
527
*/
528
Iterator<Double> call(T t) throws Exception;
529
}
530
```
531
532
**Usage Examples:**
533
534
```java
535
import org.apache.spark.api.java.function.*;
536
537
// Double function for numeric extraction
538
DoubleFunction<String> parseDouble = s -> Double.parseDouble(s);
539
JavaDoubleRDD doubleRDD = stringRDD.mapToDouble(parseDouble);
540
541
// Double flat map function
542
DoubleFlatMapFunction<String> extractNumbers = line -> {
543
List<Double> numbers = new ArrayList<>();
544
for (String token : line.split(" ")) {
545
try {
546
numbers.add(Double.parseDouble(token));
547
} catch (NumberFormatException e) {
548
// Skip non-numeric tokens
549
}
550
}
551
return numbers.iterator();
552
};
553
```
554
555
### Advanced Grouping Functions
556
557
Functions for advanced grouping and co-grouping operations.
558
559
```java { .api }
560
/**
 * Function for co-grouping operations
 * @param <K> - Key type
 * @param <V1> - First value type
 * @param <V2> - Second value type
 * @param <R> - Result element type
 */
@FunctionalInterface
public interface CoGroupFunction<K, V1, V2, R> extends Serializable {
  /**
   * Process the co-grouped values for one key
   * @param key - Group key
   * @param left - Iterator of values from the first group
   * @param right - Iterator of values from the second group
   * @return Iterator of processing results
   * @throws Exception - Any exception during processing
   */
  Iterator<R> call(K key, Iterator<V1> left, Iterator<V2> right) throws Exception;
}
577
578
/**
579
* Function for mapping grouped data
580
* @param <K> - Key type
581
* @param <V> - Value type
582
* @param <R> - Result type
583
*/
584
@FunctionalInterface
585
public interface MapGroupsFunction<K, V, R> extends Serializable {
586
/**
587
* Process grouped values for a key
588
* @param key - Group key
589
* @param values - Iterator of values for the key
590
* @return Processing result
591
* @throws Exception - Any exception during processing
592
*/
593
R call(K key, Iterator<V> values) throws Exception;
594
}
595
596
/**
597
* Function for flat mapping grouped data
598
* @param <K> - Key type
599
* @param <V> - Value type
600
* @param <R> - Result element type
601
*/
602
@FunctionalInterface
603
public interface FlatMapGroupsFunction<K, V, R> extends Serializable {
604
/**
605
* Process grouped values and return iterator of results
606
* @param key - Group key
607
* @param values - Iterator of values for the key
608
* @return Iterator of processing results
609
* @throws Exception - Any exception during processing
610
*/
611
Iterator<R> call(K key, Iterator<V> values) throws Exception;
612
}
613
```
614
615
**Usage Examples:**
616
617
```java
618
import org.apache.spark.api.java.function.*;
619
620
// Map groups function for aggregation
621
MapGroupsFunction<String, Integer, Double> computeAverage = (key, values) -> {
622
int sum = 0;
623
int count = 0;
624
while (values.hasNext()) {
625
sum += values.next();
626
count++;
627
}
628
return count > 0 ? (double) sum / count : 0.0;
629
};
630
631
// Flat map groups function for expansion
632
FlatMapGroupsFunction<String, Person, String> extractEmails = (department, people) -> {
633
List<String> emails = new ArrayList<>();
634
while (people.hasNext()) {
635
Person person = people.next();
636
if (person.getEmail() != null) {
637
emails.add(person.getEmail());
638
}
639
}
640
return emails.iterator();
641
};
642
```
643
644
### Partition and Action Functions
645
646
Functions for partition-wise operations and actions.
647
648
```java { .api }
649
/**
650
* Function for mapping entire partitions
651
* @param <T> - Input element type
652
* @param <R> - Output element type
653
*/
654
@FunctionalInterface
655
public interface MapPartitionsFunction<T, R> extends Serializable {
656
/**
657
* Process entire partition
658
* @param input - Iterator of partition elements
659
* @return Iterator of results
660
* @throws Exception - Any exception during processing
661
*/
662
Iterator<R> call(Iterator<T> input) throws Exception;
663
}
664
665
/**
666
* Function for foreach operations on elements
667
* @param <T> - Input type
668
*/
669
@FunctionalInterface
670
public interface ForeachFunction<T> extends Serializable {
671
/**
672
* Process single element (side effect)
673
* @param t - Input element
674
* @throws Exception - Any exception during processing
675
*/
676
void call(T t) throws Exception;
677
}
678
679
/**
680
* Function for foreach operations on partitions
681
* @param <T> - Input element type
682
*/
683
@FunctionalInterface
684
public interface ForeachPartitionFunction<T> extends Serializable {
685
/**
686
* Process entire partition (side effect)
687
* @param t - Iterator of partition elements
688
* @throws Exception - Any exception during processing
689
*/
690
void call(Iterator<T> t) throws Exception;
691
}
692
```
693
694
**Usage Examples:**
695
696
```java
697
import org.apache.spark.api.java.function.*;
698
699
// Map partitions function for batch processing
700
MapPartitionsFunction<String, String> processPartition = partition -> {
701
List<String> results = new ArrayList<>();
702
BatchProcessor processor = new BatchProcessor();
703
704
while (partition.hasNext()) {
705
results.add(processor.process(partition.next()));
706
}
707
708
processor.close();
709
return results.iterator();
710
};
711
712
// Foreach function for side effects
713
ForeachFunction<String> writeToFile = line -> {
714
fileWriter.write(line + "\n");
715
};
716
717
// Foreach partition function for batch side effects
718
ForeachPartitionFunction<Record> batchInsert = records -> {
719
DatabaseConnection conn = getConnection();
720
PreparedStatement stmt = conn.prepareStatement("INSERT INTO table VALUES (?)");
721
722
while (records.hasNext()) {
723
stmt.setString(1, records.next().getValue());
724
stmt.addBatch();
725
}
726
727
stmt.executeBatch();
728
conn.close();
729
};
730
```
731
732
## Type Definitions
733
734
```java { .api }
735
// Core function interfaces
736
@FunctionalInterface
737
interface Function<T1, R> extends Serializable {
738
R call(T1 v1) throws Exception;
739
}
740
741
@FunctionalInterface
742
interface Function2<T1, T2, R> extends Serializable {
743
R call(T1 v1, T2 v2) throws Exception;
744
}
745
746
@FunctionalInterface
747
interface VoidFunction<T> extends Serializable {
748
void call(T t) throws Exception;
749
}
750
751
// Specialized transformation interfaces
752
@FunctionalInterface
753
interface MapFunction<T, R> extends Serializable {
754
R call(T value) throws Exception;
755
}
756
757
@FunctionalInterface
758
interface FilterFunction<T> extends Serializable {
759
boolean call(T value) throws Exception;
760
}
761
762
@FunctionalInterface
763
interface FlatMapFunction<T, R> extends Serializable {
764
Iterator<R> call(T t) throws Exception;
765
}
766
767
// Pair operation interfaces
768
@FunctionalInterface
769
interface PairFunction<T, K, V> extends Serializable {
770
Tuple2<K, V> call(T t) throws Exception;
771
}
772
773
// Reduction and aggregation interfaces
774
@FunctionalInterface
775
interface ReduceFunction<T> extends Serializable {
776
T call(T v1, T v2) throws Exception;
777
}
778
779
// Numeric operation interfaces
780
@FunctionalInterface
781
interface DoubleFunction<T> extends Serializable {
782
double call(T t) throws Exception;
783
}
784
785
// Grouping operation interfaces
786
@FunctionalInterface
787
interface MapGroupsFunction<K, V, R> extends Serializable {
788
R call(K key, Iterator<V> values) throws Exception;
789
}
790
```