# Java API

Apache Spark provides comprehensive Java API compatibility through wrapper classes that provide Java-friendly interfaces for all core functionality.

## JavaSparkContext

Java-friendly wrapper for SparkContext providing the main entry point for Java applications.

```java { .api }
public class JavaSparkContext {
    // Constructors
    public JavaSparkContext(SparkConf conf)
    public JavaSparkContext(SparkContext sc)
    public JavaSparkContext(String master, String appName)
    public JavaSparkContext(String master, String appName, String sparkHome, String[] jars)

    // RDD Creation
    public <T> JavaRDD<T> parallelize(List<T> list)
    public <T> JavaRDD<T> parallelize(List<T> list, int numSlices)
    public JavaRDD<String> textFile(String path)
    public JavaRDD<String> textFile(String path, int minPartitions)
    public JavaPairRDD<String, String> wholeTextFiles(String path)
    public JavaPairRDD<String, String> wholeTextFiles(String path, int minPartitions)

    // Hadoop Integration
    public <K, V, F extends NewInputFormat<K, V>> JavaPairRDD<K, V> newAPIHadoopFile(
        String path, Class<F> fClass, Class<K> kClass, Class<V> vClass)
    public <K, V> JavaPairRDD<K, V> hadoopFile(
        String path, Class<? extends InputFormat<K, V>> inputFormatClass,
        Class<K> keyClass, Class<V> valueClass)
    public <K, V> JavaPairRDD<K, V> hadoopFile(
        String path, Class<? extends InputFormat<K, V>> inputFormatClass,
        Class<K> keyClass, Class<V> valueClass, int minPartitions)

    // Shared Variables
    public <T> Broadcast<T> broadcast(T value)
    public LongAccumulator longAccumulator()
    public LongAccumulator longAccumulator(String name)
    public DoubleAccumulator doubleAccumulator()
    public DoubleAccumulator doubleAccumulator(String name)
    public <T> CollectionAccumulator<T> collectionAccumulator()
    public <T> CollectionAccumulator<T> collectionAccumulator(String name)

    // Properties
    public SparkContext sc()
    public int defaultParallelism()
    public int defaultMinPartitions()
    public SparkStatusTracker statusTracker()

    // Lifecycle
    public void stop()
    public void close()
}
```

## JavaRDD

Java wrapper for RDD providing Java-friendly transformations and actions.

```java { .api }
public class JavaRDD<T> {
    // Transformations
    public <R> JavaRDD<R> map(Function<T, R> f)
    public <U> JavaRDD<U> flatMap(FlatMapFunction<T, U> f)
    public JavaRDD<T> filter(Function<T, Boolean> f)
    public JavaRDD<T> distinct()
    public JavaRDD<T> distinct(int numPartitions)
    public JavaRDD<T> sample(boolean withReplacement, double fraction)
    public JavaRDD<T> sample(boolean withReplacement, double fraction, long seed)
    public JavaRDD<T> union(JavaRDD<T> other)
    public JavaRDD<T> intersection(JavaRDD<T> other)
    public <U> JavaPairRDD<T, U> cartesian(JavaRDD<U> other)
    public <K> JavaPairRDD<K, Iterable<T>> groupBy(Function<T, K> f)
    public JavaRDD<String> pipe(String command)
    public JavaRDD<T> coalesce(int numPartitions)
    public JavaRDD<T> coalesce(int numPartitions, boolean shuffle)
    public JavaRDD<T> repartition(int numPartitions)
    public <S> JavaRDD<T> sortBy(Function<T, S> f, boolean ascending, int numPartitions)
    public <K> JavaPairRDD<K, T> keyBy(Function<T, K> f)

    // Actions
    public List<T> collect()
    public long count()
    public T first()
    public List<T> take(int num)
    public List<T> top(int num)
    public List<T> takeOrdered(int num)
    public List<T> takeOrdered(int num, Comparator<T> comp)
    public List<T> takeSample(boolean withReplacement, int num)
    public List<T> takeSample(boolean withReplacement, int num, long seed)
    public T reduce(Function2<T, T, T> f)
    public T fold(T zeroValue, Function2<T, T, T> op)
    public <U> U aggregate(U zeroValue, Function2<U, T, U> seqOp, Function2<U, U, U> combOp)
    public <U> U treeAggregate(U zeroValue, Function2<U, T, U> seqOp, Function2<U, U, U> combOp)
    public <U> U treeAggregate(U zeroValue, Function2<U, T, U> seqOp, Function2<U, U, U> combOp, int depth)
    public void foreach(VoidFunction<T> f)
    public void foreachPartition(VoidFunction<Iterator<T>> f)

    // I/O Actions
    public void saveAsTextFile(String path)
    public void saveAsTextFile(String path, Class<? extends CompressionCodec> codec)
    public void saveAsObjectFile(String path)

    // Persistence
    public JavaRDD<T> persist(StorageLevel newLevel)
    public JavaRDD<T> cache()
    public JavaRDD<T> unpersist()
    public JavaRDD<T> unpersist(boolean blocking)
    public StorageLevel getStorageLevel()
    public void checkpoint()
    public boolean isCheckpointed()
    public Optional<String> getCheckpointFile()

    // Metadata
    public JavaSparkContext context()
    public int getNumPartitions()
    public int id()
    public String name()
    public JavaRDD<T> setName(String name)
    public String toDebugString()
}
```

## JavaPairRDD

Java wrapper for pair RDDs providing key-value operations.

```java { .api }
public class JavaPairRDD<K, V> {
    // Grouping Operations
    public JavaPairRDD<K, Iterable<V>> groupByKey()
    public JavaPairRDD<K, Iterable<V>> groupByKey(int numPartitions)
    public JavaPairRDD<K, Iterable<V>> groupByKey(Partitioner partitioner)

    // Reduction Operations
    public JavaPairRDD<K, V> reduceByKey(Function2<V, V, V> func)
    public JavaPairRDD<K, V> reduceByKey(Function2<V, V, V> func, int numPartitions)
    public JavaPairRDD<K, V> reduceByKey(Partitioner partitioner, Function2<V, V, V> func)
    public JavaPairRDD<K, V> foldByKey(V zeroValue, Function2<V, V, V> func)
    public JavaPairRDD<K, V> foldByKey(V zeroValue, int numPartitions, Function2<V, V, V> func)
    public JavaPairRDD<K, V> foldByKey(V zeroValue, Partitioner partitioner, Function2<V, V, V> func)

    // Aggregation Operations
    public <U> JavaPairRDD<K, U> aggregateByKey(U zeroValue, Function2<U, V, U> seqFunc, Function2<U, U, U> combFunc)
    public <U> JavaPairRDD<K, U> aggregateByKey(U zeroValue, int numPartitions, Function2<U, V, U> seqFunc, Function2<U, U, U> combFunc)
    public <U> JavaPairRDD<K, U> aggregateByKey(U zeroValue, Partitioner partitioner, Function2<U, V, U> seqFunc, Function2<U, U, U> combFunc)
    public <C> JavaPairRDD<K, C> combineByKey(Function<V, C> createCombiner, Function2<C, V, C> mergeValue, Function2<C, C, C> mergeCombiners)
    public <C> JavaPairRDD<K, C> combineByKey(Function<V, C> createCombiner, Function2<C, V, C> mergeValue, Function2<C, C, C> mergeCombiners, int numPartitions)
    public <C> JavaPairRDD<K, C> combineByKey(Function<V, C> createCombiner, Function2<C, V, C> mergeValue, Function2<C, C, C> mergeCombiners, Partitioner partitioner)

    // Partitioning
    public JavaPairRDD<K, V> partitionBy(Partitioner partitioner)

    // Join Operations
    public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other, int numPartitions)
    public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other, Partitioner partitioner)
    public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other, int numPartitions)
    public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other, Partitioner partitioner)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other, int numPartitions)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other, Partitioner partitioner)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other, int numPartitions)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other, Partitioner partitioner)
    public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other, int numPartitions)
    public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other, Partitioner partitioner)

    // Set Operations
    public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other, int numPartitions)
    public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other, Partitioner partitioner)

    // Lookups and Collection
    public List<V> lookup(K key)
    public Map<K, V> collectAsMap()
    public Map<K, Long> countByKey()
    public PartialResult<Map<K, BoundedDouble>> countByKeyApprox(long timeout)
    public PartialResult<Map<K, BoundedDouble>> countByKeyApprox(long timeout, double confidence)

    // Value Operations
    public <U> JavaPairRDD<K, U> mapValues(Function<V, U> f)
    public <U> JavaPairRDD<K, U> flatMapValues(Function<V, Iterable<U>> f)
    public JavaRDD<K> keys()
    public JavaRDD<V> values()

    // Sorting
    public JavaPairRDD<K, V> sortByKey()
    public JavaPairRDD<K, V> sortByKey(boolean ascending)
    public JavaPairRDD<K, V> sortByKey(boolean ascending, int numPartitions)

    // Conversion
    public JavaRDD<Tuple2<K, V>> rdd()

    // I/O Operations
    public void saveAsHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends OutputFormat> outputFormatClass)
    public void saveAsHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends OutputFormat> outputFormatClass, Class<? extends CompressionCodec> codec)
    public void saveAsNewAPIHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends NewOutputFormat> outputFormatClass)
}
```

## JavaDoubleRDD

Java wrapper for RDDs of Double values providing statistical operations.

```java { .api }
public class JavaDoubleRDD {
    // Statistical Operations
    public double sum()
    public StatCounter stats()
    public double mean()
    public double variance()
    public double stdev()
    public double sampleStdev()
    public double sampleVariance()

    // Histogram Operations
    public long[] histogram(double[] buckets)
    public Tuple2<double[], long[]> histogram(int buckets)

    // Standard RDD Operations (inherited)
    public List<Double> collect()
    public long count()
    public Double first()
    public List<Double> take(int num)
    public Double reduce(Function2<Double, Double, Double> f)

    // Transformations
    public JavaDoubleRDD filter(Function<Double, Boolean> f)
    public JavaDoubleRDD map(DoubleFunction<Double> f)
    public JavaDoubleRDD cache()
    public JavaDoubleRDD persist(StorageLevel newLevel)
}
```

## Usage Examples

### Basic Java Usage
```java
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.SparkConf;
import java.util.Arrays;
import java.util.List;

SparkConf conf = new SparkConf()
    .setAppName("Java Spark Example")
    .setMaster("local[*]");

JavaSparkContext sc = new JavaSparkContext(conf);

// Create RDD from collection
List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
JavaRDD<Integer> rdd = sc.parallelize(data);

// Transformations and actions
JavaRDD<Integer> squares = rdd.map(x -> x * x);
List<Integer> result = squares.collect();

sc.stop();
```

### Key-Value Operations
```java
import org.apache.spark.api.java.JavaPairRDD;
import scala.Tuple2;

List<Tuple2<String, Integer>> pairs = Arrays.asList(
    new Tuple2<>("a", 1),
    new Tuple2<>("b", 2),
    new Tuple2<>("a", 3)
);

JavaPairRDD<String, Integer> pairRDD = sc.parallelizePairs(pairs);

// Reduce by key
JavaPairRDD<String, Integer> sums = pairRDD.reduceByKey((a, b) -> a + b);

// Collect as map
Map<String, Integer> resultMap = sums.collectAsMap();
```

### Lambda Expressions vs Function Objects
```java
// Using lambda expressions (Java 8+)
JavaRDD<Integer> mapped = rdd.map(x -> x * 2);
JavaRDD<Integer> filtered = rdd.filter(x -> x > 10);

// Using Function objects (Java 7 compatibility)
import org.apache.spark.api.java.function.Function;

JavaRDD<Integer> mapped2 = rdd.map(new Function<Integer, Integer>() {
    public Integer call(Integer x) {
        return x * 2;
    }
});
```

### Working with Text Files
```java
// Read text file
JavaRDD<String> lines = sc.textFile("hdfs://path/to/file.txt");

// Word count example
JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
JavaPairRDD<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
JavaPairRDD<String, Integer> counts = pairs.reduceByKey((a, b) -> a + b);

// Save results
counts.saveAsTextFile("hdfs://path/to/output");
```