Tessl Tile for maven/org.apache.spark/spark-core_2.13@4.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

accumulators.md application-context.md broadcast-variables.md index.md java-api.md partitioning.md rdd-operations.md serialization.md storage-persistence.md

java-api.mddocs/

0
# Java API
1

2
Java-friendly wrappers for Spark functionality providing type-safe distributed processing and seamless integration with Java applications.
3

4
## Capabilities
5

6
### JavaSparkContext
7

8
Java-friendly version of SparkContext providing the main entry point for Java Spark applications.
9

10
```java { .api }
11
/**
12
 * Java-friendly wrapper for SparkContext
13
 */
14
public class JavaSparkContext {
15
  /** Create JavaSparkContext from SparkContext */
16
  public JavaSparkContext(SparkContext sc)
17
  
18
  /** Create JavaSparkContext from SparkConf */
19
  public JavaSparkContext(SparkConf conf)
20
  
21
  /** Create JavaSparkContext with master URL and app name */
22
  public JavaSparkContext(String master, String appName)
23
  
24
  /** Create RDD from Java collection */
25
  public <T> JavaRDD<T> parallelize(java.util.List<T> list)
26
  public <T> JavaRDD<T> parallelize(java.util.List<T> list, int numSlices)
27
  
28
  /** Create pair RDD from Java collection */
29
  public <K, V> JavaPairRDD<K, V> parallelizePairs(java.util.List<scala.Tuple2<K, V>> list)
30
  public <K, V> JavaPairRDD<K, V> parallelizePairs(java.util.List<scala.Tuple2<K, V>> list, int numSlices)
31
  
32
  /** Read text file */
33
  public JavaRDD<String> textFile(String path)
34
  public JavaRDD<String> textFile(String path, int minPartitions)
35
  
36
  /** Read whole text files */
37
  public JavaPairRDD<String, String> wholeTextFiles(String path)
38
  public JavaPairRDD<String, String> wholeTextFiles(String path, int minPartitions)
39
  
40
  /** Create RDD from Hadoop InputFormat */
41
  public <K, V> JavaPairRDD<K, V> hadoopRDD(
42
    JobConf conf,
43
    Class<? extends InputFormat<K, V>> inputFormatClass,
44
    Class<K> keyClass,
45
    Class<V> valueClass
46
  )
47
  
48
  /** Create RDD from new Hadoop InputFormat */
49
  public <K, V> JavaPairRDD<K, V> newAPIHadoopRDD(
50
    Configuration conf,
51
    Class<? extends NewInputFormat<K, V>> fClass,
52
    Class<K> kClass,
53
    Class<V> vClass
54
  )
55
  
56
  /** Create broadcast variable */
57
  public <T> Broadcast<T> broadcast(T value)
58
  
59
  /** Create accumulator */
60
  public LongAccumulator longAccumulator()
61
  public LongAccumulator longAccumulator(String name)
62
  public DoubleAccumulator doubleAccumulator()
63
  public DoubleAccumulator doubleAccumulator(String name)
64
  public <T> CollectionAccumulator<T> collectionAccumulator()
65
  public <T> CollectionAccumulator<T> collectionAccumulator(String name)
66
  
67
  /** Add file to Spark job */
68
  public void addFile(String path)
69
  public void addFile(String path, boolean recursive)
70
  
71
  /** Add JAR file */
72
  public void addJar(String path)
73
  
74
  /** Set checkpoint directory */
75
  public void setCheckpointDir(String dir)
76
  
77
  /** Get underlying SparkContext */
78
  public SparkContext sc()
79
  
80
  /** Get status tracker */
81
  public JavaSparkStatusTracker statusTracker()
82
  
83
  /** Stop JavaSparkContext */
84
  public void stop()
85
  
86
  /** Close JavaSparkContext (same as stop) */
87
  public void close()
88
}
89
```
90

91
### JavaRDD
92

93
Java-friendly wrapper for RDD providing type-safe distributed operations.
94

95
```java { .api }
96
/**
97
 * Java-friendly wrapper for RDD
98
 */
99
public class JavaRDD<T> {
100
  // Transformations
101
  
102
  /** Transform each element */
103
  public <R> JavaRDD<R> map(org.apache.spark.api.java.function.Function<T, R> f)
104
  
105
  /** Transform and flatten */
106
  public <R> JavaRDD<R> flatMap(org.apache.spark.api.java.function.FlatMapFunction<T, R> f)
107
  
108
  /** Filter elements */
109
  public JavaRDD<T> filter(org.apache.spark.api.java.function.Function<T, Boolean> f)
110
  
111
  /** Map with partition index */
112
  public <R> JavaRDD<R> mapPartitionsWithIndex(
113
    org.apache.spark.api.java.function.Function2<Integer, java.util.Iterator<T>, java.util.Iterator<R>> f,
114
    boolean preservesPartitioning
115
  )
116
  
117
  /** Sample elements */
118
  public JavaRDD<T> sample(boolean withReplacement, double fraction)
119
  public JavaRDD<T> sample(boolean withReplacement, double fraction, long seed)
120
  
121
  /** Union with another RDD */
122
  public JavaRDD<T> union(JavaRDD<T> other)
123
  
124
  /** Intersection with another RDD */
125
  public JavaRDD<T> intersection(JavaRDD<T> other)
126
  
127
  /** Get distinct elements */
128
  public JavaRDD<T> distinct()
129
  public JavaRDD<T> distinct(int numPartitions)
130
  
131
  /** Group by key function */
132
  public <K> JavaPairRDD<K, Iterable<T>> groupBy(Function<T, K> f)
133
  
134
  /** Coalesce partitions */
135
  public JavaRDD<T> coalesce(int numPartitions)
136
  public JavaRDD<T> coalesce(int numPartitions, boolean shuffle)
137
  
138
  /** Repartition */
139
  public JavaRDD<T> repartition(int numPartitions)
140
  
141
  /** Sort by key function */
142
  public <S> JavaRDD<T> sortBy(Function<T, S> f, boolean ascending, int numPartitions)
143
  
144
  /** Zip with another RDD */
145
  public <U> JavaPairRDD<T, U> zip(JavaRDD<U> other)
146
  
147
  /** Zip with indices */
148
  public JavaPairRDD<T, Long> zipWithIndex()
149
  
150
  /** Zip with unique IDs */
151
  public JavaPairRDD<T, Long> zipWithUniqueId()
152
  
153
  /** Map to pair RDD */
154
  public <K, V> JavaPairRDD<K, V> mapToPair(PairFunction<T, K, V> f)
155
  
156
  // Actions
157
  
158
  /** Collect all elements */
159
  public List<T> collect()
160
  
161
  /** Count elements */
162
  public long count()
163
  
164
  /** Get first element */
165
  public T first()
166
  
167
  /** Take first n elements */
168
  public List<T> take(int num)
169
  
170
  /** Take ordered elements */
171
  public List<T> takeOrdered(int num)
172
  public List<T> takeOrdered(int num, Comparator<T> comp)
173
  
174
  /** Take random sample */
175
  public List<T> takeSample(boolean withReplacement, int num)
176
  public List<T> takeSample(boolean withReplacement, int num, long seed)
177
  
178
  /** Reduce elements */
179
  public T reduce(org.apache.spark.api.java.function.Function2<T, T, T> f)
180
  
181
  /** Fold with zero value */
182
  public T fold(T zeroValue, org.apache.spark.api.java.function.Function2<T, T, T> op)
183
  
184
  /** Aggregate with different types */
185
  public <U> U aggregate(U zeroValue, org.apache.spark.api.java.function.Function2<U, T, U> seqOp, org.apache.spark.api.java.function.Function2<U, U, U> combOp)
186
  
187
  /** Tree reduce */
188
  public T treeReduce(org.apache.spark.api.java.function.Function2<T, T, T> f)
189
  
190
  /** Tree aggregate */
191
  public <U> U treeAggregate(
192
    U zeroValue,
193
    org.apache.spark.api.java.function.Function2<U, T, U> seqOp,
194
    org.apache.spark.api.java.function.Function2<U, U, U> combOp,
195
    int depth
196
  )
197
  
198
  /** Apply function to each element */
199
  public void foreach(VoidFunction<T> f)
200
  
201
  /** Apply function to each partition */
202
  public void foreachPartition(VoidFunction<Iterator<T>> f)
203
  
204
  /** Count by value */
205
  public Map<T, Long> countByValue()
206
  
207
  /** Save as text file */
208
  public void saveAsTextFile(String path)
209
  public void saveAsTextFile(String path, Class<? extends CompressionCodec> codec)
210
  
211
  // Persistence
212
  
213
  /** Persist with storage level */
214
  public JavaRDD<T> persist(StorageLevel newLevel)
215
  
216
  /** Cache in memory */
217
  public JavaRDD<T> cache()
218
  
219
  /** Unpersist */
220
  public JavaRDD<T> unpersist()
221
  public JavaRDD<T> unpersist(boolean blocking)
222
  
223
  /** Checkpoint */
224
  public void checkpoint()
225
  
226
  /** Check if empty */
227
  public boolean isEmpty()
228
  
229
  // Metadata
230
  
231
  /** Get partitions */
232
  public List<Partition> partitions()
233
  
234
  /** Get storage level */
235
  public StorageLevel getStorageLevel()
236
  
237
  /** Convert to Scala RDD */
238
  public RDD<T> rdd()
239
}
240
```
241

242
### JavaPairRDD
243

244
Java-friendly wrapper for pair RDDs providing key-value operations.
245

246
```java { .api }
247
/**
248
 * Java-friendly wrapper for pair RDD
249
 */
250
public class JavaPairRDD<K, V> {
251
  // Transformations
252
  
253
  /** Map values */
254
  public <W> JavaPairRDD<K, W> mapValues(Function<V, W> f)
255
  
256
  /** Flat map values */
257
  public <W> JavaPairRDD<K, W> flatMapValues(FlatMapFunction<V, W> f)
258
  
259
  /** Map to different key-value pairs */
260
  public <K2, V2> JavaPairRDD<K2, V2> mapToPair(PairFunction<Tuple2<K, V>, K2, V2> f)
261
  
262
  /** Group by key */
263
  public JavaPairRDD<K, Iterable<V>> groupByKey()
264
  public JavaPairRDD<K, Iterable<V>> groupByKey(int numPartitions)
265
  public JavaPairRDD<K, Iterable<V>> groupByKey(Partitioner partitioner)
266
  
267
  /** Reduce by key */
268
  public JavaPairRDD<K, V> reduceByKey(org.apache.spark.api.java.function.Function2<V, V, V> func)
269
  public JavaPairRDD<K, V> reduceByKey(org.apache.spark.api.java.function.Function2<V, V, V> func, int numPartitions)
270
  public JavaPairRDD<K, V> reduceByKey(Partitioner partitioner, org.apache.spark.api.java.function.Function2<V, V, V> func)
271
  
272
  /** Aggregate by key */
273
  public <U> JavaPairRDD<K, U> aggregateByKey(
274
    U zeroValue,
275
    org.apache.spark.api.java.function.Function2<U, V, U> seqFunc,
276
    org.apache.spark.api.java.function.Function2<U, U, U> combFunc
277
  )
278
  public <U> JavaPairRDD<K, U> aggregateByKey(
279
    U zeroValue,
280
    int numPartitions,
281
    org.apache.spark.api.java.function.Function2<U, V, U> seqFunc,
282
    org.apache.spark.api.java.function.Function2<U, U, U> combFunc
283
  )
284
  
285
  /** Fold by key */
286
  public JavaPairRDD<K, V> foldByKey(V zeroValue, org.apache.spark.api.java.function.Function2<V, V, V> func)
287
  public JavaPairRDD<K, V> foldByKey(V zeroValue, int numPartitions, org.apache.spark.api.java.function.Function2<V, V, V> func)
288
  
289
  /** Combine by key */
290
  public <C> JavaPairRDD<K, C> combineByKey(
291
    org.apache.spark.api.java.function.Function<V, C> createCombiner,
292
    org.apache.spark.api.java.function.Function2<C, V, C> mergeValue,
293
    org.apache.spark.api.java.function.Function2<C, C, C> mergeCombiners
294
  )
295
  
296
  /** Join operations */
297
  public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other)
298
  public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other, int numPartitions)
299
  public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other)
300
  public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other)
301
  public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other)
302
  
303
  /** Cogroup operations */
304
  public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other)
305
  
306
  /** Sort by key */
307
  public JavaPairRDD<K, V> sortByKey()
308
  public JavaPairRDD<K, V> sortByKey(boolean ascending)
309
  public JavaPairRDD<K, V> sortByKey(boolean ascending, int numPartitions)
310
  
311
  /** Get keys */
312
  public JavaRDD<K> keys()
313
  
314
  /** Get values */
315
  public JavaRDD<V> values()
316
  
317
  /** Subtract by key */
318
  public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other)
319
  
320
  // Actions
321
  
322
  /** Lookup values for key */
323
  public List<V> lookup(K key)
324
  
325
  /** Collect as map */
326
  public Map<K, V> collectAsMap()
327
  
328
  /** Count by key */
329
  public Map<K, Long> countByKey()
330
  
331
  /** Count by key approximately */
332
  public PartialResult<Map<K, BoundedDouble>> countByKeyApprox(long timeout)
333
  
334
  /** Save as Hadoop file */
335
  public void saveAsHadoopFile(
336
    String path,
337
    Class<?> keyClass,
338
    Class<?> valueClass,
339
    Class<? extends OutputFormat> outputFormatClass
340
  )
341
  
342
  /** Save as new API Hadoop file */
343
  public void saveAsNewAPIHadoopFile(
344
    String path,
345
    Class<?> keyClass,
346
    Class<?> valueClass,
347
    Class<? extends NewOutputFormat> outputFormatClass
348
  )
349
}
350
```
351

352
### JavaDoubleRDD
353

354
Java-friendly wrapper for RDDs of doubles providing statistical operations.
355

356
```java { .api }
357
/**
358
 * Java-friendly wrapper for RDD of doubles
359
 */
360
public class JavaDoubleRDD {
361
  /** Compute mean */
362
  public double mean()
363
  
364
  /** Compute variance */
365
  public double variance()
366
  
367
  /** Compute standard deviation */
368
  public double stdev()
369
  
370
  /** Compute sum */
371
  public double sum()
372
  
373
  /** Compute statistics */
374
  public StatCounter stats()
375
  
376
  /** Compute histogram */
377
  public Tuple2<double[], long[]> histogram(int buckets)
378
  public long[] histogram(double[] buckets)
379
  
380
  /** Sum approximately */
381
  public PartialResult<BoundedDouble> sumApprox(long timeout)
382
  
383
  /** Mean approximately */
384
  public PartialResult<BoundedDouble> meanApprox(long timeout)
385
}
386
```
387

388
### Function Interfaces
389

390
Java 8 compatible function interfaces for transformations.
391

392
```java { .api }
393
/** Function interface for map operations */
394
@FunctionalInterface
395
public interface Function<T, R> extends Serializable {
396
  R call(T t) throws Exception;
397
}
398

399
/** Function interface for pair transformations */
400
@FunctionalInterface
401
public interface PairFunction<T, K, V> extends Serializable {
402
  Tuple2<K, V> call(T t) throws Exception;
403
}
404

405
/** Function interface for flat map operations */
406
@FunctionalInterface
407
public interface FlatMapFunction<T, R> extends Serializable {
408
  Iterator<R> call(T t) throws Exception;
409
}
410

411
/** Function interface for two-argument operations */
412
@FunctionalInterface
413
public interface Function2<T1, T2, R> extends Serializable {
414
  R call(T1 t1, T2 t2) throws Exception;
415
}
416

417
/** Function interface for void operations */
418
@FunctionalInterface
419
public interface VoidFunction<T> extends Serializable {
420
  void call(T t) throws Exception;
421
}
422
```
423

424
### Storage Levels for Java
425

426
```java { .api }
427
/**
428
 * Storage level constants for Java API
429
 */
430
public class StorageLevels {
431
  public static final StorageLevel MEMORY_ONLY = StorageLevel.MEMORY_ONLY();
432
  public static final StorageLevel MEMORY_AND_DISK = StorageLevel.MEMORY_AND_DISK();
433
  public static final StorageLevel MEMORY_ONLY_SER = StorageLevel.MEMORY_ONLY_SER();
434
  public static final StorageLevel MEMORY_AND_DISK_SER = StorageLevel.MEMORY_AND_DISK_SER();
435
  public static final StorageLevel DISK_ONLY = StorageLevel.DISK_ONLY();
436
  public static final StorageLevel MEMORY_ONLY_2 = StorageLevel.MEMORY_ONLY_2();
437
  public static final StorageLevel MEMORY_AND_DISK_2 = StorageLevel.MEMORY_AND_DISK_2();
438
}
439
```
440

441
**Usage Examples:**
442

443
```java
444
import org.apache.spark.SparkConf;
445
import org.apache.spark.api.java.JavaSparkContext;
446
import org.apache.spark.api.java.JavaRDD;
447
import org.apache.spark.api.java.JavaPairRDD;
448
import scala.Tuple2;
449

450
import java.util.Arrays;
451
import java.util.List;
452

453
// Setup
454
SparkConf conf = new SparkConf()
455
    .setAppName("Java Spark Example")
456
    .setMaster("local[*]");
457
JavaSparkContext sc = new JavaSparkContext(conf);
458

459
// Create RDD
460
List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
461
JavaRDD<Integer> rdd = sc.parallelize(data);
462

463
// Transformations
464
JavaRDD<Integer> squares = rdd.map(x -> x * x);
465
JavaRDD<Integer> evens = rdd.filter(x -> x % 2 == 0);
466

467
// Pair operations
468
JavaPairRDD<String, Integer> pairs = rdd.mapToPair(x -> new Tuple2<>("num", x));
469
JavaPairRDD<String, Integer> sums = pairs.reduceByKey((a, b) -> a + b);
470

471
// Actions
472
List<Integer> result = squares.collect();
473
long count = rdd.count();
474
int sum = rdd.reduce((a, b) -> a + b);
475

476
// Cleanup
477
sc.close();
478
```

Version

Tile

Files

java-api.mddocs/

java-api.mddocs/