
# Java API

Apache Spark provides comprehensive Java API compatibility through wrapper classes that expose Java-friendly interfaces for all core functionality.

## JavaSparkContext

Java-friendly wrapper for SparkContext providing the main entry point for Java applications.

```java { .api }
public class JavaSparkContext {
    // Constructors
    public JavaSparkContext(SparkConf conf)
    public JavaSparkContext(SparkContext sc)
    public JavaSparkContext(String master, String appName)
    public JavaSparkContext(String master, String appName, String sparkHome, String[] jars)

    // RDD Creation
    public <T> JavaRDD<T> parallelize(List<T> list)
    public <T> JavaRDD<T> parallelize(List<T> list, int numSlices)
    public JavaRDD<String> textFile(String path)
    public JavaRDD<String> textFile(String path, int minPartitions)
    public JavaPairRDD<String, String> wholeTextFiles(String path)
    public JavaPairRDD<String, String> wholeTextFiles(String path, int minPartitions)

    // Hadoop Integration
    public <K, V, F extends NewInputFormat<K, V>> JavaPairRDD<K, V> newAPIHadoopFile(
        String path, Class<F> fClass, Class<K> kClass, Class<V> vClass)
    public <K, V> JavaPairRDD<K, V> hadoopFile(
        String path, Class<? extends InputFormat<K, V>> inputFormatClass,
        Class<K> keyClass, Class<V> valueClass)
    public <K, V> JavaPairRDD<K, V> hadoopFile(
        String path, Class<? extends InputFormat<K, V>> inputFormatClass,
        Class<K> keyClass, Class<V> valueClass, int minPartitions)

    // Shared Variables
    public <T> Broadcast<T> broadcast(T value)
    public LongAccumulator longAccumulator()
    public LongAccumulator longAccumulator(String name)
    public DoubleAccumulator doubleAccumulator()
    public DoubleAccumulator doubleAccumulator(String name)
    public <T> CollectionAccumulator<T> collectionAccumulator()
    public <T> CollectionAccumulator<T> collectionAccumulator(String name)

    // Properties
    public SparkContext sc()
    public int defaultParallelism()
    public int defaultMinPartitions()
    public SparkStatusTracker statusTracker()

    // Lifecycle
    public void stop()
    public void close()
}
```

## JavaRDD

Java wrapper for RDD providing Java-friendly transformations and actions.

```java { .api }
public class JavaRDD<T> {
    // Transformations
    public <R> JavaRDD<R> map(Function<T, R> f)
    public <U> JavaRDD<U> flatMap(FlatMapFunction<T, U> f)
    public JavaRDD<T> filter(Function<T, Boolean> f)
    public JavaRDD<T> distinct()
    public JavaRDD<T> distinct(int numPartitions)
    public JavaRDD<T> sample(boolean withReplacement, double fraction)
    public JavaRDD<T> sample(boolean withReplacement, double fraction, long seed)
    public JavaRDD<T> union(JavaRDD<T> other)
    public JavaRDD<T> intersection(JavaRDD<T> other)
    public <U> JavaPairRDD<T, U> cartesian(JavaRDD<U> other)
    public <K> JavaPairRDD<K, Iterable<T>> groupBy(Function<T, K> f)
    public JavaRDD<String> pipe(String command)
    public JavaRDD<T> coalesce(int numPartitions)
    public JavaRDD<T> coalesce(int numPartitions, boolean shuffle)
    public JavaRDD<T> repartition(int numPartitions)
    public <S> JavaRDD<T> sortBy(Function<T, S> f, boolean ascending, int numPartitions)
    public <K> JavaPairRDD<K, T> keyBy(Function<T, K> f)

    // Actions
    public List<T> collect()
    public long count()
    public T first()
    public List<T> take(int num)
    public List<T> top(int num)
    public List<T> takeOrdered(int num)
    public List<T> takeOrdered(int num, Comparator<T> comp)
    public List<T> takeSample(boolean withReplacement, int num)
    public List<T> takeSample(boolean withReplacement, int num, long seed)
    public T reduce(Function2<T, T, T> f)
    public T fold(T zeroValue, Function2<T, T, T> op)
    public <U> U aggregate(U zeroValue, Function2<U, T, U> seqOp, Function2<U, U, U> combOp)
    public <U> U treeAggregate(U zeroValue, Function2<U, T, U> seqOp, Function2<U, U, U> combOp)
    public <U> U treeAggregate(U zeroValue, Function2<U, T, U> seqOp, Function2<U, U, U> combOp, int depth)
    public void foreach(VoidFunction<T> f)
    public void foreachPartition(VoidFunction<Iterator<T>> f)

    // I/O Actions
    public void saveAsTextFile(String path)
    public void saveAsTextFile(String path, Class<? extends CompressionCodec> codec)
    public void saveAsObjectFile(String path)

    // Persistence
    public JavaRDD<T> persist(StorageLevel newLevel)
    public JavaRDD<T> cache()
    public JavaRDD<T> unpersist()
    public JavaRDD<T> unpersist(boolean blocking)
    public StorageLevel getStorageLevel()
    public void checkpoint()
    public boolean isCheckpointed()
    public Optional<String> getCheckpointFile()

    // Metadata
    public JavaSparkContext context()
    public int getNumPartitions()
    public int id()
    public String name()
    public JavaRDD<T> setName(String name)
    public String toDebugString()
}
```

## JavaPairRDD

Java wrapper for pair RDDs providing key-value operations.

```java { .api }
public class JavaPairRDD<K, V> {
    // Grouping Operations
    public JavaPairRDD<K, Iterable<V>> groupByKey()
    public JavaPairRDD<K, Iterable<V>> groupByKey(int numPartitions)
    public JavaPairRDD<K, Iterable<V>> groupByKey(Partitioner partitioner)

    // Reduction Operations
    public JavaPairRDD<K, V> reduceByKey(Function2<V, V, V> func)
    public JavaPairRDD<K, V> reduceByKey(Function2<V, V, V> func, int numPartitions)
    public JavaPairRDD<K, V> reduceByKey(Partitioner partitioner, Function2<V, V, V> func)
    public JavaPairRDD<K, V> foldByKey(V zeroValue, Function2<V, V, V> func)
    public JavaPairRDD<K, V> foldByKey(V zeroValue, int numPartitions, Function2<V, V, V> func)
    public JavaPairRDD<K, V> foldByKey(V zeroValue, Partitioner partitioner, Function2<V, V, V> func)

    // Aggregation Operations
    public <U> JavaPairRDD<K, U> aggregateByKey(U zeroValue, Function2<U, V, U> seqFunc, Function2<U, U, U> combFunc)
    public <U> JavaPairRDD<K, U> aggregateByKey(U zeroValue, int numPartitions, Function2<U, V, U> seqFunc, Function2<U, U, U> combFunc)
    public <U> JavaPairRDD<K, U> aggregateByKey(U zeroValue, Partitioner partitioner, Function2<U, V, U> seqFunc, Function2<U, U, U> combFunc)
    public <C> JavaPairRDD<K, C> combineByKey(Function<V, C> createCombiner, Function2<C, V, C> mergeValue, Function2<C, C, C> mergeCombiners)
    public <C> JavaPairRDD<K, C> combineByKey(Function<V, C> createCombiner, Function2<C, V, C> mergeValue, Function2<C, C, C> mergeCombiners, int numPartitions)
    public <C> JavaPairRDD<K, C> combineByKey(Function<V, C> createCombiner, Function2<C, V, C> mergeValue, Function2<C, C, C> mergeCombiners, Partitioner partitioner)

    // Partitioning
    public JavaPairRDD<K, V> partitionBy(Partitioner partitioner)

    // Join Operations
    public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other, int numPartitions)
    public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other, Partitioner partitioner)
    public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other, int numPartitions)
    public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other, Partitioner partitioner)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other, int numPartitions)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other, Partitioner partitioner)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other, int numPartitions)
    public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other, Partitioner partitioner)
    public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other, int numPartitions)
    public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other, Partitioner partitioner)

    // Set Operations
    public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other)
    public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other, int numPartitions)
    public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other, Partitioner partitioner)

    // Lookups and Collection
    public List<V> lookup(K key)
    public Map<K, V> collectAsMap()
    public Map<K, Long> countByKey()
    public PartialResult<Map<K, BoundedDouble>> countByKeyApprox(long timeout)
    public PartialResult<Map<K, BoundedDouble>> countByKeyApprox(long timeout, double confidence)

    // Value Operations
    public <U> JavaPairRDD<K, U> mapValues(Function<V, U> f)
    public <U> JavaPairRDD<K, U> flatMapValues(Function<V, Iterable<U>> f)
    public JavaRDD<K> keys()
    public JavaRDD<V> values()

    // Sorting
    public JavaPairRDD<K, V> sortByKey()
    public JavaPairRDD<K, V> sortByKey(boolean ascending)
    public JavaPairRDD<K, V> sortByKey(boolean ascending, int numPartitions)

    // Conversion
    public JavaRDD<Tuple2<K, V>> rdd()

    // I/O Operations
    public void saveAsHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends OutputFormat> outputFormatClass)
    public void saveAsHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends OutputFormat> outputFormatClass, Class<? extends CompressionCodec> codec)
    public void saveAsNewAPIHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends NewOutputFormat> outputFormatClass)
}
```

## JavaDoubleRDD

Java wrapper for RDDs of Double values providing statistical operations.

```java { .api }
public class JavaDoubleRDD {
    // Statistical Operations
    public double sum()
    public StatCounter stats()
    public double mean()
    public double variance()
    public double stdev()
    public double sampleStdev()
    public double sampleVariance()

    // Histogram Operations
    public long[] histogram(double[] buckets)
    public Tuple2<double[], long[]> histogram(int buckets)

    // Standard RDD Operations (inherited)
    public List<Double> collect()
    public long count()
    public Double first()
    public List<Double> take(int num)
    public Double reduce(Function2<Double, Double, Double> f)

    // Transformations
    public JavaDoubleRDD filter(Function<Double, Boolean> f)
    public JavaDoubleRDD map(DoubleFunction<Double> f)
    public JavaDoubleRDD cache()
    public JavaDoubleRDD persist(StorageLevel newLevel)
}
```

## Usage Examples

### Basic Java Usage

```java
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.SparkConf;
import java.util.Arrays;
import java.util.List;

SparkConf conf = new SparkConf()
    .setAppName("Java Spark Example")
    .setMaster("local[*]");

JavaSparkContext sc = new JavaSparkContext(conf);

// Create RDD from collection
List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
JavaRDD<Integer> rdd = sc.parallelize(data);

// Transformations and actions
JavaRDD<Integer> squares = rdd.map(x -> x * x);
List<Integer> result = squares.collect();

sc.stop();
```

### Key-Value Operations

```java
import org.apache.spark.api.java.JavaPairRDD;
import scala.Tuple2;

List<Tuple2<String, Integer>> pairs = Arrays.asList(
    new Tuple2<>("a", 1),
    new Tuple2<>("b", 2),
    new Tuple2<>("a", 3)
);

JavaPairRDD<String, Integer> pairRDD = sc.parallelizePairs(pairs);

// Reduce by key
JavaPairRDD<String, Integer> sums = pairRDD.reduceByKey((a, b) -> a + b);

// Collect as map
Map<String, Integer> resultMap = sums.collectAsMap();
```

### Lambda Expressions vs Function Objects

```java
// Using lambda expressions (Java 8+)
JavaRDD<Integer> mapped = rdd.map(x -> x * 2);
JavaRDD<Integer> filtered = rdd.filter(x -> x > 10);

// Using Function objects (Java 7 compatibility)
import org.apache.spark.api.java.function.Function;

JavaRDD<Integer> mapped2 = rdd.map(new Function<Integer, Integer>() {
    public Integer call(Integer x) {
        return x * 2;
    }
});
```

### Working with Text Files

```java
// Read text file
JavaRDD<String> lines = sc.textFile("hdfs://path/to/file.txt");

// Word count example
JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
JavaPairRDD<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
JavaPairRDD<String, Integer> counts = pairs.reduceByKey((a, b) -> a + b);

// Save results
counts.saveAsTextFile("hdfs://path/to/output");
```