or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

accumulators.md application-context.md broadcast-variables.md index.md java-api.md partitioning.md rdd-operations.md serialization.md storage-persistence.md

docs/java-api.md

0

# Java API

1

2

Java-friendly wrappers for Spark functionality providing type-safe distributed processing and seamless integration with Java applications.

3

4

## Capabilities

5

6

### JavaSparkContext

7

8

Java-friendly version of SparkContext providing the main entry point for Java Spark applications.

9

10

```java { .api }

11

/**

12

* Java-friendly wrapper for SparkContext

13

*/

14

public class JavaSparkContext {

15

/** Create JavaSparkContext from SparkContext */

16

public JavaSparkContext(SparkContext sc)

17

18

/** Create JavaSparkContext from SparkConf */

19

public JavaSparkContext(SparkConf conf)

20

21

/** Create JavaSparkContext with master URL and app name */

22

public JavaSparkContext(String master, String appName)

23

24

/** Create RDD from Java collection */

25

public <T> JavaRDD<T> parallelize(java.util.List<T> list)

26

public <T> JavaRDD<T> parallelize(java.util.List<T> list, int numSlices)

27

28

/** Create pair RDD from Java collection */

29

public <K, V> JavaPairRDD<K, V> parallelizePairs(java.util.List<scala.Tuple2<K, V>> list)

30

public <K, V> JavaPairRDD<K, V> parallelizePairs(java.util.List<scala.Tuple2<K, V>> list, int numSlices)

31

32

/** Read text file */

33

public JavaRDD<String> textFile(String path)

34

public JavaRDD<String> textFile(String path, int minPartitions)

35

36

/** Read whole text files */

37

public JavaPairRDD<String, String> wholeTextFiles(String path)

38

public JavaPairRDD<String, String> wholeTextFiles(String path, int minPartitions)

39

40

/** Create RDD from Hadoop InputFormat */

41

public <K, V> JavaPairRDD<K, V> hadoopRDD(

42

JobConf conf,

43

Class<? extends InputFormat<K, V>> inputFormatClass,

44

Class<K> keyClass,

45

Class<V> valueClass

46

)

47

48

/** Create RDD from new Hadoop InputFormat */

49

public <K, V> JavaPairRDD<K, V> newAPIHadoopRDD(

50

Configuration conf,

51

Class<? extends NewInputFormat<K, V>> fClass,

52

Class<K> kClass,

53

Class<V> vClass

54

)

55

56

/** Create broadcast variable */

57

public <T> Broadcast<T> broadcast(T value)

58

59

/** Create accumulator */

60

public LongAccumulator longAccumulator()

61

public LongAccumulator longAccumulator(String name)

62

public DoubleAccumulator doubleAccumulator()

63

public DoubleAccumulator doubleAccumulator(String name)

64

public <T> CollectionAccumulator<T> collectionAccumulator()

65

public <T> CollectionAccumulator<T> collectionAccumulator(String name)

66

67

/** Add file to Spark job */

68

public void addFile(String path)

69

public void addFile(String path, boolean recursive)

70

71

/** Add JAR file */

72

public void addJar(String path)

73

74

/** Set checkpoint directory */

75

public void setCheckpointDir(String dir)

76

77

/** Get underlying SparkContext */

78

public SparkContext sc()

79

80

/** Get status tracker */

81

public JavaSparkStatusTracker statusTracker()

82

83

/** Stop JavaSparkContext */

84

public void stop()

85

86

/** Close JavaSparkContext (same as stop) */

87

public void close()

88

}

89

```

90

91

### JavaRDD

92

93

Java-friendly wrapper for RDD providing type-safe distributed operations.

94

95

```java { .api }

96

/**

97

* Java-friendly wrapper for RDD

98

*/

99

public class JavaRDD<T> {

100

// Transformations

101

102

/** Transform each element */

103

public <R> JavaRDD<R> map(org.apache.spark.api.java.function.Function<T, R> f)

104

105

/** Transform and flatten */

106

public <R> JavaRDD<R> flatMap(org.apache.spark.api.java.function.FlatMapFunction<T, R> f)

107

108

/** Filter elements */

109

public JavaRDD<T> filter(org.apache.spark.api.java.function.Function<T, Boolean> f)

110

111

/** Map with partition index */

112

public <R> JavaRDD<R> mapPartitionsWithIndex(

113

org.apache.spark.api.java.function.Function2<Integer, java.util.Iterator<T>, java.util.Iterator<R>> f,

114

boolean preservesPartitioning

115

)

116

117

/** Sample elements */

118

public JavaRDD<T> sample(boolean withReplacement, double fraction)

119

public JavaRDD<T> sample(boolean withReplacement, double fraction, long seed)

120

121

/** Union with another RDD */

122

public JavaRDD<T> union(JavaRDD<T> other)

123

124

/** Intersection with another RDD */

125

public JavaRDD<T> intersection(JavaRDD<T> other)

126

127

/** Get distinct elements */

128

public JavaRDD<T> distinct()

129

public JavaRDD<T> distinct(int numPartitions)

130

131

/** Group by key function */

132

public <K> JavaPairRDD<K, Iterable<T>> groupBy(Function<T, K> f)

133

134

/** Coalesce partitions */

135

public JavaRDD<T> coalesce(int numPartitions)

136

public JavaRDD<T> coalesce(int numPartitions, boolean shuffle)

137

138

/** Repartition */

139

public JavaRDD<T> repartition(int numPartitions)

140

141

/** Sort by key function */

142

public <S> JavaRDD<T> sortBy(Function<T, S> f, boolean ascending, int numPartitions)

143

144

/** Zip with another RDD */

145

public <U> JavaPairRDD<T, U> zip(JavaRDD<U> other)

146

147

/** Zip with indices */

148

public JavaPairRDD<T, Long> zipWithIndex()

149

150

/** Zip with unique IDs */

151

public JavaPairRDD<T, Long> zipWithUniqueId()

152

153

/** Map to pair RDD */

154

public <K, V> JavaPairRDD<K, V> mapToPair(PairFunction<T, K, V> f)

155

156

// Actions

157

158

/** Collect all elements */

159

public List<T> collect()

160

161

/** Count elements */

162

public long count()

163

164

/** Get first element */

165

public T first()

166

167

/** Take first n elements */

168

public List<T> take(int num)

169

170

/** Take ordered elements */

171

public List<T> takeOrdered(int num)

172

public List<T> takeOrdered(int num, Comparator<T> comp)

173

174

/** Take random sample */

175

public List<T> takeSample(boolean withReplacement, int num)

176

public List<T> takeSample(boolean withReplacement, int num, long seed)

177

178

/** Reduce elements */

179

public T reduce(org.apache.spark.api.java.function.Function2<T, T, T> f)

180

181

/** Fold with zero value */

182

public T fold(T zeroValue, org.apache.spark.api.java.function.Function2<T, T, T> op)

183

184

/** Aggregate with different types */

185

public <U> U aggregate(U zeroValue, org.apache.spark.api.java.function.Function2<U, T, U> seqOp, org.apache.spark.api.java.function.Function2<U, U, U> combOp)

186

187

/** Tree reduce */

188

public T treeReduce(org.apache.spark.api.java.function.Function2<T, T, T> f)

189

190

/** Tree aggregate */

191

public <U> U treeAggregate(

192

U zeroValue,

193

org.apache.spark.api.java.function.Function2<U, T, U> seqOp,

194

org.apache.spark.api.java.function.Function2<U, U, U> combOp,

195

int depth

196

)

197

198

/** Apply function to each element */

199

public void foreach(VoidFunction<T> f)

200

201

/** Apply function to each partition */

202

public void foreachPartition(VoidFunction<Iterator<T>> f)

203

204

/** Count by value */

205

public Map<T, Long> countByValue()

206

207

/** Save as text file */

208

public void saveAsTextFile(String path)

209

public void saveAsTextFile(String path, Class<? extends CompressionCodec> codec)

210

211

// Persistence

212

213

/** Persist with storage level */

214

public JavaRDD<T> persist(StorageLevel newLevel)

215

216

/** Cache in memory */

217

public JavaRDD<T> cache()

218

219

/** Unpersist */

220

public JavaRDD<T> unpersist()

221

public JavaRDD<T> unpersist(boolean blocking)

222

223

/** Checkpoint */

224

public void checkpoint()

225

226

/** Check if empty */

227

public boolean isEmpty()

228

229

// Metadata

230

231

/** Get partitions */

232

public List<Partition> partitions()

233

234

/** Get storage level */

235

public StorageLevel getStorageLevel()

236

237

/** Convert to Scala RDD */

238

public RDD<T> rdd()

239

}

240

```

241

242

### JavaPairRDD

243

244

Java-friendly wrapper for pair RDDs providing key-value operations.

245

246

```java { .api }

247

/**

248

* Java-friendly wrapper for pair RDD

249

*/

250

public class JavaPairRDD<K, V> {

251

// Transformations

252

253

/** Map values */

254

public <W> JavaPairRDD<K, W> mapValues(Function<V, W> f)

255

256

/** Flat map values */

257

public <W> JavaPairRDD<K, W> flatMapValues(Function<V, Iterable<W>> f)

258

259

/** Map to different key-value pairs */

260

public <K2, V2> JavaPairRDD<K2, V2> mapToPair(PairFunction<Tuple2<K, V>, K2, V2> f)

261

262

/** Group by key */

263

public JavaPairRDD<K, Iterable<V>> groupByKey()

264

public JavaPairRDD<K, Iterable<V>> groupByKey(int numPartitions)

265

public JavaPairRDD<K, Iterable<V>> groupByKey(Partitioner partitioner)

266

267

/** Reduce by key */

268

public JavaPairRDD<K, V> reduceByKey(org.apache.spark.api.java.function.Function2<V, V, V> func)

269

public JavaPairRDD<K, V> reduceByKey(org.apache.spark.api.java.function.Function2<V, V, V> func, int numPartitions)

270

public JavaPairRDD<K, V> reduceByKey(Partitioner partitioner, org.apache.spark.api.java.function.Function2<V, V, V> func)

271

272

/** Aggregate by key */

273

public <U> JavaPairRDD<K, U> aggregateByKey(

274

U zeroValue,

275

org.apache.spark.api.java.function.Function2<U, V, U> seqFunc,

276

org.apache.spark.api.java.function.Function2<U, U, U> combFunc

277

)

278

public <U> JavaPairRDD<K, U> aggregateByKey(

279

U zeroValue,

280

int numPartitions,

281

org.apache.spark.api.java.function.Function2<U, V, U> seqFunc,

282

org.apache.spark.api.java.function.Function2<U, U, U> combFunc

283

)

284

285

/** Fold by key */

286

public JavaPairRDD<K, V> foldByKey(V zeroValue, org.apache.spark.api.java.function.Function2<V, V, V> func)

287

public JavaPairRDD<K, V> foldByKey(V zeroValue, int numPartitions, org.apache.spark.api.java.function.Function2<V, V, V> func)

288

289

/** Combine by key */

290

public <C> JavaPairRDD<K, C> combineByKey(

291

org.apache.spark.api.java.function.Function<V, C> createCombiner,

292

org.apache.spark.api.java.function.Function2<C, V, C> mergeValue,

293

org.apache.spark.api.java.function.Function2<C, C, C> mergeCombiners

294

)

295

296

/** Join operations */

297

public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other)

298

public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other, int numPartitions)

299

public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other)

300

public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other)

301

public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other)

302

303

/** Cogroup operations */

304

public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other)

305

306

/** Sort by key */

307

public JavaPairRDD<K, V> sortByKey()

308

public JavaPairRDD<K, V> sortByKey(boolean ascending)

309

public JavaPairRDD<K, V> sortByKey(boolean ascending, int numPartitions)

310

311

/** Get keys */

312

public JavaRDD<K> keys()

313

314

/** Get values */

315

public JavaRDD<V> values()

316

317

/** Subtract by key */

318

public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other)

319

320

// Actions

321

322

/** Lookup values for key */

323

public List<V> lookup(K key)

324

325

/** Collect as map */

326

public Map<K, V> collectAsMap()

327

328

/** Count by key */

329

public Map<K, Long> countByKey()

330

331

/** Count by key approximately */

332

public PartialResult<Map<K, BoundedDouble>> countByKeyApprox(long timeout)

333

334

/** Save as Hadoop file */

335

public void saveAsHadoopFile(

336

String path,

337

Class<?> keyClass,

338

Class<?> valueClass,

339

Class<? extends OutputFormat> outputFormatClass

340

)

341

342

/** Save as new API Hadoop file */

343

public void saveAsNewAPIHadoopFile(

344

String path,

345

Class<?> keyClass,

346

Class<?> valueClass,

347

Class<? extends NewOutputFormat> outputFormatClass

348

)

349

}

350

```

351

352

### JavaDoubleRDD

353

354

Java-friendly wrapper for RDDs of doubles providing statistical operations.

355

356

```java { .api }

357

/**

358

* Java-friendly wrapper for RDD of doubles

359

*/

360

public class JavaDoubleRDD {

361

/** Compute mean */

362

public double mean()

363

364

/** Compute variance */

365

public double variance()

366

367

/** Compute standard deviation */

368

public double stdev()

369

370

/** Compute sum */

371

public double sum()

372

373

/** Compute statistics */

374

public StatCounter stats()

375

376

/** Compute histogram */

377

public Tuple2<double[], long[]> histogram(int buckets)

378

public long[] histogram(double[] buckets)

379

380

/** Sum approximately */

381

public PartialResult<BoundedDouble> sumApprox(long timeout)

382

383

/** Mean approximately */

384

public PartialResult<BoundedDouble> meanApprox(long timeout)

385

}

386

```

387

388

### Function Interfaces

389

390

Java 8-compatible functional interfaces (usable as lambda targets) for transformations and actions.

391

392

```java { .api }

393

/** Function interface for map operations */

394

@FunctionalInterface

395

public interface Function<T, R> extends Serializable {

396

R call(T t) throws Exception;

397

}

398

399

/** Function interface for pair transformations */

400

@FunctionalInterface

401

public interface PairFunction<T, K, V> extends Serializable {

402

Tuple2<K, V> call(T t) throws Exception;

403

}

404

405

/** Function interface for flat map operations */

406

@FunctionalInterface

407

public interface FlatMapFunction<T, R> extends Serializable {

408

Iterator<R> call(T t) throws Exception;

409

}

410

411

/** Function interface for two-argument operations */

412

@FunctionalInterface

413

public interface Function2<T1, T2, R> extends Serializable {

414

R call(T1 t1, T2 t2) throws Exception;

415

}

416

417

/** Function interface for void operations */

418

@FunctionalInterface

419

public interface VoidFunction<T> extends Serializable {

420

void call(T t) throws Exception;

421

}

422

```

423

424

### Storage Levels for Java

425

426

```java { .api }

427

/**

428

* Storage level constants for Java API

429

*/

430

public class StorageLevels {

431

public static final StorageLevel MEMORY_ONLY = StorageLevel.MEMORY_ONLY();

432

public static final StorageLevel MEMORY_AND_DISK = StorageLevel.MEMORY_AND_DISK();

433

public static final StorageLevel MEMORY_ONLY_SER = StorageLevel.MEMORY_ONLY_SER();

434

public static final StorageLevel MEMORY_AND_DISK_SER = StorageLevel.MEMORY_AND_DISK_SER();

435

public static final StorageLevel DISK_ONLY = StorageLevel.DISK_ONLY();

436

public static final StorageLevel MEMORY_ONLY_2 = StorageLevel.MEMORY_ONLY_2();

437

public static final StorageLevel MEMORY_AND_DISK_2 = StorageLevel.MEMORY_AND_DISK_2();

438

}

439

```

440

441

**Usage Examples:**

442

443

```java

444

import org.apache.spark.SparkConf;

445

import org.apache.spark.api.java.JavaSparkContext;

446

import org.apache.spark.api.java.JavaRDD;

447

import org.apache.spark.api.java.JavaPairRDD;

448

import scala.Tuple2;

449

450

import java.util.Arrays;

451

import java.util.List;

452

453

// Setup

454

SparkConf conf = new SparkConf()

455

.setAppName("Java Spark Example")

456

.setMaster("local[*]");

457

JavaSparkContext sc = new JavaSparkContext(conf);

458

459

// Create RDD

460

List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);

461

JavaRDD<Integer> rdd = sc.parallelize(data);

462

463

// Transformations

464

JavaRDD<Integer> squares = rdd.map(x -> x * x);

465

JavaRDD<Integer> evens = rdd.filter(x -> x % 2 == 0);

466

467

// Pair operations

468

JavaPairRDD<String, Integer> pairs = rdd.mapToPair(x -> new Tuple2<>("num", x));

469

JavaPairRDD<String, Integer> sums = pairs.reduceByKey((a, b) -> a + b);

470

471

// Actions

472

List<Integer> result = squares.collect();

473

long count = rdd.count();

474

int sum = rdd.reduce((a, b) -> a + b);

475

476

// Cleanup

477

sc.close();

478

```