or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

context-config.md index.md java-api.md rdd-operations.md resource-management.md serialization.md shared-variables.md storage-caching.md task-context.md
tile.json

docs/java-api.md

Java API

Apache Spark exposes its core functionality to Java through wrapper classes that present Java-friendly interfaces, such as JavaSparkContext, JavaRDD, JavaPairRDD, and JavaDoubleRDD.

JavaSparkContext

Java-friendly wrapper for SparkContext providing the main entry point for Java applications.

public class JavaSparkContext {
  // Constructors
  public JavaSparkContext(SparkConf conf)
  public JavaSparkContext(SparkContext sc)
  public JavaSparkContext(String master, String appName)
  public JavaSparkContext(String master, String appName, String sparkHome, String[] jars)
  
  // RDD Creation
  public <T> JavaRDD<T> parallelize(List<T> list)
  public <T> JavaRDD<T> parallelize(List<T> list, int numSlices)
  public JavaRDD<String> textFile(String path)
  public JavaRDD<String> textFile(String path, int minPartitions)
  public JavaPairRDD<String, String> wholeTextFiles(String path)
  public JavaPairRDD<String, String> wholeTextFiles(String path, int minPartitions)
  
  // Hadoop Integration
  public <K, V, F extends NewInputFormat<K, V>> JavaPairRDD<K, V> newAPIHadoopFile(
    String path, Class<F> fClass, Class<K> kClass, Class<V> vClass)
  public <K, V> JavaPairRDD<K, V> hadoopFile(
    String path, Class<? extends InputFormat<K, V>> inputFormatClass,
    Class<K> keyClass, Class<V> valueClass)
  public <K, V> JavaPairRDD<K, V> hadoopFile(
    String path, Class<? extends InputFormat<K, V>> inputFormatClass,
    Class<K> keyClass, Class<V> valueClass, int minPartitions)
  
  // Shared Variables
  public <T> Broadcast<T> broadcast(T value)
  public LongAccumulator longAccumulator()
  public LongAccumulator longAccumulator(String name)  
  public DoubleAccumulator doubleAccumulator()
  public DoubleAccumulator doubleAccumulator(String name)
  public <T> CollectionAccumulator<T> collectionAccumulator()
  public <T> CollectionAccumulator<T> collectionAccumulator(String name)
  
  // Properties
  public SparkContext sc()
  public int defaultParallelism()
  public int defaultMinPartitions()
  public SparkStatusTracker statusTracker()
  
  // Lifecycle
  public void stop()
  public void close()
}

JavaRDD

Java wrapper for RDD providing Java-friendly transformations and actions.

public class JavaRDD<T> {
  // Transformations
  public <R> JavaRDD<R> map(Function<T, R> f)
  public <U> JavaRDD<U> flatMap(FlatMapFunction<T, U> f)
  public JavaRDD<T> filter(Function<T, Boolean> f)
  public JavaRDD<T> distinct()
  public JavaRDD<T> distinct(int numPartitions)
  public JavaRDD<T> sample(boolean withReplacement, double fraction)
  public JavaRDD<T> sample(boolean withReplacement, double fraction, long seed)
  public JavaRDD<T> union(JavaRDD<T> other)
  public JavaRDD<T> intersection(JavaRDD<T> other)
  public <U> JavaPairRDD<T, U> cartesian(JavaRDD<U> other)
  public <K> JavaPairRDD<K, Iterable<T>> groupBy(Function<T, K> f)
  public JavaRDD<String> pipe(String command)
  public JavaRDD<T> coalesce(int numPartitions)
  public JavaRDD<T> coalesce(int numPartitions, boolean shuffle)
  public JavaRDD<T> repartition(int numPartitions)
  public <S> JavaRDD<T> sortBy(Function<T, S> f, boolean ascending, int numPartitions)
  public <K> JavaPairRDD<K, T> keyBy(Function<T, K> f)
  
  // Actions
  public List<T> collect()
  public long count()
  public T first()
  public List<T> take(int num)
  public List<T> top(int num)
  public List<T> takeOrdered(int num)
  public List<T> takeOrdered(int num, Comparator<T> comp)
  public List<T> takeSample(boolean withReplacement, int num)
  public List<T> takeSample(boolean withReplacement, int num, long seed)
  public T reduce(Function2<T, T, T> f)
  public T fold(T zeroValue, Function2<T, T, T> op)
  public <U> U aggregate(U zeroValue, Function2<U, T, U> seqOp, Function2<U, U, U> combOp)
  public <U> U treeAggregate(U zeroValue, Function2<U, T, U> seqOp, Function2<U, U, U> combOp)
  public <U> U treeAggregate(U zeroValue, Function2<U, T, U> seqOp, Function2<U, U, U> combOp, int depth)
  public void foreach(VoidFunction<T> f)
  public void foreachPartition(VoidFunction<Iterator<T>> f)
  
  // I/O Actions
  public void saveAsTextFile(String path)
  public void saveAsTextFile(String path, Class<? extends CompressionCodec> codec)
  public void saveAsObjectFile(String path)
  
  // Persistence
  public JavaRDD<T> persist(StorageLevel newLevel)
  public JavaRDD<T> cache()
  public JavaRDD<T> unpersist()
  public JavaRDD<T> unpersist(boolean blocking)
  public StorageLevel getStorageLevel()
  public void checkpoint()
  public boolean isCheckpointed()
  public Optional<String> getCheckpointFile()
  
  // Metadata
  public JavaSparkContext context()
  public int getNumPartitions()
  public int id()
  public String name()
  public JavaRDD<T> setName(String name)
  public String toDebugString()
}

JavaPairRDD

Java wrapper for pair RDDs providing key-value operations.

public class JavaPairRDD<K, V> {
  // Grouping Operations
  public JavaPairRDD<K, Iterable<V>> groupByKey()
  public JavaPairRDD<K, Iterable<V>> groupByKey(int numPartitions)
  public JavaPairRDD<K, Iterable<V>> groupByKey(Partitioner partitioner)
  
  // Reduction Operations
  public JavaPairRDD<K, V> reduceByKey(Function2<V, V, V> func)
  public JavaPairRDD<K, V> reduceByKey(Function2<V, V, V> func, int numPartitions)
  public JavaPairRDD<K, V> reduceByKey(Partitioner partitioner, Function2<V, V, V> func)
  public JavaPairRDD<K, V> foldByKey(V zeroValue, Function2<V, V, V> func)
  public JavaPairRDD<K, V> foldByKey(V zeroValue, int numPartitions, Function2<V, V, V> func)
  public JavaPairRDD<K, V> foldByKey(V zeroValue, Partitioner partitioner, Function2<V, V, V> func)
  
  // Aggregation Operations
  public <U> JavaPairRDD<K, U> aggregateByKey(U zeroValue, Function2<U, V, U> seqFunc, Function2<U, U, U> combFunc)
  public <U> JavaPairRDD<K, U> aggregateByKey(U zeroValue, int numPartitions, Function2<U, V, U> seqFunc, Function2<U, U, U> combFunc)
  public <U> JavaPairRDD<K, U> aggregateByKey(U zeroValue, Partitioner partitioner, Function2<U, V, U> seqFunc, Function2<U, U, U> combFunc)
  public <C> JavaPairRDD<K, C> combineByKey(Function<V, C> createCombiner, Function2<C, V, C> mergeValue, Function2<C, C, C> mergeCombiners)
  public <C> JavaPairRDD<K, C> combineByKey(Function<V, C> createCombiner, Function2<C, V, C> mergeValue, Function2<C, C, C> mergeCombiners, int numPartitions)
  public <C> JavaPairRDD<K, C> combineByKey(Function<V, C> createCombiner, Function2<C, V, C> mergeValue, Function2<C, C, C> mergeCombiners, Partitioner partitioner)
  
  // Partitioning
  public JavaPairRDD<K, V> partitionBy(Partitioner partitioner)
  
  // Join Operations
  public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other)
  public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other, int numPartitions)
  public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other, Partitioner partitioner)
  public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other)
  public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other, int numPartitions)
  public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other, Partitioner partitioner)
  public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other)
  public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other, int numPartitions)
  public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other, Partitioner partitioner)
  public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other)
  public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other, int numPartitions)
  public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other, Partitioner partitioner)
  public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other)
  public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other, int numPartitions)
  public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other, Partitioner partitioner)
  
  // Set Operations
  public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other)
  public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other, int numPartitions)
  public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other, Partitioner partitioner)
  
  // Lookups and Collection
  public List<V> lookup(K key)
  public Map<K, V> collectAsMap()
  public Map<K, Long> countByKey()
  public PartialResult<Map<K, BoundedDouble>> countByKeyApprox(long timeout)
  public PartialResult<Map<K, BoundedDouble>> countByKeyApprox(long timeout, double confidence)
  
  // Value Operations
  public <U> JavaPairRDD<K, U> mapValues(Function<V, U> f)
  public <U> JavaPairRDD<K, U> flatMapValues(Function<V, Iterable<U>> f)
  public JavaRDD<K> keys()
  public JavaRDD<V> values()
  
  // Sorting  
  public JavaPairRDD<K, V> sortByKey()
  public JavaPairRDD<K, V> sortByKey(boolean ascending)
  public JavaPairRDD<K, V> sortByKey(boolean ascending, int numPartitions)
  
  // Conversion
  public JavaRDD<Tuple2<K, V>> rdd()
  
  // I/O Operations
  public void saveAsHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends OutputFormat> outputFormatClass)
  public void saveAsHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends OutputFormat> outputFormatClass, Class<? extends CompressionCodec> codec)
  public void saveAsNewAPIHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends NewOutputFormat> outputFormatClass)
}

JavaDoubleRDD

Java wrapper for RDDs of Double values providing statistical operations.

public class JavaDoubleRDD {
  // Statistical Operations
  public double sum()
  public StatCounter stats()
  public double mean()
  public double variance()
  public double stdev()
  public double sampleStdev()
  public double sampleVariance()
  
  // Histogram Operations
  public long[] histogram(double[] buckets)
  public Tuple2<double[], long[]> histogram(int buckets)
  
  // Standard RDD Operations (inherited)
  public List<Double> collect()
  public long count()
  public Double first()
  public List<Double> take(int num)
  public Double reduce(Function2<Double, Double, Double> f)
  
  // Transformations
  public JavaDoubleRDD filter(Function<Double, Boolean> f)
  public JavaDoubleRDD map(DoubleFunction<Double> f)
  public JavaDoubleRDD cache()
  public JavaDoubleRDD persist(StorageLevel newLevel)
}

Usage Examples

Basic Java Usage

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.SparkConf;
import java.util.Arrays;
import java.util.List;

SparkConf conf = new SparkConf()
    .setAppName("Java Spark Example")
    .setMaster("local[*]");

JavaSparkContext sc = new JavaSparkContext(conf);

// Create RDD from collection
List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
JavaRDD<Integer> rdd = sc.parallelize(data);

// Transformations and actions
JavaRDD<Integer> squares = rdd.map(x -> x * x);
List<Integer> result = squares.collect();

sc.stop();

Key-Value Operations

import org.apache.spark.api.java.JavaPairRDD;
import scala.Tuple2;

List<Tuple2<String, Integer>> pairs = Arrays.asList(
    new Tuple2<>("a", 1),
    new Tuple2<>("b", 2),
    new Tuple2<>("a", 3)
);

JavaPairRDD<String, Integer> pairRDD = sc.parallelizePairs(pairs);

// Reduce by key
JavaPairRDD<String, Integer> sums = pairRDD.reduceByKey((a, b) -> a + b);

// Collect as map
Map<String, Integer> resultMap = sums.collectAsMap();

Lambda Expressions vs Function Objects

// Using lambda expressions (Java 8+)
JavaRDD<Integer> mapped = rdd.map(x -> x * 2);
JavaRDD<Integer> filtered = rdd.filter(x -> x > 10);

// Using Function objects (Java 7 compatibility)
import org.apache.spark.api.java.function.Function;

JavaRDD<Integer> mapped2 = rdd.map(new Function<Integer, Integer>() {
    public Integer call(Integer x) {
        return x * 2;
    }
});

Working with Text Files

// Read text file
JavaRDD<String> lines = sc.textFile("hdfs://path/to/file.txt");

// Word count example
JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
JavaPairRDD<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
JavaPairRDD<String, Integer> counts = pairs.reduceByKey((a, b) -> a + b);

// Save results
counts.saveAsTextFile("hdfs://path/to/output");