Apache Spark exposes its core functionality to Java through wrapper classes in the org.apache.spark.api.java package, which adapt the Scala API to Java-friendly types and collections.
Java-friendly wrapper for SparkContext providing the main entry point for Java applications.
public class JavaSparkContext {
// Constructors
public JavaSparkContext(SparkConf conf)
public JavaSparkContext(SparkContext sc)
public JavaSparkContext(String master, String appName)
public JavaSparkContext(String master, String appName, String sparkHome, String[] jars)
// RDD Creation
public <T> JavaRDD<T> parallelize(List<T> list)
public <T> JavaRDD<T> parallelize(List<T> list, int numSlices)
public JavaRDD<String> textFile(String path)
public JavaRDD<String> textFile(String path, int minPartitions)
public JavaPairRDD<String, String> wholeTextFiles(String path)
public JavaPairRDD<String, String> wholeTextFiles(String path, int minPartitions)
// Hadoop Integration
public <K, V, F extends NewInputFormat<K, V>> JavaPairRDD<K, V> newAPIHadoopFile(
String path, Class<F> fClass, Class<K> kClass, Class<V> vClass, Configuration conf)
public <K, V> JavaPairRDD<K, V> hadoopFile(
String path, Class<? extends InputFormat<K, V>> inputFormatClass,
Class<K> keyClass, Class<V> valueClass)
public <K, V> JavaPairRDD<K, V> hadoopFile(
String path, Class<? extends InputFormat<K, V>> inputFormatClass,
Class<K> keyClass, Class<V> valueClass, int minPartitions)
// Shared Variables
public <T> Broadcast<T> broadcast(T value)
public LongAccumulator longAccumulator()
public LongAccumulator longAccumulator(String name)
public DoubleAccumulator doubleAccumulator()
public DoubleAccumulator doubleAccumulator(String name)
public <T> CollectionAccumulator<T> collectionAccumulator()
public <T> CollectionAccumulator<T> collectionAccumulator(String name)
// Properties
public SparkContext sc()
public int defaultParallelism()
public int defaultMinPartitions()
public SparkStatusTracker statusTracker()
// Lifecycle
public void stop()
public void close()
}
Java wrapper for RDD providing Java-friendly transformations and actions.
public class JavaRDD<T> {
// Transformations
public <R> JavaRDD<R> map(Function<T, R> f)
public <U> JavaRDD<U> flatMap(FlatMapFunction<T, U> f)
public JavaRDD<T> filter(Function<T, Boolean> f)
public JavaRDD<T> distinct()
public JavaRDD<T> distinct(int numPartitions)
public JavaRDD<T> sample(boolean withReplacement, double fraction)
public JavaRDD<T> sample(boolean withReplacement, double fraction, long seed)
public JavaRDD<T> union(JavaRDD<T> other)
public JavaRDD<T> intersection(JavaRDD<T> other)
public <U> JavaPairRDD<T, U> cartesian(JavaRDD<U> other)
public <K> JavaPairRDD<K, Iterable<T>> groupBy(Function<T, K> f)
public JavaRDD<String> pipe(String command)
public JavaRDD<T> coalesce(int numPartitions)
public JavaRDD<T> coalesce(int numPartitions, boolean shuffle)
public JavaRDD<T> repartition(int numPartitions)
public <S> JavaRDD<T> sortBy(Function<T, S> f, boolean ascending, int numPartitions)
public <K> JavaPairRDD<K, T> keyBy(Function<T, K> f)
// Actions
public List<T> collect()
public long count()
public T first()
public List<T> take(int num)
public List<T> top(int num)
public List<T> takeOrdered(int num)
public List<T> takeOrdered(int num, Comparator<T> comp)
public List<T> takeSample(boolean withReplacement, int num)
public List<T> takeSample(boolean withReplacement, int num, long seed)
public T reduce(Function2<T, T, T> f)
public T fold(T zeroValue, Function2<T, T, T> op)
public <U> U aggregate(U zeroValue, Function2<U, T, U> seqOp, Function2<U, U, U> combOp)
public <U> U treeAggregate(U zeroValue, Function2<U, T, U> seqOp, Function2<U, U, U> combOp)
public <U> U treeAggregate(U zeroValue, Function2<U, T, U> seqOp, Function2<U, U, U> combOp, int depth)
public void foreach(VoidFunction<T> f)
public void foreachPartition(VoidFunction<Iterator<T>> f)
// I/O Actions
public void saveAsTextFile(String path)
public void saveAsTextFile(String path, Class<? extends CompressionCodec> codec)
public void saveAsObjectFile(String path)
// Persistence
public JavaRDD<T> persist(StorageLevel newLevel)
public JavaRDD<T> cache()
public JavaRDD<T> unpersist()
public JavaRDD<T> unpersist(boolean blocking)
public StorageLevel getStorageLevel()
public void checkpoint()
public boolean isCheckpointed()
public Optional<String> getCheckpointFile()
// Metadata
public SparkContext context()
public int getNumPartitions()
public int id()
public String name()
public JavaRDD<T> setName(String name)
public String toDebugString()
}
Java wrapper for pair RDDs providing key-value operations.
public class JavaPairRDD<K, V> {
// Grouping Operations
public JavaPairRDD<K, Iterable<V>> groupByKey()
public JavaPairRDD<K, Iterable<V>> groupByKey(int numPartitions)
public JavaPairRDD<K, Iterable<V>> groupByKey(Partitioner partitioner)
// Reduction Operations
public JavaPairRDD<K, V> reduceByKey(Function2<V, V, V> func)
public JavaPairRDD<K, V> reduceByKey(Function2<V, V, V> func, int numPartitions)
public JavaPairRDD<K, V> reduceByKey(Partitioner partitioner, Function2<V, V, V> func)
public JavaPairRDD<K, V> foldByKey(V zeroValue, Function2<V, V, V> func)
public JavaPairRDD<K, V> foldByKey(V zeroValue, int numPartitions, Function2<V, V, V> func)
public JavaPairRDD<K, V> foldByKey(V zeroValue, Partitioner partitioner, Function2<V, V, V> func)
// Aggregation Operations
public <U> JavaPairRDD<K, U> aggregateByKey(U zeroValue, Function2<U, V, U> seqFunc, Function2<U, U, U> combFunc)
public <U> JavaPairRDD<K, U> aggregateByKey(U zeroValue, int numPartitions, Function2<U, V, U> seqFunc, Function2<U, U, U> combFunc)
public <U> JavaPairRDD<K, U> aggregateByKey(U zeroValue, Partitioner partitioner, Function2<U, V, U> seqFunc, Function2<U, U, U> combFunc)
public <C> JavaPairRDD<K, C> combineByKey(Function<V, C> createCombiner, Function2<C, V, C> mergeValue, Function2<C, C, C> mergeCombiners)
public <C> JavaPairRDD<K, C> combineByKey(Function<V, C> createCombiner, Function2<C, V, C> mergeValue, Function2<C, C, C> mergeCombiners, int numPartitions)
public <C> JavaPairRDD<K, C> combineByKey(Function<V, C> createCombiner, Function2<C, V, C> mergeValue, Function2<C, C, C> mergeCombiners, Partitioner partitioner)
// Partitioning
public JavaPairRDD<K, V> partitionBy(Partitioner partitioner)
// Join Operations
public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other)
public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other, int numPartitions)
public <W> JavaPairRDD<K, Tuple2<V, W>> join(JavaPairRDD<K, W> other, Partitioner partitioner)
public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other)
public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other, int numPartitions)
public <W> JavaPairRDD<K, Tuple2<V, Optional<W>>> leftOuterJoin(JavaPairRDD<K, W> other, Partitioner partitioner)
public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other)
public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other, int numPartitions)
public <W> JavaPairRDD<K, Tuple2<Optional<V>, W>> rightOuterJoin(JavaPairRDD<K, W> other, Partitioner partitioner)
public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other)
public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other, int numPartitions)
public <W> JavaPairRDD<K, Tuple2<Optional<V>, Optional<W>>> fullOuterJoin(JavaPairRDD<K, W> other, Partitioner partitioner)
public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other)
public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other, int numPartitions)
public <W> JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<W>>> cogroup(JavaPairRDD<K, W> other, Partitioner partitioner)
// Set Operations
public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other)
public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other, int numPartitions)
public <W> JavaPairRDD<K, V> subtractByKey(JavaPairRDD<K, W> other, Partitioner partitioner)
// Lookups and Collection
public List<V> lookup(K key)
public Map<K, V> collectAsMap()
public Map<K, Long> countByKey()
public PartialResult<Map<K, BoundedDouble>> countByKeyApprox(long timeout)
public PartialResult<Map<K, BoundedDouble>> countByKeyApprox(long timeout, double confidence)
// Value Operations
public <U> JavaPairRDD<K, U> mapValues(Function<V, U> f)
public <U> JavaPairRDD<K, U> flatMapValues(Function<V, Iterable<U>> f)
public JavaRDD<K> keys()
public JavaRDD<V> values()
// Sorting
public JavaPairRDD<K, V> sortByKey()
public JavaPairRDD<K, V> sortByKey(boolean ascending)
public JavaPairRDD<K, V> sortByKey(boolean ascending, int numPartitions)
// Conversion
public RDD<Tuple2<K, V>> rdd()
// I/O Operations
public void saveAsHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends OutputFormat> outputFormatClass)
public void saveAsHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends OutputFormat> outputFormatClass, Class<? extends CompressionCodec> codec)
public void saveAsNewAPIHadoopFile(String path, Class<?> keyClass, Class<?> valueClass, Class<? extends NewOutputFormat> outputFormatClass)
}
Java wrapper for RDDs of Double values providing statistical operations.
public class JavaDoubleRDD {
// Statistical Operations
public double sum()
public StatCounter stats()
public double mean()
public double variance()
public double stdev()
public double sampleStdev()
public double sampleVariance()
// Histogram Operations
public long[] histogram(double[] buckets)
public Tuple2<double[], long[]> histogram(int buckets)
// Standard RDD Operations (inherited)
public List<Double> collect()
public long count()
public Double first()
public List<Double> take(int num)
public Double reduce(Function2<Double, Double, Double> f)
// Transformations
public JavaDoubleRDD filter(Function<Double, Boolean> f)
public JavaDoubleRDD mapToDouble(DoubleFunction<Double> f)
public JavaDoubleRDD cache()
public JavaDoubleRDD persist(StorageLevel newLevel)
}
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.SparkConf;
import java.util.Arrays;
import java.util.List;
SparkConf conf = new SparkConf()
.setAppName("Java Spark Example")
.setMaster("local[*]");
JavaSparkContext sc = new JavaSparkContext(conf);
// Create RDD from collection
List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
JavaRDD<Integer> rdd = sc.parallelize(data);
// Transformations and actions
JavaRDD<Integer> squares = rdd.map(x -> x * x);
List<Integer> result = squares.collect();
sc.stop();
Key-value data uses JavaPairRDD together with Scala's Tuple2:
import org.apache.spark.api.java.JavaPairRDD;
import scala.Tuple2;
import java.util.Map;
List<Tuple2<String, Integer>> pairs = Arrays.asList(
new Tuple2<>("a", 1),
new Tuple2<>("b", 2),
new Tuple2<>("a", 3)
);
JavaPairRDD<String, Integer> pairRDD = sc.parallelizePairs(pairs);
// Reduce by key
JavaPairRDD<String, Integer> sums = pairRDD.reduceByKey((a, b) -> a + b);
// Collect as map
Map<String, Integer> resultMap = sums.collectAsMap();
Functions can be passed either as Java 8 lambdas or as anonymous Function objects:
// Using lambda expressions (Java 8+)
JavaRDD<Integer> mapped = rdd.map(x -> x * 2);
JavaRDD<Integer> filtered = rdd.filter(x -> x > 10);
// Using Function objects (Java 7 compatibility)
import org.apache.spark.api.java.function.Function;
JavaRDD<Integer> mapped2 = rdd.map(new Function<Integer, Integer>() {
public Integer call(Integer x) {
return x * 2;
}
});
A complete word count over a text file:
// Read text file
JavaRDD<String> lines = sc.textFile("hdfs://path/to/file.txt");
// Word count example
JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
JavaPairRDD<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
JavaPairRDD<String, Integer> counts = pairs.reduceByKey((a, b) -> a + b);
// Save results
counts.saveAsTextFile("hdfs://path/to/output");