Apache Flink Java API - Core Java APIs for Apache Flink's batch and stream processing framework
npx @tessl/cli install tessl/maven-org-apache-flink--flink-java@1.20.0

Apache Flink Java API provides core Java classes and interfaces for developing batch processing applications using DataSet transformations. This module offers comprehensive APIs for data transformation, aggregation, I/O operations, and execution environments within the Apache Flink ecosystem.
⚠️ Deprecation Notice: The entire DataSet API has been deprecated since Flink 1.18 and will be removed in a future major version. Users are encouraged to migrate to the DataStream API or Table API for new applications.
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>1.20.2</version>
</dependency>

// Core execution environment
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.DataSet;
// Common transformation functions
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.ReduceFunction;

import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.common.functions.MapFunction;
/**
 * Minimal Flink DataSet (batch) example: builds a DataSet from an in-memory
 * collection, applies a map and a filter transformation, and prints the result.
 *
 * NOTE: the DataSet API has been deprecated since Flink 1.18; prefer the
 * DataStream API or Table API for new applications.
 */
public class FlinkBatchExample {
    public static void main(String[] args) throws Exception {
        // Create the execution environment (auto-detects local vs. cluster).
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // Create a DataSet from an in-memory collection of elements.
        DataSet<String> input = env.fromElements("hello", "world", "flink", "java");

        // Apply transformations: upper-case every element, then keep only
        // the strings longer than four characters.
        DataSet<String> result = input
            .map(new MapFunction<String, String>() {
                @Override
                public String map(String value) {
                    return value.toUpperCase();
                }
            })
            .filter(value -> value.length() > 4);

        // print() is an eager sink: it triggers execution of the program by
        // itself. Do NOT call env.execute() afterwards — with no new sinks
        // defined since the last execution, env.execute() would throw
        // "No new data sinks have been defined since the last execution".
        result.print();
    }
}

The Flink Java API is built around several key architectural components:
Core execution contexts for running Flink batch programs, providing methods to create data sources, configure execution parameters, and trigger program execution.
// Get execution environment (auto-detects local vs remote)
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// Create local execution environment
LocalEnvironment localEnv = ExecutionEnvironment.createLocalEnvironment();
// Create remote execution environment
RemoteEnvironment remoteEnv = ExecutionEnvironment.createRemoteEnvironment(
    String host, int port, String... jarFiles);

The primary data abstraction for batch processing, providing a comprehensive set of transformation operations for distributed data processing.
// Core transformation methods
<R> MapOperator<T, R> map(MapFunction<T, R> mapper);
<R> FlatMapOperator<T, R> flatMap(FlatMapFunction<T, R> flatMapper);
FilterOperator<T> filter(FilterFunction<T> filter);
ReduceOperator<T> reduce(ReduceFunction<T> reducer);
DistinctOperator<T> distinct();
// Grouping and aggregation
UnsortedGrouping<T> groupBy(int... fields);
UnsortedGrouping<T> groupBy(String... fields);
<K> UnsortedGrouping<T> groupBy(KeySelector<T, K> keyExtractor);

Advanced operations for combining multiple DataSets using various join strategies and coGroup operations.
// Join operations
<R> JoinOperatorSets<T, R> join(DataSet<R> other);
<R> CoGroupOperator.CoGroupOperatorSets<T, R> coGroup(DataSet<R> other);
<R> CrossOperator.DefaultCross<T, R> cross(DataSet<R> other);
// Union operations
UnionOperator<T> union(DataSet<T> other);

Comprehensive I/O capabilities for reading from and writing to various data formats and storage systems.
// Data source creation
<T> DataSet<T> fromCollection(Collection<T> data);
<T> DataSet<T> fromElements(T... data);
DataSet<String> readTextFile(String filePath);
CsvReader readCsvFile(String filePath);
// Data output operations
DataSink<T> writeAsText(String filePath);
DataSink<T> writeAsCsv(String filePath);
DataSink<T> print();

Built-in aggregation functions and grouping operations for statistical computations and data summarization.
// Grouping operations
UnsortedGrouping<T> groupBy(int... fields);
SortedGrouping<T> sortGroup(int field, Order order);
// Aggregation operations
AggregateOperator<T> sum(int field);
AggregateOperator<T> min(int field);
AggregateOperator<T> max(int field);
AggregateOperator<T> aggregate(Aggregations agg, int field);

Support for iterative algorithms including bulk iterations and delta iterations for machine learning and graph processing algorithms.
// Bulk iteration
IterativeDataSet<T> iterate(int maxIterations);
DataSet<T> closeWith(DataSet<T> iterationResult);
// Delta iteration
DeltaIteration<T, ?> iterateDelta(DataSet<?> workset, int maxIterations, int... keyFields);

Helper utilities for common operations, parameter handling, and data set manipulation.
// Parameter handling
ParameterTool fromArgs(String[] args);
ParameterTool fromPropertiesFile(String path);
// DataSet utilities
DataSet<Tuple2<Long, T>> zipWithIndex(DataSet<T> input);
DataSet<T> sample(DataSet<T> input, boolean withReplacement, double fraction);