Apache Flink Java API - Core Java APIs for Apache Flink's batch and stream processing framework
—
DataSet is the primary data abstraction in Flink's batch processing API, representing a distributed collection of elements of the same type. It provides comprehensive transformation operations for data processing pipelines.
The abstract base class representing a distributed dataset with type information.
/**
* Abstract class representing a distributed dataset of elements of type T
* @param <T> the type of elements in the dataset
*/
public abstract class DataSet<T> {
// Core transformation and output methods
}

Apply a function to each element, producing a new DataSet with potentially different type.
/**
* Apply a map function to each element
* @param mapper the map function to apply
* @return MapOperator for further configuration
*/
public <R> MapOperator<T, R> map(MapFunction<T, R> mapper);
/**
* Apply a map function to each partition
* @param mapper the map function to apply to each partition
* @return MapPartitionOperator for further configuration
*/
public <R> MapPartitionOperator<T, R> mapPartition(MapPartitionFunction<T, R> mapper);

Usage Examples:
// Simple map transformation
DataSet<String> words = env.fromElements("hello", "world");
DataSet<Integer> lengths = words.map(new MapFunction<String, Integer>() {
@Override
public Integer map(String value) {
return value.length();
}
});
// Using lambda expression (Java 8+)
DataSet<String> uppercase = words.map(String::toUpperCase);
// Map partition for batch processing
DataSet<String> processed = words.mapPartition(new MapPartitionFunction<String, String>() {
@Override
public void mapPartition(Iterable<String> values, Collector<String> out) {
for (String value : values) {
out.collect(value.trim().toLowerCase());
}
}
});

Apply a function that can produce zero, one, or multiple output elements for each input element.
/**
* Apply a flatMap function to each element
* @param flatter the flatMap function to apply
* @return FlatMapOperator for further configuration
*/
public <R> FlatMapOperator<T, R> flatMap(FlatMapFunction<T, R> flatter);

Usage Examples:
// Split sentences into words
DataSet<String> sentences = env.fromElements("hello world", "flink java");
DataSet<String> words = sentences.flatMap(new FlatMapFunction<String, String>() {
@Override
public void flatMap(String sentence, Collector<String> out) {
for (String word : sentence.split(" ")) {
out.collect(word);
}
}
});

Filter elements based on a predicate function.
/**
* Filter elements based on a predicate
* @param filter the filter predicate
* @return FilterOperator for further configuration
*/
public FilterOperator<T> filter(FilterFunction<T> filter);

Usage Examples:
// Filter words longer than 4 characters
DataSet<String> words = env.fromElements("hello", "hi", "world", "a");
DataSet<String> longWords = words.filter(new FilterFunction<String>() {
@Override
public boolean filter(String value) {
return value.length() > 4;
}
});
// Using lambda expression
DataSet<String> filtered = words.filter(word -> word.startsWith("h"));

Combine elements using an associative and commutative function.
/**
* Reduce the DataSet using a reduce function
* @param reducer the reduce function
* @return ReduceOperator for further configuration
*/
public ReduceOperator<T> reduce(ReduceFunction<T> reducer);
/**
* Apply a reduce function to grouped elements
* @param reducer the group reduce function
* @return GroupReduceOperator for further configuration
*/
public <R> GroupReduceOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer);

Remove duplicate elements from the DataSet.
/**
* Remove duplicate elements
* @return DistinctOperator for further configuration
*/
public DistinctOperator<T> distinct();
/**
* Remove duplicates based on key fields
* @param fields the field positions for duplicate detection
* @return DistinctOperator for further configuration
*/
public DistinctOperator<T> distinct(int... fields);
/**
* Remove duplicates based on key selector
* @param keyExtractor function to extract the key for comparison
* @return DistinctOperator for further configuration
*/
public <K> DistinctOperator<T> distinct(KeySelector<T, K> keyExtractor);

Combine multiple DataSets using set operations.
/**
* Union with another DataSet
* @param other the other DataSet to union with
* @return UnionOperator containing elements from both DataSets
*/
public UnionOperator<T> union(DataSet<T> other);

Group elements by key fields or key selector functions.
/**
* Group by field positions (for Tuple types)
* @param fields the field positions to group by
* @return UnsortedGrouping for further operations
*/
public UnsortedGrouping<T> groupBy(int... fields);
/**
* Group by field names (for POJO types)
* @param fields the field names to group by
* @return UnsortedGrouping for further operations
*/
public UnsortedGrouping<T> groupBy(String... fields);
/**
* Group by key selector function
* @param keyExtractor function to extract the grouping key
* @return UnsortedGrouping for further operations
*/
public <K> UnsortedGrouping<T> groupBy(KeySelector<T, K> keyExtractor);

Control how data is distributed across parallel instances.
/**
* Partition by hash of specified fields
* @param fields the fields to use for hash partitioning
* @return PartitionOperator for further configuration
*/
public PartitionOperator<T> partitionByHash(int... fields);
/**
* Partition by range of specified fields
* @param fields the fields to use for range partitioning
* @return PartitionOperator for further configuration
*/
public PartitionOperator<T> partitionByRange(int... fields);
/**
* Sort partition by specified field and order
* @param field the field to sort by
* @param order the sort order (ASCENDING or DESCENDING)
* @return SortPartitionOperator for further configuration
*/
public SortPartitionOperator<T> sortPartition(int field, Order order);

Select specific fields from tuple or POJO types.
/**
* Project specified fields (for Tuple types)
* @param fieldIndexes the field indexes to project
* @return ProjectOperator with projected fields
*/
public ProjectOperator<T, ?> project(int... fieldIndexes);

Collect data back to the driver program or output to external systems.
/**
* Collect all elements to the driver program
* @return List containing all elements
* @throws Exception if collection fails
*/
public List<T> collect() throws Exception;
/**
* Get the first n elements
* @param n number of elements to retrieve
* @return DataSet containing first n elements
*/
public DataSet<T> first(int n);
/**
* Count the number of elements
* @return the number of elements in the DataSet
* @throws Exception if counting fails
*/
public long count() throws Exception;
/**
* Sample elements from the DataSet
* @param withReplacement whether to sample with replacement
* @param fraction the fraction of elements to sample
* @return DataSet with sampled elements
*/
public DataSet<T> sample(boolean withReplacement, double fraction);

Write DataSet content to external systems or print for debugging.
/**
* Write as text file
* @param filePath path to write the file
* @return DataSink for execution
*/
public DataSink<T> writeAsText(String filePath);
/**
* Write as text file with write mode
* @param filePath path to write the file
* @param writeMode mode for writing (OVERWRITE or NO_OVERWRITE)
* @return DataSink for execution
*/
public DataSink<T> writeAsText(String filePath, WriteMode writeMode);
/**
* Write as CSV file
* @param filePath path to write the CSV file
* @return DataSink for execution
*/
public DataSink<T> writeAsCsv(String filePath);
/**
* Print to standard output
* @return DataSink for execution
*/
public DataSink<T> print();
/**
* Print to standard error
* @return DataSink for execution
*/
public DataSink<T> printToErr();

Access type information and execution environment.
/**
* Get the type information for this DataSet
* @return TypeInformation describing the element type
*/
public TypeInformation<T> getType();
/**
* Get the execution environment
* @return ExecutionEnvironment that created this DataSet
*/
public ExecutionEnvironment getExecutionEnvironment();

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.operators.*;
import org.apache.flink.api.common.functions.*;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.util.Collector;
import org.apache.flink.core.fs.FileSystem.WriteMode;

Install with Tessl CLI
npx tessl i tessl/maven-org-apache-flink--flink-java