CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/maven-org-apache-flink--flink-java

Apache Flink Java API - Core Java APIs for Apache Flink's batch and stream processing framework

Pending
Overview
Eval results
Files

join-cogroup-operations.mddocs/

Join and CoGroup Operations

Advanced operations for combining multiple DataSets using various join strategies, coGroup operations, and cross products. These operations enable complex data processing workflows involving multiple data sources.

Capabilities

Join Operations

Join two DataSets based on key equality with support for different join types.

/**
 * Join with another DataSet
 * @param other the other DataSet to join with
 * @return JoinOperatorSets for key specification and join configuration
 */
public <R> JoinOperatorSets<T, R> join(DataSet<R> other);

Usage Examples:

// Inner join on key fields
DataSet<Tuple2<Long, String>> users = env.fromElements(
    new Tuple2<>(1L, "Alice"), new Tuple2<>(2L, "Bob"));
DataSet<Tuple2<Long, String>> orders = env.fromElements(
    new Tuple2<>(1L, "Order1"), new Tuple2<>(1L, "Order2"), new Tuple2<>(2L, "Order3"));

// Join users and orders on user ID
DataSet<Tuple2<Tuple2<Long, String>, Tuple2<Long, String>>> joined = users
    .join(orders)
    .where(0)  // user ID field in users
    .equalTo(0) // user ID field in orders
    .types(Tuple2.class, Tuple2.class);

// Custom join function
DataSet<String> customJoin = users
    .join(orders)
    .where(0)
    .equalTo(0)
    .with(new JoinFunction<Tuple2<Long, String>, Tuple2<Long, String>, String>() {
        @Override
        public String join(Tuple2<Long, String> user, Tuple2<Long, String> order) {
            return user.f1 + " has " + order.f1;
        }
    });

Join Types

Different types of joins supported by the join operation.

/**
 * Enum defining join types
 */
public enum JoinType {
    INNER,        // Inner join (default)
    LEFT_OUTER,   // Left outer join
    RIGHT_OUTER,  // Right outer join  
    FULL_OUTER    // Full outer join
}

/**
 * Specify the join type
 * @param joinType the type of join to perform
 * @return configured join operator
 */
public JoinOperator<T1, T2, R> with(JoinType joinType);

Join Function Interface

Custom join logic for combining elements from both DataSets.

/**
 * Interface for join functions
 * @param <IN1> type of elements from first DataSet
 * @param <IN2> type of elements from second DataSet  
 * @param <OUT> type of result elements
 */
public interface JoinFunction<IN1, IN2, OUT> extends Function, Serializable {
    /**
     * Join function that combines two elements
     * @param first element from first DataSet
     * @param second element from second DataSet
     * @return combined result element
     */
    OUT join(IN1 first, IN2 second) throws Exception;
}

/**
 * Rich version with access to runtime context
 */
public abstract class RichJoinFunction<IN1, IN2, OUT> 
    extends AbstractRichFunction implements JoinFunction<IN1, IN2, OUT> {
}

CoGroup Operations

Group elements from two DataSets by key and process groups together.

/**
 * CoGroup with another DataSet
 * @param other the other DataSet to coGroup with
 * @return CoGroupOperatorSets for key specification and coGroup configuration
 */
public <R> CoGroupOperator.CoGroupOperatorSets<T, R> coGroup(DataSet<R> other);

Usage Examples:

// CoGroup users and orders
DataSet<String> coGroupResult = users
    .coGroup(orders)
    .where(0)
    .equalTo(0)
    .with(new CoGroupFunction<Tuple2<Long, String>, Tuple2<Long, String>, String>() {
        @Override
        public void coGroup(
            Iterable<Tuple2<Long, String>> users,
            Iterable<Tuple2<Long, String>> orders,
            Collector<String> out) {
            
            Iterator<Tuple2<Long, String>> userIter = users.iterator();
            if (userIter.hasNext()) {
                Tuple2<Long, String> user = userIter.next();
                int orderCount = 0;
                for (Tuple2<Long, String> order : orders) {
                    orderCount++;
                }
                out.collect(user.f1 + " has " + orderCount + " orders");
            }
        }
    });

CoGroup Function Interface

Custom coGroup logic for processing groups from both DataSets.

/**
 * Interface for coGroup functions
 * @param <IN1> type of elements from first DataSet
 * @param <IN2> type of elements from second DataSet
 * @param <OUT> type of result elements
 */
public interface CoGroupFunction<IN1, IN2, OUT> extends Function, Serializable {
    /**
     * CoGroup function that processes groups from both sides
     * @param first iterable of elements from first DataSet
     * @param second iterable of elements from second DataSet
     * @param out collector for result elements
     */
    void coGroup(Iterable<IN1> first, Iterable<IN2> second, Collector<OUT> out) throws Exception;
}

/**
 * Rich version with access to runtime context
 */
public abstract class RichCoGroupFunction<IN1, IN2, OUT>
    extends AbstractRichFunction implements CoGroupFunction<IN1, IN2, OUT> {
}

Cross Operations

Compute the cross product (Cartesian product) of two DataSets.

/**
 * Cross product with another DataSet
 * @param other the other DataSet to cross with
 * @return CrossOperator for cross product configuration
 */
public <R> CrossOperator.DefaultCross<T, R> cross(DataSet<R> other);

Usage Examples:

// Simple cross product
DataSet<String> colors = env.fromElements("red", "blue");
DataSet<String> sizes = env.fromElements("small", "large");

DataSet<Tuple2<String, String>> crossed = colors
    .cross(sizes)
    .types(String.class, String.class);

// Cross with custom function
DataSet<String> customCross = colors
    .cross(sizes)
    .with(new CrossFunction<String, String, String>() {
        @Override
        public String cross(String color, String size) {
            return size + " " + color + " item";
        }
    });

Cross Function Interface

Custom cross logic for combining every element from first DataSet with every element from second DataSet.

/**
 * Interface for cross functions
 * @param <IN1> type of elements from first DataSet
 * @param <IN2> type of elements from second DataSet
 * @param <OUT> type of result elements
 */
public interface CrossFunction<IN1, IN2, OUT> extends Function, Serializable {
    /**
     * Cross function that combines elements from both DataSets
     * @param first element from first DataSet
     * @param second element from second DataSet
     * @return combined result element
     */
    OUT cross(IN1 first, IN2 second) throws Exception;
}

/**
 * Rich version with access to runtime context
 */
public abstract class RichCrossFunction<IN1, IN2, OUT>
    extends AbstractRichFunction implements CrossFunction<IN1, IN2, OUT> {
}

Union Operations

Combine two DataSets of the same type into a single DataSet.

/**
 * Union with another DataSet of the same type
 * @param other the other DataSet to union with
 * @return UnionOperator containing elements from both DataSets
 */
public UnionOperator<T> union(DataSet<T> other);

Usage Examples:

// Union two DataSets
DataSet<String> dataset1 = env.fromElements("a", "b", "c");
DataSet<String> dataset2 = env.fromElements("d", "e", "f");

DataSet<String> combined = dataset1.union(dataset2);
// Result contains: a, b, c, d, e, f

// Union multiple DataSets
DataSet<String> dataset3 = env.fromElements("g", "h");
DataSet<String> allCombined = dataset1.union(dataset2).union(dataset3);

Key Specification

Methods for specifying keys for join and coGroup operations.

/**
 * Specify key fields by position (for Tuple types)
 * @param fields the field positions to use as keys
 * @return key specification for further configuration
 */
public JoinOperatorSets.JoinOperatorSetsPredicate where(int... fields);

/**
 * Specify key fields by name (for POJO types)
 * @param fields the field names to use as keys
 * @return key specification for further configuration
 */
public JoinOperatorSets.JoinOperatorSetsPredicate where(String... fields);

/**
 * Specify key using key selector function
 * @param keyExtractor function to extract the key
 * @return key specification for further configuration
 */
public <K> JoinOperatorSets.JoinOperatorSetsPredicateWithKeySelector<K> where(KeySelector<T, K> keyExtractor);

/**
 * Specify the matching key fields in the other DataSet
 * @param fields the field positions in the other DataSet
 * @return configured join operator
 */
public JoinOperator.EquiJoin<T1, T2> equalTo(int... fields);

Join Hints

Performance hints for join execution strategy.

/**
 * Enum for join strategy hints
 */
public enum JoinHint {
    OPTIMIZER_CHOOSES,    // Let optimizer choose strategy
    BROADCAST_HASH_FIRST, // Broadcast first DataSet and use hash join
    BROADCAST_HASH_SECOND,// Broadcast second DataSet and use hash join
    REPARTITION_HASH_FIRST, // Repartition both, use first as build side
    REPARTITION_HASH_SECOND, // Repartition both, use second as build side
    REPARTITION_SORT_MERGE   // Repartition and sort-merge join
}

/**
 * Provide a hint for join execution strategy
 * @param hint the execution strategy hint
 * @return configured join operator
 */
public JoinOperator<T1, T2, R> strategy(JoinHint hint);

Types

import org.apache.flink.api.java.operators.JoinOperator;
import org.apache.flink.api.java.operators.CoGroupOperator;
import org.apache.flink.api.java.operators.CrossOperator;
import org.apache.flink.api.java.operators.UnionOperator;
import org.apache.flink.api.java.operators.join.JoinType;
import org.apache.flink.api.java.operators.join.JoinOperatorSets;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.functions.CrossFunction;
import org.apache.flink.api.common.functions.RichJoinFunction;
import org.apache.flink.api.common.functions.RichCoGroupFunction;
import org.apache.flink.api.common.functions.RichCrossFunction;
import org.apache.flink.util.Collector;

Install with Tessl CLI

npx tessl i tessl/maven-org-apache-flink--flink-java

docs

aggregation-grouping.md

data-input-output.md

dataset-operations.md

execution-environments.md

index.md

iteration-operations.md

join-cogroup-operations.md

utility-functions.md

tile.json