or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

api-completeness.md
checkpointing.md
data-generation.md
execution.md
fault-tolerance.md
index.md
streaming.md
tile.json

docs/data-generation.md

Data Generation

Comprehensive test data generation utilities providing consistent, reusable datasets for Apache Flink API testing. These utilities generate standard datasets that are widely used across Flink's test suite.

Core Data Generation Classes

CollectionDataSets

Primary utility for generating Java API test datasets with various data types and sizes.

public class CollectionDataSets {
    // Basic tuple datasets
    public static DataSet<Tuple3<Integer, Long, String>> get3TupleDataSet(ExecutionEnvironment env);
    public static DataSet<Tuple3<Integer, Long, String>> getSmall3TupleDataSet(ExecutionEnvironment env);
    public static DataSet<Tuple5<Integer, Long, Integer, String, Long>> get5TupleDataSet(ExecutionEnvironment env);
    public static DataSet<Tuple5<Integer, Long, Integer, String, Long>> getSmall5TupleDataSet(ExecutionEnvironment env);
    
    // Nested tuple datasets
    public static DataSet<Tuple2<Tuple2<Integer, Integer>, String>> getSmallNestedTupleDataSet(ExecutionEnvironment env);
    public static DataSet<Tuple2<Tuple2<Integer, Integer>, String>> getGroupSortedNestedTupleDataSet(ExecutionEnvironment env);
    public static DataSet<Tuple3<Tuple2<Integer, Integer>, String, Integer>> getGroupSortedNestedTupleDataSet2(ExecutionEnvironment env);
    public static DataSet<Tuple2<byte[], Integer>> getTuple2WithByteArrayDataSet(ExecutionEnvironment env);
    
    // Complex tuple datasets
    public static DataSet<Tuple7<Integer, String, Integer, Integer, Long, String, Long>> getSmallTuplebasedDataSet(ExecutionEnvironment env);
    public static DataSet<Tuple7<Long, Integer, Integer, Long, String, Integer, String>> getSmallTuplebasedDataSetMatchingPojo(ExecutionEnvironment env);
    
    // Primitive type datasets
    public static DataSet<String> getStringDataSet(ExecutionEnvironment env);
    public static DataSet<Integer> getIntegerDataSet(ExecutionEnvironment env);
    
    // Basic custom type datasets
    public static DataSet<CustomType> getCustomTypeDataSet(ExecutionEnvironment env);
    public static DataSet<CustomType> getSmallCustomTypeDataSet(ExecutionEnvironment env);
    
    // POJO datasets
    public static DataSet<POJO> getSmallPojoDataSet(ExecutionEnvironment env);
    public static DataSet<POJO> getDuplicatePojoDataSet(ExecutionEnvironment env);
    public static DataSet<POJO> getMixedPojoDataSet(ExecutionEnvironment env);
    
    // Complex nested datasets
    public static DataSet<CrazyNested> getCrazyNestedDataSet(ExecutionEnvironment env);
    public static DataSet<FromTupleWithCTor> getPojoExtendingFromTuple(ExecutionEnvironment env);
    public static DataSet<PojoContainingTupleAndWritable> getPojoContainingTupleAndWritable(ExecutionEnvironment env);
    public static DataSet<PojoContainingTupleAndWritable> getGroupSortedPojoContainingTupleAndWritable(ExecutionEnvironment env);
    public static DataSet<Tuple3<Integer, CrazyNested, POJO>> getTupleContainingPojos(ExecutionEnvironment env);
    
    // Advanced POJO datasets
    public static DataSet<PojoWithMultiplePojos> getPojoWithMultiplePojos(ExecutionEnvironment env);
    public static DataSet<PojoWithDateAndEnum> getPojoWithDateAndEnum(ExecutionEnvironment env);
    public static DataSet<PojoWithCollection> getPojoWithCollection(ExecutionEnvironment env);
}

ValueCollectionDataSets

Utility for generating datasets using Flink Value types for serialization and performance testing.

public class ValueCollectionDataSets {
    public static DataSet<Tuple3<IntValue, LongValue, StringValue>> get3TupleDataSet(ExecutionEnvironment env);
    public static DataSet<Tuple3<IntValue, LongValue, StringValue>> getSmall3TupleDataSet(ExecutionEnvironment env);
    public static DataSet<Tuple5<IntValue, LongValue, IntValue, StringValue, LongValue>> get5TupleDataSet(ExecutionEnvironment env);
    public static DataSet<StringValue> getStringDataSet(ExecutionEnvironment env);
    public static DataSet<IntValue> getIntDataSet(ExecutionEnvironment env);
    public static DataSet<CustomType> getCustomTypeDataSet(ExecutionEnvironment env);
    public static DataSet<POJO> getSmallPojoDataSet(ExecutionEnvironment env);
}

Custom Test Types

CustomType

Serializable class with integer, long, and string fields for general testing.

public static class CustomType implements Serializable {
    public int myInt;
    public long myLong;
    public String myString;
    
    public CustomType();
    public CustomType(int i, long l, String s);
    
    @Override
    public boolean equals(Object obj);
    @Override
    public int hashCode();
    @Override
    public String toString();
}

POJO

Plain Old Java Object for testing POJO serialization and type extraction.

public static class POJO implements Serializable {
    public int number;
    public String str;
    
    public POJO();
    public POJO(int i, String s);
    
    @Override
    public boolean equals(Object obj);
    @Override
    public int hashCode();
    @Override
    public String toString();
}

CrazyNested

Complex nested structure for advanced testing scenarios.

public static class CrazyNested implements Serializable {
    public POJO nestLvl1;
    public CustomType nestLvl2;
    public int simpleField;
    
    public CrazyNested();
    public CrazyNested(POJO p, CustomType ct, int i);
    
    @Override
    public boolean equals(Object obj);
    @Override
    public int hashCode();
    @Override
    public String toString();
}

FromTupleWithCTor

POJO class that extends Tuple3<String, String, Long> indirectly (via the intermediate FromTuple class), for testing inheritance and serialization scenarios.

public static class FromTupleWithCTor extends FromTuple {
    public FromTupleWithCTor();
    public FromTupleWithCTor(String f0, String f1, Long f2);
}

public static class FromTuple extends Tuple3<String, String, Long> {
    public FromTuple();
}

PojoContainingTupleAndWritable

Complex POJO containing both tuple and Hadoop Writable types for compatibility testing.

public static class PojoContainingTupleAndWritable {
    public int someInt;
    public String someString;
    public IntWritable hadoopFan;
    public Tuple2<Long, Long> theTuple;
    
    public PojoContainingTupleAndWritable();
    public PojoContainingTupleAndWritable(int i, String s, IntWritable iw, Tuple2<Long, Long> t);
}

PojoWithMultiplePojos

POJO containing multiple nested POJO instances for complex object graph testing.

public static class PojoWithMultiplePojos {
    public Pojo1 pojo1;
    public Pojo2 pojo2;
    public Integer key;
    
    public PojoWithMultiplePojos();
}

public static class Pojo1 {
    public String a;
    public String b;
    
    public Pojo1();
}

public static class Pojo2 {
    public int a2;
    
    public Pojo2();
}

PojoWithDateAndEnum

POJO containing Date and Enum fields for specialized serialization testing.

public static class PojoWithDateAndEnum {
    public String group;
    public Date date;
    public Color color;
    
    public PojoWithDateAndEnum();
}

PojoWithCollection

POJO containing collection fields for testing complex collection serialization.

public static class PojoWithCollection {
    public List<Pojo1> pojos;
    public int key;
    
    public PojoWithCollection();
}

Input Format Utilities

InfiniteIntegerInputFormat

Input format that generates infinite sequences of integers for stress testing.

public class InfiniteIntegerInputFormat extends GenericInputFormat<Integer> {
    public InfiniteIntegerInputFormat(boolean delay);
    
    @Override
    public boolean reachedEnd();
    @Override
    public Integer nextRecord(Integer reuse);
}

InfiniteIntegerTupleInputFormat

Input format generating infinite sequences of integer tuples.

public class InfiniteIntegerTupleInputFormat extends GenericInputFormat<Tuple2<Integer, Integer>> {
    public InfiniteIntegerTupleInputFormat(boolean delay);
    
    @Override
    public boolean reachedEnd();
    @Override
    public Tuple2<Integer, Integer> nextRecord(Tuple2<Integer, Integer> reuse);
}

UniformIntTupleGeneratorInputFormat

Input format for generating uniformly distributed integer tuple data.

public class UniformIntTupleGeneratorInputFormat extends GenericInputFormat<Tuple2<Integer, Integer>> {
    public UniformIntTupleGeneratorInputFormat(int numKeys, int numVals);
    
    @Override
    public boolean reachedEnd();
    @Override
    public Tuple2<Integer, Integer> nextRecord(Tuple2<Integer, Integer> reuse);
}

PointInFormat

Input format for reading Point objects from text files.

public class PointInFormat extends DelimitedInputFormat<Point> {
    public PointInFormat();
    
    @Override
    public Point readRecord(Point reusable, byte[] bytes, int offset, int numBytes);
}

Coordinate and Geometry Types

CoordVector

Vector coordinate representation for geometric testing.

public class CoordVector implements Serializable {
    public float x;
    public float y;
    public float z;
    
    public CoordVector();
    public CoordVector(float x, float y, float z);
    
    @Override
    public boolean equals(Object obj);
    @Override
    public int hashCode();
    @Override
    public String toString();
}

Usage Examples

Basic Dataset Usage

// Get standard test data
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<Tuple3<Integer, Long, String>> dataSet = CollectionDataSets.get3TupleDataSet(env);

// Use small dataset for quick tests
DataSet<CustomType> smallDataSet = CollectionDataSets.getSmallCustomTypeDataSet(env);

// Use Value types for serialization testing
DataSet<Tuple3<IntValue, LongValue, StringValue>> valueDataSet = 
    ValueCollectionDataSets.get3TupleDataSet(env);

Custom Input Format Usage

// Infinite data source for stress testing
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<Integer> infiniteInts = env.createInput(new InfiniteIntegerInputFormat(false)); // constructor requires the boolean `delay` flag

// Uniform distribution generator
DataSet<Tuple2<Integer, Integer>> uniformData = 
    env.createInput(new UniformIntTupleGeneratorInputFormat(100, 1000));

Complex Type Testing

// Test with nested objects
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<CrazyNested> nestedDataSet = CollectionDataSets.getCrazyNestedDataSet(env);

// Verify serialization
nestedDataSet
    .map(x -> x) // Identity map to trigger serialization
    .collect(); // Force execution