0
# Data Structures and Vectorization
1
2
This module provides core data structures for efficient columnar processing, converters between internal and external data formats, and vectorized operations for high-performance analytics workloads.
3
4
## Capabilities
5
6
### Vectorized Column Access
7
8
Interfaces for accessing data in columnar format, providing efficient memory access patterns and enabling vectorized operations for analytical workloads.
9
10
```java { .api }
11
/**
12
* Base interface for columnar data access
13
*/
14
interface ColumnVector {
15
/** Check if value at given row is null */
16
boolean isNullAt(int rowId);
17
18
/** Get number of rows in this vector */
19
int getLen();
20
}
21
22
/** Columnar access for boolean data */
23
interface BooleanColumnVector extends ColumnVector {
24
boolean getBoolean(int rowId);
25
}
26
27
/** Columnar access for integer data */
28
interface IntColumnVector extends ColumnVector {
29
int getInt(int rowId);
30
}
31
32
/** Columnar access for long data */
33
interface LongColumnVector extends ColumnVector {
34
long getLong(int rowId);
35
}
36
37
/** Columnar access for float data */
38
interface FloatColumnVector extends ColumnVector {
39
float getFloat(int rowId);
40
}
41
42
/** Columnar access for double data */
43
interface DoubleColumnVector extends ColumnVector {
44
double getDouble(int rowId);
45
}
46
47
/** Columnar access for byte array data */
48
interface BytesColumnVector extends ColumnVector {
49
/** Get bytes data at row position */
50
Bytes getBytes(int rowId);
51
52
/** Nested class describing a slice of a byte array: the backing array, a start offset, and a length */
53
class Bytes {
54
/** Byte data */
55
public byte[] data;
56
57
/** Offset in data array */
58
public int offset;
59
60
/** Length of data */
61
public int len;
62
63
/** Create bytes wrapper */
64
Bytes(byte[] data, int offset, int len);
65
66
/** Set bytes data */
67
void setByteArray(byte[] data, int offset, int len);
68
}
69
}
70
71
/** Columnar access for decimal data */
72
interface DecimalColumnVector extends ColumnVector {
73
DecimalData getDecimal(int rowId, int precision, int scale);
74
}
75
76
/** Columnar access for timestamp data */
77
interface TimestampColumnVector extends ColumnVector {
78
TimestampData getTimestamp(int rowId, int precision);
79
}
80
81
/** Columnar access for array data */
82
interface ArrayColumnVector extends ColumnVector {
83
ArrayData getArray(int rowId);
84
}
85
86
/** Columnar access for row data */
87
interface RowColumnVector extends ColumnVector {
88
RowData getRow(int rowId);
89
}
90
```
91
92
### Vectorized Column Batch
93
94
Main container for batch columnar processing, organizing multiple column vectors for efficient processing of tabular data.
95
96
```java { .api }
97
/**
98
* Main class for batch columnar processing
99
* Organizes multiple column vectors for efficient tabular data processing
100
*/
101
class VectorizedColumnBatch {
102
/** Column vectors in this batch */
103
public final ColumnVector[] columns;
104
105
/** Default batch size for optimal performance */
106
public static final int DEFAULT_SIZE = 2048;
107
108
/**
109
* Create a new vectorized column batch
110
* @param vectors Array of column vectors to organize
111
*/
112
VectorizedColumnBatch(ColumnVector[] vectors);
113
114
/** Set the number of rows in this batch */
115
void setNumRows(int numRows);
116
117
/** Get the number of rows in this batch */
118
int getNumRows();
119
120
/** Get number of columns (arity) */
121
int getArity();
122
123
/** Check if value is null at given position */
124
boolean isNullAt(int rowId, int colId);
125
126
/** Get boolean value at position */
127
boolean getBoolean(int rowId, int colId);
128
129
/** Get byte value at position */
130
byte getByte(int rowId, int colId);
131
132
/** Get short value at position */
133
short getShort(int rowId, int colId);
134
135
/** Get integer value at position */
136
int getInt(int rowId, int colId);
137
138
/** Get long value at position */
139
long getLong(int rowId, int colId);
140
141
/** Get float value at position */
142
float getFloat(int rowId, int colId);
143
144
/** Get double value at position */
145
double getDouble(int rowId, int colId);
146
147
/** Get string value at position */
148
String getString(int rowId, int colId);
149
150
/** Get decimal value at position */
151
DecimalData getDecimal(int rowId, int colId, int precision, int scale);
152
153
/** Get timestamp value at position */
154
TimestampData getTimestamp(int rowId, int colId, int precision);
155
156
/** Get array value at position */
157
ArrayData getArray(int rowId, int colId);
158
159
/** Get row value at position */
160
RowData getRow(int rowId, int colId);
161
}
162
```
163
164
### Writable Column Vectors
165
166
Mutable column vector implementations that support writing data, used for building columnar data structures.
167
168
```java { .api }
169
/**
170
* Base class for writable column vectors
171
*/
172
abstract class WritableColumnVector implements ColumnVector {
173
/** Set null value at given row */
174
void setNullAt(int rowId);
175
176
/** Mark the value at the given row as not null */
177
void setNotNullAt(int rowId);
178
179
/** Reset the vector for reuse */
180
abstract void reset();
181
}
182
183
/** Writable boolean vector implementation */
184
class WritableBooleanVector extends WritableColumnVector
185
implements BooleanColumnVector {
186
/** Set boolean value at given row */
187
void setBoolean(int rowId, boolean value);
188
}
189
190
/** Writable integer vector implementation */
191
class WritableIntVector extends WritableColumnVector
192
implements IntColumnVector {
193
/** Set integer value at given row */
194
void setInt(int rowId, int value);
195
}
196
197
/** Writable long vector implementation */
198
class WritableLongVector extends WritableColumnVector
199
implements LongColumnVector {
200
/** Set long value at given row */
201
void setLong(int rowId, long value);
202
}
203
204
/** Writable float vector implementation */
205
class WritableFloatVector extends WritableColumnVector
206
implements FloatColumnVector {
207
/** Set float value at given row */
208
void setFloat(int rowId, float value);
209
}
210
211
/** Writable double vector implementation */
212
class WritableDoubleVector extends WritableColumnVector
213
implements DoubleColumnVector {
214
/** Set double value at given row */
215
void setDouble(int rowId, double value);
216
}
217
218
/** Writable vector holding a single byte value per row */
219
class WritableByteVector extends WritableColumnVector
220
implements ByteColumnVector {
221
/** Set byte value at given row */
222
void setByte(int rowId, byte value);
223
}
224
225
/** Writable vector holding a byte array value per row */
226
class WritableBytesVector extends WritableColumnVector
227
implements BytesColumnVector {
228
/** Set bytes value at given row */
229
void setBytes(int rowId, byte[] value);
230
}
231
232
/** Writable timestamp vector implementation */
233
class WritableTimestampVector extends WritableColumnVector
234
implements TimestampColumnVector {
235
/** Set timestamp value at given row */
236
void setTimestamp(int rowId, TimestampData value);
237
}
238
```
239
240
### Heap-based Vector Implementations
241
242
Concrete column vector implementations that store columnar data efficiently in heap memory.
243
244
```java { .api }
245
/** Heap-based boolean vector */
246
class HeapBooleanVector extends WritableBooleanVector {
247
HeapBooleanVector(int len);
248
}
249
250
/** Heap-based vector of single byte values */
251
class HeapByteVector extends WritableByteVector {
252
HeapByteVector(int len);
253
}
254
255
/** Heap-based integer vector */
256
class HeapIntVector extends WritableIntVector {
257
HeapIntVector(int len);
258
}
259
260
/** Heap-based long vector */
261
class HeapLongVector extends WritableLongVector {
262
HeapLongVector(int len);
263
}
264
265
/** Heap-based float vector */
266
class HeapFloatVector extends WritableFloatVector {
267
HeapFloatVector(int len);
268
}
269
270
/** Heap-based double vector */
271
class HeapDoubleVector extends WritableDoubleVector {
272
HeapDoubleVector(int len);
273
}
274
275
/** Heap-based vector of byte array values */
276
class HeapBytesVector extends WritableBytesVector {
277
HeapBytesVector(int len);
278
}
279
280
/** Heap-based timestamp vector */
281
class HeapTimestampVector extends WritableTimestampVector {
282
HeapTimestampVector(int len);
283
}
284
```
285
286
### Data Structure Conversion
287
288
Framework for converting between Flink's internal data representations and external formats, enabling interoperability with various data processing systems.
289
290
```java { .api }
291
/**
292
* Key interface for converting between internal and external data formats
293
* Enables interoperability between Flink's internal representations and external systems
294
*/
295
interface DataStructureConverter<I, E> {
296
/** Convert from internal format to external format */
297
E toExternal(I internal);
298
299
/** Convert from external format to internal format */
300
I toInternal(E external);
301
}
302
```
303
304
### Array Converters
305
306
Specialized converters for array data types, handling conversion between internal array representations and external array formats.
307
308
```java { .api }
309
/** Convert boolean arrays */
310
class ArrayBooleanArrayConverter implements DataStructureConverter<ArrayData, boolean[]> {
311
ArrayBooleanArrayConverter();
312
}
313
314
/** Convert integer arrays */
315
class ArrayIntArrayConverter implements DataStructureConverter<ArrayData, int[]> {
316
ArrayIntArrayConverter();
317
}
318
319
/** Convert long arrays */
320
class ArrayLongArrayConverter implements DataStructureConverter<ArrayData, long[]> {
321
ArrayLongArrayConverter();
322
}
323
324
/** Convert float arrays */
325
class ArrayFloatArrayConverter implements DataStructureConverter<ArrayData, float[]> {
326
ArrayFloatArrayConverter();
327
}
328
329
/** Convert double arrays */
330
class ArrayDoubleArrayConverter implements DataStructureConverter<ArrayData, double[]> {
331
ArrayDoubleArrayConverter();
332
}
333
334
/** Convert string arrays */
335
class ArrayStringArrayConverter implements DataStructureConverter<ArrayData, String[]> {
336
ArrayStringArrayConverter();
337
}
338
```
339
340
### Collection Converters
341
342
Converters for common Java collection types, enabling seamless integration with standard Java data structures.
343
344
```java { .api }
345
/** Convert to/from ArrayList */
346
class ArrayListConverter<E> implements DataStructureConverter<ArrayData, ArrayList<E>> {
347
ArrayListConverter(DataStructureConverter<Object, E> elementConverter);
348
}
349
350
/** Convert to/from Map structures */
351
class MapMapConverter<K, V> implements DataStructureConverter<MapData, Map<K, V>> {
352
MapMapConverter(
353
DataStructureConverter<Object, K> keyConverter,
354
DataStructureConverter<Object, V> valueConverter
355
);
356
}
357
```
358
359
### Temporal Data Converters
360
361
Specialized converters for date, time, and timestamp data, handling conversion between internal temporal representations and Java temporal types.
362
363
```java { .api }
364
/** Convert date data */
365
class DateDateConverter implements DataStructureConverter<Integer, Date> {
366
DateDateConverter();
367
}
368
369
/** Convert time data */
370
class TimeTimeConverter implements DataStructureConverter<Integer, Time> {
371
TimeTimeConverter();
372
}
373
374
/** Convert timestamp data */
375
class TimestampTimestampConverter implements DataStructureConverter<TimestampData, Timestamp> {
376
TimestampTimestampConverter(int precision);
377
}
378
```
379
380
### Special Converters
381
382
Converters for complex data types including raw objects, structured objects, and row data.
383
384
```java { .api }
385
/** Convert raw objects */
386
class RawObjectConverter<T> implements DataStructureConverter<RawValueData<T>, T> {
387
RawObjectConverter(Class<T> clazz);
388
}
389
390
/** Convert structured objects */
391
class StructuredObjectConverter<T> implements DataStructureConverter<RowData, T> {
392
StructuredObjectConverter(Class<T> clazz, LogicalType dataType);
393
}
394
395
/** Convert row data */
396
class RowRowConverter implements DataStructureConverter<RowData, Row> {
397
RowRowConverter(LogicalType[] fieldTypes);
398
}
399
```
400
401
### Binary Data Utilities
402
403
Utilities for efficient manipulation of binary data representations, optimized for high-performance data processing.
404
405
```java { .api }
406
/** Utilities for binary row data manipulation */
407
class BinaryRowDataUtil {
408
/** Check if two binary rows are equal */
409
static boolean equals(BinaryRowData row1, BinaryRowData row2);
410
411
/** Get hash code for binary row data */
412
static int hashCode(BinaryRowData row);
413
414
/** Copy binary row data */
415
static BinaryRowData copy(BinaryRowData source);
416
}
417
418
/** Binary string data utilities */
419
class BinaryStringDataUtil {
420
/** Create binary string from Java string */
421
static BinaryStringData fromString(String str);
422
423
/** Convert binary string to Java string */
424
static String toString(BinaryStringData binaryString);
425
426
/** Compare two binary strings */
427
static int compare(BinaryStringData str1, BinaryStringData str2);
428
}
429
430
/** Interface for binary data writing */
431
interface BinaryWriter {
432
/** Write binary data */
433
void writeBytes(byte[] bytes);
434
435
/** Complete the write operation */
436
void complete();
437
}
438
439
/** Wrapper for row data with boxed primitives */
440
class BoxedWrapperRowData implements RowData {
441
BoxedWrapperRowData(RowData row);
442
443
/** Get field value as boxed primitive */
444
Object get(int pos);
445
}
446
```
447
448
### Dictionary Encoding
449
450
Interface for dictionary encoding support, enabling compressed storage and efficient processing of categorical data.
451
452
```java { .api }
453
/**
454
* Interface for dictionary encoding
455
* Enables compressed storage of categorical data
456
*/
457
interface Dictionary {
458
/** Decode dictionary encoded value */
459
Object decodeToBinary(int id);
460
461
/** Get the size of the dictionary */
462
int size();
463
}
464
```
465
466
## Usage Examples
467
468
```java
469
// Create column vectors
470
ColumnVector[] vectors = new ColumnVector[3];
471
vectors[0] = new HeapIntVector(1024);
472
vectors[1] = new HeapDoubleVector(1024);
473
vectors[2] = new HeapBytesVector(1024);
474
475
// Create a vectorized batch for processing
476
VectorizedColumnBatch batch = new VectorizedColumnBatch(vectors);
477
478
// Write data to the batch
479
WritableIntVector intCol = (WritableIntVector) batch.columns[0];
480
WritableDoubleVector doubleCol = (WritableDoubleVector) batch.columns[1];
481
WritableBytesVector bytesCol = (WritableBytesVector) batch.columns[2];
482
483
for (int i = 0; i < 100; i++) {
484
intCol.setInt(i, i * 10);
485
doubleCol.setDouble(i, i * 3.14);
486
bytesCol.setBytes(i, ("value_" + i).getBytes());
487
}
488
batch.setNumRows(100);
489
490
// Read data from the batch
491
for (int i = 0; i < batch.getNumRows(); i++) {
492
int intValue = batch.getInt(i, 0);
493
double doubleValue = batch.getDouble(i, 1);
494
String stringValue = batch.getString(i, 2);
495
}
496
497
// Convert between data formats (internalArrayData is assumed to be an existing ArrayData instance)
498
DataStructureConverter<ArrayData, int[]> arrayConverter =
499
new ArrayIntArrayConverter();
500
501
int[] externalArray = arrayConverter.toExternal(internalArrayData);
502
ArrayData internalArray = arrayConverter.toInternal(externalArray);
503
```