# Data Transforms and Processing

DataVec provides a comprehensive transformation system for preprocessing and cleaning data before feeding it to machine learning models. The transformation API enables complex data pipelines with column-level operations, mathematical transformations, and data quality improvements.

## Capabilities

### Transform Process

The core transformation workflow manager that orchestrates multiple transformation steps in a pipeline.

```java { .api }
public class TransformProcess {
    public Schema getInitialSchema();
    public Schema getFinalSchema();
    public List<DataAction> getActionList();
    public List<Writable> execute(List<Writable> input);
    public List<List<Writable>> execute(List<List<Writable>> input);
}

public static class TransformProcess.Builder {
    public Builder(Schema initialSchema);
    public Builder removeColumns(String... columnNames);
    public Builder removeColumns(int... columnIndices);
    public Builder renameColumn(String oldName, String newName);
    public Builder filter(Condition condition);
    public Builder transform(Transform transform);
    public Builder convertToString(String columnName);
    public Builder convertToDouble(String columnName);
    public Builder convertToInteger(String columnName);
    public Builder normalize(String columnName, Normalize normalization);
    public Builder standardize(String columnName);
    public Builder categoricalToOneHot(String columnName);
    public Builder categoricalToInteger(String columnName);
    public Builder stringToTimeTransform(String columnName, String dateTimeFormat, DateTimeZone dateTimeZone);
    public Builder conditionalReplaceValueTransform(String columnName, Condition condition, Writable newValue);
    public Builder appendStringColumnTransform(String columnName, String stringToAppend);
    public Builder replaceStringTransform(String columnName, Map<String, String> mapping);
    public TransformProcess build();
}
```

**Usage Example:**

```java
import org.datavec.api.transform.TransformProcess;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.transform.condition.ConditionOp;
import org.datavec.api.transform.condition.column.DoubleColumnCondition;

// Define input schema
Schema inputSchema = new Schema.Builder()
    .addColumnString("name")
    .addColumnInteger("age")
    .addColumnDouble("income")
    .addColumnCategorical("category", Arrays.asList("A", "B", "C"))
    .build();

// Build transformation process
TransformProcess tp = new TransformProcess.Builder(inputSchema)
    .removeColumns("name") // Remove name column
    .filter(new DoubleColumnCondition("income", ConditionOp.LessOrEqual, 0.0)) // Drop rows with non-positive income (filter removes rows that match the condition)
    .normalize("income", Normalize.MinMax) // Min-max normalize income
    .categoricalToOneHot("category") // One-hot encode category
    .build();

// Apply transformations
List<Writable> input = Arrays.asList(
    new Text("John"),
    new IntWritable(25),
    new DoubleWritable(50000.0),
    new Text("A")
);

List<Writable> transformed = tp.execute(input);
// Result: [IntWritable(25), DoubleWritable(normalized_income), IntWritable(1), IntWritable(0), IntWritable(0)]
```

### Core Transform Interface

Base interface for all data transformations.

```java { .api }
public interface Transform {
    List<Writable> map(List<Writable> writables);
    String[] outputColumnNames();
    ColumnType[] outputColumnTypes();
    String transform(String input);
}
```

### Column Operations

Operations that work on individual columns of data.

```java { .api }
public enum ColumnType {
    String,
    Integer,
    Long,
    Double,
    Categorical,
    Time,
    Bytes,
    Boolean,
    NDArray
}

public abstract class ColumnOp {
    public abstract ColumnType getColumnType();
    public abstract String[] columnNames();
}

public class ConvertToString extends ColumnOp {
    public ConvertToString(String columnName);
}

public class ConvertToDouble extends ColumnOp {
    public ConvertToDouble(String columnName);
}

public class ConvertToInteger extends ColumnOp {
    public ConvertToInteger(String columnName);
}
```

**Usage Example:**

```java
// Convert string column to double
Transform convertTransform = new ConvertToDouble("price_string");

List<Writable> input = Arrays.asList(
    new Text("Product A"),
    new Text("29.99") // String representation of price
);

List<Writable> output = convertTransform.map(input);
// Result: [Text("Product A"), DoubleWritable(29.99)]
```

140
### Mathematical Operations
141
142
Mathematical transformations and calculations on numeric columns.
143
144
```java { .api }
145
public enum MathOp {
146
Add,
147
Subtract,
148
Multiply,
149
Divide,
150
Modulus,
151
ReverseSubtract,
152
ReverseDivide,
153
ScalarMin,
154
ScalarMax,
155
Abs,
156
Ceil,
157
Floor,
158
Round,
159
Sqrt,
160
Square,
161
Log,
162
Log10,
163
Exp,
164
Pow,
165
Sin,
166
Cos,
167
Tan,
168
ASin,
169
ACos,
170
ATan,
171
Sinh,
172
Cosh,
173
Tanh
174
}
175
176
public class MathFunction implements Transform {
177
public MathFunction(String columnName, MathOp operation);
178
public MathFunction(String columnName, MathOp operation, double scalar);
179
}
180
181
public class AddConstantColumnTransform implements Transform {
182
public AddConstantColumnTransform(String columnName, double value);
183
}
184
185
public class MultiplyConstantColumnTransform implements Transform {
186
public MultiplyConstantColumnTransform(String columnName, double value);
187
}
188
```
189
**Usage Examples:**

```java
// Square all values in a column
Transform squareTransform = new MathFunction("values", MathOp.Square);

// Add constant to column
Transform addConstant = new AddConstantColumnTransform("salary", 5000.0);

// Multiply by constant
Transform multiplyConstant = new MultiplyConstantColumnTransform("price", 1.1); // 10% increase

List<Writable> input = Arrays.asList(new DoubleWritable(100.0));
List<Writable> squared = squareTransform.map(input);
// Result: [DoubleWritable(10000.0)]
```

207
### Reduction Operations
208
209
Operations that reduce multiple rows to summary statistics.
210
211
```java { .api }
212
public enum ReduceOp {
213
Min,
214
Max,
215
Range,
216
Sum,
217
Mean,
218
Prod,
219
Stdev,
220
UncorrectedStdDev,
221
Variance,
222
PopulationVariance,
223
Count,
224
CountUnique
225
}
226
227
public class Reducer {
228
public static Builder builder(ReduceOp op, String column);
229
public IAggregableReduceOp<List<Writable>, List<Writable>> getReduction();
230
}
231
232
public enum StringReduceOp {
233
Merge,
234
Append,
235
Prepend,
236
Replace
237
}
238
239
public class StringReducer {
240
public static Builder builder(StringReduceOp op, String column);
241
public static StringReducer merge(String column, String delimiter);
242
public static StringReducer append(String column, String stringToAppend);
243
}
244
```
245
**Usage Examples:**

```java
// Calculate mean of a numeric column
IAggregableReduceOp<List<Writable>, List<Writable>> meanReduction =
    Reducer.builder(ReduceOp.Mean, "values").build().getReduction();

// Merge string values with delimiter
StringReducer merger = StringReducer.merge("names", ",");

List<List<Writable>> data = Arrays.asList(
    Arrays.asList(new DoubleWritable(10.0), new Text("Alice")),
    Arrays.asList(new DoubleWritable(20.0), new Text("Bob")),
    Arrays.asList(new DoubleWritable(30.0), new Text("Charlie"))
);

// Apply reduction operations
List<Writable> meanResult = meanReduction.aggregate(data);
// Result: [DoubleWritable(20.0)] - mean of 10, 20, 30
```

267
### Normalization and Standardization
268
269
Statistical normalization techniques for numeric data.
270
271
```java { .api }
272
public enum Normalize {
273
MinMax,
274
Standardize,
275
Normalize,
276
Log2,
277
Log10
278
}
279
280
public class NormalizeTransform implements Transform {
281
public NormalizeTransform(String columnName, Normalize normalization);
282
public NormalizeTransform(String columnName, Normalize normalization,
283
double minValue, double maxValue);
284
}
285
286
public class StandardizeTransform implements Transform {
287
public StandardizeTransform(String columnName);
288
public StandardizeTransform(String columnName, double mean, double stdev);
289
}
290
```
291
**Usage Examples:**

```java
// Min-max normalization to [0, 1]
Transform minMaxNorm = new NormalizeTransform("values", Normalize.MinMax);

// Z-score standardization
Transform standardize = new StandardizeTransform("values");

// Custom min-max range [0, 100]
Transform customRange = new NormalizeTransform("values", Normalize.MinMax, 0.0, 100.0);

List<Writable> input = Arrays.asList(new DoubleWritable(75.0));
List<Writable> normalized = minMaxNorm.map(input);
// Result depends on previously calculated min/max values from data
```

309
### Categorical Data Handling
310
311
Transformations for categorical and string data.
312
313
```java { .api }
314
public class CategoricalToIntegerTransform implements Transform {
315
public CategoricalToIntegerTransform(String columnName, List<String> categoryList);
316
}
317
318
public class CategoricalToOneHotTransform implements Transform {
319
public CategoricalToOneHotTransform(String columnName, List<String> categoryList);
320
}
321
322
public class StringToCategoricalTransform implements Transform {
323
public StringToCategoricalTransform(String columnName, List<String> categoryList);
324
}
325
326
public class ReplaceStringTransform implements Transform {
327
public ReplaceStringTransform(String columnName, Map<String, String> mapping);
328
}
329
```
330
**Usage Examples:**

```java
// Convert categories to integers
List<String> categories = Arrays.asList("small", "medium", "large");
Transform catToInt = new CategoricalToIntegerTransform("size", categories);

// Convert categories to one-hot encoding
Transform catToOneHot = new CategoricalToOneHotTransform("size", categories);

// String replacement mapping
Map<String, String> replacements = new HashMap<>();
replacements.put("yes", "1");
replacements.put("no", "0");
Transform stringReplace = new ReplaceStringTransform("response", replacements);

List<Writable> input = Arrays.asList(new Text("medium"));
List<Writable> intResult = catToInt.map(input);
// Result: [IntWritable(1)] - "medium" is index 1

List<Writable> oneHotResult = catToOneHot.map(input);
// Result: [IntWritable(0), IntWritable(1), IntWritable(0)] - one-hot for "medium"
```

355
### Conditional Transformations
356
357
Conditional logic for data transformations.
358
359
```java { .api }
360
public interface Condition {
361
boolean condition(List<Writable> list);
362
boolean condition(Object input);
363
String[] getColumnNames();
364
}
365
366
public enum ConditionOp {
367
Equal,
368
NotEqual,
369
LessThan,
370
LessOrEqual,
371
GreaterThan,
372
GreaterOrEqual
373
}
374
375
public class DoubleColumnCondition implements Condition {
376
public DoubleColumnCondition(String columnName, ConditionOp op, double value);
377
}
378
379
public class StringColumnCondition implements Condition {
380
public StringColumnCondition(String columnName, ConditionOp op, String value);
381
}
382
383
public class ConditionalReplaceValueTransform implements Transform {
384
public ConditionalReplaceValueTransform(String columnName, Condition condition, Writable newValue);
385
}
386
```
387
**Usage Examples:**

```java
// Replace negative values with zero
Condition negativeCondition = new DoubleColumnCondition("salary", ConditionOp.LessThan, 0.0);
Transform replaceNegative = new ConditionalReplaceValueTransform("salary", negativeCondition, new DoubleWritable(0.0));

// Replace specific string values
Condition invalidString = new StringColumnCondition("status", ConditionOp.Equal, "INVALID");
Transform replaceInvalid = new ConditionalReplaceValueTransform("status", invalidString, new Text("UNKNOWN"));

List<Writable> input = Arrays.asList(new DoubleWritable(-1000.0));
List<Writable> result = replaceNegative.map(input);
// Result: [DoubleWritable(0.0)] - negative value replaced with zero
```

404
## Integration Patterns
405
406
### With RecordReader
407
408
```java
409
// Apply transformations to record reader output
410
RecordReader reader = new CSVRecordReader();
411
reader.initialize(new FileSplit(new File("data.csv")));
412
413
TransformProcess tp = new TransformProcess.Builder(schema)
414
.normalize("feature1", Normalize.MinMax)
415
.categoricalToOneHot("category")
416
.build();
417
418
List<List<Writable>> transformedData = new ArrayList<>();
419
while (reader.hasNext()) {
420
List<Writable> record = reader.next();
421
List<Writable> transformed = tp.execute(record);
422
transformedData.add(transformed);
423
}
424
```
425
426
### With DataSetIterator
427
428
```java
429
// Transform data before creating DataSet
430
TransformProcessRecordReader transformReader = new TransformProcessRecordReader(baseReader, transformProcess);
431
432
DataSetIterator iterator = new RecordReaderDataSetIterator(
433
transformReader,
434
batchSize,
435
labelIndex,
436
numClasses
437
);
438
```
439
440
### Batch Processing
441
442
```java
443
// Process data in batches with transformations
444
List<List<Writable>> batch = new ArrayList<>();
445
// ... populate batch
446
447
// Apply transformation to entire batch
448
List<List<Writable>> transformedBatch = transformProcess.execute(batch);
449
450
// Process transformed batch
451
for (List<Writable> record : transformedBatch) {
452
// Handle transformed record
453
}
454
```
455
456
## Types
457
458
### Core Interfaces
459
460
```java { .api }
461
public interface Transform {
462
List<Writable> map(List<Writable> writables);
463
String[] outputColumnNames();
464
ColumnType[] outputColumnTypes();
465
String transform(String input);
466
}
467
468
public interface Condition {
469
boolean condition(List<Writable> list);
470
boolean condition(Object input);
471
String[] getColumnNames();
472
}
473
```
474
475
### Transform Process Classes
476
477
```java { .api }
478
public class TransformProcess;
479
public class TransformProcess.Builder;
480
public class TransformProcessRecordReader implements RecordReader;
481
```
482
483
### Column Operations
484
485
```java { .api }
486
public enum ColumnType;
487
public abstract class ColumnOp;
488
public class ConvertToString extends ColumnOp;
489
public class ConvertToDouble extends ColumnOp;
490
public class ConvertToInteger extends ColumnOp;
491
```
492
493
### Mathematical Operations
494
495
```java { .api }
496
public enum MathOp;
497
public class MathFunction implements Transform;
498
public class AddConstantColumnTransform implements Transform;
499
public class MultiplyConstantColumnTransform implements Transform;
500
```
501
502
### Reduction Operations
503
504
```java { .api }
505
public enum ReduceOp;
506
public enum StringReduceOp;
507
public class Reducer;
508
public class StringReducer;
509
```
510
511
### Normalization
512
513
```java { .api }
514
public enum Normalize;
515
public class NormalizeTransform implements Transform;
516
public class StandardizeTransform implements Transform;
517
```
518
519
### Categorical Transforms
520
521
```java { .api }
522
public class CategoricalToIntegerTransform implements Transform;
523
public class CategoricalToOneHotTransform implements Transform;
524
public class StringToCategoricalTransform implements Transform;
525
public class ReplaceStringTransform implements Transform;
526
```
527
528
### Conditional Operations
529
530
```java { .api }
531
public enum ConditionOp;
532
public class DoubleColumnCondition implements Condition;
533
public class StringColumnCondition implements Condition;
534
public class ConditionalReplaceValueTransform implements Transform;
535
```