ETL library for machine learning data preprocessing across diverse formats including HDFS, Spark, Images, Video, Audio, CSV, and Excel
—
DataVec provides a comprehensive transformation system for preprocessing and cleaning data before feeding it to machine learning models. The transformation API enables complex data pipelines with column-level operations, mathematical transformations, and data quality improvements.
The core transformation workflow manager that orchestrates multiple transformation steps in a pipeline.
public class TransformProcess {
public static Builder builder(Schema initialSchema);
public Schema getInitialSchema();
public Schema getFinalSchema();
public List<DataAction> getActionList();
public List<Writable> execute(List<Writable> input);
public List<List<Writable>> execute(List<List<Writable>> input);
}
public static class TransformProcess.Builder {
public Builder removeColumns(String... columnNames);
public Builder removeColumns(int... columnIndices);
public Builder renameColumn(String oldName, String newName);
public Builder filter(Condition condition);
public Builder transform(Transform transform);
public Builder convertToString(String columnName);
public Builder convertToDouble(String columnName);
public Builder convertToInteger(String columnName);
public Builder normalize(String columnName, Normalize normalization);
public Builder standardize(String columnName);
public Builder categoricalToOneHot(String columnName);
public Builder categoricalToInteger(String columnName);
public Builder stringToTimeTransform(String columnName, String dateTimeFormat, DateTimeZone dateTimeZone);
public Builder conditionalReplaceValueTransform(String columnName, Condition condition, Writable newValue);
public Builder appendStringColumnTransform(String columnName, String stringToAppend);
public Builder replaceStringTransform(String columnName, Map<String, String> mapping);
public TransformProcess build();
}

Usage Example:
import org.datavec.api.transform.TransformProcess;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.transform.condition.ConditionOp;
import org.datavec.api.transform.condition.column.DoubleColumnCondition;
// Define input schema
Schema inputSchema = new Schema.Builder()
.addColumnString("name")
.addColumnInteger("age")
.addColumnDouble("income")
.addColumnCategorical("category", Arrays.asList("A", "B", "C"))
.build();
// Build transformation process
TransformProcess tp = new TransformProcess.Builder(inputSchema)
.removeColumns("name") // Remove name column
.filter(new DoubleColumnCondition("income", ConditionOp.GreaterThan, 0.0)) // Filter positive income
.normalize("income", Normalize.MinMax) // Min-max normalize income
.categoricalToOneHot("category") // One-hot encode category
.build();
// Apply transformations
List<Writable> input = Arrays.asList(
new Text("John"),
new IntWritable(25),
new DoubleWritable(50000.0),
new Text("A")
);
List<Writable> transformed = tp.execute(input);
// Result: [IntWritable(25), DoubleWritable(normalized_income), IntWritable(1), IntWritable(0), IntWritable(0)]

Base interface for all data transformations.
public interface Transform {
List<Writable> map(List<Writable> writables);
String[] outputColumnNames();
ColumnType[] outputColumnTypes();
String transform(String input);
}

Operations that work on individual columns of data.
public enum ColumnType {
String,
Integer,
Long,
Double,
Categorical,
Time,
Bytes,
Boolean,
NDArray
}
public abstract class ColumnOp {
public abstract ColumnType getColumnType();
public abstract String[] columnNames();
}
public class ConvertToString extends ColumnOp {
public ConvertToString(String columnName);
}
public class ConvertToDouble extends ColumnOp {
public ConvertToDouble(String columnName);
}
public class ConvertToInteger extends ColumnOp {
public ConvertToInteger(String columnName);
}

Usage Example:
// Convert string column to double
Transform convertTransform = new ConvertToDouble("price_string");
List<Writable> input = Arrays.asList(
new Text("Product A"),
new Text("29.99") // String representation of price
);
List<Writable> output = convertTransform.map(input);
// Result: [Text("Product A"), DoubleWritable(29.99)]

Mathematical transformations and calculations on numeric columns.
public enum MathOp {
Add,
Subtract,
Multiply,
Divide,
Modulus,
ReverseSubtract,
ReverseDivide,
ScalarMin,
ScalarMax,
Abs,
Ceil,
Floor,
Round,
Sqrt,
Square,
Log,
Log10,
Exp,
Pow,
Sin,
Cos,
Tan,
ASin,
ACos,
ATan,
Sinh,
Cosh,
Tanh
}
public class MathFunction implements Transform {
public MathFunction(String columnName, MathOp operation);
public MathFunction(String columnName, MathOp operation, double scalar);
}
public class AddConstantColumnTransform implements Transform {
public AddConstantColumnTransform(String columnName, double value);
}
public class MultiplyConstantColumnTransform implements Transform {
public MultiplyConstantColumnTransform(String columnName, double value);
}

Usage Examples:
// Square all values in a column
Transform squareTransform = new MathFunction("values", MathOp.Square);
// Add constant to column
Transform addConstant = new AddConstantColumnTransform("salary", 5000.0);
// Multiply by constant
Transform multiplyConstant = new MultiplyConstantColumnTransform("price", 1.1); // 10% increase
List<Writable> input = Arrays.asList(new DoubleWritable(100.0));
List<Writable> squared = squareTransform.map(input);
// Result: [DoubleWritable(10000.0)]

Operations that reduce multiple rows to summary statistics.
public enum ReduceOp {
Min,
Max,
Range,
Sum,
Mean,
Prod,
Stdev,
UncorrectedStdDev,
Variance,
PopulationVariance,
Count,
CountUnique
}
public class Reducer {
public static Builder builder(ReduceOp op, String column);
public IAggregableReduceOp<List<Writable>, List<Writable>> getReduction();
}
public enum StringReduceOp {
Merge,
Append,
Prepend,
Replace
}
public class StringReducer {
public static Builder builder(StringReduceOp op, String column);
public static StringReducer merge(String column, String delimiter);
public static StringReducer append(String column, String stringToAppend);
}

Usage Examples:
// Calculate mean of a numeric column
IAggregableReduceOp<List<Writable>, List<Writable>> meanReduction =
Reducer.builder(ReduceOp.Mean, "values").build().getReduction();
// Merge string values with delimiter
StringReducer merger = StringReducer.merge("names", ",");
List<List<Writable>> data = Arrays.asList(
Arrays.asList(new DoubleWritable(10.0), new Text("Alice")),
Arrays.asList(new DoubleWritable(20.0), new Text("Bob")),
Arrays.asList(new DoubleWritable(30.0), new Text("Charlie"))
);
// Apply reduction operations
List<Writable> meanResult = meanReduction.aggregate(data);
// Result: [DoubleWritable(20.0)] - mean of 10, 20, 30

Statistical normalization techniques for numeric data.
public enum Normalize {
MinMax,
Standardize,
Normalize,
Log2,
Log10
}
public class NormalizeTransform implements Transform {
public NormalizeTransform(String columnName, Normalize normalization);
public NormalizeTransform(String columnName, Normalize normalization,
double minValue, double maxValue);
}
public class StandardizeTransform implements Transform {
public StandardizeTransform(String columnName);
public StandardizeTransform(String columnName, double mean, double stdev);
}

Usage Examples:
// Min-max normalization to [0, 1]
Transform minMaxNorm = new NormalizeTransform("values", Normalize.MinMax);
// Z-score standardization
Transform standardize = new StandardizeTransform("values");
// Custom min-max range [0, 100]
Transform customRange = new NormalizeTransform("values", Normalize.MinMax, 0.0, 100.0);
List<Writable> input = Arrays.asList(new DoubleWritable(75.0));
List<Writable> normalized = minMaxNorm.map(input);
// Result depends on previously calculated min/max values from the data

Transformations for categorical and string data.
public class CategoricalToIntegerTransform implements Transform {
public CategoricalToIntegerTransform(String columnName, List<String> categoryList);
}
public class CategoricalToOneHotTransform implements Transform {
public CategoricalToOneHotTransform(String columnName, List<String> categoryList);
}
public class StringToCategoricalTransform implements Transform {
public StringToCategoricalTransform(String columnName, List<String> categoryList);
}
public class ReplaceStringTransform implements Transform {
public ReplaceStringTransform(String columnName, Map<String, String> mapping);
}

Usage Examples:
// Convert categories to integers
List<String> categories = Arrays.asList("small", "medium", "large");
Transform catToInt = new CategoricalToIntegerTransform("size", categories);
// Convert categories to one-hot encoding
Transform catToOneHot = new CategoricalToOneHotTransform("size", categories);
// String replacement mapping
Map<String, String> replacements = new HashMap<>();
replacements.put("yes", "1");
replacements.put("no", "0");
Transform stringReplace = new ReplaceStringTransform("response", replacements);
List<Writable> input = Arrays.asList(new Text("medium"));
List<Writable> intResult = catToInt.map(input);
// Result: [IntWritable(1)] - "medium" is index 1
List<Writable> oneHotResult = catToOneHot.map(input);
// Result: [IntWritable(0), IntWritable(1), IntWritable(0)] - one-hot for "medium"

Conditional logic for data transformations.
public interface Condition {
boolean condition(List<Writable> list);
boolean condition(Object input);
String[] getColumnNames();
}
public enum ConditionOp {
Equal,
NotEqual,
LessThan,
LessOrEqual,
GreaterThan,
GreaterOrEqual
}
public class DoubleColumnCondition implements Condition {
public DoubleColumnCondition(String columnName, ConditionOp op, double value);
}
public class StringColumnCondition implements Condition {
public StringColumnCondition(String columnName, ConditionOp op, String value);
}
public class ConditionalReplaceValueTransform implements Transform {
public ConditionalReplaceValueTransform(String columnName, Condition condition, Writable newValue);
}

Usage Examples:
// Replace negative values with zero
Condition negativeCondition = new DoubleColumnCondition("salary", ConditionOp.LessThan, 0.0);
Transform replaceNegative = new ConditionalReplaceValueTransform("salary", negativeCondition, new DoubleWritable(0.0));
// Replace specific string values
Condition invalidString = new StringColumnCondition("status", ConditionOp.Equal, "INVALID");
Transform replaceInvalid = new ConditionalReplaceValueTransform("status", invalidString, new Text("UNKNOWN"));
List<Writable> input = Arrays.asList(new DoubleWritable(-1000.0));
List<Writable> result = replaceNegative.map(input);
// Result: [DoubleWritable(0.0)] - negative value replaced with zero

// Apply transformations to record reader output
RecordReader reader = new CSVRecordReader();
reader.initialize(new FileSplit(new File("data.csv")));
TransformProcess tp = new TransformProcess.Builder(schema)
.normalize("feature1", Normalize.MinMax)
.categoricalToOneHot("category")
.build();
List<List<Writable>> transformedData = new ArrayList<>();
while (reader.hasNext()) {
List<Writable> record = reader.next();
List<Writable> transformed = tp.execute(record);
transformedData.add(transformed);
}

// Transform data before creating DataSet
TransformProcessRecordReader transformReader = new TransformProcessRecordReader(baseReader, transformProcess);
DataSetIterator iterator = new RecordReaderDataSetIterator(
transformReader,
batchSize,
labelIndex,
numClasses
);

// Process data in batches with transformations
List<List<Writable>> batch = new ArrayList<>();
// ... populate batch
// Apply transformation to entire batch
List<List<Writable>> transformedBatch = transformProcess.execute(batch);
// Process transformed batch
for (List<Writable> record : transformedBatch) {
// Handle transformed record
}

public interface Transform {
List<Writable> map(List<Writable> writables);
String[] outputColumnNames();
ColumnType[] outputColumnTypes();
String transform(String input);
}
public interface Condition {
boolean condition(List<Writable> list);
boolean condition(Object input);
String[] getColumnNames();
}

public class TransformProcess;
public class TransformProcess.Builder;
public class TransformProcessRecordReader implements RecordReader;

public enum ColumnType;
public abstract class ColumnOp;
public class ConvertToString extends ColumnOp;
public class ConvertToDouble extends ColumnOp;
public class ConvertToInteger extends ColumnOp;

public enum MathOp;
public class MathFunction implements Transform;
public class AddConstantColumnTransform implements Transform;
public class MultiplyConstantColumnTransform implements Transform;

public enum ReduceOp;
public enum StringReduceOp;
public class Reducer;
public class StringReducer;

public enum Normalize;
public class NormalizeTransform implements Transform;
public class StandardizeTransform implements Transform;

public class CategoricalToIntegerTransform implements Transform;
public class CategoricalToOneHotTransform implements Transform;
public class StringToCategoricalTransform implements Transform;
public class ReplaceStringTransform implements Transform;

public enum ConditionOp;
public class DoubleColumnCondition implements Condition;
public class StringColumnCondition implements Condition;
public class ConditionalReplaceValueTransform implements Transform;

Install with Tessl CLI
npx tessl i tessl/maven-org-datavec--datavec-api