CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/maven-org-datavec--datavec-api

ETL library for machine learning data preprocessing across diverse formats including HDFS, Spark, Images, Video, Audio, CSV, and Excel

Pending
Overview
Eval results
Files

docs/transforms.md

Data Transforms and Processing

DataVec provides a comprehensive transformation system for preprocessing and cleaning data before feeding it to machine learning models. The transformation API enables complex data pipelines with column-level operations, mathematical transformations, and data quality improvements.

Capabilities

Transform Process

The core transformation workflow manager that orchestrates multiple transformation steps in a pipeline.

public class TransformProcess {
    public static Builder builder(Schema initialSchema);
    public Schema getInitialSchema();
    public Schema getFinalSchema();
    public List<DataAction> getActionList();
    public List<Writable> execute(List<Writable> input);
    public List<List<Writable>> execute(List<List<Writable>> input);
}

public static class TransformProcess.Builder {
    public Builder removeColumns(String... columnNames);
    public Builder removeColumns(int... columnIndices);
    public Builder renameColumn(String oldName, String newName);
    public Builder filter(Condition condition);
    public Builder transform(Transform transform);
    public Builder convertToString(String columnName);
    public Builder convertToDouble(String columnName);
    public Builder convertToInteger(String columnName);
    public Builder normalize(String columnName, Normalize normalization);
    public Builder standardize(String columnName);
    public Builder categoricalToOneHot(String columnName);
    public Builder categoricalToInteger(String columnName);
    public Builder stringToTimeTransform(String columnName, String dateTimeFormat, DateTimeZone dateTimeZone);
    public Builder conditionalReplaceValueTransform(String columnName, Condition condition, Writable newValue);
    public Builder appendStringColumnTransform(String columnName, String stringToAppend);
    public Builder replaceStringTransform(String columnName, Map<String, String> mapping);
    public TransformProcess build();
}

Usage Example:

import org.datavec.api.transform.TransformProcess;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.transform.condition.ConditionOp;
import org.datavec.api.transform.condition.column.DoubleColumnCondition;

// Define input schema
Schema inputSchema = new Schema.Builder()
    .addColumnString("name")
    .addColumnInteger("age")
    .addColumnDouble("income")
    .addColumnCategorical("category", Arrays.asList("A", "B", "C"))
    .build();

// Build transformation process
TransformProcess tp = new TransformProcess.Builder(inputSchema)
    .removeColumns("name")                                    // Remove name column
    .filter(new DoubleColumnCondition("income", ConditionOp.GreaterThan, 0.0)) // Filter positive income
    .normalize("income", Normalize.MinMax)                    // Min-max normalize income
    .categoricalToOneHot("category")                          // One-hot encode category
    .build();

// Apply transformations
List<Writable> input = Arrays.asList(
    new Text("John"),
    new IntWritable(25), 
    new DoubleWritable(50000.0),
    new Text("A")
);

List<Writable> transformed = tp.execute(input);
// Result: [IntWritable(25), DoubleWritable(normalized_income), IntWritable(1), IntWritable(0), IntWritable(0)]

Core Transform Interface

Base interface for all data transformations.

public interface Transform {
    List<Writable> map(List<Writable> writables);
    String[] outputColumnNames();
    ColumnType[] outputColumnTypes();
    String transform(String input);
}

Column Operations

Operations that work on individual columns of data.

public enum ColumnType {
    String,
    Integer, 
    Long,
    Double,
    Categorical,
    Time,
    Bytes,
    Boolean,
    NDArray
}

public abstract class ColumnOp {
    public abstract ColumnType getColumnType();
    public abstract String[] columnNames();
}

public class ConvertToString extends ColumnOp {
    public ConvertToString(String columnName);
}

public class ConvertToDouble extends ColumnOp {
    public ConvertToDouble(String columnName);
}

public class ConvertToInteger extends ColumnOp {
    public ConvertToInteger(String columnName);
}

Usage Example:

// Convert string column to double
Transform convertTransform = new ConvertToDouble("price_string");

List<Writable> input = Arrays.asList(
    new Text("Product A"),
    new Text("29.99")  // String representation of price
);

List<Writable> output = convertTransform.map(input);
// Result: [Text("Product A"), DoubleWritable(29.99)]

Mathematical Operations

Mathematical transformations and calculations on numeric columns.

public enum MathOp {
    Add,
    Subtract,
    Multiply,
    Divide,
    Modulus,
    ReverseSubtract,
    ReverseDivide,
    ScalarMin,
    ScalarMax,
    Abs,
    Ceil,
    Floor,
    Round,
    Sqrt,
    Square,
    Log,
    Log10,
    Exp,
    Pow,
    Sin,
    Cos,
    Tan,
    ASin,
    ACos,
    ATan,
    Sinh,
    Cosh,
    Tanh
}

public class MathFunction implements Transform {
    public MathFunction(String columnName, MathOp operation);
    public MathFunction(String columnName, MathOp operation, double scalar);
}

public class AddConstantColumnTransform implements Transform {
    public AddConstantColumnTransform(String columnName, double value);
}

public class MultiplyConstantColumnTransform implements Transform {
    public MultiplyConstantColumnTransform(String columnName, double value);
}

Usage Examples:

// Square all values in a column
Transform squareTransform = new MathFunction("values", MathOp.Square);

// Add constant to column
Transform addConstant = new AddConstantColumnTransform("salary", 5000.0);

// Multiply by constant
Transform multiplyConstant = new MultiplyConstantColumnTransform("price", 1.1); // 10% increase

List<Writable> input = Arrays.asList(new DoubleWritable(100.0));
List<Writable> squared = squareTransform.map(input);
// Result: [DoubleWritable(10000.0)]

Reduction Operations

Operations that reduce multiple rows to summary statistics.

public enum ReduceOp {
    Min,
    Max,
    Range,
    Sum,
    Mean,
    Prod,
    Stdev,
    UncorrectedStdDev,
    Variance,
    PopulationVariance,
    Count,
    CountUnique
}

public class Reducer {
    public static Builder builder(ReduceOp op, String column);
    public IAggregableReduceOp<List<Writable>, List<Writable>> getReduction();
}

public enum StringReduceOp {
    Merge,
    Append,
    Prepend,
    Replace
}

public class StringReducer {
    public static Builder builder(StringReduceOp op, String column);
    public static StringReducer merge(String column, String delimiter);
    public static StringReducer append(String column, String stringToAppend);
}

Usage Examples:

// Calculate mean of a numeric column
IAggregableReduceOp<List<Writable>, List<Writable>> meanReduction = 
    Reducer.builder(ReduceOp.Mean, "values").build().getReduction();

// Merge string values with delimiter
StringReducer merger = StringReducer.merge("names", ",");

List<List<Writable>> data = Arrays.asList(
    Arrays.asList(new DoubleWritable(10.0), new Text("Alice")),
    Arrays.asList(new DoubleWritable(20.0), new Text("Bob")),
    Arrays.asList(new DoubleWritable(30.0), new Text("Charlie"))
);

// Apply reduction operations
List<Writable> meanResult = meanReduction.aggregate(data);
// Result: [DoubleWritable(20.0)] - mean of 10, 20, 30

Normalization and Standardization

Statistical normalization techniques for numeric data.

public enum Normalize {
    MinMax,
    Standardize,
    Normalize,
    Log2,
    Log10
}

public class NormalizeTransform implements Transform {
    public NormalizeTransform(String columnName, Normalize normalization);
    public NormalizeTransform(String columnName, Normalize normalization, 
                             double minValue, double maxValue);
}

public class StandardizeTransform implements Transform {
    public StandardizeTransform(String columnName);
    public StandardizeTransform(String columnName, double mean, double stdev);
}

Usage Examples:

// Min-max normalization to [0, 1]
Transform minMaxNorm = new NormalizeTransform("values", Normalize.MinMax);

// Z-score standardization
Transform standardize = new StandardizeTransform("values");

// Custom min-max range [0, 100]
Transform customRange = new NormalizeTransform("values", Normalize.MinMax, 0.0, 100.0);

List<Writable> input = Arrays.asList(new DoubleWritable(75.0));
List<Writable> normalized = minMaxNorm.map(input);
// Result depends on previously calculated min/max values from data

Categorical Data Handling

Transformations for categorical and string data.

public class CategoricalToIntegerTransform implements Transform {
    public CategoricalToIntegerTransform(String columnName, List<String> categoryList);
}

public class CategoricalToOneHotTransform implements Transform {
    public CategoricalToOneHotTransform(String columnName, List<String> categoryList);
}

public class StringToCategoricalTransform implements Transform {
    public StringToCategoricalTransform(String columnName, List<String> categoryList);
}

public class ReplaceStringTransform implements Transform {
    public ReplaceStringTransform(String columnName, Map<String, String> mapping);
}

Usage Examples:

// Convert categories to integers
List<String> categories = Arrays.asList("small", "medium", "large");
Transform catToInt = new CategoricalToIntegerTransform("size", categories);

// Convert categories to one-hot encoding
Transform catToOneHot = new CategoricalToOneHotTransform("size", categories);

// String replacement mapping
Map<String, String> replacements = new HashMap<>();
replacements.put("yes", "1");
replacements.put("no", "0");
Transform stringReplace = new ReplaceStringTransform("response", replacements);

List<Writable> input = Arrays.asList(new Text("medium"));
List<Writable> intResult = catToInt.map(input);
// Result: [IntWritable(1)] - "medium" is index 1

List<Writable> oneHotResult = catToOneHot.map(input);
// Result: [IntWritable(0), IntWritable(1), IntWritable(0)] - one-hot for "medium"

Conditional Transformations

Conditional logic for data transformations.

public interface Condition {
    boolean condition(List<Writable> list);
    boolean condition(Object input);
    String[] getColumnNames();
}

public enum ConditionOp {
    Equal,
    NotEqual,
    LessThan,
    LessOrEqual,
    GreaterThan,
    GreaterOrEqual
}

public class DoubleColumnCondition implements Condition {
    public DoubleColumnCondition(String columnName, ConditionOp op, double value);
}

public class StringColumnCondition implements Condition {
    public StringColumnCondition(String columnName, ConditionOp op, String value);
}

public class ConditionalReplaceValueTransform implements Transform {
    public ConditionalReplaceValueTransform(String columnName, Condition condition, Writable newValue);
}

Usage Examples:

// Replace negative values with zero
Condition negativeCondition = new DoubleColumnCondition("salary", ConditionOp.LessThan, 0.0);
Transform replaceNegative = new ConditionalReplaceValueTransform("salary", negativeCondition, new DoubleWritable(0.0));

// Replace specific string values
Condition invalidString = new StringColumnCondition("status", ConditionOp.Equal, "INVALID");
Transform replaceInvalid = new ConditionalReplaceValueTransform("status", invalidString, new Text("UNKNOWN"));

List<Writable> input = Arrays.asList(new DoubleWritable(-1000.0));
List<Writable> result = replaceNegative.map(input);
// Result: [DoubleWritable(0.0)] - negative value replaced with zero

Integration Patterns

With RecordReader

// Apply transformations to record reader output
RecordReader reader = new CSVRecordReader();
reader.initialize(new FileSplit(new File("data.csv")));

TransformProcess tp = new TransformProcess.Builder(schema)
    .normalize("feature1", Normalize.MinMax)
    .categoricalToOneHot("category")
    .build();

List<List<Writable>> transformedData = new ArrayList<>();
while (reader.hasNext()) {
    List<Writable> record = reader.next();
    List<Writable> transformed = tp.execute(record);
    transformedData.add(transformed);
}

With DataSetIterator

// Transform data before creating DataSet
TransformProcessRecordReader transformReader = new TransformProcessRecordReader(baseReader, transformProcess);

DataSetIterator iterator = new RecordReaderDataSetIterator(
    transformReader,
    batchSize,
    labelIndex,
    numClasses
);

Batch Processing

// Process data in batches with transformations
List<List<Writable>> batch = new ArrayList<>();
// ... populate batch

// Apply transformation to entire batch
List<List<Writable>> transformedBatch = transformProcess.execute(batch);

// Process transformed batch
for (List<Writable> record : transformedBatch) {
    // Handle transformed record
}

Types

Core Interfaces

public interface Transform {
    List<Writable> map(List<Writable> writables);
    String[] outputColumnNames();
    ColumnType[] outputColumnTypes();
    String transform(String input);
}

public interface Condition {
    boolean condition(List<Writable> list);
    boolean condition(Object input);
    String[] getColumnNames();
}

Transform Process Classes

public class TransformProcess;
public class TransformProcess.Builder;
public class TransformProcessRecordReader implements RecordReader;

Column Operations

public enum ColumnType;
public abstract class ColumnOp;
public class ConvertToString extends ColumnOp;
public class ConvertToDouble extends ColumnOp;
public class ConvertToInteger extends ColumnOp;

Mathematical Operations

public enum MathOp;
public class MathFunction implements Transform;
public class AddConstantColumnTransform implements Transform;
public class MultiplyConstantColumnTransform implements Transform;

Reduction Operations

public enum ReduceOp;
public enum StringReduceOp;
public class Reducer;
public class StringReducer;

Normalization

public enum Normalize;
public class NormalizeTransform implements Transform;
public class StandardizeTransform implements Transform;

Categorical Transforms

public class CategoricalToIntegerTransform implements Transform;
public class CategoricalToOneHotTransform implements Transform;
public class StringToCategoricalTransform implements Transform;
public class ReplaceStringTransform implements Transform;

Conditional Operations

public enum ConditionOp;
public class DoubleColumnCondition implements Condition;
public class StringColumnCondition implements Condition;
public class ConditionalReplaceValueTransform implements Transform;

Install with Tessl CLI

npx tessl i tessl/maven-org-datavec--datavec-api

docs

data-types.md

image-processing.md

index.md

input-sources.md

record-readers.md

transforms.md

tile.json