CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/maven-org-datavec--datavec-api

ETL library for machine learning data preprocessing across diverse sources and formats, including HDFS, Spark, images, video, audio, CSV, and Excel.

Pending
Overview
Eval results
Files

docs/data-types.md

Data Types and Writables

DataVec uses a type-safe system of Writable objects to represent data values. Writables provide serialization, integration with Hadoop's I/O framework, and machine-learning-specific data types such as NDArrayWritable.

Capabilities

Core Writable Interface

The base interface that all DataVec data types implement. It provides serialization methods (write, readFields), numeric conversions (toDouble, toFloat, toInt, toLong), and string conversion for debugging and display.

public interface Writable {
    void write(DataOutput out) throws IOException;
    void readFields(DataInput in) throws IOException;
    String toString();
    double toDouble();
    float toFloat();
    int toInt();
    long toLong();
}

Usage Example:

Writable writable = new DoubleWritable(3.14);
double value = writable.toDouble();  // 3.14
String text = writable.toString();   // "3.14"

Primitive Type Wrappers

Type-safe wrappers for Java primitive types, commonly used for structured data representation in CSV files and database records.

public class IntWritable implements Writable {
    public IntWritable();
    public IntWritable(int value);
    public void set(int value);
    public int get();
}

public class LongWritable implements Writable {
    public LongWritable();
    public LongWritable(long value);
    public void set(long value);
    public long get();
}

public class FloatWritable implements Writable {
    public FloatWritable();
    public FloatWritable(float value);
    public void set(float value);
    public float get();
}

public class DoubleWritable implements Writable {
    public DoubleWritable();
    public DoubleWritable(double value);
    public void set(double value);
    public double get();
}

public class ByteWritable implements Writable {
    public ByteWritable();
    public ByteWritable(byte value);
    public void set(byte value);
    public byte get();
}

Usage Examples:

// Create and use integer values
IntWritable intVal = new IntWritable(42);
int primitive = intVal.get();           // 42
intVal.set(100);                        // Update value

// Create and use floating point values
DoubleWritable doubleVal = new DoubleWritable(3.14159);
double pi = doubleVal.get();            // 3.14159

// Type conversion
int piAsInt = doubleVal.toInt();        // 3 (truncated)
String piAsString = doubleVal.toString(); // "3.14159"

Text and String Data

Handles text data stored as UTF-8 bytes, with efficient string operations and conversion to and from Java Strings.

public class Text implements Writable {
    public Text();
    public Text(String string);
    public Text(byte[] utf8);
    public void set(String string);
    public void set(byte[] utf8);
    public String toString();
    public byte[] getBytes();
    public int getLength();
}

Usage Example:

Text textData = new Text("Hello, DataVec!");
String value = textData.toString();     // "Hello, DataVec!"
byte[] bytes = textData.getBytes();     // UTF-8 encoded bytes
int length = textData.getLength();      // Length in bytes

// Update text value
textData.set("New text content");

Binary Data

Handles raw binary data and byte arrays.

public class BytesWritable implements Writable {
    public BytesWritable();
    public BytesWritable(byte[] bytes);
    public void set(byte[] bytes);
    public byte[] getBytes();
    public int getLength();
    public void setCapacity(int capacity);
}

Usage Example:

// Create with byte array
byte[] data = {0x48, 0x65, 0x6C, 0x6C, 0x6F}; // "Hello" in bytes
BytesWritable bytesData = new BytesWritable(data);

byte[] retrieved = bytesData.getBytes();        // Original byte array
int length = bytesData.getLength();             // 5

// Update with new data
byte[] newData = "World".getBytes("UTF-8");
bytesData.set(newData);

Boolean Data

Represents boolean values in the Writable system.

public class BooleanWritable implements Writable {
    public BooleanWritable();
    public BooleanWritable(boolean value);
    public void set(boolean value);
    public boolean get();
}

Usage Example:

BooleanWritable boolVal = new BooleanWritable(true);
boolean flag = boolVal.get();           // true
boolVal.set(false);                     // Update to false

NDArray Integration

Wraps ND4J INDArray objects for machine learning tensor operations within the DataVec ecosystem.

public class NDArrayWritable implements Writable {
    public NDArrayWritable();
    public NDArrayWritable(INDArray array);
    public void set(INDArray array);
    public INDArray get();
}

Usage Example:

import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.api.ndarray.INDArray;

// Create a tensor
INDArray tensor = Nd4j.create(new double[]{1.0, 2.0, 3.0, 4.0});
NDArrayWritable ndArrayWritable = new NDArrayWritable(tensor);

// Retrieve tensor
INDArray retrieved = ndArrayWritable.get();
double[] values = retrieved.toDoubleVector(); // [1.0, 2.0, 3.0, 4.0]

Null Value Handling

Represents null or missing values in datasets. The constructor is private; use the shared NullWritable.INSTANCE singleton.

public class NullWritable implements Writable {
    public static final NullWritable INSTANCE = new NullWritable();
    private NullWritable();
}

Usage Example:

Writable nullValue = NullWritable.INSTANCE;
boolean isNull = (nullValue instanceof NullWritable); // true

Collection Types

Handles collections of Writable objects for complex data structures.

public class ArrayWritable implements Writable {
    public ArrayWritable(Class<? extends Writable> valueClass);
    public ArrayWritable(Class<? extends Writable> valueClass, Writable[] values);
    public ArrayWritable(String[] strings);
    public void set(Writable[] values);
    public Writable[] get();
    public String[] toStrings();
}

Usage Example:

// Create array of doubles
Writable[] doubles = {
    new DoubleWritable(1.1),
    new DoubleWritable(2.2),
    new DoubleWritable(3.3)
};

ArrayWritable arrayWritable = new ArrayWritable(DoubleWritable.class, doubles);
Writable[] retrieved = arrayWritable.get();

// Convert to string array
String[] strings = arrayWritable.toStrings(); // ["1.1", "2.2", "3.3"]

Data Conversion Patterns

Type Conversion

Every Writable exposes the conversion methods declared on the Writable interface (toDouble, toFloat, toInt, toLong, toString):

Writable writable = new DoubleWritable(42.7);

double asDouble = writable.toDouble();  // 42.7
float asFloat = writable.toFloat();     // 42.7f
int asInt = writable.toInt();           // 42 (truncated)
long asLong = writable.toLong();        // 42L (truncated)
String asString = writable.toString();  // "42.7"

WritableConverter Interface

Enables custom conversion logic for transforming data during record reading:

public interface WritableConverter {
    Writable convert(Writable writable) throws WritableConverterException;
}

public class SelfWritableConverter implements WritableConverter {
    public Writable convert(Writable writable) throws WritableConverterException;
}

Usage Example:

// Custom converter that squares numeric values
WritableConverter squareConverter = new WritableConverter() {
    @Override
    public Writable convert(Writable writable) throws WritableConverterException {
        if (writable instanceof DoubleWritable) {
            double value = ((DoubleWritable) writable).get();
            return new DoubleWritable(value * value);
        }
        return writable; // Pass through non-numeric values
    }
};

// Use with RecordReaderDataSetIterator
RecordReaderDataSetIterator iterator = new RecordReaderDataSetIterator(
    recordReader, squareConverter, batchSize, labelIndex, numClasses
);

Serialization and I/O

All Writable objects support Hadoop-compatible serialization through write(DataOutput) and readFields(DataInput):

// Serialize to output stream
Writable writable = new IntWritable(123);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
DataOutputStream dos = new DataOutputStream(baos);
writable.write(dos);
byte[] serialized = baos.toByteArray();

// Deserialize from input stream
ByteArrayInputStream bais = new ByteArrayInputStream(serialized);
DataInputStream dis = new DataInputStream(bais);
IntWritable deserialized = new IntWritable();
deserialized.readFields(dis);
int value = deserialized.get(); // 123

Common Usage Patterns

Record Processing

List<Writable> record = recordReader.next();

// Access by index with type conversion
int id = record.get(0).toInt();
String name = record.get(1).toString();
double score = record.get(2).toDouble();
boolean active = record.get(3).toInt() == 1; // Convert int to boolean

Data Validation

for (Writable writable : record) {
    if (writable instanceof NullWritable) {
        // Handle missing value
        continue;
    }
    
    if (writable instanceof DoubleWritable) {
        double value = writable.toDouble();
        if (Double.isNaN(value) || Double.isInfinite(value)) {
            // Handle invalid numeric values
        }
    }
}

Types

Core Interfaces and Classes

public interface Writable {
    void write(DataOutput out) throws IOException;
    void readFields(DataInput in) throws IOException;
    String toString();
    double toDouble();
    float toFloat();
    int toInt();
    long toLong();
}

// Primitive Writable Types
public class IntWritable implements Writable;
public class LongWritable implements Writable;
public class FloatWritable implements Writable;
public class DoubleWritable implements Writable;
public class ByteWritable implements Writable;
public class BooleanWritable implements Writable;

// Complex Data Types
public class Text implements Writable;
public class BytesWritable implements Writable;
public class NDArrayWritable implements Writable;
public class ArrayWritable implements Writable;
public class NullWritable implements Writable;

public interface WritableConverter {
    Writable convert(Writable writable) throws WritableConverterException;
}

public class WritableConverterException extends Exception {
    public WritableConverterException(String message);
    public WritableConverterException(String message, Throwable cause);
}

Install with Tessl CLI

npx tessl i tessl/maven-org-datavec--datavec-api

docs

data-types.md

image-processing.md

index.md

input-sources.md

record-readers.md

transforms.md

tile.json