ETL library for machine learning data preprocessing across diverse formats including HDFS, Spark, Images, Video, Audio, CSV, and Excel
—
DataVec uses a type-safe system of Writable objects to represent data values. These provide serialization capabilities, type safety, and seamless integration with Hadoop's I/O framework while supporting machine learning specific data types.
The base interface that all DataVec data types implement. It provides binary serialization methods (write/readFields), numeric conversions (toDouble, toFloat, toInt, toLong), and string conversion for debugging and display.
public interface Writable {
void write(DataOutput out) throws IOException;
void readFields(DataInput in) throws IOException;
String toString();
double toDouble();
float toFloat();
int toInt();
long toLong();
}
Usage Example:
Writable writable = new DoubleWritable(3.14);
double value = writable.toDouble(); // 3.14
String text = writable.toString(); // "3.14"
Type-safe wrappers for Java primitive types, commonly used for structured data representation in CSV files and database records.
public class IntWritable implements Writable {
public IntWritable();
public IntWritable(int value);
public void set(int value);
public int get();
}
public class LongWritable implements Writable {
public LongWritable();
public LongWritable(long value);
public void set(long value);
public long get();
}
public class FloatWritable implements Writable {
public FloatWritable();
public FloatWritable(float value);
public void set(float value);
public float get();
}
public class DoubleWritable implements Writable {
public DoubleWritable();
public DoubleWritable(double value);
public void set(double value);
public double get();
}
public class ByteWritable implements Writable {
public ByteWritable();
public ByteWritable(byte value);
public void set(byte value);
public byte get();
}
Usage Examples:
// Create and use integer values
IntWritable intVal = new IntWritable(42);
int primitive = intVal.get(); // 42
intVal.set(100); // Update value
// Create and use floating point values
DoubleWritable doubleVal = new DoubleWritable(3.14159);
double pi = doubleVal.get(); // 3.14159
// Type conversion
int piAsInt = doubleVal.toInt(); // 3 (truncated)
String piAsString = doubleVal.toString(); // "3.14159"
Handles text data with efficient string operations and encoding support.
public class Text implements Writable {
public Text();
public Text(String string);
public Text(byte[] utf8);
public void set(String string);
public void set(byte[] utf8);
public String toString();
public byte[] getBytes();
public int getLength();
}
Usage Example:
Text textData = new Text("Hello, DataVec!");
String value = textData.toString(); // "Hello, DataVec!"
byte[] bytes = textData.getBytes(); // UTF-8 encoded bytes
int length = textData.getLength(); // Length in bytes
// Update text value
textData.set("New text content");
Handles raw binary data and byte arrays.
public class BytesWritable implements Writable {
public BytesWritable();
public BytesWritable(byte[] bytes);
public void set(byte[] bytes);
public byte[] getBytes();
public int getLength();
public void setCapacity(int capacity);
}
Usage Example:
// Create with byte array
byte[] data = {0x48, 0x65, 0x6C, 0x6C, 0x6F}; // "Hello" in bytes
BytesWritable bytesData = new BytesWritable(data);
byte[] retrieved = bytesData.getBytes(); // Original byte array
int length = bytesData.getLength(); // 5
// Update with new data
byte[] newData = "World".getBytes("UTF-8");
bytesData.set(newData);
Represents boolean values in the Writable system.
public class BooleanWritable implements Writable {
public BooleanWritable();
public BooleanWritable(boolean value);
public void set(boolean value);
public boolean get();
}
Usage Example:
BooleanWritable boolVal = new BooleanWritable(true);
boolean flag = boolVal.get(); // true
boolVal.set(false); // Update to false
Wraps ND4J INDArray objects for machine learning tensor operations within the DataVec ecosystem.
public class NDArrayWritable implements Writable {
public NDArrayWritable();
public NDArrayWritable(INDArray array);
public void set(INDArray array);
public INDArray get();
}
Usage Example:
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.api.ndarray.INDArray;
// Create a tensor
INDArray tensor = Nd4j.create(new double[]{1.0, 2.0, 3.0, 4.0});
NDArrayWritable ndArrayWritable = new NDArrayWritable(tensor);
// Retrieve tensor
INDArray retrieved = ndArrayWritable.get();
double[] values = retrieved.toDoubleVector(); // [1.0, 2.0, 3.0, 4.0]
Represents null or missing values in datasets.
public class NullWritable implements Writable {
public static final NullWritable INSTANCE = new NullWritable();
private NullWritable();
}
Usage Example:
Writable nullValue = NullWritable.INSTANCE;
boolean isNull = (nullValue instanceof NullWritable); // true
Handles collections of Writable objects for complex data structures.
public class ArrayWritable implements Writable {
public ArrayWritable(Class<? extends Writable> valueClass);
public ArrayWritable(Class<? extends Writable> valueClass, Writable[] values);
public ArrayWritable(String[] strings);
public void set(Writable[] values);
public Writable[] get();
public String[] toStrings();
}
Usage Example:
// Create array of doubles
Writable[] doubles = {
new DoubleWritable(1.1),
new DoubleWritable(2.2),
new DoubleWritable(3.3)
};
ArrayWritable arrayWritable = new ArrayWritable(DoubleWritable.class, doubles);
Writable[] retrieved = arrayWritable.get();
// Convert to string array
String[] strings = arrayWritable.toStrings(); // ["1.1", "2.2", "3.3"]
All Writable objects support conversion to common Java types:
Writable writable = new DoubleWritable(42.7);
double asDouble = writable.toDouble(); // 42.7
float asFloat = writable.toFloat(); // 42.7f
int asInt = writable.toInt(); // 42 (truncated)
long asLong = writable.toLong(); // 42L (truncated)
String asString = writable.toString(); // "42.7"
Enables custom conversion logic for transforming data during record reading:
public interface WritableConverter {
Writable convert(Writable writable) throws WritableConverterException;
}
public class SelfWritableConverter implements WritableConverter {
public Writable convert(Writable writable) throws WritableConverterException;
}
Usage Example:
// Custom converter that squares numeric values
WritableConverter squareConverter = new WritableConverter() {
@Override
public Writable convert(Writable writable) throws WritableConverterException {
if (writable instanceof DoubleWritable) {
double value = ((DoubleWritable) writable).get();
return new DoubleWritable(value * value);
}
return writable; // Pass through non-numeric values
}
};
// Use with RecordReaderDataSetIterator
RecordReaderDataSetIterator iterator = new RecordReaderDataSetIterator(
recordReader, squareConverter, batchSize, labelIndex, numClasses
);
All Writable objects support Hadoop-compatible serialization:
// Serialize to output stream
Writable writable = new IntWritable(123);
ByteArrayOutputStream baos = new ByteArrayOutputStream();
DataOutputStream dos = new DataOutputStream(baos);
writable.write(dos);
byte[] serialized = baos.toByteArray();
// Deserialize from input stream
ByteArrayInputStream bais = new ByteArrayInputStream(serialized);
DataInputStream dis = new DataInputStream(bais);
IntWritable deserialized = new IntWritable();
deserialized.readFields(dis);
int value = deserialized.get(); // 123
List<Writable> record = recordReader.next();
// Access by index with type conversion
int id = record.get(0).toInt();
String name = record.get(1).toString();
double score = record.get(2).toDouble();
boolean active = record.get(3).toInt() == 1; // Convert int to boolean
for (Writable writable : record) {
if (writable instanceof NullWritable) {
// Handle missing value
continue;
}
if (writable instanceof DoubleWritable) {
double value = writable.toDouble();
if (Double.isNaN(value) || Double.isInfinite(value)) {
// Handle invalid numeric values
}
}
}
public interface Writable {
void write(DataOutput out) throws IOException;
void readFields(DataInput in) throws IOException;
String toString();
double toDouble();
float toFloat();
int toInt();
long toLong();
}
// Primitive Writable Types
public class IntWritable implements Writable;
public class LongWritable implements Writable;
public class FloatWritable implements Writable;
public class DoubleWritable implements Writable;
public class ByteWritable implements Writable;
public class BooleanWritable implements Writable;
// Complex Data Types
public class Text implements Writable;
public class BytesWritable implements Writable;
public class NDArrayWritable implements Writable;
public class ArrayWritable implements Writable;
public class NullWritable implements Writable;
public interface WritableConverter {
Writable convert(Writable writable) throws WritableConverterException;
}
public class WritableConverterException extends Exception {
public WritableConverterException(String message);
public WritableConverterException(String message, Throwable cause);
}
Install with Tessl CLI
npx tessl i tessl/maven-org-datavec--datavec-api