DataVec integration library providing data loading, transformation, and Spark processing capabilities for DeepLearning4j
npx @tessl/cli install tessl/maven-org-datavec--datavec-local@0.9.0

DataVec Local Integration provides comprehensive data loading, transformation, and processing capabilities for DeepLearning4j. It bridges DataVec's data pipelines with DeepLearning4j's neural network training, enabling seamless conversion of various data sources into DataSet and MultiDataSet objects for machine learning workflows.
<dependency>
<groupId>org.datavec</groupId>
<artifactId>datavec-local</artifactId>
<version>0.9.1</version>
</dependency>

import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator;
import org.deeplearning4j.datasets.datavec.RecordReaderMultiDataSetIterator;
import org.deeplearning4j.datasets.datavec.SequenceRecordReaderDataSetIterator;

For Spark integration:
import org.deeplearning4j.spark.datavec.DataVecDataSetFunction;
import org.deeplearning4j.spark.datavec.DataVecSequenceDataSetFunction;

import java.io.File;

import org.datavec.api.records.reader.RecordReader;
import org.datavec.api.records.reader.impl.csv.CSVRecordReader;
import org.datavec.api.split.FileSplit;
import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
// Create a CSV record reader
RecordReader recordReader = new CSVRecordReader();
recordReader.initialize(new FileSplit(new File("data.csv")));
// Create dataset iterator
int batchSize = 32;
int labelIndex = 4; // Index of label column
int numPossibleLabels = 3; // Number of classes
boolean regression = false; // classification here; a separate constructor overload accepts a regression flag
DataSetIterator iterator = new RecordReaderDataSetIterator(
recordReader, batchSize, labelIndex, numPossibleLabels);
// Use with DeepLearning4j training
while (iterator.hasNext()) {
DataSet dataSet = iterator.next();
// Train your model with dataSet
}
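Instead of the manual loop, the iterator can also be handed directly to a network's fit() method. A minimal sketch, assuming a MultiLayerNetwork configuration built elsewhere (the networkConfiguration variable below is hypothetical):

import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;

// Sketch only: networkConfiguration stands in for a MultiLayerConfiguration defined elsewhere
MultiLayerNetwork model = new MultiLayerNetwork(networkConfiguration);
model.init();
model.fit(iterator); // consumes the DataSetIterator batch by batch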
DataVec integration is built around several key components:

Core functionality for converting RecordReader data into DataSet objects suitable for DeepLearning4j training. Supports various data sources including CSV, images, and custom formats.
public class RecordReaderDataSetIterator implements DataSetIterator {
public RecordReaderDataSetIterator(RecordReader recordReader, int batchSize,
int labelIndex, int numPossibleLabels);
public DataSet next();
public boolean hasNext();
public void reset();
}
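For regression targets, DeepLearning4j's RecordReaderDataSetIterator also offers a constructor that takes a label-column range and a regression flag (not shown in the skeleton above); a brief sketch with illustrative column indices:

// Columns 3..5 hold continuous targets; regression = true skips one-hot encoding of the labels
DataSetIterator regressionIter = new RecordReaderDataSetIterator(
        recordReader, batchSize, 3, 5, true);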
Time series and sequential data processing with configurable alignment modes. Handles variable-length sequences and provides multiple alignment strategies for batch processing.

public class SequenceRecordReaderDataSetIterator implements DataSetIterator {
public SequenceRecordReaderDataSetIterator(SequenceRecordReader featuresReader,
SequenceRecordReader labelsReader,
int miniBatchSize, int numPossibleLabels);
public enum AlignmentMode { EQUAL_LENGTH, ALIGN_START, ALIGN_END }
}
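A sketch of pairing separate feature and label sequence readers, assuming DataVec's CSVSequenceRecordReader and NumberedFileInputSplit and the constructor overload that accepts an AlignmentMode (file names and counts are illustrative):

import org.datavec.api.records.reader.SequenceRecordReader;
import org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader;
import org.datavec.api.split.NumberedFileInputSplit;
import org.deeplearning4j.datasets.datavec.SequenceRecordReaderDataSetIterator;

SequenceRecordReader features = new CSVSequenceRecordReader();
features.initialize(new NumberedFileInputSplit("features_%d.csv", 0, 99));
SequenceRecordReader labels = new CSVSequenceRecordReader();
labels.initialize(new NumberedFileInputSplit("labels_%d.csv", 0, 99));

// ALIGN_END pads shorter sequences at the start so every sequence ends on the same step
DataSetIterator seqIter = new SequenceRecordReaderDataSetIterator(
        features, labels, 32, 3, false,
        SequenceRecordReaderDataSetIterator.AlignmentMode.ALIGN_END);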
Advanced multi-modal data processing for complex neural network architectures with multiple inputs and outputs. Uses a builder pattern for flexible configuration.

public class RecordReaderMultiDataSetIterator implements MultiDataSetIterator {
public static class Builder {
public Builder addReader(String readerName, RecordReader recordReader);
public Builder addInput(String readerName, int columnFirst, int columnLast);
public Builder addOutput(String readerName, int column, int numClasses);
public RecordReaderMultiDataSetIterator build();
}
}
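A sketch of wiring two readers into one multi-input, multi-output iterator with the Builder shown above; it assumes the Builder's batch-size constructor and that csvReader and metaReader are RecordReader instances initialized elsewhere (names and column ranges are illustrative):

import org.nd4j.linalg.dataset.api.iterator.MultiDataSetIterator;

MultiDataSetIterator multiIter = new RecordReaderMultiDataSetIterator.Builder(32)
        .addReader("csv", csvReader)   // register each RecordReader under a name
        .addReader("meta", metaReader)
        .addInput("csv", 0, 9)         // columns 0-9 of "csv" form the first input array
        .addInput("meta", 0, 3)        // columns 0-3 of "meta" form the second input array
        .addOutput("csv", 10, 5)       // column 10 of "csv", one-hot encoded over 5 classes
        .build();

The resulting MultiDataSetIterator can be passed to a ComputationGraph, which supports multiple input and output arrays.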
Distributed processing functions for Apache Spark, enabling large-scale data preparation and training across clusters.

public class DataVecDataSetFunction implements Function<List<Writable>, DataSet> {
public DataVecDataSetFunction(int labelIndex, int numPossibleLabels, boolean regression);
public DataSet call(List<Writable> currList);
}
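Because DataVecDataSetFunction implements Spark's Function interface, it plugs directly into JavaRDD.map(). A sketch, assuming a records RDD of parsed DataVec writables produced earlier in the pipeline:

import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.datavec.api.writable.Writable;
import org.deeplearning4j.spark.datavec.DataVecDataSetFunction;
import org.nd4j.linalg.dataset.DataSet;

// records: JavaRDD<List<Writable>> assumed to exist (e.g. CSV lines parsed with a DataVec reader)
JavaRDD<DataSet> dataSets = records.map(new DataVecDataSetFunction(4, 3, false));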
public interface DataSetIterator extends Iterator<DataSet> {
DataSet next(int num);
int totalExamples();
int inputColumns();
int totalOutcomes();
boolean resetSupported();
void reset();
boolean asyncSupported();
int batch();
int cursor();
void setPreProcessor(DataSetPreProcessor preProcessor);
DataSetPreProcessor getPreProcessor();
List<String> getLabels();
DataSet loadFromMetaData(RecordMetaData recordMetaData);
DataSet loadFromMetaData(List<RecordMetaData> list);
}
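setPreProcessor() is the hook for per-batch transformations such as feature normalization. A sketch using ND4J's NormalizerStandardize (any DataSetPreProcessor can be attached the same way):

import org.nd4j.linalg.dataset.api.preprocessor.DataNormalization;
import org.nd4j.linalg.dataset.api.preprocessor.NormalizerStandardize;

DataNormalization normalizer = new NormalizerStandardize();
normalizer.fit(iterator);             // one pass over the data to collect mean and std-dev
iterator.reset();
iterator.setPreProcessor(normalizer); // every subsequent next() returns normalized features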
public interface MultiDataSetIterator extends Iterator<MultiDataSet> {
MultiDataSet next(int num);
boolean resetSupported();
boolean asyncSupported();
void reset();
void setPreProcessor(MultiDataSetPreProcessor preProcessor);
MultiDataSetPreProcessor getPreProcessor();
MultiDataSet loadFromMetaData(RecordMetaData recordMetaData);
MultiDataSet loadFromMetaData(List<RecordMetaData> list);
}

public enum AlignmentMode {
EQUAL_LENGTH, // Sequences must be same length
ALIGN_START, // Align sequences at start, pad end
ALIGN_END // Align sequences at end, pad start
}

public class ZeroLengthSequenceException extends RuntimeException {
public ZeroLengthSequenceException();
public ZeroLengthSequenceException(String type);
}
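ZeroLengthSequenceException signals that a feature or label sequence contained zero time steps; catching it around iteration makes the offending source easier to report. A minimal sketch (seqIter is a sequence iterator such as the one built earlier):

try {
    while (seqIter.hasNext()) {
        DataSet batch = seqIter.next();
        // train on batch ...
    }
} catch (ZeroLengthSequenceException e) {
    // an input file produced an empty sequence; log it and inspect the source split
    System.err.println("Empty sequence encountered: " + e.getMessage());
}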