DataVec integration library providing data loading, transformation, and Spark processing capabilities for DeepLearning4j
---
RecordReaderMultiDataSetIterator provides advanced multi-modal data processing for complex neural network architectures with multiple inputs and outputs. It uses a flexible builder pattern to configure multiple data sources and map them to different network inputs and outputs.
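
At a glance, a minimal sketch of the builder pattern (the file name `data.csv` and its column layout are hypothetical; the calls are those documented below):

```java
import java.io.File;
import org.datavec.api.records.reader.RecordReader;
import org.datavec.api.records.reader.impl.csv.CSVRecordReader;
import org.datavec.api.split.FileSplit;
import org.deeplearning4j.datasets.datavec.RecordReaderMultiDataSetIterator;
import org.nd4j.linalg.dataset.api.MultiDataSet;

// Hypothetical CSV: columns 0-3 are numeric features, column 4 is a class label in {0, 1, 2}
RecordReader reader = new CSVRecordReader();
reader.initialize(new FileSplit(new File("data.csv")));

RecordReaderMultiDataSetIterator iterator =
        new RecordReaderMultiDataSetIterator.Builder(32)
                .addReader("data", reader)
                .addInput("data", 0, 3)        // columns 0-3 as one input array
                .addOutputOneHot("data", 4, 3) // column 4 one-hot encoded over 3 classes
                .build();

MultiDataSet first = iterator.next();
```
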
Main class for creating MultiDataSet objects from multiple RecordReaders and SequenceRecordReaders:

```java
public class RecordReaderMultiDataSetIterator implements MultiDataSetIterator {
    // Iterator methods
    public boolean hasNext();
    public MultiDataSet next();
    public MultiDataSet next(int num);
    public void remove();

    // Configuration methods
    public void setPreProcessor(MultiDataSetPreProcessor preProcessor);
    public MultiDataSetPreProcessor getPreProcessor();
    public void setCollectMetaData(boolean collectMetaData);
    public boolean getCollectMetaData();

    // Reset and async support
    public boolean resetSupported();
    public boolean asyncSupported();
    public void reset();

    // Metadata support
    public MultiDataSet loadFromMetaData(RecordMetaData recordMetaData) throws IOException;
    public MultiDataSet loadFromMetaData(List<RecordMetaData> recordMetaDatas) throws IOException;
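
    // Note: RecordMetaData values are typically obtained by first calling
    // setCollectMetaData(true) and then getExampleMetaData() on returned MultiDataSets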
}
```

Fluent API for configuring multi-input/output data processing:

```java
public static class Builder {
    public Builder(int batchSize);

    // Add data readers
    public Builder addReader(String readerName, RecordReader recordReader);
    public Builder addSequenceReader(String seqReaderName, SequenceRecordReader seqRecordReader);

    // Configure sequence alignment
    public Builder sequenceAlignmentMode(AlignmentMode alignmentMode);

    // Add inputs (features)
    public Builder addInput(String readerName);
    public Builder addInput(String readerName, int columnFirst, int columnLast);
    public Builder addInputOneHot(String readerName, int column, int numClasses);

    // Add outputs (labels)
    public Builder addOutput(String readerName);
    public Builder addOutput(String readerName, int columnFirst, int columnLast);
    public Builder addOutputOneHot(String readerName, int column, int numClasses);

    // Time series configuration
    public Builder timeSeriesRandomOffset(boolean timeSeriesRandomOffset, long rngSeed);

    // Build the iterator
    public RecordReaderMultiDataSetIterator build();
}
```

```java
public enum AlignmentMode {
    EQUAL_LENGTH, // All sequences must be the same length
    ALIGN_START,  // Align sequences at start, pad end
    ALIGN_END     // Align sequences at end, pad start
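
    // Note: with ALIGN_START/ALIGN_END, padded time steps are flagged in the
    // mask arrays of the resulting MultiDataSet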
}
```

```java
import java.io.File;
import org.datavec.api.records.reader.RecordReader;
import org.datavec.api.records.reader.impl.csv.CSVRecordReader;
import org.datavec.api.split.FileSplit;
import org.deeplearning4j.datasets.datavec.RecordReaderMultiDataSetIterator;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.dataset.api.MultiDataSet;

// Setup multiple record readers
RecordReader featuresReader = new CSVRecordReader();
featuresReader.initialize(new FileSplit(new File("features.csv")));

RecordReader labelsReader = new CSVRecordReader();
labelsReader.initialize(new FileSplit(new File("labels.csv")));

// Build multi-input iterator
RecordReaderMultiDataSetIterator iterator =
        new RecordReaderMultiDataSetIterator.Builder(32) // batch size
                .addReader("features", featuresReader)
                .addReader("labels", labelsReader)
                .addInput("features") // All columns from features as input
                .addOutput("labels")  // All columns from labels as output
                .build();

// Use iterator
while (iterator.hasNext()) {
    MultiDataSet multiDataSet = iterator.next();
    INDArray[] inputs = multiDataSet.getFeatures(); // Array of input arrays
    INDArray[] outputs = multiDataSet.getLabels();  // Array of output arrays
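
    // Each MultiDataSet reports how many input/output arrays it holds;
    // here both counts are 1, matching the single addInput/addOutput above
    int numInputs = multiDataSet.numFeatureArrays();
    int numOutputs = multiDataSet.numLabelsArrays();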
}
```

```java
import org.datavec.image.recordreader.ImageRecordReader;

// Setup readers for different data modalities
RecordReader imageReader = new ImageRecordReader();
RecordReader textReader = new CSVRecordReader();
RecordReader metadataReader = new CSVRecordReader();

// Initialize readers
imageReader.initialize(new FileSplit(new File("images/")));
textReader.initialize(new FileSplit(new File("text_features.csv")));
metadataReader.initialize(new FileSplit(new File("metadata.csv")));

// Configure multi-modal architecture
RecordReaderMultiDataSetIterator iterator =
        new RecordReaderMultiDataSetIterator.Builder(16)
                // Add all readers
                .addReader("images", imageReader)
                .addReader("text", textReader)
                .addReader("metadata", metadataReader)
                // Configure multiple inputs
                .addInput("images")         // Input 0: Image data
                .addInput("text", 0, 99)    // Input 1: Text features (columns 0-99)
                .addInput("metadata", 0, 9) // Input 2: Metadata (columns 0-9)
                // Configure multiple outputs
                .addOutputOneHot("metadata", 10, 5) // Output 0: Classification (column 10, 5 classes)
                .addOutput("metadata", 11, 13)      // Output 1: Regression (columns 11-13)
                .build();

// Process multi-modal data
while (iterator.hasNext()) {
    MultiDataSet batch = iterator.next();

    // Access multiple inputs
    INDArray imageInput = batch.getFeatures(0);    // Images
    INDArray textInput = batch.getFeatures(1);     // Text features
    INDArray metadataInput = batch.getFeatures(2); // Metadata

    // Access multiple outputs
    INDArray classificationOutput = batch.getLabels(0); // One-hot classification
    INDArray regressionOutput = batch.getLabels(1);     // Regression targets
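
    // Shape note (assumption): the image input is rank 4 [batch, channels, height, width],
    // while the CSV-derived inputs and outputs are rank 2 [batch, numColumns]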
}
```

```java
import org.datavec.api.records.reader.SequenceRecordReader;
import org.datavec.api.records.reader.impl.csv.CSVSequenceRecordReader;
import org.datavec.api.split.NumberedFileInputSplit;

// Setup sequence readers
SequenceRecordReader audioReader = new CSVSequenceRecordReader();
SequenceRecordReader videoReader = new CSVSequenceRecordReader();
SequenceRecordReader labelReader = new CSVSequenceRecordReader();

// Initialize sequence readers
audioReader.initialize(new NumberedFileInputSplit("audio_%d.csv", 0, 999));
videoReader.initialize(new NumberedFileInputSplit("video_%d.csv", 0, 999));
labelReader.initialize(new NumberedFileInputSplit("labels_%d.csv", 0, 999));

// Configure multi-modal sequence processing
RecordReaderMultiDataSetIterator iterator =
        new RecordReaderMultiDataSetIterator.Builder(8)
                // Add sequence readers
                .addSequenceReader("audio", audioReader)
                .addSequenceReader("video", videoReader)
                .addSequenceReader("labels", labelReader)
                // Configure sequence alignment
                .sequenceAlignmentMode(AlignmentMode.ALIGN_START)
                // Add sequence inputs
                .addInput("audio") // Audio features
                .addInput("video") // Video features
                // Add sequence output
                .addOutputOneHot("labels", 0, 10) // 10-class classification per time step
                .build();
```

```java
// Fine-grained control over which columns to use
RecordReader dataReader = new CSVRecordReader();
dataReader.initialize(new FileSplit(new File("complex_data.csv")));

RecordReaderMultiDataSetIterator iterator =
        new RecordReaderMultiDataSetIterator.Builder(64)
                .addReader("data", dataReader)
                // Use specific column ranges for different inputs
                .addInput("data", 0, 49)  // Input 0: Columns 0-49 (features)
                .addInput("data", 50, 99) // Input 1: Columns 50-99 (different feature type)
                // Use specific columns for outputs
                .addOutputOneHot("data", 100, 3) // Output 0: Column 100, 3 classes
                .addOutput("data", 101, 105)     // Output 1: Columns 101-105 (regression)
                .build();
```

```java
// Add random time offset for data augmentation
// Assumes sequenceReader is an already-initialized SequenceRecordReader
RecordReaderMultiDataSetIterator iterator =
        new RecordReaderMultiDataSetIterator.Builder(32)
                .addSequenceReader("sequences", sequenceReader)
                .addInput("sequences")
                .addOutput("sequences", 10, 12)
                // Enable random time series offset
                .timeSeriesRandomOffset(true, 12345L) // enabled, with seed
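                // (Shorter sequences are placed at a random offset within the padded
                // time range rather than at a fixed start/end position)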
                .build();
```

```java
import org.nd4j.linalg.dataset.api.preprocessor.MultiDataSetPreProcessor;
import org.nd4j.linalg.dataset.api.preprocessor.MultiNormalizerMinMaxScaler;

// Create iterator (reader1 and reader2 assumed initialized as in earlier examples)
RecordReaderMultiDataSetIterator iterator =
        new RecordReaderMultiDataSetIterator.Builder(32)
                .addReader("input1", reader1)
                .addReader("input2", reader2)
                .addInput("input1")
                .addInput("input2")
                .addOutput("input1", 10, 12)
                .build();

// Fit the preprocessor on the raw data first
MultiNormalizerMinMaxScaler scaler = new MultiNormalizerMinMaxScaler();
scaler.fit(iterator);
iterator.reset();

// Then attach it, so every batch is normalized on the fly
iterator.setPreProcessor(scaler);

// Use preprocessed data
while (iterator.hasNext()) {
    MultiDataSet preprocessed = iterator.next();
    // All inputs normalized independently
}
```

Builder method reference:

- `addReader(name, reader)`: Add a RecordReader with a string identifier
- `addSequenceReader(name, reader)`: Add a SequenceRecordReader with an identifier
- `addInput(readerName)`: Use all columns from the reader as input
- `addInput(readerName, start, end)`: Use columns start to end (inclusive) as input
- `addInputOneHot(readerName, column, numClasses)`: Convert a column to one-hot encoding (see the sketch after the install command below)
- `addOutput(readerName)`: Use all columns from the reader as output
- `addOutput(readerName, start, end)`: Use columns start to end (inclusive) as output
- `addOutputOneHot(readerName, column, numClasses)`: Convert a column to one-hot encoding

Install with Tessl CLI

```bash
npx tessl i tessl/maven-org-datavec--datavec-local
```
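
As referenced in the method list above, a short sketch of `addInputOneHot` (the file `categories.csv` and its column layout are hypothetical; the builder calls are those documented above):

```java
import java.io.File;
import org.datavec.api.records.reader.RecordReader;
import org.datavec.api.records.reader.impl.csv.CSVRecordReader;
import org.datavec.api.split.FileSplit;
import org.deeplearning4j.datasets.datavec.RecordReaderMultiDataSetIterator;

// Hypothetical CSV: column 0 holds an integer category in [0, 4], column 1 a numeric target
RecordReader reader = new CSVRecordReader();
reader.initialize(new FileSplit(new File("categories.csv")));

RecordReaderMultiDataSetIterator iterator =
        new RecordReaderMultiDataSetIterator.Builder(32)
                .addReader("data", reader)
                .addInputOneHot("data", 0, 5) // column 0 expanded to a 5-element one-hot input
                .addOutput("data", 1, 1)      // column 1 as a single regression target
                .build();
```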