DataVec integration library providing data loading, transformation, and Spark processing capabilities for DeepLearning4j
---
Core functionality for converting RecordReader data into DataSet objects suitable for DeepLearning4j training. The RecordReaderDataSetIterator provides a bridge between DataVec's data reading capabilities and DeepLearning4j's training requirements.
Main class for converting RecordReader data into DataSet objects for neural network training.
public class RecordReaderDataSetIterator implements DataSetIterator, Serializable {
// Main constructors
public RecordReaderDataSetIterator(RecordReader recordReader, int batchSize);
public RecordReaderDataSetIterator(RecordReader recordReader, int batchSize,
int labelIndex, int numPossibleLabels);
public RecordReaderDataSetIterator(RecordReader recordReader, int batchSize,
int labelIndex, int numPossibleLabels,
boolean regression);
public RecordReaderDataSetIterator(RecordReader recordReader,
WritableConverter converter, int batchSize,
int labelIndex, int numPossibleLabels,
boolean regression);
public RecordReaderDataSetIterator(RecordReader recordReader, int batchSize,
int labelIndexFrom, int labelIndexTo,
boolean regression);
public RecordReaderDataSetIterator(RecordReader recordReader, int batchSize,
int labelIndex, int numPossibleLabels,
int maxNumBatches);
// Iterator methods
public boolean hasNext();
public DataSet next();
public DataSet next(int num);
public void remove();
// Configuration methods
public void setPreProcessor(DataSetPreProcessor preProcessor);
public DataSetPreProcessor getPreProcessor();
public void setCollectMetaData(boolean collectMetaData);
public boolean getCollectMetaData();
// Information methods
public int totalExamples();
public int inputColumns();
public int totalOutcomes();
public int batch();
public int cursor();
public int numExamples();
public List<String> getLabels();
// Reset and async support
public boolean resetSupported();
public boolean asyncSupported();
public void reset();
// Metadata support
public DataSet loadFromMetaData(RecordMetaData recordMetaData) throws IOException;
public DataSet loadFromMetaData(List<RecordMetaData> recordMetaDatas) throws IOException;
}

## Basic Usage Example

import org.datavec.api.records.reader.impl.csv.CSVRecordReader;
import org.datavec.api.split.FileSplit;
import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator;
// Setup CSV reader
RecordReader csvReader = new CSVRecordReader();
csvReader.initialize(new FileSplit(new File("iris.csv")));
// Create iterator for classification
DataSetIterator iterator = new RecordReaderDataSetIterator(
csvReader, // recordReader
32, // batchSize
4, // labelIndex (column 4 contains labels)
3 // numPossibleLabels (3 classes)
);
// Use iterator
while (iterator.hasNext()) {
DataSet dataSet = iterator.next();
System.out.println("Features shape: " + Arrays.toString(dataSet.getFeatures().shape()));
System.out.println("Labels shape: " + Arrays.toString(dataSet.getLabels().shape()));
}

## Regression Example

// Setup for regression task
DataSetIterator regressionIterator = new RecordReaderDataSetIterator(
csvReader, // recordReader
64, // batchSize
5, // labelIndex (column 5 contains continuous target)
1, // numPossibleLabels (ignored when regression = true; often passed as -1)
true // regression = true
);

## Multi-Label Example

// Labels in columns 3, 4, and 5
DataSetIterator multiLabelIterator = new RecordReaderDataSetIterator(
csvReader, // recordReader
32, // batchSize
3, // labelIndexFrom (start of label columns)
5, // labelIndexTo (end of label columns)
false // regression = false — NOTE(review): this from/to constructor is documented primarily for multi-output regression; verify classification use against the DL4J Javadoc
);

## Preprocessing Example

import org.nd4j.linalg.dataset.api.preprocessor.NormalizerMinMaxScaler;
// Create iterator
DataSetIterator iterator = new RecordReaderDataSetIterator(csvReader, 32, 4, 3);
// Add preprocessing
NormalizerMinMaxScaler scaler = new NormalizerMinMaxScaler();
iterator.setPreProcessor(scaler);
// First pass to calculate min/max
scaler.fit(iterator);
iterator.reset();
// Now use normalized data
while (iterator.hasNext()) {
DataSet normalizedData = iterator.next();
// Train with normalized data
}

## Metadata Example

// Enable metadata collection
RecordReaderDataSetIterator iterator = new RecordReaderDataSetIterator(
csvReader, 32, 4, 3);
iterator.setCollectMetaData(true);
// Process data
DataSet batch = iterator.next();
List<RecordMetaData> metaData = batch.getExampleMetaData();
// Later, load specific examples by metadata
DataSet specificExample = iterator.loadFromMetaData(metaData.get(0));

## Error Handling

The iterator handles various error conditions.
Common validation performed:
## Installation

Install with the Tessl CLI:
npx tessl i tessl/maven-org-datavec--datavec-local