ETL library for machine learning data preprocessing across diverse formats including HDFS, Spark, Images, Video, Audio, CSV, and Excel
—
Record readers provide the core functionality for reading structured data from various sources in DataVec. They implement a consistent iterator-based pattern and support metadata tracking for data lineage and debugging.
The base interface that all record readers implement. Provides standard iteration patterns, initialization, and optional batch reading capabilities.
public interface RecordReader {
void initialize(InputSplit split) throws IOException;
List<Writable> next();
boolean hasNext();
void reset();
List<String> getLabels();
Record nextRecord();
boolean batchesSupported();
List<Writable> next(int numRecords);
}

Usage Example:
RecordReader reader = new CSVRecordReader();
reader.initialize(new FileSplit(new File("data.csv")));
while (reader.hasNext()) {
List<Writable> record = reader.next();
// Process record
}
reader.reset(); // Reset for reuse

Reads comma-separated values files with configurable delimiters and skip lines. Automatically handles type inference and provides labels for classification tasks.
public class CSVRecordReader implements RecordReader {
public CSVRecordReader();
public CSVRecordReader(int skipLines, String delimiter);
public CSVRecordReader(int skipLines, String delimiter, String quote);
}

Constructor Parameters:
- skipLines - Number of header lines to skip (default: 0)
- delimiter - Field separator character (default: ",")
- quote - Quote character for escaped fields (default: '"')

Usage Example:
// Read CSV with header line
RecordReader csvReader = new CSVRecordReader(1, ",");
csvReader.initialize(new FileSplit(new File("data.csv")));
while (csvReader.hasNext()) {
List<Writable> record = csvReader.next();
// First column as integer
int id = record.get(0).toInt();
// Second column as string
String name = record.get(1).toString();
// Third column as double
double value = record.get(2).toDouble();
}

Handles sequential or time-series data where each record consists of multiple time steps. Extends RecordReader with sequence-specific methods.
public interface SequenceRecordReader extends RecordReader {
List<List<Writable>> sequenceRecord();
List<List<Writable>> sequenceRecord(URI uri, DataInputStream dataInputStream) throws IOException;
SequenceRecord nextSequence();
}
public class CSVSequenceRecordReader implements SequenceRecordReader {
public CSVSequenceRecordReader();
public CSVSequenceRecordReader(int skipLines, String delimiter);
}

Usage Example:
SequenceRecordReader seqReader = new CSVSequenceRecordReader();
seqReader.initialize(new FileSplit(new File("sequence_data.csv")));
while (seqReader.hasNext()) {
List<List<Writable>> sequence = seqReader.sequenceRecord();
// Process sequence - each inner list is a time step
for (List<Writable> timeStep : sequence) {
// Process individual time step
}
}

Reads data from in-memory Java collections, useful for testing and when data is already loaded in memory.
public class CollectionRecordReader implements RecordReader {
public CollectionRecordReader(Collection<Collection<Writable>> records);
public CollectionRecordReader(RecordReader recordReader);
}
public class CollectionSequenceRecordReader implements SequenceRecordReader {
public CollectionSequenceRecordReader(Collection<Collection<Collection<Writable>>> sequences);
}

Usage Example:
// Create data in memory
List<List<Writable>> data = Arrays.asList(
Arrays.asList(new IntWritable(1), new DoubleWritable(2.5)),
Arrays.asList(new IntWritable(2), new DoubleWritable(3.7))
);
RecordReader collectionReader = new CollectionRecordReader(data);
collectionReader.initialize(new CollectionInputSplit(data));
while (collectionReader.hasNext()) {
List<Writable> record = collectionReader.next();
// Process in-memory record
}

All record readers support metadata tracking for data provenance and debugging. Metadata includes source location, line numbers, and transformation history.
public interface Record {
List<Writable> getRecord();
RecordMetaData getMetaData();
}
public interface RecordMetaData {
String getLocation();
URI getURI();
Class<?> getReaderClass();
}

Usage Example:
RecordReader reader = new CSVRecordReader();
reader.initialize(new FileSplit(new File("data.csv")));
while (reader.hasNext()) {
Record recordWithMeta = reader.nextRecord();
List<Writable> data = recordWithMeta.getRecord();
RecordMetaData meta = recordWithMeta.getMetaData();
System.out.println("Data from: " + meta.getLocation());
// Process data with metadata context
}

Some record readers support batch reading for improved performance when processing large datasets.
// Check if batch reading is supported
if (reader.batchesSupported()) {
List<Writable> batch = reader.next(batchSize);
// Process batch of records
}

DataVec provides comprehensive metadata tracking through a hierarchy of interfaces and classes that enable data lineage, debugging, and provenance tracking.
public interface RecordMetaData {
String getLocation();
URI getURI();
Class<?> getReaderClass();
}
public interface RecordMetaDataComposable extends RecordMetaData {
List<RecordMetaData> getMeta();
}
public class RecordMetaDataComposableMap implements RecordMetaDataComposable {
public RecordMetaDataComposableMap(Map<String, RecordMetaData> meta);
public RecordMetaData getMeta(String key);
public Set<String> getMetaKeys();
}

Usage Example:
RecordReader reader = new CSVRecordReader();
reader.initialize(new FileSplit(new File("data.csv")));
while (reader.hasNext()) {
Record recordWithMeta = reader.nextRecord();
List<Writable> data = recordWithMeta.getRecord();
RecordMetaData meta = recordWithMeta.getMetaData();
// Access metadata information
String sourceLocation = meta.getLocation();
URI sourceURI = meta.getURI();
Class<?> readerClass = meta.getReaderClass();
System.out.println("Processing record from: " + sourceLocation);
System.out.println("Read by: " + readerClass.getSimpleName());
// For composite metadata
if (meta instanceof RecordMetaDataComposable) {
RecordMetaDataComposable composite = (RecordMetaDataComposable) meta;
List<RecordMetaData> allMeta = composite.getMeta();
System.out.println("Composite metadata contains " + allMeta.size() + " entries");
}
}

DataVec defines specific exceptions for different error conditions during data processing:
public class WritableConverterException extends Exception {
public WritableConverterException(String message);
public WritableConverterException(String message, Throwable cause);
}
public class ZeroLengthSequenceException extends RuntimeException {
public ZeroLengthSequenceException(String message);
}

Common Exception Scenarios:
try {
RecordReader reader = new CSVRecordReader();
reader.initialize(new FileSplit(new File("data.csv")));
while (reader.hasNext()) {
List<Writable> record = reader.next();
// Custom converter may throw WritableConverterException
WritableConverter converter = new CustomConverter();
for (int i = 0; i < record.size(); i++) {
Writable converted = converter.convert(record.get(i));
record.set(i, converted);
}
}
} catch (IOException e) {
// Handle file I/O errors
System.err.println("Error reading file: " + e.getMessage());
} catch (WritableConverterException e) {
// Handle data conversion errors
System.err.println("Error converting data: " + e.getMessage());
} catch (ZeroLengthSequenceException e) {
// Handle empty sequence errors
System.err.println("Empty sequence encountered: " + e.getMessage());
}

Reads data from general file inputs with customizable parsing logic.
public class FileRecordReader implements RecordReader {
public FileRecordReader();
public FileRecordReader(RecordReader wrappedReader);
}

Usage Example:
FileRecordReader fileReader = new FileRecordReader();
fileReader.initialize(new FileSplit(new File("data.txt")));
while (fileReader.hasNext()) {
List<Writable> record = fileReader.next();
// Process file-based record
}

Reads text files line by line, treating each line as a single record.
public class LineRecordReader implements RecordReader {
public LineRecordReader();
public LineRecordReader(String delimiter);
}

Usage Example:
LineRecordReader lineReader = new LineRecordReader();
lineReader.initialize(new FileSplit(new File("textfile.txt")));
while (lineReader.hasNext()) {
List<Writable> record = lineReader.next();
String line = record.get(0).toString(); // Each record contains one line as Text
}

Combines multiple record readers for complex data processing workflows.
public class ComposableRecordReader implements RecordReader {
public ComposableRecordReader(RecordReader... readers);
public ComposableRecordReader(List<RecordReader> readers);
}

Usage Example:
RecordReader csvReader = new CSVRecordReader();
RecordReader imageReader = new ImageRecordReader(64, 64, 3, labelGenerator);
ComposableRecordReader composableReader = new ComposableRecordReader(csvReader, imageReader);
composableReader.initialize(new FileSplit(new File("mixed_data")));
while (composableReader.hasNext()) {
List<Writable> record = composableReader.next();
// Process combined record from multiple readers
}

Sequentially processes multiple record readers, concatenating their outputs.
public class ConcatenatingRecordReader implements RecordReader {
public ConcatenatingRecordReader(RecordReader... readers);
public ConcatenatingRecordReader(List<RecordReader> readers);
}

Usage Example:
RecordReader reader1 = new CSVRecordReader();
reader1.initialize(new FileSplit(new File("part1.csv")));
RecordReader reader2 = new CSVRecordReader();
reader2.initialize(new FileSplit(new File("part2.csv")));
ConcatenatingRecordReader concatReader = new ConcatenatingRecordReader(reader1, reader2);
while (concatReader.hasNext()) {
List<Writable> record = concatReader.next();
// Process records from both files sequentially
}

Record readers integrate seamlessly with DL4J's DataSetIterator for machine learning workflows:
RecordReader recordReader = new CSVRecordReader();
recordReader.initialize(new FileSplit(new File("training_data.csv")));
DataSetIterator iterator = new RecordReaderDataSetIterator(
recordReader,
batchSize, // Number of examples per batch
labelIndex, // Column index of the label
numClasses // Number of possible classes
);

Record readers may throw various exceptions during operation:
- IOException - File I/O errors during reading
- NumberFormatException - Invalid numeric data in CSV files
- IllegalStateException - Reader not properly initialized

try {
reader.initialize(new FileSplit(new File("data.csv")));
while (reader.hasNext()) {
List<Writable> record = reader.next();
// Process record
}
} catch (IOException e) {
// Handle file I/O errors
} catch (NumberFormatException e) {
// Handle invalid numeric data
}

public interface RecordReader {
void initialize(InputSplit split) throws IOException;
List<Writable> next();
boolean hasNext();
void reset();
List<String> getLabels();
Record nextRecord();
boolean batchesSupported();
List<Writable> next(int numRecords);
}
public interface SequenceRecordReader extends RecordReader {
List<List<Writable>> sequenceRecord();
List<List<Writable>> sequenceRecord(URI uri, DataInputStream dataInputStream) throws IOException;
SequenceRecord nextSequence();
}
public interface Record {
List<Writable> getRecord();
RecordMetaData getMetaData();
}
public interface SequenceRecord {
List<List<Writable>> getSequenceRecord();
RecordMetaData getMetaData();
}

// CSV-based readers
public class CSVRecordReader implements RecordReader;
public class CSVSequenceRecordReader implements SequenceRecordReader;
// Collection-based readers
public class CollectionRecordReader implements RecordReader;
public class CollectionSequenceRecordReader implements SequenceRecordReader;
// File-based readers
public class FileRecordReader implements RecordReader;
public class LineRecordReader implements RecordReader;
// Composite readers
public class ComposableRecordReader implements RecordReader;
public class ConcatenatingRecordReader implements RecordReader;

Install with Tessl CLI
npx tessl i tessl/maven-org-datavec--datavec-api