or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

tessl/maven-org-deeplearning4j--dl4j-spark-2-11

DeepLearning4j Spark integration component providing DataVec transformations and DataSet operations for distributed deep learning in Apache Spark environments

Workspace
tessl
Visibility
Public
Created
Last updated
Describes
pkg:maven/org.deeplearning4j/dl4j-spark_2.11@0.9.x

To install, run

npx @tessl/cli install tessl/maven-org-deeplearning4j--dl4j-spark-2-11@0.9.0

0

# DeepLearning4j Spark Integration

1

2

DeepLearning4j Spark Integration (dl4j-spark) provides data preprocessing and transformation operations specifically designed for Apache Spark environments within the DeepLearning4j ecosystem. This component transforms various data formats (collections, sequences, bytes, strings) into DataSet objects suitable for deep learning training and inference in distributed Spark applications.

3

4

## Package Information

5

6

- **Package Name**: org.deeplearning4j:dl4j-spark_2.11

7

- **Package Type**: Maven

8

- **Language**: Java/Scala

9

- **Version**: 0.9.1_spark_1

10

- **Installation**: Add the following Maven dependency to your `pom.xml`:

11

12

```xml

13

<dependency>

14

<groupId>org.deeplearning4j</groupId>

15

<artifactId>dl4j-spark_2.11</artifactId>

16

<version>0.9.1_spark_1</version>

17

</dependency>

18

```

19

20

## Core Imports

21

22

```java

23

import org.deeplearning4j.spark.datavec.*;

24

import org.deeplearning4j.spark.datavec.export.*;

25

import org.deeplearning4j.datasets.datavec.*;

26

import org.apache.spark.api.java.function.*;

27

import org.datavec.api.writable.Writable;

28

import org.nd4j.linalg.dataset.DataSet;

29

```

30

31

## Basic Usage

32

33

```java

34

import org.deeplearning4j.spark.datavec.DataVecDataSetFunction;

35

import org.apache.spark.api.java.JavaRDD;

36

import org.datavec.api.writable.Writable;

37

import org.nd4j.linalg.dataset.DataSet;

38

import java.util.List;

39

40

// Create a function to transform DataVec records to DataSet

41

DataVecDataSetFunction transformer = new DataVecDataSetFunction(

42

4, // labelIndex - column index for labels

43

10, // numPossibleLabels - number of classes for classification

44

false // regression - false for classification, true for regression

45

);

46

47

// Apply transformation to RDD of Writable collections

48

JavaRDD<List<Writable>> records = // ... your DataVec records RDD

49

JavaRDD<DataSet> datasets = records.map(transformer);

50

```

51

52

## Architecture

53

54

The DeepLearning4j Spark integration follows a functional transformation pattern in which different `Function` implementations convert various input formats into standardized `DataSet` objects. The architecture consists of:

55

56

- **Transformation Functions**: Convert different data types to DataSet objects

57

- **Batch Processing Utilities**: Handle mini-batch partitioning for efficient processing

58

- **Export Functions**: Persist transformed datasets to storage systems

59

- **Core Integration**: Bridge between DataVec's data handling and Spark's distributed processing

60

61

## Capabilities

62

63

### Basic Data Transformation

64

65

Converts DataVec Writable collections into DataSet objects for both classification and regression tasks.

66

67

```java { .api }

68

public class DataVecDataSetFunction implements Function<List<Writable>, DataSet> {

69

public DataVecDataSetFunction(int labelIndex, int numPossibleLabels, boolean regression);

70

public DataVecDataSetFunction(int labelIndex, int numPossibleLabels, boolean regression,

71

DataSetPreProcessor preProcessor, WritableConverter converter);

72

public DataVecDataSetFunction(int labelIndexFrom, int labelIndexTo, int numPossibleLabels,

73

boolean regression, DataSetPreProcessor preProcessor,

74

WritableConverter converter);

75

public DataSet call(List<Writable> currList) throws Exception;

76

}

77

```

78

79

[Data Transformation Functions](./data-transformation.md)

80

81

### Sequence Data Processing

82

83

Handles time series and sequential data conversion with support for variable-length sequences and alignment modes.

84

85

```java { .api }

86

public class DataVecSequenceDataSetFunction implements Function<List<List<Writable>>, DataSet> {

87

public DataVecSequenceDataSetFunction(int labelIndex, int numPossibleLabels, boolean regression);

88

public DataVecSequenceDataSetFunction(int labelIndex, int numPossibleLabels, boolean regression,

89

DataSetPreProcessor preProcessor, WritableConverter converter);

90

public DataSet call(List<List<Writable>> input) throws Exception;

91

}

92

```

93

94

```java { .api }

95

public class DataVecSequencePairDataSetFunction

96

implements Function<Tuple2<List<List<Writable>>, List<List<Writable>>>, DataSet> {

97

98

public enum AlignmentMode { EQUAL_LENGTH, ALIGN_START, ALIGN_END }

99

100

public DataVecSequencePairDataSetFunction();

101

public DataVecSequencePairDataSetFunction(int numPossibleLabels, boolean regression);

102

public DataVecSequencePairDataSetFunction(int numPossibleLabels, boolean regression,

103

AlignmentMode alignmentMode);

104

public DataVecSequencePairDataSetFunction(int numPossibleLabels, boolean regression,

105

AlignmentMode alignmentMode,

106

DataSetPreProcessor preProcessor,

107

WritableConverter converter);

108

public DataSet call(Tuple2<List<List<Writable>>, List<List<Writable>>> input) throws Exception;

109

}

110

```

111

112

[Sequence Processing](./sequence-processing.md)

113

114

### Specialized Input Formats

115

116

Processes binary data and string records using RecordReader implementations.

117

118

```java { .api }

119

public class DataVecByteDataSetFunction implements PairFunction<Tuple2<Text, BytesWritable>, Double, DataSet> {

120

public DataVecByteDataSetFunction(int labelIndex, int numPossibleLabels, int batchSize, int byteFileLen);

121

public DataVecByteDataSetFunction(int labelIndex, int numPossibleLabels, int batchSize,

122

int byteFileLen, boolean regression);

123

public DataVecByteDataSetFunction(int labelIndex, int numPossibleLabels, int batchSize,

124

int byteFileLen, boolean regression,

125

DataSetPreProcessor preProcessor);

126

public Tuple2<Double, DataSet> call(Tuple2<Text, BytesWritable> inputTuple) throws Exception;

127

}

128

```

129

130

```java { .api }

131

public class RecordReaderFunction implements Function<String, DataSet> {

132

public RecordReaderFunction(RecordReader recordReader, int labelIndex, int numPossibleLabels,

133

WritableConverter converter);

134

public RecordReaderFunction(RecordReader recordReader, int labelIndex, int numPossibleLabels);

135

public DataSet call(String v1) throws Exception;

136

}

137

```

138

139

[Specialized Inputs](./specialized-inputs.md)

140

141

### Batch Processing and Export

142

143

Handles mini-batch creation and dataset export functionality for production workflows.

144

145

```java { .api }

146

public class RDDMiniBatches implements Serializable {

147

public RDDMiniBatches(int miniBatches, JavaRDD<DataSet> toSplit);

148

public JavaRDD<DataSet> miniBatchesJava();

149

}

150

```

151

152

```java { .api }

153

public class StringToDataSetExportFunction implements VoidFunction<Iterator<String>> {

154

public StringToDataSetExportFunction(URI outputDir, RecordReader recordReader, int batchSize,

155

boolean regression, int labelIndex, int numPossibleLabels);

156

public void call(Iterator<String> stringIterator) throws Exception;

157

}

158

```

159

160

[Batch Processing and Export](./batch-export.md)

161

162

## Supporting Types

163

164

```java { .api }

165

// From DataVec API

166

interface Writable {

167

double toDouble();

168

int toInt();

169

String toString();

170

}

171

172

class NDArrayWritable implements Writable {

173

public NDArrayWritable(INDArray array);

174

public INDArray get();

175

}

176

177

interface WritableConverter {

178

Writable convert(Writable writable) throws WritableConverterException;

179

}

180

181

// From ND4J

182

interface DataSetPreProcessor {

183

void preProcess(DataSet dataSet);

184

}

185

186

class DataSet {

187

public DataSet(INDArray features, INDArray labels);

188

public INDArray getFeatureMatrix();

189

public INDArray getLabels();

190

public static DataSet merge(List<DataSet> dataSets);

191

}

192

193

// From DataVec Records API

194

interface RecordReader {

195

void initialize(InputSplit split) throws IOException, InterruptedException;

196

List<Writable> next();

197

boolean hasNext();

198

}

199

200

// From Hadoop

201

class Text implements WritableComparable<Text> {

202

public Text(String string);

203

public String toString();

204

}

205

206

class BytesWritable implements WritableComparable<BytesWritable> {

207

public byte[] getBytes();

208

public int getLength();

209

}

210

211

// From the Scala standard library (scala.Tuple2), used by Spark's Java API

212

class Tuple2<T1, T2> {

213

public T1 _1();

214

public T2 _2();

215

}

216

```

217

218

## Exception Types

219

220

```java { .api }

221

class ZeroLengthSequenceException extends RuntimeException {

222

public ZeroLengthSequenceException(String message);

223

}

224

225

class WritableConverterException extends Exception {

226

public WritableConverterException(String message);

227

public WritableConverterException(String message, Throwable cause);

228

}

229

```