or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

bag-of-words.md dataset-loading.md document-embeddings.md glove.md index.md text-processing.md word-embeddings.md

index.md docs/

0

# DeepLearning4J NLP

1

2

DeepLearning4J NLP is a comprehensive natural language processing library for Java that provides state-of-the-art word embeddings, document classification, and text processing capabilities. Built on the DeepLearning4J neural network framework, it offers scalable implementations of Word2Vec, GloVe, ParagraphVectors (Doc2Vec), and extensive text preprocessing utilities with support for parallel processing and production deployment.

3

4

## Package Information

5

6

- **Package Name**: org.deeplearning4j:deeplearning4j-nlp

7

- **Package Type**: maven

8

- **Language**: Java

9

- **Version**: 0.9.1

10

- **Installation**:

11

```xml

12

<dependency>

13

<groupId>org.deeplearning4j</groupId>

14

<artifactId>deeplearning4j-nlp</artifactId>

15

<version>0.9.1</version>

16

</dependency>

17

```

18

19

## Core Imports

20

21

```java

22

// Core word embedding models

23

import org.deeplearning4j.models.word2vec.Word2Vec;

24

import org.deeplearning4j.models.glove.Glove;

25

import org.deeplearning4j.models.paragraphvectors.ParagraphVectors;

26

27

// Text processing utilities

28

import org.deeplearning4j.text.sentenceiterator.SentenceIterator;

29

import org.deeplearning4j.text.documentiterator.DocumentIterator;

30

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

31

import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;

32

33

// Vocabulary and word representations

34

import org.deeplearning4j.models.word2vec.VocabWord;

35

import org.deeplearning4j.models.word2vec.wordstore.VocabCache;

36

```

37

38

## Basic Usage

39

40

```java

41

import org.deeplearning4j.models.word2vec.Word2Vec;

42

import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator;

43

import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;

44

import java.util.Arrays;

45

import java.util.Collection;

46

47

// Prepare training data

48

Collection<String> sentences = Arrays.asList(

49

"The quick brown fox jumps over the lazy dog",

50

"Natural language processing with deep learning",

51

"Word embeddings capture semantic relationships"

52

);

53

54

// Train Word2Vec model

55

Word2Vec vec = new Word2Vec.Builder()

56

.minWordFrequency(1)

57

.iterations(5)

58

.layerSize(100)

59

.seed(42)

60

.windowSize(5)

61

.iterate(new CollectionSentenceIterator(sentences))

62

.tokenizerFactory(new DefaultTokenizerFactory())

63

.build();

64

65

vec.fit();

66

67

// Use the trained model

68

double similarity = vec.similarity("quick", "brown"); // both words must occur in the training sentences

69

Collection<String> nearestWords = vec.wordsNearest("fox", 5);

70

System.out.println("Similarity: " + similarity);

71

System.out.println("Nearest to 'fox': " + nearestWords);

72

```

73

74

## Architecture

75

76

DeepLearning4J NLP is built around several key architectural components:

77

78

- **Embedding Models**: Word2Vec, GloVe, and ParagraphVectors implementations based on a unified SequenceVectors framework

79

- **Builder Pattern**: Extensive use of builder classes for configurable model construction with sensible defaults

80

- **Text Processing Pipeline**: Modular tokenization, sentence iteration, and document processing with pluggable components

81

- **Parallel Processing**: Multi-threaded training and inference with configurable worker threads

82

- **Memory Management**: Efficient vocabulary caching and weight lookup tables optimized for large-scale text processing

83

- **Extensible Design**: Abstract base classes and interfaces allowing custom implementations of learning algorithms, iterators, and transformers

84

85

## Capabilities

86

87

### Word Embeddings (Word2Vec)

88

89

Dense vector representations of words trained using skip-gram or CBOW algorithms. Captures semantic and syntactic relationships between words in high-dimensional vector space.

90

91

```java { .api }

92

public class Word2Vec extends SequenceVectors<VocabWord> {

93

public void setTokenizerFactory(TokenizerFactory tokenizerFactory);

94

public void setSentenceIterator(SentenceIterator iterator);

95

public void setSequenceIterator(SequenceIterator<VocabWord> iterator);

96

97

public static class Builder extends SequenceVectors.Builder<VocabWord> {

98

public Builder iterate(SentenceIterator iterator);

99

public Builder iterate(DocumentIterator iterator);

100

public Builder tokenizerFactory(TokenizerFactory tokenizerFactory);

101

public Builder batchSize(int batchSize);

102

public Builder iterations(int iterations);

103

public Builder epochs(int numEpochs);

104

public Builder layerSize(int layerSize);

105

public Builder learningRate(double learningRate);

106

public Builder minWordFrequency(int minWordFrequency);

107

public Builder windowSize(int windowSize);

108

public Builder seed(long randomSeed);

109

public Builder workers(int numWorkers);

110

public Word2Vec build();

111

}

112

}

113

```

114

115

[Word Embeddings](./word-embeddings.md)

116

117

### Global Vectors (GloVe)

118

119

Matrix factorization-based word embeddings that combine global statistical information with local context windows. Efficiently captures word co-occurrence statistics across large corpora.

120

121

```java { .api }

122

public class Glove extends SequenceVectors<VocabWord> {

123

public static class Builder extends SequenceVectors.Builder<VocabWord> {

124

public Builder tokenizerFactory(TokenizerFactory tokenizerFactory);

125

public Builder xMax(double xMax);

126

public Builder symmetric(boolean reallySymmetric);

127

public Builder shuffle(boolean reallyShuffle);

128

public Builder alpha(double alpha);

129

public Builder iterate(SentenceIterator iterator);

130

public Builder iterate(DocumentIterator iterator);

131

public Builder maxMemory(int gbytes);

132

public Glove build();

133

}

134

}

135

```

136

137

[Global Vectors](./glove.md)

138

139

### Document Embeddings (ParagraphVectors)

140

141

Document-level embeddings (Doc2Vec) that create vector representations for entire documents, sentences, or paragraphs. Enables document similarity, classification, and clustering tasks.

142

143

```java { .api }

144

public class ParagraphVectors extends Word2Vec {

145

public String predict(String rawText);

146

public String predict(LabelledDocument document);

147

public Collection<String> predictSeveral(String rawText, int limit);

148

public INDArray inferVector(String text);

149

public INDArray inferVector(LabelledDocument document);

150

public Future<INDArray> inferVectorBatched(String document);

151

public Collection<String> nearestLabels(String rawText, int topN);

152

public double similarityToLabel(String rawText, String label);

153

154

public static class Builder extends Word2Vec.Builder {

155

public Builder useExistingWordVectors(WordVectors vec);

156

public Builder trainWordVectors(boolean trainElements);

157

public Builder labelsSource(LabelsSource source);

158

public Builder iterate(LabelAwareDocumentIterator iterator);

159

public Builder iterate(LabelAwareSentenceIterator iterator);

160

public ParagraphVectors build();

161

}

162

}

163

```

164

165

[Document Embeddings](./document-embeddings.md)

166

167

### Text Processing and Tokenization

168

169

Comprehensive text preprocessing pipeline with support for multiple tokenization strategies, sentence boundary detection, and document iteration patterns.

170

171

```java { .api }

172

public interface SentenceIterator {

173

String nextSentence();

174

boolean hasNext();

175

void reset();

176

void finish();

177

SentencePreProcessor getPreProcessor();

178

void setPreProcessor(SentencePreProcessor preProcessor);

179

}

180

181

public interface TokenizerFactory {

182

Tokenizer create(String toTokenize);

183

Tokenizer create(InputStream toTokenize);

184

void setTokenPreProcessor(TokenPreProcess preProcessor);

185

TokenPreProcess getTokenPreProcessor();

186

}

187

188

public interface DocumentIterator extends Serializable {

189

InputStream nextDocument();

190

boolean hasNext();

191

void reset();

192

}

193

```

194

195

[Text Processing](./text-processing.md)

196

197

### Bag of Words Vectorization

198

199

Traditional text vectorization methods including TF-IDF and bag-of-words representations for document classification and information retrieval tasks.

200

201

```java { .api }

202

public interface TextVectorizer {

203

// Vectorization interface for text processing

204

}

205

206

public class BagOfWordsVectorizer implements TextVectorizer {

207

// Bag of words implementation

208

}

209

210

public class TfidfVectorizer implements TextVectorizer {

211

// TF-IDF implementation

212

}

213

```

214

215

[Bag of Words](./bag-of-words.md)

216

217

### Dataset Loading and Iteration

218

219

Pre-built dataset loaders and iterators for common NLP datasets and data formats, designed for seamless integration with neural network training pipelines.

220

221

```java { .api }

222

public class CnnSentenceDataSetIterator {

223

// CNN sentence dataset iteration

224

}

225

226

public class ReutersNewsGroupsDataSetIterator {

227

// Reuters news groups dataset

228

}

229

230

public interface LabeledSentenceProvider {

231

// Labeled sentence provision interface

232

}

233

```

234

235

[Dataset Loading](./dataset-loading.md)

236

237

### Model Persistence and Serialization

238

239

Utilities for loading and saving Word2Vec models, including Google format compatibility and model serialization across different formats.

240

241

```java { .api }

242

public class WordVectorSerializer {

243

public static Word2Vec loadGoogleModel(File modelFile, boolean binary);

244

public static Word2Vec loadGoogleModel(File modelFile, boolean binary, boolean lineBreaks);

245

public static WordVectors loadGoogleModelNonNormalized(File modelFile, boolean binary, boolean lineBreaks);

246

public static void writeWord2VecModel(WordVectors vectors, File file);

247

public static void writeTsneWords(Word2Vec vec, List<String> labels, String path, INDArray tsne);

248

public static void writeWordVectors(WordVectors vectors, String path);

249

public static WordVectors loadTxtVectors(File vectorsFile);

250

}

251

```

252

253

### Sequence Vectors Framework

254

255

Core framework for implementing sequence-based embedding algorithms, providing the foundation for Word2Vec, GloVe, and ParagraphVectors implementations.

256

257

```java { .api }

258

public abstract class SequenceVectors<T extends SequenceElement> implements WordVectors {

259

public void fit();

260

public double similarity(String word1, String word2);

261

public Collection<String> wordsNearest(String word, int n);

262

public INDArray getWordVectorMatrix(String word);

263

public boolean hasWord(String word);

264

265

public static abstract class Builder<T extends SequenceElement> {

266

public Builder<T> minWordFrequency(int minWordFrequency);

267

public Builder<T> iterations(int iterations);

268

public Builder<T> layerSize(int layerSize);

269

public Builder<T> learningRate(double learningRate);

270

public Builder<T> windowSize(int windowSize);

271

public Builder<T> seed(long seed);

272

public Builder<T> workers(int workers);

273

}

274

}

275

```

276

277

### Node2Vec Graph Embeddings

278

279

Graph-based node embeddings using random walks to learn vector representations of nodes in networks and graphs.

280

281

```java { .api }

282

public class Node2Vec<V extends SequenceElement, E extends Number> extends SequenceVectors<V> {

283

284

public static class Builder<V extends SequenceElement, E extends Number> extends SequenceVectors.Builder<V> {

285

public Builder<V, E> setGraphHuffman(GraphHuffman huffman);

286

public Builder<V, E> setWalkLength(int walkLength);

287

public Builder<V, E> setNumWalks(int numWalks);

288

public Builder<V, E> setP(double p);

289

public Builder<V, E> setQ(double q);

290

public Node2Vec<V, E> build();

291

}

292

}

293

```

294

295

## Types

296

297

```java { .api }

298

public class VocabWord extends SequenceElement {

299

public VocabWord(double wordFrequency, String word);

300

public String getWord();

301

public boolean isLabel();

302

public int getIndex();

303

}

304

305

public interface VocabCache<T extends SequenceElement> {

306

boolean containsWord(String word);

307

T wordFor(String word);

308

int numWords();

309

Collection<T> vocabWords();

310

}

311

312

public interface WeightLookupTable<T extends SequenceElement> {

313

INDArray getWeights();

314

INDArray vector(String word);

315

}

316

317

public class LabelledDocument {

318

public String getContent();

319

public String getId();

320

public List<String> getLabels();

321

public List<VocabWord> getReferencedContent();

322

}

323

324

public class LabelsSource {

325

public LabelsSource();

326

public LabelsSource(List<String> labels);

327

public List<String> getLabels();

328

}

329

330

public abstract class SequenceElement implements Serializable {

331

public abstract String getLabel();

332

public abstract void setIndex(int index);

333

public abstract int getIndex();

334

public abstract long getElementFrequency();

335

public abstract void incrementElementFrequency();

336

public abstract void incrementElementFrequency(int by);

337

}

338

339

public interface SequenceIterator<T extends SequenceElement> {

340

Sequence<T> nextSequence();

341

boolean hasNext();

342

void reset();

343

SequenceIterator<T> getNewInstance();

344

}

345

346

public interface WordVectors {

347

double[] getWordVector(String word);

348

INDArray getWordVectorMatrix(String word);

349

double similarity(String word1, String word2);

350

Collection<String> wordsNearest(String word, int n);

351

boolean hasWord(String word);

352

Collection<String> vocab();

353

long vocabPackage();

354

}

355

```