DeepLearning4J Natural Language Processing module providing word embeddings, document classification, and text processing capabilities for neural network applications.
npx @tessl/cli install tessl/maven-org-deeplearning4j--deeplearning4j-nlp@0.9.0
# DeepLearning4J NLP

DeepLearning4J NLP is a comprehensive natural language processing library for Java that provides state-of-the-art word embeddings, document classification, and text processing capabilities. Built on the DeepLearning4J neural network framework, it offers scalable implementations of Word2Vec, GloVe, ParagraphVectors (Doc2Vec), and extensive text preprocessing utilities with support for parallel processing and production deployment.

## Package Information

- **Package Name**: org.deeplearning4j:deeplearning4j-nlp
- **Package Type**: maven
- **Language**: Java
- **Version**: 0.9.1
- **Installation**:
```xml
<dependency>
    <groupId>org.deeplearning4j</groupId>
    <artifactId>deeplearning4j-nlp</artifactId>
    <version>0.9.1</version>
</dependency>
```

## Core Imports

```java
// Core word embedding models
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.models.glove.Glove;
import org.deeplearning4j.models.paragraphvectors.ParagraphVectors;

// Text processing utilities
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.documentiterator.DocumentIterator;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;

// Vocabulary and word representations
import org.deeplearning4j.models.word2vec.VocabWord;
import org.deeplearning4j.models.word2vec.wordstore.VocabCache;
```

## Basic Usage

```java
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import java.util.Arrays;
import java.util.Collection;

// Prepare training data
Collection<String> sentences = Arrays.asList(
    "The quick brown fox jumps over the lazy dog",
    "Natural language processing with deep learning",
    "Word embeddings capture semantic relationships"
);

// Train Word2Vec model
Word2Vec vec = new Word2Vec.Builder()
    .minWordFrequency(1)
    .iterations(5)
    .layerSize(100)
    .seed(42)
    .windowSize(5)
    .iterate(new CollectionSentenceIterator(sentences))
    .tokenizerFactory(new DefaultTokenizerFactory())
    .build();

vec.fit();

// Use the trained model (both words must appear in the training vocabulary)
double similarity = vec.similarity("quick", "brown");
Collection<String> nearestWords = vec.wordsNearest("fox", 5);
System.out.println("Similarity: " + similarity);
System.out.println("Nearest to 'fox': " + nearestWords);
```

## Architecture

DeepLearning4J NLP is built around several key architectural components:

- **Embedding Models**: Word2Vec, GloVe, and ParagraphVectors implementations based on a unified SequenceVectors framework
- **Builder Pattern**: Extensive use of builder classes for configurable model construction with sensible defaults
- **Text Processing Pipeline**: Modular tokenization, sentence iteration, and document processing with pluggable components
- **Parallel Processing**: Multi-threaded training and inference with configurable worker threads
- **Memory Management**: Efficient vocabulary caching and weight lookup tables optimized for large-scale text processing
- **Extensible Design**: Abstract base classes and interfaces allowing custom implementations of learning algorithms, iterators, and transformers

## Capabilities

### Word Embeddings (Word2Vec)

Dense vector representations of words trained using skip-gram or CBOW algorithms. Captures semantic and syntactic relationships between words in high-dimensional vector space.

```java { .api }
public class Word2Vec extends SequenceVectors<VocabWord> {
    public void setTokenizerFactory(TokenizerFactory tokenizerFactory);
    public void setSentenceIterator(SentenceIterator iterator);
    public void setSequenceIterator(SequenceIterator<VocabWord> iterator);

    public static class Builder extends SequenceVectors.Builder<VocabWord> {
        public Builder iterate(SentenceIterator iterator);
        public Builder iterate(DocumentIterator iterator);
        public Builder tokenizerFactory(TokenizerFactory tokenizerFactory);
        public Builder batchSize(int batchSize);
        public Builder iterations(int iterations);
        public Builder epochs(int numEpochs);
        public Builder layerSize(int layerSize);
        public Builder learningRate(double learningRate);
        public Builder minWordFrequency(int minWordFrequency);
        public Builder windowSize(int windowSize);
        public Builder seed(long randomSeed);
        public Builder workers(int numWorkers);
        public Word2Vec build();
    }
}
```

[Word Embeddings](./word-embeddings.md)

### Global Vectors (GloVe)

Matrix factorization-based word embeddings that combine global statistical information with local context windows. Efficiently captures word co-occurrence statistics across large corpora.

```java { .api }
public class Glove extends SequenceVectors<VocabWord> {
    public static class Builder extends SequenceVectors.Builder<VocabWord> {
        public Builder tokenizerFactory(TokenizerFactory tokenizerFactory);
        public Builder xMax(double xMax);
        public Builder symmetric(boolean reallySymmetric);
        public Builder shuffle(boolean reallyShuffle);
        public Builder alpha(double alpha);
        public Builder iterate(SentenceIterator iterator);
        public Builder iterate(DocumentIterator iterator);
        public Builder maxMemory(int gbytes);
        public Glove build();
    }
}
```

[Global Vectors](./glove.md)

### Document Embeddings (ParagraphVectors)

Document-level embeddings (Doc2Vec) that create vector representations for entire documents, sentences, or paragraphs. Enables document similarity, classification, and clustering tasks.

```java { .api }
public class ParagraphVectors extends Word2Vec {
    public String predict(String rawText);
    public String predict(LabelledDocument document);
    public Collection<String> predictSeveral(String rawText, int limit);
    public INDArray inferVector(String text);
    public INDArray inferVector(LabelledDocument document);
    public Future<INDArray> inferVectorBatched(String document);
    public Collection<String> nearestLabels(String rawText, int topN);
    public double similarityToLabel(String rawText, String label);

    public static class Builder extends Word2Vec.Builder {
        public Builder useExistingWordVectors(WordVectors vec);
        public Builder trainWordVectors(boolean trainElements);
        public Builder labelsSource(LabelsSource source);
        public Builder iterate(LabelAwareDocumentIterator iterator);
        public Builder iterate(LabelAwareSentenceIterator iterator);
        public ParagraphVectors build();
    }
}
```

[Document Embeddings](./document-embeddings.md)

### Text Processing and Tokenization

Comprehensive text preprocessing pipeline with support for multiple tokenization strategies, sentence boundary detection, and document iteration patterns.

```java { .api }
public interface SentenceIterator {
    String nextSentence();
    boolean hasNext();
    void reset();
    void finish();
    SentencePreProcessor getPreProcessor();
    void setPreProcessor(SentencePreProcessor preProcessor);
}

public interface TokenizerFactory {
    Tokenizer create(String toTokenize);
    Tokenizer create(InputStream toTokenize);
    void setTokenPreProcessor(TokenPreProcess preProcessor);
    TokenPreProcess getTokenPreProcessor();
}

public interface DocumentIterator extends Serializable {
    InputStream nextDocument();
    boolean hasNext();
    void reset();
}
```

[Text Processing](./text-processing.md)

### Bag of Words Vectorization

Traditional text vectorization methods including TF-IDF and bag-of-words representations for document classification and information retrieval tasks.

```java { .api }
public interface TextVectorizer {
    // Vectorization interface for text processing
}

public class BagOfWordsVectorizer implements TextVectorizer {
    // Bag of words implementation
}

public class TfidfVectorizer implements TextVectorizer {
    // TF-IDF implementation
}
```

[Bag of Words](./bag-of-words.md)

### Dataset Loading and Iteration

Pre-built dataset loaders and iterators for common NLP datasets and data formats, designed for seamless integration with neural network training pipelines.

```java { .api }
public class CnnSentenceDataSetIterator {
    // CNN sentence dataset iteration
}

public class ReutersNewsGroupsDataSetIterator {
    // Reuters news groups dataset
}

public interface LabeledSentenceProvider {
    // Labeled sentence provision interface
}
```

[Dataset Loading](./dataset-loading.md)

### Model Persistence and Serialization

Utilities for loading and saving Word2Vec models, including Google format compatibility and model serialization across different formats.

```java { .api }
public class WordVectorSerializer {
    public static Word2Vec loadGoogleModel(File modelFile, boolean binary);
    public static Word2Vec loadGoogleModel(File modelFile, boolean binary, boolean lineBreaks);
    public static WordVectors loadGoogleModelNonNormalized(File modelFile, boolean binary, boolean lineBreaks);
    public static void writeWord2VecModel(WordVectors vectors, File file);
    public static void writeTsneWords(Word2Vec vec, List<String> labels, String path, INDArray tsne);
    public static void writeWordVectors(WordVectors vectors, String path);
    public static WordVectors loadTxtVectors(File vectorsFile);
}
```

### Sequence Vectors Framework

Core framework for implementing sequence-based embedding algorithms, providing the foundation for Word2Vec, GloVe, and ParagraphVectors implementations.

```java { .api }
public abstract class SequenceVectors<T extends SequenceElement> implements WordVectors {
    public void fit();
    public double similarity(String word1, String word2);
    public Collection<String> wordsNearest(String word, int n);
    public INDArray getWordVectorMatrix(String word);
    public boolean hasWord(String word);

    public static abstract class Builder<T extends SequenceElement> {
        public Builder<T> minWordFrequency(int minWordFrequency);
        public Builder<T> iterations(int iterations);
        public Builder<T> layerSize(int layerSize);
        public Builder<T> learningRate(double learningRate);
        public Builder<T> windowSize(int windowSize);
        public Builder<T> seed(long seed);
        public Builder<T> workers(int workers);
    }
}
```

### Node2Vec Graph Embeddings

Graph-based node embeddings using random walks to learn vector representations of nodes in networks and graphs.

```java { .api }
public class Node2Vec<V extends SequenceElement, E extends Number> extends SequenceVectors<V> {

    public static class Builder<V extends SequenceElement, E extends Number> extends SequenceVectors.Builder<V> {
        public Builder<V, E> setGraphHuffman(GraphHuffman huffman);
        public Builder<V, E> setWalkLength(int walkLength);
        public Builder<V, E> setNumWalks(int numWalks);
        public Builder<V, E> setP(double p);
        public Builder<V, E> setQ(double q);
        public Node2Vec<V, E> build();
    }
}
```

## Types

```java { .api }
public class VocabWord extends SequenceElement {
    public VocabWord(double wordFrequency, String word);
    public String getWord();
    public boolean isLabel();
    public int getIndex();
}

public interface VocabCache<T extends SequenceElement> {
    boolean containsWord(String word);
    T wordFor(String word);
    int numWords();
    Collection<T> vocabWords();
}

public interface WeightLookupTable<T extends SequenceElement> {
    INDArray getWeights();
    INDArray vector(String word);
}

public class LabelledDocument {
    public String getContent();
    public String getId();
    public List<String> getLabels();
    public List<VocabWord> getReferencedContent();
}

public class LabelsSource {
    public LabelsSource();
    public LabelsSource(List<String> labels);
    public List<String> getLabels();
}

public abstract class SequenceElement implements Serializable {
    public abstract String getLabel();
    public abstract void setIndex(int index);
    public abstract int getIndex();
    public abstract long getElementFrequency();
    public abstract void incrementElementFrequency();
    public abstract void incrementElementFrequency(int by);
}

public interface SequenceIterator<T extends SequenceElement> {
    Sequence<T> nextSequence();
    boolean hasNext();
    void reset();
    SequenceIterator<T> getNewInstance();
}

public interface WordVectors {
    double[] getWordVector(String word);
    INDArray getWordVectorMatrix(String word);
    double similarity(String word1, String word2);
    Collection<String> wordsNearest(String word, int n);
    boolean hasWord(String word);
    VocabCache vocab();
    WeightLookupTable lookupTable();
}
```