# Word Embeddings (Word2Vec)

Dense vector representations of words that capture semantic and syntactic relationships in high-dimensional space. The Word2Vec implementation supports both skip-gram and CBOW algorithms with extensive configuration options for production-scale text processing.

## Capabilities

### Word2Vec Model

Main Word2Vec implementation based on the SequenceVectors framework, providing neural word embeddings with configurable architecture and training parameters.

```java { .api }
/**
 * Word2Vec implementation based on SequenceVectors
 */
public class Word2Vec extends SequenceVectors<VocabWord> {

    /**
     * Define TokenizerFactory instance for model building
     * @param tokenizerFactory TokenizerFactory instance for text tokenization
     */
    public void setTokenizerFactory(TokenizerFactory tokenizerFactory);

    /**
     * Define SentenceIterator as training corpus source
     * @param iterator SentenceIterator instance for sentence-level text input
     */
    public void setSentenceIterator(SentenceIterator iterator);

    /**
     * Define SequenceIterator for pre-tokenized sequences
     * @param iterator SequenceIterator for already tokenized Sequence<VocabWord> input
     */
    public void setSequenceIterator(SequenceIterator<VocabWord> iterator);
}
```
### Word2Vec Builder

Comprehensive builder pattern for Word2Vec configuration with extensive training and architecture parameters.

```java { .api }
/**
 * Builder for Word2Vec configuration and construction
 * (nested type, referenced by callers as {@code Word2Vec.Builder})
 */
public static class Builder extends SequenceVectors.Builder<VocabWord> {

    /**
     * Build the configured Word2Vec instance
     * @return Configured Word2Vec model ready for training
     */
    public Word2Vec build();

    /**
     * Set document iterator for training data
     * @param iterator DocumentIterator providing training documents
     * @return Builder instance for method chaining
     */
    public Builder iterate(DocumentIterator iterator);

    /**
     * Set sentence iterator for training data
     * @param iterator SentenceIterator providing training sentences
     * @return Builder instance for method chaining
     */
    public Builder iterate(SentenceIterator iterator);

    /**
     * Set sequence iterator for pre-tokenized training data
     * @param iterator SequenceIterator providing tokenized sequences
     * @return Builder instance for method chaining
     */
    public Builder iterate(SequenceIterator<VocabWord> iterator);

    /**
     * Set label-aware iterator for supervised training
     * @param iterator LabelAwareIterator providing labeled training data
     * @return Builder instance for method chaining
     */
    public Builder iterate(LabelAwareIterator iterator);

    /**
     * Define TokenizerFactory for string tokenization during training
     * @param tokenizerFactory TokenizerFactory for text tokenization
     * @return Builder instance for method chaining
     */
    public Builder tokenizerFactory(TokenizerFactory tokenizerFactory);

    /**
     * Set mini-batch size for training
     * @param batchSize Number of sequences per mini-batch
     * @return Builder instance for method chaining
     */
    public Builder batchSize(int batchSize);

    /**
     * Set number of iterations per mini-batch during training
     * @param iterations Number of iterations per mini-batch
     * @return Builder instance for method chaining
     */
    public Builder iterations(int iterations);

    /**
     * Set number of epochs (full corpus iterations) for training
     * @param numEpochs Number of training epochs
     * @return Builder instance for method chaining
     */
    public Builder epochs(int numEpochs);

    /**
     * Set number of dimensions for output vectors
     * @param layerSize Vector dimensionality (typically 100-300)
     * @return Builder instance for method chaining
     */
    public Builder layerSize(int layerSize);

    /**
     * Set initial learning rate for model training
     * @param learningRate Initial learning rate (typically 0.025)
     * @return Builder instance for method chaining
     */
    public Builder learningRate(double learningRate);

    /**
     * Set minimum word frequency threshold
     * @param minWordFrequency Words below this frequency are removed
     * @return Builder instance for method chaining
     */
    public Builder minWordFrequency(int minWordFrequency);

    /**
     * Set minimum learning rate value for training
     * @param minLearningRate Minimum learning rate threshold
     * @return Builder instance for method chaining
     */
    public Builder minLearningRate(double minLearningRate);

    /**
     * Set whether to reset model before building
     * @param reallyReset Whether to wipe model prior to building
     * @return Builder instance for method chaining
     */
    public Builder resetModel(boolean reallyReset);

    /**
     * Set vocabulary size limit during construction
     * @param limit Maximum vocabulary size (0 means no limit)
     * @return Builder instance for method chaining
     */
    public Builder limitVocabularySize(int limit);

    /**
     * Define external VocabCache to be used
     * @param vocabCache External vocabulary cache instance
     * @return Builder instance for method chaining
     */
    public Builder vocabCache(VocabCache<VocabWord> vocabCache);

    /**
     * Define external WeightLookupTable to be used
     * @param lookupTable External weight lookup table instance
     * @return Builder instance for method chaining
     */
    public Builder lookupTable(WeightLookupTable<VocabWord> lookupTable);

    /**
     * Set subsampling parameter for frequent words
     * @param sampling Subsampling rate (>0 to enable, 0 to disable)
     * @return Builder instance for method chaining
     */
    public Builder sampling(double sampling);

    /**
     * Enable or disable adaptive gradients (AdaGrad)
     * @param reallyUse Whether to use adaptive gradients
     * @return Builder instance for method chaining
     */
    public Builder useAdaGrad(boolean reallyUse);

    /**
     * Set negative sampling parameter
     * @param negative Negative sampling rate (>0 to enable, 0 to disable)
     * @return Builder instance for method chaining
     */
    public Builder negativeSample(double negative);

    /**
     * Set stop words to ignore during training
     * @param stopList List of stop words to exclude
     * @return Builder instance for method chaining
     */
    public Builder stopWords(List<String> stopList);

    /**
     * Set stop words collection to ignore during training
     * @param stopList Collection of VocabWord stop words to exclude
     * @return Builder instance for method chaining
     */
    public Builder stopWords(Collection<VocabWord> stopList);

    /**
     * Set context window size for training
     * @param windowSize Size of context window around target word
     * @return Builder instance for method chaining
     */
    public Builder windowSize(int windowSize);

    /**
     * Set random seed for reproducible results
     * @param randomSeed Random seed for initialization
     * @return Builder instance for method chaining
     */
    public Builder seed(long randomSeed);

    /**
     * Set maximum number of concurrent worker threads
     * @param numWorkers Number of worker threads for parallel training
     * @return Builder instance for method chaining
     */
    public Builder workers(int numWorkers);

    /**
     * Set model utilities for similarity and nearest neighbor operations
     * @param modelUtils ModelUtils instance for vector operations
     * @return Builder instance for method chaining
     */
    public Builder modelUtils(ModelUtils<VocabWord> modelUtils);

    /**
     * Enable variable window sizes for training
     * @param windows Array of window sizes to use randomly
     * @return Builder instance for method chaining
     */
    public Builder useVariableWindow(int... windows);

    /**
     * Set unknown element for handling out-of-vocabulary words
     * @param element VocabWord element to use for unknown words
     * @return Builder instance for method chaining
     */
    public Builder unknownElement(VocabWord element);

    /**
     * Enable or disable unknown word handling
     * @param reallyUse Whether to use UNK token for unknown words
     * @return Builder instance for method chaining
     */
    public Builder useUnknown(boolean reallyUse);

    /**
     * Set event listeners for training progress
     * @param vectorsListeners Collection of VectorsListener instances
     * @return Builder instance for method chaining
     */
    public Builder setVectorsListeners(Collection<VectorsListener<VocabWord>> vectorsListeners);

    /**
     * Set elements learning algorithm by name
     * @param algorithm Name of learning algorithm to use
     * @return Builder instance for method chaining
     */
    public Builder elementsLearningAlgorithm(String algorithm);

    /**
     * Set elements learning algorithm instance
     * @param algorithm ElementsLearningAlgorithm instance
     * @return Builder instance for method chaining
     */
    public Builder elementsLearningAlgorithm(ElementsLearningAlgorithm<VocabWord> algorithm);

    /**
     * Enable or disable parallel tokenization
     * @param allow Whether to allow parallel tokenization (default: true)
     * @return Builder instance for method chaining
     */
    public Builder allowParallelTokenization(boolean allow);

    /**
     * Enable or disable periodic vocabulary truncation
     * @param reallyEnable Whether to enable vocabulary scavenging
     * @return Builder instance for method chaining
     */
    public Builder enableScavenger(boolean reallyEnable);

    /**
     * Enable or disable hierarchical softmax
     * @param reallyUse Whether to use hierarchical softmax
     * @return Builder instance for method chaining
     */
    public Builder useHierarchicSoftmax(boolean reallyUse);

    /**
     * Enable or disable precise weight initialization
     * @param reallyUse Whether to use precise weight initialization
     * @return Builder instance for method chaining
     */
    public Builder usePreciseWeightInit(boolean reallyUse);
}
```
298
299
**Usage Examples:**
300
301
```java
302
import org.deeplearning4j.models.word2vec.Word2Vec;
303
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator;
304
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
305
306
// Basic Word2Vec training
307
Collection<String> sentences = Arrays.asList(
308
"The cat sat on the mat",
309
"The dog ran in the park",
310
"Natural language processing is fascinating"
311
);
312
313
Word2Vec word2Vec = new Word2Vec.Builder()
314
.minWordFrequency(1)
315
.iterations(5)
316
.layerSize(100)
317
.seed(42)
318
.windowSize(5)
319
.iterate(new CollectionSentenceIterator(sentences))
320
.tokenizerFactory(new DefaultTokenizerFactory())
321
.build();
322
323
word2Vec.fit();
324
325
// Use trained model
326
double similarity = word2Vec.similarity("cat", "dog");
327
Collection<String> nearest = word2Vec.wordsNearest("cat", 5);
328
329
// Advanced configuration with custom parameters
330
Word2Vec advancedModel = new Word2Vec.Builder()
331
.minWordFrequency(5)
332
.iterations(10)
333
.epochs(3)
334
.layerSize(300)
335
.learningRate(0.025)
336
.minLearningRate(0.0001)
337
.windowSize(8)
338
.negativeSample(5.0)
339
.useAdaGrad(false)
340
.workers(Runtime.getRuntime().availableProcessors())
341
.seed(123456L)
342
.iterate(new CollectionSentenceIterator(largeCorpus))
343
.tokenizerFactory(new DefaultTokenizerFactory())
344
.build();
345
346
advancedModel.fit();
347
```
### Vocabulary Word Representation

Word representation class that extends SequenceElement with word-specific functionality for Word2Vec training and inference.

```java { .api }
/**
 * Vocabulary word representation for Word2Vec models
 */
public class VocabWord extends SequenceElement {

    /**
     * Create vocabulary word with frequency and word string
     * @param wordFrequency Frequency of word in training corpus
     * @param word String representation of the word
     */
    public VocabWord(double wordFrequency, String word);

    /**
     * Get the word string
     * @return String representation of the word
     */
    public String getWord();

    /**
     * Check if this word is a label
     * @return true if word represents a label, false otherwise
     */
    public boolean isLabel();

    /**
     * Get the index of this word in vocabulary
     * @return Integer index in vocabulary
     */
    public int getIndex();
}
```