# Word Embeddings (Word2Vec)

Dense vector representations of words that capture semantic and syntactic relationships in high-dimensional space. The Word2Vec implementation supports both skip-gram and CBOW algorithms with extensive configuration options for production-scale text processing.

## Capabilities

### Word2Vec Model

Main Word2Vec implementation based on the SequenceVectors framework, providing neural word embeddings with configurable architecture and training parameters.

```java { .api }
/**
 * Word2Vec implementation based on SequenceVectors
 */
public class Word2Vec extends SequenceVectors<VocabWord> {

    /**
     * Define TokenizerFactory instance for model building
     * @param tokenizerFactory TokenizerFactory instance for text tokenization
     */
    public void setTokenizerFactory(TokenizerFactory tokenizerFactory);

    /**
     * Define SentenceIterator as training corpus source
     * @param iterator SentenceIterator instance for sentence-level text input
     */
    public void setSentenceIterator(SentenceIterator iterator);

    /**
     * Define SequenceIterator for pre-tokenized sequences
     * @param iterator SequenceIterator for already tokenized Sequence<VocabWord> input
     */
    public void setSequenceIterator(SequenceIterator<VocabWord> iterator);
}
```
### Word2Vec Builder

Comprehensive builder pattern for Word2Vec configuration with extensive training and architecture parameters.

```java { .api }
/**
 * Builder for Word2Vec configuration and construction
 * (nested type, referenced by callers as {@code Word2Vec.Builder})
 */
public static class Builder extends SequenceVectors.Builder<VocabWord> {

    /**
     * Build the configured Word2Vec instance
     * @return Configured Word2Vec model ready for training
     */
    public Word2Vec build();

    /**
     * Set document iterator for training data
     * @param iterator DocumentIterator providing training documents
     * @return Builder instance for method chaining
     */
    public Builder iterate(DocumentIterator iterator);

    /**
     * Set sentence iterator for training data
     * @param iterator SentenceIterator providing training sentences
     * @return Builder instance for method chaining
     */
    public Builder iterate(SentenceIterator iterator);

    /**
     * Set sequence iterator for pre-tokenized training data
     * @param iterator SequenceIterator providing tokenized sequences
     * @return Builder instance for method chaining
     */
    public Builder iterate(SequenceIterator<VocabWord> iterator);

    /**
     * Set label-aware iterator for supervised training
     * @param iterator LabelAwareIterator providing labeled training data
     * @return Builder instance for method chaining
     */
    public Builder iterate(LabelAwareIterator iterator);

    /**
     * Define TokenizerFactory for string tokenization during training
     * @param tokenizerFactory TokenizerFactory for text tokenization
     * @return Builder instance for method chaining
     */
    public Builder tokenizerFactory(TokenizerFactory tokenizerFactory);

    /**
     * Set mini-batch size for training
     * @param batchSize Number of sequences per mini-batch
     * @return Builder instance for method chaining
     */
    public Builder batchSize(int batchSize);

    /**
     * Set number of iterations per mini-batch during training
     * @param iterations Number of iterations per mini-batch
     * @return Builder instance for method chaining
     */
    public Builder iterations(int iterations);

    /**
     * Set number of epochs (full corpus iterations) for training
     * @param numEpochs Number of training epochs
     * @return Builder instance for method chaining
     */
    public Builder epochs(int numEpochs);

    /**
     * Set number of dimensions for output vectors
     * @param layerSize Vector dimensionality (typically 100-300)
     * @return Builder instance for method chaining
     */
    public Builder layerSize(int layerSize);

    /**
     * Set initial learning rate for model training
     * @param learningRate Initial learning rate (typically 0.025)
     * @return Builder instance for method chaining
     */
    public Builder learningRate(double learningRate);

    /**
     * Set minimum word frequency threshold
     * @param minWordFrequency Words below this frequency are removed
     * @return Builder instance for method chaining
     */
    public Builder minWordFrequency(int minWordFrequency);

    /**
     * Set minimum learning rate value for training
     * @param minLearningRate Minimum learning rate threshold
     * @return Builder instance for method chaining
     */
    public Builder minLearningRate(double minLearningRate);

    /**
     * Set whether to reset model before building
     * @param reallyReset Whether to wipe model prior to building
     * @return Builder instance for method chaining
     */
    public Builder resetModel(boolean reallyReset);

    /**
     * Set vocabulary size limit during construction
     * @param limit Maximum vocabulary size (0 means no limit)
     * @return Builder instance for method chaining
     */
    public Builder limitVocabularySize(int limit);

    /**
     * Define external VocabCache to be used
     * @param vocabCache External vocabulary cache instance
     * @return Builder instance for method chaining
     */
    public Builder vocabCache(VocabCache<VocabWord> vocabCache);

    /**
     * Define external WeightLookupTable to be used
     * @param lookupTable External weight lookup table instance
     * @return Builder instance for method chaining
     */
    public Builder lookupTable(WeightLookupTable<VocabWord> lookupTable);

    /**
     * Set subsampling parameter for frequent words
     * @param sampling Subsampling rate (>0 to enable, 0 to disable)
     * @return Builder instance for method chaining
     */
    public Builder sampling(double sampling);

    /**
     * Enable or disable adaptive gradients (AdaGrad)
     * @param reallyUse Whether to use adaptive gradients
     * @return Builder instance for method chaining
     */
    public Builder useAdaGrad(boolean reallyUse);

    /**
     * Set negative sampling parameter
     * @param negative Negative sampling rate (>0 to enable, 0 to disable)
     * @return Builder instance for method chaining
     */
    public Builder negativeSample(double negative);

    /**
     * Set stop words to ignore during training
     * @param stopList List of stop words to exclude
     * @return Builder instance for method chaining
     */
    public Builder stopWords(List<String> stopList);

    /**
     * Set stop words collection to ignore during training
     * @param stopList Collection of VocabWord stop words to exclude
     * @return Builder instance for method chaining
     */
    public Builder stopWords(Collection<VocabWord> stopList);

    /**
     * Set context window size for training
     * @param windowSize Size of context window around target word
     * @return Builder instance for method chaining
     */
    public Builder windowSize(int windowSize);

    /**
     * Set random seed for reproducible results
     * @param randomSeed Random seed for initialization
     * @return Builder instance for method chaining
     */
    public Builder seed(long randomSeed);

    /**
     * Set maximum number of concurrent worker threads
     * @param numWorkers Number of worker threads for parallel training
     * @return Builder instance for method chaining
     */
    public Builder workers(int numWorkers);

    /**
     * Set model utilities for similarity and nearest neighbor operations
     * @param modelUtils ModelUtils instance for vector operations
     * @return Builder instance for method chaining
     */
    public Builder modelUtils(ModelUtils<VocabWord> modelUtils);

    /**
     * Enable variable window sizes for training
     * @param windows Array of window sizes to use randomly
     * @return Builder instance for method chaining
     */
    public Builder useVariableWindow(int... windows);

    /**
     * Set unknown element for handling out-of-vocabulary words
     * @param element VocabWord element to use for unknown words
     * @return Builder instance for method chaining
     */
    public Builder unknownElement(VocabWord element);

    /**
     * Enable or disable unknown word handling
     * @param reallyUse Whether to use UNK token for unknown words
     * @return Builder instance for method chaining
     */
    public Builder useUnknown(boolean reallyUse);

    /**
     * Set event listeners for training progress
     * @param vectorsListeners Collection of VectorsListener instances
     * @return Builder instance for method chaining
     */
    public Builder setVectorsListeners(Collection<VectorsListener<VocabWord>> vectorsListeners);

    /**
     * Set elements learning algorithm by name
     * @param algorithm Name of learning algorithm to use
     * @return Builder instance for method chaining
     */
    public Builder elementsLearningAlgorithm(String algorithm);

    /**
     * Set elements learning algorithm instance
     * @param algorithm ElementsLearningAlgorithm instance
     * @return Builder instance for method chaining
     */
    public Builder elementsLearningAlgorithm(ElementsLearningAlgorithm<VocabWord> algorithm);

    /**
     * Enable or disable parallel tokenization
     * @param allow Whether to allow parallel tokenization (default: true)
     * @return Builder instance for method chaining
     */
    public Builder allowParallelTokenization(boolean allow);

    /**
     * Enable or disable periodic vocabulary truncation
     * @param reallyEnable Whether to enable vocabulary scavenging
     * @return Builder instance for method chaining
     */
    public Builder enableScavenger(boolean reallyEnable);

    /**
     * Enable or disable hierarchical softmax
     * @param reallyUse Whether to use hierarchical softmax
     * @return Builder instance for method chaining
     */
    public Builder useHierarchicSoftmax(boolean reallyUse);

    /**
     * Enable or disable precise weight initialization
     * @param reallyUse Whether to use precise weight initialization
     * @return Builder instance for method chaining
     */
    public Builder usePreciseWeightInit(boolean reallyUse);
}
```
298
299
**Usage Examples:**
300
301
```java
302
import org.deeplearning4j.models.word2vec.Word2Vec;
303
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator;
304
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
305
306
// Basic Word2Vec training
307
Collection<String> sentences = Arrays.asList(
308
"The cat sat on the mat",
309
"The dog ran in the park",
310
"Natural language processing is fascinating"
311
);
312
313
Word2Vec word2Vec = new Word2Vec.Builder()
314
.minWordFrequency(1)
315
.iterations(5)
316
.layerSize(100)
317
.seed(42)
318
.windowSize(5)
319
.iterate(new CollectionSentenceIterator(sentences))
320
.tokenizerFactory(new DefaultTokenizerFactory())
321
.build();
322
323
word2Vec.fit();
324
325
// Use trained model
326
double similarity = word2Vec.similarity("cat", "dog");
327
Collection<String> nearest = word2Vec.wordsNearest("cat", 5);
328
329
// Advanced configuration with custom parameters
330
Word2Vec advancedModel = new Word2Vec.Builder()
331
.minWordFrequency(5)
332
.iterations(10)
333
.epochs(3)
334
.layerSize(300)
335
.learningRate(0.025)
336
.minLearningRate(0.0001)
337
.windowSize(8)
338
.negativeSample(5.0)
339
.useAdaGrad(false)
340
.workers(Runtime.getRuntime().availableProcessors())
341
.seed(123456L)
342
.iterate(new CollectionSentenceIterator(largeCorpus))
343
.tokenizerFactory(new DefaultTokenizerFactory())
344
.build();
345
346
advancedModel.fit();
347
```
### Vocabulary Word Representation

Word representation class that extends SequenceElement with word-specific functionality for Word2Vec training and inference.

```java { .api }
/**
 * Vocabulary word representation for Word2Vec models
 */
public class VocabWord extends SequenceElement {

    /**
     * Create vocabulary word with frequency and word string
     * @param wordFrequency Frequency of word in training corpus
     * @param word String representation of the word
     */
    public VocabWord(double wordFrequency, String word);

    /**
     * Get the word string
     * @return String representation of the word
     */
    public String getWord();

    /**
     * Check if this word is a label
     * @return true if word represents a label, false otherwise
     */
    public boolean isLabel();

    /**
     * Get the index of this word in vocabulary
     * @return Integer index in vocabulary
     */
    public int getIndex();
}
```