Tessl Tile for maven/org.deeplearning4j/deeplearning4j-nlp@0.9.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

bag-of-words.md dataset-loading.md document-embeddings.md glove.md index.md text-processing.md word-embeddings.md

document-embeddings.md docs/

0
# Document Embeddings (ParagraphVectors)
1

2
A document-level embedding implementation (Doc2Vec) that creates vector representations for entire documents, sentences, or paragraphs. These neural embeddings enable document similarity comparison, classification, clustering, and information retrieval tasks.
3

4
## Capabilities
5

6
### ParagraphVectors Model
7

8
Main ParagraphVectors implementation extending Word2Vec with document-level representation learning and inference capabilities.
9

10
```java { .api }
11
/**
12
 * ParagraphVectors (Doc2Vec) implementation extending Word2Vec
13
 * Provides document-level embeddings and classification capabilities
14
 */
15
public class ParagraphVectors extends Word2Vec {
16
    
17
    /**
18
     * Predict label for raw text (deprecated - use predict with document types)
19
     * @param rawText Raw text string to classify
20
     * @return Most probable label string
21
     */
22
    @Deprecated
23
    public String predict(String rawText);
24
    
25
    /**
26
     * Predict label for labeled document
27
     * @param document LabelledDocument instance to classify
28
     * @return Most probable label string
29
     */
30
    public String predict(LabelledDocument document);
31
    
32
    /**
33
     * Predict label for list of vocabulary words
34
     * @param document List of VocabWord instances
35
     * @return Most probable label string
36
     */
37
    public String predict(List<VocabWord> document);
38
    
39
    /**
40
     * Predict multiple labels for labeled document
41
     * @param document LabelledDocument to classify
42
     * @param limit Maximum number of labels to return
43
     * @return Collection of probable labels in descending order
44
     */
45
    public Collection<String> predictSeveral(LabelledDocument document, int limit);
46
    
47
    /**
48
     * Predict multiple labels for raw text
49
     * @param rawText Raw text string to classify
50
     * @param limit Maximum number of labels to return
51
     * @return Collection of probable labels in descending order
52
     */
53
    public Collection<String> predictSeveral(String rawText, int limit);
54
    
55
    /**
56
     * Predict multiple labels for word list
57
     * @param document List of VocabWord instances
58
     * @param limit Maximum number of labels to return
59
     * @return Collection of probable labels in descending order
60
     */
61
    public Collection<String> predictSeveral(List<VocabWord> document, int limit);
62
    
63
    /**
64
     * Calculate inferred vector for text with custom training parameters
65
     * @param text Raw text string to vectorize
66
     * @param learningRate Learning rate for inference training
67
     * @param minLearningRate Minimum learning rate threshold
68
     * @param iterations Number of inference iterations
69
     * @return INDArray vector representation of the text
70
     */
71
    public INDArray inferVector(String text, double learningRate, double minLearningRate, int iterations);
72
    
73
    /**
74
     * Calculate inferred vector for document with custom parameters
75
     * @param document LabelledDocument to vectorize
76
     * @param learningRate Learning rate for inference training
77
     * @param minLearningRate Minimum learning rate threshold
78
     * @param iterations Number of inference iterations
79
     * @return INDArray vector representation of the document
80
     */
81
    public INDArray inferVector(LabelledDocument document, double learningRate, double minLearningRate, int iterations);
82
    
83
    /**
84
     * Calculate inferred vector for word list with custom parameters
85
     * @param document List of VocabWord instances to vectorize
86
     * @param learningRate Learning rate for inference training
87
     * @param minLearningRate Minimum learning rate threshold
88
     * @param iterations Number of inference iterations
89
     * @return INDArray vector representation of the word list
90
     */
91
    public INDArray inferVector(List<VocabWord> document, double learningRate, double minLearningRate, int iterations);
92
    
93
    /**
94
     * Calculate inferred vector for text with default parameters
95
     * @param text Raw text string to vectorize
96
     * @return INDArray vector representation using default parameters
97
     */
98
    public INDArray inferVector(String text);
99
    
100
    /**
101
     * Calculate inferred vector for document with default parameters
102
     * @param document LabelledDocument to vectorize
103
     * @return INDArray vector representation using default parameters
104
     */
105
    public INDArray inferVector(LabelledDocument document);
106
    
107
    /**
108
     * Calculate inferred vector for word list with default parameters
109
     * @param document List of VocabWord instances to vectorize
110
     * @return INDArray vector representation using default parameters
111
     */
112
    public INDArray inferVector(List<VocabWord> document);
113
    
114
    /**
115
     * Batched inference for labeled document returning Future with ID and vector
116
     * @param document LabelledDocument with ID field defined
117
     * @return Future containing Pair of document ID and inferred vector
118
     */
119
    public Future<Pair<String, INDArray>> inferVectorBatched(LabelledDocument document);
120
    
121
    /**
122
     * Batched inference for text string returning Future with vector
123
     * @param document Raw text string to vectorize
124
     * @return Future containing inferred vector
125
     */
126
    public Future<INDArray> inferVectorBatched(String document);
127
    
128
    /**
129
     * Batched inference for multiple text strings
130
     * @param documents List of text strings to vectorize
131
     * @return List of INDArray vectors in same order as input
132
     */
133
    public List<INDArray> inferVectorBatched(List<String> documents);
134
    
135
    /**
136
     * Find top N labels nearest to labeled document
137
     * @param document LabelledDocument to compare
138
     * @param topN Number of nearest labels to return
139
     * @return Collection of nearest label strings
140
     */
141
    public Collection<String> nearestLabels(LabelledDocument document, int topN);
142
    
143
    /**
144
     * Find top N labels nearest to raw text
145
     * @param rawText Raw text string to compare
146
     * @param topN Number of nearest labels to return
147
     * @return Collection of nearest label strings
148
     */
149
    public Collection<String> nearestLabels(String rawText, int topN);
150
    
151
    /**
152
     * Find top N labels nearest to vocabulary word collection
153
     * @param document Collection of VocabWord instances
154
     * @param topN Number of nearest labels to return
155
     * @return Collection of nearest label strings
156
     */
157
    public Collection<String> nearestLabels(Collection<VocabWord> document, int topN);
158
    
159
    /**
160
     * Find top N labels nearest to feature vector
161
     * @param labelVector INDArray feature vector
162
     * @param topN Number of nearest labels to return
163
     * @return Collection of nearest label strings
164
     */
165
    public Collection<String> nearestLabels(INDArray labelVector, int topN);
166
    
167
    /**
168
     * Calculate similarity between document and specific label
169
     * @param document LabelledDocument to compare
170
     * @param label Target label string
171
     * @return Similarity score between document and label
172
     */
173
    public double similarityToLabel(LabelledDocument document, String label);
174
    
175
    /**
176
     * Calculate similarity between word list and specific label
177
     * @param document List of VocabWord instances
178
     * @param label Target label string
179
     * @return Similarity score between document and label
180
     */
181
    public double similarityToLabel(List<VocabWord> document, String label);
182
    
183
    /**
184
     * Calculate similarity between raw text and specific label (deprecated)
185
     * @param rawText Raw text string
186
     * @param label Target label string  
187
     * @return Similarity score between text and label
188
     */
189
    @Deprecated
190
    public double similarityToLabel(String rawText, String label);
191
    
192
    /**
193
     * Extract label vectors from vocabulary for nearest neighbor operations
194
     * Populates internal labels matrix for efficient similarity calculations
195
     */
196
    public void extractLabels();
197
    
198
    /**
199
     * Set sequence iterator for pre-tokenized training data
200
     * @param iterator SequenceIterator providing tokenized sequences
201
     */
202
    public void setSequenceIterator(SequenceIterator<VocabWord> iterator);
203
}
204
```
205

206
### ParagraphVectors Builder
207

208
Extended builder for ParagraphVectors with document-specific configuration options and label handling.
209

210
```java { .api }
211
/**
212
 * Builder for ParagraphVectors configuration extending Word2Vec.Builder
213
 */
214
public static class ParagraphVectors.Builder extends Word2Vec.Builder {
215
    
216
    /**
217
     * Build configured ParagraphVectors instance
218
     * @return Configured ParagraphVectors model ready for training
219
     */
220
    public ParagraphVectors build();
221
    
222
    /**
223
     * Use pre-built WordVectors model for ParagraphVectors initialization
224
     * @param vec Existing WordVectors model (Word2Vec or GloVe)
225
     * @return Builder instance for method chaining
226
     */
227
    public Builder useExistingWordVectors(WordVectors vec);
228
    
229
    /**
230
     * Define whether word representations should be trained with documents
231
     * @param trainElements Whether to train word vectors alongside document vectors
232
     * @return Builder instance for method chaining
233
     */
234
    public Builder trainWordVectors(boolean trainElements);
235
    
236
    /**
237
     * Attach pre-defined labels source to ParagraphVectors
238
     * @param source LabelsSource instance containing available labels
239
     * @return Builder instance for method chaining  
240
     */
241
    public Builder labelsSource(LabelsSource source);
242
    
243
    /**
244
     * Build LabelsSource from labels list (deprecated due to order synchronization issues)
245
     * @param labels List of label strings
246
     * @return Builder instance for method chaining
247
     */
248
    @Deprecated
249
    public Builder labels(List<String> labels);
250
    
251
    /**
252
     * Set label-aware document iterator for training
253
     * @param iterator LabelAwareDocumentIterator with labeled documents
254
     * @return Builder instance for method chaining
255
     */
256
    public Builder iterate(LabelAwareDocumentIterator iterator);
257
    
258
    /**
259
     * Set label-aware sentence iterator for training  
260
     * @param iterator LabelAwareSentenceIterator with labeled sentences
261
     * @return Builder instance for method chaining
262
     */
263
    public Builder iterate(LabelAwareSentenceIterator iterator);
264
    
265
    /**
266
     * Set general label-aware iterator for training
267
     * @param iterator LabelAwareIterator providing labeled training data
268
     * @return Builder instance for method chaining
269
     */
270
    public Builder iterate(LabelAwareIterator iterator);
271
    
272
    /**
273
     * Set document iterator for training (unlabeled documents)
274
     * @param iterator DocumentIterator providing training documents
275
     * @return Builder instance for method chaining
276
     */
277
    public Builder iterate(DocumentIterator iterator);
278
    
279
    /**
280
     * Set sentence iterator for training (unlabeled sentences)
281
     * @param iterator SentenceIterator providing training sentences
282
     * @return Builder instance for method chaining
283
     */
284
    public Builder iterate(SentenceIterator iterator);
285
    
286
    // Inherits all Word2Vec.Builder methods with appropriate return types
287
}
288
```
289

290
**Usage Examples:**
291

292
```java
293
import org.deeplearning4j.models.paragraphvectors.ParagraphVectors;
294
import org.deeplearning4j.text.documentiterator.*;
295
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
296

297
// Basic document classification training
298
Collection<LabelledDocument> labeledDocs = Arrays.asList(
299
    new LabelledDocument("This is a positive review", "positive"),
300
    new LabelledDocument("This is a negative review", "negative"),
301
    new LabelledDocument("Great product, highly recommend", "positive")
302
);
303

304
LabelAwareIterator iterator = new BasicLabelAwareIterator(labeledDocs);
305

306
ParagraphVectors paragraphVectors = new ParagraphVectors.Builder()
307
    .minWordFrequency(1)
308
    .iterations(5)
309
    .epochs(10)
310
    .layerSize(100)
311
    .learningRate(0.025)
312
    .windowSize(5)
313
    .iterate(iterator)
314
    .tokenizerFactory(new DefaultTokenizerFactory())
315
    .trainWordVectors(true)
316
    .build();
317

318
paragraphVectors.fit();
319

320
// Document inference and classification
321
String newDocument = "This product is amazing";
322
INDArray docVector = paragraphVectors.inferVector(newDocument);
323
String predictedLabel = paragraphVectors.predict(newDocument);
324
Collection<String> topLabels = paragraphVectors.predictSeveral(newDocument, 3);
325

326
System.out.println("Predicted label: " + predictedLabel);
327
System.out.println("Top labels: " + topLabels);
328

329
// Document similarity using inferred vectors
330
String doc1 = "Great product quality";
331
String doc2 = "Excellent item, very satisfied";
332

333
INDArray vec1 = paragraphVectors.inferVector(doc1);
334
INDArray vec2 = paragraphVectors.inferVector(doc2);
335

336
// Calculate cosine similarity
337
double similarity = Transforms.cosineSim(vec1, vec2);
338
System.out.println("Document similarity: " + similarity);
339

340
// Batch inference for multiple documents
341
List<String> documents = Arrays.asList(
342
    "First document text",
343
    "Second document text", 
344
    "Third document text"
345
);
346

347
List<INDArray> vectors = paragraphVectors.inferVectorBatched(documents);
348
System.out.println("Processed " + vectors.size() + " documents");
349

350
// Find nearest labels to a document
351
Collection<String> nearestLabels = paragraphVectors.nearestLabels(newDocument, 5);
352
System.out.println("Nearest labels: " + nearestLabels);
353

354
// Advanced configuration with existing word vectors
355
Word2Vec existingWord2Vec = new Word2Vec.Builder()
356
    .layerSize(300)
357
    .windowSize(10)
358
    // ... other configuration
359
    .build();
360
existingWord2Vec.fit(); // Train on large corpus
361

362
ParagraphVectors advancedPV = new ParagraphVectors.Builder()
363
    .useExistingWordVectors(existingWord2Vec)
364
    .trainWordVectors(false) // Don't retrain word vectors
365
    .layerSize(300)
366
    .iterate(labeledDocumentIterator)
367
    .tokenizerFactory(new DefaultTokenizerFactory())
368
    .build();
369

370
advancedPV.fit();
371
```
372

373
### Document Types
374

375
Supporting classes for labeled document handling and training data preparation.
376

377
```java { .api }
378
/**
379
 * Document with label information for supervised training
380
 */
381
public class LabelledDocument {
382
    
383
    /**
384
     * Get document content as string
385
     * @return Document text content
386
     */
387
    public String getContent();
388
    
389
    /**
390
     * Get document identifier
391
     * @return String identifier for the document
392
     */
393
    public String getId();
394
    
395
    /**
396
     * Get document labels
397
     * @return List of label strings associated with document
398
     */
399
    public List<String> getLabels();
400
    
401
    /**
402
     * Get referenced content as vocabulary words
403
     * @return List of VocabWord instances from document
404
     */
405
    public List<VocabWord> getReferencedContent();
406
}
407

408
/**
409
 * Source of labels for document classification
410
 */
411
public class LabelsSource {
412
    
413
    /**
414
     * Create empty labels source
415
     */
416
    public LabelsSource();
417
    
418
    /**
419
     * Create labels source with predefined labels
420
     * @param labels List of available label strings
421
     */
422
    public LabelsSource(List<String> labels);
423
    
424
    /**
425
     * Get available labels
426
     * @return List of label strings
427
     */
428
    public List<String> getLabels();
429
}
430
```

Version

Tile

Files

document-embeddings.md docs/

document-embeddings.md docs/