0
# Document Embeddings (ParagraphVectors)
1
2
ParagraphVectors is a document-level embedding implementation (Doc2Vec) that learns vector representations for entire documents, paragraphs, or sentences. These embeddings support document similarity comparison, classification, clustering, and information retrieval.
3
4
## Capabilities
5
6
### ParagraphVectors Model
7
8
The main `ParagraphVectors` class extends `Word2Vec` with document-level representation learning, label prediction, and vector inference for new text.
9
10
```java { .api }
11
/**
12
* ParagraphVectors (Doc2Vec) implementation extending Word2Vec
13
* Provides document-level embeddings and classification capabilities
* <p>
* The methods listed below fall into four groups: label prediction (predict, predictSeveral),
* vector inference (inferVector, inferVectorBatched), label lookup (nearestLabels,
* similarityToLabel) and setup helpers (extractLabels, setSequenceIterator).
14
*/
15
public class ParagraphVectors extends Word2Vec {
16
17
/**
18
* Predict label for raw text
* (deprecated - use the predict overloads taking a LabelledDocument or a {@code List<VocabWord>})
19
* @param rawText Raw text string to classify
20
* @return Most probable label string
21
*/
22
@Deprecated
23
public String predict(String rawText);
24
25
/**
26
* Predict label for labeled document
27
* @param document LabelledDocument instance to classify
28
* @return Most probable label string
29
*/
30
public String predict(LabelledDocument document);
31
32
/**
33
* Predict label for list of vocabulary words
34
* @param document List of VocabWord instances
35
* @return Most probable label string
36
*/
37
public String predict(List<VocabWord> document);
38
39
/**
40
* Predict multiple labels for labeled document
41
* @param document LabelledDocument to classify
42
* @param limit Maximum number of labels to return
43
* @return Collection of probable labels in descending order
44
*/
45
public Collection<String> predictSeveral(LabelledDocument document, int limit);
46
47
/**
48
* Predict multiple labels for raw text
49
* @param rawText Raw text string to classify
50
* @param limit Maximum number of labels to return
51
* @return Collection of probable labels in descending order
52
*/
53
public Collection<String> predictSeveral(String rawText, int limit);
54
55
/**
56
* Predict multiple labels for word list
57
* @param document List of VocabWord instances
58
* @param limit Maximum number of labels to return
59
* @return Collection of probable labels in descending order
60
*/
61
public Collection<String> predictSeveral(List<VocabWord> document, int limit);
62
63
/**
64
* Calculate inferred vector for text with custom training parameters
65
* @param text Raw text string to vectorize
66
* @param learningRate Learning rate for inference training
67
* @param minLearningRate Minimum learning rate threshold
68
* @param iterations Number of inference iterations
69
* @return INDArray vector representation of the text
70
*/
71
public INDArray inferVector(String text, double learningRate, double minLearningRate, int iterations);
72
73
/**
74
* Calculate inferred vector for document with custom parameters
75
* @param document LabelledDocument to vectorize
76
* @param learningRate Learning rate for inference training
77
* @param minLearningRate Minimum learning rate threshold
78
* @param iterations Number of inference iterations
79
* @return INDArray vector representation of the document
80
*/
81
public INDArray inferVector(LabelledDocument document, double learningRate, double minLearningRate, int iterations);
82
83
/**
84
* Calculate inferred vector for word list with custom parameters
85
* @param document List of VocabWord instances to vectorize
86
* @param learningRate Learning rate for inference training
87
* @param minLearningRate Minimum learning rate threshold
88
* @param iterations Number of inference iterations
89
* @return INDArray vector representation of the word list
90
*/
91
public INDArray inferVector(List<VocabWord> document, double learningRate, double minLearningRate, int iterations);
92
93
/**
94
* Calculate inferred vector for text with default parameters
* NOTE(review): the default learning rate and iteration count are not shown in this listing -
* presumably they come from the model configuration; confirm against the implementation.
95
* @param text Raw text string to vectorize
96
* @return INDArray vector representation using default parameters
97
*/
98
public INDArray inferVector(String text);
99
100
/**
101
* Calculate inferred vector for document with default parameters
102
* @param document LabelledDocument to vectorize
103
* @return INDArray vector representation using default parameters
104
*/
105
public INDArray inferVector(LabelledDocument document);
106
107
/**
108
* Calculate inferred vector for word list with default parameters
109
* @param document List of VocabWord instances to vectorize
110
* @return INDArray vector representation using default parameters
111
*/
112
public INDArray inferVector(List<VocabWord> document);
113
114
/**
115
* Batched inference for labeled document returning Future with ID and vector
116
* @param document LabelledDocument with ID field defined
117
* @return Future containing Pair of document ID and inferred vector
118
*/
119
public Future<Pair<String, INDArray>> inferVectorBatched(LabelledDocument document);
120
121
/**
122
* Batched inference for text string returning Future with vector
123
* @param document Raw text string to vectorize
124
* @return Future containing inferred vector
125
*/
126
public Future<INDArray> inferVectorBatched(String document);
127
128
/**
129
* Batched inference for multiple text strings
130
* @param documents List of text strings to vectorize
131
* @return List of INDArray vectors in same order as input
132
*/
133
public List<INDArray> inferVectorBatched(List<String> documents);
134
135
/**
136
* Find top N labels nearest to labeled document
137
* @param document LabelledDocument to compare
138
* @param topN Number of nearest labels to return
139
* @return Collection of nearest label strings
140
*/
141
public Collection<String> nearestLabels(LabelledDocument document, int topN);
142
143
/**
144
* Find top N labels nearest to raw text
145
* @param rawText Raw text string to compare
146
* @param topN Number of nearest labels to return
147
* @return Collection of nearest label strings
148
*/
149
public Collection<String> nearestLabels(String rawText, int topN);
150
151
/**
152
* Find top N labels nearest to vocabulary word collection
153
* @param document Collection of VocabWord instances
154
* @param topN Number of nearest labels to return
155
* @return Collection of nearest label strings
156
*/
157
public Collection<String> nearestLabels(Collection<VocabWord> document, int topN);
158
159
/**
160
* Find top N labels nearest to feature vector
161
* @param labelVector INDArray feature vector
162
* @param topN Number of nearest labels to return
163
* @return Collection of nearest label strings
164
*/
165
public Collection<String> nearestLabels(INDArray labelVector, int topN);
166
167
/**
168
* Calculate similarity between document and specific label
169
* @param document LabelledDocument to compare
170
* @param label Target label string
171
* @return Similarity score between document and label
172
*/
173
public double similarityToLabel(LabelledDocument document, String label);
174
175
/**
176
* Calculate similarity between word list and specific label
177
* @param document List of VocabWord instances
178
* @param label Target label string
179
* @return Similarity score between document and label
180
*/
181
public double similarityToLabel(List<VocabWord> document, String label);
182
183
/**
184
* Calculate similarity between raw text and specific label (deprecated)
* NOTE(review): the LabelledDocument and {@code List<VocabWord>} overloads of similarityToLabel
* are the non-deprecated variants in this listing - presumably the intended replacement; confirm.
185
* @param rawText Raw text string
186
* @param label Target label string
187
* @return Similarity score between text and label
188
*/
189
@Deprecated
190
public double similarityToLabel(String rawText, String label);
191
192
/**
193
* Extract label vectors from vocabulary for nearest neighbor operations
194
* Populates internal labels matrix for efficient similarity calculations
195
*/
196
public void extractLabels();
197
198
/**
199
* Set sequence iterator for pre-tokenized training data
200
* @param iterator SequenceIterator providing tokenized sequences
201
*/
202
public void setSequenceIterator(SequenceIterator<VocabWord> iterator);
203
}
204
```
205
206
### ParagraphVectors Builder
207
208
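`ParagraphVectors.Builder` extends `Word2Vec.Builder` with document-specific configuration: label sources, label-aware and plain iterators, reuse of existing word vectors, and control over whether word vectors are trained alongside document vectors.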
209
210
```java { .api }
211
/**
212
* Builder for ParagraphVectors configuration extending Word2Vec.Builder
* <p>
* The qualified name in the declaration below is documentation shorthand for the nested
* class Builder of ParagraphVectors; a qualified name is not valid in a Java class declaration.
213
*/
214
public static class ParagraphVectors.Builder extends Word2Vec.Builder {
215
216
/**
217
* Build configured ParagraphVectors instance
218
* @return Configured ParagraphVectors model ready for training
219
*/
220
public ParagraphVectors build();
221
222
/**
223
* Use pre-built WordVectors model for ParagraphVectors initialization
224
* @param vec Existing WordVectors model (Word2Vec or GloVe)
225
* @return Builder instance for method chaining
226
*/
227
public Builder useExistingWordVectors(WordVectors vec);
228
229
/**
230
* Define whether word representations should be trained with documents
231
* @param trainElements Whether to train word vectors alongside document vectors
232
* @return Builder instance for method chaining
233
*/
234
public Builder trainWordVectors(boolean trainElements);
235
236
/**
237
* Attach pre-defined labels source to ParagraphVectors
238
* @param source LabelsSource instance containing available labels
239
* @return Builder instance for method chaining
240
*/
241
public Builder labelsSource(LabelsSource source);
242
243
/**
244
* Build LabelsSource from labels list (deprecated due to order synchronization issues)
* NOTE(review): labelsSource(LabelsSource) looks like the intended replacement - confirm.
245
* @param labels List of label strings
246
* @return Builder instance for method chaining
247
*/
248
@Deprecated
249
public Builder labels(List<String> labels);
250
251
/**
252
* Set label-aware document iterator for training
253
* @param iterator LabelAwareDocumentIterator with labeled documents
254
* @return Builder instance for method chaining
255
*/
256
public Builder iterate(LabelAwareDocumentIterator iterator);
257
258
/**
259
* Set label-aware sentence iterator for training
260
* @param iterator LabelAwareSentenceIterator with labeled sentences
261
* @return Builder instance for method chaining
262
*/
263
public Builder iterate(LabelAwareSentenceIterator iterator);
264
265
/**
266
* Set general label-aware iterator for training
267
* @param iterator LabelAwareIterator providing labeled training data
268
* @return Builder instance for method chaining
269
*/
270
public Builder iterate(LabelAwareIterator iterator);
271
272
/**
273
* Set document iterator for training (unlabeled documents)
274
* @param iterator DocumentIterator providing training documents
275
* @return Builder instance for method chaining
276
*/
277
public Builder iterate(DocumentIterator iterator);
278
279
/**
280
* Set sentence iterator for training (unlabeled sentences)
281
* @param iterator SentenceIterator providing training sentences
282
* @return Builder instance for method chaining
283
*/
284
public Builder iterate(SentenceIterator iterator);
285
286
// Also inherits every other Word2Vec.Builder method; presumably each is overridden to return
// this Builder type so calls keep chaining (confirm against the implementation)
287
}
288
```
289
290
**Usage Examples:**
291
292
```java
293
import org.deeplearning4j.models.paragraphvectors.ParagraphVectors;
294
import org.deeplearning4j.text.documentiterator.*;
295
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
296
297
// Basic document classification training
298
Collection<LabelledDocument> labeledDocs = Arrays.asList(
299
new LabelledDocument("This is a positive review", "positive"),
300
new LabelledDocument("This is a negative review", "negative"),
301
new LabelledDocument("Great product, highly recommend", "positive")
302
);
303
304
LabelAwareIterator iterator = new BasicLabelAwareIterator(labeledDocs);
305
306
ParagraphVectors paragraphVectors = new ParagraphVectors.Builder()
307
.minWordFrequency(1)
308
.iterations(5)
309
.epochs(10)
310
.layerSize(100)
311
.learningRate(0.025)
312
.windowSize(5)
313
.iterate(iterator)
314
.tokenizerFactory(new DefaultTokenizerFactory())
315
.trainWordVectors(true)
316
.build();
317
318
paragraphVectors.fit();
319
320
// Document inference and classification
321
String newDocument = "This product is amazing";
322
INDArray docVector = paragraphVectors.inferVector(newDocument);
323
String predictedLabel = paragraphVectors.predict(newDocument);
324
Collection<String> topLabels = paragraphVectors.predictSeveral(newDocument, 3);
325
326
System.out.println("Predicted label: " + predictedLabel);
327
System.out.println("Top labels: " + topLabels);
328
329
// Document similarity using inferred vectors
330
String doc1 = "Great product quality";
331
String doc2 = "Excellent item, very satisfied";
332
333
INDArray vec1 = paragraphVectors.inferVector(doc1);
334
INDArray vec2 = paragraphVectors.inferVector(doc2);
335
336
// Calculate cosine similarity
337
double similarity = Transforms.cosineSim(vec1, vec2);
338
System.out.println("Document similarity: " + similarity);
339
340
// Batch inference for multiple documents
341
List<String> documents = Arrays.asList(
342
"First document text",
343
"Second document text",
344
"Third document text"
345
);
346
347
List<INDArray> vectors = paragraphVectors.inferVectorBatched(documents);
348
System.out.println("Processed " + vectors.size() + " documents");
349
350
// Find nearest labels to a document
351
Collection<String> nearestLabels = paragraphVectors.nearestLabels(newDocument, 5);
352
System.out.println("Nearest labels: " + nearestLabels);
353
354
// Advanced configuration with existing word vectors
355
Word2Vec existingWord2Vec = new Word2Vec.Builder()
356
.layerSize(300)
357
.windowSize(10)
358
// ... other configuration
359
.build();
360
existingWord2Vec.fit(); // Train on large corpus
361
362
ParagraphVectors advancedPV = new ParagraphVectors.Builder()
363
.useExistingWordVectors(existingWord2Vec)
364
.trainWordVectors(false) // Don't retrain word vectors
365
.layerSize(300)
366
.iterate(labeledDocumentIterator)
367
.tokenizerFactory(new DefaultTokenizerFactory())
368
.build();
369
370
advancedPV.fit();
371
```
372
373
### Document Types
374
375
Supporting classes that carry labeled documents and the set of available labels used for training and classification.
376
377
```java { .api }
378
/**
379
* Document with label information for supervised training
* <p>
* Accepted directly by the predict, predictSeveral, inferVector, nearestLabels and
* similarityToLabel overloads of ParagraphVectors. Only the getters are listed here.
380
*/
381
public class LabelledDocument {
382
383
/**
384
* Get document content as string
385
* @return Document text content
386
*/
387
public String getContent();
388
389
/**
390
* Get document identifier
* The LabelledDocument overload of ParagraphVectors.inferVectorBatched expects this ID
* to be defined and returns it paired with the inferred vector.
391
* @return String identifier for the document
392
*/
393
public String getId();
394
395
/**
396
* Get document labels
397
* @return List of label strings associated with document
398
*/
399
public List<String> getLabels();
400
401
/**
402
* Get referenced content as vocabulary words
403
* @return List of VocabWord instances from document
404
*/
405
public List<VocabWord> getReferencedContent();
406
}
407
408
/**
409
* Source of labels for document classification
* <p>
* An instance can be attached to a model through ParagraphVectors.Builder.labelsSource(LabelsSource).
410
*/
411
public class LabelsSource {
412
413
/**
414
* Create empty labels source
415
*/
416
public LabelsSource();
417
418
/**
419
* Create labels source with predefined labels
420
* @param labels List of available label strings
421
*/
422
public LabelsSource(List<String> labels);
423
424
/**
425
* Get available labels
426
* @return List of label strings
427
*/
428
public List<String> getLabels();
429
}
430
```