or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

bag-of-words.md dataset-loading.md document-embeddings.md glove.md index.md text-processing.md word-embeddings.md

docs/document-embeddings.md

0

# Document Embeddings (ParagraphVectors)

1

2

Document-level embeddings implementation (Doc2Vec) that creates vector representations for entire documents, sentences, or paragraphs. It enables document similarity comparison, classification, clustering, and information retrieval using neural embeddings.

3

4

## Capabilities

5

6

### ParagraphVectors Model

7

8

Main ParagraphVectors implementation extending Word2Vec with document-level representation learning and inference capabilities.

9

10

```java { .api }

11

/**

12

* ParagraphVectors (Doc2Vec) implementation extending Word2Vec

13

* Provides document-level embeddings and classification capabilities

14

*/

15

public class ParagraphVectors extends Word2Vec {

16

17

/**

18

* Predict label for raw text (deprecated - use predict(LabelledDocument) or predict(List<VocabWord>) instead)

19

* @param rawText Raw text string to classify

20

* @return Most probable label string

21

*/

22

@Deprecated

23

public String predict(String rawText);

24

25

/**

26

* Predict label for labeled document

27

* @param document LabelledDocument instance to classify

28

* @return Most probable label string

29

*/

30

public String predict(LabelledDocument document);

31

32

/**

33

* Predict label for list of vocabulary words

34

* @param document List of VocabWord instances

35

* @return Most probable label string

36

*/

37

public String predict(List<VocabWord> document);

38

39

/**

40

* Predict multiple labels for labeled document

41

* @param document LabelledDocument to classify

42

* @param limit Maximum number of labels to return

43

* @return Collection of probable labels in descending order

44

*/

45

public Collection<String> predictSeveral(LabelledDocument document, int limit);

46

47

/**

48

* Predict multiple labels for raw text

49

* @param rawText Raw text string to classify

50

* @param limit Maximum number of labels to return

51

* @return Collection of probable labels in descending order

52

*/

53

public Collection<String> predictSeveral(String rawText, int limit);

54

55

/**

56

* Predict multiple labels for word list

57

* @param document List of VocabWord instances

58

* @param limit Maximum number of labels to return

59

* @return Collection of probable labels in descending order

60

*/

61

public Collection<String> predictSeveral(List<VocabWord> document, int limit);

62

63

/**

64

* Calculate inferred vector for text with custom training parameters

65

* @param text Raw text string to vectorize

66

* @param learningRate Learning rate for inference training

67

* @param minLearningRate Minimum learning rate threshold

68

* @param iterations Number of inference iterations

69

* @return INDArray vector representation of the text

70

*/

71

public INDArray inferVector(String text, double learningRate, double minLearningRate, int iterations);

72

73

/**

74

* Calculate inferred vector for document with custom parameters

75

* @param document LabelledDocument to vectorize

76

* @param learningRate Learning rate for inference training

77

* @param minLearningRate Minimum learning rate threshold

78

* @param iterations Number of inference iterations

79

* @return INDArray vector representation of the document

80

*/

81

public INDArray inferVector(LabelledDocument document, double learningRate, double minLearningRate, int iterations);

82

83

/**

84

* Calculate inferred vector for word list with custom parameters

85

* @param document List of VocabWord instances to vectorize

86

* @param learningRate Learning rate for inference training

87

* @param minLearningRate Minimum learning rate threshold

88

* @param iterations Number of inference iterations

89

* @return INDArray vector representation of the word list

90

*/

91

public INDArray inferVector(List<VocabWord> document, double learningRate, double minLearningRate, int iterations);

92

93

/**

94

* Calculate inferred vector for text with default parameters

95

* @param text Raw text string to vectorize

96

* @return INDArray vector representation using default parameters

97

*/

98

public INDArray inferVector(String text);

99

100

/**

101

* Calculate inferred vector for document with default parameters

102

* @param document LabelledDocument to vectorize

103

* @return INDArray vector representation using default parameters

104

*/

105

public INDArray inferVector(LabelledDocument document);

106

107

/**

108

* Calculate inferred vector for word list with default parameters

109

* @param document List of VocabWord instances to vectorize

110

* @return INDArray vector representation using default parameters

111

*/

112

public INDArray inferVector(List<VocabWord> document);

113

114

/**

115

* Batched inference for labeled document returning Future with ID and vector

116

* @param document LabelledDocument with ID field defined

117

* @return Future containing Pair of document ID and inferred vector

118

*/

119

public Future<Pair<String, INDArray>> inferVectorBatched(LabelledDocument document);

120

121

/**

122

* Batched inference for text string returning Future with vector

123

* @param document Raw text string to vectorize

124

* @return Future containing inferred vector

125

*/

126

public Future<INDArray> inferVectorBatched(String document);

127

128

/**

129

* Batched inference for multiple text strings

130

* @param documents List of text strings to vectorize

131

* @return List of INDArray vectors in same order as input

132

*/

133

public List<INDArray> inferVectorBatched(List<String> documents);

134

135

/**

136

* Find top N labels nearest to labeled document

137

* @param document LabelledDocument to compare

138

* @param topN Number of nearest labels to return

139

* @return Collection of nearest label strings

140

*/

141

public Collection<String> nearestLabels(LabelledDocument document, int topN);

142

143

/**

144

* Find top N labels nearest to raw text

145

* @param rawText Raw text string to compare

146

* @param topN Number of nearest labels to return

147

* @return Collection of nearest label strings

148

*/

149

public Collection<String> nearestLabels(String rawText, int topN);

150

151

/**

152

* Find top N labels nearest to vocabulary word collection

153

* @param document Collection of VocabWord instances

154

* @param topN Number of nearest labels to return

155

* @return Collection of nearest label strings

156

*/

157

public Collection<String> nearestLabels(Collection<VocabWord> document, int topN);

158

159

/**

160

* Find top N labels nearest to feature vector

161

* @param labelVector INDArray feature vector

162

* @param topN Number of nearest labels to return

163

* @return Collection of nearest label strings

164

*/

165

public Collection<String> nearestLabels(INDArray labelVector, int topN);

166

167

/**

168

* Calculate similarity between document and specific label

169

* @param document LabelledDocument to compare

170

* @param label Target label string

171

* @return Similarity score between document and label

172

*/

173

public double similarityToLabel(LabelledDocument document, String label);

174

175

/**

176

* Calculate similarity between word list and specific label

177

* @param document List of VocabWord instances

178

* @param label Target label string

179

* @return Similarity score between document and label

180

*/

181

public double similarityToLabel(List<VocabWord> document, String label);

182

183

/**

184

* Calculate similarity between raw text and specific label (deprecated)

185

* @param rawText Raw text string

186

* @param label Target label string

187

* @return Similarity score between text and label

188

*/

189

@Deprecated

190

public double similarityToLabel(String rawText, String label);

191

192

/**

193

* Extract label vectors from vocabulary for nearest neighbor operations

194

* Populates internal labels matrix for efficient similarity calculations

195

*/

196

public void extractLabels();

197

198

/**

199

* Set sequence iterator for pre-tokenized training data

200

* @param iterator SequenceIterator providing tokenized sequences

201

*/

202

public void setSequenceIterator(SequenceIterator<VocabWord> iterator);

203

}

204

```

205

206

### ParagraphVectors Builder

207

208

Extended builder for ParagraphVectors with document-specific configuration options and label handling.

209

210

```java { .api }

211

/**

212

* Builder for ParagraphVectors configuration extending Word2Vec.Builder

213

*/

214

public static class ParagraphVectors.Builder extends Word2Vec.Builder {

215

216

/**

217

* Build configured ParagraphVectors instance

218

* @return Configured ParagraphVectors model ready for training

219

*/

220

public ParagraphVectors build();

221

222

/**

223

* Use pre-built WordVectors model for ParagraphVectors initialization

224

* @param vec Existing WordVectors model (Word2Vec or GloVe)

225

* @return Builder instance for method chaining

226

*/

227

public Builder useExistingWordVectors(WordVectors vec);

228

229

/**

230

* Define whether word representations should be trained with documents

231

* @param trainElements Whether to train word vectors alongside document vectors

232

* @return Builder instance for method chaining

233

*/

234

public Builder trainWordVectors(boolean trainElements);

235

236

/**

237

* Attach pre-defined labels source to ParagraphVectors

238

* @param source LabelsSource instance containing available labels

239

* @return Builder instance for method chaining

240

*/

241

public Builder labelsSource(LabelsSource source);

242

243

/**

244

* Build LabelsSource from labels list (deprecated due to order synchronization issues)

245

* @param labels List of label strings

246

* @return Builder instance for method chaining

247

*/

248

@Deprecated

249

public Builder labels(List<String> labels);

250

251

/**

252

* Set label-aware document iterator for training

253

* @param iterator LabelAwareDocumentIterator with labeled documents

254

* @return Builder instance for method chaining

255

*/

256

public Builder iterate(LabelAwareDocumentIterator iterator);

257

258

/**

259

* Set label-aware sentence iterator for training

260

* @param iterator LabelAwareSentenceIterator with labeled sentences

261

* @return Builder instance for method chaining

262

*/

263

public Builder iterate(LabelAwareSentenceIterator iterator);

264

265

/**

266

* Set general label-aware iterator for training

267

* @param iterator LabelAwareIterator providing labeled training data

268

* @return Builder instance for method chaining

269

*/

270

public Builder iterate(LabelAwareIterator iterator);

271

272

/**

273

* Set document iterator for training (unlabeled documents)

274

* @param iterator DocumentIterator providing training documents

275

* @return Builder instance for method chaining

276

*/

277

public Builder iterate(DocumentIterator iterator);

278

279

/**

280

* Set sentence iterator for training (unlabeled sentences)

281

* @param iterator SentenceIterator providing training sentences

282

* @return Builder instance for method chaining

283

*/

284

public Builder iterate(SentenceIterator iterator);

285

286

// Inherits all Word2Vec.Builder methods with appropriate return types

287

}

288

```

289

290

**Usage Examples:**

291

292

```java

293

import org.deeplearning4j.models.paragraphvectors.ParagraphVectors;

294

import org.deeplearning4j.text.documentiterator.*;

295

import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;

296

297

// Basic document classification training

298

Collection<LabelledDocument> labeledDocs = Arrays.asList(

299

new LabelledDocument("This is a positive review", "positive"),

300

new LabelledDocument("This is a negative review", "negative"),

301

new LabelledDocument("Great product, highly recommend", "positive")

302

);

303

304

LabelAwareIterator iterator = new BasicLabelAwareIterator(labeledDocs);

305

306

ParagraphVectors paragraphVectors = new ParagraphVectors.Builder()

307

.minWordFrequency(1)

308

.iterations(5)

309

.epochs(10)

310

.layerSize(100)

311

.learningRate(0.025)

312

.windowSize(5)

313

.iterate(iterator)

314

.tokenizerFactory(new DefaultTokenizerFactory())

315

.trainWordVectors(true)

316

.build();

317

318

paragraphVectors.fit();

319

320

// Document inference and classification

321

String newDocument = "This product is amazing";

322

INDArray docVector = paragraphVectors.inferVector(newDocument);

323

String predictedLabel = paragraphVectors.predict(newDocument);

324

Collection<String> topLabels = paragraphVectors.predictSeveral(newDocument, 3);

325

326

System.out.println("Predicted label: " + predictedLabel);

327

System.out.println("Top labels: " + topLabels);

328

329

// Document similarity using inferred vectors

330

String doc1 = "Great product quality";

331

String doc2 = "Excellent item, very satisfied";

332

333

INDArray vec1 = paragraphVectors.inferVector(doc1);

334

INDArray vec2 = paragraphVectors.inferVector(doc2);

335

336

// Calculate cosine similarity (requires org.nd4j.linalg.ops.transforms.Transforms and org.nd4j.linalg.api.ndarray.INDArray)

337

double similarity = Transforms.cosineSim(vec1, vec2);

338

System.out.println("Document similarity: " + similarity);

339

340

// Batch inference for multiple documents

341

List<String> documents = Arrays.asList(

342

"First document text",

343

"Second document text",

344

"Third document text"

345

);

346

347

List<INDArray> vectors = paragraphVectors.inferVectorBatched(documents);

348

System.out.println("Processed " + vectors.size() + " documents");

349

350

// Find nearest labels to a document

351

Collection<String> nearestLabels = paragraphVectors.nearestLabels(newDocument, 5);

352

System.out.println("Nearest labels: " + nearestLabels);

353

354

// Advanced configuration with existing word vectors

355

Word2Vec existingWord2Vec = new Word2Vec.Builder()

356

.layerSize(300)

357

.windowSize(10)

358

// ... other configuration

359

.build();

360

existingWord2Vec.fit(); // Train on large corpus

361

362

ParagraphVectors advancedPV = new ParagraphVectors.Builder()

363

.useExistingWordVectors(existingWord2Vec)

364

.trainWordVectors(false) // Don't retrain word vectors

365

.layerSize(300)

366

.iterate(labeledDocumentIterator)

367

.tokenizerFactory(new DefaultTokenizerFactory())

368

.build();

369

370

advancedPV.fit();

371

```

372

373

### Document Types

374

375

Supporting classes for labeled document handling and training data preparation.

376

377

```java { .api }

378

/**

379

* Document with label information for supervised training

380

*/

381

public class LabelledDocument {

382

383

/**

384

* Get document content as string

385

* @return Document text content

386

*/

387

public String getContent();

388

389

/**

390

* Get document identifier

391

* @return String identifier for the document

392

*/

393

public String getId();

394

395

/**

396

* Get document labels

397

* @return List of label strings associated with document

398

*/

399

public List<String> getLabels();

400

401

/**

402

* Get referenced content as vocabulary words

403

* @return List of VocabWord instances from document

404

*/

405

public List<VocabWord> getReferencedContent();

406

}

407

408

/**

409

* Source of labels for document classification

410

*/

411

public class LabelsSource {

412

413

/**

414

* Create empty labels source

415

*/

416

public LabelsSource();

417

418

/**

419

* Create labels source with predefined labels

420

* @param labels List of available label strings

421

*/

422

public LabelsSource(List<String> labels);

423

424

/**

425

* Get available labels

426

* @return List of label strings

427

*/

428

public List<String> getLabels();

429

}

430

```