or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

bag-of-words.md dataset-loading.md document-embeddings.md glove.md index.md text-processing.md word-embeddings.md

docs/word-embeddings.md

0

# Word Embeddings (Word2Vec)

1

2

Word embeddings are dense vector representations of words that capture semantic and syntactic relationships in a high-dimensional space. The Word2Vec implementation supports both the skip-gram and CBOW algorithms, with extensive configuration options for production-scale text processing.

3

4

## Capabilities

5

6

### Word2Vec Model

7

8

Main Word2Vec implementation based on SequenceVectors framework, providing neural word embeddings with configurable architecture and training parameters.

9

10

```java { .api }

11

/**

12

* Word2Vec implementation based on SequenceVectors

13

*/

14

public class Word2Vec extends SequenceVectors<VocabWord> {

15

16

/**

17

* Define TokenizerFactory instance for model building

18

* @param tokenizerFactory TokenizerFactory instance for text tokenization

19

*/

20

public void setTokenizerFactory(TokenizerFactory tokenizerFactory);

21

22

/**

23

* Define SentenceIterator as training corpus source

24

* @param iterator SentenceIterator instance for sentence-level text input

25

*/

26

public void setSentenceIterator(SentenceIterator iterator);

27

28

/**

29

* Define SequenceIterator for pre-tokenized sequences

30

* @param iterator SequenceIterator for already tokenized Sequence<VocabWord> input

31

*/

32

public void setSequenceIterator(SequenceIterator<VocabWord> iterator);

33

}

34

```

35

36

### Word2Vec Builder

37

38

Comprehensive builder pattern for Word2Vec configuration with extensive training and architecture parameters.

39

40

```java { .api }

41

/**

42

* Builder for Word2Vec configuration and construction

43

*/

44

public static class Word2Vec.Builder extends SequenceVectors.Builder<VocabWord> {

45

46

/**

47

* Build the configured Word2Vec instance

48

* @return Configured Word2Vec model ready for training

49

*/

50

public Word2Vec build();

51

52

/**

53

* Set document iterator for training data

54

* @param iterator DocumentIterator providing training documents

55

* @return Builder instance for method chaining

56

*/

57

public Builder iterate(DocumentIterator iterator);

58

59

/**

60

* Set sentence iterator for training data

61

* @param iterator SentenceIterator providing training sentences

62

* @return Builder instance for method chaining

63

*/

64

public Builder iterate(SentenceIterator iterator);

65

66

/**

67

* Set sequence iterator for pre-tokenized training data

68

* @param iterator SequenceIterator providing tokenized sequences

69

* @return Builder instance for method chaining

70

*/

71

public Builder iterate(SequenceIterator<VocabWord> iterator);

72

73

/**

74

* Set label-aware iterator for supervised training

75

* @param iterator LabelAwareIterator providing labeled training data

76

* @return Builder instance for method chaining

77

*/

78

public Builder iterate(LabelAwareIterator iterator);

79

80

/**

81

* Define TokenizerFactory for string tokenization during training

82

* @param tokenizerFactory TokenizerFactory for text tokenization

83

* @return Builder instance for method chaining

84

*/

85

public Builder tokenizerFactory(TokenizerFactory tokenizerFactory);

86

87

/**

88

* Set mini-batch size for training

89

* @param batchSize Number of sequences per mini-batch

90

* @return Builder instance for method chaining

91

*/

92

public Builder batchSize(int batchSize);

93

94

/**

95

* Set number of iterations per mini-batch during training

96

* @param iterations Number of iterations per mini-batch

97

* @return Builder instance for method chaining

98

*/

99

public Builder iterations(int iterations);

100

101

/**

102

* Set number of epochs (full corpus iterations) for training

103

* @param numEpochs Number of training epochs

104

* @return Builder instance for method chaining

105

*/

106

public Builder epochs(int numEpochs);

107

108

/**

109

* Set number of dimensions for output vectors

110

* @param layerSize Vector dimensionality (typically 100-300)

111

* @return Builder instance for method chaining

112

*/

113

public Builder layerSize(int layerSize);

114

115

/**

116

* Set initial learning rate for model training

117

* @param learningRate Initial learning rate (typically 0.025)

118

* @return Builder instance for method chaining

119

*/

120

public Builder learningRate(double learningRate);

121

122

/**

123

* Set minimum word frequency threshold

124

* @param minWordFrequency Words below this frequency are removed

125

* @return Builder instance for method chaining

126

*/

127

public Builder minWordFrequency(int minWordFrequency);

128

129

/**

130

* Set minimum learning rate value for training

131

* @param minLearningRate Minimum learning rate threshold

132

* @return Builder instance for method chaining

133

*/

134

public Builder minLearningRate(double minLearningRate);

135

136

/**

137

* Set whether to reset model before building

138

* @param reallyReset Whether to wipe model prior to building

139

* @return Builder instance for method chaining

140

*/

141

public Builder resetModel(boolean reallyReset);

142

143

/**

144

* Set vocabulary size limit during construction

145

* @param limit Maximum vocabulary size (0 means no limit)

146

* @return Builder instance for method chaining

147

*/

148

public Builder limitVocabularySize(int limit);

149

150

/**

151

* Define external VocabCache to be used

152

* @param vocabCache External vocabulary cache instance

153

* @return Builder instance for method chaining

154

*/

155

public Builder vocabCache(VocabCache<VocabWord> vocabCache);

156

157

/**

158

* Define external WeightLookupTable to be used

159

* @param lookupTable External weight lookup table instance

160

* @return Builder instance for method chaining

161

*/

162

public Builder lookupTable(WeightLookupTable<VocabWord> lookupTable);

163

164

/**

165

* Set subsampling parameter for frequent words

166

* @param sampling Subsampling rate (>0 to enable, 0 to disable)

167

* @return Builder instance for method chaining

168

*/

169

public Builder sampling(double sampling);

170

171

/**

172

* Enable or disable adaptive gradients (AdaGrad)

173

* @param reallyUse Whether to use adaptive gradients

174

* @return Builder instance for method chaining

175

*/

176

public Builder useAdaGrad(boolean reallyUse);

177

178

/**

179

* Set negative sampling parameter

180

* @param negative Negative sampling rate (>0 to enable, 0 to disable)

181

* @return Builder instance for method chaining

182

*/

183

public Builder negativeSample(double negative);

184

185

/**

186

* Set stop words to ignore during training

187

* @param stopList List of stop words to exclude

188

* @return Builder instance for method chaining

189

*/

190

public Builder stopWords(List<String> stopList);

191

192

/**

193

* Set stop words collection to ignore during training

194

* @param stopList Collection of VocabWord stop words to exclude

195

* @return Builder instance for method chaining

196

*/

197

public Builder stopWords(Collection<VocabWord> stopList);

198

199

/**

200

* Set context window size for training

201

* @param windowSize Size of context window around target word

202

* @return Builder instance for method chaining

203

*/

204

public Builder windowSize(int windowSize);

205

206

/**

207

* Set random seed for reproducible results

208

* @param randomSeed Random seed for initialization

209

* @return Builder instance for method chaining

210

*/

211

public Builder seed(long randomSeed);

212

213

/**

214

* Set maximum number of concurrent worker threads

215

* @param numWorkers Number of worker threads for parallel training

216

* @return Builder instance for method chaining

217

*/

218

public Builder workers(int numWorkers);

219

220

/**

221

* Set model utilities for similarity and nearest neighbor operations

222

* @param modelUtils ModelUtils instance for vector operations

223

* @return Builder instance for method chaining

224

*/

225

public Builder modelUtils(ModelUtils<VocabWord> modelUtils);

226

227

/**

228

* Enable variable window sizes for training

229

* @param windows Array of window sizes to use randomly

230

* @return Builder instance for method chaining

231

*/

232

public Builder useVariableWindow(int... windows);

233

234

/**

235

* Set unknown element for handling out-of-vocabulary words

236

* @param element VocabWord element to use for unknown words

237

* @return Builder instance for method chaining

238

*/

239

public Builder unknownElement(VocabWord element);

240

241

/**

242

* Enable or disable unknown word handling

243

* @param reallyUse Whether to use UNK token for unknown words

244

* @return Builder instance for method chaining

245

*/

246

public Builder useUnknown(boolean reallyUse);

247

248

/**

249

* Set event listeners for training progress

250

* @param vectorsListeners Collection of VectorsListener instances

251

* @return Builder instance for method chaining

252

*/

253

public Builder setVectorsListeners(Collection<VectorsListener<VocabWord>> vectorsListeners);

254

255

/**

256

* Set elements learning algorithm by name

257

* @param algorithm Name of learning algorithm to use

258

* @return Builder instance for method chaining

259

*/

260

public Builder elementsLearningAlgorithm(String algorithm);

261

262

/**

263

* Set elements learning algorithm instance

264

* @param algorithm ElementsLearningAlgorithm instance

265

* @return Builder instance for method chaining

266

*/

267

public Builder elementsLearningAlgorithm(ElementsLearningAlgorithm<VocabWord> algorithm);

268

269

/**

270

* Enable or disable parallel tokenization

271

* @param allow Whether to allow parallel tokenization (default: true)

272

* @return Builder instance for method chaining

273

*/

274

public Builder allowParallelTokenization(boolean allow);

275

276

/**

277

* Enable or disable periodic vocabulary truncation

278

* @param reallyEnable Whether to enable vocabulary scavenging

279

* @return Builder instance for method chaining

280

*/

281

public Builder enableScavenger(boolean reallyEnable);

282

283

/**

284

* Enable or disable hierarchical softmax

285

* @param reallyUse Whether to use hierarchical softmax

286

* @return Builder instance for method chaining

287

*/

288

public Builder useHierarchicSoftmax(boolean reallyUse);

289

290

/**

291

* Enable or disable precise weight initialization

292

* @param reallyUse Whether to use precise weight initialization

293

* @return Builder instance for method chaining

294

*/

295

public Builder usePreciseWeightInit(boolean reallyUse);

296

}

297

```

298

299

**Usage Examples:**

300

301

```java

302

import org.deeplearning4j.models.word2vec.Word2Vec;

303

import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator;

304

import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;

305

306

// Basic Word2Vec training

307

Collection<String> sentences = Arrays.asList(

308

"The cat sat on the mat",

309

"The dog ran in the park",

310

"Natural language processing is fascinating"

311

);

312

313

Word2Vec word2Vec = new Word2Vec.Builder()

314

.minWordFrequency(1)

315

.iterations(5)

316

.layerSize(100)

317

.seed(42)

318

.windowSize(5)

319

.iterate(new CollectionSentenceIterator(sentences))

320

.tokenizerFactory(new DefaultTokenizerFactory())

321

.build();

322

323

word2Vec.fit();

324

325

// Use trained model

326

double similarity = word2Vec.similarity("cat", "dog");

327

Collection<String> nearest = word2Vec.wordsNearest("cat", 5);

328

329

// Advanced configuration with custom parameters
// (largeCorpus is assumed to be a Collection<String> of sentences prepared elsewhere)

330

Word2Vec advancedModel = new Word2Vec.Builder()

331

.minWordFrequency(5)

332

.iterations(10)

333

.epochs(3)

334

.layerSize(300)

335

.learningRate(0.025)

336

.minLearningRate(0.0001)

337

.windowSize(8)

338

.negativeSample(5.0)

339

.useAdaGrad(false)

340

.workers(Runtime.getRuntime().availableProcessors())

341

.seed(123456L)

342

.iterate(new CollectionSentenceIterator(largeCorpus))

343

.tokenizerFactory(new DefaultTokenizerFactory())

344

.build();

345

346

advancedModel.fit();

347

```

348

349

### Vocabulary Word Representation

350

351

Word representation class that extends SequenceElement with word-specific functionality for Word2Vec training and inference.

352

353

```java { .api }

354

/**

355

* Vocabulary word representation for Word2Vec models

356

*/

357

public class VocabWord extends SequenceElement {

358

359

/**

360

* Create vocabulary word with frequency and word string

361

* @param wordFrequency Frequency of word in training corpus

362

* @param word String representation of the word

363

*/

364

public VocabWord(double wordFrequency, String word);

365

366

/**

367

* Get the word string

368

* @return String representation of the word

369

*/

370

public String getWord();

371

372

/**

373

* Check if this word is a label

374

* @return true if word represents a label, false otherwise

375

*/

376

public boolean isLabel();

377

378

/**

379

* Get the index of this word in vocabulary

380

* @return Integer index in vocabulary

381

*/

382

public int getIndex();

383

}

384

```