DeepLearning4J Natural Language Processing module providing word embeddings, document classification, and text processing capabilities for neural network applications.
---
Dense vector representations of words that capture semantic and syntactic relationships in high-dimensional space. The Word2Vec implementation supports both skip-gram and CBOW algorithms with extensive configuration options for production-scale text processing.
Main Word2Vec implementation based on SequenceVectors framework, providing neural word embeddings with configurable architecture and training parameters.
/**
* Word2Vec implementation based on SequenceVectors
*/
public class Word2Vec extends SequenceVectors<VocabWord> {
/**
* Define TokenizerFactory instance for model building
* @param tokenizerFactory TokenizerFactory instance for text tokenization
*/
public void setTokenizerFactory(TokenizerFactory tokenizerFactory);
/**
* Define SentenceIterator as training corpus source
* @param iterator SentenceIterator instance for sentence-level text input
*/
public void setSentenceIterator(SentenceIterator iterator);
/**
* Define SequenceIterator for pre-tokenized sequences
* @param iterator SequenceIterator for already tokenized Sequence<VocabWord> input
*/
public void setSequenceIterator(SequenceIterator<VocabWord> iterator);
}

Comprehensive builder pattern for Word2Vec configuration with extensive training and architecture parameters.
/**
* Builder for Word2Vec configuration and construction
*/
public static class Builder extends SequenceVectors.Builder<VocabWord> {
/**
* Build the configured Word2Vec instance
* @return Configured Word2Vec model ready for training
*/
public Word2Vec build();
/**
* Set document iterator for training data
* @param iterator DocumentIterator providing training documents
* @return Builder instance for method chaining
*/
public Builder iterate(DocumentIterator iterator);
/**
* Set sentence iterator for training data
* @param iterator SentenceIterator providing training sentences
* @return Builder instance for method chaining
*/
public Builder iterate(SentenceIterator iterator);
/**
* Set sequence iterator for pre-tokenized training data
* @param iterator SequenceIterator providing tokenized sequences
* @return Builder instance for method chaining
*/
public Builder iterate(SequenceIterator<VocabWord> iterator);
/**
* Set label-aware iterator for supervised training
* @param iterator LabelAwareIterator providing labeled training data
* @return Builder instance for method chaining
*/
public Builder iterate(LabelAwareIterator iterator);
/**
* Define TokenizerFactory for string tokenization during training
* @param tokenizerFactory TokenizerFactory for text tokenization
* @return Builder instance for method chaining
*/
public Builder tokenizerFactory(TokenizerFactory tokenizerFactory);
/**
* Set mini-batch size for training
* @param batchSize Number of sequences per mini-batch
* @return Builder instance for method chaining
*/
public Builder batchSize(int batchSize);
/**
* Set number of iterations per mini-batch during training
* @param iterations Number of iterations per mini-batch
* @return Builder instance for method chaining
*/
public Builder iterations(int iterations);
/**
* Set number of epochs (full corpus iterations) for training
* @param numEpochs Number of training epochs
* @return Builder instance for method chaining
*/
public Builder epochs(int numEpochs);
/**
* Set number of dimensions for output vectors
* @param layerSize Vector dimensionality (typically 100-300)
* @return Builder instance for method chaining
*/
public Builder layerSize(int layerSize);
/**
* Set initial learning rate for model training
* @param learningRate Initial learning rate (typically 0.025)
* @return Builder instance for method chaining
*/
public Builder learningRate(double learningRate);
/**
* Set minimum word frequency threshold
* @param minWordFrequency Words below this frequency are removed
* @return Builder instance for method chaining
*/
public Builder minWordFrequency(int minWordFrequency);
/**
* Set minimum learning rate value for training
* @param minLearningRate Minimum learning rate threshold
* @return Builder instance for method chaining
*/
public Builder minLearningRate(double minLearningRate);
/**
* Set whether to reset model before building
* @param reallyReset Whether to wipe model prior to building
* @return Builder instance for method chaining
*/
public Builder resetModel(boolean reallyReset);
/**
* Set vocabulary size limit during construction
* @param limit Maximum vocabulary size (0 means no limit)
* @return Builder instance for method chaining
*/
public Builder limitVocabularySize(int limit);
/**
* Define external VocabCache to be used
* @param vocabCache External vocabulary cache instance
* @return Builder instance for method chaining
*/
public Builder vocabCache(VocabCache<VocabWord> vocabCache);
/**
* Define external WeightLookupTable to be used
* @param lookupTable External weight lookup table instance
* @return Builder instance for method chaining
*/
public Builder lookupTable(WeightLookupTable<VocabWord> lookupTable);
/**
* Set subsampling parameter for frequent words
* @param sampling Subsampling rate (>0 to enable, 0 to disable)
* @return Builder instance for method chaining
*/
public Builder sampling(double sampling);
/**
* Enable or disable adaptive gradients (AdaGrad)
* @param reallyUse Whether to use adaptive gradients
* @return Builder instance for method chaining
*/
public Builder useAdaGrad(boolean reallyUse);
/**
* Set negative sampling parameter
* @param negative Negative sampling rate (>0 to enable, 0 to disable)
* @return Builder instance for method chaining
*/
public Builder negativeSample(double negative);
/**
* Set stop words to ignore during training
* @param stopList List of stop words to exclude
* @return Builder instance for method chaining
*/
public Builder stopWords(List<String> stopList);
/**
* Set stop words collection to ignore during training
* @param stopList Collection of VocabWord stop words to exclude
* @return Builder instance for method chaining
*/
public Builder stopWords(Collection<VocabWord> stopList);
/**
* Set context window size for training
* @param windowSize Size of context window around target word
* @return Builder instance for method chaining
*/
public Builder windowSize(int windowSize);
/**
* Set random seed for reproducible results
* @param randomSeed Random seed for initialization
* @return Builder instance for method chaining
*/
public Builder seed(long randomSeed);
/**
* Set maximum number of concurrent worker threads
* @param numWorkers Number of worker threads for parallel training
* @return Builder instance for method chaining
*/
public Builder workers(int numWorkers);
/**
* Set model utilities for similarity and nearest neighbor operations
* @param modelUtils ModelUtils instance for vector operations
* @return Builder instance for method chaining
*/
public Builder modelUtils(ModelUtils<VocabWord> modelUtils);
/**
* Enable variable window sizes for training
* @param windows Array of window sizes to use randomly
* @return Builder instance for method chaining
*/
public Builder useVariableWindow(int... windows);
/**
* Set unknown element for handling out-of-vocabulary words
* @param element VocabWord element to use for unknown words
* @return Builder instance for method chaining
*/
public Builder unknownElement(VocabWord element);
/**
* Enable or disable unknown word handling
* @param reallyUse Whether to use UNK token for unknown words
* @return Builder instance for method chaining
*/
public Builder useUnknown(boolean reallyUse);
/**
* Set event listeners for training progress
* @param vectorsListeners Collection of VectorsListener instances
* @return Builder instance for method chaining
*/
public Builder setVectorsListeners(Collection<VectorsListener<VocabWord>> vectorsListeners);
/**
* Set elements learning algorithm by name
* @param algorithm Name of learning algorithm to use
* @return Builder instance for method chaining
*/
public Builder elementsLearningAlgorithm(String algorithm);
/**
* Set elements learning algorithm instance
* @param algorithm ElementsLearningAlgorithm instance
* @return Builder instance for method chaining
*/
public Builder elementsLearningAlgorithm(ElementsLearningAlgorithm<VocabWord> algorithm);
/**
* Enable or disable parallel tokenization
* @param allow Whether to allow parallel tokenization (default: true)
* @return Builder instance for method chaining
*/
public Builder allowParallelTokenization(boolean allow);
/**
* Enable or disable periodic vocabulary truncation
* @param reallyEnable Whether to enable vocabulary scavenging
* @return Builder instance for method chaining
*/
public Builder enableScavenger(boolean reallyEnable);
/**
* Enable or disable hierarchical softmax
* @param reallyUse Whether to use hierarchical softmax
* @return Builder instance for method chaining
*/
public Builder useHierarchicSoftmax(boolean reallyUse);
/**
* Enable or disable precise weight initialization
* @param reallyUse Whether to use precise weight initialization
* @return Builder instance for method chaining
*/
public Builder usePreciseWeightInit(boolean reallyUse);
}

Usage Examples:
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
// Basic Word2Vec training
Collection<String> sentences = Arrays.asList(
"The cat sat on the mat",
"The dog ran in the park",
"Natural language processing is fascinating"
);
Word2Vec word2Vec = new Word2Vec.Builder()
.minWordFrequency(1)
.iterations(5)
.layerSize(100)
.seed(42)
.windowSize(5)
.iterate(new CollectionSentenceIterator(sentences))
.tokenizerFactory(new DefaultTokenizerFactory())
.build();
word2Vec.fit();
// Use trained model
double similarity = word2Vec.similarity("cat", "dog");
Collection<String> nearest = word2Vec.wordsNearest("cat", 5);
// Advanced configuration with custom parameters
Word2Vec advancedModel = new Word2Vec.Builder()
.minWordFrequency(5)
.iterations(10)
.epochs(3)
.layerSize(300)
.learningRate(0.025)
.minLearningRate(0.0001)
.windowSize(8)
.negativeSample(5.0)
.useAdaGrad(false)
.workers(Runtime.getRuntime().availableProcessors())
.seed(123456L)
.iterate(new CollectionSentenceIterator(largeCorpus))
.tokenizerFactory(new DefaultTokenizerFactory())
.build();
advancedModel.fit();

Word representation class that extends SequenceElement with word-specific functionality for Word2Vec training and inference.
/**
* Vocabulary word representation for Word2Vec models
*/
public class VocabWord extends SequenceElement {
/**
* Create vocabulary word with frequency and word string
* @param wordFrequency Frequency of word in training corpus
* @param word String representation of the word
*/
public VocabWord(double wordFrequency, String word);
/**
* Get the word string
* @return String representation of the word
*/
public String getWord();
/**
* Check if this word is a label
* @return true if word represents a label, false otherwise
*/
public boolean isLabel();
/**
* Get the index of this word in vocabulary
* @return Integer index in vocabulary
*/
public int getIndex();
}

Install with Tessl CLI:
npx tessl i tessl/maven-org-deeplearning4j--deeplearning4j-nlp