DeepLearning4J Natural Language Processing module providing word embeddings, document classification, and text processing capabilities for neural network applications.
—
Matrix factorization-based word embeddings that efficiently combine global statistical information with local context windows. GloVe captures word co-occurrence statistics across large corpora, providing high-quality word representations for downstream NLP tasks.
GlobalVectors implementation based on the Stanford GloVe algorithm, extending SequenceVectors with co-occurrence matrix optimization.
/**
* GlobalVectors standalone implementation for DL4J
* Based on original Stanford GloVe algorithm
*/
public class Glove extends SequenceVectors<VocabWord> {
// Protected constructor - use Builder to create instances
}
Comprehensive builder for GloVe configuration with algorithm-specific parameters for co-occurrence processing and matrix factorization.
/**
* Builder for GloVe configuration and construction
*/
public static class Builder extends SequenceVectors.Builder<VocabWord> {
/**
* Build the configured GloVe instance
* @return Configured GloVe model ready for training
*/
public Glove build();
/**
* Set sequence iterator for training data
* @param iterator SequenceIterator providing tokenized sequences
* @return Builder instance for method chaining
*/
public Builder iterate(SequenceIterator<VocabWord> iterator);
/**
* Set mini-batch size for training
* @param batchSize Number of co-occurrence entries per batch
* @return Builder instance for method chaining
*/
public Builder batchSize(int batchSize);
/**
* Set number of training iterations (same as epochs in GloVe)
* @param iterations Number of training iterations
* @return Builder instance for method chaining
*/
public Builder iterations(int iterations);
/**
* Set number of epochs for training
* @param numEpochs Number of training epochs
* @return Builder instance for method chaining
*/
public Builder epochs(int numEpochs);
/**
* Enable AdaGrad optimizer (always enabled for GloVe)
* @param reallyUse AdaGrad usage flag (forced to true)
* @return Builder instance for method chaining
*/
public Builder useAdaGrad(boolean reallyUse);
/**
* Set vector dimensionality
* @param layerSize Number of dimensions for output vectors
* @return Builder instance for method chaining
*/
public Builder layerSize(int layerSize);
/**
* Set learning rate for optimization
* @param learningRate Learning rate for gradient descent
* @return Builder instance for method chaining
*/
public Builder learningRate(double learningRate);
/**
* Set minimum word frequency threshold
* @param minWordFrequency Words below this frequency are excluded
* @return Builder instance for method chaining
*/
public Builder minWordFrequency(int minWordFrequency);
/**
* Set minimum learning rate threshold
* @param minLearningRate Minimum learning rate value
* @return Builder instance for method chaining
*/
public Builder minLearningRate(double minLearningRate);
/**
* Set whether to reset model before building
* @param reallyReset Whether to clear model state
* @return Builder instance for method chaining
*/
public Builder resetModel(boolean reallyReset);
/**
* Set external vocabulary cache
* @param vocabCache VocabCache instance to use
* @return Builder instance for method chaining
*/
public Builder vocabCache(VocabCache<VocabWord> vocabCache);
/**
* Set external weight lookup table
* @param lookupTable WeightLookupTable instance to use
* @return Builder instance for method chaining
*/
public Builder lookupTable(WeightLookupTable<VocabWord> lookupTable);
/**
* Set subsampling parameter (deprecated for GloVe)
* @param sampling Subsampling rate (not used in GloVe)
* @return Builder instance for method chaining
*/
@Deprecated
public Builder sampling(double sampling);
/**
* Set negative sampling parameter (deprecated for GloVe)
* @param negative Negative sampling rate (not used in GloVe)
* @return Builder instance for method chaining
*/
@Deprecated
public Builder negativeSample(double negative);
/**
* Set stop words list
* @param stopList List of stop words to exclude
* @return Builder instance for method chaining
*/
public Builder stopWords(List<String> stopList);
/**
* Force elements representation training (always true for GloVe)
* @param trainElements Whether to train element representations
* @return Builder instance for method chaining
*/
public Builder trainElementsRepresentation(boolean trainElements);
/**
* Force sequence representation training (deprecated for GloVe)
* @param trainSequences Whether to train sequence representations
* @return Builder instance for method chaining
*/
@Deprecated
public Builder trainSequencesRepresentation(boolean trainSequences);
/**
* Set stop words collection
* @param stopList Collection of VocabWord stop words
* @return Builder instance for method chaining
*/
public Builder stopWords(Collection<VocabWord> stopList);
/**
* Set context window size
* @param windowSize Context window size for co-occurrence calculation
* @return Builder instance for method chaining
*/
public Builder windowSize(int windowSize);
/**
* Set random seed for reproducibility
* @param randomSeed Random seed value
* @return Builder instance for method chaining
*/
public Builder seed(long randomSeed);
/**
* Set number of worker threads
* @param numWorkers Number of parallel worker threads
* @return Builder instance for method chaining
*/
public Builder workers(int numWorkers);
/**
* Set TokenizerFactory for training
* @param tokenizerFactory TokenizerFactory for text tokenization
* @return Builder instance for method chaining
*/
public Builder tokenizerFactory(TokenizerFactory tokenizerFactory);
/**
* Set cutoff parameter in weighting function
* @param xMax Cutoff value in weighting function (default: 100.0)
* @return Builder instance for method chaining
*/
public Builder xMax(double xMax);
/**
* Set whether co-occurrences should be built in both directions
* @param reallySymmetric Whether to build symmetric co-occurrence matrix
* @return Builder instance for method chaining
*/
public Builder symmetric(boolean reallySymmetric);
/**
* Set whether co-occurrences should be shuffled between epochs
* @param reallyShuffle Whether to shuffle co-occurrence list
* @return Builder instance for method chaining
*/
public Builder shuffle(boolean reallyShuffle);
/**
* Set alpha parameter in weighting function exponent
* @param alpha Exponent parameter in weighting function (default: 0.75)
* @return Builder instance for method chaining
*/
public Builder alpha(double alpha);
/**
* Set sentence iterator for training
* @param iterator SentenceIterator providing training sentences
* @return Builder instance for method chaining
*/
public Builder iterate(SentenceIterator iterator);
/**
* Set document iterator for training
* @param iterator DocumentIterator providing training documents
* @return Builder instance for method chaining
*/
public Builder iterate(DocumentIterator iterator);
/**
* Set model utilities for vector operations
* @param modelUtils ModelUtils instance for similarity calculations
* @return Builder instance for method chaining
*/
public Builder modelUtils(ModelUtils<VocabWord> modelUtils);
/**
* Set event listeners for training progress
* @param vectorsListeners Collection of training event listeners
* @return Builder instance for method chaining
*/
public Builder setVectorsListeners(Collection<VectorsListener<VocabWord>> vectorsListeners);
/**
* Set maximum memory available for co-occurrence map building
* @param gbytes Memory limit in gigabytes
* @return Builder instance for method chaining
*/
public Builder maxMemory(int gbytes);
/**
* Set unknown element for out-of-vocabulary words
* @param element VocabWord element for unknown words
* @return Builder instance for method chaining
*/
public Builder unknownElement(VocabWord element);
/**
* Enable or disable unknown word handling
* @param reallyUse Whether to use UNK token
* @return Builder instance for method chaining
*/
public Builder useUnknown(boolean reallyUse);
}
Usage Examples:
import org.deeplearning4j.models.glove.Glove;
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
// Basic GloVe training
Collection<String> corpus = Arrays.asList(
"Global vectors for word representation are effective",
"Matrix factorization captures global statistics",
"Local context windows provide semantic information"
);
Glove glove = new Glove.Builder()
.learningRate(0.05)
.epochs(50)
.xMax(100.0)
.alpha(0.75)
.layerSize(100)
.iterate(new CollectionSentenceIterator(corpus))
.tokenizerFactory(new DefaultTokenizerFactory())
.build();
glove.fit();
// Use trained model
double similarity = glove.similarity("global", "matrix");
Collection<String> nearest = glove.wordsNearest("vectors", 10);
// Advanced GloVe configuration
Glove advancedGlove = new Glove.Builder()
.learningRate(0.075)
.epochs(100)
.layerSize(300)
.xMax(150.0)
.alpha(0.8)
.symmetric(true)
.shuffle(true)
.windowSize(10)
.minWordFrequency(5)
.workers(4)
.maxMemory(8) // 8GB memory limit
.iterate(new CollectionSentenceIterator(largeCorpus))
.tokenizerFactory(new DefaultTokenizerFactory())
.build();
advancedGlove.fit();
// Extract word vectors
INDArray wordVector = advancedGlove.getWordVectorMatrix("representation");
System.out.println("Vector for 'representation': " + wordVector);
Key parameters specific to the GloVe algorithm that control co-occurrence matrix construction and optimization:
The GloVe weighting function uses these parameters as: f(X_ij) = min(1, (X_ij / xMax)^alpha) where X_ij is the co-occurrence count between words i and j.
Install with Tessl CLI
npx tessl i tessl/maven-org-deeplearning4j--deeplearning4j-nlp