DeepLearning4J Natural Language Processing module providing word embeddings, document classification, and text processing capabilities for neural network applications.
npx @tessl/cli install tessl/maven-org-deeplearning4j--deeplearning4j-nlp@0.9.1

DeepLearning4J NLP is a comprehensive natural language processing library for Java that provides state-of-the-art word embeddings, document classification, and text processing capabilities. Built on the DeepLearning4J neural network framework, it offers scalable implementations of Word2Vec, GloVe, ParagraphVectors (Doc2Vec), and extensive text preprocessing utilities with support for parallel processing and production deployment.
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-nlp</artifactId>
<version>0.9.1</version>
</dependency>

// Core word embedding models
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.models.glove.Glove;
import org.deeplearning4j.models.paragraphvectors.ParagraphVectors;
// Text processing utilities
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.documentiterator.DocumentIterator;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
// Vocabulary and word representations
import org.deeplearning4j.models.word2vec.VocabWord;
import org.deeplearning4j.models.word2vec.wordstore.VocabCache;

import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import java.util.Arrays;
import java.util.Collection;
// Prepare training data
Collection<String> sentences = Arrays.asList(
"The quick brown fox jumps over the lazy dog",
"Natural language processing with deep learning",
"Word embeddings capture semantic relationships"
);
// Train Word2Vec model
Word2Vec vec = new Word2Vec.Builder()
.minWordFrequency(1)
.iterations(5)
.layerSize(100)
.seed(42)
.windowSize(5)
.iterate(new CollectionSentenceIterator(sentences))
.tokenizerFactory(new DefaultTokenizerFactory())
.build();
vec.fit();
// Use the trained model
double similarity = vec.similarity("quick", "fast");
Collection<String> nearestWords = vec.wordsNearest("fox", 5);
System.out.println("Similarity: " + similarity);
System.out.println("Nearest to 'fox': " + nearestWords);

DeepLearning4J NLP is built around several key architectural components:
Dense vector representations of words trained using skip-gram or CBOW algorithms. Captures semantic and syntactic relationships between words in high-dimensional vector space.
public class Word2Vec extends SequenceVectors<VocabWord> {
public void setTokenizerFactory(TokenizerFactory tokenizerFactory);
public void setSentenceIterator(SentenceIterator iterator);
public void setSequenceIterator(SequenceIterator<VocabWord> iterator);
public static class Builder extends SequenceVectors.Builder<VocabWord> {
public Builder iterate(SentenceIterator iterator);
public Builder iterate(DocumentIterator iterator);
public Builder tokenizerFactory(TokenizerFactory tokenizerFactory);
public Builder batchSize(int batchSize);
public Builder iterations(int iterations);
public Builder epochs(int numEpochs);
public Builder layerSize(int layerSize);
public Builder learningRate(double learningRate);
public Builder minWordFrequency(int minWordFrequency);
public Builder windowSize(int windowSize);
public Builder seed(long randomSeed);
public Builder workers(int numWorkers);
public Word2Vec build();
}
}

Matrix factorization-based word embeddings that combine global statistical information with local context windows. Efficiently captures word co-occurrence statistics across large corpora.
public class Glove extends SequenceVectors<VocabWord> {
public static class Builder extends SequenceVectors.Builder<VocabWord> {
public Builder tokenizerFactory(TokenizerFactory tokenizerFactory);
public Builder xMax(double xMax);
public Builder symmetric(boolean reallySymmetric);
public Builder shuffle(boolean reallyShuffle);
public Builder alpha(double alpha);
public Builder iterate(SentenceIterator iterator);
public Builder iterate(DocumentIterator iterator);
public Builder maxMemory(int gbytes);
public Glove build();
}
}

Document-level embeddings (Doc2Vec) that create vector representations for entire documents, sentences, or paragraphs. Enables document similarity, classification, and clustering tasks.
public class ParagraphVectors extends Word2Vec {
public String predict(String rawText);
public String predict(LabelledDocument document);
public Collection<String> predictSeveral(String rawText, int limit);
public INDArray inferVector(String text);
public INDArray inferVector(LabelledDocument document);
public Future<INDArray> inferVectorBatched(String document);
public Collection<String> nearestLabels(String rawText, int topN);
public double similarityToLabel(String rawText, String label);
public static class Builder extends Word2Vec.Builder {
public Builder useExistingWordVectors(WordVectors vec);
public Builder trainWordVectors(boolean trainElements);
public Builder labelsSource(LabelsSource source);
public Builder iterate(LabelAwareDocumentIterator iterator);
public Builder iterate(LabelAwareSentenceIterator iterator);
public ParagraphVectors build();
}
}

Comprehensive text preprocessing pipeline with support for multiple tokenization strategies, sentence boundary detection, and document iteration patterns.
public interface SentenceIterator {
String nextSentence();
boolean hasNext();
void reset();
void finish();
SentencePreProcessor getPreProcessor();
void setPreProcessor(SentencePreProcessor preProcessor);
}
public interface TokenizerFactory {
Tokenizer create(String toTokenize);
Tokenizer create(InputStream toTokenize);
void setTokenPreProcessor(TokenPreProcess preProcessor);
TokenPreProcess getTokenPreProcessor();
}
public interface DocumentIterator extends Serializable {
InputStream nextDocument();
boolean hasNext();
void reset();
}

Traditional text vectorization methods including TF-IDF and bag-of-words representations for document classification and information retrieval tasks.
public interface TextVectorizer {
// Vectorization interface for text processing
}
public class BagOfWordsVectorizer implements TextVectorizer {
// Bag of words implementation
}
public class TfidfVectorizer implements TextVectorizer {
// TF-IDF implementation
}

Pre-built dataset loaders and iterators for common NLP datasets and data formats, designed for seamless integration with neural network training pipelines.
public class CnnSentenceDataSetIterator {
// CNN sentence dataset iteration
}
public class ReutersNewsGroupsDataSetIterator {
// Reuters news groups dataset
}
public interface LabeledSentenceProvider {
// Labeled sentence provision interface
}

Utilities for loading and saving Word2Vec models, including Google format compatibility and model serialization across different formats.
public class WordVectorSerializer {
public static Word2Vec loadGoogleModel(File modelFile, boolean binary);
public static Word2Vec loadGoogleModel(File modelFile, boolean binary, boolean lineBreaks);
public static WordVectors loadGoogleModelNonNormalized(File modelFile, boolean binary, boolean lineBreaks);
public static void writeWord2VecModel(WordVectors vectors, File file);
public static void writeTsneWords(Word2Vec vec, List<String> labels, String path, INDArray tsne);
public static void writeWordVectors(WordVectors vectors, String path);
public static WordVectors loadTxtVectors(File vectorsFile);
}

Core framework for implementing sequence-based embedding algorithms, providing the foundation for Word2Vec, GloVe, and ParagraphVectors implementations.
public abstract class SequenceVectors<T extends SequenceElement> implements WordVectors {
public void fit();
public double similarity(String word1, String word2);
public Collection<String> wordsNearest(String word, int n);
public INDArray getWordVector(String word);
public boolean hasWord(String word);
public static abstract class Builder<T extends SequenceElement> {
public Builder<T> minWordFrequency(int minWordFrequency);
public Builder<T> iterations(int iterations);
public Builder<T> layerSize(int layerSize);
public Builder<T> learningRate(double learningRate);
public Builder<T> windowSize(int windowSize);
public Builder<T> seed(long seed);
public Builder<T> workers(int workers);
}
}

Graph-based node embeddings using random walks to learn vector representations of nodes in networks and graphs.
public class Node2Vec<V extends SequenceElement, E extends Number> extends SequenceVectors<V> {
public static class Builder<V extends SequenceElement, E extends Number> extends SequenceVectors.Builder<V> {
public Builder<V, E> setGraphHuffman(GraphHuffman huffman);
public Builder<V, E> setWalkLength(int walkLength);
public Builder<V, E> setNumWalks(int numWalks);
public Builder<V, E> setP(double p);
public Builder<V, E> setQ(double q);
public Node2Vec<V, E> build();
}
}

public class VocabWord extends SequenceElement {
public VocabWord(double wordFrequency, String word);
public String getWord();
public boolean isLabel();
public int getIndex();
}
public interface VocabCache<T extends SequenceElement> {
boolean containsWord(String word);
T wordFor(String word);
int numWords();
Collection<T> vocabWords();
}
public interface WeightLookupTable<T extends SequenceElement> {
INDArray getWeights();
INDArray vector(String word);
}
public class LabelledDocument {
public String getContent();
public String getId();
public List<String> getLabels();
public List<VocabWord> getReferencedContent();
}
public class LabelsSource {
public LabelsSource();
public LabelsSource(List<String> labels);
public List<String> getLabels();
}
public abstract class SequenceElement implements Serializable {
public abstract String getLabel();
public abstract void setIndex(int index);
public abstract int getIndex();
public abstract long getElementFrequency();
public abstract void incrementElementFrequency();
public abstract void incrementElementFrequency(int by);
}
public interface SequenceIterator<T extends SequenceElement> {
Sequence<T> nextSequence();
boolean hasNext();
void reset();
SequenceIterator<T> getNewInstance();
}
public interface WordVectors {
double[] getWordVector(String word);
INDArray getWordVectorMatrix(String word);
double similarity(String word1, String word2);
Collection<String> wordsNearest(String word, int n);
boolean hasWord(String word);
Collection<String> vocab();
long vocabPackage();
}