DeepLearning4J Natural Language Processing module providing word embeddings, document classification, and text processing capabilities for neural network applications.
—
Comprehensive text preprocessing pipeline with support for multiple tokenization strategies, sentence boundary detection, document iteration patterns, and text preprocessing operations. Provides the foundation for all text-based machine learning workflows in DeepLearning4J NLP.
Iterator interface for sentence-level text processing with preprocessing and resource management capabilities.
/**
* Iterator interface for sentence-level text processing
* Provides sentence boundary detection and preprocessing
*/
public interface SentenceIterator {
/**
* Get next sentence or null if no more sentences available
* @return Next sentence string, or null if iterator is exhausted
*/
String nextSentence();
/**
* Check if more sentences are available
* @return true if more sentences exist, false otherwise
*/
boolean hasNext();
/**
* Reset iterator to beginning of data source
* Allows multiple passes over the same data
*/
void reset();
/**
* Clean up resources and close input streams
* Should be called when iteration is complete
*/
void finish();
/**
* Get current sentence preprocessor
* @return SentencePreProcessor instance or null if none set
*/
SentencePreProcessor getPreProcessor();
/**
* Set sentence preprocessor for text cleaning
* @param preProcessor SentencePreProcessor for sentence-level cleaning
*/
void setPreProcessor(SentencePreProcessor preProcessor);
}
/**
* Sentence preprocessing interface for text normalization
*/
public interface SentencePreProcessor {
/**
* Preprocess sentence text
* @param sentence Input sentence string
* @return Preprocessed sentence string
*/
String preProcess(String sentence);
}

Concrete implementations of SentenceIterator for various data sources and processing patterns.
/**
* File-based sentence iterator reading from text files
*/
public class FileSentenceIterator implements SentenceIterator {
// File-based sentence iteration with configurable encoding
}
/**
* Line-based sentence iterator treating each line as a sentence
*/
public class LineSentenceIterator implements SentenceIterator {
// Simple line-by-line sentence processing
}
/**
* Collection-based sentence iterator for in-memory text collections
*/
public class CollectionSentenceIterator implements SentenceIterator {
/**
* Create iterator from string collection
* @param sentences Collection of sentence strings
*/
public CollectionSentenceIterator(Collection<String> sentences);
}
/**
* Stream-based line iterator with configurable fetch size
*/
public class StreamLineIterator implements SentenceIterator {
/**
* Builder for StreamLineIterator configuration
*/
public static class Builder {
/**
* Create builder with document iterator source
* @param iterator DocumentIterator providing input documents
*/
public Builder(DocumentIterator iterator);
/**
* Set fetch size for batched processing
* @param fetchSize Number of lines to fetch at once
* @return Builder instance for method chaining
*/
public Builder setFetchSize(int fetchSize);
/**
* Build configured StreamLineIterator
* @return StreamLineIterator instance
*/
public StreamLineIterator build();
}
}
/**
* Thread-safe wrapper for sentence iterators
*/
public class SynchronizedSentenceIterator implements SentenceIterator {
// Thread-safe sentence iteration wrapper
}
/**
* Prefetching sentence iterator for improved performance
*/
public class PrefetchingSentenceIterator implements SentenceIterator {
// Performance-optimized iterator with prefetching
}
/**
* Multiple epochs sentence iterator for repeated data passes.
* Note: the class name "MutipleEpochsSentenceIterator" retains the library's
* original misspelling ("Mutiple") — do not "correct" it when importing.
*/
public class MutipleEpochsSentenceIterator implements SentenceIterator {
// Iterator supporting multiple epochs over same data
}
/**
* Aggregating iterator combining multiple sentence sources
*/
public class AggregatingSentenceIterator implements SentenceIterator {
// Combines multiple SentenceIterator instances
}

Specialized sentence iterators that handle labeled data for supervised learning tasks.
/**
* Label-aware sentence iterator interface for supervised learning
*/
public interface LabelAwareSentenceIterator extends SentenceIterator {
/**
* Get current sentence label
* @return Label string for current sentence
*/
String currentLabel();
/**
* Get labels source
* @return LabelsSource containing available labels
*/
LabelsSource getLabelsSource();
}
/**
* File-based label-aware sentence iterator
*/
public class LabelAwareFileSentenceIterator implements LabelAwareSentenceIterator {
// File-based iteration with label extraction from filenames or content
}
/**
* List-based label-aware sentence iterator
*/
public class LabelAwareListSentenceIterator implements LabelAwareSentenceIterator {
// In-memory iteration over labeled sentence collections
}

Document-level iteration interface for processing larger text units with stream-based access.
/**
* Document iterator interface for document-level text processing
* Provides InputStream access to document content
*/
public interface DocumentIterator extends Serializable {
/**
* Get next document as input stream
* @return InputStream for next document content
*/
InputStream nextDocument();
/**
* Check if more documents are available
* @return true if more documents exist, false otherwise
*/
boolean hasNext();
/**
* Reset iterator to beginning of document collection
*/
void reset();
}
/**
* File-based document iterator for file system traversal
*/
public class FileDocumentIterator implements DocumentIterator {
// Iterate over files in directory structure
}

Document iterators with label information for supervised document processing tasks.
/**
* Label-aware document iterator interface
*/
public interface LabelAwareDocumentIterator extends DocumentIterator {
/**
* Get labels for current document
* @return List of label strings for current document
*/
List<String> getLabels();
/**
* Check if iterator has labels
* @return true if labels are available, false otherwise
*/
boolean hasLabels();
}
/**
* General label-aware iterator interface
*/
public interface LabelAwareIterator {
/**
* Get next labeled document
* @return LabelledDocument instance
*/
LabelledDocument nextDocument();
/**
* Check if more labeled documents available
* @return true if more documents exist
*/
boolean hasNext();
/**
* Reset to beginning of labeled data
*/
void reset();
/**
* Get labels source
* @return LabelsSource containing available labels
*/
LabelsSource getLabelsSource();
}
/**
* Basic implementation of label-aware iterator
*/
public class BasicLabelAwareIterator implements LabelAwareIterator {
/**
* Create iterator from labeled document collection
* @param documents Collection of LabelledDocument instances
*/
public BasicLabelAwareIterator(Collection<LabelledDocument> documents);
}
/**
* File-based label-aware iterator
*/
public class FileLabelAwareIterator implements LabelAwareIterator {
// File-based iteration with label extraction
}
/**
* Filename-based label-aware iterator
*/
public class FilenamesLabelAwareIterator implements LabelAwareIterator {
// Extract labels from filenames during iteration
}
/**
* Simple label-aware iterator implementation
*/
public class SimpleLabelAwareIterator implements LabelAwareIterator {
// Simple labeled document iteration
}
/**
* Asynchronous label-aware iterator for performance
*/
public class AsyncLabelAwareIterator implements LabelAwareIterator {
// Asynchronous processing of labeled documents
}

Comprehensive tokenization system with pluggable tokenizers and preprocessing components.
/**
* Factory interface for creating tokenizers
*/
public interface TokenizerFactory {
/**
* Create tokenizer from string input
* @param toTokenize String to be tokenized
* @return Tokenizer instance for the input string
*/
Tokenizer create(String toTokenize);
/**
* Create tokenizer from input stream
* @param toTokenize InputStream to be tokenized
* @return Tokenizer instance for the input stream
*/
Tokenizer create(InputStream toTokenize);
/**
* Set token preprocessor for all created tokenizers
* @param preProcessor TokenPreProcess instance for token cleaning
*/
void setTokenPreProcessor(TokenPreProcess preProcessor);
/**
* Get current token preprocessor
* @return TokenPreProcess instance or null if none set
*/
TokenPreProcess getTokenPreProcessor();
}
/**
* Default tokenizer factory implementation
*/
public class DefaultTokenizerFactory implements TokenizerFactory {
// Standard tokenization with whitespace and punctuation handling
}
/**
* N-gram tokenizer factory for n-gram generation
*/
public class NGramTokenizerFactory implements TokenizerFactory {
// Creates n-gram tokens from input text
}
/**
* Tokenizer interface for text tokenization
*/
public interface Tokenizer {
/**
* Get all tokens from input
* @return List of token strings
*/
List<String> getTokens();
/**
* Count total number of tokens
* @return Number of tokens in input
*/
int countTokens();
/**
* Get next token
* @return Next token string or null if no more tokens
*/
String nextToken();
/**
* Check if more tokens available
* @return true if more tokens exist
*/
boolean hasMoreTokens();
}
/**
* Default tokenizer implementation
*/
public class DefaultTokenizer implements Tokenizer {
// Standard tokenization with delimiter-based splitting
}
/**
* Stream-based tokenizer for large inputs
*/
public class DefaultStreamTokenizer implements Tokenizer {
// Memory-efficient tokenization of streams
}
/**
* N-gram tokenizer for generating n-gram sequences
*/
public class NGramTokenizer implements Tokenizer {
// Generates n-gram token sequences from input
}

Token-level preprocessing components for text normalization and cleaning.
/**
* Token preprocessing interface
*/
public interface TokenPreProcess {
/**
* Preprocess token string
* @param token Input token string
* @return Preprocessed token string
*/
String preProcess(String token);
}
/**
* Common token preprocessing operations
*/
public class CommonPreprocessor implements TokenPreProcess {
// Standard preprocessing: lowercasing, punctuation removal, etc.
}
/**
* Lowercase token preprocessor
*/
public class LowCasePreProcessor implements TokenPreProcess {
// Converts tokens to lowercase
}
/**
* String cleaning preprocessor
*/
public class StringCleaning implements TokenPreProcess {
// Comprehensive string cleaning and normalization
}
/**
* Word ending preprocessor
*/
public class EndingPreProcessor implements TokenPreProcess {
// Processes word endings and suffixes
}

Usage Examples:
import org.deeplearning4j.text.sentenceiterator.*;
import org.deeplearning4j.text.tokenization.tokenizerfactory.*;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.*;
// Basic sentence iteration
Collection<String> sentences = Arrays.asList(
"First sentence for processing.",
"Second sentence with different content.",
"Third sentence to complete the example."
);
SentenceIterator iterator = new CollectionSentenceIterator(sentences);
while (iterator.hasNext()) {
String sentence = iterator.nextSentence();
System.out.println("Processing: " + sentence);
}
iterator.finish();
// File-based sentence iteration
File textFile = new File("corpus.txt");
SentenceIterator fileIterator = new FileSentenceIterator(textFile);
// Configure tokenization with preprocessing
TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
// Use tokenizer
Tokenizer tokenizer = tokenizerFactory.create("Sample text for tokenization!");
List<String> tokens = tokenizer.getTokens();
System.out.println("Tokens: " + tokens);
// Advanced preprocessing chain
TokenizerFactory advancedFactory = new DefaultTokenizerFactory();
advancedFactory.setTokenPreProcessor(new LowCasePreProcessor());
// Label-aware document processing
Collection<LabelledDocument> labeledDocs = Arrays.asList(
new LabelledDocument("Positive review text", "positive"),
new LabelledDocument("Negative review text", "negative")
);
LabelAwareIterator labelIterator = new BasicLabelAwareIterator(labeledDocs);
while (labelIterator.hasNext()) {
LabelledDocument doc = labelIterator.nextDocument();
System.out.println("Document: " + doc.getContent());
System.out.println("Labels: " + doc.getLabels());
}
// Stream-based document processing
DocumentIterator docIterator = new FileDocumentIterator(new File("documents/"));
SentenceIterator streamIterator = new StreamLineIterator.Builder(docIterator)
.setFetchSize(100)
.build();
// Multi-threaded sentence processing
// Note: wrap a fresh iterator — `iterator` was already exhausted and closed via finish() above
SentenceIterator syncIterator = new SynchronizedSentenceIterator(new CollectionSentenceIterator(sentences));
// Use syncIterator in a multi-threaded environment

Utility classes for converting between different iterator types and formats.
/**
* Converter between document iterator types
*/
public class DocumentIteratorConverter {
// Converts between LabelAwareDocumentIterator and standard DocumentIterator
}
/**
* Converter between sentence iterator types
*/
public class SentenceIteratorConverter {
// Converts between LabelAwareSentenceIterator and standard SentenceIterator
}

Install with Tessl CLI
npx tessl i tessl/maven-org-deeplearning4j--deeplearning4j-nlp