Tessl Tile for maven/org.deeplearning4j/deeplearning4j-nlp@0.9.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

bag-of-words.md dataset-loading.md document-embeddings.md glove.md index.md text-processing.md word-embeddings.md

docs/text-processing.md

0
# Text Processing and Tokenization
1

2
This module provides a comprehensive text preprocessing pipeline with support for multiple tokenization strategies, sentence boundary detection, document iteration patterns, and text preprocessing operations. It is the foundation for all text-based machine learning workflows in DeepLearning4J NLP.
3

4
## Capabilities
5

6
### Sentence Iteration
7

8
Iterator interface for sentence-level text processing with preprocessing and resource management capabilities.
9

10
```java { .api }
11
/**
12
 * Iterator interface for sentence-level text processing
13
 * Provides sentence boundary detection and preprocessing
14
 */
15
public interface SentenceIterator {
16
    
17
    /**
18
     * Get next sentence or null if no more sentences available
19
     * @return Next sentence string, or null if iterator is exhausted
20
     */
21
    String nextSentence();
22
    
23
    /**
24
     * Check if more sentences are available
25
     * @return true if more sentences exist, false otherwise
26
     */
27
    boolean hasNext();
28
    
29
    /**
30
     * Reset iterator to beginning of data source
31
     * Allows multiple passes over the same data
32
     */
33
    void reset();
34
    
35
    /**
36
     * Clean up resources and close input streams
37
     * Should be called when iteration is complete
38
     */
39
    void finish();
40
    
41
    /**
42
     * Get current sentence preprocessor
43
     * @return SentencePreProcessor instance or null if none set
44
     */
45
    SentencePreProcessor getPreProcessor();
46
    
47
    /**
48
     * Set sentence preprocessor for text cleaning
49
     * @param preProcessor SentencePreProcessor for sentence-level cleaning
50
     */
51
    void setPreProcessor(SentencePreProcessor preProcessor);
52
}
53

54
/**
55
 * Sentence preprocessing interface for text normalization
56
 */
57
public interface SentencePreProcessor {
58
    /**
59
     * Preprocess sentence text
60
     * @param sentence Input sentence string
61
     * @return Preprocessed sentence string
62
     */
63
    String preProcess(String sentence);
64
}
65
```
66

67
### Sentence Iterator Implementations
68

69
Concrete implementations of SentenceIterator for various data sources and processing patterns.
70

71
```java { .api }
72
/**
73
 * File-based sentence iterator reading from text files
74
 */
75
public class FileSentenceIterator implements SentenceIterator {
76
    // File-based sentence iteration with configurable encoding
77
}
78

79
/**
80
 * Line-based sentence iterator treating each line as a sentence
81
 */
82
public class LineSentenceIterator implements SentenceIterator {
83
    // Simple line-by-line sentence processing
84
}
85

86
/**
87
 * Collection-based sentence iterator for in-memory text collections
88
 */
89
public class CollectionSentenceIterator implements SentenceIterator {
90
    
91
    /**
92
     * Create iterator from string collection
93
     * @param sentences Collection of sentence strings
94
     */
95
    public CollectionSentenceIterator(Collection<String> sentences);
96
}
97

98
/**
99
 * Stream-based line iterator with configurable fetch size
100
 */
101
public class StreamLineIterator implements SentenceIterator {
102
    
103
    /**
104
     * Builder for StreamLineIterator configuration
105
     */
106
    public static class Builder {
107
        /**
108
         * Create builder with document iterator source
109
         * @param iterator DocumentIterator providing input documents
110
         */
111
        public Builder(DocumentIterator iterator);
112
        
113
        /**
114
         * Set fetch size for batched processing
115
         * @param fetchSize Number of lines to fetch at once
116
         * @return Builder instance for method chaining
117
         */
118
        public Builder setFetchSize(int fetchSize);
119
        
120
        /**
121
         * Build configured StreamLineIterator
122
         * @return StreamLineIterator instance
123
         */
124
        public StreamLineIterator build();
125
    }
126
}
127

128
/**
129
 * Thread-safe wrapper for sentence iterators
130
 */
131
public class SynchronizedSentenceIterator implements SentenceIterator {
132
    // Thread-safe sentence iteration wrapper
133
}
134

135
/**
136
 * Prefetching sentence iterator for improved performance
137
 */
138
public class PrefetchingSentenceIterator implements SentenceIterator {
139
    // Performance-optimized iterator with prefetching
140
}
141

142
/**
143
 * Multiple epochs sentence iterator for repeated data passes
144
 */
145
public class MutipleEpochsSentenceIterator implements SentenceIterator {
146
    // Iterator supporting multiple epochs over same data
147
}
148

149
/**
150
 * Aggregating iterator combining multiple sentence sources
151
 */
152
public class AggregatingSentenceIterator implements SentenceIterator {
153
    // Combines multiple SentenceIterator instances
154
}
155
```
156

157
### Label-Aware Sentence Processing
158

159
Specialized sentence iterators that handle labeled data for supervised learning tasks.
160

161
```java { .api }
162
/**
163
 * Label-aware sentence iterator interface for supervised learning
164
 */
165
public interface LabelAwareSentenceIterator extends SentenceIterator {
166
    /**
167
     * Get current sentence label
168
     * @return Label string for current sentence
169
     */
170
    String currentLabel();
171
    
172
    /**
173
     * Get labels source
174
     * @return LabelsSource containing available labels
175
     */
176
    LabelsSource getLabelsSource();
177
}
178

179
/**
180
 * File-based label-aware sentence iterator
181
 */
182
public class LabelAwareFileSentenceIterator implements LabelAwareSentenceIterator {
183
    // File-based iteration with label extraction from filenames or content
184
}
185

186
/**
187
 * List-based label-aware sentence iterator
188
 */
189
public class LabelAwareListSentenceIterator implements LabelAwareSentenceIterator {
190
    // In-memory iteration over labeled sentence collections
191
}
192
```
193

194
### Document Iteration
195

196
Document-level iteration interface for processing larger text units with stream-based access.
197

198
```java { .api }
199
/**
200
 * Document iterator interface for document-level text processing
201
 * Provides InputStream access to document content
202
 */
203
public interface DocumentIterator extends Serializable {
204
    
205
    /**
206
     * Get next document as input stream
207
     * @return InputStream for next document content
208
     */
209
    InputStream nextDocument();
210
    
211
    /**
212
     * Check if more documents are available
213
     * @return true if more documents exist, false otherwise
214
     */
215
    boolean hasNext();
216
    
217
    /**
218
     * Reset iterator to beginning of document collection
219
     */
220
    void reset();
221
}
222

223
/**
224
 * File-based document iterator for file system traversal
225
 */
226
public class FileDocumentIterator implements DocumentIterator {
227
    // Iterate over files in directory structure
228
}
229
```
230

231
### Label-Aware Document Processing
232

233
Document iterators with label information for supervised document processing tasks.
234

235
```java { .api }
236
/**
237
 * Label-aware document iterator interface
238
 */
239
public interface LabelAwareDocumentIterator extends DocumentIterator {
240
    /**
241
     * Get labels for current document
242
     * @return List of label strings for current document
243
     */
244
    List<String> getLabels();
245
    
246
    /**
247
     * Check if iterator has labels
248
     * @return true if labels are available, false otherwise
249
     */
250
    boolean hasLabels();
251
}
252

253
/**
254
 * General label-aware iterator interface
255
 */
256
public interface LabelAwareIterator {
257
    /**
258
     * Get next labeled document
259
     * @return LabelledDocument instance
260
     */
261
    LabelledDocument nextDocument();
262
    
263
    /**
264
     * Check if more labeled documents available
265
     * @return true if more documents exist
266
     */
267
    boolean hasNext();
268
    
269
    /**
270
     * Reset to beginning of labeled data
271
     */
272
    void reset();
273
    
274
    /**
275
     * Get labels source
276
     * @return LabelsSource containing available labels
277
     */
278
    LabelsSource getLabelsSource();
279
}
280

281
/**
282
 * Basic implementation of label-aware iterator
283
 */
284
public class BasicLabelAwareIterator implements LabelAwareIterator {
285
    
286
    /**
287
     * Create iterator from labeled document collection
288
     * @param documents Collection of LabelledDocument instances
289
     */
290
    public BasicLabelAwareIterator(Collection<LabelledDocument> documents);
291
}
292

293
/**
294
 * File-based label-aware iterator
295
 */
296
public class FileLabelAwareIterator implements LabelAwareIterator {
297
    // File-based iteration with label extraction
298
}
299

300
/**
301
 * Filename-based label-aware iterator
302
 */
303
public class FilenamesLabelAwareIterator implements LabelAwareIterator {
304
    // Extract labels from filenames during iteration
305
}
306

307
/**
308
 * Simple label-aware iterator implementation
309
 */
310
public class SimpleLabelAwareIterator implements LabelAwareIterator {
311
    // Simple labeled document iteration
312
}
313

314
/**
315
 * Asynchronous label-aware iterator for performance
316
 */
317
public class AsyncLabelAwareIterator implements LabelAwareIterator {
318
    // Asynchronous processing of labeled documents
319
}
320
```
321

322
### Tokenization Framework
323

324
Comprehensive tokenization system with pluggable tokenizers and preprocessing components.
325

326
```java { .api }
327
/**
328
 * Factory interface for creating tokenizers
329
 */
330
public interface TokenizerFactory {
331
    
332
    /**
333
     * Create tokenizer from string input
334
     * @param toTokenize String to be tokenized
335
     * @return Tokenizer instance for the input string
336
     */
337
    Tokenizer create(String toTokenize);
338
    
339
    /**
340
     * Create tokenizer from input stream
341
     * @param toTokenize InputStream to be tokenized
342
     * @return Tokenizer instance for the input stream
343
     */
344
    Tokenizer create(InputStream toTokenize);
345
    
346
    /**
347
     * Set token preprocessor for all created tokenizers
348
     * @param preProcessor TokenPreProcess instance for token cleaning
349
     */
350
    void setTokenPreProcessor(TokenPreProcess preProcessor);
351
    
352
    /**
353
     * Get current token preprocessor
354
     * @return TokenPreProcess instance or null if none set
355
     */
356
    TokenPreProcess getTokenPreProcessor();
357
}
358

359
/**
360
 * Default tokenizer factory implementation
361
 */
362
public class DefaultTokenizerFactory implements TokenizerFactory {
363
    // Standard tokenization with whitespace and punctuation handling
364
}
365

366
/**
367
 * N-gram tokenizer factory for n-gram generation
368
 */
369
public class NGramTokenizerFactory implements TokenizerFactory {
370
    // Creates n-gram tokens from input text
371
}
372

373
/**
374
 * Tokenizer interface for text tokenization
375
 */
376
public interface Tokenizer {
377
    /**
378
     * Get all tokens from input
379
     * @return List of token strings
380
     */
381
    List<String> getTokens();
382
    
383
    /**
384
     * Count total number of tokens
385
     * @return Number of tokens in input
386
     */
387
    int countTokens();
388
    
389
    /**
390
     * Get next token
391
     * @return Next token string or null if no more tokens
392
     */
393
    String nextToken();
394
    
395
    /**
396
     * Check if more tokens available
397
     * @return true if more tokens exist
398
     */
399
    boolean hasMoreTokens();
400
}
401

402
/**
403
 * Default tokenizer implementation
404
 */
405
public class DefaultTokenizer implements Tokenizer {
406
    // Standard tokenization with delimiter-based splitting
407
}
408

409
/**
410
 * Stream-based tokenizer for large inputs
411
 */
412
public class DefaultStreamTokenizer implements Tokenizer {
413
    // Memory-efficient tokenization of streams
414
}
415

416
/**
417
 * N-gram tokenizer for generating n-gram sequences
418
 */
419
public class NGramTokenizer implements Tokenizer {
420
    // Generates n-gram token sequences from input
421
}
422
```
423

424
### Token Preprocessing
425

426
Token-level preprocessing components for text normalization and cleaning.
427

428
```java { .api }
429
/**
430
 * Token preprocessing interface
431
 */
432
public interface TokenPreProcess {
433
    /**
434
     * Preprocess token string
435
     * @param token Input token string
436
     * @return Preprocessed token string
437
     */
438
    String preProcess(String token);
439
}
440

441
/**
442
 * Common token preprocessing operations
443
 */
444
public class CommonPreprocessor implements TokenPreProcess {
445
    // Standard preprocessing: lowercasing, punctuation removal, etc.
446
}
447

448
/**
449
 * Lowercase token preprocessor
450
 */
451
public class LowCasePreProcessor implements TokenPreProcess {
452
    // Converts tokens to lowercase
453
}
454

455
/**
456
 * String cleaning preprocessor
457
 */
458
public class StringCleaning implements TokenPreProcess {
459
    // Comprehensive string cleaning and normalization
460
}
461

462
/**
463
 * Word ending preprocessor
464
 */
465
public class EndingPreProcessor implements TokenPreProcess {
466
    // Processes word endings and suffixes
467
}
468
```
469

470
**Usage Examples:**
471

472
```java
473
import org.deeplearning4j.text.sentenceiterator.*;
474
import org.deeplearning4j.text.tokenization.tokenizerfactory.*;
475
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.*;
476

477
// Basic sentence iteration
478
Collection<String> sentences = Arrays.asList(
479
    "First sentence for processing.",
480
    "Second sentence with different content.",
481
    "Third sentence to complete the example."
482
);
483

484
SentenceIterator iterator = new CollectionSentenceIterator(sentences);
485
while (iterator.hasNext()) {
486
    String sentence = iterator.nextSentence();
487
    System.out.println("Processing: " + sentence);
488
}
489
iterator.finish();
490

491
// File-based sentence iteration
492
File textFile = new File("corpus.txt");
493
SentenceIterator fileIterator = new FileSentenceIterator(textFile);
494

495
// Configure tokenization with preprocessing
496
TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
497
tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
498

499
// Use tokenizer
500
Tokenizer tokenizer = tokenizerFactory.create("Sample text for tokenization!");
501
List<String> tokens = tokenizer.getTokens();
502
System.out.println("Tokens: " + tokens);
503

504
// Alternative preprocessing: lowercase tokens only
505
TokenizerFactory advancedFactory = new DefaultTokenizerFactory();
506
advancedFactory.setTokenPreProcessor(new LowCasePreProcessor());
507

508
// Label-aware document processing
509
Collection<LabelledDocument> labeledDocs = Arrays.asList(
510
    new LabelledDocument("Positive review text", "positive"),
511
    new LabelledDocument("Negative review text", "negative")
512
);
513

514
LabelAwareIterator labelIterator = new BasicLabelAwareIterator(labeledDocs);
515
while (labelIterator.hasNext()) {
516
    LabelledDocument doc = labelIterator.nextDocument();
517
    System.out.println("Document: " + doc.getContent());
518
    System.out.println("Labels: " + doc.getLabels());
519
}
520

521
// Stream-based document processing
522
DocumentIterator docIterator = new FileDocumentIterator(new File("documents/"));
523
SentenceIterator streamIterator = new StreamLineIterator.Builder(docIterator)
524
    .setFetchSize(100)
525
    .build();
526

527
// Multi-threaded sentence processing
528
SentenceIterator syncIterator = new SynchronizedSentenceIterator(iterator);
529
// Use syncIterator in multi-threaded environment (the wrapped iterator was fully consumed above, so call reset() before reuse)
530
```
531

532
### Iterator Conversion and Interoperability
533

534
Utility classes for converting between different iterator types and formats.
535

536
```java { .api }
537
/**
538
 * Converter between document iterator types
539
 */
540
public class DocumentIteratorConverter {
541
    // Converts between LabelAwareDocumentIterator and standard DocumentIterator
542
}
543

544
/**
545
 * Converter between sentence iterator types  
546
 */
547
public class SentenceIteratorConverter {
548
    // Converts between LabelAwareSentenceIterator and standard SentenceIterator
549
}
550
```

Version

Tile

Files

docs/text-processing.md

docs/text-processing.md