or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

bag-of-words.md, dataset-loading.md, document-embeddings.md, glove.md, index.md, text-processing.md, word-embeddings.md

docs/text-processing.md

0

# Text Processing and Tokenization

1

2

A comprehensive text preprocessing pipeline that supports multiple tokenization strategies, sentence boundary detection, document iteration patterns, and sentence- and token-level preprocessing. It provides the foundation for all text-based machine learning workflows in DeepLearning4J NLP.

3

4

## Capabilities

5

6

### Sentence Iteration

7

8

Iterator interface for sentence-level text processing with preprocessing and resource management capabilities.

9

10

```java { .api }

11

/**

12

* Iterator interface for sentence-level text processing

13

* Provides sentence boundary detection and preprocessing

14

*/

15

public interface SentenceIterator {

16

17

/**

18

* Get next sentence or null if no more sentences available

19

* @return Next sentence string, or null if iterator is exhausted

20

*/

21

String nextSentence();

22

23

/**

24

* Check if more sentences are available

25

* @return true if more sentences exist, false otherwise

26

*/

27

boolean hasNext();

28

29

/**

30

* Reset iterator to beginning of data source

31

* Allows multiple passes over the same data

32

*/

33

void reset();

34

35

/**

36

* Clean up resources and close input streams

37

* Should be called when iteration is complete

38

*/

39

void finish();

40

41

/**

42

* Get current sentence preprocessor

43

* @return SentencePreProcessor instance or null if none set

44

*/

45

SentencePreProcessor getPreProcessor();

46

47

/**

48

* Set sentence preprocessor for text cleaning

49

* @param preProcessor SentencePreProcessor for sentence-level cleaning

50

*/

51

void setPreProcessor(SentencePreProcessor preProcessor);

52

}

53

54

/**

55

* Sentence preprocessing interface for text normalization

56

*/

57

public interface SentencePreProcessor {

58

/**

59

* Preprocess sentence text

60

* @param sentence Input sentence string

61

* @return Preprocessed sentence string

62

*/

63

String preProcess(String sentence);

64

}

65

```

66

67

### Sentence Iterator Implementations

68

69

Concrete implementations of SentenceIterator for various data sources and processing patterns.

70

71

```java { .api }

72

/**

73

* File-based sentence iterator reading from text files

74

*/

75

public class FileSentenceIterator implements SentenceIterator {

76

// File-based sentence iteration with configurable encoding

77

}

78

79

/**

80

* Line-based sentence iterator treating each line as a sentence

81

*/

82

public class LineSentenceIterator implements SentenceIterator {

83

// Simple line-by-line sentence processing

84

}

85

86

/**

87

* Collection-based sentence iterator for in-memory text collections

88

*/

89

public class CollectionSentenceIterator implements SentenceIterator {

90

91

/**

92

* Create iterator from string collection

93

* @param sentences Collection of sentence strings

94

*/

95

public CollectionSentenceIterator(Collection<String> sentences);

96

}

97

98

/**

99

* Stream-based line iterator with configurable fetch size

100

*/

101

public class StreamLineIterator implements SentenceIterator {

102

103

/**

104

* Builder for StreamLineIterator configuration

105

*/

106

public static class Builder {

107

/**

108

* Create builder with document iterator source

109

* @param iterator DocumentIterator providing input documents

110

*/

111

public Builder(DocumentIterator iterator);

112

113

/**

114

* Set fetch size for batched processing

115

* @param fetchSize Number of lines to fetch at once

116

* @return Builder instance for method chaining

117

*/

118

public Builder setFetchSize(int fetchSize);

119

120

/**

121

* Build configured StreamLineIterator

122

* @return StreamLineIterator instance

123

*/

124

public StreamLineIterator build();

125

}

126

}

127

128

/**

129

* Thread-safe wrapper for sentence iterators

130

*/

131

public class SynchronizedSentenceIterator implements SentenceIterator {

132

// Thread-safe sentence iteration wrapper

133

}

134

135

/**

136

* Prefetching sentence iterator for improved performance

137

*/

138

public class PrefetchingSentenceIterator implements SentenceIterator {

139

// Performance-optimized iterator with prefetching

140

}

141

142

/**

143

* Multiple epochs sentence iterator for repeated data passes

144

*/

145

public class MutipleEpochsSentenceIterator implements SentenceIterator {

146

// Iterator supporting multiple epochs over same data

147

}

148

149

/**

150

* Aggregating iterator combining multiple sentence sources

151

*/

152

public class AggregatingSentenceIterator implements SentenceIterator {

153

// Combines multiple SentenceIterator instances

154

}

155

```

156

157

### Label-Aware Sentence Processing

158

159

Specialized sentence iterators that handle labeled data for supervised learning tasks.

160

161

```java { .api }

162

/**

163

* Label-aware sentence iterator interface for supervised learning

164

*/

165

public interface LabelAwareSentenceIterator extends SentenceIterator {

166

/**

167

* Get current sentence label

168

* @return Label string for current sentence

169

*/

170

String currentLabel();

171

172

/**

173

* Get labels source

174

* @return LabelsSource containing available labels

175

*/

176

LabelsSource getLabelsSource();

177

}

178

179

/**

180

* File-based label-aware sentence iterator

181

*/

182

public class LabelAwareFileSentenceIterator implements LabelAwareSentenceIterator {

183

// File-based iteration with label extraction from filenames or content

184

}

185

186

/**

187

* List-based label-aware sentence iterator

188

*/

189

public class LabelAwareListSentenceIterator implements LabelAwareSentenceIterator {

190

// In-memory iteration over labeled sentence collections

191

}

192

```

193

194

### Document Iteration

195

196

Document-level iteration interface for processing larger text units with stream-based access.

197

198

```java { .api }

199

/**

200

* Document iterator interface for document-level text processing

201

* Provides InputStream access to document content

202

*/

203

public interface DocumentIterator extends Serializable {

204

205

/**

206

* Get next document as input stream

207

* @return InputStream for next document content

208

*/

209

InputStream nextDocument();

210

211

/**

212

* Check if more documents are available

213

* @return true if more documents exist, false otherwise

214

*/

215

boolean hasNext();

216

217

/**

218

* Reset iterator to beginning of document collection

219

*/

220

void reset();

221

}

222

223

/**

224

* File-based document iterator for file system traversal

225

*/

226

public class FileDocumentIterator implements DocumentIterator {

227

// Iterate over files in directory structure

228

}

229

```

230

231

### Label-Aware Document Processing

232

233

Document iterators with label information for supervised document processing tasks.

234

235

```java { .api }

236

/**

237

* Label-aware document iterator interface

238

*/

239

public interface LabelAwareDocumentIterator extends DocumentIterator {

240

/**

241

* Get labels for current document

242

* @return List of label strings for current document

243

*/

244

List<String> getLabels();

245

246

/**

247

* Check if iterator has labels

248

* @return true if labels are available, false otherwise

249

*/

250

boolean hasLabels();

251

}

252

253

/**

254

* General label-aware iterator interface

255

*/

256

public interface LabelAwareIterator {

257

/**

258

* Get next labeled document

259

* @return LabelledDocument instance

260

*/

261

LabelledDocument nextDocument();

262

263

/**

264

* Check if more labeled documents available

265

* @return true if more documents exist

266

*/

267

boolean hasNext();

268

269

/**

270

* Reset to beginning of labeled data

271

*/

272

void reset();

273

274

/**

275

* Get labels source

276

* @return LabelsSource containing available labels

277

*/

278

LabelsSource getLabelsSource();

279

}

280

281

/**

282

* Basic implementation of label-aware iterator

283

*/

284

public class BasicLabelAwareIterator implements LabelAwareIterator {

285

286

/**

287

* Create iterator from labeled document collection

288

* @param documents Collection of LabelledDocument instances

289

*/

290

public BasicLabelAwareIterator(Collection<LabelledDocument> documents);

291

}

292

293

/**

294

* File-based label-aware iterator

295

*/

296

public class FileLabelAwareIterator implements LabelAwareIterator {

297

// File-based iteration with label extraction

298

}

299

300

/**

301

* Filename-based label-aware iterator

302

*/

303

public class FilenamesLabelAwareIterator implements LabelAwareIterator {

304

// Extract labels from filenames during iteration

305

}

306

307

/**

308

* Simple label-aware iterator implementation

309

*/

310

public class SimpleLabelAwareIterator implements LabelAwareIterator {

311

// Simple labeled document iteration

312

}

313

314

/**

315

* Asynchronous label-aware iterator for performance

316

*/

317

public class AsyncLabelAwareIterator implements LabelAwareIterator {

318

// Asynchronous processing of labeled documents

319

}

320

```

321

322

### Tokenization Framework

323

324

Comprehensive tokenization system with pluggable tokenizers and preprocessing components.

325

326

```java { .api }

327

/**

328

* Factory interface for creating tokenizers

329

*/

330

public interface TokenizerFactory {

331

332

/**

333

* Create tokenizer from string input

334

* @param toTokenize String to be tokenized

335

* @return Tokenizer instance for the input string

336

*/

337

Tokenizer create(String toTokenize);

338

339

/**

340

* Create tokenizer from input stream

341

* @param toTokenize InputStream to be tokenized

342

* @return Tokenizer instance for the input stream

343

*/

344

Tokenizer create(InputStream toTokenize);

345

346

/**

347

* Set token preprocessor for all created tokenizers

348

* @param preProcessor TokenPreProcess instance for token cleaning

349

*/

350

void setTokenPreProcessor(TokenPreProcess preProcessor);

351

352

/**

353

* Get current token preprocessor

354

* @return TokenPreProcess instance or null if none set

355

*/

356

TokenPreProcess getTokenPreProcessor();

357

}

358

359

/**

360

* Default tokenizer factory implementation

361

*/

362

public class DefaultTokenizerFactory implements TokenizerFactory {

363

// Standard tokenization with whitespace and punctuation handling

364

}

365

366

/**

367

* N-gram tokenizer factory for n-gram generation

368

*/

369

public class NGramTokenizerFactory implements TokenizerFactory {

370

// Creates n-gram tokens from input text

371

}

372

373

/**

374

* Tokenizer interface for text tokenization

375

*/

376

public interface Tokenizer {

377

/**

378

* Get all tokens from input

379

* @return List of token strings

380

*/

381

List<String> getTokens();

382

383

/**

384

* Count total number of tokens

385

* @return Number of tokens in input

386

*/

387

int countTokens();

388

389

/**

390

* Get next token

391

* @return Next token string or null if no more tokens

392

*/

393

String nextToken();

394

395

/**

396

* Check if more tokens available

397

* @return true if more tokens exist

398

*/

399

boolean hasMoreTokens();

400

}

401

402

/**

403

* Default tokenizer implementation

404

*/

405

public class DefaultTokenizer implements Tokenizer {

406

// Standard tokenization with delimiter-based splitting

407

}

408

409

/**

410

* Stream-based tokenizer for large inputs

411

*/

412

public class DefaultStreamTokenizer implements Tokenizer {

413

// Memory-efficient tokenization of streams

414

}

415

416

/**

417

* N-gram tokenizer for generating n-gram sequences

418

*/

419

public class NGramTokenizer implements Tokenizer {

420

// Generates n-gram token sequences from input

421

}

422

```

423

424

### Token Preprocessing

425

426

Token-level preprocessing components for text normalization and cleaning.

427

428

```java { .api }

429

/**

430

* Token preprocessing interface

431

*/

432

public interface TokenPreProcess {

433

/**

434

* Preprocess token string

435

* @param token Input token string

436

* @return Preprocessed token string

437

*/

438

String preProcess(String token);

439

}

440

441

/**

442

* Common token preprocessing operations

443

*/

444

public class CommonPreprocessor implements TokenPreProcess {

445

// Standard preprocessing: lowercasing, punctuation removal, etc.

446

}

447

448

/**

449

* Lowercase token preprocessor

450

*/

451

public class LowCasePreProcessor implements TokenPreProcess {

452

// Converts tokens to lowercase

453

}

454

455

/**

456

* String cleaning preprocessor

457

*/

458

public class StringCleaning implements TokenPreProcess {

459

// Comprehensive string cleaning and normalization

460

}

461

462

/**

463

* Word ending preprocessor

464

*/

465

public class EndingPreProcessor implements TokenPreProcess {

466

// Processes word endings and suffixes

467

}

468

```

469

470

**Usage Examples:**

471

472

```java

473

import org.deeplearning4j.text.sentenceiterator.*;

474

import org.deeplearning4j.text.tokenization.tokenizerfactory.*;

475

import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.*;

476

477

// Basic sentence iteration

478

Collection<String> sentences = Arrays.asList(

479

"First sentence for processing.",

480

"Second sentence with different content.",

481

"Third sentence to complete the example."

482

);

483

484

SentenceIterator iterator = new CollectionSentenceIterator(sentences);

485

while (iterator.hasNext()) {

486

String sentence = iterator.nextSentence();

487

System.out.println("Processing: " + sentence);

488

}

489

iterator.finish();

490

491

// File-based sentence iteration

492

File textFile = new File("corpus.txt");

493

SentenceIterator fileIterator = new FileSentenceIterator(textFile);

494

495

// Configure tokenization with preprocessing

496

TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();

497

tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

498

499

// Use tokenizer

500

Tokenizer tokenizer = tokenizerFactory.create("Sample text for tokenization!");

501

List<String> tokens = tokenizer.getTokens();

502

System.out.println("Tokens: " + tokens);

503

504

// Alternative factory whose token preprocessor only lowercases tokens

505

TokenizerFactory advancedFactory = new DefaultTokenizerFactory();

506

advancedFactory.setTokenPreProcessor(new LowCasePreProcessor());

507

508

// Label-aware document processing

509

Collection<LabelledDocument> labeledDocs = Arrays.asList(

510

new LabelledDocument("Positive review text", "positive"),

511

new LabelledDocument("Negative review text", "negative")

512

);

513

514

LabelAwareIterator labelIterator = new BasicLabelAwareIterator(labeledDocs);

515

while (labelIterator.hasNext()) {

516

LabelledDocument doc = labelIterator.nextDocument();

517

System.out.println("Document: " + doc.getContent());

518

System.out.println("Labels: " + doc.getLabels());

519

}

520

521

// Stream-based document processing

522

DocumentIterator docIterator = new FileDocumentIterator(new File("documents/"));

523

SentenceIterator streamIterator = new StreamLineIterator.Builder(docIterator)

524

.setFetchSize(100)

525

.build();

526

527

// Multi-threaded sentence processing

528

SentenceIterator syncIterator = new SynchronizedSentenceIterator(iterator);

529

// Use syncIterator in multi-threaded environment (call reset() first: `iterator` was already consumed and finished above)

530

```

531

532

### Iterator Conversion and Interoperability

533

534

Utility classes for converting between different iterator types and formats.

535

536

```java { .api }

537

/**

538

* Converter between document iterator types

539

*/

540

public class DocumentIteratorConverter {

541

// Converts between LabelAwareDocumentIterator and standard DocumentIterator

542

}

543

544

/**

545

* Converter between sentence iterator types

546

*/

547

public class SentenceIteratorConverter {

548

// Converts between LabelAwareSentenceIterator and standard SentenceIterator

549

}

550

```