or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.md distance.md index.md ngrams-tfidf.md phonetics.md pos-tagging.md sentiment.md text-processing.md transliterators.md utilities.md wordnet.md

docs/text-processing.md

0

# Text Processing

1

2

Comprehensive text preprocessing tools for multiple languages, including sentence analysis, tokenization, stemming, normalization, and inflection. These are essential building blocks for preparing raw text data for natural language processing tasks.

3

4

## Capabilities

5

6

### Text Analysis

7

8

#### Sentence Analyzer

9

10

Analyzes sentence structure and provides readability metrics.

11

12

```javascript { .api }

13

/**

14

* Sentence analyzer for readability and complexity metrics

15

*/

16

class SentenceAnalyzer {

17

/** Analyze sentence structure and readability */

18

static analyze(sentence: string): {

19

numWords: number;

20

numChars: number;

21

averageWordsPerSentence: number;

22

numSentences: number;

23

};

24

}

25

```

26

27

**Example usage:**

28

```javascript

29

const { SentenceAnalyzer } = require('natural');

30

31

const analysis = SentenceAnalyzer.analyze('This is a sample sentence.');

32

console.log(analysis);

33

// { numWords: 5, numChars: 25, averageWordsPerSentence: 5, numSentences: 1 }

34

```

35

36

### Tokenization

37

38

Breaking text into individual tokens (words, punctuation, etc.) using various strategies.

39

40

#### Word Tokenizer

41

42

Basic word tokenization that splits on whitespace and punctuation.

43

44

```javascript { .api }

45

/**

46

* Basic word tokenizer

47

*/

48

class WordTokenizer {

49

/** Tokenize text into words */

50

static tokenize(text: string): string[];

51

}

52

```

53

54

#### Regular Expression Tokenizer

55

56

Flexible tokenizer using regular expressions for custom tokenization patterns.

57

58

```javascript { .api }

59

/**

60

* Regular expression-based tokenizer

61

* @param options - Tokenization options including pattern

62

*/

63

class RegexpTokenizer {

64

constructor(options?: {pattern?: RegExp, discardEmpty?: boolean});

65

66

/** Tokenize text using regex pattern */

67

tokenize(text: string): string[];

68

}

69

70

/**

71

* Orthography-aware tokenizer

72

*/

73

class OrthographyTokenizer extends RegexpTokenizer {

74

constructor();

75

}

76

77

/**

78

* Word and punctuation tokenizer

79

*/

80

class WordPunctTokenizer extends RegexpTokenizer {

81

constructor();

82

}

83

```

84

85

#### Aggressive Tokenizers

86

87

Language-specific aggressive tokenizers that handle language-specific tokenization rules.

88

89

```javascript { .api }

90

/**

91

* Base aggressive tokenizer

92

*/

93

class AggressiveTokenizer {

94

constructor();

95

tokenize(text: string): string[];

96

}

97

98

// Language-specific aggressive tokenizers

99

class AggressiveTokenizerNl extends AggressiveTokenizer {} // Dutch

100

class AggressiveTokenizerFr extends AggressiveTokenizer {} // French

101

class AggressiveTokenizerDe extends AggressiveTokenizer {} // German

102

class AggressiveTokenizerEs extends AggressiveTokenizer {} // Spanish

103

class AggressiveTokenizerIt extends AggressiveTokenizer {} // Italian

104

class AggressiveTokenizerRu extends AggressiveTokenizer {} // Russian

105

class AggressiveTokenizerPt extends AggressiveTokenizer {} // Portuguese

106

class AggressiveTokenizerNo extends AggressiveTokenizer {} // Norwegian

107

class AggressiveTokenizerSv extends AggressiveTokenizer {} // Swedish

108

class AggressiveTokenizerPl extends AggressiveTokenizer {} // Polish

109

class AggressiveTokenizerVi extends AggressiveTokenizer {} // Vietnamese

110

class AggressiveTokenizerFa extends AggressiveTokenizer {} // Persian/Farsi

111

class AggressiveTokenizerId extends AggressiveTokenizer {} // Indonesian

112

class AggressiveTokenizerHi extends AggressiveTokenizer {} // Hindi

113

class AggressiveTokenizerUk extends AggressiveTokenizer {} // Ukrainian

114

```

115

116

#### Other Tokenizers

117

118

```javascript { .api }

119

/**

120

* Case-preserving tokenizer

121

*/

122

class CaseTokenizer {

123

constructor();

124

tokenize(text: string): string[];

125

}

126

127

/**

128

* Penn Treebank word tokenizer

129

*/

130

class TreebankWordTokenizer {

131

constructor();

132

tokenize(text: string): string[];

133

}

134

135

/**

136

* Japanese tokenizer

137

*/

138

class TokenizerJa {

139

constructor();

140

tokenize(text: string): string[];

141

}

142

143

/**

144

* Sentence tokenizer

145

*/

146

class SentenceTokenizer {

147

constructor();

148

tokenize(text: string): string[];

149

}

150

```

151

152

**Usage Examples:**

153

154

```javascript

155

const natural = require('natural');

156

157

// Basic word tokenization

158

const tokens = natural.WordTokenizer.tokenize('Hello world, how are you?');

159

console.log(tokens); // ['Hello', 'world', 'how', 'are', 'you']

160

161

// Regular expression tokenizer

162

const regexTokenizer = new natural.RegexpTokenizer({pattern: /\s+/, discardEmpty: true});

163

const regexTokens = regexTokenizer.tokenize('Hello world');

164

console.log(regexTokens); // ['Hello', 'world']

165

166

// Aggressive tokenizer

167

const aggressive = new natural.AggressiveTokenizer();

168

const aggressiveTokens = aggressive.tokenize("Don't you think?");

169

console.log(aggressiveTokens); // ['Don', 't', 'you', 'think']

170

171

// Language-specific tokenizer

172

const frenchTokenizer = new natural.AggressiveTokenizerFr();

173

const frenchTokens = frenchTokenizer.tokenize("Bonjour, comment allez-vous?");

174

175

// Sentence tokenizer

176

const sentenceTokenizer = new natural.SentenceTokenizer();

177

const sentences = sentenceTokenizer.tokenize('Hello world. How are you? Fine, thanks.');

178

console.log(sentences); // ['Hello world.', 'How are you?', 'Fine, thanks.']

179

```

180

181

### Stemming

182

183

Reducing words to their root form by removing suffixes and prefixes.

184

185

#### Porter Stemmer

186

187

The classic Porter stemming algorithm with support for multiple languages.

188

189

```javascript { .api }

190

/**

191

* Porter stemmer for English

192

*/

193

class PorterStemmer {

194

/** Stem a single word */

195

static stem(word: string): string;

196

197

/** Stem an array of tokens */

198

static stemTokens(tokens: string[]): string[];

199

}

200

201

// Language-specific Porter stemmers

202

class PorterStemmerFr { static stem(word: string): string; } // French

203

class PorterStemmerDe { static stem(word: string): string; } // German

204

class PorterStemmerEs { static stem(word: string): string; } // Spanish

205

class PorterStemmerIt { static stem(word: string): string; } // Italian

206

class PorterStemmerRu { static stem(word: string): string; } // Russian

207

class PorterStemmerPt { static stem(word: string): string; } // Portuguese

208

class PorterStemmerNo { static stem(word: string): string; } // Norwegian

209

class PorterStemmerSv { static stem(word: string): string; } // Swedish

210

class PorterStemmerNl { static stem(word: string): string; } // Dutch

211

class PorterStemmerFa { static stem(word: string): string; } // Persian/Farsi

212

class PorterStemmerUk { static stem(word: string): string; } // Ukrainian

213

```

214

215

#### Other Stemmers

216

217

```javascript { .api }

218

/**

219

* Lancaster stemmer (more aggressive than Porter)

220

*/

221

class LancasterStemmer {

222

static stem(word: string): string;

223

}

224

225

/**

226

* Japanese stemmer

227

*/

228

class StemmerJa {

229

static stem(word: string): string;

230

}

231

232

/**

233

* Indonesian stemmer

234

*/

235

class StemmerId {

236

static stem(word: string): string;

237

}

238

239

/**

240

* French Carry stemmer

241

*/

242

class CarryStemmerFr {

243

static stem(word: string): string;

244

}

245

246

/**

247

* Token class for advanced stemming operations and morphological analysis

248

* Provides detailed control over stemming processes with region-based operations

249

*/

250

class Token {

251

constructor(string: string);

252

253

/** Set vowels for this token language */

254

usingVowels(vowels: string | string[]): Token;

255

256

/** Mark a region in the token by index or callback */

257

markRegion(region: string, args: number | any[], callback?: Function, context?: object): Token;

258

259

/** Replace all instances of a string with another */

260

replaceAll(find: string, replace: string): Token;

261

262

/** Replace suffix if it exists within specified region */

263

replaceSuffixInRegion(suffix: string | string[], replace: string, region: string): Token;

264

265

/** Check if token has vowel at specific index */

266

hasVowelAtIndex(index: number): boolean;

267

268

/** Find next vowel index starting from position */

269

nextVowelIndex(start: number): number;

270

271

/** Find next consonant index starting from position */

272

nextConsonantIndex(start: number): number;

273

274

/** Check if token has specific suffix */

275

hasSuffix(suffix: string): boolean;

276

277

/** Check if token has suffix within specified region */

278

hasSuffixInRegion(suffix: string, region: string): boolean;

279

280

/** Get current token string */

281

toString(): string;

282

283

/** Token string (mutable) */

284

string: string;

285

286

/** Original token string (immutable) */

287

original: string;

288

289

/** Vowels definition for this token */

290

vowels: string;

291

292

/** Defined regions for morphological operations */

293

regions: {[key: string]: number};

294

}

295

```

296

297

**Usage Examples:**

298

299

```javascript

300

const natural = require('natural');

301

302

// English Porter stemming

303

console.log(natural.PorterStemmer.stem('running')); // 'run'

304

console.log(natural.PorterStemmer.stem('flies')); // 'fli'

305

306

// Stem multiple tokens

307

const tokens = ['running', 'flies', 'dying', 'lying'];

308

const stemmed = natural.PorterStemmer.stemTokens(tokens);

309

console.log(stemmed); // ['run', 'fli', 'dy', 'ly']

310

311

// Lancaster stemmer (more aggressive)

312

console.log(natural.LancasterStemmer.stem('running')); // 'run'

313

console.log(natural.LancasterStemmer.stem('maximum')); // 'maxim'

314

315

// Language-specific stemming

316

console.log(natural.PorterStemmerFr.stem('courante')); // French stemming

317

console.log(natural.PorterStemmerDe.stem('laufende')); // German stemming

318

319

// Token-level suffix inspection (Token exposes no stem() method)

320

const token = new natural.Token('running');

321

console.log(token.hasSuffix('ing')); // true

322

```

323

324

### Normalization

325

326

Text normalization for cleaning and standardizing text data.

327

328

```javascript { .api }

329

/**

330

* Normalize array of tokens

331

* @param tokens - Array of token strings

332

* @returns Normalized token array

333

*/

334

function normalize(tokens: string[]): string[];

335

336

/**

337

* Japanese text normalization

338

* @param text - Japanese text to normalize

339

* @returns Normalized Japanese text

340

*/

341

function normalizeJa(text: string): string;

342

343

/**

344

* Norwegian text normalization (diacritic removal)

345

* @param text - Norwegian text to normalize

346

* @returns Text with diacritics removed

347

*/

348

function normalizeNo(text: string): string;

349

350

/**

351

* Swedish text normalization

352

* @param text - Swedish text to normalize

353

* @returns Normalized Swedish text

354

*/

355

function normalizeSv(text: string): string;

356

357

/**

358

* Remove diacritical marks from text

359

* @param text - Text with diacritics

360

* @returns Text without diacritics

361

*/

362

function removeDiacritics(text: string): string;

363

364

/**

365

* Japanese character conversion utilities

366

*/

367

interface Converters {

368

hiraganaToKatakana(text: string): string;

369

katakanaToHiragana(text: string): string;

370

romajiToHiragana(text: string): string;

371

romajiToKatakana(text: string): string;

372

}

373

```

374

375

**Usage Examples:**

376

377

```javascript

378

const natural = require('natural');

379

380

// Basic normalization

381

const tokens = ['Hello', 'WORLD', 'Test'];

382

const normalized = natural.normalize(tokens);

383

console.log(normalized); // Normalized tokens

384

385

// Remove diacritics

386

const textWithDiacritics = 'café naïve résumé';

387

const clean = natural.removeDiacritics(textWithDiacritics);

388

console.log(clean); // 'cafe naive resume'

389

390

// Japanese normalization

391

const japaneseText = 'こんにちは世界';

392

const normalizedJa = natural.normalizeJa(japaneseText);

393

394

// Norwegian diacritic removal

395

const norwegianText = 'Hålløj verðen';

396

const normalizedNo = natural.normalizeNo(norwegianText);

397

398

// Japanese character conversion

399

const hiragana = 'こんにちは';

400

const katakana = natural.Converters.hiraganaToKatakana(hiragana);

401

console.log(katakana); // 'コンニチハ'

402

```

403

404

### Inflection

405

406

Word inflection for grammatical transformations.

407

408

```javascript { .api }

409

/**

410

* English noun inflector (singular/plural)

411

*/

412

class NounInflector {

413

/** Convert singular noun to plural */

414

pluralize(noun: string): string;

415

416

/** Convert plural noun to singular */

417

singularize(noun: string): string;

418

}

419

420

/**

421

* French noun inflector

422

*/

423

class NounInflectorFr {

424

pluralize(noun: string): string;

425

singularize(noun: string): string;

426

}

427

428

/**

429

* Japanese noun inflector

430

*/

431

class NounInflectorJa {

432

pluralize(noun: string): string;

433

}

434

435

/**

436

* Present tense verb inflector

437

*/

438

class PresentVerbInflector {

439

/** Convert verb to present tense form */

440

present(verb: string): string;

441

}

442

443

/**

444

* Count inflector for numbers

445

*/

446

class CountInflector {

447

/** Get ordinal form of number */

448

nth(number: number): string;

449

}

450

451

/**

452

* French count inflector

453

*/

454

class CountInflectorFr {

455

nth(number: number): string;

456

}

457

```

458

459

**Usage Examples:**

460

461

```javascript

462

const natural = require('natural');

463

464

// Noun inflection

465

const nounInflector = new natural.NounInflector();

466

console.log(nounInflector.pluralize('cat')); // 'cats'

467

console.log(nounInflector.singularize('cats')); // 'cat'

468

469

// Count inflection

470

const countInflector = new natural.CountInflector();

471

console.log(countInflector.nth(1)); // '1st'

472

console.log(countInflector.nth(2)); // '2nd'

473

console.log(countInflector.nth(3)); // '3rd'

474

console.log(countInflector.nth(21)); // '21st'

475

```