Tessl Tile for npm/natural@8.1.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

classification.md distance.md index.md ngrams-tfidf.md phonetics.md pos-tagging.md sentiment.md text-processing.md transliterators.md utilities.md wordnet.md

docs/text-processing.md

0
# Text Processing
1

2
Comprehensive text preprocessing tools including tokenization, stemming, and normalization for multiple languages. These are essential building blocks for preparing raw text data for natural language processing tasks.
3

4
## Capabilities
5

6
### Text Analysis
7

8
#### Sentence Analyzer
9

10
Analyzes sentence structure and provides readability metrics.
11

12
```javascript { .api }
13
/**
14
 * Sentence analyzer for readability and complexity metrics
15
 */
16
class SentenceAnalyzer {
17
  /** Analyze sentence structure and readability */
18
  static analyze(sentence: string): {
19
    numWords: number;
20
    numChars: number;
21
    averageWordsPerSentence: number;
22
    numSentences: number;
23
  };
24
}
25
```
26

27
**Example usage:**
28
```javascript
29
const { SentenceAnalyzer } = require('natural');
30

31
const analysis = SentenceAnalyzer.analyze('This is a sample sentence.');
32
console.log(analysis);
33
// { numWords: 5, numChars: 25, averageWordsPerSentence: 5, numSentences: 1 }
34
```
35

36
### Tokenization
37

38
Breaking text into individual tokens (words, punctuation, etc.) using various strategies.
39

40
#### Word Tokenizer
41

42
Basic word tokenization that splits on whitespace and punctuation.
43

44
```javascript { .api }
45
/**
46
 * Basic word tokenizer
47
 */
48
class WordTokenizer {
49
  /** Tokenize text into words */
50
  static tokenize(text: string): string[];
51
}
52
```
53

54
#### Regular Expression Tokenizer
55

56
Flexible tokenizer using regular expressions for custom tokenization patterns.
57

58
```javascript { .api }
59
/**
60
 * Regular expression-based tokenizer
61
 * @param options - Tokenization options including pattern
62
 */
63
class RegexpTokenizer {
64
  constructor(options?: {pattern?: RegExp, discardEmpty?: boolean});
65
  
66
  /** Tokenize text using regex pattern */
67
  tokenize(text: string): string[];
68
}
69

70
/**
71
 * Orthography-aware tokenizer
72
 */
73
class OrthographyTokenizer extends RegexpTokenizer {
74
  constructor();
75
}
76

77
/**
78
 * Word and punctuation tokenizer
79
 */
80
class WordPunctTokenizer extends RegexpTokenizer {
81
  constructor();
82
}
83
```
84

85
#### Aggressive Tokenizers
86

87
Aggressive tokenizers tailored to the tokenization rules of individual languages.
88

89
```javascript { .api }
90
/**
91
 * Base aggressive tokenizer
92
 */
93
class AggressiveTokenizer {
94
  constructor();
95
  tokenize(text: string): string[];
96
}
97

98
// Language-specific aggressive tokenizers
99
class AggressiveTokenizerNl extends AggressiveTokenizer {} // Dutch
100
class AggressiveTokenizerFr extends AggressiveTokenizer {} // French
101
class AggressiveTokenizerDe extends AggressiveTokenizer {} // German
102
class AggressiveTokenizerEs extends AggressiveTokenizer {} // Spanish
103
class AggressiveTokenizerIt extends AggressiveTokenizer {} // Italian
104
class AggressiveTokenizerRu extends AggressiveTokenizer {} // Russian
105
class AggressiveTokenizerPt extends AggressiveTokenizer {} // Portuguese
106
class AggressiveTokenizerNo extends AggressiveTokenizer {} // Norwegian
107
class AggressiveTokenizerSv extends AggressiveTokenizer {} // Swedish
108
class AggressiveTokenizerPl extends AggressiveTokenizer {} // Polish
109
class AggressiveTokenizerVi extends AggressiveTokenizer {} // Vietnamese
110
class AggressiveTokenizerFa extends AggressiveTokenizer {} // Persian/Farsi
111
class AggressiveTokenizerId extends AggressiveTokenizer {} // Indonesian
112
class AggressiveTokenizerHi extends AggressiveTokenizer {} // Hindi
113
class AggressiveTokenizerUk extends AggressiveTokenizer {} // Ukrainian
114
```
115

116
#### Other Tokenizers
117

118
```javascript { .api }
119
/**
120
 * Case-preserving tokenizer
121
 */
122
class CaseTokenizer {
123
  constructor();
124
  tokenize(text: string): string[];
125
}
126

127
/**
128
 * Penn Treebank word tokenizer
129
 */
130
class TreebankWordTokenizer {
131
  constructor();
132
  tokenize(text: string): string[];
133
}
134

135
/**
136
 * Japanese tokenizer
137
 */
138
class TokenizerJa {
139
  constructor();
140
  tokenize(text: string): string[];
141
}
142

143
/**
144
 * Sentence tokenizer
145
 */
146
class SentenceTokenizer {
147
  constructor();
148
  tokenize(text: string): string[];
149
}
150
```
151

152
**Usage Examples:**
153

154
```javascript
155
const natural = require('natural');
156

157
// Basic word tokenization
158
const tokens = natural.WordTokenizer.tokenize('Hello world, how are you?');
159
console.log(tokens); // ['Hello', 'world', 'how', 'are', 'you']
160

161
// Regular expression tokenizer
162
const regexTokenizer = new natural.RegexpTokenizer({pattern: /\s+/, discardEmpty: true});
163
const regexTokens = regexTokenizer.tokenize('Hello   world');
164
console.log(regexTokens); // ['Hello', 'world']
165

166
// Aggressive tokenizer
167
const aggressive = new natural.AggressiveTokenizer();
168
const aggressiveTokens = aggressive.tokenize("Don't you think?");
169
console.log(aggressiveTokens); // ['Don', 't', 'you', 'think']
170

171
// Language-specific tokenizer
172
const frenchTokenizer = new natural.AggressiveTokenizerFr();
173
const frenchTokens = frenchTokenizer.tokenize("Bonjour, comment allez-vous?");
174

175
// Sentence tokenizer
176
const sentenceTokenizer = new natural.SentenceTokenizer();
177
const sentences = sentenceTokenizer.tokenize('Hello world. How are you? Fine, thanks.');
178
console.log(sentences); // ['Hello world.', 'How are you?', 'Fine, thanks.']
179
```
180

181
### Stemming
182

183
Reducing words to their root form by removing suffixes and prefixes.
184

185
#### Porter Stemmer
186

187
The classic Porter stemming algorithm with support for multiple languages.
188

189
```javascript { .api }
190
/**
191
 * Porter stemmer for English
192
 */
193
class PorterStemmer {
194
  /** Stem a single word */
195
  static stem(word: string): string;
196
  
197
  /** Stem an array of tokens */
198
  static stemTokens(tokens: string[]): string[];
199
}
200

201
// Language-specific Porter stemmers
202
class PorterStemmerFr { static stem(word: string): string; } // French
203
class PorterStemmerDe { static stem(word: string): string; } // German
204
class PorterStemmerEs { static stem(word: string): string; } // Spanish
205
class PorterStemmerIt { static stem(word: string): string; } // Italian
206
class PorterStemmerRu { static stem(word: string): string; } // Russian
207
class PorterStemmerPt { static stem(word: string): string; } // Portuguese
208
class PorterStemmerNo { static stem(word: string): string; } // Norwegian
209
class PorterStemmerSv { static stem(word: string): string; } // Swedish
210
class PorterStemmerNl { static stem(word: string): string; } // Dutch
211
class PorterStemmerFa { static stem(word: string): string; } // Persian/Farsi
212
class PorterStemmerUk { static stem(word: string): string; } // Ukrainian
213
```
214

215
#### Other Stemmers
216

217
```javascript { .api }
218
/**
219
 * Lancaster stemmer (more aggressive than Porter)
220
 */
221
class LancasterStemmer {
222
  static stem(word: string): string;
223
}
224

225
/**
226
 * Japanese stemmer
227
 */
228
class StemmerJa {
229
  static stem(word: string): string;
230
}
231

232
/**
233
 * Indonesian stemmer
234
 */
235
class StemmerId {
236
  static stem(word: string): string;
237
}
238

239
/**
240
 * French Carry stemmer
241
 */
242
class CarryStemmerFr {
243
  static stem(word: string): string;
244
}
245

246
/**
247
 * Token class for advanced stemming operations and morphological analysis
248
 * Provides detailed control over stemming processes with region-based operations
249
 */
250
class Token {
251
  constructor(string: string);
252
  
253
  /** Set vowels for this token language */
254
  usingVowels(vowels: string | string[]): Token;
255
  
256
  /** Mark a region in the token by index or callback */
257
  markRegion(region: string, args: number | any[], callback?: Function, context?: object): Token;
258
  
259
  /** Replace all instances of a string with another */
260
  replaceAll(find: string, replace: string): Token;
261
  
262
  /** Replace suffix if it exists within specified region */
263
  replaceSuffixInRegion(suffix: string | string[], replace: string, region: string): Token;
264
  
265
  /** Check if token has vowel at specific index */
266
  hasVowelAtIndex(index: number): boolean;
267
  
268
  /** Find next vowel index starting from position */
269
  nextVowelIndex(start: number): number;
270
  
271
  /** Find next consonant index starting from position */
272
  nextConsonantIndex(start: number): number;
273
  
274
  /** Check if token has specific suffix */
275
  hasSuffix(suffix: string): boolean;
276
  
277
  /** Check if token has suffix within specified region */
278
  hasSuffixInRegion(suffix: string, region: string): boolean;
279
  
280
  /** Get current token string */
281
  toString(): string;
282
  
283
  /** Token string (mutable) */
284
  string: string;
285
  
286
  /** Original token string (immutable) */
287
  original: string;
288
  
289
  /** Vowels definition for this token */
290
  vowels: string;
291
  
292
  /** Defined regions for morphological operations */
293
  regions: {[key: string]: number};
294
}
295
```
296

297
**Usage Examples:**
298

299
```javascript
300
const natural = require('natural');
301

302
// English Porter stemming
303
console.log(natural.PorterStemmer.stem('running')); // 'run'
304
console.log(natural.PorterStemmer.stem('flies')); // 'fli'
305

306
// Stem multiple tokens
307
const tokens = ['running', 'flies', 'dying', 'lying'];
308
const stemmed = natural.PorterStemmer.stemTokens(tokens);
309
console.log(stemmed); // ['run', 'fli', 'dy', 'ly']
310

311
// Lancaster stemmer (more aggressive)
312
console.log(natural.LancasterStemmer.stem('running')); // 'run'
313
console.log(natural.LancasterStemmer.stem('maximum')); // 'maxim'
314

315
// Language-specific stemming
316
console.log(natural.PorterStemmerFr.stem('courante')); // French stemming
317
console.log(natural.PorterStemmerDe.stem('laufende')); // German stemming
318

319
// Token-based suffix inspection (Token declares no stem() method)
320
const token = new natural.Token('running');
321
console.log(token.hasSuffix('ing')); // true
322
```
323

324
### Normalization
325

326
Text normalization for cleaning and standardizing text data.
327

328
```javascript { .api }
329
/**
330
 * Normalize array of tokens
331
 * @param tokens - Array of token strings
332
 * @returns Normalized token array
333
 */
334
function normalize(tokens: string[]): string[];
335

336
/**
337
 * Japanese text normalization
338
 * @param text - Japanese text to normalize
339
 * @returns Normalized Japanese text
340
 */
341
function normalizeJa(text: string): string;
342

343
/**
344
 * Norwegian text normalization (diacritic removal)
345
 * @param text - Norwegian text to normalize
346
 * @returns Text with diacritics removed
347
 */
348
function normalizeNo(text: string): string;
349

350
/**
351
 * Swedish text normalization
352
 * @param text - Swedish text to normalize
353
 * @returns Normalized Swedish text
354
 */
355
function normalizeSv(text: string): string;
356

357
/**
358
 * Remove diacritical marks from text
359
 * @param text - Text with diacritics
360
 * @returns Text without diacritics
361
 */
362
function removeDiacritics(text: string): string;
363

364
/**
365
 * Japanese character conversion utilities
366
 */
367
interface Converters {
368
  hiraganaToKatakana(text: string): string;
369
  katakanaToHiragana(text: string): string;
370
  romajiToHiragana(text: string): string;
371
  romajiToKatakana(text: string): string;
372
}
373
```
374

375
**Usage Examples:**
376

377
```javascript
378
const natural = require('natural');
379

380
// Basic normalization
381
const tokens = ['Hello', 'WORLD', 'Test'];
382
const normalized = natural.normalize(tokens);
383
console.log(normalized); // Normalized tokens
384

385
// Remove diacritics
386
const textWithDiacritics = 'café naïve résumé';
387
const clean = natural.removeDiacritics(textWithDiacritics);
388
console.log(clean); // 'cafe naive resume'
389

390
// Japanese normalization
391
const japaneseText = 'こんにちは世界';
392
const normalizedJa = natural.normalizeJa(japaneseText);
393

394
// Norwegian diacritic removal
395
const norwegianText = 'Hålløj verðen';
396
const normalizedNo = natural.normalizeNo(norwegianText);
397

398
// Japanese character conversion
399
const hiragana = 'こんにちは';
400
const katakana = natural.Converters.hiraganaToKatakana(hiragana);
401
console.log(katakana); // 'コンニチハ'
402
```
403

404
### Inflection
405

406
Word inflection for grammatical transformations.
407

408
```javascript { .api }
409
/**
410
 * English noun inflector (singular/plural)
411
 */
412
class NounInflector {
413
  /** Convert singular noun to plural */
414
  pluralize(noun: string): string;
415
  
416
  /** Convert plural noun to singular */
417
  singularize(noun: string): string;
418
}
419

420
/**
421
 * French noun inflector
422
 */
423
class NounInflectorFr {
424
  pluralize(noun: string): string;
425
  singularize(noun: string): string;
426
}
427

428
/**
429
 * Japanese noun inflector
430
 */
431
class NounInflectorJa {
432
  pluralize(noun: string): string;
433
}
434

435
/**
436
 * Present tense verb inflector
437
 */
438
class PresentVerbInflector {
439
  /** Convert verb to present tense form */
440
  present(verb: string): string;
441
}
442

443
/**
444
 * Count inflector for numbers
445
 */
446
class CountInflector {
447
  /** Get ordinal form of number */
448
  nth(number: number): string;
449
}
450

451
/**
452
 * French count inflector
453
 */
454
class CountInflectorFr {
455
  nth(number: number): string;
456
}
457
```
458

459
**Usage Examples:**
460

461
```javascript
462
const natural = require('natural');
463

464
// Noun inflection
465
const nounInflector = new natural.NounInflector();
466
console.log(nounInflector.pluralize('cat')); // 'cats'
467
console.log(nounInflector.singularize('cats')); // 'cat'
468

469
// Count inflection
470
const countInflector = new natural.CountInflector();
471
console.log(countInflector.nth(1)); // '1st'
472
console.log(countInflector.nth(2)); // '2nd'
473
console.log(countInflector.nth(3)); // '3rd'
474
console.log(countInflector.nth(21)); // '21st'
475
```

Version

Tile

Files

docs/text-processing.md

docs/text-processing.md