# N-grams and TF-IDF

Statistical text analysis tools for creating n-grams and calculating term frequency-inverse document frequency scores. These are fundamental techniques for text analytics, information retrieval, and feature extraction.

## Capabilities

### N-grams

Generate sequences of n consecutive words or characters from text for pattern analysis and language modeling.
```javascript { .api }
/**
 * Generate n-grams from a sequence
 * @param sequence - String or array of tokens
 * @param n - Size of n-grams (1=unigrams, 2=bigrams, 3=trigrams, etc.)
 * @param startSymbol - Optional padding symbol for start of sequence
 * @param endSymbol - Optional padding symbol for end of sequence
 * @param stats - If true, returns statistics object instead of array
 * @returns Array of n-grams or statistics object
 */
function ngrams(sequence: string | string[], n: number, startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

/**
 * Generate bigrams (2-grams) from sequence
 * @param sequence - String or array of tokens
 * @param startSymbol - Optional padding symbol for start
 * @param endSymbol - Optional padding symbol for end
 * @param stats - If true, returns statistics object
 * @returns Array of bigrams or statistics
 */
function bigrams(sequence: string | string[], startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

/**
 * Generate trigrams (3-grams) from sequence
 * @param sequence - String or array of tokens
 * @param startSymbol - Optional padding symbol for start
 * @param endSymbol - Optional padding symbol for end
 * @param stats - If true, returns statistics object
 * @returns Array of trigrams or statistics
 */
function trigrams(sequence: string | string[], startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

/**
 * Set custom tokenizer for n-gram generation
 * @param tokenizer - Tokenizer object with tokenize method
 */
function setTokenizer(tokenizer: {tokenize: (text: string) => string[]}): void;

/**
 * Statistics object returned when stats=true
 */
interface NgramStatistics {
  ngrams: string[][];
  frequencies: {[key: string]: number};
  Nr: {[key: string]: number};
  numberOfNgrams: number;
}
```
**Usage Examples:**

```javascript
const natural = require('natural');

// Basic n-gram generation
const text = 'hello world how are you';

// Unigrams (1-grams)
const unigrams = natural.ngrams(text, 1);
console.log(unigrams);
// [['hello'], ['world'], ['how'], ['are'], ['you']]

// Bigrams (2-grams)
const bigramArray = natural.bigrams(text);
console.log(bigramArray);
// [['hello', 'world'], ['world', 'how'], ['how', 'are'], ['are', 'you']]

// Trigrams (3-grams)
const trigramArray = natural.trigrams(text);
console.log(trigramArray);
// [['hello', 'world', 'how'], ['world', 'how', 'are'], ['how', 'are', 'you']]

// With padding symbols
const paddedBigrams = natural.bigrams(text, '<s>', '</s>');
console.log(paddedBigrams);
// [['<s>', 'hello'], ['hello', 'world'], ..., ['you', '</s>']]

// With statistics
const bigramStats = natural.bigrams(text, null, null, true);
console.log(bigramStats);
// {
//   ngrams: [...],
//   frequencies: {'hello,world': 1, 'world,how': 1, ...},
//   Nr: {1: 4}, // 4 bigrams appear once
//   numberOfNgrams: 4
// }
```
### N-gram Analysis

```javascript
const natural = require('natural');

/**
 * Analyze n-gram frequencies in text.
 *
 * @param {string} text - Input text to tokenize and analyze
 * @param {number} [n=2] - N-gram size (2 = bigrams)
 * @returns {{totalNgrams: number, uniqueNgrams: number,
 *            mostFrequent: Array<{ngram: string[], frequency: number}>,
 *            frequencies: Object}} Summary of n-gram counts plus the
 *   ten most frequent n-grams.
 */
function analyzeNgrams(text, n = 2) {
  // stats=true makes natural.ngrams return
  // {ngrams, frequencies, Nr, numberOfNgrams} instead of a plain array.
  const stats = natural.ngrams(text, n, null, null, true);

  // Rank unique n-grams by descending frequency. Frequency keys are
  // comma-joined tokens (e.g. 'hello,world'), so split(',') recovers the
  // token array — assumes tokens themselves contain no commas.
  const sorted = Object.entries(stats.frequencies)
    .sort(([, a], [, b]) => b - a)
    .map(([ngram, freq]) => ({
      ngram: ngram.split(','),
      frequency: freq
    }));

  return {
    totalNgrams: stats.numberOfNgrams,
    uniqueNgrams: sorted.length, // one entry per unique n-gram
    mostFrequent: sorted.slice(0, 10),
    frequencies: stats.frequencies
  };
}

// Example usage
const document = 'the quick brown fox jumps over the lazy dog the dog was lazy';
const analysis = analyzeNgrams(document, 2);
console.log('Total bigrams:', analysis.totalNgrams);
console.log('Unique bigrams:', analysis.uniqueNgrams);
console.log('Most frequent:', analysis.mostFrequent);
```

### Chinese N-grams

```javascript { .api }
/**
 * Chinese n-gram generation with specialized tokenization
 */
class NGramsZH {
  static ngrams(text: string, n: number): string[][];
  static bigrams(text: string): string[][];
  static trigrams(text: string): string[][];
}
```

**Usage Examples:**

```javascript
const natural = require('natural');

// Chinese text n-grams
const chineseText = '你好世界今天天气很好';
const chineseBigrams = natural.NGramsZH.bigrams(chineseText);
console.log(chineseBigrams);
```
## TF-IDF

Term Frequency-Inverse Document Frequency calculation for document analysis and information retrieval.
```javascript { .api }
/**
 * TF-IDF calculator for document corpus analysis
 * @param deserialized - Optional previously serialized TfIdf instance
 */
class TfIdf {
  constructor(deserialized?: object);

  /**
   * Add document to the corpus
   * @param document - Document text or array of tokens
   * @param key - Optional document identifier
   * @param restoreCache - Whether to restore IDF cache
   */
  addDocument(document: string | string[], key?: string, restoreCache?: boolean): void;

  /**
   * Add document from file synchronously
   * @param path - File path
   * @param encoding - File encoding (default: 'utf8')
   * @param key - Optional document identifier
   * @param restoreCache - Whether to restore IDF cache
   */
  addFileSync(path: string, encoding?: string, key?: string, restoreCache?: boolean): void;

  /**
   * Remove document from corpus
   * @param key - Document identifier
   * @returns true if document was removed
   */
  removeDocument(key: string): boolean;

  /**
   * Calculate inverse document frequency for a term
   * @param term - Term to calculate IDF for
   * @param force - Force recalculation even if cached
   * @returns IDF value
   */
  idf(term: string, force?: boolean): number;

  /**
   * Calculate TF-IDF score for terms in a specific document
   * @param terms - Term or array of terms
   * @param documentIndex - Index of document in corpus
   * @returns TF-IDF score
   */
  tfidf(terms: string | string[], documentIndex: number): number;

  /**
   * Calculate TF-IDF for terms across all documents
   * @param terms - Term or array of terms
   * @param callback - Optional callback function
   * @returns Array of TF-IDF scores for each document
   */
  tfidfs(terms: string | string[], callback?: (i: number, measure: number) => void): number[];

  /**
   * List all terms in a document with their TF-IDF scores
   * @param documentIndex - Index of document
   * @returns Array of terms with scores
   */
  listTerms(documentIndex: number): TfIdfTerm[];

  /**
   * Set custom tokenizer for document processing
   * @param tokenizer - Tokenizer with tokenize method
   */
  setTokenizer(tokenizer: {tokenize: (text: string) => string[]}): void;

  /**
   * Set custom stopwords list
   * @param stopwords - Array of stopword strings
   */
  setStopwords(stopwords: string[]): void;

  /**
   * Static method for calculating term frequency
   * @param term - Term to calculate TF for
   * @param document - Document text or tokens
   * @returns Term frequency
   */
  static tf(term: string, document: string | string[]): number;
}

/**
 * Term with TF-IDF score
 */
interface TfIdfTerm {
  term: string;
  tfidf: number;
}
```
**Usage Examples:**

```javascript
const natural = require('natural');

// Create TF-IDF instance
const tfidf = new natural.TfIdf();

// Add documents to corpus
tfidf.addDocument('this document is about node. node is a runtime');
tfidf.addDocument('this document is about ruby. ruby is a language');
tfidf.addDocument('this document is about ruby. ruby is also a gem');

// Calculate TF-IDF for specific terms in document 0
console.log('TF-IDF for "node" in doc 0:', tfidf.tfidf('node', 0));
console.log('TF-IDF for "ruby" in doc 0:', tfidf.tfidf('ruby', 0));

// Calculate across all documents
const nodeScores = tfidf.tfidfs('node');
console.log('Node scores across all docs:', nodeScores);

// List all terms in document 0 with scores
const terms = tfidf.listTerms(0);
console.log('All terms in doc 0:');
terms.forEach(term => {
  console.log(`${term.term}: ${term.tfidf}`);
});

// Calculate IDF for a term
console.log('IDF for "document":', tfidf.idf('document'));

// Calculate TF-IDF for multiple terms
const multiTermScore = tfidf.tfidf(['this', 'document'], 0);
console.log('Multi-term TF-IDF:', multiTermScore);
```

### Advanced TF-IDF Usage

```javascript
const natural = require('natural');
const fs = require('fs');

/**
 * Build per-document TF-IDF vectors over the shared vocabulary of a corpus.
 *
 * Note: despite the name, this returns the raw TF-IDF vectors (one object
 * per document, keyed by term). Callers compute an actual similarity
 * measure (e.g. cosine similarity) from these vectors.
 *
 * @param {string[]} documents - Corpus of document texts
 * @returns {Array<Object<string, number>>} One TF-IDF vector per document,
 *   keyed by term; every vector has the same keys (the corpus vocabulary).
 */
function calculateDocumentSimilarity(documents) {
  const tfidf = new natural.TfIdf();

  // Add every document first so IDF is computed over the whole corpus.
  documents.forEach(doc => tfidf.addDocument(doc));

  // Collect the union of terms seen in any document.
  const allTerms = new Set();
  documents.forEach((doc, i) => {
    tfidf.listTerms(i).forEach(term => allTerms.add(term.term));
  });

  // One TF-IDF vector per document over the shared vocabulary; terms absent
  // from a document simply score 0 there.
  return documents.map((doc, i) => {
    const vector = {};
    allTerms.forEach(term => {
      vector[term] = tfidf.tfidf(term, i);
    });
    return vector;
  });
}

// Example usage
const docs = [
  'Machine learning is a subset of artificial intelligence',
  'Natural language processing uses machine learning algorithms',
  'Deep learning is a type of machine learning using neural networks'
];

const vectors = calculateDocumentSimilarity(docs);
console.log('Document TF-IDF vectors:', vectors);
```

### File-based TF-IDF

```javascript
const natural = require('natural');

// Create TF-IDF from files
const tfidf = new natural.TfIdf();

// Add documents from files
try {
  tfidf.addFileSync('./document1.txt', 'utf8', 'doc1');
  tfidf.addFileSync('./document2.txt', 'utf8', 'doc2');
  tfidf.addFileSync('./document3.txt', 'utf8', 'doc3');

  // Analyze specific terms
  const searchTerms = ['machine', 'learning', 'algorithm'];
  searchTerms.forEach(term => {
    console.log(`\nTF-IDF scores for "${term}":`);
    const scores = tfidf.tfidfs(term);
    scores.forEach((score, i) => {
      console.log(`Document ${i}: ${score}`);
    });
  });

  // Find most relevant document for a query
  const query = ['machine', 'learning'];
  const queryScores = tfidf.tfidfs(query);
  const mostRelevant = queryScores.indexOf(Math.max(...queryScores));
  console.log(`Most relevant document for query: ${mostRelevant}`);

} catch (error) {
  console.error('Error reading files:', error);
}
```

### Custom Tokenization and Stopwords

```javascript
const natural = require('natural');

// Create TF-IDF with custom settings
const tfidf = new natural.TfIdf();

// Set custom tokenizer
const customTokenizer = {
  tokenize: function(text) {
    // Custom tokenization logic
    return text.toLowerCase()
      .replace(/[^\w\s]/g, '') // Remove punctuation
      .split(/\s+/)
      .filter(token => token.length > 2); // Only tokens > 2 chars
  }
};
tfidf.setTokenizer(customTokenizer);

// Set custom stopwords
const customStopwords = ['the', 'is', 'at', 'which', 'on', 'and', 'or', 'but'];
tfidf.setStopwords(customStopwords);

// Add documents with custom processing
tfidf.addDocument('This is a sample document with custom processing');
tfidf.addDocument('Another document for testing custom tokenization');

// Analyze with custom settings
const terms = tfidf.listTerms(0);
console.log('Terms with custom processing:', terms);
```