# N-grams and TF-IDF

Statistical text analysis tools for creating n-grams and calculating term frequency-inverse document frequency scores. These are fundamental techniques for text analytics, information retrieval, and feature extraction.

## Capabilities

### N-grams

Generate sequences of n consecutive words or characters from text for pattern analysis and language modeling.
```javascript { .api }
/**
 * Generate n-grams from a sequence
 * @param sequence - String or array of tokens
 * @param n - Size of n-grams (1=unigrams, 2=bigrams, 3=trigrams, etc.)
 * @param startSymbol - Optional padding symbol for start of sequence
 * @param endSymbol - Optional padding symbol for end of sequence
 * @param stats - If true, returns statistics object instead of array
 * @returns Array of n-grams or statistics object
 */
function ngrams(sequence: string | string[], n: number, startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

/**
 * Generate bigrams (2-grams) from sequence
 * @param sequence - String or array of tokens
 * @param startSymbol - Optional padding symbol for start
 * @param endSymbol - Optional padding symbol for end
 * @param stats - If true, returns statistics object
 * @returns Array of bigrams or statistics
 */
function bigrams(sequence: string | string[], startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

/**
 * Generate trigrams (3-grams) from sequence
 * @param sequence - String or array of tokens
 * @param startSymbol - Optional padding symbol for start
 * @param endSymbol - Optional padding symbol for end
 * @param stats - If true, returns statistics object
 * @returns Array of trigrams or statistics
 */
function trigrams(sequence: string | string[], startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

/**
 * Set custom tokenizer for n-gram generation
 * @param tokenizer - Tokenizer object with tokenize method
 */
function setTokenizer(tokenizer: {tokenize: (text: string) => string[]}): void;

/**
 * Statistics object returned when stats=true
 */
interface NgramStatistics {
  ngrams: string[][];
  frequencies: {[key: string]: number};
  Nr: {[key: string]: number};
  numberOfNgrams: number;
}
```
**Usage Examples:**

```javascript
const natural = require('natural');

// Basic n-gram generation
const text = 'hello world how are you';

// Unigrams (1-grams)
const unigrams = natural.ngrams(text, 1);
console.log(unigrams);
// [['hello'], ['world'], ['how'], ['are'], ['you']]

// Bigrams (2-grams)
const bigramArray = natural.bigrams(text);
console.log(bigramArray);
// [['hello', 'world'], ['world', 'how'], ['how', 'are'], ['are', 'you']]

// Trigrams (3-grams)
const trigramArray = natural.trigrams(text);
console.log(trigramArray);
// [['hello', 'world', 'how'], ['world', 'how', 'are'], ['how', 'are', 'you']]

// With padding symbols
const paddedBigrams = natural.bigrams(text, '<s>', '</s>');
console.log(paddedBigrams);
// [['<s>', 'hello'], ['hello', 'world'], ..., ['you', '</s>']]

// With statistics
const bigramStats = natural.bigrams(text, null, null, true);
console.log(bigramStats);
// {
//   ngrams: [...],
//   frequencies: {'hello,world': 1, 'world,how': 1, ...},
//   Nr: {1: 4}, // 4 bigrams appear once
//   numberOfNgrams: 4
// }
```
### N-gram Analysis

```javascript
const natural = require('natural');

/**
 * Analyze n-gram frequencies in text.
 *
 * @param {string} text - Input text to tokenize and analyze
 * @param {number} [n=2] - N-gram size (2 = bigrams)
 * @returns {{totalNgrams: number, uniqueNgrams: number,
 *            mostFrequent: Array<{ngram: string[], frequency: number}>,
 *            frequencies: Object}} Summary of n-gram counts plus the
 *   ten most frequent n-grams.
 */
function analyzeNgrams(text, n = 2) {
  // stats=true makes natural.ngrams return
  // {ngrams, frequencies, Nr, numberOfNgrams} instead of a plain array.
  const stats = natural.ngrams(text, n, null, null, true);

  // Rank unique n-grams by descending frequency. Frequency keys are
  // comma-joined tokens (e.g. 'hello,world'), so split(',') recovers the
  // token array — assumes tokens themselves contain no commas.
  const sorted = Object.entries(stats.frequencies)
    .sort(([, a], [, b]) => b - a)
    .map(([ngram, freq]) => ({
      ngram: ngram.split(','),
      frequency: freq
    }));

  return {
    totalNgrams: stats.numberOfNgrams,
    uniqueNgrams: sorted.length, // one entry per unique n-gram
    mostFrequent: sorted.slice(0, 10),
    frequencies: stats.frequencies
  };
}

// Example usage
const document = 'the quick brown fox jumps over the lazy dog the dog was lazy';
const analysis = analyzeNgrams(document, 2);
console.log('Total bigrams:', analysis.totalNgrams);
console.log('Unique bigrams:', analysis.uniqueNgrams);
console.log('Most frequent:', analysis.mostFrequent);
```

### Chinese N-grams

```javascript { .api }
/**
 * Chinese n-gram generation with specialized tokenization
 */
class NGramsZH {
  static ngrams(text: string, n: number): string[][];
  static bigrams(text: string): string[][];
  static trigrams(text: string): string[][];
}
```

**Usage Examples:**

```javascript
const natural = require('natural');

// Chinese text n-grams
const chineseText = '你好世界今天天气很好';
const chineseBigrams = natural.NGramsZH.bigrams(chineseText);
console.log(chineseBigrams);
```
## TF-IDF

Term Frequency-Inverse Document Frequency calculation for document analysis and information retrieval.
```javascript { .api }
/**
 * TF-IDF calculator for document corpus analysis
 * @param deserialized - Optional previously serialized TfIdf instance
 */
class TfIdf {
  constructor(deserialized?: object);

  /**
   * Add document to the corpus
   * @param document - Document text or array of tokens
   * @param key - Optional document identifier
   * @param restoreCache - Whether to restore IDF cache
   */
  addDocument(document: string | string[], key?: string, restoreCache?: boolean): void;

  /**
   * Add document from file synchronously
   * @param path - File path
   * @param encoding - File encoding (default: 'utf8')
   * @param key - Optional document identifier
   * @param restoreCache - Whether to restore IDF cache
   */
  addFileSync(path: string, encoding?: string, key?: string, restoreCache?: boolean): void;

  /**
   * Remove document from corpus
   * @param key - Document identifier
   * @returns true if document was removed
   */
  removeDocument(key: string): boolean;

  /**
   * Calculate inverse document frequency for a term
   * @param term - Term to calculate IDF for
   * @param force - Force recalculation even if cached
   * @returns IDF value
   */
  idf(term: string, force?: boolean): number;

  /**
   * Calculate TF-IDF score for terms in a specific document
   * @param terms - Term or array of terms
   * @param documentIndex - Index of document in corpus
   * @returns TF-IDF score
   */
  tfidf(terms: string | string[], documentIndex: number): number;

  /**
   * Calculate TF-IDF for terms across all documents
   * @param terms - Term or array of terms
   * @param callback - Optional callback function
   * @returns Array of TF-IDF scores for each document
   */
  tfidfs(terms: string | string[], callback?: (i: number, measure: number) => void): number[];

  /**
   * List all terms in a document with their TF-IDF scores
   * @param documentIndex - Index of document
   * @returns Array of terms with scores
   */
  listTerms(documentIndex: number): TfIdfTerm[];

  /**
   * Set custom tokenizer for document processing
   * @param tokenizer - Tokenizer with tokenize method
   */
  setTokenizer(tokenizer: {tokenize: (text: string) => string[]}): void;

  /**
   * Set custom stopwords list
   * @param stopwords - Array of stopword strings
   */
  setStopwords(stopwords: string[]): void;

  /**
   * Static method for calculating term frequency
   * @param term - Term to calculate TF for
   * @param document - Document text or tokens
   * @returns Term frequency
   */
  static tf(term: string, document: string | string[]): number;
}

/**
 * Term with TF-IDF score
 */
interface TfIdfTerm {
  term: string;
  tfidf: number;
}
```
**Usage Examples:**

```javascript
const natural = require('natural');

// Create TF-IDF instance
const tfidf = new natural.TfIdf();

// Add documents to corpus
tfidf.addDocument('this document is about node. node is a runtime');
tfidf.addDocument('this document is about ruby. ruby is a language');
tfidf.addDocument('this document is about ruby. ruby is also a gem');

// Calculate TF-IDF for specific terms in document 0
console.log('TF-IDF for "node" in doc 0:', tfidf.tfidf('node', 0));
console.log('TF-IDF for "ruby" in doc 0:', tfidf.tfidf('ruby', 0));

// Calculate across all documents
const nodeScores = tfidf.tfidfs('node');
console.log('Node scores across all docs:', nodeScores);

// List all terms in document 0 with scores
const terms = tfidf.listTerms(0);
console.log('All terms in doc 0:');
terms.forEach(term => {
  console.log(`${term.term}: ${term.tfidf}`);
});

// Calculate IDF for a term
console.log('IDF for "document":', tfidf.idf('document'));

// Calculate TF-IDF for multiple terms
const multiTermScore = tfidf.tfidf(['this', 'document'], 0);
console.log('Multi-term TF-IDF:', multiTermScore);
```

### Advanced TF-IDF Usage

```javascript
const natural = require('natural');
const fs = require('fs');

/**
 * Build per-document TF-IDF vectors over the shared vocabulary of a corpus.
 *
 * Note: despite the name, this returns the raw TF-IDF vectors (one object
 * per document, keyed by term). Callers compute an actual similarity
 * measure (e.g. cosine similarity) from these vectors.
 *
 * @param {string[]} documents - Corpus of document texts
 * @returns {Array<Object<string, number>>} One TF-IDF vector per document,
 *   keyed by term; every vector has the same keys (the corpus vocabulary).
 */
function calculateDocumentSimilarity(documents) {
  const tfidf = new natural.TfIdf();

  // Add every document first so IDF is computed over the whole corpus.
  documents.forEach(doc => tfidf.addDocument(doc));

  // Collect the union of terms seen in any document.
  const allTerms = new Set();
  documents.forEach((doc, i) => {
    tfidf.listTerms(i).forEach(term => allTerms.add(term.term));
  });

  // One TF-IDF vector per document over the shared vocabulary; terms absent
  // from a document simply score 0 there.
  return documents.map((doc, i) => {
    const vector = {};
    allTerms.forEach(term => {
      vector[term] = tfidf.tfidf(term, i);
    });
    return vector;
  });
}

// Example usage
const docs = [
  'Machine learning is a subset of artificial intelligence',
  'Natural language processing uses machine learning algorithms',
  'Deep learning is a type of machine learning using neural networks'
];

const vectors = calculateDocumentSimilarity(docs);
console.log('Document TF-IDF vectors:', vectors);
```

### File-based TF-IDF

```javascript
const natural = require('natural');

// Create TF-IDF from files
const tfidf = new natural.TfIdf();

// Add documents from files
try {
  tfidf.addFileSync('./document1.txt', 'utf8', 'doc1');
  tfidf.addFileSync('./document2.txt', 'utf8', 'doc2');
  tfidf.addFileSync('./document3.txt', 'utf8', 'doc3');

  // Analyze specific terms
  const searchTerms = ['machine', 'learning', 'algorithm'];
  searchTerms.forEach(term => {
    console.log(`\nTF-IDF scores for "${term}":`);
    const scores = tfidf.tfidfs(term);
    scores.forEach((score, i) => {
      console.log(`Document ${i}: ${score}`);
    });
  });

  // Find most relevant document for a query
  const query = ['machine', 'learning'];
  const queryScores = tfidf.tfidfs(query);
  const mostRelevant = queryScores.indexOf(Math.max(...queryScores));
  console.log(`Most relevant document for query: ${mostRelevant}`);

} catch (error) {
  console.error('Error reading files:', error);
}
```

### Custom Tokenization and Stopwords

```javascript
const natural = require('natural');

// Create TF-IDF with custom settings
const tfidf = new natural.TfIdf();

// Set custom tokenizer
const customTokenizer = {
  tokenize: function(text) {
    // Custom tokenization logic
    return text.toLowerCase()
      .replace(/[^\w\s]/g, '') // Remove punctuation
      .split(/\s+/)
      .filter(token => token.length > 2); // Only tokens > 2 chars
  }
};
tfidf.setTokenizer(customTokenizer);

// Set custom stopwords
const customStopwords = ['the', 'is', 'at', 'which', 'on', 'and', 'or', 'but'];
tfidf.setStopwords(customStopwords);

// Add documents with custom processing
tfidf.addDocument('This is a sample document with custom processing');
tfidf.addDocument('Another document for testing custom tokenization');

// Analyze with custom settings
const terms = tfidf.listTerms(0);
console.log('Terms with custom processing:', terms);
```