or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.md · distance.md · index.md · ngrams-tfidf.md · phonetics.md · pos-tagging.md · sentiment.md · text-processing.md · transliterators.md · utilities.md · wordnet.md

docs/ngrams-tfidf.md

0

# N-grams and TF-IDF

1

2

Statistical text analysis tools for creating n-grams and calculating term frequency-inverse document frequency scores. These are fundamental techniques for text analytics, information retrieval, and feature extraction.

3

4

## Capabilities

5

6

### N-grams

7

8

Generate sequences of n consecutive words or characters from text for pattern analysis and language modeling.

9

10

```javascript { .api }

11

/**

12

* Generate n-grams from a sequence

13

* @param sequence - String or array of tokens

14

* @param n - Size of n-grams (1=unigrams, 2=bigrams, 3=trigrams, etc.)

15

* @param startSymbol - Optional padding symbol for start of sequence

16

* @param endSymbol - Optional padding symbol for end of sequence

17

* @param stats - If true, returns statistics object instead of array

18

* @returns Array of n-grams or statistics object

19

*/

20

function ngrams(sequence: string | string[], n: number, startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

21

22

/**

23

* Generate bigrams (2-grams) from sequence

24

* @param sequence - String or array of tokens

25

* @param startSymbol - Optional padding symbol for start

26

* @param endSymbol - Optional padding symbol for end

27

* @param stats - If true, returns statistics object

28

* @returns Array of bigrams or statistics

29

*/

30

function bigrams(sequence: string | string[], startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

31

32

/**

33

* Generate trigrams (3-grams) from sequence

34

* @param sequence - String or array of tokens

35

* @param startSymbol - Optional padding symbol for start

36

* @param endSymbol - Optional padding symbol for end

37

* @param stats - If true, returns statistics object

38

* @returns Array of trigrams or statistics

39

*/

40

function trigrams(sequence: string | string[], startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

41

42

/**

43

* Set custom tokenizer for n-gram generation

44

* @param tokenizer - Tokenizer object with tokenize method

45

*/

46

function setTokenizer(tokenizer: {tokenize: (text: string) => string[]}): void;

47

48

/**

49

* Statistics object returned when stats=true

50

*/

51

interface NgramStatistics {

52

ngrams: string[][];

53

frequencies: {[key: string]: number};

54

Nr: {[key: string]: number};

55

numberOfNgrams: number;

56

}

57

```

58

59

**Usage Examples:**

60

61

```javascript

62

const natural = require('natural');

63

64

// Basic n-gram generation

65

const text = 'hello world how are you';

66

67

// Unigrams (1-grams)

68

const unigrams = natural.ngrams(text, 1);

69

console.log(unigrams);

70

// [['hello'], ['world'], ['how'], ['are'], ['you']]

71

72

// Bigrams (2-grams)

73

const bigramArray = natural.bigrams(text);

74

console.log(bigramArray);

75

// [['hello', 'world'], ['world', 'how'], ['how', 'are'], ['are', 'you']]

76

77

// Trigrams (3-grams)

78

const trigramArray = natural.trigrams(text);

79

console.log(trigramArray);

80

// [['hello', 'world', 'how'], ['world', 'how', 'are'], ['how', 'are', 'you']]

81

82

// With padding symbols

83

const paddedBigrams = natural.bigrams(text, '<s>', '</s>');

84

console.log(paddedBigrams);

85

// [['<s>', 'hello'], ['hello', 'world'], ..., ['you', '</s>']]

86

87

// With statistics

88

const bigramStats = natural.bigrams(text, null, null, true);

89

console.log(bigramStats);

90

// {

91

// ngrams: [...],

92

// frequencies: {'hello,world': 1, 'world,how': 1, ...},

93

// Nr: {1: 4}, // 4 bigrams appear once

94

// numberOfNgrams: 4

95

// }

96

```

97

98

### N-gram Analysis

99

100

```javascript

101

const natural = require('natural');

102

103

/**

104

* Analyze n-gram frequencies in text

105

*/

106

function analyzeNgrams(text, n = 2) {

107

const stats = natural.ngrams(text, n, null, null, true);

108

109

// Sort by frequency

110

const sorted = Object.entries(stats.frequencies)

111

.sort(([,a], [,b]) => b - a)

112

.map(([ngram, freq]) => ({

113

ngram: ngram.split(','),

114

frequency: freq

115

}));

116

117

return {

118

totalNgrams: stats.numberOfNgrams,

119

uniqueNgrams: Object.keys(stats.frequencies).length,

120

mostFrequent: sorted.slice(0, 10),

121

frequencies: stats.frequencies

122

};

123

}

124

125

// Example usage

126

const document = 'the quick brown fox jumps over the lazy dog the dog was lazy';

127

const analysis = analyzeNgrams(document, 2);

128

console.log('Total bigrams:', analysis.totalNgrams);

129

console.log('Unique bigrams:', analysis.uniqueNgrams);

130

console.log('Most frequent:', analysis.mostFrequent);

131

```

132

133

### Chinese N-grams

134

135

```javascript { .api }

136

/**

137

* Chinese n-gram generation with specialized tokenization

138

*/

139

class NGramsZH {

140

static ngrams(text: string, n: number): string[][];

141

static bigrams(text: string): string[][];

142

static trigrams(text: string): string[][];

143

}

144

```

145

146

**Usage Examples:**

147

148

```javascript

149

const natural = require('natural');

150

151

// Chinese text n-grams

152

const chineseText = '你好世界今天天气很好';

153

const chineseBigrams = natural.NGramsZH.bigrams(chineseText);

154

console.log(chineseBigrams);

155

```

156

157

## TF-IDF

158

159

Term Frequency-Inverse Document Frequency calculation for document analysis and information retrieval.

160

161

```javascript { .api }

162

/**

163

* TF-IDF calculator for document corpus analysis

164

* @param deserialized - Optional previously serialized TfIdf instance

165

*/

166

class TfIdf {

167

constructor(deserialized?: object);

168

169

/**

170

* Add document to the corpus

171

* @param document - Document text or array of tokens

172

* @param key - Optional document identifier

173

* @param restoreCache - Whether to restore IDF cache

174

*/

175

addDocument(document: string | string[], key?: string, restoreCache?: boolean): void;

176

177

/**

178

* Add document from file synchronously

179

* @param path - File path

180

* @param encoding - File encoding (default: 'utf8')

181

* @param key - Optional document identifier

182

* @param restoreCache - Whether to restore IDF cache

183

*/

184

addFileSync(path: string, encoding?: string, key?: string, restoreCache?: boolean): void;

185

186

/**

187

* Remove document from corpus

188

* @param key - Document identifier

189

* @returns true if document was removed

190

*/

191

removeDocument(key: string): boolean;

192

193

/**

194

* Calculate inverse document frequency for a term

195

* @param term - Term to calculate IDF for

196

* @param force - Force recalculation even if cached

197

* @returns IDF value

198

*/

199

idf(term: string, force?: boolean): number;

200

201

/**

202

* Calculate TF-IDF score for terms in a specific document

203

* @param terms - Term or array of terms

204

* @param documentIndex - Index of document in corpus

205

* @returns TF-IDF score

206

*/

207

tfidf(terms: string | string[], documentIndex: number): number;

208

209

/**

210

* Calculate TF-IDF for terms across all documents

211

* @param terms - Term or array of terms

212

* @param callback - Optional callback function

213

* @returns Array of TF-IDF scores for each document

214

*/

215

tfidfs(terms: string | string[], callback?: (i: number, measure: number) => void): number[];

216

217

/**

218

* List all terms in a document with their TF-IDF scores

219

* @param documentIndex - Index of document

220

* @returns Array of terms with scores

221

*/

222

listTerms(documentIndex: number): TfIdfTerm[];

223

224

/**

225

* Set custom tokenizer for document processing

226

* @param tokenizer - Tokenizer with tokenize method

227

*/

228

setTokenizer(tokenizer: {tokenize: (text: string) => string[]}): void;

229

230

/**

231

* Set custom stopwords list

232

* @param stopwords - Array of stopword strings

233

*/

234

setStopwords(stopwords: string[]): void;

235

}

236

237

/**

238

* Term with TF-IDF score

239

*/

240

interface TfIdfTerm {

241

term: string;

242

tfidf: number;

243

}

244

245

/**

246

* Static method for calculating term frequency

247

* @param term - Term to calculate TF for

248

* @param document - Document text or tokens

249

* @returns Term frequency

250

*/

251

// Static method — called on the class itself, e.g. natural.TfIdf.tf('term', doc)
TfIdf.tf(term: string, document: string | string[]): number;

252

```

253

254

**Usage Examples:**

255

256

```javascript

257

const natural = require('natural');

258

259

// Create TF-IDF instance

260

const tfidf = new natural.TfIdf();

261

262

// Add documents to corpus

263

tfidf.addDocument('this document is about node. node is a runtime');

264

tfidf.addDocument('this document is about ruby. ruby is a language');

265

tfidf.addDocument('this document is about ruby. ruby is also a gem');

266

267

// Calculate TF-IDF for specific terms in document 0

268

console.log('TF-IDF for "node" in doc 0:', tfidf.tfidf('node', 0));

269

console.log('TF-IDF for "ruby" in doc 0:', tfidf.tfidf('ruby', 0));

270

271

// Calculate across all documents

272

const nodeScores = tfidf.tfidfs('node');

273

console.log('Node scores across all docs:', nodeScores);

274

275

// List all terms in document 0 with scores

276

const terms = tfidf.listTerms(0);

277

console.log('All terms in doc 0:');

278

terms.forEach(term => {

279

console.log(`${term.term}: ${term.tfidf}`);

280

});

281

282

// Calculate IDF for a term

283

console.log('IDF for "document":', tfidf.idf('document'));

284

285

// Calculate TF-IDF for multiple terms in a single document

286

const multiTermScore = tfidf.tfidf(['this', 'document'], 0);

287

console.log('Multi-term TF-IDF:', multiTermScore);

288

```

289

290

### Advanced TF-IDF Usage

291

292

```javascript

293

const natural = require('natural');

294

const fs = require('fs');

295

296

/**

297

* Document similarity using TF-IDF

298

*/

299

function calculateDocumentSimilarity(documents) {

300

const tfidf = new natural.TfIdf();

301

302

// Add all documents

303

documents.forEach(doc => tfidf.addDocument(doc));

304

305

// Get all unique terms

306

const allTerms = new Set();

307

documents.forEach((doc, i) => {

308

const terms = tfidf.listTerms(i);

309

terms.forEach(term => allTerms.add(term.term));

310

});

311

312

// Create TF-IDF vectors for each document

313

const vectors = documents.map((doc, i) => {

314

const vector = {};

315

allTerms.forEach(term => {

316

vector[term] = tfidf.tfidf(term, i);

317

});

318

return vector;

319

});

320

321

return vectors;

322

}

323

324

// Example usage

325

const docs = [

326

'Machine learning is a subset of artificial intelligence',

327

'Natural language processing uses machine learning algorithms',

328

'Deep learning is a type of machine learning using neural networks'

329

];

330

331

const vectors = calculateDocumentSimilarity(docs);

332

console.log('Document TF-IDF vectors:', vectors);

333

```

334

335

### File-based TF-IDF

336

337

```javascript

338

const natural = require('natural');

339

340

// Create TF-IDF from files

341

const tfidf = new natural.TfIdf();

342

343

// Add documents from files

344

try {

345

tfidf.addFileSync('./document1.txt', 'utf8', 'doc1');

346

tfidf.addFileSync('./document2.txt', 'utf8', 'doc2');

347

tfidf.addFileSync('./document3.txt', 'utf8', 'doc3');

348

349

// Analyze specific terms

350

const searchTerms = ['machine', 'learning', 'algorithm'];

351

searchTerms.forEach(term => {

352

console.log(`\nTF-IDF scores for "${term}":`);

353

const scores = tfidf.tfidfs(term);

354

scores.forEach((score, i) => {

355

console.log(`Document ${i}: ${score}`);

356

});

357

});

358

359

// Find most relevant document for a query

360

const query = ['machine', 'learning'];

361

const queryScores = tfidf.tfidfs(query);

362

const mostRelevant = queryScores.indexOf(Math.max(...queryScores));

363

console.log(`Most relevant document for query: ${mostRelevant}`);

364

365

} catch (error) {

366

console.error('Error reading files:', error);

367

}

368

```

369

370

### Custom Tokenization and Stopwords

371

372

```javascript

373

const natural = require('natural');

374

375

// Create TF-IDF with custom settings

376

const tfidf = new natural.TfIdf();

377

378

// Set custom tokenizer

379

const customTokenizer = {

380

tokenize: function(text) {

381

// Custom tokenization logic

382

return text.toLowerCase()

383

.replace(/[^\w\s]/g, '') // Remove punctuation

384

.split(/\s+/)

385

.filter(token => token.length > 2); // Only tokens > 2 chars

386

}

387

};

388

tfidf.setTokenizer(customTokenizer);

389

390

// Set custom stopwords

391

const customStopwords = ['the', 'is', 'at', 'which', 'on', 'and', 'or', 'but'];

392

tfidf.setStopwords(customStopwords);

393

394

// Add documents with custom processing

395

tfidf.addDocument('This is a sample document with custom processing');

396

tfidf.addDocument('Another document for testing custom tokenization');

397

398

// Analyze with custom settings

399

const terms = tfidf.listTerms(0);

400

console.log('Terms with custom processing:', terms);

401

```