Comprehensive natural language processing library with tokenization, stemming, classification, sentiment analysis, phonetics, distance algorithms, and WordNet integration.
—
Best practices: Pending — whether this skill follows best practices has not yet been assessed.
Impact: Pending — no eval scenarios have been run.
Risk: Pending — the risk profile of this skill has not yet been evaluated.
Statistical text analysis tools for creating n-grams and calculating term frequency-inverse document frequency scores. These are fundamental techniques for text analytics, information retrieval, and feature extraction.
Generate sequences of n consecutive words or characters from text for pattern analysis and language modeling.
/**
 * Generate n-grams from a sequence
 * @param sequence - String or array of tokens; a string is tokenized first
 *   (see setTokenizer for customizing how strings are split)
 * @param n - Size of n-grams (1=unigrams, 2=bigrams, 3=trigrams, etc.)
 * @param startSymbol - Optional padding symbol for start of sequence
 * @param endSymbol - Optional padding symbol for end of sequence
 * @param stats - If true, returns statistics object instead of array
 * @returns Array of n-grams when stats is falsy, otherwise an NgramStatistics object
 */
function ngrams(sequence: string | string[], n: number, startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;
/**
 * Generate bigrams (2-grams) from sequence.
 * Convenience form of ngrams(sequence, 2, ...).
 * @param sequence - String or array of tokens
 * @param startSymbol - Optional padding symbol for start
 * @param endSymbol - Optional padding symbol for end
 * @param stats - If true, returns statistics object
 * @returns Array of bigrams when stats is falsy, otherwise an NgramStatistics object
 */
function bigrams(sequence: string | string[], startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;
/**
 * Generate trigrams (3-grams) from sequence.
 * Convenience form of ngrams(sequence, 3, ...).
 * @param sequence - String or array of tokens
 * @param startSymbol - Optional padding symbol for start
 * @param endSymbol - Optional padding symbol for end
 * @param stats - If true, returns statistics object
 * @returns Array of trigrams when stats is falsy, otherwise an NgramStatistics object
 */
function trigrams(sequence: string | string[], startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;
/**
 * Set custom tokenizer for n-gram generation
 * @param tokenizer - Tokenizer object with a tokenize method that splits
 *   a string into an array of token strings
 */
function setTokenizer(tokenizer: {tokenize: (text: string) => string[]}): void;
/**
 * Statistics object returned when stats=true
 */
interface NgramStatistics {
  /** All generated n-grams, in order of occurrence */
  ngrams: string[][];
  /** Occurrence count per n-gram, keyed by the comma-joined gram (e.g. 'hello,world') */
  frequencies: {[key: string]: number};
  /** Frequency-of-frequencies: Nr[r] = how many distinct n-grams occur exactly r times */
  Nr: {[key: string]: number};
  /** Total number of n-grams generated */
  numberOfNgrams: number;
}
Usage Examples:
// Example: basic n-gram generation with the natural library
const natural = require('natural');
// Basic n-gram generation
const text = 'hello world how are you';
// Unigrams (1-grams)
const unigrams = natural.ngrams(text, 1);
console.log(unigrams);
// [['hello'], ['world'], ['how'], ['are'], ['you']]
// Bigrams (2-grams)
const bigramArray = natural.bigrams(text);
console.log(bigramArray);
// [['hello', 'world'], ['world', 'how'], ['how', 'are'], ['are', 'you']]
// Trigrams (3-grams)
const trigramArray = natural.trigrams(text);
console.log(trigramArray);
// [['hello', 'world', 'how'], ['world', 'how', 'are'], ['how', 'are', 'you']]
// With padding symbols: '<s>' marks sequence start, '</s>' sequence end
const paddedBigrams = natural.bigrams(text, '<s>', '</s>');
console.log(paddedBigrams);
// [['<s>', 'hello'], ['hello', 'world'], ..., ['you', '</s>']]
// With statistics: pass null for both padding symbols, true for stats
const bigramStats = natural.bigrams(text, null, null, true);
console.log(bigramStats);
// {
// ngrams: [...],
// frequencies: {'hello,world': 1, 'world,how': 1, ...},
// Nr: {1: 4}, // 4 bigrams appear once
// numberOfNgrams: 4
// }
const natural = require('natural');
/**
 * Analyze n-gram frequencies in a text.
 * Computes n-gram statistics via natural.ngrams and reports the total
 * count, the number of distinct n-grams, and the ten most frequent ones.
 */
function analyzeNgrams(text, n = 2) {
  const result = natural.ngrams(text, n, null, null, true);
  // Rank distinct n-grams from most to least frequent
  const ranked = [];
  for (const [joined, count] of Object.entries(result.frequencies)) {
    ranked.push({ ngram: joined.split(','), frequency: count });
  }
  ranked.sort((left, right) => right.frequency - left.frequency);
  return {
    totalNgrams: result.numberOfNgrams,
    uniqueNgrams: Object.keys(result.frequencies).length,
    mostFrequent: ranked.slice(0, 10),
    frequencies: result.frequencies
  };
}
// Example usage: bigram analysis of a short document
const document = 'the quick brown fox jumps over the lazy dog the dog was lazy';
const analysis = analyzeNgrams(document, 2);
console.log('Total bigrams:', analysis.totalNgrams);
console.log('Unique bigrams:', analysis.uniqueNgrams);
console.log('Most frequent:', analysis.mostFrequent);
/**
* Chinese n-gram generation with specialized tokenization
*/
class NGramsZH {
  /** Generate n-grams of size n from Chinese text */
  static ngrams(text: string, n: number): string[][];
  /** Generate bigrams (2-grams) from Chinese text */
  static bigrams(text: string): string[][];
  /** Generate trigrams (3-grams) from Chinese text */
  static trigrams(text: string): string[][];
}
Usage Examples:
// Example: n-grams over Chinese text via the specialized NGramsZH class
const natural = require('natural');
// Chinese text n-grams
const chineseText = '你好世界今天天气很好';
const chineseBigrams = natural.NGramsZH.bigrams(chineseText);
console.log(chineseBigrams);
Term Frequency-Inverse Document Frequency calculation for document analysis and information retrieval.
/**
 * TF-IDF calculator for document corpus analysis
 * @param deserialized - Optional previously serialized TfIdf instance
 *   (restores a corpus saved earlier)
 */
class TfIdf {
constructor(deserialized?: object);
/**
 * Add document to the corpus
 * @param document - Document text or array of tokens
 * @param key - Optional document identifier
 * @param restoreCache - Whether to restore the cached IDF values after adding
 */
addDocument(document: string | string[], key?: string, restoreCache?: boolean): void;
/**
 * Add document from file synchronously
 * @param path - File path
 * @param encoding - File encoding (default: 'utf8')
 * @param key - Optional document identifier
 * @param restoreCache - Whether to restore the cached IDF values after adding
 */
addFileSync(path: string, encoding?: string, key?: string, restoreCache?: boolean): void;
/**
 * Remove document from corpus
 * @param key - Document identifier
 * @returns true if document was removed
 */
removeDocument(key: string): boolean;
/**
 * Calculate inverse document frequency for a term
 * @param term - Term to calculate IDF for
 * @param force - Force recalculation even if cached
 * @returns IDF value
 */
idf(term: string, force?: boolean): number;
/**
 * Calculate TF-IDF score for terms in a specific document
 * @param terms - Term or array of terms (an array presumably yields a
 *   combined score across the terms — confirm against the library docs)
 * @param documentIndex - Index of document in corpus
 * @returns TF-IDF score
 */
tfidf(terms: string | string[], documentIndex: number): number;
/**
 * Calculate TF-IDF for terms across all documents
 * @param terms - Term or array of terms
 * @param callback - Optional callback invoked with (documentIndex, score)
 * @returns Array of TF-IDF scores, one per document in corpus order
 */
tfidfs(terms: string | string[], callback?: (i: number, measure: number) => void): number[];
/**
 * List all terms in a document with their TF-IDF scores
 * @param documentIndex - Index of document
 * @returns Array of terms with scores
 */
listTerms(documentIndex: number): TfIdfTerm[];
/**
 * Set custom tokenizer for document processing
 * @param tokenizer - Tokenizer with tokenize method
 */
setTokenizer(tokenizer: {tokenize: (text: string) => string[]}): void;
/**
 * Set custom stopwords list
 * @param stopwords - Array of stopword strings
 */
setStopwords(stopwords: string[]): void;
}
/**
 * Term with TF-IDF score, as returned by TfIdf.listTerms
 */
interface TfIdfTerm {
/** The term text */
term: string;
/** TF-IDF score of the term within the listed document */
tfidf: number;
}
/**
 * Static method for calculating term frequency
 * @param term - Term to calculate TF for
 * @param document - Document text or tokens
 * @returns Term frequency
 */
// NOTE(review): shown here in documentation form; in real TypeScript a
// `static` member must be declared inside the class body. At runtime this
// is called as TfIdf.tf(term, document).
static TfIdf.tf(term: string, document: string | string[]): number;
Usage Examples:
// Example: building a TF-IDF corpus and querying term scores
const natural = require('natural');
// Create TF-IDF instance
const tfidf = new natural.TfIdf();
// Add documents to corpus
tfidf.addDocument('this document is about node. node is a runtime');
tfidf.addDocument('this document is about ruby. ruby is a language');
tfidf.addDocument('this document is about ruby. ruby is also a gem');
// Calculate TF-IDF for specific terms in document 0
console.log('TF-IDF for "node" in doc 0:', tfidf.tfidf('node', 0));
console.log('TF-IDF for "ruby" in doc 0:', tfidf.tfidf('ruby', 0));
// Calculate across all documents
const nodeScores = tfidf.tfidfs('node');
console.log('Node scores across all docs:', nodeScores);
// List all terms in document 0 with scores
const terms = tfidf.listTerms(0);
console.log('All terms in doc 0:');
terms.forEach(term => {
console.log(`${term.term}: ${term.tfidf}`);
});
// Calculate IDF for a term
console.log('IDF for "document":', tfidf.idf('document'));
// Calculate TF for multiple terms
const multiTermScore = tfidf.tfidf(['this', 'document'], 0);
console.log('Multi-term TF-IDF:', multiTermScore);
const natural = require('natural');
const fs = require('fs');
/**
 * Build per-document TF-IDF vectors for later similarity computation.
 * Every document is scored against the union of all terms observed in
 * the corpus, producing one {term: score} vector per document.
 */
function calculateDocumentSimilarity(documents) {
  const tfidf = new natural.TfIdf();
  // Load the corpus
  for (const doc of documents) {
    tfidf.addDocument(doc);
  }
  // Collect the union of terms across every document
  const vocabulary = new Set();
  documents.forEach((_, index) => {
    for (const entry of tfidf.listTerms(index)) {
      vocabulary.add(entry.term);
    }
  });
  // One TF-IDF vector per document, keyed by term
  return documents.map((_, index) => {
    const vector = {};
    vocabulary.forEach(term => {
      vector[term] = tfidf.tfidf(term, index);
    });
    return vector;
  });
}
// Example usage: TF-IDF vectors over a three-document corpus
const docs = [
'Machine learning is a subset of artificial intelligence',
'Natural language processing uses machine learning algorithms',
'Deep learning is a type of machine learning using neural networks'
];
const vectors = calculateDocumentSimilarity(docs);
console.log('Document TF-IDF vectors:', vectors);
const natural = require('natural');
// Example: building a TF-IDF corpus from files on disk
// Create TF-IDF from files
const tfidf = new natural.TfIdf();
// Add documents from files (the try/catch handles unreadable files)
try {
tfidf.addFileSync('./document1.txt', 'utf8', 'doc1');
tfidf.addFileSync('./document2.txt', 'utf8', 'doc2');
tfidf.addFileSync('./document3.txt', 'utf8', 'doc3');
// Analyze specific terms
const searchTerms = ['machine', 'learning', 'algorithm'];
searchTerms.forEach(term => {
console.log(`\nTF-IDF scores for "${term}":`);
const scores = tfidf.tfidfs(term);
scores.forEach((score, i) => {
console.log(`Document ${i}: ${score}`);
});
});
// Find most relevant document for a query: the one with the highest score
const query = ['machine', 'learning'];
const queryScores = tfidf.tfidfs(query);
const mostRelevant = queryScores.indexOf(Math.max(...queryScores));
console.log(`Most relevant document for query: ${mostRelevant}`);
} catch (error) {
console.error('Error reading files:', error);
}
const natural = require('natural');
// Example: TF-IDF with a custom tokenizer and custom stopword list
// Create TF-IDF with custom settings
const tfidf = new natural.TfIdf();
// Set custom tokenizer
const customTokenizer = {
tokenize: function(text) {
// Custom tokenization logic: lowercase, strip punctuation, drop short tokens
return text.toLowerCase()
.replace(/[^\w\s]/g, '') // Remove punctuation
.split(/\s+/)
.filter(token => token.length > 2); // Only tokens > 2 chars
}
};
tfidf.setTokenizer(customTokenizer);
// Set custom stopwords
const customStopwords = ['the', 'is', 'at', 'which', 'on', 'and', 'or', 'but'];
tfidf.setStopwords(customStopwords);
// Add documents with custom processing
tfidf.addDocument('This is a sample document with custom processing');
tfidf.addDocument('Another document for testing custom tokenization');
// Analyze with custom settings
const terms = tfidf.listTerms(0);
console.log('Terms with custom processing:', terms);