CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/npm-natural

Comprehensive natural language processing library with tokenization, stemming, classification, sentiment analysis, phonetics, distance algorithms, and WordNet integration.

Pending
Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Security by Snyk

Pending

The risk profile of this skill

Overview
Eval results
Files

docs/ngrams-tfidf.md

N-grams and TF-IDF

Statistical text analysis tools for creating n-grams and calculating term frequency-inverse document frequency scores. These are fundamental techniques for text analytics, information retrieval, and feature extraction.

Capabilities

N-grams

Generate sequences of n consecutive words or characters from text for pattern analysis and language modeling.

/**
 * Generate n-grams from a sequence
 * @param sequence - String or array of tokens
 * @param n - Size of n-grams (1=unigrams, 2=bigrams, 3=trigrams, etc.)
 * @param startSymbol - Optional padding symbol for start of sequence
 * @param endSymbol - Optional padding symbol for end of sequence
 * @param stats - If true, returns statistics object instead of array
 * @returns Array of n-grams or statistics object
 */
function ngrams(sequence: string | string[], n: number, startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

/**
 * Generate bigrams (2-grams) from a sequence.
 * Convenience form equivalent to ngrams(sequence, 2, ...).
 * @param sequence - String or array of tokens
 * @param startSymbol - Optional padding symbol for start
 * @param endSymbol - Optional padding symbol for end
 * @param stats - If true, returns statistics object
 * @returns Array of bigrams or statistics
 */
function bigrams(sequence: string | string[], startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

/**
 * Generate trigrams (3-grams) from a sequence.
 * Convenience form equivalent to ngrams(sequence, 3, ...).
 * @param sequence - String or array of tokens
 * @param startSymbol - Optional padding symbol for start
 * @param endSymbol - Optional padding symbol for end
 * @param stats - If true, returns statistics object
 * @returns Array of trigrams or statistics
 */
function trigrams(sequence: string | string[], startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

/**
 * Set a custom tokenizer used when n-gram functions receive string input.
 * @param tokenizer - Tokenizer object exposing a tokenize(text) => string[] method
 */
function setTokenizer(tokenizer: {tokenize: (text: string) => string[]}): void;

/**
 * Statistics object returned when stats=true.
 */
interface NgramStatistics {
  ngrams: string[][];                    // the generated n-grams themselves
  frequencies: {[key: string]: number};  // occurrence count per n-gram, keyed by comma-joined tokens (e.g. 'hello,world')
  Nr: {[key: string]: number};           // frequency-of-frequencies: Nr[r] = how many distinct n-grams occur exactly r times
  numberOfNgrams: number;                // total number of n-grams generated (duplicates included)
}

Usage Examples:

const natural = require('natural');

// Sample sentence used throughout this example.
const sample = 'hello world how are you';

// Unigrams (n = 1): every token stands alone.
const oneGrams = natural.ngrams(sample, 1);
console.log(oneGrams);
// [['hello'], ['world'], ['how'], ['are'], ['you']]

// Bigrams (n = 2): sliding window of two adjacent tokens.
const twoGrams = natural.bigrams(sample);
console.log(twoGrams);
// [['hello', 'world'], ['world', 'how'], ['how', 'are'], ['are', 'you']]

// Trigrams (n = 3): sliding window of three adjacent tokens.
const threeGrams = natural.trigrams(sample);
console.log(threeGrams);
// [['hello', 'world', 'how'], ['world', 'how', 'are'], ['how', 'are', 'you']]

// Boundary padding: '<s>' marks the sequence start, '</s>' the end.
const paddedPairs = natural.bigrams(sample, '<s>', '</s>');
console.log(paddedPairs);
// [['<s>', 'hello'], ['hello', 'world'], ..., ['you', '</s>']]

// stats=true yields an NgramStatistics object instead of a plain array.
const pairStats = natural.bigrams(sample, null, null, true);
console.log(pairStats);
// {
//   ngrams: [...],
//   frequencies: {'hello,world': 1, 'world,how': 1, ...},
//   Nr: {1: 4},  // 4 bigrams appear once
//   numberOfNgrams: 4
// }

N-gram Analysis

const natural = require('natural');

/**
 * Analyze n-gram frequencies in a text and report summary statistics:
 * total/unique n-gram counts, the 10 most frequent n-grams, and the
 * full frequency table.
 */
function analyzeNgrams(text, n = 2) {
  const stats = natural.ngrams(text, n, null, null, true);

  // Rank every distinct n-gram by how often it occurs (descending).
  const ranked = [];
  for (const [key, count] of Object.entries(stats.frequencies)) {
    // Frequency keys are comma-joined tokens; split them back into arrays.
    ranked.push({ ngram: key.split(','), frequency: count });
  }
  ranked.sort((a, b) => b.frequency - a.frequency);

  return {
    totalNgrams: stats.numberOfNgrams,
    uniqueNgrams: Object.keys(stats.frequencies).length,
    mostFrequent: ranked.slice(0, 10),
    frequencies: stats.frequencies
  };
}

// Example usage
const document = 'the quick brown fox jumps over the lazy dog the dog was lazy';
const report = analyzeNgrams(document, 2);
console.log('Total bigrams:', report.totalNgrams);
console.log('Unique bigrams:', report.uniqueNgrams);
console.log('Most frequent:', report.mostFrequent);

Chinese N-grams

/**
 * Chinese n-gram generation with specialized tokenization.
 * Mirrors the ngrams/bigrams/trigrams API for Chinese text
 * (presumably character-based rather than whitespace-based
 * tokenization — confirm against the library documentation).
 */
class NGramsZH {
  static ngrams(text: string, n: number): string[][];
  static bigrams(text: string): string[][];
  static trigrams(text: string): string[][];
}

Usage Examples:

const natural = require('natural');

// Bigrams over Chinese text via the specialized NGramsZH class.
const zhText = '你好世界今天天气很好';
const zhPairs = natural.NGramsZH.bigrams(zhText);
console.log(zhPairs);

TF-IDF

Term Frequency-Inverse Document Frequency calculation for document analysis and information retrieval.

/**
 * TF-IDF (term frequency–inverse document frequency) calculator for
 * document corpus analysis and information retrieval.
 * Incoming documents are tokenized and stopword-filtered; both steps
 * can be customized via setTokenizer and setStopwords.
 * @param deserialized - Optional previously serialized TfIdf instance
 */
class TfIdf {
  constructor(deserialized?: object);

  /**
   * Add document to the corpus.
   * @param document - Document text or array of tokens
   * @param key - Optional document identifier (used later by removeDocument)
   * @param restoreCache - Whether to restore IDF cache
   */
  addDocument(document: string | string[], key?: string, restoreCache?: boolean): void;

  /**
   * Add document from file synchronously.
   * NOTE(review): presumably throws on I/O failure — confirm against library docs.
   * @param path - File path
   * @param encoding - File encoding (default: 'utf8')
   * @param key - Optional document identifier
   * @param restoreCache - Whether to restore IDF cache
   */
  addFileSync(path: string, encoding?: string, key?: string, restoreCache?: boolean): void;

  /**
   * Remove document from corpus.
   * @param key - Document identifier supplied when the document was added
   * @returns true if document was removed
   */
  removeDocument(key: string): boolean;

  /**
   * Calculate inverse document frequency for a term.
   * @param term - Term to calculate IDF for
   * @param force - Force recalculation even if cached
   * @returns IDF value
   */
  idf(term: string, force?: boolean): number;

  /**
   * Calculate TF-IDF score for terms in a specific document.
   * @param terms - Term or array of terms (array scores are combined across terms)
   * @param documentIndex - Zero-based index of document, in insertion order
   * @returns TF-IDF score
   */
  tfidf(terms: string | string[], documentIndex: number): number;

  /**
   * Calculate TF-IDF for terms across all documents in the corpus.
   * @param terms - Term or array of terms
   * @param callback - Optional callback invoked per document with (documentIndex, measure)
   * @returns Array of TF-IDF scores, one per document
   */
  tfidfs(terms: string | string[], callback?: (i: number, measure: number) => void): number[];

  /**
   * List all terms in a document with their TF-IDF scores.
   * @param documentIndex - Zero-based index of document
   * @returns Array of terms with scores
   */
  listTerms(documentIndex: number): TfIdfTerm[];

  /**
   * Set custom tokenizer used when string documents are added.
   * @param tokenizer - Tokenizer exposing a tokenize(text) => string[] method
   */
  setTokenizer(tokenizer: {tokenize: (text: string) => string[]}): void;

  /**
   * Set custom stopwords list, replacing the default one.
   * @param stopwords - Array of stopword strings to exclude from term statistics
   */
  setStopwords(stopwords: string[]): void;
}

/**
 * A term paired with its TF-IDF score, as returned by TfIdf.listTerms.
 */
interface TfIdfTerm {
  term: string;   // the term text
  tfidf: number;  // TF-IDF score of the term within the queried document
}

/**
 * Static method for calculating term frequency
 * @param term - Term to calculate TF for
 * @param document - Document text or tokens
 * @returns Term frequency
 */
static TfIdf.tf(term: string, document: string | string[]): number;

Usage Examples:

const natural = require('natural');

// Build a small three-document corpus.
const tfidf = new natural.TfIdf();
[
  'this document is about node. node is a runtime',
  'this document is about ruby. ruby is a language',
  'this document is about ruby. ruby is also a gem'
].forEach(doc => tfidf.addDocument(doc));

// Score individual terms against document 0.
console.log('TF-IDF for "node" in doc 0:', tfidf.tfidf('node', 0));
console.log('TF-IDF for "ruby" in doc 0:', tfidf.tfidf('ruby', 0));

// Score one term against every document at once.
const nodeScores = tfidf.tfidfs('node');
console.log('Node scores across all docs:', nodeScores);

// Enumerate every term in document 0 together with its score.
console.log('All terms in doc 0:');
for (const { term, tfidf: score } of tfidf.listTerms(0)) {
  console.log(`${term}: ${score}`);
}

// Inverse document frequency of a single term.
console.log('IDF for "document":', tfidf.idf('document'));

// Passing an array of terms combines their scores.
const pairScore = tfidf.tfidf(['this', 'document'], 0);
console.log('Multi-term TF-IDF:', pairScore);

Advanced TF-IDF Usage

const natural = require('natural');
// NOTE(review): the original snippet also did `const fs = require('fs')` but
// never used it anywhere in this example, so the dead require is removed.

/**
 * Build a TF-IDF vector for each document over the corpus-wide vocabulary.
 * Vectors share the same key set (the union of all terms), which makes them
 * directly comparable, e.g. with cosine similarity.
 * @param {string[]} documents - Raw document texts
 * @returns {Array<Object<string, number>>} One {term: tfidfScore} map per document
 */
function calculateDocumentSimilarity(documents) {
  const tfidf = new natural.TfIdf();

  // Index every document first so IDF reflects the whole corpus.
  documents.forEach(doc => tfidf.addDocument(doc));

  // Collect the union of terms appearing in any document.
  const allTerms = new Set();
  documents.forEach((_, i) => {
    tfidf.listTerms(i).forEach(({ term }) => allTerms.add(term));
  });

  // One vector per document: the score of every vocabulary term in that document.
  return documents.map((_, i) => {
    const vector = {};
    allTerms.forEach(term => {
      vector[term] = tfidf.tfidf(term, i);
    });
    return vector;
  });
}

// Example usage
const docs = [
  'Machine learning is a subset of artificial intelligence',
  'Natural language processing uses machine learning algorithms',
  'Deep learning is a type of machine learning using neural networks'
];

const vectors = calculateDocumentSimilarity(docs);
console.log('Document TF-IDF vectors:', vectors);

File-based TF-IDF

const natural = require('natural');

const tfidf = new natural.TfIdf();

try {
  // Index three documents straight from disk, keyed doc1..doc3.
  for (let i = 1; i <= 3; i++) {
    tfidf.addFileSync(`./document${i}.txt`, 'utf8', `doc${i}`);
  }

  // Report per-document scores for each search term.
  for (const term of ['machine', 'learning', 'algorithm']) {
    console.log(`\nTF-IDF scores for "${term}":`);
    tfidf.tfidfs(term).forEach((score, i) => {
      console.log(`Document ${i}: ${score}`);
    });
  }

  // The best-matching document is the one with the highest combined score.
  const query = ['machine', 'learning'];
  const queryScores = tfidf.tfidfs(query);
  const mostRelevant = queryScores.indexOf(Math.max(...queryScores));
  console.log(`Most relevant document for query: ${mostRelevant}`);

} catch (error) {
  console.error('Error reading files:', error);
}

Custom Tokenization and Stopwords

const natural = require('natural');

const tfidf = new natural.TfIdf();

// Tokenizer: lowercase, strip punctuation, drop tokens of 2 chars or fewer.
tfidf.setTokenizer({
  tokenize(text) {
    return text
      .toLowerCase()
      .replace(/[^\w\s]/g, '') // Remove punctuation
      .split(/\s+/)
      .filter(token => token.length > 2); // Only tokens > 2 chars
  }
});

// Replace the default stopword list with a custom one.
tfidf.setStopwords(['the', 'is', 'at', 'which', 'on', 'and', 'or', 'but']);

// Documents are processed with the custom tokenizer/stopwords above.
tfidf.addDocument('This is a sample document with custom processing');
tfidf.addDocument('Another document for testing custom tokenization');

// Inspect the resulting term scores for the first document.
const terms = tfidf.listTerms(0);
console.log('Terms with custom processing:', terms);

docs

classification.md

distance.md

index.md

ngrams-tfidf.md

phonetics.md

pos-tagging.md

sentiment.md

text-processing.md

transliterators.md

utilities.md

wordnet.md

tile.json