CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/npm-natural

Comprehensive natural language processing library with tokenization, stemming, classification, sentiment analysis, phonetics, distance algorithms, and WordNet integration.

Pending
Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Security by Snyk

Pending

The risk profile of this skill

Overview
Eval results
Files

docs/ngrams-tfidf.md

N-grams and TF-IDF

Statistical text analysis tools for creating n-grams and calculating term frequency-inverse document frequency scores. These are fundamental techniques for text analytics, information retrieval, and feature extraction.

Capabilities

N-grams

Generate sequences of n consecutive words or characters from text for pattern analysis and language modeling.

/**
 * Generate n-grams from a sequence
 * @param sequence - String or array of tokens
 * @param n - Size of n-grams (1=unigrams, 2=bigrams, 3=trigrams, etc.)
 * @param startSymbol - Optional padding symbol for start of sequence
 * @param endSymbol - Optional padding symbol for end of sequence
 * @param stats - If true, returns statistics object instead of array
 * @returns Array of n-grams or statistics object
 */
function ngrams(sequence: string | string[], n: number, startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

/**
 * Generate bigrams (2-grams) from a sequence.
 * Convenience form equivalent to ngrams(sequence, 2, ...).
 * @param sequence - String or array of tokens
 * @param startSymbol - Optional padding symbol for start
 * @param endSymbol - Optional padding symbol for end
 * @param stats - If true, returns statistics object
 * @returns Array of bigrams or statistics
 */
function bigrams(sequence: string | string[], startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

/**
 * Generate trigrams (3-grams) from a sequence.
 * Convenience form equivalent to ngrams(sequence, 3, ...).
 * @param sequence - String or array of tokens
 * @param startSymbol - Optional padding symbol for start
 * @param endSymbol - Optional padding symbol for end
 * @param stats - If true, returns statistics object
 * @returns Array of trigrams or statistics
 */
function trigrams(sequence: string | string[], startSymbol?: string, endSymbol?: string, stats?: boolean): string[][] | NgramStatistics;

/**
 * Set a custom tokenizer used when n-gram functions receive string input.
 * @param tokenizer - Tokenizer object exposing a tokenize(text) => string[] method
 */
function setTokenizer(tokenizer: {tokenize: (text: string) => string[]}): void;

/**
 * Statistics object returned when stats=true.
 */
interface NgramStatistics {
  ngrams: string[][];                    // the generated n-grams themselves
  frequencies: {[key: string]: number};  // occurrence count per n-gram, keyed by comma-joined tokens (e.g. 'hello,world')
  Nr: {[key: string]: number};           // frequency-of-frequencies: Nr[r] = how many distinct n-grams occur exactly r times
  numberOfNgrams: number;                // total number of n-grams generated (duplicates included)
}

Usage Examples:

const natural = require('natural');

// Sample sentence used throughout this example.
const sample = 'hello world how are you';

// Unigrams (n = 1): every token stands alone.
const oneGrams = natural.ngrams(sample, 1);
console.log(oneGrams);
// [['hello'], ['world'], ['how'], ['are'], ['you']]

// Bigrams (n = 2): sliding window of two adjacent tokens.
const twoGrams = natural.bigrams(sample);
console.log(twoGrams);
// [['hello', 'world'], ['world', 'how'], ['how', 'are'], ['are', 'you']]

// Trigrams (n = 3): sliding window of three adjacent tokens.
const threeGrams = natural.trigrams(sample);
console.log(threeGrams);
// [['hello', 'world', 'how'], ['world', 'how', 'are'], ['how', 'are', 'you']]

// Boundary padding: '<s>' marks the sequence start, '</s>' the end.
const paddedPairs = natural.bigrams(sample, '<s>', '</s>');
console.log(paddedPairs);
// [['<s>', 'hello'], ['hello', 'world'], ..., ['you', '</s>']]

// stats=true yields an NgramStatistics object instead of a plain array.
const pairStats = natural.bigrams(sample, null, null, true);
console.log(pairStats);
// {
//   ngrams: [...],
//   frequencies: {'hello,world': 1, 'world,how': 1, ...},
//   Nr: {1: 4},  // 4 bigrams appear once
//   numberOfNgrams: 4
// }

N-gram Analysis

const natural = require('natural');

/**
 * Analyze n-gram frequencies in a text and report summary statistics:
 * total/unique n-gram counts, the 10 most frequent n-grams, and the
 * full frequency table.
 */
function analyzeNgrams(text, n = 2) {
  const stats = natural.ngrams(text, n, null, null, true);

  // Rank every distinct n-gram by how often it occurs (descending).
  const ranked = [];
  for (const [key, count] of Object.entries(stats.frequencies)) {
    // Frequency keys are comma-joined tokens; split them back into arrays.
    ranked.push({ ngram: key.split(','), frequency: count });
  }
  ranked.sort((a, b) => b.frequency - a.frequency);

  return {
    totalNgrams: stats.numberOfNgrams,
    uniqueNgrams: Object.keys(stats.frequencies).length,
    mostFrequent: ranked.slice(0, 10),
    frequencies: stats.frequencies
  };
}

// Example usage
const document = 'the quick brown fox jumps over the lazy dog the dog was lazy';
const report = analyzeNgrams(document, 2);
console.log('Total bigrams:', report.totalNgrams);
console.log('Unique bigrams:', report.uniqueNgrams);
console.log('Most frequent:', report.mostFrequent);

Chinese N-grams

/**
 * Chinese n-gram generation with specialized tokenization.
 * Mirrors the ngrams/bigrams/trigrams API for Chinese text
 * (presumably character-based rather than whitespace-based
 * tokenization — confirm against the library documentation).
 */
class NGramsZH {
  static ngrams(text: string, n: number): string[][];
  static bigrams(text: string): string[][];
  static trigrams(text: string): string[][];
}

Usage Examples:

const natural = require('natural');

// Bigrams over Chinese text via the specialized NGramsZH class.
const zhText = '你好世界今天天气很好';
const zhPairs = natural.NGramsZH.bigrams(zhText);
console.log(zhPairs);

TF-IDF

Term Frequency-Inverse Document Frequency calculation for document analysis and information retrieval.

/**
 * TF-IDF (term frequency–inverse document frequency) calculator for
 * document corpus analysis and information retrieval.
 * Incoming documents are tokenized and stopword-filtered; both steps
 * can be customized via setTokenizer and setStopwords.
 * @param deserialized - Optional previously serialized TfIdf instance
 */
class TfIdf {
  constructor(deserialized?: object);

  /**
   * Add document to the corpus.
   * @param document - Document text or array of tokens
   * @param key - Optional document identifier (used later by removeDocument)
   * @param restoreCache - Whether to restore IDF cache
   */
  addDocument(document: string | string[], key?: string, restoreCache?: boolean): void;

  /**
   * Add document from file synchronously.
   * NOTE(review): presumably throws on I/O failure — confirm against library docs.
   * @param path - File path
   * @param encoding - File encoding (default: 'utf8')
   * @param key - Optional document identifier
   * @param restoreCache - Whether to restore IDF cache
   */
  addFileSync(path: string, encoding?: string, key?: string, restoreCache?: boolean): void;

  /**
   * Remove document from corpus.
   * @param key - Document identifier supplied when the document was added
   * @returns true if document was removed
   */
  removeDocument(key: string): boolean;

  /**
   * Calculate inverse document frequency for a term.
   * @param term - Term to calculate IDF for
   * @param force - Force recalculation even if cached
   * @returns IDF value
   */
  idf(term: string, force?: boolean): number;

  /**
   * Calculate TF-IDF score for terms in a specific document.
   * @param terms - Term or array of terms (array scores are combined across terms)
   * @param documentIndex - Zero-based index of document, in insertion order
   * @returns TF-IDF score
   */
  tfidf(terms: string | string[], documentIndex: number): number;

  /**
   * Calculate TF-IDF for terms across all documents in the corpus.
   * @param terms - Term or array of terms
   * @param callback - Optional callback invoked per document with (documentIndex, measure)
   * @returns Array of TF-IDF scores, one per document
   */
  tfidfs(terms: string | string[], callback?: (i: number, measure: number) => void): number[];

  /**
   * List all terms in a document with their TF-IDF scores.
   * @param documentIndex - Zero-based index of document
   * @returns Array of terms with scores
   */
  listTerms(documentIndex: number): TfIdfTerm[];

  /**
   * Set custom tokenizer used when string documents are added.
   * @param tokenizer - Tokenizer exposing a tokenize(text) => string[] method
   */
  setTokenizer(tokenizer: {tokenize: (text: string) => string[]}): void;

  /**
   * Set custom stopwords list, replacing the default one.
   * @param stopwords - Array of stopword strings to exclude from term statistics
   */
  setStopwords(stopwords: string[]): void;
}

/**
 * A term paired with its TF-IDF score, as returned by TfIdf.listTerms.
 */
interface TfIdfTerm {
  term: string;   // the term text
  tfidf: number;  // TF-IDF score of the term within the queried document
}

/**
 * Static method for calculating term frequency
 * @param term - Term to calculate TF for
 * @param document - Document text or tokens
 * @returns Term frequency
 */
static TfIdf.tf(term: string, document: string | string[]): number;

Usage Examples:

const natural = require('natural');

// Build a small three-document corpus.
const tfidf = new natural.TfIdf();
[
  'this document is about node. node is a runtime',
  'this document is about ruby. ruby is a language',
  'this document is about ruby. ruby is also a gem'
].forEach(doc => tfidf.addDocument(doc));

// Score individual terms against document 0.
console.log('TF-IDF for "node" in doc 0:', tfidf.tfidf('node', 0));
console.log('TF-IDF for "ruby" in doc 0:', tfidf.tfidf('ruby', 0));

// Score one term against every document at once.
const nodeScores = tfidf.tfidfs('node');
console.log('Node scores across all docs:', nodeScores);

// Enumerate every term in document 0 together with its score.
console.log('All terms in doc 0:');
for (const { term, tfidf: score } of tfidf.listTerms(0)) {
  console.log(`${term}: ${score}`);
}

// Inverse document frequency of a single term.
console.log('IDF for "document":', tfidf.idf('document'));

// Passing an array of terms combines their scores.
const pairScore = tfidf.tfidf(['this', 'document'], 0);
console.log('Multi-term TF-IDF:', pairScore);

Advanced TF-IDF Usage

const natural = require('natural');
// NOTE(review): the original snippet also did `const fs = require('fs')` but
// never used it anywhere in this example, so the dead require is removed.

/**
 * Build a TF-IDF vector for each document over the corpus-wide vocabulary.
 * Vectors share the same key set (the union of all terms), which makes them
 * directly comparable, e.g. with cosine similarity.
 * @param {string[]} documents - Raw document texts
 * @returns {Array<Object<string, number>>} One {term: tfidfScore} map per document
 */
function calculateDocumentSimilarity(documents) {
  const tfidf = new natural.TfIdf();

  // Index every document first so IDF reflects the whole corpus.
  documents.forEach(doc => tfidf.addDocument(doc));

  // Collect the union of terms appearing in any document.
  const allTerms = new Set();
  documents.forEach((_, i) => {
    tfidf.listTerms(i).forEach(({ term }) => allTerms.add(term));
  });

  // One vector per document: the score of every vocabulary term in that document.
  return documents.map((_, i) => {
    const vector = {};
    allTerms.forEach(term => {
      vector[term] = tfidf.tfidf(term, i);
    });
    return vector;
  });
}

// Example usage
const docs = [
  'Machine learning is a subset of artificial intelligence',
  'Natural language processing uses machine learning algorithms',
  'Deep learning is a type of machine learning using neural networks'
];

const vectors = calculateDocumentSimilarity(docs);
console.log('Document TF-IDF vectors:', vectors);

File-based TF-IDF

const natural = require('natural');

const tfidf = new natural.TfIdf();

try {
  // Index three documents straight from disk, keyed doc1..doc3.
  for (let i = 1; i <= 3; i++) {
    tfidf.addFileSync(`./document${i}.txt`, 'utf8', `doc${i}`);
  }

  // Report per-document scores for each search term.
  for (const term of ['machine', 'learning', 'algorithm']) {
    console.log(`\nTF-IDF scores for "${term}":`);
    tfidf.tfidfs(term).forEach((score, i) => {
      console.log(`Document ${i}: ${score}`);
    });
  }

  // The best-matching document is the one with the highest combined score.
  const query = ['machine', 'learning'];
  const queryScores = tfidf.tfidfs(query);
  const mostRelevant = queryScores.indexOf(Math.max(...queryScores));
  console.log(`Most relevant document for query: ${mostRelevant}`);

} catch (error) {
  console.error('Error reading files:', error);
}

Custom Tokenization and Stopwords

const natural = require('natural');

const tfidf = new natural.TfIdf();

// Tokenizer: lowercase, strip punctuation, drop tokens of 2 chars or fewer.
tfidf.setTokenizer({
  tokenize(text) {
    return text
      .toLowerCase()
      .replace(/[^\w\s]/g, '') // Remove punctuation
      .split(/\s+/)
      .filter(token => token.length > 2); // Only tokens > 2 chars
  }
});

// Replace the default stopword list with a custom one.
tfidf.setStopwords(['the', 'is', 'at', 'which', 'on', 'and', 'or', 'but']);

// Documents are processed with the custom tokenizer/stopwords above.
tfidf.addDocument('This is a sample document with custom processing');
tfidf.addDocument('Another document for testing custom tokenization');

// Inspect the resulting term scores for the first document.
const terms = tfidf.listTerms(0);
console.log('Terms with custom processing:', terms);

docs

classification.md

distance.md

index.md

ngrams-tfidf.md

phonetics.md

pos-tagging.md

sentiment.md

text-processing.md

transliterators.md

utilities.md

wordnet.md

tile.json