or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

array-diffing.mdcharacter-diffing.mdcss-diffing.mdcustom-diffing.mdformat-conversion.mdindex.mdjson-diffing.mdline-diffing.mdpatch-application.mdpatch-creation.mdpatch-utilities.mdsentence-diffing.mdword-diffing.md
tile.json

sentence-diffing.mddocs/

Sentence Diffing

Sentence-level text comparison for natural language processing applications. Automatically detects sentence boundaries based on punctuation marks followed by whitespace.

Capabilities

diffSentences Function

Performs sentence-level diff between two strings, treating each sentence as a token.

/**
 * Compare two strings at the sentence level
 * @param oldStr - Original text with sentences
 * @param newStr - New text to compare against
 * @param options - Configuration options
 * @returns Array of change objects representing the diff
 */
function diffSentences(oldStr, newStr, options);

Usage Examples:

import { diffSentences } from "diff";

// Basic sentence diff
const result = diffSentences(
  "Hello world. This is a test. Good bye.",
  "Hello world. This is modified. Good bye."
);
console.log(result);
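// [
//   { value: "Hello world. ", count: 2 },
//   { value: "This is a test.", removed: true, count: 1 },
//   { value: "This is modified.", added: true, count: 1 },
//   { value: " Good bye.", count: 2 }
// ]
// The whitespace between sentences is a separate token, so it is grouped with
// the neighbouring unchanged text (hence count: 2). Whether unchanged entries
// also carry explicit added/removed fields depends on the library version.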

// Multiple sentence changes
const multiResult = diffSentences(
  "First sentence. Second sentence. Third sentence.",
  "First sentence. New second sentence. Third sentence. Added sentence!"
);

sentenceDiff Instance

Pre-configured Diff instance for sentence-level comparisons with sentence-aware tokenization.

/**
 * Pre-configured sentence diff instance
 * Uses regex-based sentence boundary detection
 */
const sentenceDiff: Diff;

Advanced Usage

Sentence Boundary Detection

The sentence diff uses the following rules for sentence detection:

  • Sentences end with ., !, or ?
  • Must be followed by whitespace or end of string
  • Simple regex pattern: /(\S.+?[.!?])(?=\s+|$)/
import { diffSentences } from "diff";

// Various punctuation marks
const punctuation = diffSentences(
  "Statement. Question? Exclamation!",
  "New statement. Question? Different exclamation!"
);

// Sentences with complex punctuation
const complex = diffSentences(
  "Dr. Smith said hello. Then he left.",
  "Dr. Smith said goodbye. Then he left."
);
// Note: "Dr." is followed by whitespace, so it IS treated as a sentence end;
// the detector does not special-case abbreviations.

Natural Language Processing

import { diffSentences } from "diff";

/**
 * Summarize the sentence-level diff between two paragraphs.
 *
 * Tallies are per change object returned by diffSentences. A removed change
 * directly followed by an added change is re-classified as one modification.
 *
 * @param {string} oldParagraph - Original paragraph text.
 * @param {string} newParagraph - Revised paragraph text.
 * @returns {{unchanged: number, added: number, removed: number, modified: number}}
 *   Tally of change objects by category.
 */
function analyzeParagraphChanges(oldParagraph, newParagraph) {
  const changes = diffSentences(oldParagraph, newParagraph);
  const stats = { unchanged: 0, added: 0, removed: 0, modified: 0 };

  // First pass: bucket every change object by its flag (added wins over removed).
  for (const { added, removed } of changes) {
    if (added) {
      stats.added += 1;
    } else if (removed) {
      stats.removed += 1;
    } else {
      stats.unchanged += 1;
    }
  }

  // Second pass: a removal immediately followed by an addition is treated as
  // a modification, so move that pair out of the added/removed tallies.
  changes.forEach((current, index) => {
    const hasNext = index < changes.length - 1;
    if (hasNext && current.removed && changes[index + 1].added) {
      stats.modified += 1;
      stats.added -= 1;
      stats.removed -= 1;
    }
  });

  return stats;
}

const oldText = "The cat sat on the mat. It was comfortable. The end.";
const newText = "The dog sat on the rug. It was very comfortable. The end.";
const analysis = analyzeParagraphChanges(oldText, newText);

Document Comparison

import { diffSentences } from "diff";

/**
 * Describe every change produced by a sentence-level diff of two documents.
 *
 * @param {string} doc1 - Original document text.
 * @param {string} doc2 - Revised document text.
 * @returns {Array<{sentenceNumber: number, content: string, status: string, wordCount: number}>}
 *   One entry per change object: its 1-based position in the diff, its
 *   trimmed text, its status ('added', 'removed' or 'unchanged') and the
 *   number of whitespace-separated words in the trimmed text.
 */
function compareDocuments(doc1, doc2) {
  return diffSentences(doc1, doc2).map((change, index) => {
    const content = change.value.trim();

    let status = 'unchanged';
    if (change.added) {
      status = 'added';
    } else if (change.removed) {
      status = 'removed';
    }

    return {
      sentenceNumber: index + 1,
      content,
      status,
      // ''.split(/\s+/) yields [''], so a whitespace-only change would
      // otherwise be reported as containing one word.
      wordCount: content === '' ? 0 : content.split(/\s+/).length,
    };
  });
}

// Usage for document analysis
const original = "First sentence. Second sentence. Third sentence.";
const revised = "First sentence. Modified second sentence. Third sentence. New sentence.";
const comparison = compareDocuments(original, revised);

Async Processing for Large Texts

import { diffSentences } from "diff";

/**
 * Start a callback-based sentence diff with limits suited to long documents.
 *
 * @param {string} oldDoc - Original document text.
 * @param {string} newDoc - Revised document text.
 * @param {Function} callback - Forwarded to diffSentences as the `callback` option.
 * @returns {void}
 */
function diffLongDocument(oldDoc, newDoc, callback) {
  const options = {
    callback,
    maxEditLength: 1000, // Limit for very long documents
    timeout: 15000, // 15 second timeout
  };

  diffSentences(oldDoc, newDoc, options);
}

// Usage
diffLongDocument(longDocument1, longDocument2, (result) => {
  if (result) {
    const sentenceCount = result.length;
    const changes = result.filter(r => r.added || r.removed).length;
    console.log(`Compared ${sentenceCount} sentences, ${changes} changes found`);
  } else {
    console.log("Document too complex to diff efficiently");
  }
});

Direct Instance Usage

import { sentenceDiff } from "diff";

// Using the pre-configured instance directly
const directResult = sentenceDiff.diff(
  "Old sentence. Another old sentence.",
  "New sentence. Another old sentence."
);

// Access tokenization
const sentences = sentenceDiff.tokenize("First. Second! Third?");
console.log("Detected sentences:", sentences);
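// Sentences and the whitespace between them are separate tokens, e.g.
// ["First.", " ", "Second!", " ", "Third?"]
// (some releases may also return empty strings at the start and end).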

Limitations and Considerations

Sentence Detection Limitations

// The sentence detector has limitations with:

// Abbreviations
const abbrev = "Dr. Smith works at U.S.A. Corp.";  
// May not handle all abbreviations correctly

// Decimal numbers
const numbers = "The price is $12.99. That's expensive.";
// Should work correctly as no whitespace after decimal

// Ellipses  
const ellipses = "Well... I think so.";
// An ellipsis followed by whitespace ends in "." + whitespace, so it IS
// treated as a sentence boundary ("Well..." becomes its own sentence)

Alternative for Complex Text

import { diffArrays, diffSentences } from "diff";

// For more sophisticated sentence detection, use Intl.Segmenter
/**
 * Diff two texts sentence by sentence, using Intl.Segmenter for sentence
 * detection when the runtime provides it.
 *
 * @param {string} oldText - Original text.
 * @param {string} newText - Revised text.
 * @param {string|string[]} [locale='en'] - Locale(s) handed to Intl.Segmenter.
 * @returns {Array} Result of diffArrays over the segmented sentences, or of
 *   diffSentences when Intl.Segmenter is unavailable.
 */
function advancedSentenceDiff(oldText, newText, locale = 'en') {
  if (typeof Intl === 'undefined' || !Intl.Segmenter) {
    // Fallback to built-in sentence diff. Note that diffSentences must be
    // imported from "diff" together with diffArrays for this branch to work.
    return diffSentences(oldText, newText);
  }

  const segmenter = new Intl.Segmenter(locale, { granularity: 'sentence' });
  const toSentences = (text) =>
    Array.from(segmenter.segment(text), ({ segment }) => segment);

  return diffArrays(toSentences(oldText), toSentences(newText));
}