Data framework for your LLM application
—
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Pending
The risk profile of this skill
Text processing and document chunking functionality for preparing data for indexing and retrieval in LlamaIndex.TS.
import { Document, SentenceSplitter, TextNode } from "llamaindex";
// Or from specific submodules
import { SentenceSplitter, MarkdownNodeParser } from "llamaindex/node-parser";

Document processing in LlamaIndex.TS involves transforming raw text into structured nodes that can be indexed and retrieved. The system provides various node parsers for different text types and chunking strategies.
The Document class represents a source document with text content and metadata.
class Document {
constructor(init: {
text: string;
id_?: string;
metadata?: Record<string, any>;
mimetype?: string;
relationships?: Record<string, any>;
});
text: string;
id_: string;
metadata: Record<string, any>;
mimetype?: string;
relationships?: Record<string, any>;
getText(): string;
setContent(value: string): void;
asRelatedNodeInfo(): RelatedNodeInfo;
}

Base class for all node types in the system.
class BaseNode {
id_: string;
text: string;
metadata: Record<string, any>;
relationships: Record<string, any>;
getText(): string;
setContent(value: string): void;
asRelatedNodeInfo(): RelatedNodeInfo;
}

Represents a chunk of text extracted from a document.
class TextNode extends BaseNode {
constructor(init: {
text: string;
id_?: string;
metadata?: Record<string, any>;
relationships?: Record<string, any>;
});
startCharIdx?: number;
endCharIdx?: number;
textTemplate: string;
metadataTemplate: string;
metadataSeparator: string;
getContent(metadataMode?: MetadataMode): string;
getMetadataStr(mode?: MetadataMode): string;
setContent(value: string): void;
}

Base interface for all node parsers.
interface NodeParser {
getNodesFromDocuments(documents: Document[], showProgress?: boolean): TextNode[];
splitText(text: string): string[];
}

The most commonly used node parser; it splits text into sentences and groups them into chunks.
class SentenceSplitter implements NodeParser {
constructor(options?: {
chunkSize?: number;
chunkOverlap?: number;
tokenizer?: (text: string) => string[];
paragraphSeparator?: string;
chunkingTokenizerFn?: (text: string) => string[];
secondaryChunkingRegex?: string;
separator?: string;
});
chunkSize: number;
chunkOverlap: number;
separator: string;
paragraphSeparator: string;
getNodesFromDocuments(documents: Document[], showProgress?: boolean): TextNode[];
splitText(text: string): string[];
splitTextMetadataAware(text: string, metadata: Record<string, any>): string[];
}

Splits text based on token count rather than characters.
class TokenTextSplitter implements NodeParser {
constructor(options?: {
chunkSize?: number;
chunkOverlap?: number;
separator?: string;
tokenizer?: (text: string) => string[];
chunkingTokenizerFn?: (text: string) => string[];
});
getNodesFromDocuments(documents: Document[], showProgress?: boolean): TextNode[];
splitText(text: string): string[];
}

Specialized parser for Markdown documents that preserves structure.
class MarkdownNodeParser implements NodeParser {
constructor(options?: {
chunkSize?: number;
chunkOverlap?: number;
});
getNodesFromDocuments(documents: Document[], showProgress?: boolean): TextNode[];
splitText(text: string): string[];
}

Parser for HTML documents that handles tags and structure.
class HTMLNodeParser implements NodeParser {
constructor(options?: {
chunkSize?: number;
chunkOverlap?: number;
tags?: string[];
});
getNodesFromDocuments(documents: Document[], showProgress?: boolean): TextNode[];
splitText(text: string): string[];
}

Specialized parser for source code that respects language syntax.
class CodeSplitter implements NodeParser {
constructor(options?: {
language: string;
chunkLines?: number;
chunkLinesOverlap?: number;
maxChars?: number;
});
getNodesFromDocuments(documents: Document[], showProgress?: boolean): TextNode[];
splitText(text: string): string[];
}

Creates overlapping windows of sentences for better context preservation.
class SentenceWindowNodeParser implements NodeParser {
constructor(options?: {
windowSize?: number;
windowMetadataKey?: string;
originalTextMetadataKey?: string;
});
getNodesFromDocuments(documents: Document[], showProgress?: boolean): TextNode[];
splitText(text: string): string[];
}

import { Document } from "llamaindex";
// Simple document
const doc = new Document({
text: "This is the content of my document.",
id_: "doc-1"
});
// Document with metadata
const docWithMetadata = new Document({
text: "Financial report for Q3 2024...",
id_: "financial-report-q3-2024",
metadata: {
author: "John Doe",
department: "Finance",
date: "2024-03-31",
classification: "internal"
}
});

import { SentenceSplitter, Document } from "llamaindex";
const documents = [
new Document({ text: "Long document content here..." }),
];
// Create splitter with default settings
const splitter = new SentenceSplitter({
chunkSize: 1024,
chunkOverlap: 20,
});
// Split documents into nodes
const nodes = splitter.getNodesFromDocuments(documents);
console.log(`Created ${nodes.length} text nodes`);
nodes.forEach((node, i) => {
console.log(`Node ${i}: ${node.text.substring(0, 100)}...`);
});

import { SentenceSplitter } from "llamaindex/node-parser";
// Custom tokenizer and separators
const advancedSplitter = new SentenceSplitter({
chunkSize: 512,
chunkOverlap: 50,
separator: " ",
paragraphSeparator: "\n\n",
chunkingTokenizerFn: (text: string) => text.split(/\s+/), // Custom tokenizer
});
const nodes = advancedSplitter.getNodesFromDocuments(documents);

import { MarkdownNodeParser } from "llamaindex/node-parser";
const markdownDoc = new Document({
text: `# Chapter 1\n\nThis is the introduction.\n\n## Section 1.1\n\nContent here...`,
});
const markdownParser = new MarkdownNodeParser({
chunkSize: 1024,
});
const markdownNodes = markdownParser.getNodesFromDocuments([markdownDoc]);

import { HTMLNodeParser } from "llamaindex/node-parser";
const htmlDoc = new Document({
text: `<html><body><h1>Title</h1><p>Paragraph content...</p></body></html>`,
});
const htmlParser = new HTMLNodeParser({
chunkSize: 512,
tags: ["p", "h1", "h2", "div"], // Focus on specific tags
});
const htmlNodes = htmlParser.getNodesFromDocuments([htmlDoc]);

import { CodeSplitter } from "llamaindex/node-parser";
const codeDoc = new Document({
text: `function example() {\n return "Hello World";\n}\n\nclass MyClass {\n constructor() {}\n}`,
});
const codeSplitter = new CodeSplitter({
language: "javascript",
chunkLines: 10,
chunkLinesOverlap: 2,
maxChars: 1000,
});
const codeNodes = codeSplitter.getNodesFromDocuments([codeDoc]);

import { Settings, SentenceSplitter } from "llamaindex";
// Set global node parser
Settings.nodeParser = new SentenceSplitter({
chunkSize: 1024,
chunkOverlap: 20,
});
// All indexing operations will use this parser by default

import { Settings, TokenTextSplitter, VectorStoreIndex } from "llamaindex";
const documents = [/* your documents */];
// Use different parser for specific operation
const index = Settings.withNodeParser(
new TokenTextSplitter({ chunkSize: 512 }),
() => {
return VectorStoreIndex.fromDocuments(documents);
}
);

const nodes = splitter.getNodesFromDocuments(documents);
nodes.forEach(node => {
console.log("Node ID:", node.id_);
console.log("Text:", node.text);
console.log("Metadata:", node.metadata);
// Check relationships to source document
if (node.relationships.SOURCE_NODE) {
console.log("Source document ID:", node.relationships.SOURCE_NODE.nodeId);
}
// Check text positions if available
if (node.startCharIdx !== undefined) {
console.log(`Text span: ${node.startCharIdx}-${node.endCharIdx}`);
}
});

Document metadata is automatically propagated to generated nodes:
const docWithMeta = new Document({
text: "Content here...",
metadata: {
source: "research-paper.pdf",
page: 1,
section: "introduction"
}
});
const nodes = splitter.getNodesFromDocuments([docWithMeta]);
// Each node will contain the document metadata
nodes.forEach(node => {
console.log(node.metadata); // { source: "research-paper.pdf", page: 1, section: "introduction" }
});

// For general text (articles, books)
const generalSplitter = new SentenceSplitter({
chunkSize: 1024, // Good balance of context and specificity
chunkOverlap: 20,
});
// For code
const codeSplitter = new CodeSplitter({
language: "typescript",
chunkLines: 15, // Functions or logical blocks
chunkLinesOverlap: 3,
});
// For short-form content (tweets, messages)
const shortFormSplitter = new SentenceSplitter({
chunkSize: 256, // Smaller chunks for focused retrieval
chunkOverlap: 10,
});

// Route a document to an appropriate node parser: mimetype checks first
// (HTML, then Markdown), then a file-extension fallback for Python source,
// and finally the general-purpose sentence splitter for everything else.
const processDocumentByType = (doc: Document) => {
  const mime = doc.mimetype ?? "";
  const parser = mime.includes("html")
    ? new HTMLNodeParser()
    : mime.includes("markdown")
      ? new MarkdownNodeParser()
      : doc.metadata.fileExtension === ".py"
        ? new CodeSplitter({ language: "python" })
        : new SentenceSplitter();
  return parser.getNodesFromDocuments([doc]);
};