Various implementations of LangChain.js text splitters for retrieval-augmented generation (RAG) pipelines
Status overview:
- Best-practice compliance: Pending — no review of whether this skill follows best practices has been completed.
- Impact: Pending — no eval scenarios have been run.
- Risk: Pending — the risk profile of this skill has not yet been assessed.
LangChain Text Splitters provides various implementations of text splitting functionality for LangChain.js, most commonly used as part of retrieval-augmented generation (RAG) pipelines. The library offers abstract base classes and concrete implementations for splitting text documents into smaller chunks with configurable size, overlap, and length functions.
Install: npm install @langchain/textsplitters @langchain/core js-tiktoken

For ES modules:

import {
// Classes
TextSplitter,
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
TokenTextSplitter,
MarkdownTextSplitter,
LatexTextSplitter,
// Interfaces and Types
TextSplitterParams,
TextSplitterChunkHeaderOptions,
CharacterTextSplitterParams,
RecursiveCharacterTextSplitterParams,
TokenTextSplitterParams,
MarkdownTextSplitterParams,
LatexTextSplitterParams,
SupportedTextSplitterLanguage,
SupportedTextSplitterLanguages
} from "@langchain/textsplitters";
// Required imports for tiktoken functionality
import type * as tiktoken from "js-tiktoken";
import { Document } from "@langchain/core/documents";

For CommonJS (note: the interface/type names listed below are compile-time TypeScript constructs, not runtime exports — destructuring them from require() yields undefined values):
const {
// Classes
TextSplitter,
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
TokenTextSplitter,
MarkdownTextSplitter,
LatexTextSplitter,
// Interfaces and Types
TextSplitterParams,
TextSplitterChunkHeaderOptions,
CharacterTextSplitterParams,
RecursiveCharacterTextSplitterParams,
TokenTextSplitterParams,
MarkdownTextSplitterParams,
LatexTextSplitterParams,
SupportedTextSplitterLanguage,
SupportedTextSplitterLanguages
} = require("@langchain/textsplitters");
// Required for document processing
const { Document } = require("@langchain/core/documents");

Basic usage example:

import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
import { Document } from "@langchain/core/documents";
// Create a text splitter with custom configuration
const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 1000, // target max size per chunk (units determined by lengthFunction; characters by default — confirm)
chunkOverlap: 200, // amount of content shared between consecutive chunks
});
// Split text into chunks — resolves to a string[] of chunk contents
const text = "Your long text content here...";
const chunks = await splitter.splitText(text);
// Create Document objects from raw texts; metadatas pairs positionally with texts
const docs = await splitter.createDocuments(
[text],
[{ source: "example.txt" }]
);
// Split existing documents — each input Document yields one or more chunk Documents
const existingDocs = [
new Document({ pageContent: text, metadata: { source: "doc1" } })
];
const splitDocs = await splitter.splitDocuments(existingDocs);

LangChain Text Splitters is built around several key components. The abstract TextSplitter base class provides the core splitting functionality and the document transformation interface.

CharacterTextSplitter offers core text splitting using a simple character-based separator. It is ideal for basic document chunking with predictable separator patterns.
/**
 * Splits text on a single fixed separator string (see
 * CharacterTextSplitterParams.separator), then merges the pieces into
 * chunks according to the inherited chunkSize/chunkOverlap settings.
 */
class CharacterTextSplitter extends TextSplitter {
constructor(fields?: Partial<CharacterTextSplitterParams>);
/** Split raw text into chunk strings. */
splitText(text: string): Promise<string[]>;
}
interface CharacterTextSplitterParams extends TextSplitterParams {
separator: string;
}

RecursiveCharacterTextSplitter performs advanced recursive splitting using a hierarchy of separators. It is well suited to intelligent document chunking that preserves semantic structure, and it supports code-aware splitting via language presets.
/**
 * Splits text recursively using an ordered hierarchy of separators
 * (see RecursiveCharacterTextSplitterParams.separators), falling through
 * to finer-grained separators as needed to respect chunkSize.
 */
class RecursiveCharacterTextSplitter extends TextSplitter {
constructor(fields?: Partial<RecursiveCharacterTextSplitterParams>);
/** Split raw text into chunk strings. */
splitText(text: string): Promise<string[]>;
/**
 * Build a splitter preconfigured with separators appropriate for the
 * given language (one of SupportedTextSplitterLanguages).
 */
static fromLanguage(
language: SupportedTextSplitterLanguage,
options?: Partial<RecursiveCharacterTextSplitterParams>
): RecursiveCharacterTextSplitter;
/** Return the separator hierarchy used for the given language preset. */
static getSeparatorsForLanguage(language: SupportedTextSplitterLanguage): string[];
}
/** Configuration for RecursiveCharacterTextSplitter. */
interface RecursiveCharacterTextSplitterParams extends TextSplitterParams {
// Ordered list of separators tried from coarsest to finest — confirm ordering against library docs.
separators: string[];
}
// Language presets accepted by RecursiveCharacterTextSplitter.fromLanguage().
// 'as const' preserves the literal element types so the union type below
// can be derived from this array.
const SupportedTextSplitterLanguages = [
"cpp", "go", "java", "js", "php", "proto", "python", "rst",
"ruby", "rust", "scala", "swift", "markdown", "latex", "html", "sol"
] as const;
type SupportedTextSplitterLanguage = (typeof SupportedTextSplitterLanguages)[number];

TokenTextSplitter provides token-aware splitting using tiktoken encodings for accurate token-count management. It is essential for applications that need precise token-based chunking for language models.
/**
 * Splits text by token count using a js-tiktoken encoding (see
 * TokenTextSplitterParams.encodingName), so chunk sizes align with
 * language-model token limits rather than character counts.
 */
class TokenTextSplitter extends TextSplitter {
constructor(fields?: Partial<TokenTextSplitterParams>);
/** Split raw text into chunk strings measured in tokens. */
splitText(text: string): Promise<string[]>;
}
/** Configuration for TokenTextSplitter. */
interface TokenTextSplitterParams extends TextSplitterParams {
// Name of the tiktoken encoding to use (e.g. "cl100k_base").
encodingName: tiktoken.TiktokenEncoding;
// Special tokens permitted in the input; "all" allows every special token.
allowedSpecial: "all" | Array<string>;
// Special tokens that raise an error if encountered; "all" forbids every special token — confirm exact behavior against js-tiktoken docs.
disallowedSpecial: "all" | Array<string>;
}
// Tiktoken encoding types from js-tiktoken
namespace tiktoken {
type TiktokenEncoding = "gpt2" | "r50k_base" | "p50k_base" | "cl100k_base";
interface Tiktoken {
encode(text: string, allowedSpecial?: "all" | Array<string>, disallowedSpecial?: "all" | Array<string>): number[];
decode(tokens: number[]): string;
}
}

MarkdownTextSplitter and LatexTextSplitter are specialized splitters optimized for specific document formats. They are designed to preserve document structure and formatting semantics.
/**
 * RecursiveCharacterTextSplitter preconfigured with Markdown-aware
 * separators so splits respect Markdown document structure.
 */
class MarkdownTextSplitter extends RecursiveCharacterTextSplitter {
constructor(fields?: Partial<MarkdownTextSplitterParams>);
}
/**
 * RecursiveCharacterTextSplitter preconfigured with LaTeX-aware
 * separators so splits respect LaTeX document structure.
 */
class LatexTextSplitter extends RecursiveCharacterTextSplitter {
constructor(fields?: Partial<LatexTextSplitterParams>);
}
type MarkdownTextSplitterParams = TextSplitterParams;
type LatexTextSplitterParams = TextSplitterParams;

Shared configuration and base-class reference:

interface TextSplitterParams {
chunkSize: number;
chunkOverlap: number;
keepSeparator: boolean;
lengthFunction?: ((text: string) => number) | ((text: string) => Promise<number>);
}
/**
 * Options for adding header text to chunks produced by transformDocuments,
 * createDocuments, and splitDocuments.
 * NOTE(review): the exact placement semantics (first chunk only vs. every
 * overlapping chunk) are not visible here — confirm against the library docs.
 */
type TextSplitterChunkHeaderOptions = {
chunkHeader?: string;
chunkOverlapHeader?: string;
appendChunkOverlapHeader?: boolean;
};
// Base class from @langchain/core/documents
/**
 * Contract for any transformer that maps an array of Documents to another
 * array of Documents; TextSplitter implements this via transformDocuments.
 */
abstract class BaseDocumentTransformer {
abstract transformDocuments(documents: Document[], ...args: any[]): Promise<Document[]>;
}
/**
 * Abstract base class for all text splitters. Subclasses implement
 * splitText(); this base class supplies the document-level helpers
 * (createDocuments, splitDocuments, transformDocuments) that wrap it.
 */
abstract class TextSplitter extends BaseDocumentTransformer implements TextSplitterParams {
// LangChain serialization namespace — exact value not visible here.
lc_namespace: string[];
// Target maximum size of each chunk, as measured by lengthFunction.
chunkSize: number;
// Amount of content shared between consecutive chunks.
chunkOverlap: number;
// Whether the separator is retained in the emitted splits.
keepSeparator: boolean;
// Measures chunk length; may be synchronous or asynchronous.
lengthFunction: ((text: string) => number) | ((text: string) => Promise<number>);
constructor(fields?: Partial<TextSplitterParams>);
/** Split raw text into chunk strings; implemented by each subclass. */
abstract splitText(text: string): Promise<string[]>;
/** BaseDocumentTransformer entry point: split each input Document into chunk Documents. */
transformDocuments(
documents: Document[],
chunkHeaderOptions?: TextSplitterChunkHeaderOptions
): Promise<Document[]>;
/** Build chunk Documents from raw texts; metadatas pairs positionally with texts. */
createDocuments(
texts: string[],
metadatas?: Record<string, any>[],
chunkHeaderOptions?: TextSplitterChunkHeaderOptions
): Promise<Document[]>;
/** Split existing Documents, carrying each source Document's metadata onto its chunks. */
splitDocuments(
documents: Document[],
chunkHeaderOptions?: TextSplitterChunkHeaderOptions
): Promise<Document[]>;
/** Split text on a single separator (used by subclasses). */
protected splitOnSeparator(text: string, separator: string): string[];
/** Merge small splits into chunks respecting chunkSize/chunkOverlap. */
mergeSplits(splits: string[], separator: string): Promise<string[]>;
private numberOfNewLines(text: string, start?: number, end?: number): number;
private joinDocs(docs: string[], separator: string): string | null;
}