Various implementations of LangChain.js text splitters for retrieval-augmented generation (RAG) pipelines
npx @tessl/cli install tessl/npm-langchain--textsplitters@0.1.0

LangChain Text Splitters provides various implementations of text splitting functionality for LangChain.js, most commonly used as part of retrieval-augmented generation (RAG) pipelines. The library offers abstract base classes and concrete implementations for splitting text documents into smaller chunks with configurable size, overlap, and length functions.
npm install @langchain/textsplitters @langchain/core js-tiktoken

import {
// Classes
TextSplitter,
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
TokenTextSplitter,
MarkdownTextSplitter,
LatexTextSplitter,
// Interfaces and Types
TextSplitterParams,
TextSplitterChunkHeaderOptions,
CharacterTextSplitterParams,
RecursiveCharacterTextSplitterParams,
TokenTextSplitterParams,
MarkdownTextSplitterParams,
LatexTextSplitterParams,
SupportedTextSplitterLanguage,
SupportedTextSplitterLanguages
} from "@langchain/textsplitters";
// Required imports for tiktoken functionality
import type * as tiktoken from "js-tiktoken";
import { Document } from "@langchain/core/documents";

For CommonJS:
const {
// Classes
TextSplitter,
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
TokenTextSplitter,
MarkdownTextSplitter,
LatexTextSplitter,
// Interfaces and Types
TextSplitterParams,
TextSplitterChunkHeaderOptions,
CharacterTextSplitterParams,
RecursiveCharacterTextSplitterParams,
TokenTextSplitterParams,
MarkdownTextSplitterParams,
LatexTextSplitterParams,
SupportedTextSplitterLanguage,
SupportedTextSplitterLanguages
} = require("@langchain/textsplitters");
// Required for document processing
const { Document } = require("@langchain/core/documents");

import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
import { Document } from "@langchain/core/documents";
// Create a text splitter with custom configuration
const splitter = new RecursiveCharacterTextSplitter({
  chunkSize: 1000,   // target maximum size of each chunk
  chunkOverlap: 200, // amount of content shared between adjacent chunks
});

// Split raw text into an array of string chunks
const text = "Your long text content here...";
const chunks = await splitter.splitText(text);

// Create Document objects from raw texts, pairing each text with its metadata
const docs = await splitter.createDocuments(
  [text],
  [{ source: "example.txt" }]
);

// Split existing Document objects (metadata is carried onto the output chunks)
const existingDocs = [
  new Document({ pageContent: text, metadata: { source: "doc1" } })
];
const splitDocs = await splitter.splitDocuments(existingDocs);

LangChain Text Splitters is built around several key components:
TextSplitter provides core splitting functionality and the document transformation interface.

Core text splitting functionality using simple character-based separators. Ideal for basic document chunking with predictable separator patterns.
/**
 * Splits text on a single, fixed separator string
 * (configured via CharacterTextSplitterParams.separator).
 */
class CharacterTextSplitter extends TextSplitter {
  constructor(fields?: Partial<CharacterTextSplitterParams>);
  /** Splits the input text into chunks using the configured separator. */
  splitText(text: string): Promise<string[]>;
}
interface CharacterTextSplitterParams extends TextSplitterParams {
separator: string;
}

Advanced recursive splitting using a hierarchy of separators. Perfect for intelligent document chunking that preserves semantic structure and supports code-aware splitting.
/**
 * Splitter that works through a prioritized list of separators
 * (RecursiveCharacterTextSplitterParams.separators), producing chunks that
 * respect the configured size limits. Language-specific separator presets
 * are available via the static fromLanguage() factory.
 */
class RecursiveCharacterTextSplitter extends TextSplitter {
  constructor(fields?: Partial<RecursiveCharacterTextSplitterParams>);
  /** Splits the input text into chunks using the separator hierarchy. */
  splitText(text: string): Promise<string[]>;
  /** Builds a splitter preconfigured with separators for the given language. */
  static fromLanguage(
    language: SupportedTextSplitterLanguage,
    options?: Partial<RecursiveCharacterTextSplitterParams>
  ): RecursiveCharacterTextSplitter;
  /** Returns the separator list used by the preset for the given language. */
  static getSeparatorsForLanguage(language: SupportedTextSplitterLanguage): string[];
}
/** Configuration for RecursiveCharacterTextSplitter. */
interface RecursiveCharacterTextSplitterParams extends TextSplitterParams {
  /** Separators tried in priority order when splitting text. */
  separators: string[];
}
/** Language presets accepted by RecursiveCharacterTextSplitter.fromLanguage(). */
const SupportedTextSplitterLanguages = [
  "cpp", "go", "java", "js", "php", "proto", "python", "rst",
  "ruby", "rust", "scala", "swift", "markdown", "latex", "html", "sol"
] as const;
type SupportedTextSplitterLanguage = (typeof SupportedTextSplitterLanguages)[number];

Token-aware splitting using tiktoken encoding for accurate token count management. Essential for applications that need precise token-based chunking for language models.
/**
 * Splits text by token count using a js-tiktoken encoding, rather than by
 * raw character counts.
 */
class TokenTextSplitter extends TextSplitter {
  constructor(fields?: Partial<TokenTextSplitterParams>);
  /** Splits the input text into chunks measured in tokens. */
  splitText(text: string): Promise<string[]>;
}
/** Configuration for TokenTextSplitter. */
interface TokenTextSplitterParams extends TextSplitterParams {
  /** Name of the tiktoken encoding to use (e.g. "cl100k_base"). */
  encodingName: tiktoken.TiktokenEncoding;
  /** Special tokens permitted during encoding: "all" or an explicit list. */
  allowedSpecial: "all" | Array<string>;
  // Special tokens treated as disallowed during encoding: "all" or an explicit
  // list — see js-tiktoken's encode() for the exact handling.
  disallowedSpecial: "all" | Array<string>;
}
// Tiktoken encoding types from js-tiktoken
namespace tiktoken {
type TiktokenEncoding = "gpt2" | "r50k_base" | "p50k_base" | "cl100k_base";
interface Tiktoken {
encode(text: string, allowedSpecial?: "all" | Array<string>, disallowedSpecial?: "all" | Array<string>): number[];
decode(tokens: number[]): string;
}
}

Specialized splitters optimized for specific document formats like Markdown and LaTeX. Designed to preserve document structure and formatting semantics.
/** RecursiveCharacterTextSplitter specialized for Markdown documents. */
class MarkdownTextSplitter extends RecursiveCharacterTextSplitter {
  constructor(fields?: Partial<MarkdownTextSplitterParams>);
}
/** RecursiveCharacterTextSplitter specialized for LaTeX documents. */
class LatexTextSplitter extends RecursiveCharacterTextSplitter {
  constructor(fields?: Partial<LatexTextSplitterParams>);
}
/** MarkdownTextSplitter takes no extra options beyond the base TextSplitterParams. */
type MarkdownTextSplitterParams = TextSplitterParams;
type LatexTextSplitterParams = TextSplitterParams;

interface TextSplitterParams {
chunkSize: number;
chunkOverlap: number;
keepSeparator: boolean;
lengthFunction?: ((text: string) => number) | ((text: string) => Promise<number>);
}
// Options controlling header text attached to chunks produced by
// createDocuments()/splitDocuments(). Semantics inferred from names —
// NOTE(review): confirm against the library's runtime behavior.
type TextSplitterChunkHeaderOptions = {
  /** Header text prepended to each emitted chunk. */
  chunkHeader?: string;
  /** Header text marking chunks that start inside overlapped content. */
  chunkOverlapHeader?: string;
  /** Whether to apply chunkOverlapHeader to overlapping chunks. */
  appendChunkOverlapHeader?: boolean;
};
// Base class from @langchain/core/documents
/** Minimal view of the core document-transformer contract that splitters implement. */
abstract class BaseDocumentTransformer {
  /** Transforms a list of documents into a (possibly longer) list of documents. */
  abstract transformDocuments(documents: Document[], ...args: any[]): Promise<Document[]>;
}
/**
 * Abstract base class for all text splitters. Implements the
 * BaseDocumentTransformer contract so splitters can be used anywhere a
 * document transformer is expected.
 */
abstract class TextSplitter extends BaseDocumentTransformer implements TextSplitterParams {
  // LangChain serialization namespace identifier.
  lc_namespace: string[];
  // Target maximum size of each chunk, as measured by lengthFunction.
  chunkSize: number;
  // Amount of content shared between adjacent chunks.
  chunkOverlap: number;
  // Whether the separator is retained in the emitted chunks.
  keepSeparator: boolean;
  // Measures text length for chunk sizing; may be synchronous or async.
  lengthFunction: ((text: string) => number) | ((text: string) => Promise<number>);
  constructor(fields?: Partial<TextSplitterParams>);
  /** Splits raw text into chunks. Implemented by each concrete splitter. */
  abstract splitText(text: string): Promise<string[]>;
  /** Splits each input document; optional headers are controlled by chunkHeaderOptions. */
  transformDocuments(
    documents: Document[],
    chunkHeaderOptions?: TextSplitterChunkHeaderOptions
  ): Promise<Document[]>;
  /** Builds Document chunks from raw texts, pairing each text with its metadata entry. */
  createDocuments(
    texts: string[],
    metadatas?: Record<string, any>[],
    chunkHeaderOptions?: TextSplitterChunkHeaderOptions
  ): Promise<Document[]>;
  /** Splits existing documents, preserving their metadata on the output chunks. */
  splitDocuments(
    documents: Document[],
    chunkHeaderOptions?: TextSplitterChunkHeaderOptions
  ): Promise<Document[]>;
  /** Splits text on the given separator (subclass helper). */
  protected splitOnSeparator(text: string, separator: string): string[];
  /** Merges small splits back together into chunks within the size limits. */
  mergeSplits(splits: string[], separator: string): Promise<string[]>;
  // Internal helpers used during merge/bookkeeping.
  private numberOfNewLines(text: string, start?: number, end?: number): number;
  private joinDocs(docs: string[], separator: string): string | null;
}