Tessl Tile for npm/@langchain/textsplitters@0.1.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

character-splitting.md format-splitting.md index.md recursive-splitting.md token-splitting.md

tile.json

docs/recursive-splitting.md

0
# Recursive Text Splitting
1

2
Advanced recursive splitting using a hierarchy of separators. Perfect for intelligent document chunking that preserves semantic structure and supports code-aware splitting for 16 programming and markup languages.
3

4
## Capabilities
5

6
### RecursiveCharacterTextSplitter Class
7

8
Recursively splits text using a hierarchy of separators, trying larger structural separators first and falling back to smaller ones only when a chunk is still too large.
9

10
```typescript { .api }
11
/**
12
 * Text splitter that recursively tries different separators to preserve semantic structure
13
 */
14
class RecursiveCharacterTextSplitter extends TextSplitter implements RecursiveCharacterTextSplitterParams {
15
  separators: string[];
16
  
17
  constructor(fields?: Partial<RecursiveCharacterTextSplitterParams>);
18
  splitText(text: string): Promise<string[]>;
19
  static lc_name(): string;
20
  static fromLanguage(
21
    language: SupportedTextSplitterLanguage,
22
    options?: Partial<RecursiveCharacterTextSplitterParams>
23
  ): RecursiveCharacterTextSplitter;
24
  static getSeparatorsForLanguage(language: SupportedTextSplitterLanguage): string[];
25
}
26

27
interface RecursiveCharacterTextSplitterParams extends TextSplitterParams {
28
  /** Array of separators to try in order of preference (default: ["\n\n", "\n", " ", ""]) */
29
  separators: string[];
30
}
31
```
32

33
**Usage Examples:**
34

35
```typescript
36
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
37

38
// Basic recursive splitting with default separators
39
const splitter = new RecursiveCharacterTextSplitter({
40
  chunkSize: 100,
41
  chunkOverlap: 20,
42
});
43

44
const text = `# Title
45

46
This is a paragraph.
47

48
This is another paragraph.
49

50
Final paragraph here.`;
51

52
const chunks = await splitter.splitText(text);
53
// Tries paragraph breaks (\n\n) first, then line breaks (\n), then spaces, then characters
54

55
// Custom separator hierarchy
56
const customSplitter = new RecursiveCharacterTextSplitter({
57
  separators: ["\n### ", "\n## ", "\n# ", "\n\n", "\n", " ", ""],
58
  chunkSize: 200,
59
  chunkOverlap: 50,
60
});
61

62
const markdown = `# Main Title
63
Content under main title.
64

65
## Section One
66
Content in section one.
67

68
### Subsection
69
Content in subsection.`;
70

71
const markdownChunks = await customSplitter.splitText(markdown);
72
```
73

74
### Language-Specific Splitting
75

76
Create language-aware splitters that understand code structure and preserve semantic boundaries.
77

78
```typescript { .api }
79
/**
80
 * Create a recursive splitter optimized for a specific programming language
81
 * @param language - The programming language to optimize for
82
 * @param options - Additional configuration options
83
 * @returns Configured RecursiveCharacterTextSplitter
84
 */
85
static fromLanguage(
86
  language: SupportedTextSplitterLanguage,
87
  options?: Partial<RecursiveCharacterTextSplitterParams>
88
): RecursiveCharacterTextSplitter;
89

90
/**
91
 * Get the separator hierarchy for a specific programming language
92
 * @param language - The programming language
93
 * @returns Array of separators in order of preference
94
 */
95
static getSeparatorsForLanguage(language: SupportedTextSplitterLanguage): string[];
96

97
const SupportedTextSplitterLanguages = [
98
  "cpp", "go", "java", "js", "php", "proto", "python", "rst",
99
  "ruby", "rust", "scala", "swift", "markdown", "latex", "html", "sol"
100
] as const;
101

102
type SupportedTextSplitterLanguage = (typeof SupportedTextSplitterLanguages)[number];
103
```
104

105
**Language-Specific Examples:**
106

107
```typescript
108
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
109

110
// Python code splitting
111
const pythonSplitter = RecursiveCharacterTextSplitter.fromLanguage("python", {
112
  chunkSize: 500,
113
  chunkOverlap: 50,
114
});
115

116
const pythonCode = `class DataProcessor:
117
    def __init__(self, config):
118
        self.config = config
119
    
120
    def process(self, data):
121
        return self.transform(data)
122
    
123
    def transform(self, data):
124
        # Transform logic here
125
        return processed_data`;
126

127
const pythonChunks = await pythonSplitter.splitText(pythonCode);
128
// Preserves class and function boundaries
129

130
// JavaScript code splitting
131
const jsSplitter = RecursiveCharacterTextSplitter.fromLanguage("js", {
132
  chunkSize: 300,
133
  chunkOverlap: 30,
134
});
135

136
const jsCode = `function processData(input) {
137
  const transformed = input.map(item => {
138
    return {
139
      ...item,
140
      processed: true
141
    };
142
  });
143
  
144
  return transformed.filter(item => item.valid);
145
}
146

147
const config = {
148
  timeout: 5000,
149
  retries: 3
150
};`;
151

152
const jsChunks = await jsSplitter.splitText(jsCode);
153

154
// Get separators for any supported language
155
const rustSeparators = RecursiveCharacterTextSplitter.getSeparatorsForLanguage("rust");
156
console.log(rustSeparators);
157
// ["\nfn ", "\nconst ", "\nlet ", "\nif ", "\nwhile ", "\nfor ", "\nloop ", "\nmatch ", "\nconst ", "\n\n", "\n", " ", ""]
158
```
159

160
### Supported Languages
161

162
The recursive text splitter supports intelligent splitting for these languages:
163

164
```typescript { .api }
165
const SupportedTextSplitterLanguages = [
166
  "cpp",        // C++
167
  "go",         // Go
168
  "java",       // Java
169
  "js",         // JavaScript/TypeScript
170
  "php",        // PHP
171
  "proto",      // Protocol Buffers
172
  "python",     // Python
173
  "rst",        // reStructuredText
174
  "ruby",       // Ruby
175
  "rust",       // Rust
176
  "scala",      // Scala
177
  "swift",      // Swift
178
  "markdown",   // Markdown
179
  "latex",      // LaTeX
180
  "html",       // HTML
181
  "sol"         // Solidity
182
] as const;
183
```
184

185
Each language has optimized separators that prioritize structural elements:
186

187
**Python separators**: `["\nclass ", "\ndef ", "\n\tdef ", "\n\n", "\n", " ", ""]`
188
**JavaScript separators**: `["\nfunction ", "\nconst ", "\nlet ", "\nvar ", "\nclass ", "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ", "\ndefault ", "\n\n", "\n", " ", ""]`
189
**Java separators**: `["\nclass ", "\npublic ", "\nprotected ", "\nprivate ", "\nstatic ", "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ", "\n\n", "\n", " ", ""]`
190

191
### Advanced Configuration
192

193
```typescript { .api }
194
interface RecursiveCharacterTextSplitterParams extends TextSplitterParams {
195
  /** Array of separators to try in order of preference */
196
  separators: string[];
197
}
198
```
199

200
**Advanced Usage Examples:**
201

202
```typescript
203
// HTML-aware splitting with custom configuration
204
const htmlSplitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
205
  chunkSize: 1000,
206
  chunkOverlap: 100,
207
  keepSeparator: true, // Keep HTML tags
208
});
209

210
const htmlContent = `<html>
211
<head>
212
  <title>Example Page</title>
213
</head>
214
<body>
215
  <div class="content">
216
    <h1>Main Heading</h1>
217
    <p>First paragraph content.</p>
218
    <p>Second paragraph content.</p>
219
  </div>
220
</body>
221
</html>`;
222

223
const htmlChunks = await htmlSplitter.splitText(htmlContent);
224

225
// Markdown with custom separators prioritizing headings
226
const markdownSplitter = new RecursiveCharacterTextSplitter({
227
  separators: [
228
    "\n# ",      // H1 headings
229
    "\n## ",     // H2 headings  
230
    "\n### ",    // H3 headings
231
    "\n\n",      // Paragraph breaks
232
    "\n",        // Line breaks
233
    " ",         // Spaces
234
    ""           // Characters
235
  ],
236
  chunkSize: 500,
237
  chunkOverlap: 50,
238
  keepSeparator: true
239
});
240

241
// Document splitting with overlap headers
242
const documents = await markdownSplitter.createDocuments(
243
  [markdownContent],
244
  [{ source: "guide.md" }],
245
  {
246
    chunkHeader: "--- Document Chunk ---\n",
247
    chunkOverlapHeader: "(Continued) ",
248
    appendChunkOverlapHeader: true
249
  }
250
);
251
```
252

253
### Integration with Document Processing
254

255
Recursive text splitters work seamlessly with LangChain's document processing pipeline:
256

257
```typescript
258
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
259
import { Document } from "@langchain/core/documents";
260

261
// Transform existing documents
262
const splitter = RecursiveCharacterTextSplitter.fromLanguage("python");
263

264
const docs = [
265
  new Document({
266
    pageContent: pythonCode,
267
    metadata: { filename: "processor.py", author: "dev" }
268
  })
269
];
270

271
// Use as document transformer
272
const splitDocs = await splitter.transformDocuments(docs);
273

274
// All documents maintain original metadata plus line location information
275
splitDocs.forEach(doc => {
276
  console.log(doc.metadata); 
277
  // { filename: "processor.py", author: "dev", loc: { lines: { from: 1, to: 5 } } }
278
});
279
```

Version

Tile

Files

docs/recursive-splitting.md

docs/recursive-splitting.md