LangChain text splitting utilities for breaking documents into manageable chunks for AI processing
—
Code-aware text splitting provides specialized text segmentation that understands programming language syntax and structure. These splitters are designed to maintain code integrity by respecting logical boundaries such as function definitions, class declarations, and block structures.
Specialized splitting for Python source code that respects Python syntax and structure.
class PythonCodeTextSplitter(RecursiveCharacterTextSplitter):
def __init__(self, **kwargs: Any) -> None: ...

Usage:
from langchain_text_splitters import PythonCodeTextSplitter
python_splitter = PythonCodeTextSplitter(
chunk_size=2000,
chunk_overlap=200
)
python_code = """
import os
import sys
def calculate_sum(a, b):
'''Calculate the sum of two numbers.'''
return a + b
class Calculator:
def __init__(self):
self.history = []
def add(self, x, y):
result = x + y
self.history.append(f"{x} + {y} = {result}")
return result
def get_history(self):
return self.history
if __name__ == "__main__":
calc = Calculator()
print(calc.add(5, 3))
"""
chunks = python_splitter.split_text(python_code)

The Python splitter uses separators optimized for Python syntax:
- Class definitions (`class `)
- Function definitions (`def `, `async def `)
- Control-flow statements (`if `, `for `, `while `, `try `, `with `)
- Whitespace fallbacks (`\n\n`, `\n`, a single space, and the empty string)

Specialized splitting for React/JSX, Vue, and Svelte code that understands component boundaries and framework-specific syntax.
class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
def __init__(
self,
separators: Optional[list[str]] = None,
chunk_size: int = 2000,
chunk_overlap: int = 0,
**kwargs: Any
) -> None: ...
def split_text(self, text: str) -> list[str]: ...

Parameters:
- separators: Custom separator list (default: framework-optimized separators)
- chunk_size: Maximum chunk size (default: 2000)
- chunk_overlap: Overlap between chunks (default: 0)

Usage:
from langchain_text_splitters import JSFrameworkTextSplitter
jsx_splitter = JSFrameworkTextSplitter(
chunk_size=1500,
chunk_overlap=100
)
react_code = """
import React, { useState, useEffect } from 'react';
const UserProfile = ({ userId }) => {
const [user, setUser] = useState(null);
const [loading, setLoading] = useState(true);
useEffect(() => {
fetchUser(userId)
.then(userData => {
setUser(userData);
setLoading(false);
})
.catch(error => {
console.error('Error fetching user:', error);
setLoading(false);
});
}, [userId]);
if (loading) {
return <LoadingSpinner />;
}
return (
<div className="user-profile">
<h1>{user.name}</h1>
<p>{user.email}</p>
</div>
);
};
export default UserProfile;
"""
chunks = jsx_splitter.split_text(react_code)

The JSX splitter recognizes, among other constructs:
- React hook calls (`useState`, `useEffect`, etc.)

Specialized splitting for LaTeX documents that respects LaTeX structure and formatting commands.
class LatexTextSplitter(RecursiveCharacterTextSplitter):
def __init__(self, **kwargs: Any) -> None: ...

Usage:
from langchain_text_splitters import LatexTextSplitter
latex_splitter = LatexTextSplitter(
chunk_size=1000,
chunk_overlap=100
)
latex_document = r"""
\documentclass{article}
\usepackage{amsmath}
\title{Mathematical Analysis}
\author{Author Name}
\date{\today}
\begin{document}
\maketitle
\section{Introduction}
This document presents a mathematical analysis of...
\subsection{Preliminaries}
Let us define the following concepts:
\begin{definition}
A function $f: \mathbb{R} \to \mathbb{R}$ is continuous at point $a$ if...
\end{definition}
\begin{theorem}
If $f$ is continuous on $[a, b]$ and differentiable on $(a, b)$, then...
\end{theorem}
\section{Main Results}
The main theorem can be stated as follows:
\begin{align}
\int_a^b f(x) dx &= F(b) - F(a) \\
&= \lim_{n \to \infty} \sum_{i=1}^n f(x_i) \Delta x
\end{align}
\end{document}
"""
chunks = latex_splitter.split_text(latex_document)

The LaTeX splitter uses separators that respect:
- Sectioning commands (`\section`, `\subsection`, `\chapter`)
- Environment delimiters (`\begin{}`, `\end{}`)

For other programming languages, use the RecursiveCharacterTextSplitter.from_language() method with the appropriate Language enum value.
# Available through RecursiveCharacterTextSplitter
@classmethod
def from_language(
cls,
language: Language,
**kwargs: Any
) -> "RecursiveCharacterTextSplitter": ...
@staticmethod
def get_separators_for_language(language: Language) -> list[str]: ...

Usage:
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
# Java code splitting
java_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.JAVA,
chunk_size=2000,
chunk_overlap=200
)
java_code = """
public class Calculator {
private double result;
public Calculator() {
this.result = 0.0;
}
public double add(double a, double b) {
result = a + b;
return result;
}
public static void main(String[] args) {
Calculator calc = new Calculator();
System.out.println(calc.add(5.0, 3.0));
}
}
"""
java_chunks = java_splitter.split_text(java_code)
# C++ code splitting
cpp_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.CPP,
chunk_size=1500
)
# Get separators for inspection
cpp_separators = RecursiveCharacterTextSplitter.get_separators_for_language(Language.CPP)

The Language enum provides optimized separators for each language it defines (for example, Language.JAVA and Language.CPP, as used above).
Install with Tessl CLI
npx tessl i tessl/pypi-langchain-text-splitters