LangChain text splitting utilities for breaking documents into manageable chunks for AI processing
```
npx @tessl/cli install tessl/pypi-langchain-text-splitters@0.3.0
```

LangChain Text Splitters provides comprehensive text splitting utilities for breaking down various types of documents into manageable chunks for processing by language models and other AI systems. The library offers specialized splitters for different content types and maintains document structure and context through intelligent chunking strategies.
```
pip install langchain-text-splitters
```

```python
from langchain_text_splitters import (
    TextSplitter,
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter
)
```

For specific splitter types:
```python
from langchain_text_splitters import (
    # HTML splitters
    HTMLHeaderTextSplitter,
    HTMLSectionSplitter,
    HTMLSemanticPreservingSplitter,
    # Markdown splitters
    MarkdownHeaderTextSplitter,
    MarkdownTextSplitter,
    ExperimentalMarkdownSyntaxTextSplitter,
    # Other specialized splitters
    RecursiveJsonSplitter,
    PythonCodeTextSplitter,
    NLTKTextSplitter,
    SpacyTextSplitter
)
```

For type definitions:
```python
from langchain_text_splitters import (
    ElementType,
    HeaderType,
    LineType,
    Language
)
```

Basic usage:

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Create a text splitter with custom configuration
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
# Split text into chunks
text = "Your long document text here..."
chunks = text_splitter.split_text(text)
# Create Document objects with metadata
from langchain_core.documents import Document
documents = text_splitter.create_documents([text], [{"source": "example.txt"}])
# Split existing Document objects
existing_docs = [Document(page_content="Text content", metadata={"page": 1})]
split_docs = text_splitter.split_documents(existing_docs)
```

The package follows a well-defined inheritance hierarchy.
Key design patterns:
- TextSplitter abstract base class
- from_* methods for convenient initialization
- Language enum
- Document class for metadata preservation
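As a sketch of how these patterns fit together, the snippet below defines a hypothetical ParagraphSplitter that subclasses TextSplitter and implements only split_text; chunk configuration and create_documents/split_documents are inherited from the base class.

```python
from langchain_text_splitters import TextSplitter


class ParagraphSplitter(TextSplitter):
    """Hypothetical splitter: one split per blank-line-separated paragraph."""

    def split_text(self, text: str) -> list[str]:
        return [p for p in text.split("\n\n") if p.strip()]


splitter = ParagraphSplitter(chunk_size=500, chunk_overlap=0)
docs = splitter.create_documents(["First paragraph.\n\nSecond paragraph."])
```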
Basic and advanced character-based text splitting strategies including simple separator-based splitting and recursive multi-separator splitting with language-specific support.

```python
class CharacterTextSplitter(TextSplitter):
    def __init__(self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs): ...
    def split_text(self, text: str) -> list[str]: ...

class RecursiveCharacterTextSplitter(TextSplitter):
    def __init__(self, separators: Optional[list[str]] = None, keep_separator: bool = True, is_separator_regex: bool = False, **kwargs): ...
    def split_text(self, text: str) -> list[str]: ...

    @classmethod
    def from_language(cls, language: Language, **kwargs) -> "RecursiveCharacterTextSplitter": ...

    @staticmethod
    def get_separators_for_language(language: Language) -> list[str]: ...
```
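For illustration, a minimal sketch contrasting the two strategies on the same text (the sample text and chunk sizes are arbitrary):

```python
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

text = "Section one.\n\nSection two has a little more detail.\n\nSection three."

# Simple splitting: break only on the single configured separator
simple = CharacterTextSplitter(separator="\n\n", chunk_size=60, chunk_overlap=0)
print(simple.split_text(text))

# Recursive splitting: fall back through "\n\n", "\n", " ", "" until chunks fit
recursive = RecursiveCharacterTextSplitter(chunk_size=60, chunk_overlap=10)
print(recursive.split_text(text))
```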
Advanced token-aware splitting using popular tokenizers including OpenAI's tiktoken, HuggingFace transformers, and sentence transformer models.

```python
class TokenTextSplitter(TextSplitter):
    def __init__(self, encoding_name: str = "gpt2", model_name: Optional[str] = None, allowed_special: Union[Literal["all"], set[str]] = set(), disallowed_special: Union[Literal["all"], Collection[str]] = "all", **kwargs): ...
    def split_text(self, text: str) -> list[str]: ...

class SentenceTransformersTokenTextSplitter(TextSplitter):
    def __init__(self, chunk_overlap: int = 50, model_name: str = "sentence-transformers/all-mpnet-base-v2", tokens_per_chunk: Optional[int] = None, **kwargs): ...
    def split_text(self, text: str) -> list[str]: ...
    def count_tokens(self, text: str) -> int: ...
```
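A brief sketch of token-based splitting; it assumes the optional tiktoken dependency is installed and picks the cl100k_base encoding rather than the default:

```python
from langchain_text_splitters import TokenTextSplitter

# Chunk size and overlap are measured in tokens, not characters
splitter = TokenTextSplitter(encoding_name="cl100k_base", chunk_size=100, chunk_overlap=10)
chunks = splitter.split_text("A long report that should be chunked by token count rather than character count...")
```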
Specialized splitters that understand and preserve document structure for HTML, Markdown, JSON, and LaTeX documents while maintaining semantic context.

```python
class HTMLHeaderTextSplitter:
    def __init__(self, headers_to_split_on: list[tuple[str, str]], return_each_element: bool = False): ...
    def split_text(self, text: str) -> list[Document]: ...
    def split_text_from_url(self, url: str, timeout: int = 10, **kwargs) -> list[Document]: ...

class HTMLSectionSplitter:
    def __init__(self, headers_to_split_on: list[tuple[str, str]], **kwargs: Any): ...
    def split_documents(self, documents: Iterable[Document]) -> list[Document]: ...
    def split_text(self, text: str) -> list[Document]: ...

class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
    def __init__(self, headers_to_split_on: list[tuple[str, str]], *, max_chunk_size: int = 1000, chunk_overlap: int = 0, **kwargs): ...
    def split_text(self, text: str) -> list[Document]: ...
    def transform_documents(self, documents: Sequence[Document], **kwargs: Any) -> list[Document]: ...

class MarkdownHeaderTextSplitter:
    def __init__(self, headers_to_split_on: list[tuple[str, str]], return_each_line: bool = False, strip_headers: bool = True, custom_header_patterns: Optional[dict[int, str]] = None): ...
    def split_text(self, text: str) -> list[Document]: ...

class MarkdownTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(self, **kwargs: Any) -> None: ...

class ExperimentalMarkdownSyntaxTextSplitter:
    def __init__(self, headers_to_split_on: Optional[list[tuple[str, str]]] = None, return_each_line: bool = False, strip_headers: bool = True): ...
    def split_text(self, text: str) -> list[Document]: ...

class RecursiveJsonSplitter:
    def __init__(self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None): ...
    def split_json(self, json_data: dict, convert_lists: bool = False) -> list[dict]: ...
    def split_text(self, json_data: dict, convert_lists: bool = False, ensure_ascii: bool = True) -> list[str]: ...
```
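As an illustrative sketch, splitting Markdown on its headers so that each chunk records the headers it falls under (the header labels are arbitrary):

```python
from langchain_text_splitters import MarkdownHeaderTextSplitter

markdown = "# Guide\n\nIntro.\n\n## Setup\n\nInstall steps.\n\n## Usage\n\nHow to run it."

splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")]
)
docs = splitter.split_text(markdown)
# Each Document's metadata holds its headers, e.g. {"Header 1": "Guide", "Header 2": "Setup"}
```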
Programming language-aware splitters that understand code syntax and structure for Python, JavaScript/TypeScript frameworks, and other programming languages.

```python
class PythonCodeTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(self, **kwargs): ...

class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(self, separators: Optional[list[str]] = None, chunk_size: int = 2000, chunk_overlap: int = 0, **kwargs): ...
    def split_text(self, text: str) -> list[str]: ...

class LatexTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(self, **kwargs): ...
```
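A sketch of code-aware splitting via RecursiveCharacterTextSplitter.from_language; the sample source and chunk size are arbitrary:

```python
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

python_source = '''
def greet(name):
    return f"Hello, {name}!"


class Greeter:
    def run(self):
        print(greet("world"))
'''

splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=120, chunk_overlap=0
)
chunks = splitter.split_text(python_source)  # prefers class/def boundaries as split points
```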
NLP-powered text splitters using NLTK, spaCy, and Konlpy for sentence-aware splitting with support for multiple languages including Korean.

```python
class NLTKTextSplitter(TextSplitter):
    def __init__(self, separator: str = "\n\n", language: str = "english", use_span_tokenize: bool = False, **kwargs): ...
    def split_text(self, text: str) -> list[str]: ...

class SpacyTextSplitter(TextSplitter):
    def __init__(self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", max_length: int = 1000000, strip_whitespace: bool = True, **kwargs): ...
    def split_text(self, text: str) -> list[str]: ...

class KonlpyTextSplitter(TextSplitter):
    def __init__(self, separator: str = "\n\n", **kwargs): ...
    def split_text(self, text: str) -> list[str]: ...
```
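A sentence-aware splitting sketch; it assumes the optional nltk package and its punkt sentence tokenizer data are installed:

```python
from langchain_text_splitters import NLTKTextSplitter

splitter = NLTKTextSplitter(chunk_size=200, chunk_overlap=0)
chunks = splitter.split_text(
    "The first sentence sets the scene. The second adds detail. The third wraps up."
)
```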
Core interfaces, enums, and utility functions that provide the foundation for all text splitting functionality.

```python
class TextSplitter(BaseDocumentTransformer, ABC):
    def __init__(self, chunk_size: int = 4000, chunk_overlap: int = 200, length_function: Callable[[str], int] = len, keep_separator: Union[bool, Literal["start", "end"]] = False, add_start_index: bool = False, strip_whitespace: bool = True): ...

    @abstractmethod
    def split_text(self, text: str) -> list[str]: ...

    def create_documents(self, texts: list[str], metadatas: Optional[list[dict[Any, Any]]] = None) -> list[Document]: ...
    def split_documents(self, documents: Iterable[Document]) -> list[Document]: ...

class Language(Enum):
    CPP = "cpp"
    GO = "go"
    JAVA = "java"
    KOTLIN = "kotlin"
    JS = "js"
    TS = "ts"
    PHP = "php"
    PROTO = "proto"
    PYTHON = "python"
    RST = "rst"
    RUBY = "ruby"
    RUST = "rust"
    SCALA = "scala"
    SWIFT = "swift"
    MARKDOWN = "markdown"
    LATEX = "latex"
    HTML = "html"
    SOL = "sol"
    CSHARP = "csharp"
    COBOL = "cobol"
    C = "c"
    LUA = "lua"
    PERL = "perl"
    HASKELL = "haskell"
    ELIXIR = "elixir"
    POWERSHELL = "powershell"
    VISUALBASIC6 = "visualbasic6"

def split_text_on_tokens(*, text: str, tokenizer: "Tokenizer") -> list[str]: ...
```
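To illustrate the base-class options, a small sketch using add_start_index so each chunk records its character offset in the source text (the metadata values here are made up):

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=0, add_start_index=True)
docs = splitter.create_documents(
    ["A reasonably long source text that will be cut into several chunks..."],
    metadatas=[{"source": "report.txt"}],
)
for doc in docs:
    print(doc.metadata)  # e.g. {"source": "report.txt", "start_index": 0}
```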