LangChain text splitting utilities for breaking documents into manageable chunks for AI processing
—
Token-based splitting provides advanced text segmentation based on tokenization models. This approach ensures precise control over chunk sizes in terms of tokens rather than characters, which is crucial for language model applications that have token-based context limits.
Text splitting based on OpenAI's tiktoken tokenizer, supporting various encoding schemes and models.
class TokenTextSplitter(TextSplitter):
def __init__(
self,
encoding_name: str = "gpt2",
model_name: Optional[str] = None,
allowed_special: Union[Literal["all"], set[str]] = set(),
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
**kwargs: Any
) -> None: ...
def split_text(self, text: str) -> list[str]: ...

Parameters:
- encoding_name: Tiktoken encoding name (default: "gpt2")
- model_name: Optional OpenAI model name to determine encoding
- allowed_special: Special tokens allowed during encoding
- disallowed_special: Special tokens that raise errors during encoding
- **kwargs: Additional parameters passed to TextSplitter.__init__()

Usage:
from langchain_text_splitters import TokenTextSplitter
# Basic token splitting with GPT-2 encoding
splitter = TokenTextSplitter(
encoding_name="gpt2",
chunk_size=512, # 512 tokens per chunk
chunk_overlap=50
)
chunks = splitter.split_text("Long text to be tokenized and split...")
# Model-specific token splitting
gpt4_splitter = TokenTextSplitter(
model_name="gpt-4",
chunk_size=1000,
chunk_overlap=100
)
# Custom special token handling
custom_splitter = TokenTextSplitter(
encoding_name="cl100k_base", # GPT-3.5/GPT-4 encoding
allowed_special={"<|endoftext|>"},
disallowed_special="all",
chunk_size=800
)

Token splitting using sentence transformer models, optimized for embedding-based applications.
class SentenceTransformersTokenTextSplitter(TextSplitter):
def __init__(
self,
chunk_overlap: int = 50,
model_name: str = "sentence-transformers/all-mpnet-base-v2",
tokens_per_chunk: Optional[int] = None,
**kwargs: Any
) -> None: ...
def split_text(self, text: str) -> list[str]: ...
def count_tokens(self, text: str) -> int: ...

Parameters:
- chunk_overlap: Token overlap between chunks (default: 50)
- model_name: Sentence transformer model name (default: "sentence-transformers/all-mpnet-base-v2")
- tokens_per_chunk: Maximum tokens per chunk (overrides chunk_size)
- **kwargs: Additional parameters passed to TextSplitter.__init__()

Methods:
- count_tokens(): Count tokens in text using the model's tokenizer

Usage:
from langchain_text_splitters import SentenceTransformersTokenTextSplitter
# Basic sentence transformer splitting
splitter = SentenceTransformersTokenTextSplitter(
model_name="sentence-transformers/all-mpnet-base-v2",
chunk_overlap=50,
tokens_per_chunk=384 # Common embedding model context size
)
text = "Document to be split for embedding..."
chunks = splitter.split_text(text)
# Count tokens in text
token_count = splitter.count_tokens("Sample text to count")
# Different embedding models
distilbert_splitter = SentenceTransformersTokenTextSplitter(
model_name="sentence-transformers/distilbert-base-nli-mean-tokens",
tokens_per_chunk=512
)
roberta_splitter = SentenceTransformersTokenTextSplitter(
model_name="sentence-transformers/all-roberta-large-v1",
tokens_per_chunk=256
)

Convenient factory methods on the base TextSplitter class for creating token-based splitters.
class TextSplitter:
@classmethod
def from_huggingface_tokenizer(
cls,
tokenizer: Any,
**kwargs: Any
) -> "TextSplitter": ...
@classmethod
def from_tiktoken_encoder(
cls,
encoding_name: str = "gpt2",
model_name: Optional[str] = None,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
**kwargs: Any
) -> Self: ...

Factory Methods:
- from_huggingface_tokenizer(): Create splitter from a HuggingFace tokenizer
- from_tiktoken_encoder(): Create splitter from a tiktoken encoder

Usage:
from langchain_text_splitters import TextSplitter
from transformers import AutoTokenizer
# Create splitter from HuggingFace tokenizer
hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
hf_splitter = TextSplitter.from_huggingface_tokenizer(
tokenizer=hf_tokenizer,
chunk_size=512,
chunk_overlap=50
)
# Create splitter from tiktoken encoder
tiktoken_splitter = TextSplitter.from_tiktoken_encoder(
encoding_name="cl100k_base",
chunk_size=1000,
chunk_overlap=100
)

Low-level tokenizer configuration for advanced use cases.
@dataclass(frozen=True)
class Tokenizer:
chunk_overlap: int
tokens_per_chunk: int
decode: Callable[[list[int]], str]
encode: Callable[[str], list[int]]
def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> list[str]: ...

Usage:
from langchain_text_splitters import Tokenizer, split_text_on_tokens
import tiktoken
# Create custom tokenizer configuration
encoding = tiktoken.get_encoding("gpt2")
custom_tokenizer = Tokenizer(
chunk_overlap=50,
tokens_per_chunk=500,
decode=encoding.decode,
encode=encoding.encode
)
# Use tokenizer to split text
text = "Text to be split using custom tokenizer..."
chunks = split_text_on_tokens(text=text, tokenizer=custom_tokenizer)

Supported tiktoken encodings:
- gpt2: GPT-2 and smaller GPT-3 models
- r50k_base: text-davinci-002, text-davinci-003
- p50k_base: Code models, text-davinci-edit-001, text-similarity-*
- cl100k_base: GPT-3.5, GPT-4, text-embedding-ada-002

Common sentence transformer models:
- all-mpnet-base-v2: High-quality general-purpose embeddings
- all-MiniLM-L6-v2: Fast and efficient embeddings
- distilbert-base-nli-mean-tokens: Lightweight BERT-based embeddings
- all-roberta-large-v1: High-quality RoBERTa-based embeddings

Tip: use the count_tokens() method to verify chunk sizes.

Install with Tessl CLI
npx tessl i tessl/pypi-langchain-text-splitters