tessl install tessl/pypi-chonkie@1.5.0
The lightweight ingestion library for fast, efficient, and robust RAG pipelines.
Comprehensive collection of 11 text chunking strategies for various use cases, from simple token-based splitting to advanced semantic and LLM-guided chunking.
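For a quick start, a minimal sketch using the RecursiveChunker documented below (every chunker follows the same construct-then-call pattern):
from chonkie import RecursiveChunker
# Split a document into chunks of at most 512 tokens
chunker = RecursiveChunker(chunk_size=512)
chunks = chunker("Your long document text...")
for chunk in chunks:
    print(chunk.text)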
Abstract base class providing common functionality for all chunker implementations.
from abc import ABC, abstractmethod
from typing import Any, Literal, Optional, Sequence, Union
class BaseChunker(ABC):
"""
Base class for all chunker implementations.
Args:
tokenizer: Tokenizer instance or string identifier (e.g., 'gpt2', 'character')
"""
def __init__(self, tokenizer: Union[str, TokenizerProtocol] = "gpt2"): ...
@abstractmethod
def chunk(self, text: str) -> list[Chunk]:
"""
Chunks a single text string.
Args:
text: Input text to chunk
Returns:
List of Chunk objects
"""
...
def chunk_batch(
self,
texts: Sequence[str],
show_progress: bool = True
) -> list[list[Chunk]]:
"""
Chunks multiple texts with optional progress bar.
Args:
texts: Sequence of input texts
show_progress: Whether to display progress bar
Returns:
List of chunk lists, one per input text
"""
...
def chunk_document(self, document: Document) -> Document:
"""
Chunks a document object and updates its chunks.
Args:
document: Document to chunk
Returns:
Document with populated chunks list
"""
...
def __call__(
self,
text: Union[str, Sequence[str]],
show_progress: bool = True
) -> Union[list[Chunk], list[list[Chunk]]]:
"""
Allows calling chunker as a function.
Args:
text: Single text string or sequence of texts
show_progress: Whether to display progress bar for batch processing
Returns:
Chunks for single text or list of chunk lists for batch
"""
...
@property
def tokenizer(self) -> TokenizerProtocol:
"""Returns the tokenizer instance."""
        ...
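Because these methods are inherited by every concrete chunker, the shared interface can be exercised through any of them; a short sketch using the TokenChunker documented next:
from chonkie import TokenChunker
chunker = TokenChunker(chunk_size=512)
# chunk(): split a single text
chunks = chunker.chunk("Some long text...")
# chunk_batch(): split several texts, one chunk list per input
batches = chunker.chunk_batch(["Text 1", "Text 2"], show_progress=False)
# __call__(): dispatches to chunk() or chunk_batch() based on the input type
chunks_again = chunker("Some long text...")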
Splits text into fixed-size token chunks with optional overlap, the simplest and fastest chunking strategy.
class TokenChunker(BaseChunker):
"""
Splits text into chunks of a specified token size with optional overlap.
Args:
tokenizer: Tokenizer instance or identifier (default: 'character')
chunk_size: Maximum number of tokens per chunk (default: 2048)
chunk_overlap: Number of tokens to overlap between chunks, or fraction if < 1.0 (default: 0)
"""
def __init__(
self,
tokenizer: Union[str, TokenizerProtocol] = "character",
chunk_size: int = 2048,
chunk_overlap: Union[int, float] = 0
): ...
def chunk(self, text: str) -> list[Chunk]:
"""
Chunks text by token count.
Args:
text: Input text to chunk
Returns:
List of Chunk objects with fixed token sizes
"""
        ...
Usage example:
from chonkie import TokenChunker
# Create chunker with GPT-2 tokenizer
chunker = TokenChunker(tokenizer="gpt2", chunk_size=512, chunk_overlap=50)
# Chunk text
chunks = chunker("Your long text here...")
# Process multiple texts
texts = ["Text 1", "Text 2", "Text 3"]
batch_chunks = chunker(texts)
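As noted in the docstring above, chunk_overlap also accepts a fraction below 1.0; continuing the example, a sketch of the fractional form:
# A chunk_overlap below 1.0 is read as a fraction of chunk_size,
# so 0.1 with chunk_size=512 should give roughly 51 overlapping tokens
overlap_chunker = TokenChunker(tokenizer="gpt2", chunk_size=512, chunk_overlap=0.1)
overlap_chunks = overlap_chunker("Your long text here...")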
Splits text based on sentence boundaries while respecting token limits, preserving semantic units.
class SentenceChunker(BaseChunker):
"""
Splits text into chunks based on sentence boundaries while respecting token limits.
Args:
tokenizer: Tokenizer instance or identifier (default: 'character')
chunk_size: Maximum number of tokens per chunk (default: 2048)
chunk_overlap: Number of sentences to overlap between chunks (default: 0)
min_sentences_per_chunk: Minimum sentences required per chunk (default: 1)
min_characters_per_sentence: Minimum characters for valid sentence (default: 12)
approximate: If True, use faster approximate sentence detection (default: False)
delim: Sentence delimiter(s) (default: ['. ', '! ', '? ', '\\n'])
include_delim: Include delimiter with 'prev' or 'next' chunk (default: 'prev')
"""
def __init__(
self,
tokenizer: Union[str, TokenizerProtocol] = "character",
chunk_size: int = 2048,
chunk_overlap: int = 0,
min_sentences_per_chunk: int = 1,
min_characters_per_sentence: int = 12,
approximate: bool = False,
delim: Union[str, list[str]] = [". ", "! ", "? ", "\n"],
include_delim: Optional[Literal["prev", "next"]] = "prev"
): ...
def chunk(self, text: str) -> list[Chunk]:
"""
Chunks text by sentence boundaries.
Args:
text: Input text to chunk
Returns:
List of Chunk objects aligned to sentence boundaries
"""
...
@classmethod
def from_recipe(
cls,
name: str,
lang: Optional[str] = "en",
**kwargs
) -> SentenceChunker:
"""
Creates chunker from a predefined recipe.
Args:
name: Recipe name
lang: Language code
**kwargs: Additional parameters for the chunker
Returns:
Configured SentenceChunker instance
"""
        ...
Usage example:
from chonkie import SentenceChunker
# Create sentence-based chunker
chunker = SentenceChunker(
tokenizer="gpt2",
chunk_size=512,
chunk_overlap=1, # Overlap by 1 sentence
min_sentences_per_chunk=2
)
chunks = chunker("First sentence. Second sentence. Third sentence.")Recursively splits text using hierarchical rules (paragraphs, sentences, words, etc.) for semantically meaningful chunks.
Recursively splits text using hierarchical rules (paragraphs, sentences, words, etc.) for semantically meaningful chunks.
class RecursiveChunker(BaseChunker):
"""
Recursively splits text using hierarchical rules.
Args:
tokenizer: Tokenizer instance or identifier (default: 'character')
chunk_size: Maximum number of tokens per chunk (default: 2048)
rules: RecursiveRules defining the hierarchy (default: RecursiveRules())
min_characters_per_chunk: Minimum characters required per chunk (default: 24)
"""
def __init__(
self,
tokenizer: Union[str, TokenizerProtocol] = "character",
chunk_size: int = 2048,
rules: RecursiveRules = ...,
min_characters_per_chunk: int = 24
): ...
def chunk(self, text: str) -> list[Chunk]:
"""
Chunks text recursively according to hierarchical rules.
Args:
text: Input text to chunk
Returns:
List of Chunk objects created using recursive splitting
"""
...
@classmethod
def from_recipe(
cls,
name: Optional[str] = "default",
lang: Optional[str] = "en",
path: Optional[str] = None,
**kwargs
) -> RecursiveChunker:
"""
Creates chunker from a predefined recipe.
Args:
name: Recipe name (e.g., 'default', 'markdown', 'code')
lang: Language code
path: Optional path to custom recipe file
**kwargs: Additional parameters for the chunker
Returns:
Configured RecursiveChunker instance
"""
        ...
Usage example:
from chonkie import RecursiveChunker, RecursiveRules
# Use default rules
chunker = RecursiveChunker(chunk_size=512)
# Use markdown-specific rules
chunker = RecursiveChunker.from_recipe(name="markdown", chunk_size=512)
# Custom rules
from chonkie import RecursiveLevel
rules = RecursiveRules(levels=[
RecursiveLevel(delimiters=["\n\n"]), # Split on paragraphs first
RecursiveLevel(delimiters=[". "]), # Then sentences
RecursiveLevel(whitespace=True) # Finally words
])
chunker = RecursiveChunker(rules=rules, chunk_size=512)
chunks = chunker("Your text here...")Uses embedding-based semantic similarity and peak detection to find optimal chunk boundaries.
Uses embedding-based semantic similarity and peak detection to find optimal chunk boundaries.
class SemanticChunker(BaseChunker):
"""
Uses embedding similarity to find optimal chunk boundaries.
Args:
embedding_model: Embedding model identifier or instance (default: 'minishlab/potion-base-32M')
threshold: Similarity threshold for splitting (0.0-1.0, default: 0.8)
chunk_size: Maximum number of tokens per chunk (default: 2048)
similarity_window: Window size for similarity computation (default: 3)
min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
min_characters_per_sentence: Minimum characters per sentence (default: 24)
delim: Sentence delimiters (default: ['. ', '! ', '? ', '\\n'])
include_delim: Include delimiter with 'prev' or 'next' (default: 'prev')
skip_window: Skip window size for peak detection (default: 0)
filter_window: Savitzky-Golay filter window size (default: 5)
filter_polyorder: Savitzky-Golay polynomial order (default: 3)
filter_tolerance: Filter tolerance for smoothing (default: 0.2)
**kwargs: Additional arguments for embedding model
"""
def __init__(
self,
embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-base-32M",
threshold: float = 0.8,
chunk_size: int = 2048,
similarity_window: int = 3,
min_sentences_per_chunk: int = 1,
min_characters_per_sentence: int = 24,
delim: Union[str, list[str]] = [". ", "! ", "? ", "\n"],
include_delim: Optional[Literal["prev", "next"]] = "prev",
skip_window: int = 0,
filter_window: int = 5,
filter_polyorder: int = 3,
filter_tolerance: float = 0.2,
**kwargs: Any
): ...
def chunk(self, text: str) -> list[Chunk]:
"""
Chunks text using semantic similarity analysis.
Args:
text: Input text to chunk
Returns:
List of Chunk objects with embeddings, split at semantic boundaries
"""
        ...
Usage example:
from chonkie import SemanticChunker
# Create semantic chunker
chunker = SemanticChunker(
embedding_model="all-MiniLM-L6-v2",
threshold=0.75,
chunk_size=512
)
chunks = chunker("Your text here...")
# Chunks include embeddings
for chunk in chunks:
print(f"Text: {chunk.text}")
print(f"Embedding shape: {chunk.embedding.shape}")Specialized chunker for source code using tree-sitter AST parsing to create structurally meaningful chunks.
Note: CodeChunker requires optional dependencies. Install with: pip install chonkie[code] or manually install tree-sitter-language-pack and magika.
class CodeChunker(BaseChunker):
"""
Specialized chunker for source code using AST parsing.
Requires optional dependencies: tree-sitter-language-pack, magika
Args:
tokenizer: Tokenizer instance or identifier (default: 'character')
chunk_size: Maximum number of tokens per chunk (default: 2048)
language: Programming language for parsing or 'auto' for detection (default: 'auto')
include_nodes: If True, include AST node information in chunk metadata (default: False)
"""
def __init__(
self,
tokenizer: Union[str, TokenizerProtocol] = "character",
chunk_size: int = 2048,
language: Union[Literal["auto"], Any] = "auto",
include_nodes: bool = False
): ...
def chunk(self, text: str) -> list[Chunk]:
"""
Chunks code using AST structure.
Args:
text: Source code to chunk
Returns:
List of Chunk objects aligned to code structure (functions, classes, etc.)
"""
        ...
Usage example:
from chonkie import CodeChunker
# Create code chunker with auto-detection
chunker = CodeChunker(language="auto", chunk_size=512)
# Or specify language explicitly
python_chunker = CodeChunker(language="python", chunk_size=512)
code = """
def function1():
pass
class MyClass:
def method1(self):
pass
"""
chunks = chunker(code)
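In practice the source text usually comes from a file; a minimal sketch (the path is illustrative):
from pathlib import Path
# Hypothetical input file; auto-detection picks the language from content
source = Path("my_module.py").read_text()
file_chunks = chunker(source)
print(f"{len(file_chunks)} chunks from {len(source)} characters")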
Combines recursive chunking with late-stage embedding computation for semantic optimization.
class LateChunker(BaseChunker):
"""
Recursively chunks with embedding-based merging at late stage.
Args:
embedding_model: Embedding model identifier or instance (default: 'nomic-ai/modernbert-embed-base')
chunk_size: Maximum number of tokens per chunk (default: 2048)
rules: RecursiveRules defining the hierarchy (default: RecursiveRules())
min_characters_per_chunk: Minimum characters per chunk (default: 24)
**kwargs: Additional arguments for embedding model
"""
def __init__(
self,
embedding_model: Union[str, SentenceTransformerEmbeddings, Any] = "nomic-ai/modernbert-embed-base",
chunk_size: int = 2048,
rules: RecursiveRules = ...,
min_characters_per_chunk: int = 24,
**kwargs: Any
): ...
def chunk(self, text: str) -> list[Chunk]:
"""
Recursively chunks with late-stage semantic merging.
Args:
text: Input text to chunk
Returns:
List of Chunk objects with embeddings
"""
...
@classmethod
def from_recipe(cls, **kwargs) -> LateChunker:
"""Creates chunker from recipe."""
        ...
Usage example:
from chonkie import LateChunker
# Create late chunker
chunker = LateChunker(
embedding_model="all-MiniLM-L6-v2",
chunk_size=512
)
chunks = chunker("Your text here...")Uses LLM (via Genie) to intelligently determine chunk boundaries, also known as AgenticChunker.
Uses an LLM (via a Genie integration) to intelligently determine chunk boundaries; also known as AgenticChunker.
class SlumberChunker(BaseChunker):
"""
Uses LLM to intelligently determine chunk boundaries.
Args:
genie: LLM integration instance (e.g., OpenAIGenie) (default: None)
tokenizer: Tokenizer instance or identifier (default: 'character')
chunk_size: Maximum number of tokens per chunk (default: 2048)
rules: RecursiveRules for initial splitting (default: RecursiveRules())
candidate_size: Token size for candidate chunks presented to LLM (default: 128)
min_characters_per_chunk: Minimum characters per chunk (default: 24)
verbose: If True, print LLM decision details (default: True)
"""
def __init__(
self,
genie: Optional[BaseGenie] = None,
tokenizer: Union[str, TokenizerProtocol] = "character",
chunk_size: int = 2048,
rules: RecursiveRules = ...,
candidate_size: int = 128,
min_characters_per_chunk: int = 24,
verbose: bool = True
): ...
def chunk(self, text: str) -> list[Chunk]:
"""
Chunks using LLM-guided decisions.
Args:
text: Input text to chunk
Returns:
List of Chunk objects determined by LLM analysis
"""
        ...
Usage example:
from chonkie import SlumberChunker, OpenAIGenie
# Create with OpenAI
genie = OpenAIGenie(model="gpt-4", api_key="your-key")
chunker = SlumberChunker(genie=genie, chunk_size=512)
chunks = chunker("Complex text requiring intelligent splitting...")Uses neural token classification models to predict chunk boundaries.
class NeuralChunker(BaseChunker):
"""
Uses neural models to predict chunk boundaries.
Args:
model: Model identifier or instance (default: 'mirth/chonky_distilbert_base_uncased_1')
tokenizer: Tokenizer for the model (default: None, uses model's tokenizer)
device_map: Device mapping for model (default: 'auto')
min_characters_per_chunk: Minimum characters per chunk (default: 10)
stride: Stride for sliding window processing (default: None)
"""
def __init__(
self,
model: Union[str, Any] = "mirth/chonky_distilbert_base_uncased_1",
tokenizer: Optional[Union[str, Any]] = None,
device_map: str = "auto",
min_characters_per_chunk: int = 10,
stride: Optional[int] = None
): ...
def chunk(self, text: str) -> list[Chunk]:
"""
Chunks using neural boundary detection.
Args:
text: Input text to chunk
Returns:
List of Chunk objects identified by neural model
"""
        ...
Usage example:
from chonkie import NeuralChunker
# Create neural chunker (requires the optional neural model dependencies)
chunker = NeuralChunker(min_characters_per_chunk=50)
chunks = chunker("Your text here...")Specialized chunker for markdown tables that preserves table structure while splitting by rows.
class TableChunker(BaseChunker):
"""
Specialized chunker for markdown tables preserving structure.
Args:
tokenizer: Tokenizer instance or identifier (default: 'row')
chunk_size: Maximum number of rows per chunk (default: 3)
"""
def __init__(
self,
tokenizer: Union[str, TokenizerProtocol] = "row",
chunk_size: int = 3
): ...
def chunk(self, text: str) -> list[Chunk]:
"""
Chunks markdown tables by rows.
Args:
text: Markdown table text
Returns:
List of Chunk objects, each containing header + rows
"""
        ...
Usage example:
from chonkie import TableChunker
chunker = TableChunker(chunk_size=5) # 5 rows per chunk
table_text = """
| Col1 | Col2 | Col3 |
|------|------|------|
| A | B | C |
| D | E | F |
| G | H | I |
"""
chunks = chunker(table_text)
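Per the chunk() docstring above, each chunk repeats the header row before its slice of data rows; printing the chunks makes that visible:
for chunk in chunks:
    print(chunk.text)
    print("---")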
High-performance chunker using the memchunk library for SIMD-accelerated delimiter-based splitting.
class FastChunker(BaseChunker):
"""
High-performance chunker using memchunk for fast delimiter-based splitting.
Note: FastChunker uses byte-based splitting and does not use a tokenizer.
It sets _tokenizer = None internally and operates directly on character positions.
Args:
chunk_size: Maximum chunk size in characters (default: 4096)
delimiters: String of delimiter characters (default: '\\n.?')
pattern: Optional regex pattern for splitting (default: None)
prefix: If True, include delimiters as prefix (default: False)
consecutive: If True, treat consecutive delimiters as one (default: False)
forward_fallback: If True, use forward search when no delimiter found (default: False)
"""
def __init__(
self,
chunk_size: int = 4096,
delimiters: str = "\n.?",
pattern: Optional[str] = None,
prefix: bool = False,
consecutive: bool = False,
forward_fallback: bool = False
): ...
def chunk(self, text: str) -> list[Chunk]:
"""
Fast delimiter-based chunking.
Args:
text: Input text to chunk
Returns:
List of Chunk objects created using SIMD-accelerated splitting
"""
        ...
Usage example:
from chonkie import FastChunker
# Create fast chunker (requires the fast extra: pip install chonkie[fast])
chunker = FastChunker(
chunk_size=1024,
delimiters="\n.!?",
consecutive=True
)
chunks = chunker("Your large text here...")All chunkers are available from the main package:
from chonkie import (
BaseChunker,
TokenChunker,
SentenceChunker,
RecursiveChunker,
SemanticChunker,
CodeChunker,
LateChunker,
SlumberChunker,
NeuralChunker,
TableChunker,
FastChunker,
)
All chunkers can be used in the Pipeline API via their aliases:
from chonkie import Pipeline
pipe = (
Pipeline()
.chunk_with("token", chunk_size=512, chunk_overlap=50)
.chunk_with("semantic", threshold=0.8)
)
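Executing the pipeline is version-dependent; a sketch assuming a run() entry point that accepts raw texts (check the Pipeline documentation for your installed version):
from chonkie import Pipeline
simple_pipe = Pipeline().chunk_with("recursive", chunk_size=512)
# run(texts=...) is an assumed call shape, not a confirmed signature
doc = simple_pipe.run(texts="Your long document text...")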
Chunker aliases:
token - TokenChunker
sentence - SentenceChunker
recursive - RecursiveChunker
semantic - SemanticChunker
code - CodeChunker
late - LateChunker
slumber - SlumberChunker
neural - NeuralChunker
fast - FastChunker