tessl install tessl/pypi-chonkie@1.5.0

The lightweight ingestion library for fast, efficient, and robust RAG pipelines.
A comprehensive Python library for text chunking in RAG pipelines. Chonkie provides 11 different chunking strategies, seamless integration with 9+ embedding providers, direct connections to 10+ vector databases, and a fluent pipeline API for building end-to-end text processing workflows.
pip install chonkie

import chonkie

Common imports for chunking:
from chonkie import TokenChunker, SentenceChunker, RecursiveChunker, SemanticChunker
from chonkie import Chunk, Document
from chonkie import Pipeline

from chonkie import RecursiveChunker
# Initialize chunker with default settings
chunker = RecursiveChunker()
# Chunk text
chunks = chunker("Chonkie is a comprehensive text chunking library for RAG pipelines.")
# Access chunk information
for chunk in chunks:
    print(f"Text: {chunk.text}")
    print(f"Tokens: {chunk.token_count}")
    print(f"Position: {chunk.start_index}-{chunk.end_index}")

Using the Pipeline API for end-to-end workflows:
from chonkie import Pipeline
# Build a pipeline with multiple processing steps
pipe = (
    Pipeline()
    .chunk_with("recursive", tokenizer="gpt2", chunk_size=2048, recipe="markdown")
    .refine_with("overlap", context_size=128)
    .refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
    .store_in("chroma", collection_name="my_chunks")
)
# Execute pipeline
doc = pipe.run(texts="Your text content here")
# Access processed chunks
for chunk in doc.chunks:
    print(chunk.text, chunk.embedding)

Chonkie follows the CHOMP pipeline architecture for text processing: Fetchers load raw data, Chefs preprocess it into documents, Chunkers split the text, Refineries post-process the resulting chunks, and Porters and Handshakes export chunks or store them in a vector database.
This modular design allows you to compose custom text processing workflows by chaining components together via the Pipeline API.
Fundamental data structures for representing chunks, documents, and configuration.
class Chunk:
    """Represents a text chunk with metadata."""
    def __init__(
        self,
        id: str = ...,
        text: str = "",
        start_index: int = 0,
        end_index: int = 0,
        token_count: int = 0,
        context: Optional[str] = None,
        embedding: Union[list[float], np.ndarray, None] = None
    ): ...

class Document:
    """Container for text content with chunks and metadata."""
    def __init__(
        self,
        id: str = ...,
        content: str = "",
        chunks: list[Chunk] = ...,
        metadata: dict[str, Any] = ...
    ): ...

class Sentence:
    """Represents a sentence with position and token information."""
    def __init__(
        self,
        text: str,
        start_index: int,
        end_index: int,
        token_count: int,
        embedding: Union[list[float], np.ndarray, None] = None
    ): ...
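Chunkers construct these objects for you, but they can also be built by hand, e.g. for tests. A minimal sketch using the signatures above (field values are illustrative; with the default character tokenizer, token_count equals the character count):

from chonkie import Chunk, Document

text = "Chonkie chunks text for RAG."

# Positional metadata maps the chunk back into the source text.
chunk = Chunk(text=text, start_index=0, end_index=len(text), token_count=len(text))
doc = Document(content=text, chunks=[chunk])
print(doc.chunks[0].text, doc.chunks[0].token_count)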
11 different text chunking strategies for various use cases, from simple token-based splitting to advanced semantic and LLM-guided chunking.

class TokenChunker(BaseChunker):
    """Splits text into fixed-size token chunks with optional overlap."""
    def __init__(
        self,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        chunk_size: int = 2048,
        chunk_overlap: Union[int, float] = 0
    ): ...

class SentenceChunker(BaseChunker):
    """Splits text based on sentence boundaries while respecting token limits."""
    def __init__(
        self,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        chunk_size: int = 2048,
        chunk_overlap: int = 0,
        min_sentences_per_chunk: int = 1,
        min_characters_per_sentence: int = 12,
        approximate: bool = False,
        delim: Union[str, list[str]] = [". ", "! ", "? ", "\n"],
        include_delim: Optional[Literal["prev", "next"]] = "prev"
    ): ...

class RecursiveChunker(BaseChunker):
    """Recursively splits text using hierarchical rules."""
    def __init__(
        self,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        chunk_size: int = 2048,
        rules: RecursiveRules = ...,
        min_characters_per_chunk: int = 24
    ): ...

class SemanticChunker(BaseChunker):
    """Uses embedding similarity to find optimal chunk boundaries."""
    def __init__(
        self,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-base-32M",
        threshold: float = 0.8,
        chunk_size: int = 2048,
        similarity_window: int = 3,
        min_sentences_per_chunk: int = 1,
        min_characters_per_sentence: int = 24,
        delim: Union[str, list[str]] = [". ", "! ", "? ", "\n"],
        include_delim: Optional[Literal["prev", "next"]] = "prev",
        skip_window: int = 0,
        filter_window: int = 5,
        filter_polyorder: int = 3,
        filter_tolerance: float = 0.2,
        **kwargs: dict[str, Any]
    ): ...

class CodeChunker(BaseChunker):
    """Specialized chunker for source code using AST parsing."""
    def __init__(
        self,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        chunk_size: int = 2048,
        language: Union[Literal["auto"], Any] = "auto",
        include_nodes: bool = False
    ): ...
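All chunkers are callable, as in the quick-start example above. A short sketch contrasting fixed-size and sentence-aware chunking (sizes are illustrative; with the default "character" tokenizer, chunk_size counts characters):

from chonkie import TokenChunker, SentenceChunker

text = "Chonkie splits text. It keeps token budgets in check. Chunks stay coherent."

token_chunker = TokenChunker(chunk_size=32, chunk_overlap=8)
sentence_chunker = SentenceChunker(chunk_size=48, min_sentences_per_chunk=1)

# Both return a list of Chunk objects when called on raw text.
for chunker in (token_chunker, sentence_chunker):
    chunks = chunker(text)
    print(type(chunker).__name__, [c.token_count for c in chunks])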
Flexible tokenization supporting character-level, word-level, byte-level, and integration with HuggingFace tokenizers and tiktoken.

class AutoTokenizer:
    """Auto-loading tokenizer supporting multiple backends."""
    def __init__(self, tokenizer: Union[str, Callable, Any] = "character"): ...
    def encode(self, text: str) -> Sequence[int]: ...
    def decode(self, tokens: Sequence[int]) -> str: ...
    def count_tokens(self, text: str) -> int: ...

class CharacterTokenizer(Tokenizer):
    """Character-level tokenizer treating each character as a token."""
    def tokenize(self, text: str) -> Sequence[str]: ...

class WordTokenizer(Tokenizer):
    """Word-level tokenizer splitting on whitespace."""
    def tokenize(self, text: str) -> Sequence[str]: ...

class ByteTokenizer(Tokenizer):
    """Byte-level tokenizer operating on UTF-8 bytes."""
    def tokenize(self, text: str) -> Sequence[int]: ...
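A sketch of round-tripping text through AutoTokenizer, assuming it is importable from the package root like the classes above (passing a model name such as "gpt2" would instead load a HuggingFace or tiktoken backend):

from chonkie import AutoTokenizer  # import location assumed

tokenizer = AutoTokenizer("character")
tokens = tokenizer.encode("Chonkie")
print(tokenizer.count_tokens("Chonkie"))  # 7 with the character backend
print(tokenizer.decode(tokens))           # round-trips back to "Chonkie"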
Integration with 9+ embedding providers including OpenAI, Cohere, Gemini, sentence-transformers, and model2vec.

class AutoEmbeddings:
    """Factory for automatically loading embedding providers."""
    @staticmethod
    def get_embeddings(
        model: Union[str, BaseEmbeddings, Any],
        **kwargs: Any
    ) -> BaseEmbeddings: ...

class SentenceTransformerEmbeddings(BaseEmbeddings):
    """Embeddings using sentence-transformers library."""
    def __init__(
        self,
        model: Union[str, SentenceTransformer] = "all-MiniLM-L6-v2",
        **kwargs: Any
    ): ...

class OpenAIEmbeddings(BaseEmbeddings):
    """Embeddings using OpenAI's API."""
    def __init__(
        self,
        model: str = ...,
        tokenizer: Optional[Any] = None,
        dimension: Optional[int] = None,
        max_tokens: Optional[int] = None,
        base_url: Optional[str] = None,
        api_key: Optional[str] = None,
        max_retries: int = 3,
        timeout: float = 60.0,
        batch_size: int = 128,
        **kwargs: dict[str, Any]
    ): ...
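A sketch of loading a provider via the factory; the embed method is an assumption about the BaseEmbeddings interface, and the sentence-transformers extra must be installed for this model string:

from chonkie import AutoEmbeddings

embeddings = AutoEmbeddings.get_embeddings("all-MiniLM-L6-v2")
vector = embeddings.embed("Chonkie embeds chunks.")  # embed() assumed on BaseEmbeddings
print(len(vector))  # dimensionality of the loaded model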
Fluent API for building composable text processing workflows with component registration and recipe support.

class Pipeline:
    """Fluent API for building CHOMP pipelines."""
    def __init__(self): ...
    def fetch_from(self, source_type: str, **kwargs: Any) -> Pipeline: ...
    def process_with(self, chef_type: str, **kwargs: Any) -> Pipeline: ...
    def chunk_with(self, chunker_type: str, **kwargs: Any) -> Pipeline: ...
    def refine_with(self, refinery_type: str, **kwargs: Any) -> Pipeline: ...
    def export_with(self, porter_type: str, **kwargs: Any) -> Pipeline: ...
    def store_in(self, handshake_type: str, **kwargs: Any) -> Pipeline: ...
    def run(
        self,
        texts: Optional[Union[str, list[str]]] = None
    ) -> Union[Document, list[Document]]: ...
    @classmethod
    def from_recipe(cls, name: str, path: Optional[str] = None) -> Pipeline: ...
    @classmethod
    def from_config(
        cls,
        config: Union[str, list[Union[tuple[Any, ...], dict[str, Any]]]]
    ) -> Pipeline: ...
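Pipelines can also be loaded declaratively. A sketch using from_recipe; the recipe name "markdown" is an assumption based on the recipe option shown in the fluent example above:

from chonkie import Pipeline

pipe = Pipeline.from_recipe("markdown")  # recipe name assumed to exist on the Hub
doc = pipe.run(texts="# Heading\n\nSome markdown content.")
for chunk in doc.chunks:
    print(chunk.token_count)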
Post-processing components for adding contextual overlap and embeddings to chunks.

class EmbeddingsRefinery(BaseRefinery):
    """Adds embeddings to chunks using an embedding model."""
    def __init__(
        self,
        embedding_model: Union[str, BaseEmbeddings, AutoEmbeddings] = "minishlab/potion-retrieval-32M",
        **kwargs: dict[str, Any]
    ): ...

class OverlapRefinery(BaseRefinery):
    """Adds contextual overlap between adjacent chunks."""
    def __init__(
        self,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        context_size: Union[int, float] = 0.25,
        mode: Literal["token", "recursive"] = "token",
        method: Literal["suffix", "prefix"] = "suffix",
        rules: RecursiveRules = ...,
        merge: bool = True,
        inplace: bool = True
    ): ...
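Refineries post-process chunks that a chunker has already produced. A sketch assuming refineries are invoked like chunkers (callable on a list of chunks):

from chonkie import RecursiveChunker, OverlapRefinery

chunks = RecursiveChunker()("First paragraph. Second paragraph. Third paragraph.")

refinery = OverlapRefinery(context_size=32, method="suffix")
refined = refinery(chunks)  # invocation style assumed; mirrors the chunker API
for chunk in refined:
    print(chunk.context)  # overlap text assumed to land on the Chunk.context field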
File loading and text preprocessing with support for plain text, markdown, and tabular data.

class FileFetcher(BaseFetcher):
    """Fetches files from local filesystem."""
    def fetch(
        self,
        path: Optional[str] = None,
        dir: Optional[str] = None,
        ext: Optional[list[str]] = None
    ) -> Union[Path, list[Path]]: ...

class TextChef(BaseChef):
    """Processes plain text files into documents."""
    def __init__(self, tokenizer: Union[TokenizerProtocol, str] = "character"): ...
    def process(self, path: Union[str, Path]) -> Document: ...

class MarkdownChef(BaseChef):
    """Processes markdown files extracting tables, code blocks, and images."""
    def __init__(self, tokenizer: Union[TokenizerProtocol, str] = "character"): ...
    def process(self, path: Union[str, Path]) -> MarkdownDocument: ...
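A sketch wiring a fetcher into a chef by hand; the file path is a placeholder and the package-root imports are assumptions:

from chonkie import FileFetcher, TextChef  # import locations assumed

fetcher = FileFetcher()
path = fetcher.fetch(path="notes.txt")  # placeholder path
doc = TextChef().process(path)          # returns a Document
print(doc.content[:80])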
Direct integration with 10+ vector databases including ChromaDB, Qdrant, Pinecone, Weaviate, and more.

class ChromaHandshake(BaseHandshake):
    """Integration with ChromaDB vector database."""
    def __init__(
        self,
        client: Optional[Any] = None,
        collection_name: Union[str, Literal["random"]] = "random",
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        path: Optional[str] = None
    ): ...

class QdrantHandshake(BaseHandshake):
    """Integration with Qdrant vector database."""
    def __init__(
        self,
        client: Optional[QdrantClient] = None,
        collection_name: Union[str, Literal["random"]] = "random",
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        url: Optional[str] = None,
        path: Optional[str] = None,
        api_key: Optional[str] = None,
        **kwargs: dict[str, Any]
    ): ...
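A sketch of writing chunks straight into a local Chroma collection; the write method name is an assumption, and the chromadb extra is required:

from chonkie import RecursiveChunker, ChromaHandshake

chunks = RecursiveChunker()("Store these chunks in Chroma.")
handshake = ChromaHandshake(collection_name="my_chunks", path="./chroma_db")
handshake.write(chunks)  # method name assumed for persisting chunks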
Export chunks to JSON, JSONL, or HuggingFace datasets, with visualization and hub access utilities.

class JSONPorter(BasePorter):
    """Exports chunks to JSON or JSONL format."""
    def __init__(self, lines: bool = True): ...
    def export(self, chunks: list[Chunk], file: str = "chunks.jsonl") -> None: ...

class DatasetsPorter(BasePorter):
    """Exports chunks to HuggingFace Datasets format."""
    def export(self, chunks: list[Chunk], **kwargs: dict[str, Any]) -> Dataset: ...
    def to_dataset(self, chunks: list[Chunk]) -> Dataset: ...

class Visualizer:
    """Visualizes chunks with color-coded highlighting."""
    def __init__(self, theme: Union[str, list[str]] = "pastel"): ...
    def print(self, chunks: list[Chunk], full_text: Optional[str] = None) -> None: ...
    def save(
        self,
        filename: str,
        chunks: list[Chunk],
        full_text: Optional[str] = None,
        title: str = "Chunk Visualization"
    ) -> None: ...
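Exporting and inspecting chunks with the signatures above (the output file name is a placeholder):

from chonkie import RecursiveChunker, JSONPorter, Visualizer

text = "Export me. Then visualize me."
chunks = RecursiveChunker()(text)

JSONPorter(lines=True).export(chunks, file="chunks.jsonl")

viz = Visualizer(theme="pastel")
viz.print(chunks, full_text=text)  # color-coded chunk boundaries in the terminal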
LLM integrations for AI-powered chunking and recipe management via the Chonkie Hub.

class SlumberChunker(BaseChunker):
    """Uses LLM to intelligently determine chunk boundaries."""
    def __init__(
        self,
        genie: Optional[BaseGenie] = None,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        chunk_size: int = 2048,
        rules: RecursiveRules = ...,
        candidate_size: int = 128,
        min_characters_per_chunk: int = 24,
        verbose: bool = True
    ): ...

class OpenAIGenie(BaseGenie):
    """LLM integration for OpenAI models."""
    def __init__(
        self,
        model: str = "gpt-4.1",
        base_url: Optional[str] = None,
        api_key: Optional[str] = None
    ): ...

class Hubbie:
    """Manager for accessing recipes from the Chonkie Hub."""
    def get_recipe(
        self,
        name: Optional[str] = "default",
        lang: Optional[str] = "en",
        path: Optional[str] = None
    ) -> dict: ...
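A sketch of LLM-guided chunking with an OpenAI-backed genie; model and parameter choices are illustrative, and the API-key fallback to the environment is an assumption:

from chonkie import SlumberChunker, OpenAIGenie

genie = OpenAIGenie(model="gpt-4.1")  # assumed to fall back to OPENAI_API_KEY
chunker = SlumberChunker(genie=genie, chunk_size=1024, candidate_size=128)
chunks = chunker("A long document whose boundaries the LLM should decide...")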
Centralized logging configuration with programmatic control and environment variable support for debugging and monitoring.

def get_logger(module_name: str): ...
def configure(level: Optional[str] = None, format: Optional[str] = None) -> None: ...
def disable() -> None: ...
def enable(level: str = "INFO") -> None: ...
def is_enabled() -> bool: ...
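A sketch of toggling Chonkie's logs during development; the module import path is an assumption:

from chonkie import logger  # import path assumed

logger.enable(level="DEBUG")
log = logger.get_logger(__name__)
log.debug("chunking started")
logger.disable()  # silence Chonkie logs again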