tessl install tessl/pypi-chonkie@1.5.0

The lightweight ingestion library for fast, efficient and robust RAG pipelines.
Post-processing components for enhancing chunks with contextual overlap and embeddings after initial chunking.
Abstract base class for all refinery implementations.
from abc import ABC, abstractmethod
class BaseRefinery(ABC):
    """Base class for all refinery implementations that post-process chunks.

    `refine` is declared abstract, so concrete refineries must implement it.
    """

    def __init__(self): ...

    @abstractmethod
    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """Refine a list of chunks.

        Args:
            chunks: List of chunks to refine.

        Returns:
            List of refined chunks.
        """
        ...

    def refine_document(self, document: Document) -> Document:
        """Refine the chunks within a document.

        Args:
            document: Document containing chunks to refine.

        Returns:
            Document with refined chunks.
        """
        ...

    def __call__(self, chunks: list[Chunk]) -> list[Chunk]:
        """Allow the refinery to be called as a function.

        Args:
            chunks: List of chunks to refine.

        Returns:
            List of refined chunks.
        """
...

Adds embeddings to chunks using a specified embedding model.
from typing import Union, Any
class EmbeddingsRefinery(BaseRefinery):
    """Adds embeddings to chunks using an embedding model.

    Args:
        embedding_model: Embedding model identifier, BaseEmbeddings instance,
            or AutoEmbeddings (default: 'minishlab/potion-retrieval-32M').
        **kwargs: Additional arguments for the embedding model.
    """

    def __init__(
        self,
        embedding_model: Union[str, BaseEmbeddings, AutoEmbeddings] = "minishlab/potion-retrieval-32M",
        # A **kwargs annotation describes the type of each keyword *value*.
        # Callers pass plain values (e.g. api_key="your-key"), not dicts,
        # so the value type is Any rather than dict[str, Any].
        **kwargs: Any
    ): ...

    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """Add embeddings to each chunk.

        Args:
            chunks: List of chunks to add embeddings to.

        Returns:
            List of chunks with embedding vectors populated.
        """
        ...

    @property
    def dimension(self) -> int:
        """Return the embedding dimension.

        Returns:
            Embedding vector dimension.
        """
...

Usage example:
from chonkie import TokenChunker, EmbeddingsRefinery

# Create chunks
chunker = TokenChunker(chunk_size=512)
chunks = chunker("Your text here...")

# Add embeddings (the refinery instance is called like a function)
refinery = EmbeddingsRefinery(embedding_model="all-MiniLM-L6-v2")
chunks_with_embeddings = refinery(chunks)

# Access embeddings
for chunk in chunks_with_embeddings:
    print(f"Text: {chunk.text}")
    print(f"Embedding shape: {chunk.embedding.shape}")

# Use with different embedding providers; extra keyword arguments such as
# api_key are the "additional arguments for the embedding model"
openai_refinery = EmbeddingsRefinery(
    embedding_model="openai/text-embedding-3-small",
    api_key="your-key"
)

# Use in pipeline
from chonkie import Pipeline

pipe = (
    Pipeline()
    .chunk_with("recursive", chunk_size=512)
    .refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
)
doc = pipe.run("Your text...")

Adds contextual overlap between adjacent chunks by including content from neighboring chunks.
from typing import Union, Literal
class OverlapRefinery(BaseRefinery):
    """Adds contextual overlap between adjacent chunks.

    Args:
        tokenizer: Tokenizer instance or identifier (default: 'character').
        context_size: Size of overlap context in tokens, or as a fraction
            if < 1.0 (default: 0.25).
        mode: Overlap mode - 'token' for fixed token count or 'recursive'
            for hierarchical (default: 'token').
        method: Overlap method - 'suffix' adds end of previous chunk,
            'prefix' adds start of next (default: 'suffix').
        rules: RecursiveRules for recursive mode (default: RecursiveRules()).
        merge: If True, merge context into chunk text; if False, store it
            separately (default: True).
        inplace: If True, modify chunks in place; if False, create copies
            (default: True).
    """

    def __init__(
        self,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        context_size: Union[int, float] = 0.25,
        mode: Literal["token", "recursive"] = "token",
        method: Literal["suffix", "prefix"] = "suffix",
        # The default is elided in this stub; the class docstring gives it
        # as RecursiveRules().
        rules: RecursiveRules = ...,
        merge: bool = True,
        inplace: bool = True
    ): ...

    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """Add overlap context to chunks.

        Args:
            chunks: List of chunks to add overlap to.

        Returns:
            List of chunks with contextual overlap added.
        """
        ...

    def clear_cache(self) -> None:
        """Clear the internal LRU cache for tokenization."""
        ...

    def cache_info(self) -> dict:
        """Return cache statistics.

        Returns:
            Dictionary with cache statistics. Contains 'tokens_cache' and
            'count_cache' keys, each with a dict containing 'hits',
            'misses', 'maxsize', and 'currsize' fields.
        """
...

Usage example:
from chonkie import SentenceChunker, OverlapRefinery

# Create chunks
chunker = SentenceChunker(chunk_size=512)
chunks = chunker("Sentence one. Sentence two. Sentence three...")

# Add overlap with suffix method (default)
refinery = OverlapRefinery(
    context_size=128,  # 128 tokens of overlap
    method="suffix"  # Include end of previous chunk
)
overlapped_chunks = refinery(chunks)

# Each chunk now includes context from the previous chunk
for chunk in overlapped_chunks:
    print(f"Main text: {chunk.text}")
    if chunk.context:
        print(f"Context: {chunk.context}")

# Use fractional overlap (a context_size below 1.0 is treated as a fraction)
fractional_refinery = OverlapRefinery(
    context_size=0.25,  # 25% of chunk size
    merge=False  # Store context separately
)

# Recursive mode for semantic overlap
from chonkie import RecursiveRules

rules = RecursiveRules.from_recipe("default")
recursive_refinery = OverlapRefinery(
    mode="recursive",
    context_size=100,
    rules=rules
)

# Use in pipeline
from chonkie import Pipeline

pipe = (
    Pipeline()
    .chunk_with("token", chunk_size=512)
    .refine_with("overlap", context_size=0.2, method="suffix")
)
doc = pipe.run("Your text...")

All refineries are available from the main package:
from chonkie import (
BaseRefinery,
EmbeddingsRefinery,
OverlapRefinery,
)

Refineries are used in pipelines via the refine_with() method:
from chonkie import Pipeline
pipe = (
Pipeline()
.chunk_with("recursive", chunk_size=512)
.refine_with("overlap", context_size=128)
.refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
)

Refinery aliases:
- overlap - OverlapRefinery
- embeddings - EmbeddingsRefinery

Create custom refineries by extending BaseRefinery:
from chonkie import BaseRefinery, Chunk


class CustomRefinery(BaseRefinery):
    """Example refinery that tags every chunk with a configurable label.

    Args:
        my_param: Value recorded under the "processed_by" metadata key.
    """

    def __init__(self, my_param: str = "default"):
        super().__init__()
        self.my_param = my_param

    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """Set each chunk's metadata and return the same list."""
        # Custom refinement logic
        for chunk in chunks:
            # Modify chunks as needed
            chunk.metadata = {"processed_by": self.my_param}
        return chunks


# Register with pipeline
from chonkie.pipeline import refinery


@refinery("custom")
class RegisteredRefinery(BaseRefinery):
    """Example refinery registered under the "custom" pipeline alias.

    Args:
        my_param: Example configuration value; stored but unused by refine.
    """

    def __init__(self, my_param: str = "default"):
        # The pipeline usage below calls refine_with("custom", my_param="value").
        # Presumably refine_with forwards its keyword arguments to the
        # refinery constructor (as the built-in "overlap"/"embeddings"
        # examples suggest), so the registered class must accept my_param.
        super().__init__()
        self.my_param = my_param

    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """Return the chunks unchanged."""
        return chunks


# Use in pipeline
from chonkie import Pipeline
pipe = Pipeline().refine_with("custom", my_param="value")

Multiple refineries can be chained together:
from chonkie import TokenChunker, OverlapRefinery, EmbeddingsRefinery

# Create chunks
chunker = TokenChunker(chunk_size=512)
chunks = chunker("Your text here...")

# Apply multiple refineries in sequence
overlap_refinery = OverlapRefinery(context_size=128)
embedding_refinery = EmbeddingsRefinery(embedding_model="all-MiniLM-L6-v2")

# Chain refineries: the output of the overlap step is the input of the
# embedding step
chunks = overlap_refinery(chunks)
chunks = embedding_refinery(chunks)
# Or use pipeline
from chonkie import Pipeline
pipe = (
Pipeline()
.chunk_with("token", chunk_size=512)
.refine_with("overlap", context_size=128)
.refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
)

- inplace=True modifies chunks directly (faster, lower memory)
- inplace=False creates copies (safer for parallel processing)
- merge=True combines context with text (simpler)
- merge=False stores context separately (preserves original)

from chonkie import OverlapRefinery
# High-performance settings
refinery = OverlapRefinery(
    inplace=True,  # Modify in place
    merge=True  # Combine context with text
)

# Safe for parallel processing
# NOTE: this rebinds `refinery`, so the calls below act on this second
# instance, not the high-performance one created above.
refinery = OverlapRefinery(
    inplace=False,  # Create copies
    merge=False  # Separate context
)

# Clear cache if memory is constrained
refinery.clear_cache()

# Check cache performance; cache_info() returns one stats dict per cache,
# keyed 'tokens_cache' and 'count_cache'
info = refinery.cache_info()
print(f"Tokens cache - hits: {info['tokens_cache']['hits']}, misses: {info['tokens_cache']['misses']}")
print(f"Count cache - hits: {info['count_cache']['hits']}, misses: {info['count_cache']['misses']}")