or run

tessl search
Log in

Version

Workspace
tessl
Visibility
Public
Created
Last updated
Describes
pypi — pkg:pypi/chonkie@1.5.x

docs

advanced-features.md
chunkers.md
core-types.md
data-processing.md
embeddings.md
export.md
index.md
logging.md
pipeline.md
refineries.md
tokenizers.md
vector-databases.md
tile.json

tessl/pypi-chonkie

tessl install tessl/pypi-chonkie@1.5.0

The lightweight ingestion library for fast, efficient and robust RAG pipelines

refineries.mddocs/

Refineries

Post-processing components for enhancing chunks with contextual overlap and embeddings after initial chunking.

Capabilities

BaseRefinery

Abstract base class for all refinery implementations.

from abc import ABC, abstractmethod

class BaseRefinery(ABC):
    """
    Abstract base class for refineries, the components that post-process chunks.

    `refine` is declared abstract, so every concrete refinery must implement it.
    Method bodies are elided (`...`) in this listing; it documents the interface only.
    """
    # Takes no arguments besides self.
    def __init__(self) -> None: ...

    @abstractmethod
    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """
        Refines a list of chunks.

        Abstract: subclasses must override this method.

        Args:
            chunks: List of chunks to refine

        Returns:
            List of refined chunks
        """
        ...

    def refine_document(self, document: Document) -> Document:
        """
        Refines the chunks held by a document.

        Args:
            document: Document containing chunks to refine

        Returns:
            Document with refined chunks
        """
        ...

    def __call__(self, chunks: list[Chunk]) -> list[Chunk]:
        """
        Allows calling the refinery like a function, e.g. ``refinery(chunks)``.

        Takes and returns the same types as `refine`.
        NOTE(review): presumably forwards to `refine` — confirm against the implementation.

        Args:
            chunks: List of chunks to refine

        Returns:
            List of refined chunks
        """
        ...

EmbeddingsRefinery

Adds embeddings to chunks using a specified embedding model.

from typing import Union, Any

class EmbeddingsRefinery(BaseRefinery):
    """
    Adds embeddings to chunks using an embedding model.

    Method bodies are elided (`...`) in this listing; it documents the interface only.

    Args:
        embedding_model: Embedding model identifier, BaseEmbeddings instance, or AutoEmbeddings
            (default: 'minishlab/potion-retrieval-32M')
        **kwargs: Additional keyword arguments for the embedding model
            (for example ``api_key="..."``). Each value may be of any type.
    """
    def __init__(
        self,
        embedding_model: Union[str, BaseEmbeddings, AutoEmbeddings] = "minishlab/potion-retrieval-32M",
        # A **kwargs annotation describes each individual value, not the
        # collected mapping, so arbitrary keyword values are spelled `Any`.
        **kwargs: Any
    ) -> None: ...

    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """
        Adds embeddings to each chunk.

        Args:
            chunks: List of chunks to add embeddings to

        Returns:
            List of chunks with embedding vectors populated
        """
        ...

    @property
    def dimension(self) -> int:
        """
        Returns the embedding dimension.

        Exposed as a read-only property (accessed as ``refinery.dimension``).

        Returns:
            Embedding vector dimension
        """
        ...

Usage example:

from chonkie import EmbeddingsRefinery, Pipeline, TokenChunker

# Step 1: split the raw text into token-based chunks.
chunker = TokenChunker(chunk_size=512)
chunks = chunker("Your text here...")

# Step 2: run the chunks through an embeddings refinery.
refinery = EmbeddingsRefinery(embedding_model="all-MiniLM-L6-v2")
chunks_with_embeddings = refinery(chunks)

# Step 3: read the text and the embedding back from every refined chunk.
for chunk in chunks_with_embeddings:
    print(f"Text: {chunk.text}")
    print(f"Embedding shape: {chunk.embedding.shape}")

# Another provider: extra keyword arguments (here an API key) are passed
# alongside the model identifier.
openai_refinery = EmbeddingsRefinery(
    api_key="your-key",
    embedding_model="openai/text-embedding-3-small",
)

# The same refinement expressed as a pipeline, built one stage at a time.
pipe = Pipeline().chunk_with("recursive", chunk_size=512)
pipe = pipe.refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")

doc = pipe.run("Your text...")

OverlapRefinery

Adds contextual overlap between adjacent chunks by including content from neighboring chunks.

from typing import Union, Literal

class OverlapRefinery(BaseRefinery):
    """
    Adds contextual overlap between adjacent chunks.

    Method bodies are elided (`...`) in this listing; it documents the interface only.

    Args:
        tokenizer: Tokenizer instance or identifier (default: 'character')
        context_size: Size of the overlap context. An int is a token count; a float
            below 1.0 is treated as a fraction (default: 0.25)
        mode: Overlap mode - 'token' for fixed token count or 'recursive' for hierarchical
            (default: 'token')
        method: Overlap method - 'suffix' adds end of previous chunk, 'prefix' adds start of next
            (default: 'suffix')
        rules: RecursiveRules for recursive mode (default: RecursiveRules())
        merge: If True, merge context into chunk text; if False, store separately (default: True)
        inplace: If True, modify chunks in place; if False, create copies (default: True)
    """
    # NOTE(review): verify the 'suffix'/'prefix' descriptions above against the
    # library implementation; the direction (previous vs. next chunk) is easy to
    # invert and cannot be confirmed from this listing alone.
    # `rules = ...` only marks that the parameter has a default; the docstring
    # above gives that default as RecursiveRules().
    def __init__(
        self,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        context_size: Union[int, float] = 0.25,
        mode: Literal["token", "recursive"] = "token",
        method: Literal["suffix", "prefix"] = "suffix",
        rules: RecursiveRules = ...,
        merge: bool = True,
        inplace: bool = True
    ) -> None: ...

    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """
        Adds overlap context to chunks.

        Args:
            chunks: List of chunks to add overlap to

        Returns:
            List of chunks with contextual overlap added
        """
        ...

    def clear_cache(self) -> None:
        """
        Clears the internal LRU cache for tokenization.
        """
        ...

    def cache_info(self) -> dict:
        """
        Returns cache statistics.

        Returns:
            Dictionary with cache statistics. Contains 'tokens_cache' and 'count_cache' keys,
            each with a dict containing 'hits', 'misses', 'maxsize', and 'currsize' fields,
            e.g. ``info['tokens_cache']['hits']``.
        """
        ...

Usage example:

from chonkie import OverlapRefinery, Pipeline, RecursiveRules, SentenceChunker

# Split the text into sentence-based chunks.
chunker = SentenceChunker(chunk_size=512)
chunks = chunker("Sentence one. Sentence two. Sentence three...")

# Overlap using the default "suffix" method and a fixed budget of 128 tokens.
refinery = OverlapRefinery(method="suffix", context_size=128)
overlapped_chunks = refinery(chunks)

# Inspect the refined chunks: print the text, and the context when one is set.
for chunk in overlapped_chunks:
    print(f"Main text: {chunk.text}")
    if chunk.context:
        print(f"Context: {chunk.context}")

# Fractional overlap: 0.25 means 25% of the chunk size.
# merge=False keeps the context stored separately from the chunk text.
fractional_refinery = OverlapRefinery(merge=False, context_size=0.25)

# Recursive mode, driven by the "default" rules recipe.
rules = RecursiveRules.from_recipe("default")
recursive_refinery = OverlapRefinery(rules=rules, context_size=100, mode="recursive")

# The same kind of refinement expressed as a pipeline, built one stage at a time.
pipe = Pipeline().chunk_with("token", chunk_size=512)
pipe = pipe.refine_with("overlap", context_size=0.2, method="suffix")

doc = pipe.run("Your text...")

Refinery Imports

All refineries are available from the main package:

from chonkie import (
    BaseRefinery,
    EmbeddingsRefinery,
    OverlapRefinery,
)

Pipeline Usage

Refineries are used in pipelines via the refine_with() method:

from chonkie import Pipeline

# Build the pipeline stage by stage; each call's return value is the
# object the next stage is added to.
pipe = Pipeline().chunk_with("recursive", chunk_size=512)
pipe = pipe.refine_with("overlap", context_size=128)
pipe = pipe.refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")

Refinery aliases:

  • overlap - OverlapRefinery
  • embeddings - EmbeddingsRefinery

Custom Refineries

Create custom refineries by extending BaseRefinery:

from chonkie import BaseRefinery, Chunk

class CustomRefinery(BaseRefinery):
    """Example refinery that labels every chunk with a configurable value."""

    def __init__(self, my_param: str = "default"):
        super().__init__()
        self.my_param = my_param

    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """Label each chunk in place and return the same list."""
        # Custom refinement logic
        for chunk in chunks:
            # Modify chunks as needed.
            # NOTE(review): this replaces any existing metadata and assumes
            # Chunk exposes a writable `metadata` attribute — confirm.
            chunk.metadata = {"processed_by": self.my_param}
        return chunks

# Register with pipeline
from chonkie.pipeline import refinery

@refinery("custom")
class RegisteredRefinery(BaseRefinery):
    """Pass-through refinery registered under the pipeline alias "custom"."""

    def __init__(self, my_param: str = "default"):
        # The pipeline call below passes my_param="value" for this alias.
        # Elsewhere in these docs, refine_with keywords match constructor
        # parameters, so the registered class must accept the keyword.
        super().__init__()
        self.my_param = my_param

    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """Return the chunks unchanged."""
        return chunks

# Use in pipeline
from chonkie import Pipeline

pipe = Pipeline().refine_with("custom", my_param="value")

Combining Refineries

Multiple refineries can be chained together:

from chonkie import EmbeddingsRefinery, OverlapRefinery, Pipeline, TokenChunker

# Produce the initial chunks.
chunker = TokenChunker(chunk_size=512)
chunks = chunker("Your text here...")

# Build both refineries up front.
overlap_refinery = OverlapRefinery(context_size=128)
embedding_refinery = EmbeddingsRefinery(embedding_model="all-MiniLM-L6-v2")

# Apply them in sequence: overlap first, then embeddings on the overlapped chunks.
chunks = embedding_refinery(overlap_refinery(chunks))

# Equivalent pipeline form, assembled one stage at a time.
pipe = Pipeline().chunk_with("token", chunk_size=512)
pipe = pipe.refine_with("overlap", context_size=128)
pipe = pipe.refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")

Performance Considerations

EmbeddingsRefinery

  • Uses batch embedding for efficiency
  • Embedding model is loaded once and reused
  • Supports all embedding providers via AutoEmbeddings

OverlapRefinery

  • Uses LRU caching for tokenization to improve performance
  • inplace=True modifies chunks directly (faster, lower memory)
  • inplace=False creates copies (safer for parallel processing)
  • merge=True combines context with text (simpler)
  • merge=False stores context separately (preserves original)

Usage example:

from chonkie import OverlapRefinery

# Fast configuration: modify the chunks directly and combine the context
# with their text.
refinery = OverlapRefinery(merge=True, inplace=True)

# Configuration that is safer for parallel processing: work on copies and
# keep the context stored separately.
refinery = OverlapRefinery(merge=False, inplace=False)

# Drop the cached tokenization results when memory is tight.
refinery.clear_cache()

# Report hit/miss counters for both internal caches.
info = refinery.cache_info()
print(f"Tokens cache - hits: {info['tokens_cache']['hits']}, misses: {info['tokens_cache']['misses']}")
print(f"Count cache - hits: {info['count_cache']['hits']}, misses: {info['count_cache']['misses']}")