or run

tessl search

Version

Workspace: tessl
Visibility: Public
Created: 11 days ago
Last updated: 1 day ago
Describes: pkg:pypi/chonkie@1.5.x

docs

advanced-features.md chunkers.md core-types.md data-processing.md embeddings.md export.md index.md logging.md pipeline.md refineries.md tokenizers.md vector-databases.md

tile.json

tessl/pypi-chonkie

tessl install tessl/pypi-chonkie@1.5.0

The lightweight ingestion library for fast, efficient and robust RAG pipelines

Refineries

Post-processing components for enhancing chunks with contextual overlap and embeddings after initial chunking.

Capabilities

BaseRefinery

Abstract base class for all refinery implementations.

from abc import ABC, abstractmethod

class BaseRefinery(ABC):
    """Common interface for refineries, the components that post-process chunks."""

    def __init__(self):
        ...

    @abstractmethod
    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """Refine a list of chunks.

        This method is abstract: every concrete refinery must implement it.

        Args:
            chunks: The chunks to refine.

        Returns:
            The refined chunks.
        """
        ...

    def refine_document(self, document: Document) -> Document:
        """Refine the chunks held by a document.

        Args:
            document: The document whose chunks should be refined.

        Returns:
            The document, now carrying the refined chunks.
        """
        ...

    def __call__(self, chunks: list[Chunk]) -> list[Chunk]:
        """Let the refinery be invoked like a function.

        Args:
            chunks: The chunks to refine.

        Returns:
            The refined chunks.
        """
        ...

EmbeddingsRefinery

Adds embeddings to chunks using a specified embedding model.

from typing import Union, Any

class EmbeddingsRefinery(BaseRefinery):
    """
    Adds embeddings to chunks using an embedding model.

    Args:
        embedding_model: Embedding model identifier, BaseEmbeddings instance, or AutoEmbeddings
            (default: 'minishlab/potion-retrieval-32M')
        **kwargs: Additional keyword arguments for the embedding model
    """
    def __init__(
        self,
        embedding_model: Union[str, BaseEmbeddings, AutoEmbeddings] = "minishlab/potion-retrieval-32M",
        # The annotation on **kwargs describes each keyword VALUE, not the
        # collected mapping, so it is Any rather than dict[str, Any].
        **kwargs: Any
    ): ...

    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """
        Adds embeddings to each chunk.

        Args:
            chunks: List of chunks to add embeddings to

        Returns:
            List of chunks with embedding vectors populated
        """
        ...

    @property
    def dimension(self) -> int:
        """
        Returns the embedding dimension.

        Returns:
            Embedding vector dimension
        """
        ...

Usage example:

from chonkie import TokenChunker, EmbeddingsRefinery

# Create chunks
chunker = TokenChunker(chunk_size=512)
chunks = chunker("Your text here...")

# Add embeddings (the refinery is invoked like a function on the chunk list)
refinery = EmbeddingsRefinery(embedding_model="all-MiniLM-L6-v2")
chunks_with_embeddings = refinery(chunks)

# Access embeddings
for chunk in chunks_with_embeddings:
    print(f"Text: {chunk.text}")
    print(f"Embedding shape: {chunk.embedding.shape}")

# Use with different embedding providers.
# Extra keyword arguments (here api_key) are the "additional arguments for the
# embedding model" accepted through **kwargs.
openai_refinery = EmbeddingsRefinery(
    embedding_model="openai/text-embedding-3-small",
    api_key="your-key"
)

# Use in pipeline ("embeddings" is the alias for EmbeddingsRefinery)
from chonkie import Pipeline

pipe = (
    Pipeline()
    .chunk_with("recursive", chunk_size=512)
    .refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
)

doc = pipe.run("Your text...")

OverlapRefinery

Adds contextual overlap between adjacent chunks by including content from neighboring chunks.

from typing import Union, Literal

class OverlapRefinery(BaseRefinery):
    """Refinery that gives each chunk overlapping context from an adjacent chunk.

    Args:
        tokenizer: Tokenizer identifier or instance. Defaults to 'character'.
        context_size: Amount of overlap context. Given in tokens, or treated
            as a fraction when the value is below 1.0. Defaults to 0.25.
        mode: How the overlap is computed: 'token' uses a fixed token count,
            'recursive' is hierarchical. Defaults to 'token'.
        method: Where the context comes from: 'suffix' adds the end of the
            previous chunk, 'prefix' adds the start of the next chunk.
            Defaults to 'suffix'.
        rules: RecursiveRules used in recursive mode. Defaults to
            RecursiveRules().
        merge: When True the context is merged into the chunk text; when
            False it is stored separately. Defaults to True.
        inplace: When True the chunks are modified in place; when False
            copies are created. Defaults to True.
    """

    def __init__(
        self,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        context_size: Union[int, float] = 0.25,
        mode: Literal["token", "recursive"] = "token",
        method: Literal["suffix", "prefix"] = "suffix",
        rules: RecursiveRules = ...,
        merge: bool = True,
        inplace: bool = True
    ):
        ...

    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """Add overlap context to the given chunks.

        Args:
            chunks: The chunks that should receive overlap context.

        Returns:
            The chunks with contextual overlap added.
        """
        ...

    def clear_cache(self) -> None:
        """Clear the internal LRU cache used for tokenization."""
        ...

    def cache_info(self) -> dict:
        """Report statistics about the tokenization caches.

        Returns:
            A dictionary with the keys 'tokens_cache' and 'count_cache'. Each
            value is itself a dict with the fields 'hits', 'misses',
            'maxsize', and 'currsize'.
        """
        ...

Usage example:

from chonkie import SentenceChunker, OverlapRefinery

# Create chunks
chunker = SentenceChunker(chunk_size=512)
chunks = chunker("Sentence one. Sentence two. Sentence three...")

# Add overlap with suffix method (default)
refinery = OverlapRefinery(
    context_size=128,  # 128 tokens of overlap
    method="suffix"    # Include end of previous chunk
)
overlapped_chunks = refinery(chunks)

# Each chunk now includes context from the previous chunk
for chunk in overlapped_chunks:
    print(f"Main text: {chunk.text}")
    # Only print the context when the chunk actually carries one
    if chunk.context:
        print(f"Context: {chunk.context}")

# Use fractional overlap (a context_size below 1.0 is read as a fraction)
fractional_refinery = OverlapRefinery(
    context_size=0.25,  # 25% of chunk size
    merge=False         # Store context separately
)

# Recursive mode for semantic overlap; `rules` supplies the RecursiveRules
# that recursive mode uses
from chonkie import RecursiveRules

rules = RecursiveRules.from_recipe("default")
recursive_refinery = OverlapRefinery(
    mode="recursive",
    context_size=100,
    rules=rules
)

# Use in pipeline ("overlap" is the alias for OverlapRefinery)
from chonkie import Pipeline

pipe = (
    Pipeline()
    .chunk_with("token", chunk_size=512)
    .refine_with("overlap", context_size=0.2, method="suffix")
)

doc = pipe.run("Your text...")

Refinery Imports

All refineries are available from the main package:

from chonkie import (
    BaseRefinery,
    EmbeddingsRefinery,
    OverlapRefinery,
)

Pipeline Usage

Refineries are used in pipelines via the refine_with() method:

from chonkie import Pipeline

# Each refine_with() call names a refinery by its alias ("overlap",
# "embeddings") and supplies that refinery's options as keyword arguments.
pipe = (
    Pipeline()
    .chunk_with("recursive", chunk_size=512)
    .refine_with("overlap", context_size=128)
    .refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
)

Refinery aliases:

overlap - OverlapRefinery
embeddings - EmbeddingsRefinery

Custom Refineries

Create custom refineries by extending BaseRefinery:

from chonkie import BaseRefinery, Chunk

class CustomRefinery(BaseRefinery):
    """Example refinery that stamps every chunk with a configurable label."""

    def __init__(self, my_param: str = "default"):
        """Remember the label that refine() writes onto each chunk.

        Args:
            my_param: Value recorded under the "processed_by" metadata key.
        """
        super().__init__()
        self.my_param = my_param

    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """Set each chunk's metadata and hand back the same list.

        Args:
            chunks: The chunks to mark; they are modified in place.

        Returns:
            The same list that was passed in.
        """
        # Custom refinement logic: each chunk gets its own fresh metadata dict.
        for item in chunks:
            stamp = {"processed_by": self.my_param}
            item.metadata = stamp
        return chunks

# Register with pipeline
from chonkie.pipeline import refinery

@refinery("custom")
class RegisteredRefinery(BaseRefinery):
    """Minimal refinery registered under the "custom" pipeline alias."""

    def __init__(self, my_param: str = "default"):
        """Accept the option that the pipeline example passes in.

        Args:
            my_param: Example configuration value; stored on the instance.
        """
        # BaseRefinery.__init__ takes no arguments, so the my_param keyword
        # used with refine_with("custom", ...) must be declared here.
        super().__init__()
        self.my_param = my_param

    def refine(self, chunks: list[Chunk]) -> list[Chunk]:
        """Return the chunks unchanged.

        Args:
            chunks: List of chunks to refine

        Returns:
            The same list of chunks
        """
        return chunks

# Use in pipeline
from chonkie import Pipeline

# Keyword arguments given to refine_with() configure the refinery selected by
# the alias, as context_size does for "overlap".
pipe = Pipeline().refine_with("custom", my_param="value")

Combining Refineries

Multiple refineries can be chained together:

from chonkie import TokenChunker, OverlapRefinery, EmbeddingsRefinery

# Create chunks
chunker = TokenChunker(chunk_size=512)
chunks = chunker("Your text here...")

# Apply multiple refineries in sequence
overlap_refinery = OverlapRefinery(context_size=128)
embedding_refinery = EmbeddingsRefinery(embedding_model="all-MiniLM-L6-v2")

# Chain refineries: the output of the overlap step is the input of the
# embedding step
chunks = overlap_refinery(chunks)
chunks = embedding_refinery(chunks)

# Or use pipeline (same two refineries, selected by alias)
from chonkie import Pipeline

pipe = (
    Pipeline()
    .chunk_with("token", chunk_size=512)
    .refine_with("overlap", context_size=128)
    .refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
)

Performance Considerations

EmbeddingsRefinery

Uses batch embedding for efficiency
Embedding model is loaded once and reused
Supports all embedding providers via AutoEmbeddings

OverlapRefinery

Uses LRU caching for tokenization to improve performance
inplace=True modifies chunks directly (faster, lower memory)
inplace=False creates copies (safer for parallel processing)
merge=True combines context with text (simpler)
merge=False stores context separately (preserves original)

from chonkie import OverlapRefinery

# High-performance settings
refinery = OverlapRefinery(
    inplace=True,   # Modify in place
    merge=True      # Combine context with text
)

# Safe for parallel processing
# NOTE: this rebinds `refinery`, so the cache calls below act on this second
# instance, not on the high-performance one above.
refinery = OverlapRefinery(
    inplace=False,  # Create copies
    merge=False     # Separate context
)

# Clear cache if memory is constrained
refinery.clear_cache()

# Check cache performance
# cache_info() returns a dict with 'tokens_cache' and 'count_cache' entries,
# each holding 'hits', 'misses', 'maxsize' and 'currsize'.
info = refinery.cache_info()
print(f"Tokens cache - hits: {info['tokens_cache']['hits']}, misses: {info['tokens_cache']['misses']}")
print(f"Count cache - hits: {info['count_cache']['hits']}, misses: {info['count_cache']['misses']}")

Version

tessl/pypi-chonkie

docs/refineries.md

Refineries

Capabilities

BaseRefinery

EmbeddingsRefinery

OverlapRefinery

Refinery Imports

Pipeline Usage

Custom Refineries

Combining Refineries

Performance Considerations

EmbeddingsRefinery

OverlapRefinery

docs/refineries.md