tessl/pypi-chonkie

tessl install tessl/pypi-chonkie@1.5.0

The lightweight ingestion library for fast, efficient and robust RAG pipelines


Chunkers

A comprehensive collection of ten text chunking strategies, built on a shared BaseChunker base class, covering use cases from simple token-based splitting to advanced semantic and LLM-guided chunking.

Capabilities

BaseChunker

Abstract base class providing common functionality for all chunker implementations.

from abc import ABC, abstractmethod
from typing import Union, Sequence, Optional

class BaseChunker(ABC):
    """
    Base class for all chunker implementations.

    Args:
        tokenizer: Tokenizer instance or string identifier (e.g., 'gpt2', 'character')
    """
    def __init__(self, tokenizer: Union[str, TokenizerProtocol] = "gpt2"): ...

    @abstractmethod
    def chunk(self, text: str) -> list[Chunk]:
        """
        Chunks a single text string.

        Args:
            text: Input text to chunk

        Returns:
            List of Chunk objects
        """
        ...

    def chunk_batch(
        self,
        texts: Sequence[str],
        show_progress: bool = True
    ) -> list[list[Chunk]]:
        """
        Chunks multiple texts with optional progress bar.

        Args:
            texts: Sequence of input texts
            show_progress: Whether to display progress bar

        Returns:
            List of chunk lists, one per input text
        """
        ...

    def chunk_document(self, document: Document) -> Document:
        """
        Chunks a document object and updates its chunks.

        Args:
            document: Document to chunk

        Returns:
            Document with populated chunks list
        """
        ...

    def __call__(
        self,
        text: Union[str, Sequence[str]],
        show_progress: bool = True
    ) -> Union[list[Chunk], list[list[Chunk]]]:
        """
        Allows calling chunker as a function.

        Args:
            text: Single text string or sequence of texts
            show_progress: Whether to display progress bar for batch processing

        Returns:
            Chunks for single text or list of chunk lists for batch
        """
        ...

    @property
    def tokenizer(self) -> AutoTokenizer:
        """Returns the tokenizer instance."""
        ...
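
Usage example (a minimal sketch of the shared interface; TokenChunker stands in here for any concrete implementation):

from chonkie import TokenChunker

chunker = TokenChunker(chunk_size=512)

# chunk() processes a single string
chunks = chunker.chunk("Some long text...")

# chunk_batch() processes a sequence of strings with an optional progress bar
batch = chunker.chunk_batch(["Text 1", "Text 2"], show_progress=False)

# Calling the chunker directly dispatches to chunk() or chunk_batch() by input type
same_chunks = chunker("Some long text...")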

TokenChunker

Splits text into fixed-size token chunks with optional overlap. This is the simplest and fastest chunking strategy.

class TokenChunker(BaseChunker):
    """
    Splits text into chunks of a specified token size with optional overlap.

    Args:
        tokenizer: Tokenizer instance or identifier (default: 'character')
        chunk_size: Maximum number of tokens per chunk (default: 2048)
        chunk_overlap: Number of tokens to overlap between chunks, or fraction if < 1.0 (default: 0)
    """
    def __init__(
        self,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        chunk_size: int = 2048,
        chunk_overlap: Union[int, float] = 0
    ): ...

    def chunk(self, text: str) -> list[Chunk]:
        """
        Chunks text by token count.

        Args:
            text: Input text to chunk

        Returns:
            List of Chunk objects with fixed token sizes
        """
        ...

Usage example:

from chonkie import TokenChunker

# Create chunker with GPT-2 tokenizer
chunker = TokenChunker(tokenizer="gpt2", chunk_size=512, chunk_overlap=50)

# Chunk text
chunks = chunker("Your long text here...")

# Process multiple texts
texts = ["Text 1", "Text 2", "Text 3"]
batch_chunks = chunker(texts)
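
Since chunk_overlap treats a float below 1.0 as a fraction of chunk_size, overlap can also be given proportionally:

# Overlap by 10% of chunk_size (floats < 1.0 are interpreted as fractions)
chunker = TokenChunker(tokenizer="gpt2", chunk_size=512, chunk_overlap=0.1)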

SentenceChunker

Splits text based on sentence boundaries while respecting token limits, preserving semantic units.

class SentenceChunker(BaseChunker):
    """
    Splits text into chunks based on sentence boundaries while respecting token limits.

    Args:
        tokenizer: Tokenizer instance or identifier (default: 'character')
        chunk_size: Maximum number of tokens per chunk (default: 2048)
        chunk_overlap: Number of sentences to overlap between chunks (default: 0)
        min_sentences_per_chunk: Minimum sentences required per chunk (default: 1)
        min_characters_per_sentence: Minimum characters for valid sentence (default: 12)
        approximate: If True, use faster approximate sentence detection (default: False)
        delim: Sentence delimiter(s) (default: ['. ', '! ', '? ', '\\n'])
        include_delim: Include delimiter with 'prev' or 'next' chunk (default: 'prev')
    """
    def __init__(
        self,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        chunk_size: int = 2048,
        chunk_overlap: int = 0,
        min_sentences_per_chunk: int = 1,
        min_characters_per_sentence: int = 12,
        approximate: bool = False,
        delim: Union[str, list[str]] = [". ", "! ", "? ", "\n"],
        include_delim: Optional[Literal["prev", "next"]] = "prev"
    ): ...

    def chunk(self, text: str) -> list[Chunk]:
        """
        Chunks text by sentence boundaries.

        Args:
            text: Input text to chunk

        Returns:
            List of Chunk objects aligned to sentence boundaries
        """
        ...

    @classmethod
    def from_recipe(
        cls,
        name: str,
        lang: Optional[str] = "en",
        **kwargs
    ) -> SentenceChunker:
        """
        Creates chunker from a predefined recipe.

        Args:
            name: Recipe name
            lang: Language code
            **kwargs: Additional parameters for the chunker

        Returns:
            Configured SentenceChunker instance
        """
        ...

Usage example:

from chonkie import SentenceChunker

# Create sentence-based chunker
chunker = SentenceChunker(
    tokenizer="gpt2",
    chunk_size=512,
    chunk_overlap=1,  # Overlap by 1 sentence
    min_sentences_per_chunk=2
)

chunks = chunker("First sentence. Second sentence. Third sentence.")

RecursiveChunker

Recursively splits text using hierarchical rules (paragraphs, sentences, words, etc.) for semantically meaningful chunks.

class RecursiveChunker(BaseChunker):
    """
    Recursively splits text using hierarchical rules.

    Args:
        tokenizer: Tokenizer instance or identifier (default: 'character')
        chunk_size: Maximum number of tokens per chunk (default: 2048)
        rules: RecursiveRules defining the hierarchy (default: RecursiveRules())
        min_characters_per_chunk: Minimum characters required per chunk (default: 24)
    """
    def __init__(
        self,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        chunk_size: int = 2048,
        rules: RecursiveRules = ...,
        min_characters_per_chunk: int = 24
    ): ...

    def chunk(self, text: str) -> list[Chunk]:
        """
        Chunks text recursively according to hierarchical rules.

        Args:
            text: Input text to chunk

        Returns:
            List of Chunk objects created using recursive splitting
        """
        ...

    @classmethod
    def from_recipe(
        cls,
        name: Optional[str] = "default",
        lang: Optional[str] = "en",
        path: Optional[str] = None,
        **kwargs
    ) -> RecursiveChunker:
        """
        Creates chunker from a predefined recipe.

        Args:
            name: Recipe name (e.g., 'default', 'markdown', 'code')
            lang: Language code
            path: Optional path to custom recipe file
            **kwargs: Additional parameters for the chunker

        Returns:
            Configured RecursiveChunker instance
        """
        ...

Usage example:

from chonkie import RecursiveChunker, RecursiveRules

# Use default rules
chunker = RecursiveChunker(chunk_size=512)

# Use markdown-specific rules
chunker = RecursiveChunker.from_recipe(name="markdown", chunk_size=512)

# Custom rules
from chonkie import RecursiveLevel
rules = RecursiveRules(levels=[
    RecursiveLevel(delimiters=["\n\n"]),  # Split on paragraphs first
    RecursiveLevel(delimiters=[". "]),     # Then sentences
    RecursiveLevel(whitespace=True)        # Finally words
])
chunker = RecursiveChunker(rules=rules, chunk_size=512)

chunks = chunker("Your text here...")

SemanticChunker

Uses embedding-based semantic similarity and peak detection to find optimal chunk boundaries.

class SemanticChunker(BaseChunker):
    """
    Uses embedding similarity to find optimal chunk boundaries.

    Args:
        embedding_model: Embedding model identifier or instance (default: 'minishlab/potion-base-32M')
        threshold: Similarity threshold for splitting (0.0-1.0, default: 0.8)
        chunk_size: Maximum number of tokens per chunk (default: 2048)
        similarity_window: Window size for similarity computation (default: 3)
        min_sentences_per_chunk: Minimum sentences per chunk (default: 1)
        min_characters_per_sentence: Minimum characters per sentence (default: 24)
        delim: Sentence delimiters (default: ['. ', '! ', '? ', '\\n'])
        include_delim: Include delimiter with 'prev' or 'next' (default: 'prev')
        skip_window: Skip window size for peak detection (default: 0)
        filter_window: Savitzky-Golay filter window size (default: 5)
        filter_polyorder: Savitzky-Golay polynomial order (default: 3)
        filter_tolerance: Filter tolerance for smoothing (default: 0.2)
        **kwargs: Additional arguments for embedding model
    """
    def __init__(
        self,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-base-32M",
        threshold: float = 0.8,
        chunk_size: int = 2048,
        similarity_window: int = 3,
        min_sentences_per_chunk: int = 1,
        min_characters_per_sentence: int = 24,
        delim: Union[str, list[str]] = [". ", "! ", "? ", "\n"],
        include_delim: Optional[Literal["prev", "next"]] = "prev",
        skip_window: int = 0,
        filter_window: int = 5,
        filter_polyorder: int = 3,
        filter_tolerance: float = 0.2,
        **kwargs: dict[str, Any]
    ): ...

    def chunk(self, text: str) -> list[Chunk]:
        """
        Chunks text using semantic similarity analysis.

        Args:
            text: Input text to chunk

        Returns:
            List of Chunk objects with embeddings, split at semantic boundaries
        """
        ...

Usage example:

from chonkie import SemanticChunker

# Create semantic chunker
chunker = SemanticChunker(
    embedding_model="all-MiniLM-L6-v2",
    threshold=0.75,
    chunk_size=512
)

chunks = chunker("Your text here...")

# Chunks include embeddings
for chunk in chunks:
    print(f"Text: {chunk.text}")
    print(f"Embedding shape: {chunk.embedding.shape}")

CodeChunker

Specialized chunker for source code using tree-sitter AST parsing to create structurally meaningful chunks.

Note: CodeChunker requires optional dependencies. Install with: pip install chonkie[code] or manually install tree-sitter-language-pack and magika.

class CodeChunker(BaseChunker):
    """
    Specialized chunker for source code using AST parsing.
    Requires optional dependencies: tree-sitter-language-pack, magika

    Args:
        tokenizer: Tokenizer instance or identifier (default: 'character')
        chunk_size: Maximum number of tokens per chunk (default: 2048)
        language: Programming language for parsing or 'auto' for detection (default: 'auto')
        include_nodes: If True, include AST node information in chunk metadata (default: False)
    """
    def __init__(
        self,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        chunk_size: int = 2048,
        language: Union[Literal["auto"], Any] = "auto",
        include_nodes: bool = False
    ): ...

    def chunk(self, text: str) -> list[Chunk]:
        """
        Chunks code using AST structure.

        Args:
            text: Source code to chunk

        Returns:
            List of Chunk objects aligned to code structure (functions, classes, etc.)
        """
        ...

Usage example:

from chonkie import CodeChunker

# Create code chunker with auto-detection
chunker = CodeChunker(language="auto", chunk_size=512)

# Or specify language explicitly
python_chunker = CodeChunker(language="python", chunk_size=512)

code = """
def function1():
    pass

class MyClass:
    def method1(self):
        pass
"""

chunks = chunker(code)
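
With include_nodes=True, the chunker also attaches AST node information to each chunk's metadata (per the constructor docs above); a minimal sketch:

# Keep tree-sitter node metadata alongside the chunk text
chunker = CodeChunker(language="python", chunk_size=512, include_nodes=True)
chunks = chunker(code)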

LateChunker

Combines recursive chunking with late-stage embedding computation for semantic optimization.

class LateChunker(BaseChunker):
    """
    Recursively chunks with embedding-based merging at late stage.

    Args:
        embedding_model: Embedding model identifier or instance (default: 'nomic-ai/modernbert-embed-base')
        chunk_size: Maximum number of tokens per chunk (default: 2048)
        rules: RecursiveRules defining the hierarchy (default: RecursiveRules())
        min_characters_per_chunk: Minimum characters per chunk (default: 24)
        **kwargs: Additional arguments for embedding model
    """
    def __init__(
        self,
        embedding_model: Union[str, SentenceTransformerEmbeddings, Any] = "nomic-ai/modernbert-embed-base",
        chunk_size: int = 2048,
        rules: RecursiveRules = ...,
        min_characters_per_chunk: int = 24,
        **kwargs: Any
    ): ...

    def chunk(self, text: str) -> list[Chunk]:
        """
        Recursively chunks with late-stage semantic merging.

        Args:
            text: Input text to chunk

        Returns:
            List of Chunk objects with embeddings
        """
        ...

    @classmethod
    def from_recipe(cls, **kwargs) -> LateChunker:
        """Creates chunker from recipe."""
        ...

Usage example:

from chonkie import LateChunker

# Create late chunker
chunker = LateChunker(
    embedding_model="all-MiniLM-L6-v2",
    chunk_size=512
)

chunks = chunker("Your text here...")

SlumberChunker

Uses an LLM (via a Genie integration) to intelligently determine chunk boundaries; also known as AgenticChunker.

class SlumberChunker(BaseChunker):
    """
    Uses LLM to intelligently determine chunk boundaries.

    Args:
        genie: LLM integration instance (e.g., OpenAIGenie) (default: None)
        tokenizer: Tokenizer instance or identifier (default: 'character')
        chunk_size: Maximum number of tokens per chunk (default: 2048)
        rules: RecursiveRules for initial splitting (default: RecursiveRules())
        candidate_size: Token size for candidate chunks presented to LLM (default: 128)
        min_characters_per_chunk: Minimum characters per chunk (default: 24)
        verbose: If True, print LLM decision details (default: True)
    """
    def __init__(
        self,
        genie: Optional[BaseGenie] = None,
        tokenizer: Union[str, TokenizerProtocol] = "character",
        chunk_size: int = 2048,
        rules: RecursiveRules = ...,
        candidate_size: int = 128,
        min_characters_per_chunk: int = 24,
        verbose: bool = True
    ): ...

    def chunk(self, text: str) -> list[Chunk]:
        """
        Chunks using LLM-guided decisions.

        Args:
            text: Input text to chunk

        Returns:
            List of Chunk objects determined by LLM analysis
        """
        ...

Usage example:

from chonkie import SlumberChunker, OpenAIGenie

# Create with OpenAI
genie = OpenAIGenie(model="gpt-4", api_key="your-key")
chunker = SlumberChunker(genie=genie, chunk_size=512)

chunks = chunker("Complex text requiring intelligent splitting...")

NeuralChunker

Uses neural token classification models to predict chunk boundaries.

class NeuralChunker(BaseChunker):
    """
    Uses neural models to predict chunk boundaries.

    Args:
        model: Model identifier or instance (default: 'mirth/chonky_distilbert_base_uncased_1')
        tokenizer: Tokenizer for the model (default: None, uses model's tokenizer)
        device_map: Device mapping for model (default: 'auto')
        min_characters_per_chunk: Minimum characters per chunk (default: 10)
        stride: Stride for sliding window processing (default: None)
    """
    def __init__(
        self,
        model: Union[str, Any] = "mirth/chonky_distilbert_base_uncased_1",
        tokenizer: Optional[Union[str, Any]] = None,
        device_map: str = "auto",
        min_characters_per_chunk: int = 10,
        stride: Optional[int] = None
    ): ...

    def chunk(self, text: str) -> list[Chunk]:
        """
        Chunks using neural boundary detection.

        Args:
            text: Input text to chunk

        Returns:
            List of Chunk objects identified by neural model
        """
        ...

Usage example:

from chonkie import NeuralChunker

# Create neural chunker (requires optional dependencies, e.g. pip install chonkie[neural])
chunker = NeuralChunker(min_characters_per_chunk=50)

chunks = chunker("Your text here...")

TableChunker

Specialized chunker for markdown tables that preserves table structure while splitting by rows.

class TableChunker(BaseChunker):
    """
    Specialized chunker for markdown tables preserving structure.

    Args:
        tokenizer: Tokenizer instance or identifier (default: 'row')
        chunk_size: Maximum number of rows per chunk (default: 3)
    """
    def __init__(
        self,
        tokenizer: Union[str, TokenizerProtocol] = "row",
        chunk_size: int = 3
    ): ...

    def chunk(self, text: str) -> list[Chunk]:
        """
        Chunks markdown tables by rows.

        Args:
            text: Markdown table text

        Returns:
            List of Chunk objects, each containing header + rows
        """
        ...

Usage example:

from chonkie import TableChunker

chunker = TableChunker(chunk_size=5)  # 5 rows per chunk

table_text = """
| Col1 | Col2 | Col3 |
|------|------|------|
| A    | B    | C    |
| D    | E    | F    |
| G    | H    | I    |
"""

chunks = chunker(table_text)

FastChunker

High-performance chunker using the memchunk library for SIMD-accelerated delimiter-based splitting.

class FastChunker(BaseChunker):
    """
    High-performance chunker using memchunk for fast delimiter-based splitting.

    Note: FastChunker uses byte-based splitting and does not use a tokenizer.
    It sets _tokenizer = None internally and operates directly on character positions.

    Args:
        chunk_size: Maximum chunk size in characters (default: 4096)
        delimiters: String of delimiter characters (default: '\\n.?')
        pattern: Optional regex pattern for splitting (default: None)
        prefix: If True, include delimiters as prefix (default: False)
        consecutive: If True, treat consecutive delimiters as one (default: False)
        forward_fallback: If True, use forward search when no delimiter found (default: False)
    """
    def __init__(
        self,
        chunk_size: int = 4096,
        delimiters: str = "\n.?",
        pattern: Optional[str] = None,
        prefix: bool = False,
        consecutive: bool = False,
        forward_fallback: bool = False
    ): ...

    def chunk(self, text: str) -> list[Chunk]:
        """
        Fast delimiter-based chunking.

        Args:
            text: Input text to chunk

        Returns:
            List of Chunk objects created using SIMD-accelerated splitting
        """
        ...

Usage example:

from chonkie import FastChunker

# Create fast chunker (requires optional dependencies: pip install chonkie[fast])
chunker = FastChunker(
    chunk_size=1024,
    delimiters="\n.!?",
    consecutive=True
)

chunks = chunker("Your large text here...")

Chunker Imports

All chunkers are available from the main package:

from chonkie import (
    BaseChunker,
    TokenChunker,
    SentenceChunker,
    RecursiveChunker,
    SemanticChunker,
    CodeChunker,
    LateChunker,
    SlumberChunker,
    NeuralChunker,
    TableChunker,
    FastChunker,
)

Pipeline Usage

Chunkers can be used in the Pipeline API via string aliases:

from chonkie import Pipeline

# Choose a chunking strategy by its alias
pipe = Pipeline().chunk_with("token", chunk_size=512, chunk_overlap=50)

# Swap in a different strategy by changing the alias
pipe = Pipeline().chunk_with("semantic", threshold=0.8)

Chunker aliases:

  • token - TokenChunker
  • sentence - SentenceChunker
  • recursive - RecursiveChunker
  • semantic - SemanticChunker
  • code - CodeChunker
  • late - LateChunker
  • slumber - SlumberChunker
  • neural - NeuralChunker
  • fast - FastChunker
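
Building a pipeline does not process any text by itself; execution is covered in pipeline.md. A hedged sketch, assuming a run() entry point that accepts raw text:

# run() usage here is an assumption; see pipeline.md for the authoritative API
doc = pipe.run(texts="Your long text here...")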