tessl/pypi-chonkie

tessl install tessl/pypi-chonkie@1.5.0

The lightweight ingestion library for fast, efficient and robust RAG pipelines

docs/tokenizers.md

Tokenizers

Flexible tokenization system supporting character-level, word-level, and byte-level tokenization, plus integration with HuggingFace tokenizers and tiktoken.

Capabilities

TokenizerProtocol

Protocol defining the interface that all tokenizers must implement.

from typing import Protocol, Sequence, Union

class TokenizerProtocol(Protocol):
    """
    Protocol defining the interface for tokenizers.
    """
    def encode(self, text: str) -> Sequence[int]:
        """
        Encodes text to token IDs.

        Args:
            text: Input text

        Returns:
            Sequence of integer token IDs
        """
        ...

    def decode(self, tokens: Sequence[int]) -> str:
        """
        Decodes token IDs back to text.

        Args:
            tokens: Sequence of token IDs

        Returns:
            Decoded text string
        """
        ...

    def tokenize(self, text: str) -> Sequence[Union[str, int]]:
        """
        Tokenizes text into tokens.

        Args:
            text: Input text

        Returns:
            Sequence of tokens (strings or integers)
        """
        ...
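
Because the protocol is structural, any object with matching encode, decode, and tokenize methods can be passed wherever a tokenizer is expected, without subclassing anything. A minimal sketch (the WhitespaceTokenizer class and describe helper below are hypothetical, for illustration only):

from typing import Sequence, Union

from chonkie import TokenizerProtocol

class WhitespaceTokenizer:
    """Hypothetical tokenizer that satisfies TokenizerProtocol structurally."""
    def __init__(self):
        self._vocab: dict = {}

    def tokenize(self, text: str) -> Sequence[Union[str, int]]:
        return text.split()

    def encode(self, text: str) -> Sequence[int]:
        # Assign incremental IDs to unseen words
        return [self._vocab.setdefault(w, len(self._vocab)) for w in text.split()]

    def decode(self, tokens: Sequence[int]) -> str:
        id2word = {i: w for w, i in self._vocab.items()}
        return " ".join(id2word[t] for t in tokens)

def describe(tokenizer: TokenizerProtocol, text: str) -> None:
    # Type checkers accept WhitespaceTokenizer here because it matches the protocol
    print(tokenizer.tokenize(text), tokenizer.encode(text))

describe(WhitespaceTokenizer(), "hello structural typing")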

Tokenizer

Abstract base class for custom tokenizers; it implements the TokenizerProtocol interface.

from abc import ABC, abstractmethod
from typing import Sequence, Union

class Tokenizer(ABC):
    """
    Base class for custom tokenizers.
    """
    def __init__(self): ...

    @abstractmethod
    def encode(self, text: str) -> Sequence[int]:
        """
        Encodes text to token IDs.

        Args:
            text: Input text

        Returns:
            Sequence of token IDs
        """
        ...

    @abstractmethod
    def decode(self, tokens: Sequence[int]) -> str:
        """
        Decodes token IDs to text.

        Args:
            tokens: Sequence of token IDs

        Returns:
            Decoded text
        """
        ...

    @abstractmethod
    def tokenize(self, text: str) -> Sequence[Union[str, int]]:
        """
        Tokenizes text.

        Args:
            text: Input text

        Returns:
            Sequence of tokens
        """
        ...

    def count_tokens(self, text: str) -> int:
        """
        Counts tokens in text.

        Args:
            text: Input text

        Returns:
            Number of tokens
        """
        ...

    def encode_batch(self, texts: Sequence[str]) -> Sequence[Sequence[int]]:
        """
        Batch encodes multiple texts.

        Args:
            texts: Sequence of input texts

        Returns:
            Sequence of token ID sequences
        """
        ...

    def decode_batch(
        self,
        token_sequences: Sequence[Sequence[int]]
    ) -> Sequence[str]:
        """
        Batch decodes multiple token sequences.

        Args:
            token_sequences: Sequence of token ID sequences

        Returns:
            Sequence of decoded texts
        """
        ...

    def count_tokens_batch(self, texts: Sequence[str]) -> Sequence[int]:
        """
        Batch counts tokens in multiple texts.

        Args:
            texts: Sequence of input texts

        Returns:
            Sequence of token counts
        """
        ...

    def get_vocab(self) -> Sequence[str]:
        """
        Returns the vocabulary.

        Returns:
            Sequence of vocabulary tokens
        """
        ...

    def get_token2id(self) -> dict:
        """
        Returns token-to-id mapping.

        Returns:
            Dictionary mapping tokens to IDs
        """
        ...
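
The non-abstract helpers (count_tokens, the batch methods, and the vocabulary accessors) are not marked abstract, so subclasses typically only need to provide encode, decode, and tokenize. A brief sketch of the batch helpers using the CharacterTokenizer documented below (the concrete return values shown are illustrative; the API only promises sequences):

from chonkie import CharacterTokenizer

tokenizer = CharacterTokenizer()

ids = tokenizer.encode_batch(["Hi", "Yo"])              # one sequence of code points per text
counts = tokenizer.count_tokens_batch(["Hi", "Hello"])  # e.g. [2, 5]
texts = tokenizer.decode_batch(ids)                     # round-trips to ["Hi", "Yo"]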

CharacterTokenizer

Character-level tokenizer that treats each character as a token.

class CharacterTokenizer(Tokenizer):
    """
    Character-level tokenizer treating each character as a token.
    """
    def __init__(self): ...

    def tokenize(self, text: str) -> Sequence[str]:
        """
        Splits text into characters.

        Args:
            text: Input text

        Returns:
            Sequence of individual characters
        """
        ...

    def encode(self, text: str) -> Sequence[int]:
        """
        Encodes text to character code points.

        Args:
            text: Input text

        Returns:
            Sequence of character code point IDs
        """
        ...

    def decode(self, tokens: Sequence[int]) -> str:
        """
        Decodes character code points to text.

        Args:
            tokens: Sequence of character IDs

        Returns:
            Decoded text
        """
        ...

    def count_tokens(self, text: str) -> int:
        """
        Returns character count.

        Args:
            text: Input text

        Returns:
            Number of characters
        """
        ...

Usage example:

from chonkie import CharacterTokenizer

tokenizer = CharacterTokenizer()

# Tokenize
tokens = tokenizer.tokenize("Hello")  # ['H', 'e', 'l', 'l', 'o']

# Count
count = tokenizer.count_tokens("Hello")  # 5

# Encode/decode
encoded = tokenizer.encode("Hi")
decoded = tokenizer.decode(encoded)

WordTokenizer

Word-level tokenizer that splits on whitespace.

class WordTokenizer(Tokenizer):
    """
    Word-level tokenizer splitting on whitespace.
    """
    def __init__(self): ...

    def tokenize(self, text: str) -> Sequence[str]:
        """
        Splits text into words.

        Args:
            text: Input text

        Returns:
            Sequence of words
        """
        ...

    def encode(self, text: str) -> Sequence[int]:
        """
        Encodes text to word IDs.

        Args:
            text: Input text

        Returns:
            Sequence of word IDs
        """
        ...

    def decode(self, tokens: Sequence[int]) -> str:
        """
        Decodes word IDs to text.

        Args:
            tokens: Sequence of word IDs

        Returns:
            Decoded text
        """
        ...

    def count_tokens(self, text: str) -> int:
        """
        Returns word count.

        Args:
            text: Input text

        Returns:
            Number of words
        """
        ...

Usage example:

from chonkie import WordTokenizer

tokenizer = WordTokenizer()

# Tokenize
tokens = tokenizer.tokenize("Hello world")  # ['Hello', 'world']

# Count
count = tokenizer.count_tokens("Hello world")  # 2

ByteTokenizer

Byte-level tokenizer that operates on UTF-8 bytes.

class ByteTokenizer(Tokenizer):
    """
    Byte-level tokenizer operating on UTF-8 bytes.
    """
    def __init__(self): ...

    def tokenize(self, text: str) -> Sequence[int]:
        """
        Converts text to bytes.

        Args:
            text: Input text

        Returns:
            Sequence of byte values
        """
        ...

    def encode(self, text: str) -> Sequence[int]:
        """
        Encodes text to byte values.

        Args:
            text: Input text

        Returns:
            Sequence of byte values
        """
        ...

    def decode(self, tokens: Sequence[int]) -> str:
        """
        Decodes bytes to text.

        Args:
            tokens: Sequence of byte values

        Returns:
            Decoded text
        """
        ...

    def count_tokens(self, text: str) -> int:
        """
        Returns byte count.

        Args:
            text: Input text

        Returns:
            Number of bytes
        """
        ...

Usage example:

from chonkie import ByteTokenizer

tokenizer = ByteTokenizer()

# Encode to UTF-8 byte values
bytes_encoded = tokenizer.encode("Hello")  # [72, 101, 108, 108, 111]

# Count
count = tokenizer.count_tokens("Hello")  # 5 (UTF-8 byte count)

RowTokenizer

Row/line-based tokenizer that treats each line as a token.

class RowTokenizer(Tokenizer):
    """
    Row-based tokenizer treating each line as a token.
    """
    def __init__(self): ...

    def tokenize(self, text: str) -> Sequence[str]:
        """
        Splits text into lines.

        Args:
            text: Input text

        Returns:
            Sequence of lines
        """
        ...

    def encode(self, text: str) -> Sequence[int]:
        """
        Encodes text to line IDs.

        Args:
            text: Input text

        Returns:
            Sequence of line IDs
        """
        ...

    def decode(self, tokens: Sequence[int]) -> str:
        """
        Decodes line IDs to text.

        Args:
            tokens: Sequence of line IDs

        Returns:
            Decoded text
        """
        ...

    def count_tokens(self, text: str) -> int:
        """
        Returns line count.

        Args:
            text: Input text

        Returns:
            Number of lines
        """
        ...

Usage example:

from chonkie import RowTokenizer

tokenizer = RowTokenizer()

text = "Line 1\nLine 2\nLine 3"
tokens = tokenizer.tokenize(text)  # ['Line 1', 'Line 2', 'Line 3']
count = tokenizer.count_tokens(text)  # 3

AutoTokenizer

Auto-loading tokenizer that wraps various tokenizer backends including HuggingFace, tiktoken, and custom tokenizers.

from typing import Any, Callable, Sequence, Union

class AutoTokenizer:
    """
    Auto-loading tokenizer supporting multiple backends.

    Args:
        tokenizer: Tokenizer identifier, callable, or instance
            - 'character': CharacterTokenizer
            - 'word': WordTokenizer
            - 'byte': ByteTokenizer
            - 'row': RowTokenizer
            - 'gpt2', 'gpt-4', etc.: HuggingFace or tiktoken tokenizers
            - Callable: Custom token counting function
            - Any: Existing tokenizer instance
    """
    def __init__(self, tokenizer: Union[str, Callable, Any] = "character"): ...

    def encode(self, text: str) -> Sequence[int]:
        """
        Encodes text using the backend tokenizer.

        Args:
            text: Input text

        Returns:
            Sequence of token IDs
        """
        ...

    def decode(self, tokens: Sequence[int]) -> str:
        """
        Decodes tokens using the backend tokenizer.

        Args:
            tokens: Sequence of token IDs

        Returns:
            Decoded text
        """
        ...

    def count_tokens(self, text: str) -> int:
        """
        Counts tokens using the backend tokenizer.

        Args:
            text: Input text

        Returns:
            Number of tokens
        """
        ...

    def encode_batch(self, texts: Sequence[str]) -> Sequence[Sequence[int]]:
        """
        Batch encodes texts.

        Args:
            texts: Sequence of input texts

        Returns:
            Sequence of token ID sequences
        """
        ...

    def decode_batch(
        self,
        token_sequences: Sequence[Sequence[int]]
    ) -> Sequence[str]:
        """
        Batch decodes token sequences.

        Args:
            token_sequences: Sequence of token ID sequences

        Returns:
            Sequence of decoded texts
        """
        ...

    def count_tokens_batch(self, texts: Sequence[str]) -> Sequence[int]:
        """
        Batch counts tokens.

        Args:
            texts: Sequence of input texts

        Returns:
            Sequence of token counts
        """
        ...

Usage examples:

from chonkie import AutoTokenizer

# Built-in tokenizers
char_tokenizer = AutoTokenizer("character")
word_tokenizer = AutoTokenizer("word")

# HuggingFace tokenizers (requires transformers)
gpt2_tokenizer = AutoTokenizer("gpt2")
bert_tokenizer = AutoTokenizer("bert-base-uncased")

# tiktoken (requires tiktoken)
tiktoken_tokenizer = AutoTokenizer("gpt-4")

# Custom token counter function
def custom_counter(text: str) -> int:
    return len(text.split())

custom_tokenizer = AutoTokenizer(custom_counter)

# Use with chunkers
from chonkie import TokenChunker

chunker = TokenChunker(tokenizer="gpt2", chunk_size=512)

Tokenizer Imports

All tokenizers are available from the main package:

from chonkie import (
    TokenizerProtocol,
    Tokenizer,
    AutoTokenizer,
    CharacterTokenizer,
    WordTokenizer,
    ByteTokenizer,
    RowTokenizer,
)

Integration with Chunkers

All chunkers accept tokenizers via their constructor:

from chonkie import RecursiveChunker, AutoTokenizer

# Using string identifier
chunker1 = RecursiveChunker(tokenizer="gpt2")

# Using AutoTokenizer
tokenizer = AutoTokenizer("character")
chunker2 = RecursiveChunker(tokenizer=tokenizer)

# Using custom function
def my_counter(text: str) -> int:
    return len(text)

chunker3 = RecursiveChunker(tokenizer=my_counter)

Custom Tokenizers

Create custom tokenizers by extending the Tokenizer base class:

from chonkie import Tokenizer
from typing import Sequence, Union

class MyCustomTokenizer(Tokenizer):
    def encode(self, text: str) -> Sequence[int]:
        # Custom encoding logic
        return [ord(c) for c in text]

    def decode(self, tokens: Sequence[int]) -> str:
        # Custom decoding logic
        return ''.join(chr(t) for t in tokens)

    def tokenize(self, text: str) -> Sequence[Union[str, int]]:
        # Custom tokenization logic
        return list(text)

# Use with chunkers
from chonkie import TokenChunker

custom_tokenizer = MyCustomTokenizer()
chunker = TokenChunker(tokenizer=custom_tokenizer)
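
Continuing the example above: because count_tokens and the batch methods are not abstract on the base class, the inherited helpers should work without any extra code (assuming the base class derives them from the three required methods):

count = custom_tokenizer.count_tokens("Hello")                    # 5 with this character-based tokenizer
counts = custom_tokenizer.count_tokens_batch(["a", "bb", "ccc"])  # e.g. [1, 2, 3]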