tessl install tessl/pypi-chonkie@1.5.0

The lightweight ingestion library for fast, efficient and robust RAG pipelines
Flexible tokenization system supporting character-level, word-level, byte-level, and row-level tokenization, as well as integration with HuggingFace tokenizers and tiktoken.
Protocol defining the interface that all tokenizers must implement.
from typing import Protocol, Sequence, Union
class TokenizerProtocol(Protocol):
"""
Protocol defining the interface for tokenizers.
"""
def encode(self, text: str) -> Sequence[int]:
"""
Encodes text to token IDs.
Args:
text: Input text
Returns:
Sequence of integer token IDs
"""
...
def decode(self, tokens: Sequence[int]) -> str:
"""
Decodes token IDs back to text.
Args:
tokens: Sequence of token IDs
Returns:
Decoded text string
"""
...
def tokenize(self, text: str) -> Sequence[Union[str, int]]:
"""
Tokenizes text into tokens.
Args:
text: Input text
Returns:
Sequence of tokens (strings or integers)
"""
...
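
Any object that provides these three methods satisfies the protocol structurally; no inheritance from a library class is required. A minimal sketch (WordLengthTokenizer and describe are hypothetical illustrations, not part of chonkie):
from typing import Sequence, Union
from chonkie import TokenizerProtocol
class WordLengthTokenizer:
    """Toy tokenizer: encodes each word as its length."""
    def encode(self, text: str) -> Sequence[int]:
        return [len(word) for word in text.split()]
    def decode(self, tokens: Sequence[int]) -> str:
        return " ".join("x" * length for length in tokens)  # lossy, for illustration only
    def tokenize(self, text: str) -> Sequence[Union[str, int]]:
        return text.split()
def describe(tokenizer: TokenizerProtocol, text: str) -> int:
    # A static type checker accepts WordLengthTokenizer here because it matches the protocol
    return len(tokenizer.encode(text))
describe(WordLengthTokenizer(), "Hello world")  # 2

Abstract base class for custom tokenizers implementing the TokenizerProtocol.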
from abc import ABC, abstractmethod
from typing import Sequence, Union
class Tokenizer(ABC):
"""
Base class for custom tokenizers.
"""
def __init__(self): ...
@abstractmethod
def encode(self, text: str) -> Sequence[int]:
"""
Encodes text to token IDs.
Args:
text: Input text
Returns:
Sequence of token IDs
"""
...
@abstractmethod
def decode(self, tokens: Sequence[int]) -> str:
"""
Decodes token IDs to text.
Args:
tokens: Sequence of token IDs
Returns:
Decoded text
"""
...
@abstractmethod
def tokenize(self, text: str) -> Sequence[Union[str, int]]:
"""
Tokenizes text.
Args:
text: Input text
Returns:
Sequence of tokens
"""
...
def count_tokens(self, text: str) -> int:
"""
Counts tokens in text.
Args:
text: Input text
Returns:
Number of tokens
"""
...
def encode_batch(self, texts: Sequence[str]) -> Sequence[Sequence[int]]:
"""
Batch encodes multiple texts.
Args:
texts: Sequence of input texts
Returns:
Sequence of token ID sequences
"""
...
def decode_batch(
self,
token_sequences: Sequence[Sequence[int]]
) -> Sequence[str]:
"""
Batch decodes multiple token sequences.
Args:
token_sequences: Sequence of token ID sequences
Returns:
Sequence of decoded texts
"""
...
def count_tokens_batch(self, texts: Sequence[str]) -> Sequence[int]:
"""
Batch counts tokens in multiple texts.
Args:
texts: Sequence of input texts
Returns:
Sequence of token counts
"""
...
def get_vocab(self) -> Sequence[str]:
"""
Returns the vocabulary.
Returns:
Sequence of vocabulary tokens
"""
...
def get_token2id(self) -> dict:
"""
Returns token-to-id mapping.
Returns:
Dictionary mapping tokens to IDs
"""
...

Character-level tokenizer that treats each character as a token.
class CharacterTokenizer(Tokenizer):
"""
Character-level tokenizer treating each character as a token.
"""
def __init__(self): ...
def tokenize(self, text: str) -> Sequence[str]:
"""
Splits text into characters.
Args:
text: Input text
Returns:
Sequence of individual characters
"""
...
def encode(self, text: str) -> Sequence[int]:
"""
Encodes text to character code points.
Args:
text: Input text
Returns:
Sequence of character code point IDs
"""
...
def decode(self, tokens: Sequence[int]) -> str:
"""
Decodes character code points to text.
Args:
tokens: Sequence of character IDs
Returns:
Decoded text
"""
...
def count_tokens(self, text: str) -> int:
"""
Returns character count.
Args:
text: Input text
Returns:
Number of characters
"""
...

Usage example:
from chonkie import CharacterTokenizer
tokenizer = CharacterTokenizer()
# Tokenize
tokens = tokenizer.tokenize("Hello") # ['H', 'e', 'l', 'l', 'o']
# Count
count = tokenizer.count_tokens("Hello") # 5
# Encode/decode
encoded = tokenizer.encode("Hi")
decoded = tokenizer.decode(encoded)

Word-level tokenizer that splits on whitespace.
class WordTokenizer(Tokenizer):
"""
Word-level tokenizer splitting on whitespace.
"""
def __init__(self): ...
def tokenize(self, text: str) -> Sequence[str]:
"""
Splits text into words.
Args:
text: Input text
Returns:
Sequence of words
"""
...
def encode(self, text: str) -> Sequence[int]:
"""
Encodes text to word IDs.
Args:
text: Input text
Returns:
Sequence of word IDs
"""
...
def decode(self, tokens: Sequence[int]) -> str:
"""
Decodes word IDs to text.
Args:
tokens: Sequence of word IDs
Returns:
Decoded text
"""
...
def count_tokens(self, text: str) -> int:
"""
Returns word count.
Args:
text: Input text
Returns:
Number of words
"""
...

Usage example:
from chonkie import WordTokenizer
tokenizer = WordTokenizer()
# Tokenize
tokens = tokenizer.tokenize("Hello world") # ['Hello', 'world']
# Count
count = tokenizer.count_tokens("Hello world") # 2
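encode and decode round-trip as well; a short sketch, assuming word IDs are assigned from a vocabulary the tokenizer builds as it encodes:
# Encode/decode
encoded = tokenizer.encode("Hello world")  # word IDs from the internal vocabulary
decoded = tokenizer.decode(encoded)        # expected to reproduce "Hello world"

Byte-level tokenizer that operates on UTF-8 bytes.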
class ByteTokenizer(Tokenizer):
"""
Byte-level tokenizer operating on UTF-8 bytes.
"""
def __init__(self): ...
def tokenize(self, text: str) -> Sequence[int]:
"""
Converts text to bytes.
Args:
text: Input text
Returns:
Sequence of byte values
"""
...
def encode(self, text: str) -> Sequence[int]:
"""
Encodes text to byte values.
Args:
text: Input text
Returns:
Sequence of byte values
"""
...
def decode(self, tokens: Sequence[int]) -> str:
"""
Decodes bytes to text.
Args:
tokens: Sequence of byte values
Returns:
Decoded text
"""
...
def count_tokens(self, text: str) -> int:
"""
Returns byte count.
Args:
text: Input text
Returns:
Number of bytes
"""
...

Usage example:
from chonkie import ByteTokenizer
tokenizer = ByteTokenizer()
# Encode
bytes_encoded = tokenizer.encode("Hello")
# Count
count = tokenizer.count_tokens("Hello") # 5 (byte count)
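Because counts are measured in UTF-8 bytes, characters outside the ASCII range count as more than one token. The byte lengths below follow from UTF-8 encoding itself rather than from any chonkie-specific behaviour:
count = tokenizer.count_tokens("héllo")  # 6 -- 'é' encodes to two bytes
count = tokenizer.count_tokens("日本")    # 6 -- each character encodes to three bytes

Row/line-based tokenizer that treats each line as a token.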
class RowTokenizer(Tokenizer):
"""
Row-based tokenizer treating each line as a token.
"""
def __init__(self): ...
def tokenize(self, text: str) -> Sequence[str]:
"""
Splits text into lines.
Args:
text: Input text
Returns:
Sequence of lines
"""
...
def encode(self, text: str) -> Sequence[int]:
"""
Encodes text to line IDs.
Args:
text: Input text
Returns:
Sequence of line IDs
"""
...
def decode(self, tokens: Sequence[int]) -> str:
"""
Decodes line IDs to text.
Args:
tokens: Sequence of line IDs
Returns:
Decoded text
"""
...
def count_tokens(self, text: str) -> int:
"""
Returns line count.
Args:
text: Input text
Returns:
Number of lines
"""
...

Usage example:
from chonkie import RowTokenizer
tokenizer = RowTokenizer()
text = "Line 1\nLine 2\nLine 3"
tokens = tokenizer.tokenize(text) # ['Line 1', 'Line 2', 'Line 3']
count = tokenizer.count_tokens(text) # 3

Auto-loading tokenizer that wraps various tokenizer backends including HuggingFace, tiktoken, and custom tokenizers.
from typing import Any, Callable, Sequence, Union
class AutoTokenizer:
"""
Auto-loading tokenizer supporting multiple backends.
Args:
tokenizer: Tokenizer identifier, callable, or instance
- 'character': CharacterTokenizer
- 'word': WordTokenizer
- 'byte': ByteTokenizer
- 'row': RowTokenizer
- 'gpt2', 'gpt-4', etc.: HuggingFace or tiktoken tokenizers
- Callable: Custom token counting function
- Any: Existing tokenizer instance
"""
def __init__(self, tokenizer: Union[str, Callable, Any] = "character"): ...
def encode(self, text: str) -> Sequence[int]:
"""
Encodes text using the backend tokenizer.
Args:
text: Input text
Returns:
Sequence of token IDs
"""
...
def decode(self, tokens: Sequence[int]) -> str:
"""
Decodes tokens using the backend tokenizer.
Args:
tokens: Sequence of token IDs
Returns:
Decoded text
"""
...
def count_tokens(self, text: str) -> int:
"""
Counts tokens using the backend tokenizer.
Args:
text: Input text
Returns:
Number of tokens
"""
...
def encode_batch(self, texts: Sequence[str]) -> Sequence[Sequence[int]]:
"""
Batch encodes texts.
Args:
texts: Sequence of input texts
Returns:
Sequence of token ID sequences
"""
...
def decode_batch(
self,
token_sequences: Sequence[Sequence[int]]
) -> Sequence[str]:
"""
Batch decodes token sequences.
Args:
token_sequences: Sequence of token ID sequences
Returns:
Sequence of decoded texts
"""
...
def count_tokens_batch(self, texts: Sequence[str]) -> Sequence[int]:
"""
Batch counts tokens.
Args:
texts: Sequence of input texts
Returns:
Sequence of token counts
"""
...

Usage examples:
from chonkie import AutoTokenizer
# Built-in tokenizers
char_tokenizer = AutoTokenizer("character")
word_tokenizer = AutoTokenizer("word")
# HuggingFace tokenizers (requires transformers)
gpt2_tokenizer = AutoTokenizer("gpt2")
bert_tokenizer = AutoTokenizer("bert-base-uncased")
# tiktoken (requires tiktoken)
tiktoken_tokenizer = AutoTokenizer("gpt-4")
# Custom token counter function
def custom_counter(text: str) -> int:
    return len(text.split())
custom_tokenizer = AutoTokenizer(custom_counter)
# Use with chunkers
from chonkie import TokenChunker
chunker = TokenChunker(tokenizer="gpt2", chunk_size=512)
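The batch helpers listed above work the same way for every backend. A brief sketch using the tokenizers created earlier (token values depend on the chosen backend, so concrete results are not shown):
# Batch operations
texts = ["Hello world", "Chunking made easy"]
counts = word_tokenizer.count_tokens_batch(texts)  # one token count per input text
encoded = char_tokenizer.encode_batch(texts)       # one token-ID sequence per text
decoded = char_tokenizer.decode_batch(encoded)     # should round-trip to the original texts

All tokenizers are available from the main package: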
from chonkie import (
TokenizerProtocol,
Tokenizer,
AutoTokenizer,
CharacterTokenizer,
WordTokenizer,
ByteTokenizer,
RowTokenizer,
)

All chunkers accept tokenizers via their constructor:
from chonkie import RecursiveChunker, AutoTokenizer
# Using string identifier
chunker1 = RecursiveChunker(tokenizer="gpt2")
# Using AutoTokenizer
tokenizer = AutoTokenizer("character")
chunker2 = RecursiveChunker(tokenizer=tokenizer)
# Using custom function
def my_counter(text: str) -> int:
    return len(text)
chunker3 = RecursiveChunker(tokenizer=my_counter)

Create custom tokenizers by extending the Tokenizer base class:
from chonkie import Tokenizer
from typing import Sequence, Union
class MyCustomTokenizer(Tokenizer):
    def encode(self, text: str) -> Sequence[int]:
        # Custom encoding logic: one token per character, using Unicode code points
        return [ord(c) for c in text]
    def decode(self, tokens: Sequence[int]) -> str:
        # Custom decoding logic: map code points back to characters
        return ''.join(chr(t) for t in tokens)
    def tokenize(self, text: str) -> Sequence[Union[str, int]]:
        # Custom tokenization logic: split into individual characters
        return list(text)
# Use with chunkers
from chonkie import TokenChunker
custom_tokenizer = MyCustomTokenizer()
chunker = TokenChunker(tokenizer=custom_tokenizer)
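
Because MyCustomTokenizer implements only the three abstract methods, the helpers inherited from Tokenizer (count_tokens, encode_batch, decode_batch, count_tokens_batch) are available as well. A short sketch, assuming the inherited defaults count the IDs produced by encode:
custom_tokenizer.count_tokens("Hello")                # 5 with this character-based encoding
custom_tokenizer.count_tokens_batch(["Hi", "Hello"])  # [2, 5]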