A fast, lightweight and easy-to-use Python library for splitting text into semantically meaningful chunks.
npx @tessl/cli install tessl/pypi-semchunk@3.2.0

semchunk uses an efficient algorithm that prioritizes semantic boundaries over simple character or token-based splitting, making it ideal for RAG applications, document processing pipelines, and any system requiring intelligent text segmentation.
pip install semchunk
conda install -c conda-forge semchunk

import semchunk

Common usage patterns:
from semchunk import chunk, Chunker, chunkerify

import semchunk
import tiktoken
# Basic chunking with OpenAI tokenizer
# Note: Consider deducting special tokens from chunk_size if your tokenizer adds them
chunker = semchunk.chunkerify('gpt-4', chunk_size=512)
text = "The quick brown fox jumps over the lazy dog. This is a test sentence."
chunks = chunker(text)
# Chunking with offsets
chunks, offsets = chunker(text, offsets=True)
# Chunking with overlap
overlapped_chunks = chunker(text, overlap=0.1) # 10% overlap
# Using the chunk function directly
encoding = tiktoken.encoding_for_model('gpt-4')
def count_tokens(text):
    return len(encoding.encode(text))
chunks = semchunk.chunk(
text=text,
chunk_size=512,
token_counter=count_tokens
)

semchunk uses a hierarchical splitting strategy that preserves semantic boundaries through a 5-step algorithm:
semchunk uses the following splitters in order of semantic preference:
- Newlines (\n) and/or carriage returns (\r)
- Tabs (\t)
- Sentence terminators (., ?, !, *)
- Clause separators (;, ,, (, ), [, ], ", ", ', ', ', ", `)
- Sentence interrupters (:, —, …)
- Word joiners (/, \, –, &, -)
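To illustrate the splitting order above, here is a small sketch; the word-count token counter and the expected output are illustrative assumptions, not semchunk's own examples:

import semchunk

word_counter = lambda t: len(t.split())
text = "First paragraph sentence one. Sentence two.\n\nSecond paragraph here."

# With a chunk size smaller than the whole text, the paragraph break (the most
# semantically meaningful splitter present) is preferred over mid-sentence splits.
print(semchunk.chunk(text, chunk_size=8, token_counter=word_counter))
# Expected (roughly): ['First paragraph sentence one. Sentence two.', 'Second paragraph here.']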
Direct text chunking with full control over parameters and caching options.

def chunk(
text: str,
chunk_size: int,
token_counter: Callable[[str], int],
memoize: bool = True,
offsets: bool = False,
overlap: float | int | None = None,
cache_maxsize: int | None = None,
) -> list[str] | tuple[list[str], list[tuple[int, int]]]:
"""
Split a text into semantically meaningful chunks of a specified size.
Parameters:
- text: The text to be chunked
- chunk_size: The maximum number of tokens a chunk may contain
- token_counter: A callable that takes a string and returns the number of tokens in it
- memoize: Whether to memoize the token counter for performance. Defaults to True
- offsets: Whether to return the start and end offsets of each chunk. Defaults to False
- overlap: The proportion of the chunk size (if <1) or number of tokens (if >=1)
by which chunks should overlap. Defaults to None
- cache_maxsize: The maximum number of text-token count pairs that can be stored
in the token counter's cache. Defaults to None (unbounded)
Returns:
- If offsets=False: list[str] - List of chunks up to chunk_size tokens long
- If offsets=True: tuple[list[str], list[tuple[int, int]]] - Chunks and their
(start, end) character offsets in the original text
Raises:
- ValueError: If chunk_size is not provided and tokenizer lacks model_max_length
"""Create configured chunkers from tokenizers or token counters with automatic optimization.
def chunkerify(
tokenizer_or_token_counter: str | tiktoken.Encoding | transformers.PreTrainedTokenizer | tokenizers.Tokenizer | Callable[[str], int],
chunk_size: int | None = None,
max_token_chars: int | None = None,
memoize: bool = True,
cache_maxsize: int | None = None,
) -> Chunker:
"""
Construct a chunker that splits texts into semantically meaningful chunks.
Parameters:
- tokenizer_or_token_counter: Either:
* Name of a tiktoken or transformers tokenizer (e.g., 'gpt-4', 'cl100k_base')
* A tokenizer object with an encode() method (tiktoken, transformers, tokenizers)
* A token counter function that returns the number of tokens in input text
- chunk_size: Maximum number of tokens per chunk. Defaults to tokenizer's
model_max_length if available, otherwise raises ValueError
- max_token_chars: Maximum number of characters a token may contain. Used to
significantly speed up token counting for long inputs by using heuristics
to avoid tokenizing texts that would exceed chunk_size. Auto-detected from
tokenizer vocabulary if possible
- memoize: Whether to memoize the token counter. Defaults to True
- cache_maxsize: Maximum number of text-token count pairs in cache. Defaults to None
Returns:
- Chunker: A configured chunker instance that can process single texts or sequences
Raises:
- ValueError: If tokenizer_or_token_counter is a string that doesn't match any
known tokenizer, or if chunk_size is None and tokenizer lacks
model_max_length attribute, or if required libraries are not installed
"""High-performance chunker for processing single texts or sequences with multiprocessing support.
class Chunker:
def __init__(self, chunk_size: int, token_counter: Callable[[str], int]) -> None:
"""
Initialize a chunker with specified chunk size and token counter.
Parameters:
- chunk_size: Maximum number of tokens per chunk
- token_counter: Function that takes a string and returns token count
"""
def __call__(
self,
text_or_texts: str | Sequence[str],
processes: int = 1,
progress: bool = False,
offsets: bool = False,
overlap: int | float | None = None,
) -> list[str] | tuple[list[str], list[tuple[int, int]]] | list[list[str]] | tuple[list[list[str]], list[list[tuple[int, int]]]]:
"""
Split text or texts into semantically meaningful chunks.
Parameters:
- text_or_texts: Single text string or sequence of text strings to chunk
- processes: Number of processes for multiprocessing when processing multiple texts.
Defaults to 1 (single process)
- progress: Whether to display a progress bar when processing multiple texts.
Defaults to False
- offsets: Whether to return start and end character offsets for each chunk.
Defaults to False
- overlap: Proportion of chunk size (if <1) or number of tokens (if >=1)
by which chunks should overlap. Defaults to None
Returns:
For single text input:
- If offsets=False: list[str] - List of chunks
- If offsets=True: tuple[list[str], list[tuple[int, int]]] - Chunks and offsets
For multiple text input:
- If offsets=False: list[list[str]] - List of chunk lists, one per input text
- If offsets=True: tuple[list[list[str]], list[list[tuple[int, int]]]] -
Chunk lists and offset lists for each input text
"""semchunk includes several performance optimizations to handle large texts efficiently:
semchunk includes several performance optimizations to handle large texts efficiently:

Token counter memoization is enabled by default (memoize=True). It caches token counts for repeated text segments, significantly speeding up processing of documents with repeated content.
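For instance, you can bound the cache or skip memoization when the token counter is already cheap or cached elsewhere (the parameter values below are illustrative assumptions):

import semchunk

# Cap the memoization cache at 10,000 entries.
bounded = semchunk.chunkerify('gpt-4', chunk_size=512, cache_maxsize=10_000)

# Skip memoization when the counter is trivial to recompute.
uncached = semchunk.chunk(
    "Some text to split...",
    chunk_size=512,
    token_counter=lambda t: len(t.split()),
    memoize=False,
)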
The max_token_chars parameter enables a smart optimization that avoids tokenizing very long texts when they would obviously exceed the chunk size. The algorithm:
- Uses a heuristic threshold of chunk_size * 6 characters to identify potentially long texts
- Counts tokens only in the first heuristic + max_token_chars characters of such texts
- If that truncated count already exceeds chunk_size, returns chunk_size + 1 without tokenizing the full text
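A minimal sketch of that heuristic, paraphrasing the idea rather than reproducing semchunk's actual internals:

def heuristic_count(text, token_counter, chunk_size, max_token_chars):
    heuristic = chunk_size * 6
    if len(text) > heuristic:
        # If even a prefix of the text exceeds the chunk size, skip
        # tokenizing the whole thing and report "too long".
        if token_counter(text[:heuristic + max_token_chars]) > chunk_size:
            return chunk_size + 1
    return token_counter(text)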
The Chunker class supports parallel processing of multiple texts via the processes parameter, using the mpire library with dill serialization for robust multiprocessing.

When using tokenizers that add special tokens (like BOS/EOS tokens), semchunk automatically:
- Detects the tokenizer's add_special_tokens parameter and disables it while counting tokens during chunking
- Reduces chunk_size by the number of special tokens when auto-detecting the chunk size from model_max_length
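If you prefer to deduct special tokens from chunk_size yourself, as suggested in the usage note earlier, a sketch (assuming the transformers library is installed):

import semchunk
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('bert-base-uncased')
specials = tok.num_special_tokens_to_add()  # e.g. 2 for [CLS] and [SEP]
chunker = semchunk.chunkerify(tok, chunk_size=512 - specials)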
# Core imports
from typing import Callable, Sequence
# Type annotations used in the API
TokenCounter = Callable[[str], int]
# Offset tuple type (start, end character positions)
OffsetTuple = tuple[int, int]
# When TYPE_CHECKING is True, these imports are available for type hints:
# import tiktoken
# import tokenizers
# import transformers
# The tokenizer_or_token_counter parameter accepts any of:
# - str: Model name or encoding name (e.g., 'gpt-4', 'cl100k_base')
# - tiktoken.Encoding: tiktoken encoder object
# - transformers.PreTrainedTokenizer: Hugging Face tokenizer
# - tokenizers.Tokenizer: Fast tokenizer from tokenizers library
# - Callable[[str], int]: Custom token counter function

import semchunk
# OpenAI tiktoken models
chunker_gpt4 = semchunk.chunkerify('gpt-4', chunk_size=1000)
chunker_gpt35 = semchunk.chunkerify('gpt-3.5-turbo', chunk_size=1000)
# tiktoken encodings
chunker_cl100k = semchunk.chunkerify('cl100k_base', chunk_size=1000)
# Hugging Face transformers
from transformers import AutoTokenizer
hf_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
chunker_bert = semchunk.chunkerify(hf_tokenizer, chunk_size=512)
# Custom token counter
def simple_word_counter(text: str) -> int:
    return len(text.split())
chunker_words = semchunk.chunkerify(simple_word_counter, chunk_size=100)

import semchunk
# Prepare chunker and texts
chunker = semchunk.chunkerify('gpt-4', chunk_size=512)
documents = [
"First document text...",
"Second document text...",
"Third document text..."
]
# Process with multiprocessing
chunks_per_doc = chunker(documents, processes=4, progress=True)
# Process with offsets
chunks_per_doc, offsets_per_doc = chunker(
documents,
processes=4,
progress=True,
offsets=True
)
# With overlap for better context preservation
overlapped_chunks = chunker(
documents,
overlap=0.2, # 20% overlap
processes=4
)

import semchunk
from functools import lru_cache
# Custom token counter with caching
@lru_cache(maxsize=1000)
def cached_word_counter(text: str) -> int:
    return len(text.split())
# Direct chunk function usage with custom settings
text = "Long document text..."
chunks = semchunk.chunk(
text=text,
chunk_size=200,
token_counter=cached_word_counter,
memoize=False, # Already cached manually
offsets=True,
overlap=50, # 50 token overlap
cache_maxsize=500
)
# Chunker with performance optimization
chunker = semchunk.chunkerify(
'gpt-4',
chunk_size=1000,
max_token_chars=10, # Optimize for typical token lengths
cache_maxsize=2000 # Large cache for repeated texts
)