tessl install tessl/pypi-chonkie@1.5.0

The lightweight ingestion library for fast, efficient, and robust RAG pipelines.
LLM integrations (Genies) for AI-powered chunking, neural chunking models, and cloud-based refineries.
LLM provider integrations for advanced chunking strategies like SlumberChunker.
Abstract base class for LLM provider integrations.
from abc import ABC, abstractmethod
from typing import Any
class BaseGenie(ABC):
"""
Base class for LLM provider integrations.
"""
def __init__(self): ...
@abstractmethod
def generate(self, prompt: str) -> str:
"""
Generates text from a prompt.
Args:
prompt: Input prompt
Returns:
Generated text
"""
...
def generate_batch(self, prompts: list[str]) -> list[str]:
"""
Generates text for multiple prompts.
Args:
prompts: List of input prompts
Returns:
List of generated texts
"""
...
def generate_json(self, prompt: str, schema: Any) -> Any:
"""
Generates structured JSON output.
Args:
prompt: Input prompt
schema: Pydantic BaseModel schema for structured output
Returns:
Parsed JSON object matching schema
"""
...
def generate_json_batch(
self,
prompts: list[str],
schema: Any
) -> list[Any]:
"""
Generates structured JSON for multiple prompts.
Args:
prompts: List of input prompts
schema: Pydantic BaseModel schema
Returns:
List of parsed JSON objects
"""
        ...
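Any provider can be plugged in by subclassing BaseGenie: only generate() is abstract, and the batch and JSON helpers appear to provide default implementations. A minimal sketch (EchoGenie is a hypothetical class, useful only for offline testing):
from chonkie import BaseGenie

class EchoGenie(BaseGenie):
    """Toy Genie that returns the prompt unchanged."""

    def generate(self, prompt: str) -> str:
        # A real implementation would call a provider SDK here
        return prompt

genie = EchoGenie()
print(genie.generate("Hello, Chonkie!"))

LLM integration for OpenAI models.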
from typing import Optional, Any
from pydantic import BaseModel
from chonkie import BaseGenie
class OpenAIGenie(BaseGenie):
"""
LLM integration for OpenAI models.
Args:
model: Model identifier (default: 'gpt-4.1')
base_url: Optional custom API base URL
api_key: Optional API key (defaults to OPENAI_API_KEY env var)
"""
def __init__(
self,
model: str = "gpt-4.1",
base_url: Optional[str] = None,
api_key: Optional[str] = None
): ...
def generate(self, prompt: str) -> str:
"""
Generates text using OpenAI.
Args:
prompt: Input prompt
Returns:
Generated text
"""
...
def generate_json(
self,
prompt: str,
schema: BaseModel
) -> dict[str, Any]:
"""
Generates structured output using OpenAI.
Args:
prompt: Input prompt
schema: Pydantic model for response structure
Returns:
Parsed JSON dictionary
"""
        ...

Usage example:
from chonkie import OpenAIGenie
# Basic usage
genie = OpenAIGenie(model="gpt-4", api_key="your-key")
response = genie.generate("Explain chunking strategies.")
# Use with SlumberChunker
from chonkie import SlumberChunker
chunker = SlumberChunker(genie=genie, chunk_size=512)
chunks = chunker("Your text here...")
# Use with custom API endpoint (e.g., OpenRouter)
genie = OpenAIGenie(
model="meta-llama/llama-4-maverick",
base_url="https://openrouter.ai/api/v1",
api_key="your-openrouter-key"
)
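Structured output with generate_json, following the signature documented above (the Topic schema is illustrative):
from pydantic import BaseModel
from chonkie import OpenAIGenie

class Topic(BaseModel):
    title: str
    keywords: list[str]

genie = OpenAIGenie(model="gpt-4.1", api_key="your-key")
result = genie.generate_json("Identify the main topic of the following text: ...", Topic)
print(result)  # parsed dictionary matching the Topic schema

LLM integration for Azure OpenAI service.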
from typing import Optional, Any
from pydantic import BaseModel
from chonkie import BaseGenie
class AzureOpenAIGenie(BaseGenie):
"""
LLM integration for Azure OpenAI.
Args:
model: Azure deployment name
api_key: Optional Azure API key (defaults to AZURE_OPENAI_API_KEY env var)
azure_endpoint: Optional Azure endpoint URL (defaults to AZURE_OPENAI_ENDPOINT env var)
api_version: API version (default: '2024-02-01')
"""
def __init__(
self,
model: str,
api_key: Optional[str] = None,
azure_endpoint: Optional[str] = None,
api_version: str = "2024-02-01"
): ...
def generate(self, prompt: str) -> str:
"""
Generates text using Azure OpenAI.
Args:
prompt: Input prompt
Returns:
Generated text
"""
...
def generate_json(
self,
prompt: str,
schema: BaseModel
) -> dict[str, Any]:
"""
Generates structured output using Azure OpenAI.
Args:
prompt: Input prompt
schema: Pydantic model for response structure
Returns:
Parsed JSON dictionary
"""
        ...

Usage example:
from chonkie import AzureOpenAIGenie
genie = AzureOpenAIGenie(
model="your-deployment-name",
api_key="your-azure-key",
azure_endpoint="https://your-resource.openai.azure.com/"
)
# Use with SlumberChunker
from chonkie import SlumberChunker
chunker = SlumberChunker(genie=genie)
chunks = chunker("Your text...")LLM integration for Google Gemini.
from typing import Optional, Any
from pydantic import BaseModel
from chonkie import BaseGenie
class GeminiGenie(BaseGenie):
"""
LLM integration for Google Gemini.
Args:
model: Model identifier (default: 'gemini-1.5-flash')
api_key: Optional API key (defaults to GOOGLE_API_KEY env var)
"""
def __init__(
self,
model: str = "gemini-1.5-flash",
api_key: Optional[str] = None
): ...
def generate(self, prompt: str) -> str:
"""
Generates text using Gemini.
Args:
prompt: Input prompt
Returns:
Generated text
"""
...
def generate_json(
self,
prompt: str,
schema: BaseModel
) -> dict[str, Any]:
"""
Generates structured output using Gemini.
Args:
prompt: Input prompt
schema: Pydantic model for response structure
Returns:
Parsed JSON dictionary
"""
        ...

Usage example:
from chonkie import GeminiGenie
genie = GeminiGenie(model="gemini-1.5-flash", api_key="your-google-key")
# Use with SlumberChunker
from chonkie import SlumberChunker
chunker = SlumberChunker(genie=genie)
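For completeness, the Gemini-backed setup uses the same call interface shown in the earlier examples:
response = genie.generate("Summarize chunking strategies in one sentence.")
chunks = chunker("Your long document text...")

Uses LLM to make intelligent chunking decisions.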
from typing import Optional, Union
from chonkie import BaseChunker, BaseGenie, Chunk, RecursiveRules
class SlumberChunker(BaseChunker):
"""
Uses LLM (via Genie) to intelligently determine chunk boundaries.
Also known as AgenticChunker. This chunker uses an LLM to analyze text
and make intelligent decisions about where to split chunks based on
semantic coherence and topic boundaries.
Args:
genie: LLM integration instance (e.g., OpenAIGenie) (default: None)
tokenizer: Tokenizer instance or identifier (default: 'character')
chunk_size: Maximum number of tokens per chunk (default: 2048)
rules: RecursiveRules for initial splitting (default: RecursiveRules())
candidate_size: Token size for candidate chunks presented to LLM (default: 128)
min_characters_per_chunk: Minimum characters per chunk (default: 24)
verbose: If True, print LLM decision details (default: True)
"""
def __init__(
self,
genie: Optional[BaseGenie] = None,
tokenizer: Union[str, TokenizerProtocol] = "character",
chunk_size: int = 2048,
rules: RecursiveRules = ...,
candidate_size: int = 128,
min_characters_per_chunk: int = 24,
verbose: bool = True
): ...
def chunk(self, text: str) -> list[Chunk]:
"""
Chunks text using LLM-guided decisions.
The chunker:
1. Performs initial recursive splitting
2. Presents candidate boundaries to the LLM
3. Uses LLM decisions to merge or split chunks
4. Returns optimally chunked text
Args:
text: Input text to chunk
Returns:
List of intelligently chunked Chunk objects
"""
        ...

Usage example:
from chonkie import SlumberChunker, OpenAIGenie
# Create with OpenAI
genie = OpenAIGenie(model="gpt-4", api_key="your-key")
chunker = SlumberChunker(
genie=genie,
chunk_size=512,
candidate_size=128,
verbose=True # See LLM decisions
)
# Chunk complex text
complex_text = """
This is a complex document with multiple topics...
The LLM will intelligently identify topic boundaries...
And make chunking decisions based on semantic coherence...
"""
chunks = chunker(complex_text)
# Each chunk represents a semantically coherent unit
for chunk in chunks:
print(f"Chunk: {chunk.text}")
print(f"Tokens: {chunk.token_count}")
# Use with Gemini
from chonkie import GeminiGenie
gemini_genie = GeminiGenie(api_key="your-google-key")
gemini_chunker = SlumberChunker(genie=gemini_genie)
# Use in pipeline
from chonkie import Pipeline
pipe = (
Pipeline()
.chunk_with("slumber", genie=genie, chunk_size=512)
.refine_with("embeddings")
)

Uses neural token classification models to predict chunk boundaries.
from typing import Optional, Union, Any
from chonkie import BaseChunker, Chunk
class NeuralChunker(BaseChunker):
"""
Uses neural models to predict chunk boundaries.
This chunker employs a trained neural network that classifies each token
to determine optimal chunk boundaries based on learned patterns.
Args:
model: Model identifier or instance (default: DEFAULT_MODEL)
tokenizer: Tokenizer for the model (default: None, uses model's tokenizer)
device_map: Device mapping for model (default: 'auto')
min_characters_per_chunk: Minimum characters per chunk (default: 10)
stride: Stride for sliding window processing (default: None)
"""
def __init__(
self,
model: Union[str, Any] = ...,
tokenizer: Optional[Union[str, Any]] = None,
device_map: str = "auto",
min_characters_per_chunk: int = 10,
stride: Optional[int] = None
): ...
def chunk(self, text: str) -> list[Chunk]:
"""
Chunks text using neural boundary detection.
The neural model:
1. Processes text through a transformer model
2. Predicts boundary probabilities for each token
3. Identifies high-confidence boundaries
4. Splits text at predicted boundaries
Args:
text: Input text to chunk
Returns:
List of Chunk objects identified by neural model
"""
        ...

Usage example:
from chonkie import NeuralChunker
# Create neural chunker (requires the neural extra: pip install "chonkie[neural]")
chunker = NeuralChunker(min_characters_per_chunk=50)
chunks = chunker("Your text here...")
# With custom model
chunker = NeuralChunker(
model="path/to/your/model",
device_map="cuda:0"
)
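The stride parameter documented above sets the sliding-window step for inputs longer than the model's context; a small sketch with illustrative values:
from chonkie import NeuralChunker

# Smaller stride means more window overlap at higher compute cost (values illustrative)
chunker = NeuralChunker(stride=256, min_characters_per_chunk=25)
chunks = chunker.chunk("A very long document that exceeds the model's context window...")

The chonkie.cloud module provides cloud-based implementations via the Chonkie API for server-side processing, file management, and pipeline execution.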
from abc import ABC, abstractmethod
from typing import Any, Optional, Union
from chonkie import Chunk
from chonkie.cloud import chunker, refineries, Pipeline, PipelineStep, FileManager, File
class CloudChunker(ABC):
    """
    Base class for cloud-based chunking algorithms.
    All cloud chunkers process text via the Chonkie API for server-side execution.
    """
    @abstractmethod
    def chunk(self, text: Union[str, list[str]]) -> Any:
        """Chunk the text into a list of chunks via the cloud API."""
        ...
    def __call__(self, text: Union[str, list[str]]) -> Any:
        """Call the chunker."""
        ...
class FileManager:
"""
File management functions for Chonkie API.
Args:
api_key: Chonkie API key. Falls back to CHONKIE_API_KEY environment variable.
Raises:
ValueError: If no API key is provided
"""
    def __init__(self, api_key: Optional[str] = None): ...
    def upload(self, path: str) -> "File":
"""
Upload a file to the Chonkie API.
Args:
path: Local file path to upload
Returns:
File object with name and size metadata
Raises:
ValueError: If upload fails
"""
...
class File:
"""
File metadata from Chonkie API.
Attributes:
name: File name
size: File size as string
"""
name: str
size: str
@classmethod
    def from_dict(cls, data: dict) -> "File":
"""Create a File object from a dictionary."""
...
class PipelineStep:
"""
A single step in a cloud pipeline configuration.
Attributes:
type: Step type (e.g., 'chunker', 'refinery')
component: Component name (e.g., 'recursive', 'overlap')
params: Component parameters as dictionary
"""
type: str
component: str
params: dict[str, Any]
    def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for API payload."""
...
@classmethod
    def from_dict(cls, data: dict[str, Any]) -> "PipelineStep":
"""Create from dictionary."""
...
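A sketch of round-tripping a step through its dictionary form, assuming the attributes above map directly onto constructor arguments:
# Hypothetical construction; field names follow the class documentation above
step = PipelineStep(type="chunker", component="recursive", params={"chunk_size": 512})
payload = step.to_dict()                     # dictionary suitable for the API payload
same_step = PipelineStep.from_dict(payload)  # reconstructs an equivalent step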
class Pipeline:
"""
Cloud Pipeline - build and execute pipelines via Chonkie API.
Args:
slug: Unique identifier for the pipeline (lowercase letters, numbers, dashes, underscores)
description: Optional pipeline description
api_key: Chonkie API key. Falls back to CHONKIE_API_KEY environment variable.
"""
    def __init__(self, slug: str, description: Optional[str] = None, api_key: Optional[str] = None): ...
    def chunk_with(self, component: str, **params: Any) -> "Pipeline":
"""Add chunking step to pipeline."""
...
    def refine_with(self, component: str, **params: Any) -> "Pipeline":
"""Add refinery step to pipeline."""
...
    def run(
        self,
text: Optional[str] = None,
file: Optional[str] = None
) -> list[Chunk]:
"""
Execute pipeline with text or file.
Args:
text: Text to process (mutually exclusive with file)
file: Local file path to upload and process (mutually exclusive with text)
Returns:
List of Chunk objects from processing
Raises:
            ValueError: If neither or both of text and file are provided
"""
...
@classmethod
    def get(cls, slug: str, api_key: Optional[str] = None) -> "Pipeline":
"""Fetch existing pipeline by slug."""
...
@classmethod
    def list(cls, api_key: Optional[str] = None) -> list["Pipeline"]:
"""List all pipelines."""
...
    def describe(self) -> str:
"""Get pipeline description."""
        ...

Cloud chunkers module:
# Available cloud chunkers
from chonkie.cloud import chunker
# All standard chunkers available via cloud
chunker.TokenChunker
chunker.RecursiveChunker
chunker.SentenceChunker
chunker.SemanticChunker
chunker.LateChunker
chunker.CodeChunker
chunker.NeuralChunker
chunker.SlumberChunker
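A usage sketch, assuming cloud chunkers accept an api_key alongside the parameters of their local counterparts and follow the callable interface of CloudChunker above:
from chonkie.cloud import chunker

# Parameters are illustrative; processing happens server-side via the Chonkie API
cloud_chunker = chunker.RecursiveChunker(chunk_size=512, api_key="your-key")
chunks = cloud_chunker("Your document text...")

Cloud refineries module: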
# Available cloud refineries
from chonkie.cloud import refineries
refineries.EmbeddingsRefinery
refineries.OverlapRefinery
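A usage sketch for a cloud refinery, assuming the same api_key convention and a callable interface; context_size mirrors the refine_with("overlap", context_size=64) step in the pipeline example below:
from chonkie.cloud import refineries

# Assumed call pattern; adds overlapping context to chunks produced earlier
refinery = refineries.OverlapRefinery(context_size=64, api_key="your-key")
refined_chunks = refinery(chunks)

Usage examples: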
from chonkie.cloud import Pipeline, FileManager
# Create and execute cloud pipeline
pipeline = Pipeline(
slug="my-rag-pipeline",
description="Pipeline for RAG processing"
)
pipeline = (
pipeline
.chunk_with("recursive", chunk_size=512)
.refine_with("overlap", context_size=64)
.refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
)
# Execute with text
chunks = pipeline.run(text="Your document text here")
# Execute with file (auto-uploaded)
chunks = pipeline.run(file="document.pdf")
# Fetch existing pipeline
existing = Pipeline.get("my-rag-pipeline")
chunks = existing.run(text="More text to process")
# List all pipelines
for p in Pipeline.list():
print(f"{p.slug}: {p.describe()}")
# File management
file_manager = FileManager(api_key="your-key")
uploaded_file = file_manager.upload("local_document.pdf")
print(f"Uploaded: {uploaded_file.name} ({uploaded_file.size})")All advanced features are available from the main package:
from chonkie import (
BaseGenie,
OpenAIGenie,
AzureOpenAIGenie,
GeminiGenie,
SlumberChunker,
NeuralChunker,
)
# Cloud modules
from chonkie import cloud

End-to-end usage example combining an LLM Genie with a pipeline:
from chonkie import Pipeline, OpenAIGenie
# Create LLM integration
genie = OpenAIGenie(model="gpt-4", api_key="your-key")
# Build sophisticated pipeline with LLM chunking
pipe = (
Pipeline()
# Use LLM to make chunking decisions
.chunk_with("slumber", genie=genie, chunk_size=1024, verbose=True)
# Add contextual overlap
.refine_with("overlap", context_size=0.2)
# Add embeddings
.refine_with("embeddings", embedding_model="openai/text-embedding-3-large")
# Store in vector database
.store_in("qdrant", collection_name="intelligent_chunks")
# Export for analysis
.export_with("json", file="llm_chunks.jsonl")
)
# Process complex document
doc = pipe.run("""
Your complex document here with multiple topics, sections, and nuanced content
that benefits from intelligent, LLM-guided chunking decisions...
""")
print(f"Created {len(doc.chunks)} intelligent chunks")from chonkie import OpenAIGenie, GeminiGenie, SlumberChunker
# Use GPT-4 for complex analysis
gpt4_genie = OpenAIGenie(model="gpt-4")
gpt4_chunker = SlumberChunker(genie=gpt4_genie, verbose=True)
# Use Gemini for faster processing
gemini_genie = GeminiGenie(model="gemini-1.5-flash")
gemini_chunker = SlumberChunker(genie=gemini_genie, verbose=False)
# Choose based on document complexity
def chunk_intelligently(text: str, is_complex: bool):
if is_complex:
return gpt4_chunker(text)
else:
return gemini_chunker(text)
# Process documents
simple_chunks = chunk_intelligently("Simple document...", is_complex=False)
complex_chunks = chunk_intelligently("Complex research paper...", is_complex=True)

verbose=True helps understand LLM decisions. candidate_size controls how much text is sent to the LLM per decision.

# Optimize for batch processing
from chonkie import NeuralChunker
chunker = NeuralChunker(device_map="cuda:0")
# Process multiple documents efficiently
texts = ["Doc 1...", "Doc 2...", "Doc 3..."]
all_chunks = chunker.chunk_batch(texts)

The chonkie.experimental module contains experimental features:
from chonkie import experimental
# Experimental code analysis features
# Note: not exported in the main API; subject to change

These features are under active development and may change in future versions.