tessl install tessl/pypi-chonkie@1.5.0

The lightweight ingestion library for fast, efficient and robust RAG pipelines.
Direct integration with 10+ vector databases for seamless chunk ingestion including ChromaDB, Qdrant, Pinecone, Weaviate, Milvus, Elasticsearch, MongoDB, pgvector, and Turbopuffer.
Abstract base class for all vector database integrations.
from abc import ABC, abstractmethod
from typing import Union, Any
class BaseHandshake(ABC):
    """Common interface shared by every vector database integration.

    Concrete handshakes must implement ``write``.
    """

    def __init__(self): ...

    @abstractmethod
    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """Write one or more chunks to the vector database.

        Args:
            chunk: A single Chunk or a list of Chunks.

        Returns:
            The database-specific response.
        """
        ...

    def __call__(self, chunks: Union[Chunk, list[Chunk]]) -> Any:
        """Let the handshake be invoked like a function.

        Args:
            chunks: A single Chunk or a list of Chunks.

        Returns:
            The database-specific response.
        """
        ...

Integration with ChromaDB vector database.
from typing import Optional, Union, Literal, Any
class ChromaHandshake(BaseHandshake):
    """Handshake that stores chunks in a ChromaDB collection.

    Args:
        client: Existing ChromaDB client instance, if any.
        collection_name: Name of the collection, or 'random' for an
            auto-generated name (default: 'random').
        embedding_model: Embedding model identifier or instance
            (default: 'minishlab/potion-retrieval-32M').
        path: Optional path for persistent storage.
    """

    def __init__(
        self,
        client: Optional[Any] = None,
        collection_name: Union[str, Literal["random"]] = "random",
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        path: Optional[str] = None
    ): ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """Write chunks to the ChromaDB collection.

        Args:
            chunk: A single Chunk or a list of Chunks (must have embeddings).

        Returns:
            The ChromaDB response.
        """
        ...

Usage example:
from chonkie import ChromaHandshake, TokenChunker, EmbeddingsRefinery
# Create and embed chunks
chunker = TokenChunker(chunk_size=512)
chunks = chunker("Your text here...")
refinery = EmbeddingsRefinery(embedding_model="all-MiniLM-L6-v2")
chunks = refinery(chunks)
# Write to ChromaDB
handshake = ChromaHandshake(
collection_name="my_documents",
path="./chroma_db"
)
handshake(chunks)
# Use in pipeline
from chonkie import Pipeline
pipe = (
Pipeline()
.chunk_with("recursive")
.refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
.store_in("chroma", collection_name="docs", path="./db")
)

Integration with Qdrant vector database.
from typing import Optional, Union, Literal, Any
class QdrantHandshake(BaseHandshake):
    """Integration with the Qdrant vector database.

    Args:
        client: Optional QdrantClient instance.
        collection_name: Collection name, or 'random' for an auto-generated
            name (default: 'random').
        embedding_model: Embedding model identifier or instance
            (default: 'minishlab/potion-retrieval-32M').
        url: Optional Qdrant server URL.
        path: Optional path for local storage.
        api_key: Optional API key for Qdrant Cloud.
        **kwargs: Additional arguments for QdrantClient.
    """

    def __init__(
        self,
        client: Optional[Any] = None,
        collection_name: Union[str, Literal["random"]] = "random",
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        url: Optional[str] = None,
        path: Optional[str] = None,
        api_key: Optional[str] = None,
        # The annotation on **kwargs describes each value, not the mapping.
        **kwargs: Any
    ): ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """Write chunks to the Qdrant collection.

        Args:
            chunk: A single Chunk or a list of Chunks (must have embeddings).

        Returns:
            The Qdrant response.
        """
        ...

Usage example:
from chonkie import QdrantHandshake
# Local Qdrant
handshake = QdrantHandshake(
collection_name="documents",
path="./qdrant_storage"
)
# Qdrant Cloud
handshake = QdrantHandshake(
collection_name="documents",
url="https://your-cluster.qdrant.io",
api_key="your-api-key"
)
# Use in pipeline
from chonkie import Pipeline
pipe = (
Pipeline()
.chunk_with("semantic")
.store_in("qdrant", collection_name="docs", url="http://localhost:6333")
)

Integration with Pinecone vector database.
from typing import Optional, Union, Any
class PineconeHandshake(BaseHandshake):
    """Integration with the Pinecone vector database.

    Args:
        index_name: Name of the Pinecone index.
        api_key: Optional API key (defaults to the PINECONE_API_KEY env var).
        embedding_model: Embedding model identifier or instance
            (default: 'minishlab/potion-retrieval-32M').
        **kwargs: Additional arguments for the Pinecone client.
    """

    def __init__(
        self,
        index_name: str,
        api_key: Optional[str] = None,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        # The annotation on **kwargs describes each value, not the mapping.
        **kwargs: Any
    ): ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """Write chunks to the Pinecone index.

        Args:
            chunk: A single Chunk or a list of Chunks (must have embeddings).

        Returns:
            The Pinecone response.
        """
        ...

Usage example:
from chonkie import PineconeHandshake
handshake = PineconeHandshake(
index_name="my-index",
api_key="your-pinecone-key"
)
# Use in pipeline
pipe = Pipeline().store_in("pinecone", index_name="docs")

Integration with Weaviate vector database.
from typing import Optional, Union, Any
class WeaviateHandshake(BaseHandshake):
    """Integration with the Weaviate vector database.

    Args:
        client: Optional Weaviate client instance.
        collection_name: Collection name (default: 'ChonkieChunks').
        embedding_model: Embedding model identifier or instance
            (default: 'minishlab/potion-retrieval-32M').
        **kwargs: Additional arguments for the Weaviate client.
    """

    def __init__(
        self,
        client: Optional[Any] = None,
        collection_name: str = "ChonkieChunks",
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        # The annotation on **kwargs describes each value, not the mapping.
        **kwargs: Any
    ): ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """Write chunks to the Weaviate collection.

        Args:
            chunk: A single Chunk or a list of Chunks (must have embeddings).

        Returns:
            The Weaviate response.
        """
        ...

Usage example:
from chonkie import WeaviateHandshake
handshake = WeaviateHandshake(collection_name="Documents")
# Use in pipeline
pipe = Pipeline().store_in("weaviate", collection_name="docs")

Integration with Milvus vector database.
from typing import Optional, Union, Any
class MilvusHandshake(BaseHandshake):
    """Integration with the Milvus vector database.

    Args:
        collection_name: Name of the Milvus collection.
        embedding_model: Embedding model identifier or instance
            (default: 'minishlab/potion-retrieval-32M').
        uri: Optional Milvus server URI.
        **kwargs: Additional arguments for the Milvus client.
    """

    def __init__(
        self,
        collection_name: str,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        uri: Optional[str] = None,
        # The annotation on **kwargs describes each value, not the mapping.
        **kwargs: Any
    ): ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """Write chunks to the Milvus collection.

        Args:
            chunk: A single Chunk or a list of Chunks (must have embeddings).

        Returns:
            The Milvus response.
        """
        ...

Usage example:
from chonkie import MilvusHandshake
handshake = MilvusHandshake(
collection_name="documents",
uri="http://localhost:19530"
)
# Use in pipeline
pipe = Pipeline().store_in("milvus", collection_name="docs")

Integration with Elasticsearch.
from typing import Optional, Union, Any
class ElasticHandshake(BaseHandshake):
    """Integration with Elasticsearch.

    Args:
        index_name: Name of the Elasticsearch index.
        embedding_model: Embedding model identifier or instance
            (default: 'minishlab/potion-retrieval-32M').
        es_client: Optional Elasticsearch client instance.
        **kwargs: Additional arguments for the Elasticsearch client.
    """

    def __init__(
        self,
        index_name: str,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        es_client: Optional[Any] = None,
        # The annotation on **kwargs describes each value, not the mapping.
        **kwargs: Any
    ): ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """Write chunks to the Elasticsearch index.

        Args:
            chunk: A single Chunk or a list of Chunks (must have embeddings).

        Returns:
            The Elasticsearch response.
        """
        ...

Usage example:
from chonkie import ElasticHandshake
handshake = ElasticHandshake(index_name="documents")
# Use in pipeline
pipe = Pipeline().store_in("elastic", index_name="docs")

Integration with MongoDB Atlas Vector Search.
from typing import Optional, Union, Any
class MongoDBHandshake(BaseHandshake):
    """Integration with MongoDB Atlas Vector Search.

    Args:
        database_name: MongoDB database name.
        collection_name: MongoDB collection name.
        embedding_model: Embedding model identifier or instance
            (default: 'minishlab/potion-retrieval-32M').
        connection_string: Optional MongoDB connection string
            (defaults to the MONGODB_URI env var).
        **kwargs: Additional arguments for the MongoDB client.
    """

    def __init__(
        self,
        database_name: str,
        collection_name: str,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        connection_string: Optional[str] = None,
        # The annotation on **kwargs describes each value, not the mapping.
        **kwargs: Any
    ): ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """Write chunks to the MongoDB collection.

        Args:
            chunk: A single Chunk or a list of Chunks (must have embeddings).

        Returns:
            The MongoDB response.
        """
        ...

Usage example:
from chonkie import MongoDBHandshake
handshake = MongoDBHandshake(
database_name="mydb",
collection_name="documents"
)
# Use in pipeline
pipe = Pipeline().store_in("mongodb", database_name="db", collection_name="docs")

Integration with PostgreSQL pgvector extension.
from typing import Optional, Union, Any
class PgvectorHandshake(BaseHandshake):
    """Integration with the PostgreSQL pgvector extension.

    Args:
        table_name: PostgreSQL table name.
        embedding_model: Embedding model identifier or instance
            (default: 'minishlab/potion-retrieval-32M').
        connection_string: Optional PostgreSQL connection string
            (defaults to the POSTGRES_URI env var).
        **kwargs: Additional arguments for psycopg2.
    """

    def __init__(
        self,
        table_name: str,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        connection_string: Optional[str] = None,
        # The annotation on **kwargs describes each value, not the mapping.
        **kwargs: Any
    ): ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """Write chunks to the PostgreSQL table with pgvector.

        Args:
            chunk: A single Chunk or a list of Chunks (must have embeddings).

        Returns:
            The PostgreSQL response.
        """
        ...

Usage example:
from chonkie import PgvectorHandshake
handshake = PgvectorHandshake(
table_name="documents",
connection_string="postgresql://user:pass@localhost/mydb"
)
# Use in pipeline
pipe = Pipeline().store_in("pgvector", table_name="docs")

Integration with Turbopuffer vector database.
from typing import Optional, Union, Any
class TurbopufferHandshake(BaseHandshake):
    """Integration with the Turbopuffer vector database.

    Args:
        namespace: Turbopuffer namespace.
        embedding_model: Embedding model identifier or instance
            (default: 'minishlab/potion-retrieval-32M').
        api_key: Optional API key (defaults to the TURBOPUFFER_API_KEY env var).
        **kwargs: Additional arguments for the Turbopuffer client.
    """

    def __init__(
        self,
        namespace: str,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        api_key: Optional[str] = None,
        # The annotation on **kwargs describes each value, not the mapping.
        **kwargs: Any
    ): ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """Write chunks to the Turbopuffer namespace.

        Args:
            chunk: A single Chunk or a list of Chunks (must have embeddings).

        Returns:
            The Turbopuffer response.
        """
        ...

Usage example:
from chonkie import TurbopufferHandshake
handshake = TurbopufferHandshake(namespace="documents")
# Use in pipeline
pipe = Pipeline().store_in("turbopuffer", namespace="docs")

All handshake integrations are available from the main package:
from chonkie import (
BaseHandshake,
ChromaHandshake,
QdrantHandshake,
PineconeHandshake,
WeaviateHandshake,
MilvusHandshake,
ElasticHandshake,
MongoDBHandshake,
PgvectorHandshake,
TurbopufferHandshake,
)

Handshakes are used in pipelines via the store_in() method:
from chonkie import Pipeline
pipe = (
Pipeline()
.chunk_with("recursive")
.refine_with("embeddings")
.store_in("chroma", collection_name="docs")
)

Handshake aliases:
chroma - ChromaHandshake
qdrant - QdrantHandshake
pinecone - PineconeHandshake
weaviate - WeaviateHandshake
milvus - MilvusHandshake
elastic - ElasticHandshake
mongodb - MongoDBHandshake
pgvector - PgvectorHandshake
turbopuffer - TurbopufferHandshake

All handshakes require chunks to have embeddings before writing to the database:
from chonkie import Pipeline
# Correct: Add embeddings before storing
pipe = (
Pipeline()
.chunk_with("token")
.refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
.store_in("chroma")
)
# Will fail: No embeddings added
pipe = (
Pipeline()
.chunk_with("token")
.store_in("chroma") # Error: chunks have no embeddings
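)

Each handshake accepts an embedding model parameter. If chunks don't have embeddings yet, the handshake will generate them. (Note: this appears to conflict with the preceding example, which says that storing chunks without embeddings fails; confirm the actual behavior against the library version you are using.)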
# Handshake generates embeddings if needed
handshake = ChromaHandshake(
collection_name="docs",
embedding_model="all-MiniLM-L6-v2"
)
# Or use a pre-configured embedding instance
from chonkie import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
handshake = QdrantHandshake(
collection_name="docs",
embedding_model=embeddings
)

You can store chunks in multiple databases in a single pipeline:
pipe = (
Pipeline()
.chunk_with("recursive")
.refine_with("embeddings")
.store_in("chroma", collection_name="docs")
.store_in("qdrant", collection_name="docs")
.store_in("pinecone", index_name="docs")
)

Create custom handshakes by extending BaseHandshake:
from chonkie import BaseHandshake, Chunk
from chonkie.pipeline import handshake
@handshake("custom")
class CustomHandshake(BaseHandshake):
    """Example handshake that writes chunks to a user-supplied database.

    Registered under the alias "custom" so it can be selected by name.
    """

    def __init__(self, db_config: dict):
        """Create the database connection.

        Args:
            db_config: Configuration passed to ``initialize_custom_db``.
        """
        super().__init__()
        # initialize_custom_db is a placeholder for your own connection factory.
        self.db = initialize_custom_db(db_config)

    def write(self, chunk: Union[Chunk, list[Chunk]]):
        """Insert one record per chunk into the custom database.

        Args:
            chunk: A single Chunk or a list of Chunks (must have embeddings).

        Raises:
            ValueError: If a chunk has no embedding.
        """
        # Normalise to a list so a single Chunk and a list share one code path.
        chunks = [chunk] if isinstance(chunk, Chunk) else chunk
        for c in chunks:
            # Fail with a clear message instead of an AttributeError on None.
            if getattr(c, "embedding", None) is None:
                raise ValueError(
                    "Chunk has no embedding; add embeddings before storing."
                )
            self.db.insert({
                "id": c.id,
                "text": c.text,
                "embedding": c.embedding.tolist()
            })
# Use in pipeline
pipe = Pipeline().store_in("custom", db_config={"host": "localhost"})