or run

tessl search
Log in

Version

Workspace
tessl
Visibility
Public
Created
Last updated
Describes
pypi: pkg:pypi/chonkie@1.5.x

docs

advanced-features.md, chunkers.md, core-types.md, data-processing.md, embeddings.md, export.md, index.md, logging.md, pipeline.md, refineries.md, tokenizers.md, vector-databases.md
tile.json

tessl/pypi-chonkie

tessl install tessl/pypi-chonkie@1.5.0

The lightweight ingestion library for fast, efficient and robust RAG pipelines

docs/vector-databases.md

Vector Database Integrations

Direct integration with 10+ vector databases for seamless chunk ingestion including ChromaDB, Qdrant, Pinecone, Weaviate, Milvus, Elasticsearch, MongoDB, pgvector, and Turbopuffer.

Capabilities

BaseHandshake

Abstract base class for all vector database integrations.

from abc import ABC, abstractmethod
from typing import Union, Any

class BaseHandshake(ABC):
    """
    Abstract base class for vector database integrations.

    Concrete handshakes subclass this and must implement ``write``, which is
    declared abstract. ``Chunk`` is importable from the main package
    (``from chonkie import Chunk``).
    """
    def __init__(self) -> None: ...

    @abstractmethod
    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """
        Writes chunks to the vector database.

        Must be overridden by every concrete handshake.

        Args:
            chunk: Single Chunk or list of Chunks

        Returns:
            Database-specific response
        """
        ...

    def __call__(self, chunks: Union[Chunk, list[Chunk]]) -> Any:
        """
        Allows calling a handshake instance as a function, e.g. ``handshake(chunks)``.

        NOTE(review): presumably delegates to ``write``; the body is not shown
        in this stub, so confirm against the library source.

        Args:
            chunks: Single Chunk or list of Chunks

        Returns:
            Database-specific response
        """
        ...

ChromaHandshake

Integration with ChromaDB vector database.

from typing import Optional, Union, Literal, Any

class ChromaHandshake(BaseHandshake):
    """
    Integration with ChromaDB vector database.

    Every constructor argument is optional.

    Args:
        client: Optional ChromaDB client instance
        collection_name: Collection name or 'random' for auto-generated (default: 'random')
        embedding_model: Embedding model identifier or instance (default: 'minishlab/potion-retrieval-32M')
        path: Optional path for persistent storage
    """
    def __init__(
        self,
        client: Optional[Any] = None,
        # Literal["random"] is already covered by str for a type checker; it is
        # spelled out here to document the special 'random' value.
        collection_name: Union[str, Literal["random"]] = "random",
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        path: Optional[str] = None
    ) -> None: ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """
        Writes chunks to ChromaDB collection.

        Args:
            chunk: Single Chunk or list of Chunks (must have embeddings)

        Returns:
            ChromaDB response
        """
        ...

Usage example:

from chonkie import ChromaHandshake, TokenChunker, EmbeddingsRefinery

# Create and embed chunks
chunker = TokenChunker(chunk_size=512)
chunks = chunker("Your text here...")

refinery = EmbeddingsRefinery(embedding_model="all-MiniLM-L6-v2")
chunks = refinery(chunks)

# Write to ChromaDB (path is the optional location for persistent storage)
handshake = ChromaHandshake(
    collection_name="my_documents",
    path="./chroma_db"
)
# Calling the handshake as a function is supported via BaseHandshake.__call__
handshake(chunks)

# Use in pipeline: "chroma" is the store_in() alias for ChromaHandshake
from chonkie import Pipeline

pipe = (
    Pipeline()
    .chunk_with("recursive")
    .refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
    .store_in("chroma", collection_name="docs", path="./db")
)

QdrantHandshake

Integration with Qdrant vector database.

from typing import Optional, Union, Literal, Any

class QdrantHandshake(BaseHandshake):
    """
    Integration with Qdrant vector database.

    Args:
        client: Optional QdrantClient instance
        collection_name: Collection name or 'random' for auto-generated (default: 'random')
        embedding_model: Embedding model identifier or instance (default: 'minishlab/potion-retrieval-32M')
        url: Optional Qdrant server URL
        path: Optional path for local storage
        api_key: Optional API key for Qdrant Cloud
        **kwargs: Additional keyword arguments for QdrantClient
    """
    def __init__(
        self,
        client: Optional[Any] = None,
        collection_name: Union[str, Literal["random"]] = "random",
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        url: Optional[str] = None,
        path: Optional[str] = None,
        api_key: Optional[str] = None,
        # The annotation on **kwargs types each individual value, not the
        # mapping as a whole, so arbitrary keyword values are spelled `Any`.
        **kwargs: Any
    ) -> None: ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """
        Writes chunks to Qdrant collection.

        Args:
            chunk: Single Chunk or list of Chunks (must have embeddings)

        Returns:
            Qdrant response
        """
        ...

Usage example:

from chonkie import QdrantHandshake

# Local Qdrant (path is the optional location for local storage)
handshake = QdrantHandshake(
    collection_name="documents",
    path="./qdrant_storage"
)

# Qdrant Cloud (alternative configuration; this rebinds `handshake`)
handshake = QdrantHandshake(
    collection_name="documents",
    url="https://your-cluster.qdrant.io",
    api_key="your-api-key"
)

# Use in pipeline: "qdrant" is the store_in() alias for QdrantHandshake
from chonkie import Pipeline

pipe = (
    Pipeline()
    .chunk_with("semantic")
    .store_in("qdrant", collection_name="docs", url="http://localhost:6333")
)

PineconeHandshake

Integration with Pinecone vector database.

from typing import Optional, Union, Any

class PineconeHandshake(BaseHandshake):
    """
    Integration with Pinecone vector database.

    Args:
        index_name: Name of the Pinecone index
        api_key: Optional API key (defaults to PINECONE_API_KEY env var)
        embedding_model: Embedding model identifier or instance (default: 'minishlab/potion-retrieval-32M')
        **kwargs: Additional keyword arguments for Pinecone client
    """
    def __init__(
        self,
        index_name: str,
        api_key: Optional[str] = None,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        # The annotation on **kwargs types each individual value, not the
        # mapping as a whole, so arbitrary keyword values are spelled `Any`.
        **kwargs: Any
    ) -> None: ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """
        Writes chunks to Pinecone index.

        Args:
            chunk: Single Chunk or list of Chunks (must have embeddings)

        Returns:
            Pinecone response
        """
        ...

Usage example:

from chonkie import PineconeHandshake

handshake = PineconeHandshake(
    index_name="my-index",
    api_key="your-pinecone-key"
)

# Use in pipeline: "pinecone" is the store_in() alias for PineconeHandshake
from chonkie import Pipeline

pipe = Pipeline().store_in("pinecone", index_name="docs")

WeaviateHandshake

Integration with Weaviate vector database.

from typing import Optional, Union, Any

class WeaviateHandshake(BaseHandshake):
    """
    Integration with Weaviate vector database.

    Args:
        client: Optional Weaviate client instance
        collection_name: Collection name (default: 'ChonkieChunks')
        embedding_model: Embedding model identifier or instance (default: 'minishlab/potion-retrieval-32M')
        **kwargs: Additional keyword arguments for Weaviate client
    """
    def __init__(
        self,
        client: Optional[Any] = None,
        collection_name: str = "ChonkieChunks",
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        # The annotation on **kwargs types each individual value, not the
        # mapping as a whole, so arbitrary keyword values are spelled `Any`.
        **kwargs: Any
    ) -> None: ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """
        Writes chunks to Weaviate collection.

        Args:
            chunk: Single Chunk or list of Chunks (must have embeddings)

        Returns:
            Weaviate response
        """
        ...

Usage example:

from chonkie import WeaviateHandshake

handshake = WeaviateHandshake(collection_name="Documents")

# Use in pipeline: "weaviate" is the store_in() alias for WeaviateHandshake
from chonkie import Pipeline

pipe = Pipeline().store_in("weaviate", collection_name="docs")

MilvusHandshake

Integration with Milvus vector database.

from typing import Optional, Union, Any

class MilvusHandshake(BaseHandshake):
    """
    Integration with Milvus vector database.

    Args:
        collection_name: Name of the Milvus collection
        embedding_model: Embedding model identifier or instance (default: 'minishlab/potion-retrieval-32M')
        uri: Optional Milvus server URI
        **kwargs: Additional keyword arguments for Milvus client
    """
    def __init__(
        self,
        collection_name: str,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        uri: Optional[str] = None,
        # The annotation on **kwargs types each individual value, not the
        # mapping as a whole, so arbitrary keyword values are spelled `Any`.
        **kwargs: Any
    ) -> None: ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """
        Writes chunks to Milvus collection.

        Args:
            chunk: Single Chunk or list of Chunks (must have embeddings)

        Returns:
            Milvus response
        """
        ...

Usage example:

from chonkie import MilvusHandshake

handshake = MilvusHandshake(
    collection_name="documents",
    uri="http://localhost:19530"
)

# Use in pipeline: "milvus" is the store_in() alias for MilvusHandshake
from chonkie import Pipeline

pipe = Pipeline().store_in("milvus", collection_name="docs")

ElasticHandshake

Integration with Elasticsearch.

from typing import Optional, Union, Any

class ElasticHandshake(BaseHandshake):
    """
    Integration with Elasticsearch.

    Args:
        index_name: Name of the Elasticsearch index
        embedding_model: Embedding model identifier or instance (default: 'minishlab/potion-retrieval-32M')
        es_client: Optional Elasticsearch client instance
        **kwargs: Additional keyword arguments for Elasticsearch client
    """
    def __init__(
        self,
        index_name: str,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        es_client: Optional[Any] = None,
        # The annotation on **kwargs types each individual value, not the
        # mapping as a whole, so arbitrary keyword values are spelled `Any`.
        **kwargs: Any
    ) -> None: ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """
        Writes chunks to Elasticsearch index.

        Args:
            chunk: Single Chunk or list of Chunks (must have embeddings)

        Returns:
            Elasticsearch response
        """
        ...

Usage example:

from chonkie import ElasticHandshake

handshake = ElasticHandshake(index_name="documents")

# Use in pipeline: "elastic" is the store_in() alias for ElasticHandshake
from chonkie import Pipeline

pipe = Pipeline().store_in("elastic", index_name="docs")

MongoDBHandshake

Integration with MongoDB Atlas Vector Search.

from typing import Optional, Union, Any

class MongoDBHandshake(BaseHandshake):
    """
    Integration with MongoDB Atlas Vector Search.

    Args:
        database_name: MongoDB database name
        collection_name: MongoDB collection name
        embedding_model: Embedding model identifier or instance (default: 'minishlab/potion-retrieval-32M')
        connection_string: Optional MongoDB connection string (defaults to MONGODB_URI env var)
        **kwargs: Additional keyword arguments for MongoDB client
    """
    def __init__(
        self,
        database_name: str,
        collection_name: str,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        connection_string: Optional[str] = None,
        # The annotation on **kwargs types each individual value, not the
        # mapping as a whole, so arbitrary keyword values are spelled `Any`.
        **kwargs: Any
    ) -> None: ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """
        Writes chunks to MongoDB collection.

        Args:
            chunk: Single Chunk or list of Chunks (must have embeddings)

        Returns:
            MongoDB response
        """
        ...

Usage example:

from chonkie import MongoDBHandshake

handshake = MongoDBHandshake(
    database_name="mydb",
    collection_name="documents"
)

# Use in pipeline: "mongodb" is the store_in() alias for MongoDBHandshake
from chonkie import Pipeline

pipe = Pipeline().store_in("mongodb", database_name="db", collection_name="docs")

PgvectorHandshake

Integration with PostgreSQL pgvector extension.

from typing import Optional, Union, Any

class PgvectorHandshake(BaseHandshake):
    """
    Integration with PostgreSQL pgvector extension.

    Args:
        table_name: PostgreSQL table name
        embedding_model: Embedding model identifier or instance (default: 'minishlab/potion-retrieval-32M')
        connection_string: Optional PostgreSQL connection string (defaults to POSTGRES_URI env var)
        **kwargs: Additional keyword arguments for psycopg2
    """
    def __init__(
        self,
        table_name: str,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        connection_string: Optional[str] = None,
        # The annotation on **kwargs types each individual value, not the
        # mapping as a whole, so arbitrary keyword values are spelled `Any`.
        **kwargs: Any
    ) -> None: ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """
        Writes chunks to PostgreSQL table with pgvector.

        Args:
            chunk: Single Chunk or list of Chunks (must have embeddings)

        Returns:
            PostgreSQL response
        """
        ...

Usage example:

from chonkie import PgvectorHandshake

handshake = PgvectorHandshake(
    table_name="documents",
    connection_string="postgresql://user:pass@localhost/mydb"
)

# Use in pipeline: "pgvector" is the store_in() alias for PgvectorHandshake
from chonkie import Pipeline

pipe = Pipeline().store_in("pgvector", table_name="docs")

TurbopufferHandshake

Integration with Turbopuffer vector database.

from typing import Optional, Union, Any

class TurbopufferHandshake(BaseHandshake):
    """
    Integration with Turbopuffer vector database.

    Args:
        namespace: Turbopuffer namespace
        embedding_model: Embedding model identifier or instance (default: 'minishlab/potion-retrieval-32M')
        api_key: Optional API key (defaults to TURBOPUFFER_API_KEY env var)
        **kwargs: Additional keyword arguments for Turbopuffer client
    """
    def __init__(
        self,
        namespace: str,
        embedding_model: Union[str, BaseEmbeddings] = "minishlab/potion-retrieval-32M",
        api_key: Optional[str] = None,
        # The annotation on **kwargs types each individual value, not the
        # mapping as a whole, so arbitrary keyword values are spelled `Any`.
        **kwargs: Any
    ) -> None: ...

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> Any:
        """
        Writes chunks to Turbopuffer namespace.

        Args:
            chunk: Single Chunk or list of Chunks (must have embeddings)

        Returns:
            Turbopuffer response
        """
        ...

Usage example:

from chonkie import TurbopufferHandshake

handshake = TurbopufferHandshake(namespace="documents")

# Use in pipeline: "turbopuffer" is the store_in() alias for TurbopufferHandshake
from chonkie import Pipeline

pipe = Pipeline().store_in("turbopuffer", namespace="docs")

Imports

All handshake integrations are available from the main package:

from chonkie import (
    BaseHandshake,
    ChromaHandshake,
    QdrantHandshake,
    PineconeHandshake,
    WeaviateHandshake,
    MilvusHandshake,
    ElasticHandshake,
    MongoDBHandshake,
    PgvectorHandshake,
    TurbopufferHandshake,
)

Pipeline Usage

Handshakes are used in pipelines via the store_in() method:

from chonkie import Pipeline

# Steps are chained in order: chunk, refine (add embeddings), then store.
# "chroma" is the store_in() alias for ChromaHandshake; the remaining keyword
# arguments (here collection_name) are that handshake's constructor parameters.
pipe = (
    Pipeline()
    .chunk_with("recursive")
    .refine_with("embeddings")
    .store_in("chroma", collection_name="docs")
)

Handshake aliases:

  • chroma - ChromaHandshake
  • qdrant - QdrantHandshake
  • pinecone - PineconeHandshake
  • weaviate - WeaviateHandshake
  • milvus - MilvusHandshake
  • elastic - ElasticHandshake
  • mongodb - MongoDBHandshake
  • pgvector - PgvectorHandshake
  • turbopuffer - TurbopufferHandshake

Important Notes

Embeddings Required

All handshakes require chunks to have embeddings before writing to the database:

from chonkie import Pipeline

# Correct: Add embeddings before storing
pipe = (
    Pipeline()
    .chunk_with("token")
    .refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
    .store_in("chroma")
)

# Will fail: No embeddings added
# (this rebinds `pipe`; the two pipelines are shown side by side for contrast only)
pipe = (
    Pipeline()
    .chunk_with("token")
    .store_in("chroma")  # Error: chunks have no embeddings
)

Custom Embedding Models

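Each handshake accepts an embedding_model parameter, given either as a model identifier string or as a pre-configured embeddings instance. If chunks don't have embeddings yet, the handshake will generate them. (Note: this conflicts with the "Embeddings Required" section above, which states that storing chunks without embeddings fails; confirm the actual behavior against the chonkie version you have installed.)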

from chonkie import ChromaHandshake, QdrantHandshake

# Handshake generates embeddings if needed
handshake = ChromaHandshake(
    collection_name="docs",
    embedding_model="all-MiniLM-L6-v2"
)

# Or use a pre-configured embedding instance
from chonkie import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
handshake = QdrantHandshake(
    collection_name="docs",
    embedding_model=embeddings
)

Multiple Destinations

You can store chunks in multiple databases in a single pipeline:

from chonkie import Pipeline

# Each store_in() call adds one more destination to the same pipeline.
pipe = (
    Pipeline()
    .chunk_with("recursive")
    .refine_with("embeddings")
    .store_in("chroma", collection_name="docs")
    .store_in("qdrant", collection_name="docs")
    .store_in("pinecone", index_name="docs")
)

Custom Handshakes

Create custom handshakes by extending BaseHandshake:

from typing import Union

from chonkie import BaseHandshake, Chunk, Pipeline
from chonkie.pipeline import handshake

# Registers the class under the alias "custom" so that it can be selected with
# Pipeline().store_in("custom", ...), as shown at the bottom of this example.
@handshake("custom")
class CustomHandshake(BaseHandshake):
    """Example handshake that inserts each chunk into a user-provided database."""

    def __init__(self, db_config: dict) -> None:
        """
        Args:
            db_config: Configuration passed to initialize_custom_db
        """
        super().__init__()
        # initialize_custom_db is a placeholder for your own database setup
        # code; it is not defined or imported anywhere in this example.
        self.db = initialize_custom_db(db_config)

    def write(self, chunk: Union[Chunk, list[Chunk]]) -> None:
        """
        Inserts one record per chunk into the custom database.

        Args:
            chunk: Single Chunk or list of Chunks
        """
        # Normalize to a list so that a single Chunk and a list share one code path.
        chunks = [chunk] if isinstance(chunk, Chunk) else chunk
        for c in chunks:
            self.db.insert({
                "id": c.id,
                "text": c.text,
                # assumes c.embedding exposes .tolist() (e.g. a NumPy array) - TODO confirm
                "embedding": c.embedding.tolist()
            })

# Use in pipeline
pipe = Pipeline().store_in("custom", db_config={"host": "localhost"})