tessl install tessl/pypi-chonkie@1.5.0

The lightweight ingestion library for fast, efficient and robust RAG pipelines
Integration with 9+ embedding providers including OpenAI, Azure OpenAI, Cohere, Gemini, Jina, Voyage AI, sentence-transformers, model2vec, and LiteLLM for unified access to 100+ models.
Abstract base class for all embedding implementations.
from abc import ABC, abstractmethod
import numpy as np
class BaseEmbeddings(ABC):
"""
Base class for all embedding implementations.
"""
def __init__(self): ...
@abstractmethod
def embed(self, text: str) -> np.ndarray:
"""
Generates embedding for a single text.
Args:
text: Input text
Returns:
Embedding vector as numpy array
"""
...
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
"""
Generates embeddings for multiple texts.
Args:
texts: List of input texts
Returns:
List of embedding vectors as numpy arrays
"""
...
def similarity(self, u: np.ndarray, v: np.ndarray) -> np.float32:
"""
Computes cosine similarity between two embeddings.
Args:
u: First embedding vector
v: Second embedding vector
Returns:
Cosine similarity score (-1.0 to 1.0)
"""
...

Embeddings using the sentence-transformers library for local embedding generation.
from typing import Any, Union
class SentenceTransformerEmbeddings(BaseEmbeddings):
"""
Embeddings using sentence-transformers library.
Args:
model: Model identifier or SentenceTransformer instance (default: 'all-MiniLM-L6-v2')
**kwargs: Additional arguments passed to SentenceTransformer
"""
def __init__(
self,
model: Union[str, SentenceTransformer] = "all-MiniLM-L6-v2",
**kwargs: Any
): ...
def embed(self, text: str) -> np.ndarray:
"""
Generates embedding for text using sentence-transformers.
Args:
text: Input text
Returns:
Embedding vector
"""
...
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
"""
Batch generates embeddings.
Args:
texts: List of input texts
Returns:
List of embedding vectors
"""
...
def get_tokenizer(self):
"""
Returns the model's tokenizer.
Returns:
Tokenizer instance from the model
"""
...
def embed_as_tokens(self, text: str) -> np.ndarray:
"""
Embed text as tokens using the sentence-transformers model.
Useful for getting token embeddings even for texts longer than max sequence length.
Args:
text: Input text to embed
Returns:
Token embeddings as numpy array
"""
...
def embed_as_tokens_batch(self, texts: list[str]) -> list[np.ndarray]:
"""
Embed multiple texts as tokens.
Args:
texts: List of input texts
Returns:
List of token embedding arrays
"""
...
def count_tokens(self, text: str) -> int:
"""
Count tokens in text using the model's tokenizer.
Args:
text: Input text
Returns:
Number of tokens
"""
...
def count_tokens_batch(self, texts: list[str]) -> list[int]:
"""
Count tokens in multiple texts.
Args:
texts: List of input texts
Returns:
List of token counts
"""
...
def similarity(self, u: np.ndarray, v: np.ndarray) -> np.float32:
"""
Compute cosine similarity between two embeddings.
Args:
u: First embedding vector
v: Second embedding vector
Returns:
Cosine similarity score
"""
...
@property
def dimension(self) -> int:
"""
Returns the embedding dimension.
Returns:
Embedding vector dimension
"""
...
@property
def max_seq_length(self) -> int:
"""
Returns the maximum sequence length.
Returns:
Maximum sequence length supported by the model
"""
...

Usage example:
from chonkie import SentenceTransformerEmbeddings
# Create embeddings (requires sentence-transformers)
embeddings = SentenceTransformerEmbeddings(model="all-MiniLM-L6-v2")
# Single text
vector = embeddings.embed("Hello world")
print(vector.shape) # (384,)
# Batch
vectors = embeddings.embed_batch(["Text 1", "Text 2", "Text 3"])
# Token-level embeddings for long texts
token_embeddings = embeddings.embed_as_tokens("Very long text that exceeds max sequence length...")
print(token_embeddings.shape) # (num_tokens, 384)
# Count tokens
token_count = embeddings.count_tokens("Hello world")
print(token_count) # 2
# Compute similarity
similarity = embeddings.similarity(vector, vectors[0])
print(similarity) # 0.95
# Access properties
print(embeddings.dimension) # 384
print(embeddings.max_seq_length) # 256 for all-MiniLM-L6-v2

Embeddings using OpenAI's embedding API.
from typing import Optional, Any
class OpenAIEmbeddings(BaseEmbeddings):
"""
Embeddings using OpenAI's API.
Args:
model: Model identifier (default: varies by version)
tokenizer: Optional tokenizer for the model
dimension: Optional output dimension for supported models
max_tokens: Optional maximum tokens per request
base_url: Optional custom API base URL
api_key: Optional API key (defaults to OPENAI_API_KEY env var)
max_retries: Maximum number of retries (default: 3)
timeout: Request timeout in seconds (default: 60.0)
batch_size: Batch size for API requests (default: 128)
**kwargs: Additional arguments for the OpenAI client
"""
def __init__(
self,
model: str = ...,
tokenizer: Optional[Any] = None,
dimension: Optional[int] = None,
max_tokens: Optional[int] = None,
base_url: Optional[str] = None,
api_key: Optional[str] = None,
max_retries: int = 3,
timeout: float = 60.0,
batch_size: int = 128,
**kwargs: dict[str, Any]
): ...
def embed(self, text: str) -> np.ndarray:
"""
Generates OpenAI embedding for text.
Args:
text: Input text
Returns:
Embedding vector
"""
...
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
"""
Batch generates embeddings with automatic batching.
Args:
texts: List of input texts
Returns:
List of embedding vectors
"""
...

Usage example:
from chonkie import OpenAIEmbeddings
# Using environment variable OPENAI_API_KEY
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# With explicit API key
embeddings = OpenAIEmbeddings(
model="text-embedding-3-large",
api_key="your-api-key",
dimension=1024 # Optional dimension reduction
)
vector = embeddings.embed("Hello world")

Embeddings using Azure OpenAI service.
from typing import Optional, Any
class AzureOpenAIEmbeddings(BaseEmbeddings):
"""
Embeddings using Azure OpenAI service.
Args:
model: Azure deployment name (default: 'text-embedding-3-small')
azure_endpoint: Optional Azure endpoint URL (defaults to AZURE_OPENAI_ENDPOINT env var)
tokenizer: Optional tokenizer override
dimension: Optional output dimension
azure_api_key: Optional Azure API key (defaults to AZURE_OPENAI_API_KEY env var)
api_version: API version (default: '2024-10-21')
deployment: Optional deployment name (can differ from model)
max_retries: Maximum number of retries (default: 3)
timeout: Request timeout in seconds (default: 60.0)
batch_size: Batch size for API requests (default: 128)
**kwargs: Additional arguments for the Azure OpenAI client
"""
def __init__(
self,
model: str = "text-embedding-3-small",
azure_endpoint: Optional[str] = None,
tokenizer: Optional[Any] = None,
dimension: Optional[int] = None,
azure_api_key: Optional[str] = None,
api_version: str = "2024-10-21",
deployment: Optional[str] = None,
max_retries: int = 3,
timeout: float = 60.0,
batch_size: int = 128,
**kwargs: dict[str, Any]
): ...
def embed(self, text: str) -> np.ndarray:
"""
Generates embedding via Azure OpenAI.
Args:
text: Input text
Returns:
Embedding vector
"""
...
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
"""
Batch generates embeddings.
Args:
texts: List of input texts
Returns:
List of embedding vectors
"""
...

Usage example:
from chonkie import AzureOpenAIEmbeddings
embeddings = AzureOpenAIEmbeddings(
model="your-deployment-name",
azure_api_key="your-azure-key",
azure_endpoint="https://your-resource.openai.azure.com/"
)
vector = embeddings.embed("Hello world")

Embeddings using Cohere's embedding API.
from typing import Optional, Any
class CohereEmbeddings(BaseEmbeddings):
"""
Embeddings using Cohere's API.
Args:
model: Model identifier (default: 'embed-english-light-v3.0')
api_key: Optional API key (defaults to COHERE_API_KEY env var)
client_name: Optional client name for API requests
max_retries: Maximum number of retries for failed requests (default: 3)
timeout: Timeout in seconds for API requests (default: 60.0)
batch_size: Maximum number of texts to embed in one API call, max 96 (default: 96)
show_warnings: Whether to show warnings about token usage and truncation (default: True)
"""
def __init__(
self,
model: str = "embed-english-light-v3.0",
api_key: Optional[str] = None,
client_name: Optional[str] = None,
max_retries: int = 3,
timeout: float = 60.0,
batch_size: int = 96,
show_warnings: bool = True
): ...
def embed(self, text: str) -> np.ndarray:
"""
Generates Cohere embedding.
Args:
text: Input text
Returns:
Embedding vector
"""
...
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
"""
Batch generates embeddings.
Args:
texts: List of input texts
Returns:
List of embedding vectors
"""
...

Usage example:
from chonkie import CohereEmbeddings
embeddings = CohereEmbeddings(
model="embed-english-v3.0",
api_key="your-cohere-key",
# Note: input_type is not a parameter of the constructor documented above, so it is not passed here
)
vector = embeddings.embed("Hello world")

Embeddings using Google's Gemini API.
from typing import Optional, Any
class GeminiEmbeddings(BaseEmbeddings):
"""
Embeddings using Google's Gemini API.
Args:
model: Model identifier (default: 'models/text-embedding-004')
api_key: Optional API key (defaults to GOOGLE_API_KEY env var)
**kwargs: Additional arguments for the Gemini client
"""
def __init__(
self,
model: str = "models/text-embedding-004",
api_key: Optional[str] = None,
**kwargs: dict[str, Any]
): ...
def embed(self, text: str) -> np.ndarray:
"""
Generates Gemini embedding.
Args:
text: Input text
Returns:
Embedding vector
"""
...
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
"""
Batch generates embeddings.
Args:
texts: List of input texts
Returns:
List of embedding vectors
"""
...

Usage example:
from chonkie import GeminiEmbeddings
embeddings = GeminiEmbeddings(
model="models/text-embedding-004",
api_key="your-google-key"
)
vector = embeddings.embed("Hello world")

Embeddings using Jina AI's embedding API.
from typing import Optional, Any
class JinaEmbeddings(BaseEmbeddings):
"""
Embeddings using Jina AI's API.
Args:
model: Model identifier (default: 'jina-embeddings-v3')
api_key: Optional API key (defaults to JINA_API_KEY env var)
**kwargs: Additional arguments for the Jina client
"""
def __init__(
self,
model: str = "jina-embeddings-v3",
api_key: Optional[str] = None,
**kwargs: dict[str, Any]
): ...
def embed(self, text: str) -> np.ndarray:
"""
Generates Jina embedding.
Args:
text: Input text
Returns:
Embedding vector
"""
...
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
"""
Batch generates embeddings.
Args:
texts: List of input texts
Returns:
List of embedding vectors
"""
...

Usage example:
from chonkie import JinaEmbeddings
embeddings = JinaEmbeddings(
model="jina-embeddings-v3",
api_key="your-jina-key"
)
vector = embeddings.embed("Hello world")

Embeddings using Voyage AI's embedding API.
Note: VoyageAI models are registered to use CatsuEmbeddings as the underlying implementation for better reliability and features. When you use VoyageAIEmbeddings or AutoEmbeddings with a VoyageAI model, it automatically routes through CatsuEmbeddings.
from typing import Optional, Any
class VoyageAIEmbeddings(BaseEmbeddings):
"""
Embeddings using Voyage AI's API (routes through CatsuEmbeddings).
Args:
model: Model identifier (default: 'voyage-3')
api_key: Optional API key (defaults to VOYAGEAI_API_KEY env var)
**kwargs: Additional arguments for the Voyage AI client
"""
def __init__(
self,
model: str = "voyage-3",
api_key: Optional[str] = None,
**kwargs: dict[str, Any]
): ...
def embed(self, text: str) -> np.ndarray:
"""
Generates Voyage AI embedding.
Args:
text: Input text
Returns:
Embedding vector
"""
...
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
"""
Batch generates embeddings.
Args:
texts: List of input texts
Returns:
List of embedding vectors
"""
...

Usage example:
from chonkie import VoyageAIEmbeddings
embeddings = VoyageAIEmbeddings(
model="voyage-3",
api_key="your-voyage-key"
)
vector = embeddings.embed("Hello world")

Lightweight embeddings using the model2vec library for fast, distilled models.
from typing import Union, Any
class Model2VecEmbeddings(BaseEmbeddings):
"""
Lightweight embeddings using model2vec library.
Args:
model: Model identifier or instance (default: 'minishlab/potion-base-32M')
**kwargs: Additional arguments for model2vec
"""
def __init__(
self,
model: Union[str, Any] = "minishlab/potion-base-32M",
**kwargs: Any
): ...
def embed(self, text: str) -> np.ndarray:
"""
Generates embedding using model2vec.
Args:
text: Input text
Returns:
Embedding vector
"""
...
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
"""
Batch generates embeddings.
Args:
texts: List of input texts
Returns:
List of embedding vectors
"""
...

Usage example:
from chonkie import Model2VecEmbeddings
# Lightweight, fast embeddings (requires model2vec)
embeddings = Model2VecEmbeddings(model="minishlab/potion-base-32M")
vector = embeddings.embed("Hello world")

Unified embeddings interface using LiteLLM for access to 100+ embedding models across providers.
from typing import Optional, Any
class LiteLLMEmbeddings(BaseEmbeddings):
"""
Unified embeddings interface using LiteLLM.
Args:
model: Model identifier in LiteLLM format (default: 'text-embedding-3-small')
Examples: 'openai/text-embedding-3-small', 'cohere/embed-english-v3.0'
api_key: Optional API key for the provider
**kwargs: Additional arguments for LiteLLM
"""
def __init__(
self,
model: str = "text-embedding-3-small",
api_key: Optional[str] = None,
**kwargs: dict[str, Any]
): ...
def embed(self, text: str) -> np.ndarray:
"""
Generates embedding via LiteLLM.
Args:
text: Input text
Returns:
Embedding vector
"""
...
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
"""
Batch generates embeddings.
Args:
texts: List of input texts
Returns:
List of embedding vectors
"""
...

Usage example:
from chonkie import LiteLLMEmbeddings
# Access any embedding provider through LiteLLM
embeddings = LiteLLMEmbeddings(
model="openai/text-embedding-3-small",
api_key="your-api-key"
)
# Or use different providers
cohere_embeddings = LiteLLMEmbeddings(
model="cohere/embed-english-v3.0",
api_key="your-cohere-key"
)
vector = embeddings.embed("Hello world")

Unified embeddings adapter supporting 11+ providers including VoyageAI, Mistral, Nomic, Cloudflare, MixedBread, DeepInfra, and TogetherAI. Provides a consistent interface across multiple embedding services with automatic provider detection and routing.
from typing import Optional, Any, Dict
class CatsuEmbeddings(BaseEmbeddings):
"""
Unified embeddings adapter for multiple providers.
Supports providers: VoyageAI, Mistral, Nomic, Cloudflare, MixedBread, DeepInfra, TogetherAI, and more.
Args:
model: Model identifier (e.g., 'voyage-3-large', 'mistral-embed')
provider: Optional explicit provider name (auto-detected if not specified)
api_keys: Optional dict of API keys by provider name (e.g., {'voyageai': 'key1', 'mistral': 'key2'})
max_retries: Maximum retry attempts for failed requests (default: 3)
timeout: Request timeout in seconds (default: 30)
verbose: Enable verbose logging (default: False)
batch_size: Batch size for API requests (default: 128)
**kwargs: Additional provider-specific arguments
"""
def __init__(
self,
model: str,
provider: Optional[str] = None,
api_keys: Optional[Dict[str, str]] = None,
max_retries: int = 3,
timeout: int = 30,
verbose: bool = False,
batch_size: int = 128,
**kwargs: Dict[str, Any]
): ...
def embed(self, text: str) -> np.ndarray:
"""
Generates Catsu embedding.
Args:
text: Input text
Returns:
Embedding vector
"""
...
def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
"""
Batch generates embeddings.
Args:
texts: List of input texts
Returns:
List of embedding vectors
"""
...

Factory class for automatically loading embeddings based on identifier strings.
from typing import Union, Any
class AutoEmbeddings:
"""
Factory for automatically loading embedding providers.
"""
@staticmethod
def get_embeddings(
model: Union[str, BaseEmbeddings, Any],
**kwargs: Any
) -> BaseEmbeddings:
"""
Auto-loads embeddings based on model identifier or returns existing instance.
Args:
model: Model identifier string, BaseEmbeddings instance, or provider-specific object
**kwargs: Additional arguments for the embedding provider
Returns:
BaseEmbeddings instance
Supported prefixes:
- 'openai/': OpenAIEmbeddings
- 'azure-openai/': AzureOpenAIEmbeddings
- 'cohere/': CohereEmbeddings
- 'gemini/': GeminiEmbeddings
- 'jina/': JinaEmbeddings
- 'voyageai/': VoyageAIEmbeddings
- 'model2vec/': Model2VecEmbeddings
- 'sentence-transformers/': SentenceTransformerEmbeddings
- Default: SentenceTransformerEmbeddings
"""
...

Usage example:
from chonkie import AutoEmbeddings
# Auto-detect provider from identifier
embeddings = AutoEmbeddings.get_embeddings("openai/text-embedding-3-small")
embeddings = AutoEmbeddings.get_embeddings("cohere/embed-english-v3.0")
embeddings = AutoEmbeddings.get_embeddings("sentence-transformers/all-MiniLM-L6-v2")
# Pass through existing instance
from chonkie import OpenAIEmbeddings
existing = OpenAIEmbeddings()
same = AutoEmbeddings.get_embeddings(existing) # Returns same instance

Registry for managing embedding provider mappings.
Note: EmbeddingsRegistry is not exported from the main chonkie module. Import it from the embeddings submodule:
from chonkie.embeddings import EmbeddingsRegistry

class EmbeddingsRegistry:
"""
Registry for managing embedding provider mappings.
"""
@staticmethod
def register_provider(name: str, cls: type) -> None:
"""
Registers an embedding provider.
Args:
name: Provider name/identifier
cls: Provider class
"""
...
@staticmethod
def get_provider(name: str) -> Optional[type]:
"""
Retrieves a provider class by name.
Args:
name: Provider name
Returns:
Provider class or None if not found
"""
...
@staticmethod
def list_providers() -> list[str]:
"""
Lists all registered provider names.
Returns:
List of provider names
"""
...

All embedding classes are available from the main package:
from chonkie import (
BaseEmbeddings,
AutoEmbeddings,
SentenceTransformerEmbeddings,
OpenAIEmbeddings,
AzureOpenAIEmbeddings,
CohereEmbeddings,
GeminiEmbeddings,
JinaEmbeddings,
VoyageAIEmbeddings,
Model2VecEmbeddings,
LiteLLMEmbeddings,
)

Embeddings are used with semantic chunkers and refineries:
from chonkie import SemanticChunker, AutoEmbeddings
# Direct usage with chunker
chunker = SemanticChunker(embedding_model="all-MiniLM-L6-v2")
# Using AutoEmbeddings
embeddings = AutoEmbeddings.get_embeddings("openai/text-embedding-3-small")
chunker = SemanticChunker(embedding_model=embeddings)
# With EmbeddingsRefinery
from chonkie import EmbeddingsRefinery, TokenChunker
chunker = TokenChunker()
refinery = EmbeddingsRefinery(embedding_model="all-MiniLM-L6-v2")
chunks = chunker("Text here...")
chunks_with_embeddings = refinery(chunks)