Chroma — the open-source embedding database

Quality: Pending — Does it follow best practices?
Impact: Pending — No eval scenarios have been run
ChromaDB provides a comprehensive library of embedding functions for generating vector embeddings from text, supporting major AI providers and embedding models. Embedding functions are pluggable components that convert text into numerical representations for vector similarity search.
ChromaDB includes a default ONNX-based embedding function that works out-of-the-box without requiring API keys.
class DefaultEmbeddingFunction:
    """Default ONNX-based embedding function using all-MiniLM-L6-v2 model."""

    def __call__(self, input: Documents) -> Embeddings:
        """Generate embeddings for input documents."""

    def embed_with_retries(self, input: Documents, **retry_kwargs) -> Embeddings:
        """Generate embeddings with retry logic."""

Usage Example:
import chromadb

# Uses DefaultEmbeddingFunction automatically
client = chromadb.EphemeralClient()
collection = client.create_collection("default_embeddings")

# Explicit usage
from chromadb.utils.embedding_functions import DefaultEmbeddingFunction
ef = DefaultEmbeddingFunction()
collection = client.create_collection("explicit_default", embedding_function=ef)

Generate embeddings using OpenAI's embedding models with API key authentication.
class OpenAIEmbeddingFunction:
    """OpenAI embedding function using text-embedding-ada-002 or newer models."""

    def __init__(
        self,
        api_key: str,
        model_name: str = "text-embedding-ada-002",
        api_base: Optional[str] = None,
        api_type: Optional[str] = None,
        api_version: Optional[str] = None,
        deployment_id: Optional[str] = None
    ):
        """
        Initialize OpenAI embedding function.

        Args:
            api_key: OpenAI API key
            model_name: Model to use for embeddings
            api_base: Custom API base URL
            api_type: API type (e.g., 'azure')
            api_version: API version for Azure
            deployment_id: Deployment ID for Azure
        """

Usage Example:
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

openai_ef = OpenAIEmbeddingFunction(
    api_key="your-openai-api-key",
    model_name="text-embedding-3-small"
)
collection = client.create_collection(
    "openai_embeddings",
    embedding_function=openai_ef
)

Generate embeddings using Cohere's embedding models with support for different model types.
class CohereEmbeddingFunction:
    """Cohere embedding function supporting various Cohere models."""

    def __init__(
        self,
        api_key: str,
        model_name: str = "embed-english-v2.0"
    ):
        """
        Initialize Cohere embedding function.

        Args:
            api_key: Cohere API key
            model_name: Cohere model to use for embeddings
        """

Generate embeddings using HuggingFace Transformers models with local or remote execution.
class HuggingFaceEmbeddingFunction:
    """HuggingFace Transformers embedding function."""

    def __init__(
        self,
        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
        device: str = "cpu",
        normalize_embeddings: bool = True
    ):
        """
        Initialize HuggingFace embedding function.

        Args:
            model_name: HuggingFace model identifier
            device: Device to run model on ('cpu' or 'cuda')
            normalize_embeddings: Whether to normalize embeddings
        """

Usage Example:
from chromadb.utils.embedding_functions import HuggingFaceEmbeddingFunction

hf_ef = HuggingFaceEmbeddingFunction(
    model_name="sentence-transformers/all-mpnet-base-v2",
    device="cuda" if torch.cuda.is_available() else "cpu"
)
collection = client.create_collection(
    "huggingface_embeddings",
    embedding_function=hf_ef
)

Specialized interface for Sentence Transformers models optimized for semantic similarity.
class SentenceTransformerEmbeddingFunction:
    """Sentence Transformers embedding function."""

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        device: str = "cpu",
        normalize_embeddings: bool = True
    ):
        """
        Initialize Sentence Transformers embedding function.

        Args:
            model_name: Sentence Transformers model name
            device: Device to run model on
            normalize_embeddings: Whether to normalize embeddings
        """

Generate embeddings using Google's AI models including PaLM and Vertex AI.
class GooglePalmEmbeddingFunction:
    """Google PaLM embedding function."""

    def __init__(self, api_key: str, model_name: str = "models/embedding-gecko-001"):
        """Initialize Google PaLM embedding function."""

class GoogleVertexEmbeddingFunction:
    """Google Vertex AI embedding function."""

    def __init__(
        self,
        project_id: str,
        region: str = "us-central1",
        model_name: str = "textembedding-gecko"
    ):
        """Initialize Google Vertex AI embedding function."""

ChromaDB includes many specialized embedding functions for specific use cases:
class OllamaEmbeddingFunction:
    """Ollama local embedding function."""

class JinaEmbeddingFunction:
    """Jina AI embedding function."""

class VoyageAIEmbeddingFunction:
    """Voyage AI embedding function."""

class InstructorEmbeddingFunction:
    """Instructor embedding function."""

class OpenCLIPEmbeddingFunction:
    """OpenCLIP embedding function for images and text."""

class AmazonBedrockEmbeddingFunction:
    """Amazon Bedrock embedding function."""

class MistralEmbeddingFunction:
    """Mistral AI embedding function."""

Create custom embedding functions by implementing the EmbeddingFunction protocol.
class EmbeddingFunction:
    """Protocol for embedding functions."""

    def __call__(self, input: Documents) -> Embeddings:
        """Generate embeddings for input documents."""

    def embed_with_retries(self, input: Documents, **retry_kwargs) -> Embeddings:
        """Generate embeddings with retry logic."""

    @staticmethod
    def name() -> str:
        """Return the name of the embedding function."""

    @staticmethod
    def build_from_config(config: Dict[str, Any]) -> "EmbeddingFunction":
        """Build embedding function from configuration."""

    def get_config(self) -> Dict[str, Any]:
        """Get configuration for the embedding function."""

    def default_space(self) -> str:
        """Return default distance metric ('cosine', 'l2', 'ip')."""

    def supported_spaces(self) -> List[str]:
        """Return list of supported distance metrics."""

Custom Implementation Example:
from chromadb.api.types import EmbeddingFunction, Documents, Embeddings
import requests

class CustomAPIEmbeddingFunction(EmbeddingFunction):
    def __init__(self, api_url: str, api_key: str):
        self.api_url = api_url
        self.api_key = api_key

    def __call__(self, input: Documents) -> Embeddings:
        response = requests.post(
            self.api_url,
            headers={"Authorization": f"Bearer {self.api_key}"},
            json={"texts": input}
        )
        return response.json()["embeddings"]

    def embed_with_retries(self, input: Documents, **retry_kwargs) -> Embeddings:
        # Implement retry logic
        return self.__call__(input)

# Use custom embedding function
custom_ef = CustomAPIEmbeddingFunction("https://api.example.com/embed", "your-key")
collection = client.create_collection("custom_embeddings", embedding_function=custom_ef)

Utility functions for working with embedding functions.
def register_embedding_function(ef_class: type) -> None:
    """Register a custom embedding function class."""

def config_to_embedding_function(config: Dict[str, Any]) -> EmbeddingFunction:
    """Create embedding function from configuration dictionary."""

known_embedding_functions: Dict[str, type] = {
    # Dictionary of all available embedding functions
}

Usage Example:
from chromadb.utils.embedding_functions import (
    config_to_embedding_function,
    known_embedding_functions
)

# Create from config
config = {
    "name": "OpenAIEmbeddingFunction",
    "api_key": "your-key",
    "model_name": "text-embedding-3-small"
}
ef = config_to_embedding_function(config)

# List available functions
print("Available embedding functions:")
for name in known_embedding_functions.keys():
    print(f" - {name}")

from typing import List, Dict, Any, Optional, Protocol
from abc import ABC, abstractmethod

Documents = List[str]
Embeddings = List[List[float]]

class EmbeddingFunction(Protocol):
    """Protocol that all embedding functions must implement."""

    def __call__(self, input: Documents) -> Embeddings: ...

    def embed_with_retries(self, input: Documents, **retry_kwargs) -> Embeddings: ...

    @staticmethod
    def name() -> str: ...

    @staticmethod
    def build_from_config(config: Dict[str, Any]) -> "EmbeddingFunction": ...

    def get_config(self) -> Dict[str, Any]: ...

    def default_space(self) -> str: ...

    def supported_spaces(self) -> List[str]: ...

Install with Tessl CLI
npx tessl i tessl/pypi-chromadb