Chroma - the open-source embedding database
npx @tessl/cli install tessl/pypi-chromadb@1.0.0

ChromaDB is an open-source embedding database designed as the fastest way to build Python or JavaScript LLM applications with memory. It provides a simple API for storing, querying, and managing document embeddings with automatic tokenization, embedding generation, and indexing capabilities.
pip install chromadb

import chromadb

For specific client types:
from chromadb import EphemeralClient, PersistentClient, HttpClient

For types and embedding functions:
from chromadb.api.types import Documents, Embeddings, Metadatas, Where, Include
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

import chromadb
# Create a client (in-memory for testing)
client = chromadb.EphemeralClient()
# Create a collection
collection = client.create_collection(name="my_collection")
# Add documents with metadata
collection.add(
documents=["This is a document", "This is another document"],
metadatas=[{"source": "my_source"}, {"source": "my_source"}],
ids=["id1", "id2"]
)
# Query for similar documents
results = collection.query(
query_texts=["This is a query document"],
n_results=2
)
print(results)

ChromaDB operates on a client-server architecture with multiple deployment options:
Factory functions for creating different types of ChromaDB clients depending on deployment needs. Supports in-memory for testing, persistent storage for local development, and remote connections for production.
def EphemeralClient(settings=None, tenant="default_tenant", database="default_database"): ...
def PersistentClient(path="./chroma", settings=None, tenant="default_tenant", database="default_database"): ...
def HttpClient(host="localhost", port=8000, ssl=False, headers=None, settings=None, tenant="default_tenant", database="default_database"): ...
def CloudClient(tenant=None, database=None, api_key=None, settings=None): ...

Core collection management including creation, retrieval, modification, and deletion. Collections are the primary containers for documents and embeddings with configurable metadata and embedding functions.
class ClientAPI:
def create_collection(self, name: str, **kwargs) -> Collection: ...
def get_collection(self, name: str, **kwargs) -> Collection: ...
def delete_collection(self, name: str) -> None: ...
def list_collections(self, limit=None, offset=None) -> Sequence[Collection]: ...

Adding, updating, querying, and deleting documents within collections. Supports embeddings, metadata, images, and URIs with flexible data formats and automatic embedding generation.
class Collection:
def add(self, ids, documents=None, embeddings=None, metadatas=None, images=None, uris=None): ...
def query(self, query_texts=None, query_embeddings=None, n_results=10, where=None, **kwargs): ...
def get(self, ids=None, where=None, limit=None, **kwargs): ...
def update(self, ids, documents=None, embeddings=None, metadatas=None, **kwargs): ...
def delete(self, ids=None, where=None, where_document=None): ...

Pre-built and configurable embedding functions for generating vector embeddings from text, supporting major AI providers and embedding models with consistent interfaces.
class EmbeddingFunction:
def __call__(self, input): ...
def embed_with_retries(self, input, **retry_kwargs): ...

Advanced query capabilities including vector similarity search, metadata filtering, and document text matching with logical operators and flexible result formatting.
Where = Dict[Union[str, LogicalOperator], Union[LiteralValue, OperatorExpression, List[Where]]]
WhereDocument = Dict[WhereDocumentOperator, Union[str, List[WhereDocument]]]
Include = List[Literal["documents", "embeddings", "metadatas", "distances", "uris", "data"]]

Comprehensive configuration system for customizing ChromaDB behavior including authentication, server settings, telemetry, and storage options.
class Settings:
def __init__(self, **kwargs): ...
def configure(**kwargs) -> None: ...
def get_settings() -> Settings: ...

# Document and metadata types
# A batch of documents, each given as a plain string.
Documents = List[str]
# A batch of metadata records: each is a flat dict with string keys and
# scalar (str / int / float / bool) values.
Metadatas = List[Dict[str, Union[str, int, float, bool]]]
# A batch of identifiers, each a string.
IDs = List[str]
# A batch of embeddings, each a list of floats.
Embeddings = List[List[float]]
# Query result types
class GetResult(TypedDict):
    """Shape of the value returned by ``Collection.get``.

    ``get`` fetches records directly (by id and/or filter) instead of ranking
    them against query inputs, so every field is a single flat list with one
    entry per returned record. There is no per-query nesting level and no
    ``distances`` field; both belong to ``QueryResult`` only.
    """

    ids: List[str]
    documents: List[Optional[str]]
    metadatas: List[Optional[Dict]]
    embeddings: List[Optional[List[float]]]
    uris: List[Optional[str]]
    data: List[Optional[Any]]
    # Names of the fields that were included in this result.
    included: List[str]
class QueryResult(TypedDict):
    """Shape of a query result: every data field is a list of lists.

    All data fields share the same two-level nesting (``List[List[...]]``);
    ``included`` is a flat list of strings.
    """

    ids: List[List[str]]
    documents: List[List[Optional[str]]]
    metadatas: List[List[Optional[Dict]]]
    embeddings: List[List[Optional[List[float]]]]
    distances: List[List[float]]
    uris: List[List[Optional[str]]]
    data: List[List[Optional[Any]]]
    included: List[str]