Chroma - the open-source embedding database
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Document operations form the core of ChromaDB's functionality, enabling storage, retrieval, updating, and deletion of documents with embeddings, metadata, and associated data. All operations support batching for efficient processing.
Add documents to a collection with automatic or manual embedding generation, supporting text, images, URIs, and metadata.
def add(
ids: IDs,
documents: Optional[Documents] = None,
embeddings: Optional[Embeddings] = None,
metadatas: Optional[Metadatas] = None,
images: Optional[Images] = None,
uris: Optional[URIs] = None
) -> None:
"""
Add documents to the collection.
Args:
ids: List of unique document identifiers
documents: List of document text content
embeddings: List of embedding vectors (generated if not provided)
metadatas: List of metadata dictionaries for each document
images: List of image arrays
uris: List of URIs pointing to external resources
Raises:
ValueError: If document IDs already exist or invalid data provided
"""Usage Example:
import chromadb
client = chromadb.EphemeralClient()
collection = client.create_collection("documents")
# Add documents with automatic embedding generation
collection.add(
documents=["This is the first document", "This is the second document"],
metadatas=[{"source": "web", "type": "article"}, {"source": "book", "type": "chapter"}],
ids=["doc1", "doc2"]
)
# Add with custom embeddings
collection.add(
documents=["Custom embedding document"],
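embeddings=[[0.1, 0.2, 0.3, 0.4]], # Your pre-computed embedding (illustrative; its length must match the dimensionality of the embeddings already stored in the collection)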
metadatas=[{"custom": True}],
ids=["doc3"]
)

Perform vector similarity search to find documents similar to query text, embeddings, or images, with filtering and ranking.
def query(
query_texts: Optional[Documents] = None,
query_embeddings: Optional[Embeddings] = None,
query_images: Optional[Images] = None,
query_uris: Optional[URIs] = None,
ids: Optional[IDs] = None,
n_results: int = 10,
where: Optional[Where] = None,
where_document: Optional[WhereDocument] = None,
include: Include = ["metadatas", "documents", "distances"]
) -> QueryResult:
"""
Query the collection for similar documents.
Args:
query_texts: List of text queries to find similar documents
query_embeddings: List of embedding vectors to search with
query_images: List of image arrays to search with
query_uris: List of URIs to load and search with
ids: Specific document IDs to search within
n_results: Number of results to return per query
where: Metadata filter conditions
where_document: Document text filter conditions
include: Fields to include in results
Returns:
QueryResult: Search results with documents, distances, and metadata
"""Usage Example:
# Query with text
results = collection.query(
query_texts=["Find documents about machine learning"],
n_results=5,
where={"source": "web"},
include=["documents", "metadatas", "distances"]
)
# Query with embeddings
results = collection.query(
query_embeddings=[[0.1, 0.2, 0.3, 0.4]],
n_results=3,
where={"type": {"$in": ["article", "paper"]}}
)
print(f"Found {len(results['ids'][0])} similar documents")
for i, doc in enumerate(results['documents'][0]):
print(f"Distance: {results['distances'][0][i]:.3f}, Doc: {doc[:100]}...")Retrieve specific documents by ID or filter criteria without similarity ranking.
def get(
ids: Optional[IDs] = None,
where: Optional[Where] = None,
limit: Optional[int] = None,
offset: Optional[int] = None,
where_document: Optional[WhereDocument] = None,
include: Include = ["metadatas", "documents"]
) -> GetResult:
"""
Get documents from the collection.
Args:
ids: Specific document IDs to retrieve
where: Metadata filter conditions
limit: Maximum number of documents to return
offset: Number of documents to skip
where_document: Document text filter conditions
include: Fields to include in results
Returns:
GetResult: Retrieved documents with requested fields
"""Usage Example:
# Get specific documents by ID
docs = collection.get(
ids=["doc1", "doc2"],
include=["documents", "metadatas"]
)
# Get documents with metadata filtering
docs = collection.get(
where={"source": "web"},
limit=10,
include=["documents", "metadatas", "embeddings"]
)
# Get all documents (paginated)
all_docs = collection.get(limit=100, offset=0)

Quickly preview the first few documents in a collection for inspection.
def peek(self, limit: int = 10) -> GetResult:
"""
Peek at the first few documents in the collection.
Args:
limit: Number of documents to return
Returns:
GetResult: First documents in the collection
"""Usage Example:
# Preview first 5 documents
preview = collection.peek(limit=5)
print(f"Collection contains {len(preview['ids'])} documents (showing first 5)")Modify existing documents, embeddings, or metadata while preserving document IDs.
def update(
ids: IDs,
documents: Optional[Documents] = None,
embeddings: Optional[Embeddings] = None,
metadatas: Optional[Metadatas] = None,
images: Optional[Images] = None,
uris: Optional[URIs] = None
) -> None:
"""
Update existing documents in the collection.
Args:
ids: List of document IDs to update
documents: New document text content
embeddings: New embedding vectors (regenerated if not provided)
metadatas: New metadata dictionaries
images: New image arrays
uris: New URIs
Raises:
ValueError: If document IDs do not exist
"""Usage Example:
# Update document text (embeddings will be regenerated)
collection.update(
ids=["doc1"],
documents=["This is the updated first document"],
metadatas=[{"source": "web", "type": "article", "updated": True}]
)
# Update only metadata
collection.update(
ids=["doc2"],
metadatas=[{"source": "book", "type": "chapter", "reviewed": True}]
)

Insert new documents or update existing ones in a single operation, which is convenient for data synchronization.
def upsert(
ids: IDs,
documents: Optional[Documents] = None,
embeddings: Optional[Embeddings] = None,
metadatas: Optional[Metadatas] = None,
images: Optional[Images] = None,
uris: Optional[URIs] = None
) -> None:
"""
Insert new documents or update existing ones.
Args:
ids: List of document IDs to upsert
documents: Document text content
embeddings: Embedding vectors (generated if not provided)
metadatas: Metadata dictionaries
images: Image arrays
uris: URIs
"""Usage Example:
# Upsert documents (creates new or updates existing)
collection.upsert(
documents=["New document", "Updated existing document"],
metadatas=[{"source": "api"}, {"source": "user", "updated": True}],
ids=["new_doc", "existing_doc"]
)

Remove documents from the collection by ID or filter criteria.
def delete(
ids: Optional[IDs] = None,
where: Optional[Where] = None,
where_document: Optional[WhereDocument] = None
) -> None:
"""
Delete documents from the collection.
Args:
ids: Specific document IDs to delete
where: Metadata filter conditions for deletion
where_document: Document text filter conditions for deletion
Note: If no arguments provided, deletes all documents in collection
"""Usage Example:
# Delete specific documents
collection.delete(ids=["doc1", "doc2"])
# Delete documents matching metadata criteria
collection.delete(where={"source": "temporary"})
# Delete documents matching text criteria
collection.delete(where_document={"$contains": "delete_me"})
# Delete all documents (use with caution)
collection.delete()

from typing import List, Dict, Optional, Union, Any, Literal
from numpy.typing import NDArray
# Basic document types
# Each plural alias is the list form passed to the batched collection operations.
ID = str  # Unique document identifier
IDs = List[ID]
Document = str  # Document text content
Documents = List[Document]
URI = str  # URI pointing to an external resource
URIs = List[URI]
Image = NDArray[Any] # Image array
Images = List[Image]
# Embedding types
Embedding = List[float]  # A single embedding vector
Embeddings = List[Embedding]
# Metadata types
# One flat dictionary per document; values are scalar (str, int, float, bool) or None.
Metadata = Dict[str, Union[str, int, float, bool, None]]
Metadatas = List[Metadata]
# Query filter types
# Where: metadata filter conditions, keyed by field name or by the "$and" / "$or" operators.
Where = Dict[Union[str, Literal["$and", "$or"]], Any]
# WhereDocument: document text filter conditions using "$contains" / "$not_contains".
WhereDocument = Dict[Literal["$contains", "$not_contains"], Union[str, List[Any]]]
# Include fields specification
# Names the fields an operation should return (the `include` argument of query/get).
Include = List[Literal["documents", "embeddings", "metadatas", "distances", "uris", "data"]]
# Result types
GetResult = Dict[str, List[Any]] # Contains ids, documents, metadatas, embeddings, etc.
QueryResult = Dict[str, List[Any]] # Contains ids, documents, metadatas, embeddings, distances, etc.

Install with Tessl CLI
npx tessl i tessl/pypi-chromadb