Chroma - the open-source embedding database
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
ChromaDB provides powerful query capabilities including vector similarity search, metadata filtering, and document text matching. The query system supports complex logical operations and flexible result formatting.
Find documents similar to query text, embeddings, or images using vector similarity metrics.
def query(
query_texts: Optional[Documents] = None,
query_embeddings: Optional[Embeddings] = None,
query_images: Optional[Images] = None,
query_uris: Optional[URIs] = None,
ids: Optional[IDs] = None,
n_results: int = 10,
where: Optional[Where] = None,
where_document: Optional[WhereDocument] = None,
include: Include = ["metadatas", "documents", "distances"]
) -> QueryResult:
"""
Query the collection for similar documents using vector similarity.
Args:
query_texts: Text queries (will be embedded automatically)
query_embeddings: Pre-computed embedding vectors
query_images: Image arrays for similarity search
query_uris: URIs to load and search with
ids: Restrict search to specific document IDs
n_results: Number of most similar results to return
where: Metadata filter conditions
where_document: Document text filter conditions
include: Fields to include in results
Returns:
QueryResult: Search results with similarity scores and requested fields
"""
Usage Examples:
import chromadb
client = chromadb.EphemeralClient()
collection = client.get_collection("my_documents")
# Text-based similarity search
results = collection.query(
query_texts=["machine learning algorithms"],
n_results=5,
include=["documents", "metadatas", "distances"]
)
# Multi-query search
results = collection.query(
query_texts=["deep learning", "neural networks", "artificial intelligence"],
n_results=3 # 3 results per query
)
# Search with pre-computed embeddings
custom_embedding = [0.1, 0.2, 0.3, ...] # Your embedding vector
results = collection.query(
query_embeddings=[custom_embedding],
n_results=10
)
Filter documents based on metadata values using logical operators and comparison functions.
# Where filter type definition
Where = Dict[Union[str, LogicalOperator], Union[LiteralValue, OperatorExpression, List[Where]]]
# Logical operators
LogicalOperator = Literal["$and", "$or"]
# Comparison operators
OperatorExpression = Dict[ComparisonOperator, Any]
ComparisonOperator = Literal["$eq", "$ne", "$gt", "$gte", "$lt", "$lte", "$in", "$nin"]
# Literal values
LiteralValue = Union[str, int, float, bool]
Usage Examples:
# Simple equality filter
results = collection.query(
query_texts=["search term"],
where={"category": "science"}
)
# Comparison operators
results = collection.query(
query_texts=["search term"],
where={"year": {"$gte": 2020}} # Documents from 2020 or later
)
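# Multiple conditions in a single dict (intended as an implicit $and; NOTE: Chroma versions that validate `where` strictly accept only one top-level key, so prefer the explicit $and form shown in the next example)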
results = collection.query(
query_texts=["search term"],
where={"category": "science", "year": {"$gte": 2020}}
)
# Explicit $and operator
results = collection.query(
query_texts=["search term"],
where={"$and": [
{"category": "science"},
{"year": {"$gte": 2020}}
]}
)
# $or operator
results = collection.query(
query_texts=["search term"],
where={"$or": [
{"category": "science"},
{"category": "technology"}
]}
)
# $in operator for multiple values
results = collection.query(
query_texts=["search term"],
where={"category": {"$in": ["science", "technology", "engineering"]}}
)
# Complex nested conditions
results = collection.query(
query_texts=["search term"],
where={
"$and": [
{"year": {"$gte": 2020}},
{"$or": [
{"category": "science"},
{"category": "technology"}
]},
{"priority": {"$in": ["high", "critical"]}}
]
}
)
Filter documents based on their text content using substring matching.
# WhereDocument filter type definition
WhereDocument = Dict[WhereDocumentOperator, Union[str, List[WhereDocument]]]
# Document text operators
WhereDocumentOperator = Literal["$contains", "$not_contains"]
Usage Examples:
# Documents containing specific text
results = collection.query(
query_texts=["search term"],
where_document={"$contains": "machine learning"}
)
# Documents not containing specific text
results = collection.query(
query_texts=["search term"],
where_document={"$not_contains": "deprecated"}
)
# Complex document filtering (not supported - use simple contains/not_contains)
# For complex text search, retrieve documents and filter programmatically
Control which fields are included in query results to optimize performance and reduce data transfer.
# Include field specification
Include = List[IncludeField]
IncludeField = Literal["documents", "embeddings", "metadatas", "distances", "uris", "data"]
Usage Examples:
# Include only documents and distances
results = collection.query(
query_texts=["search term"],
include=["documents", "distances"]
)
# Include all available fields
results = collection.query(
query_texts=["search term"],
include=["documents", "embeddings", "metadatas", "distances", "uris", "data"]
)
# Minimal result for performance
results = collection.query(
query_texts=["search term"],
include=["documents"] # Only document text
)
print(f"Query returned {len(results['ids'][0])} results")
print(f"Available fields: {list(results.keys())}")
Combine metadata filtering, document text filtering, and vector similarity for precise document retrieval.
Usage Examples:
# Comprehensive search with all filter types
results = collection.query(
query_texts=["machine learning research"],
n_results=10,
where={
"$and": [
{"category": "research"},
{"year": {"$gte": 2020}},
{"citations": {"$gt": 100}}
]
},
where_document={"$contains": "neural network"},
include=["documents", "metadatas", "distances"]
)
# Process results
for i, (doc, metadata, distance) in enumerate(zip(
results['documents'][0],
results['metadatas'][0],
results['distances'][0]
)):
print(f"Result {i+1} (similarity: {1-distance:.3f}):")
print(f" Title: {metadata.get('title', 'Unknown')}")
print(f" Year: {metadata.get('year', 'Unknown')}")
print(f" Excerpt: {doc[:200]}...")
print()
Query results contain lists of matching documents with associated data and similarity scores.
QueryResult = TypedDict('QueryResult', {
'ids': List[List[str]], # Document IDs per query
'documents': List[List[Optional[str]]], # Document text per query
'metadatas': List[List[Optional[Dict]]], # Metadata per query
'embeddings': List[List[Optional[List[float]]]], # Embeddings per query
'distances': List[List[float]], # Similarity distances per query
'uris': List[List[Optional[str]]], # URIs per query
'data': List[List[Optional[Any]]], # Additional data per query
'included': List[str] # Fields included in results
})
Processing Examples:
results = collection.query(
query_texts=["first query", "second query"], # Multiple queries
n_results=3
)
# Process results for each query
for query_idx, query_text in enumerate(["first query", "second query"]):
print(f"Results for query '{query_text}':")
query_ids = results['ids'][query_idx]
query_docs = results['documents'][query_idx]
query_distances = results['distances'][query_idx]
for doc_idx, (doc_id, doc_text, distance) in enumerate(zip(
query_ids, query_docs, query_distances
)):
similarity_score = 1 - distance # Convert distance to similarity
print(f" {doc_idx+1}. {doc_id} (similarity: {similarity_score:.3f})")
print(f" {doc_text[:100]}...")
print()
ChromaDB supports different distance metrics for vector similarity calculations.
# Distance metric specification
Space = Literal["cosine", "l2", "ip"]
# cosine: Cosine distance (1 - cosine_similarity)
# l2: Euclidean (L2) distance
# ip: Inner product (negative for similarity)
Distance metrics are configured per collection through embedding functions and cannot be changed during queries.
from typing import Dict, List, Optional, Union, Any, Literal, TypedDict
# Query input types
Documents = List[str]
Embeddings = List[List[float]]
Images = List[Any] # Image arrays
URIs = List[str]
IDs = List[str]
# Filter types
Where = Dict[Union[str, Literal["$and", "$or"]], Union[
str, int, float, bool, # Literal values
Dict[Literal["$eq", "$ne", "$gt", "$gte", "$lt", "$lte", "$in", "$nin"], Any], # Operators
List["Where"] # Nested conditions
]]
WhereDocument = Dict[Literal["$contains", "$not_contains"], Union[str, List["WhereDocument"]]]
# Result field selection
Include = List[Literal["documents", "embeddings", "metadatas", "distances", "uris", "data"]]
# Result types
QueryResult = TypedDict('QueryResult', {
'ids': List[List[str]],
'documents': List[List[Optional[str]]],
'metadatas': List[List[Optional[Dict[str, Any]]]],
'embeddings': List[List[Optional[List[float]]]],
'distances': List[List[float]],
'uris': List[List[Optional[str]]],
'data': List[List[Optional[Any]]],
'included': List[str]
})
GetResult = TypedDict('GetResult', {
'ids': List[str],
'documents': List[Optional[str]],
'metadatas': List[Optional[Dict[str, Any]]],
'embeddings': List[Optional[List[float]]],
'uris': List[Optional[str]],
'data': List[Optional[Any]],
'included': List[str]
})
Install with Tessl CLI
npx tessl i tessl/pypi-chromadb