An integration package connecting Chroma and LangChain for vector database operations.
—
Comprehensive search functionality for finding similar documents in the vector store. Supports text queries, vector queries, image queries, metadata filtering, and relevance scoring.
Search for documents similar to a text query using the configured embedding function.
def similarity_search(
query: str,
k: int = 4,
filter: Optional[dict[str, str]] = None,
**kwargs: Any
) -> list[Document]:
"""
Find documents most similar to the query text.
Parameters:
- query: Text query to search for
- k: Number of results to return (default: 4)
- filter: Metadata filter dictionary (e.g., {"category": "tech"})
- **kwargs: Additional arguments passed to ChromaDB query
Returns:
List of Document objects most similar to the query
"""
def similarity_search_with_score(
query: str,
k: int = 4,
filter: Optional[dict[str, str]] = None,
where_document: Optional[dict[str, str]] = None,
**kwargs: Any
) -> list[tuple[Document, float]]:
"""
Find documents similar to query text with similarity scores.
Parameters:
- query: Text query to search for
- k: Number of results to return (default: 4)
- filter: Metadata filter dictionary
- where_document: Document content filter (e.g., {"$contains": "python"})
- **kwargs: Additional arguments passed to ChromaDB query
Returns:
List of tuples containing (Document, similarity_score)
Lower scores indicate higher similarity
"""

Usage Example:
# Basic similarity search
results = vector_store.similarity_search("machine learning", k=3)
for doc in results:
    print(f"Content: {doc.page_content}")
# Search with score and filtering
results_with_scores = vector_store.similarity_search_with_score(
query="python programming",
k=5,
filter={"category": "tech"},
where_document={"$contains": "code"}
)
for doc, score in results_with_scores:
    print(f"Score: {score:.3f}, Content: {doc.page_content}")

Search using pre-computed embedding vectors instead of text queries.
def similarity_search_by_vector(
embedding: list[float],
k: int = 4,
filter: Optional[dict[str, str]] = None,
where_document: Optional[dict[str, str]] = None,
**kwargs: Any
) -> list[Document]:
"""
Find documents most similar to the provided embedding vector.
Parameters:
- embedding: Pre-computed embedding vector
- k: Number of results to return (default: 4)
- filter: Metadata filter dictionary
- where_document: Document content filter
- **kwargs: Additional arguments passed to ChromaDB query
Returns:
List of Document objects most similar to the embedding
"""
def similarity_search_by_vector_with_relevance_scores(
embedding: list[float],
k: int = 4,
filter: Optional[dict[str, str]] = None,
where_document: Optional[dict[str, str]] = None,
**kwargs: Any
) -> list[tuple[Document, float]]:
"""
Find documents similar to embedding vector with relevance scores.
Parameters:
- embedding: Pre-computed embedding vector
- k: Number of results to return (default: 4)
- filter: Metadata filter dictionary
- where_document: Document content filter
- **kwargs: Additional arguments passed to ChromaDB query
Returns:
List of tuples containing (Document, relevance_score)
Lower scores indicate higher similarity
"""

Usage Example:
# Search by pre-computed vector
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
query_vector = embeddings.embed_query("artificial intelligence")
results = vector_store.similarity_search_by_vector(query_vector, k=3)
for doc in results:
    print(f"Content: {doc.page_content}")
# Search with relevance scores
results_with_scores = vector_store.similarity_search_by_vector_with_relevance_scores(
    embedding=query_vector,
    k=5,
    filter={"domain": "AI"}
)

Search that returns both documents and their corresponding embedding vectors.
def similarity_search_with_vectors(
query: str,
k: int = 4,
filter: Optional[dict[str, str]] = None,
where_document: Optional[dict[str, str]] = None,
**kwargs: Any
) -> list[tuple[Document, np.ndarray]]:
"""
Search for similar documents and return their embedding vectors.
Parameters:
- query: Text query to search for
- k: Number of results to return (default: 4)
- filter: Metadata filter dictionary
- where_document: Document content filter
- **kwargs: Additional arguments passed to ChromaDB query
Returns:
List of tuples containing (Document, embedding_vector)
"""

Usage Example:
import numpy as np
# Search with vectors for further processing
results_with_vectors = vector_store.similarity_search_with_vectors(
query="data science",
k=3
)
for doc, vector in results_with_vectors:
    print(f"Content: {doc.page_content}")
    print(f"Vector shape: {vector.shape}")

Search for similar documents using image queries. Requires an embedding function that supports image embeddings.
def similarity_search_by_image(
uri: str,
k: int = 4,
filter: Optional[dict[str, str]] = None,
**kwargs: Any
) -> list[Document]:
"""
Search for documents similar to the provided image.
Parameters:
- uri: File path to the query image
- k: Number of results to return (default: 4)
- filter: Metadata filter dictionary
- **kwargs: Additional arguments passed to ChromaDB query
Returns:
List of Document objects most similar to the query image
Raises:
ValueError: If embedding function doesn't support image embeddings
"""
def similarity_search_by_image_with_relevance_score(
uri: str,
k: int = 4,
filter: Optional[dict[str, str]] = None,
**kwargs: Any
) -> list[tuple[Document, float]]:
"""
Search for documents similar to image with relevance scores.
Parameters:
- uri: File path to the query image
- k: Number of results to return (default: 4)
- filter: Metadata filter dictionary
- **kwargs: Additional arguments passed to ChromaDB query
Returns:
List of tuples containing (Document, relevance_score)
Raises:
ValueError: If embedding function doesn't support image embeddings
"""

Usage Example:
# Search by image (requires multimodal embedding function)
image_results = vector_store.similarity_search_by_image(
uri="/path/to/query_image.jpg",
k=5,
filter={"type": "visual"}
)
# Image search with scores
image_results_with_scores = vector_store.similarity_search_by_image_with_relevance_score(
uri="/path/to/query_image.jpg",
k=3
)
for doc, score in image_results_with_scores:
    print(f"Score: {score:.3f}, Metadata: {doc.metadata}")

The Chroma class automatically selects relevance score functions based on the collection's distance metric configuration.
Usage Example:
# Configure distance metric during initialization
from chromadb.api import CreateCollectionConfiguration
vector_store = Chroma(
    collection_name="my_collection",
    embedding_function=embeddings,
    collection_configuration=CreateCollectionConfiguration({
        "hnsw": {"space": "cosine"}
    })
)

Filter results based on document metadata using dictionary conditions.
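# Simple equality filter
filter = {"category": "science"}
# Several equality conditions must be combined with "$and"
# (ChromaDB accepts only one top-level condition per filter)
filter = {"$and": [{"category": "science"}, {"year": "2023"}]}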
# Complex filters (ChromaDB-specific syntax)
filter = {
    "$and": [
        {"category": "science"},
        {"year": {"$gte": 2020}}  # comparison operators require numeric values
    ]
}

Filter based on the actual document content using ChromaDB's where_document parameter.
# Content contains specific text
where_document = {"$contains": "machine learning"}
# Content matches pattern
where_document = {"$regex": "^Python.*tutorial$"}

Install with Tessl CLI
npx tessl i tessl/pypi-langchain-chroma