tessl/pypi-langchain-chroma

An integration package connecting Chroma and LangChain for vector database operations.

—

Pending

Overview

Eval results

Files

Search Operations

Name: tessl/pypi-langchain-chroma
Author: tessl

Comprehensive search functionality for finding similar documents in the vector store. Supports text queries, vector queries, image queries, metadata filtering, and relevance scoring.

Capabilities

Text-Based Similarity Search

Search for documents similar to a text query using the configured embedding function.

def similarity_search(
    query: str, 
    k: int = 4, 
    filter: Optional[dict[str, str]] = None, 
    **kwargs: Any
) -> list[Document]:
    """
    Return the documents most similar to a text query.

    The query text is embedded with the vector store's configured
    embedding function before the search runs.

    Parameters:
    - query: Text query to search for
    - k: Number of results to return (default: 4)
    - filter: Optional metadata filter dictionary
      (e.g., {"category": "tech"}); defaults to None
    - **kwargs: Additional arguments passed to ChromaDB query

    Returns:
    List of Document objects most similar to the query
    """

def similarity_search_with_score(
    query: str, 
    k: int = 4, 
    filter: Optional[dict[str, str]] = None, 
    where_document: Optional[dict[str, str]] = None, 
    **kwargs: Any
) -> list[tuple[Document, float]]:
    """
    Return the documents most similar to a text query, each with its score.

    Parameters:
    - query: Text query to search for
    - k: Number of results to return (default: 4)
    - filter: Optional metadata filter dictionary; defaults to None
    - where_document: Optional document content filter
      (e.g., {"$contains": "python"}); defaults to None
    - **kwargs: Additional arguments passed to ChromaDB query

    Returns:
    List of (Document, similarity_score) tuples.
    Lower scores indicate higher similarity, so the best match has the
    smallest score.
    """

Usage Example:

# Plain text search: ask for the three closest documents
top_matches = vector_store.similarity_search("machine learning", k=3)
for match in top_matches:
    print(f"Content: {match.page_content}")

# Scored search narrowed by a metadata filter and a content filter
metadata_filter = {"category": "tech"}
content_filter = {"$contains": "code"}
scored_matches = vector_store.similarity_search_with_score(
    query="python programming",
    k=5,
    filter=metadata_filter,
    where_document=content_filter,
)
# Each entry pairs a document with its score
for match, score in scored_matches:
    print(f"Score: {score:.3f}, Content: {match.page_content}")

Vector-Based Search

Search using pre-computed embedding vectors instead of text queries.

def similarity_search_by_vector(
    embedding: list[float], 
    k: int = 4, 
    filter: Optional[dict[str, str]] = None, 
    where_document: Optional[dict[str, str]] = None, 
    **kwargs: Any
) -> list[Document]:
    """
    Return the documents most similar to a pre-computed embedding vector.

    Use this instead of a text query when the query embedding has already
    been computed.

    Parameters:
    - embedding: Pre-computed embedding vector to search with
    - k: Number of results to return (default: 4)
    - filter: Optional metadata filter dictionary; defaults to None
    - where_document: Optional document content filter; defaults to None
    - **kwargs: Additional arguments passed to ChromaDB query

    Returns:
    List of Document objects most similar to the embedding
    """

def similarity_search_by_vector_with_relevance_scores(
    embedding: list[float], 
    k: int = 4, 
    filter: Optional[dict[str, str]] = None, 
    where_document: Optional[dict[str, str]] = None, 
    **kwargs: Any
) -> list[tuple[Document, float]]:
    """
    Return the documents most similar to an embedding vector, each with
    its relevance score.

    Parameters:
    - embedding: Pre-computed embedding vector to search with
    - k: Number of results to return (default: 4)
    - filter: Optional metadata filter dictionary; defaults to None
    - where_document: Optional document content filter; defaults to None
    - **kwargs: Additional arguments passed to ChromaDB query

    Returns:
    List of (Document, relevance_score) tuples.
    Lower scores indicate higher similarity, so the best match has the
    smallest score.
    """

Usage Example:

# Embed the query text once, then search with the resulting vector
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
ai_vector = embeddings.embed_query("artificial intelligence")

nearest_docs = vector_store.similarity_search_by_vector(ai_vector, k=3)
for nearest in nearest_docs:
    print(f"Content: {nearest.page_content}")

# Same vector, this time with relevance scores and a metadata filter
domain_filter = {"domain": "AI"}
scored_docs = vector_store.similarity_search_by_vector_with_relevance_scores(
    embedding=ai_vector,
    k=5,
    filter=domain_filter,
)

Search with Vector Embeddings

Search that returns both documents and their corresponding embedding vectors.

def similarity_search_with_vectors(
    query: str, 
    k: int = 4, 
    filter: Optional[dict[str, str]] = None, 
    where_document: Optional[dict[str, str]] = None, 
    **kwargs: Any
) -> list[tuple[Document, np.ndarray]]:
    """
    Return the documents most similar to a text query together with each
    document's embedding vector.

    Parameters:
    - query: Text query to search for
    - k: Number of results to return (default: 4)
    - filter: Optional metadata filter dictionary; defaults to None
    - where_document: Optional document content filter; defaults to None
    - **kwargs: Additional arguments passed to ChromaDB query

    Returns:
    List of (Document, embedding_vector) tuples, where embedding_vector
    is a NumPy array
    """

Usage Example:

import numpy as np

# Fetch the matching documents along with their stored embeddings
docs_and_embeddings = vector_store.similarity_search_with_vectors(
    query="data science",
    k=3,
)
# Each entry pairs a document with its embedding vector
for matched_doc, embedding_vector in docs_and_embeddings:
    print(f"Content: {matched_doc.page_content}")
    print(f"Vector shape: {embedding_vector.shape}")

Image-Based Search

Search for similar documents using image queries. Requires an embedding function that supports image embeddings.

def similarity_search_by_image(
    uri: str, 
    k: int = 4, 
    filter: Optional[dict[str, str]] = None, 
    **kwargs: Any
) -> list[Document]:
    """
    Return the documents most similar to a query image.

    Requires an embedding function that supports image embeddings.

    Parameters:
    - uri: File path to the query image
    - k: Number of results to return (default: 4)
    - filter: Optional metadata filter dictionary; defaults to None
    - **kwargs: Additional arguments passed to ChromaDB query

    Returns:
    List of Document objects most similar to the query image

    Raises:
    ValueError: If embedding function doesn't support image embeddings
    """

def similarity_search_by_image_with_relevance_score(
    uri: str, 
    k: int = 4, 
    filter: Optional[dict[str, str]] = None, 
    **kwargs: Any
) -> list[tuple[Document, float]]:
    """
    Return the documents most similar to a query image, each with its
    relevance score.

    Requires an embedding function that supports image embeddings.

    Parameters:
    - uri: File path to the query image
    - k: Number of results to return (default: 4)
    - filter: Optional metadata filter dictionary; defaults to None
    - **kwargs: Additional arguments passed to ChromaDB query

    Returns:
    List of (Document, relevance_score) tuples.
    NOTE(review): whether a lower or a higher score means a closer match
    is not stated here - confirm before ranking on it.

    Raises:
    ValueError: If embedding function doesn't support image embeddings
    """

Usage Example:

# Image queries need a multimodal embedding function on the vector store
visual_filter = {"type": "visual"}
image_results = vector_store.similarity_search_by_image(
    uri="/path/to/query_image.jpg",
    k=5,
    filter=visual_filter,
)

# Same image query, returning a relevance score for every match
image_results_with_scores = vector_store.similarity_search_by_image_with_relevance_score(
    uri="/path/to/query_image.jpg",
    k=3,
)
for image_doc, relevance in image_results_with_scores:
    print(f"Score: {relevance:.3f}, Metadata: {image_doc.metadata}")

Relevance Score Functions

The Chroma class automatically selects relevance score functions based on the collection's distance metric configuration.

Available Distance Metrics

Cosine: Cosine distance (space: "cosine")
Euclidean: L2 distance (space: "l2")
Inner Product: Maximum inner product (space: "ip")

Usage Example:

# Choose the distance metric when the vector store is created
from chromadb.api import CreateCollectionConfiguration

# The "hnsw" -> "space" entry names the metric; here it is cosine
cosine_configuration = CreateCollectionConfiguration(
    {"hnsw": {"space": "cosine"}}
)

vector_store = Chroma(
    collection_name="my_collection",
    embedding_function=embeddings,
    collection_configuration=cosine_configuration,
)

Advanced Filtering

Metadata Filtering

Filter results based on document metadata using dictionary conditions.

# Simple equality filter on a single metadata field
filter = {"category": "science"}

# Several conditions must be combined explicitly with $and / $or: Chroma
# accepts exactly one top-level key in a filter, so a plain dictionary such as
# {"category": "science", "year": "2023"} is rejected with a ValueError
filter = {
    "$and": [
        {"category": "science"},
        {"year": "2023"}
    ]
}

# Range filter (ChromaDB-specific syntax). The comparison operators
# $gt / $gte / $lt / $lte only accept int or float operands, so "year" must be
# stored as a number in the document metadata for this filter to work
filter = {
    "$and": [
        {"category": "science"},
        {"year": {"$gte": 2020}}
    ]
}

Document Content Filtering

Filter based on the actual document content using ChromaDB's where_document parameter.

# Keep only documents whose content contains the given text
where_document = {
    "$contains": "machine learning",
}

# Keep only documents whose content matches the pattern
where_document = {
    "$regex": "^Python.*tutorial$",
}

Install with Tessl CLI