tessl/pypi-langchain-chroma

An integration package connecting Chroma and LangChain for vector database operations.


docs/construction.md

Vector Store Construction

Class methods and utilities for creating Chroma vector store instances from various data sources and configurations. Provides convenient factory methods for common initialization patterns.

Capabilities

Creating from Text Lists

Factory method to create a Chroma instance and populate it with a list of texts in a single operation.

@classmethod
def from_texts(
    cls: type[Chroma],
    texts: list[str],
    embedding: Optional[Embeddings] = None,
    metadatas: Optional[list[dict]] = None,
    ids: Optional[list[str]] = None,
    collection_name: str = "langchain",
    persist_directory: Optional[str] = None,
    host: Optional[str] = None,
    port: Optional[int] = None,
    headers: Optional[dict[str, str]] = None,
    chroma_cloud_api_key: Optional[str] = None,
    tenant: Optional[str] = None,
    database: Optional[str] = None,
    client_settings: Optional[chromadb.config.Settings] = None,
    client: Optional[chromadb.ClientAPI] = None,
    collection_metadata: Optional[dict] = None,
    collection_configuration: Optional[CreateCollectionConfiguration] = None,
    *,
    ssl: bool = False,
    **kwargs: Any,
) -> Chroma:
    """
    Create a Chroma vector store from a list of texts.
    
    Creates the vector store instance and adds all provided texts in batch operations
    for efficient initialization.
    
    Parameters:
    - texts: List of text strings to add to the vector store
    - embedding: Embedding function for vectorizing texts
    - metadatas: Optional list of metadata dictionaries for each text
    - ids: Optional list of custom IDs (UUIDs generated if not provided)
    - collection_name: Name for the new collection (default: "langchain")
    - persist_directory: Directory to persist the collection
    - host: Hostname of a deployed Chroma server
    - port: Connection port for the Chroma server (chromadb defaults to 8000)
    - ssl: Whether to use SSL connection (default: False)
    - headers: HTTP headers for Chroma server
    - chroma_cloud_api_key: API key for Chroma Cloud
    - tenant: Tenant ID for Chroma Cloud
    - database: Database name for Chroma Cloud
    - client_settings: Custom ChromaDB client settings
    - client: Pre-configured ChromaDB client
    - collection_metadata: Metadata for the collection
    - collection_configuration: Index configuration for the collection
    - **kwargs: Additional arguments for Chroma client initialization
    
    Returns:
    Chroma instance populated with the provided texts
    """

Usage Example:

from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Basic usage with texts
texts = [
    "The quick brown fox jumps over the lazy dog",
    "Python is a powerful programming language",
    "Machine learning is transforming technology"
]

vector_store = Chroma.from_texts(
    texts=texts,
    embedding=OpenAIEmbeddings(),
    collection_name="my_documents"
)

# With metadata and persistence
texts = ["Document 1", "Document 2", "Document 3"]
metadatas = [
    {"source": "file1.txt", "author": "Alice"},
    {"source": "file2.txt", "author": "Bob"},
    {"source": "file3.txt", "author": "Charlie"}
]
ids = ["doc_1", "doc_2", "doc_3"]

persistent_store = Chroma.from_texts(
    texts=texts,
    embedding=OpenAIEmbeddings(),
    metadatas=metadatas,
    ids=ids,
    collection_name="persistent_docs",
    persist_directory="./chroma_db"
)

# With Chroma Cloud
cloud_store = Chroma.from_texts(
    texts=texts,
    embedding=OpenAIEmbeddings(),
    collection_name="cloud_collection",
    chroma_cloud_api_key="your-api-key",
    tenant="your-tenant",
    database="your-database"
)
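The `ids` parameter defaults to freshly generated UUIDs, so re-running the same ingestion produces new entries each time. Deriving stable IDs from the text itself makes re-ingestion idempotent. A minimal sketch using only the standard library; `content_ids` is an illustrative helper, not part of langchain-chroma:

```python
import uuid

def content_ids(texts: list[str]) -> list[str]:
    # UUIDv5 is deterministic: the same text always maps to the same ID,
    # so re-ingesting the same content reuses IDs instead of inventing new ones.
    return [str(uuid.uuid5(uuid.NAMESPACE_URL, t)) for t in texts]

ids = content_ids(["Document 1", "Document 2", "Document 3"])
```

Pass the result as `ids=content_ids(texts)` to `from_texts` in place of hand-written IDs.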

Creating from Document Objects

Factory method to create a Chroma instance from LangChain Document objects.

@classmethod
def from_documents(
    cls: type[Chroma],
    documents: list[Document],
    embedding: Optional[Embeddings] = None,
    ids: Optional[list[str]] = None,
    collection_name: str = "langchain",
    persist_directory: Optional[str] = None,
    host: Optional[str] = None,
    port: Optional[int] = None,
    headers: Optional[dict[str, str]] = None,
    chroma_cloud_api_key: Optional[str] = None,
    tenant: Optional[str] = None,
    database: Optional[str] = None,
    client_settings: Optional[chromadb.config.Settings] = None,
    client: Optional[chromadb.ClientAPI] = None,
    collection_metadata: Optional[dict] = None,
    collection_configuration: Optional[CreateCollectionConfiguration] = None,
    *,
    ssl: bool = False,
    **kwargs: Any,
) -> Chroma:
    """
    Create a Chroma vector store from a list of Document objects.
    
    Extracts text content and metadata from Document objects and creates
    a vector store with efficient batch operations.
    
    Parameters:
    - documents: List of Document objects to add to the vector store
    - embedding: Embedding function for vectorizing document content
    - ids: Optional list of custom IDs (uses document.id or generates UUIDs)
    - collection_name: Name for the new collection (default: "langchain")
    - persist_directory: Directory to persist the collection
    - host: Hostname of a deployed Chroma server
    - port: Connection port for the Chroma server (chromadb defaults to 8000)
    - ssl: Whether to use SSL connection (default: False)
    - headers: HTTP headers for server connection
    - chroma_cloud_api_key: API key for Chroma Cloud
    - tenant: Tenant ID for Chroma Cloud  
    - database: Database name for Chroma Cloud
    - client_settings: Custom ChromaDB client settings
    - client: Pre-configured ChromaDB client
    - collection_metadata: Metadata for the collection
    - collection_configuration: Index configuration
    - **kwargs: Additional client initialization arguments
    
    Returns:
    Chroma instance populated with the provided documents
    """

Usage Example:

from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Create documents
documents = [
    Document(
        page_content="First document content",
        metadata={"source": "doc1", "category": "general"},
        id="custom_id_1"
    ),
    Document(
        page_content="Second document content", 
        metadata={"source": "doc2", "category": "technical"}
    ),
    Document(
        page_content="Third document content",
        metadata={"source": "doc3", "category": "general"}
    )
]

# Create vector store from documents
vector_store = Chroma.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(),
    collection_name="document_collection",
    persist_directory="./my_vector_db"
)

# With custom configuration
from chromadb.api.collection_configuration import CreateCollectionConfiguration

configured_store = Chroma.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(),
    collection_name="configured_collection",
    collection_configuration=CreateCollectionConfiguration(
        hnsw={"space": "cosine", "max_neighbors": 16}
    ),
    collection_metadata={"version": "1.0", "description": "My documents"}
)

Image Encoding Utility

Static utility method for encoding images to base64 strings for storage or processing.

@staticmethod
def encode_image(uri: str) -> str:
    """
    Encode an image file to a base64 string.
    
    Utility function for preparing images for storage in the vector store
    or for processing with multimodal embedding functions.
    
    Parameters:
    - uri: File path to the image file
    
    Returns:
    Base64 encoded string representation of the image
    
    Raises:
    FileNotFoundError: If the image file doesn't exist
    IOError: If the file cannot be read
    """

Usage Example:

# Encode image for storage or processing
image_path = "/path/to/image.jpg"
encoded_image = Chroma.encode_image(image_path)

# Use encoded image with documents
image_document = Document(
    page_content=encoded_image,
    metadata={"type": "image", "format": "jpg", "source": image_path}
)

# Add to vector store (requires multimodal embeddings)
vector_store.add_documents([image_document])
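Since `encode_image` returns a plain base64 string, the original bytes can be recovered with the standard library alone. A round-trip sketch; the literal bytes below stand in for real image data:

```python
import base64

raw = b"\x89PNG\r\n\x1a\n fake image bytes"  # placeholder, not a real image
encoded = base64.b64encode(raw).decode("utf-8")  # same shape as encode_image's output
decoded = base64.b64decode(encoded)
assert decoded == raw  # lossless round trip
```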

Configuration Options

Client Types and Configuration

Different ChromaDB client configurations for various deployment scenarios.

In-Memory Client (Default):

vector_store = Chroma.from_texts(
    texts=texts,
    embedding=embeddings,
    collection_name="memory_collection"
)

Persistent Client:

vector_store = Chroma.from_texts(
    texts=texts,
    embedding=embeddings,
    collection_name="persistent_collection",
    persist_directory="/path/to/chroma/db"
)

HTTP Client (Remote Server):

vector_store = Chroma.from_texts(
    texts=texts,
    embedding=embeddings,
    collection_name="remote_collection",
    host="chroma-server.example.com",
    port=8000,
    ssl=True,
    headers={"Authorization": "Bearer token"}
)

Chroma Cloud Client:

vector_store = Chroma.from_texts(
    texts=texts,
    embedding=embeddings,
    collection_name="cloud_collection",
    chroma_cloud_api_key="your-api-key",
    tenant="your-tenant",
    database="your-database"
)

Collection Configuration

Advanced collection settings for performance and behavior tuning.

from chromadb.api.collection_configuration import CreateCollectionConfiguration

# HNSW index configuration
hnsw_config = CreateCollectionConfiguration(
    hnsw={
        "space": "cosine",       # cosine, l2, or ip
        "max_neighbors": 16,     # graph connectivity (the classic HNSW "M")
        "ef_construction": 200,  # candidate-list size while building the index
        "ef_search": 100,        # candidate-list size at query time
    }
)

vector_store = Chroma.from_texts(
    texts=texts,
    embedding=embeddings,
    collection_configuration=hnsw_config
)

Batch Processing

Factory methods automatically handle batch processing for large datasets.

# Large dataset - automatically batched
large_texts = [f"Text {i}" for i in range(10000)]
large_metadatas = [{"index": i} for i in range(10000)]

# Efficiently processes in batches
vector_store = Chroma.from_texts(
    texts=large_texts,
    metadatas=large_metadatas,
    embedding=embeddings,
    collection_name="large_collection"
)
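If you need explicit control over batch size (for example, to stay below a server-side write limit), you can create the store first and add chunks yourself. `add_texts` is the package's real instance method; the `batched` helper below is an illustrative sketch:

```python
from itertools import islice

def batched(items, size):
    # Yield consecutive chunks of at most `size` items.
    it = iter(items)
    while chunk := list(islice(it, size)):
        yield chunk

# Sketch (assumes `embeddings` is configured as in the examples above):
# vector_store = Chroma(collection_name="large_collection", embedding_function=embeddings)
# for text_chunk, meta_chunk in zip(batched(large_texts, 500), batched(large_metadatas, 500)):
#     vector_store.add_texts(texts=text_chunk, metadatas=meta_chunk)
```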

Error Handling

Construction can fail for configuration reasons such as an invalid persist directory or an unreachable server; handle these at the call site.

try:
    vector_store = Chroma.from_texts(
        texts=texts,
        embedding=embeddings,
        persist_directory="/invalid/path"
    )
except ValueError as e:
    print(f"Configuration error: {e}")
except Exception as e:
    print(f"Unexpected error during construction: {e}")

# Validate before construction (an Embeddings instance is always truthy,
# so check explicitly against None)
if texts and embeddings is not None:
    vector_store = Chroma.from_texts(texts=texts, embedding=embeddings)
else:
    print("Missing required texts or embeddings")
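chromadb rejects writes where `ids` or `metadatas` don't line up with the number of texts, so a pre-flight check can fail fast with a clearer message. `validate_inputs` is an illustrative helper, not part of the package:

```python
def validate_inputs(texts, metadatas=None, ids=None):
    # Fail early, before any client or collection is created.
    if not texts:
        raise ValueError("texts must be a non-empty list")
    if metadatas is not None and len(metadatas) != len(texts):
        raise ValueError(f"{len(metadatas)} metadatas for {len(texts)} texts")
    if ids is not None and len(ids) != len(texts):
        raise ValueError(f"{len(ids)} ids for {len(texts)} texts")

validate_inputs(["Document 1"], ids=["doc_1"])  # passes silently
```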

Install with Tessl CLI

npx tessl i tessl/pypi-langchain-chroma
