An integration package connecting Chroma and LangChain for vector database operations.
—
Class methods and utilities for creating Chroma vector store instances from various data sources and configurations. Provides convenient factory methods for common initialization patterns.
Factory method to create a Chroma instance and populate it with a list of texts in a single operation.
@classmethod
def from_texts(
cls: type[Chroma],
texts: list[str],
embedding: Optional[Embeddings] = None,
metadatas: Optional[list[dict]] = None,
ids: Optional[list[str]] = None,
collection_name: str = "langchain",
persist_directory: Optional[str] = None,
host: Optional[str] = None,
port: Optional[int] = None,
headers: Optional[dict[str, str]] = None,
chroma_cloud_api_key: Optional[str] = None,
tenant: Optional[str] = None,
database: Optional[str] = None,
client_settings: Optional[chromadb.config.Settings] = None,
client: Optional[chromadb.ClientAPI] = None,
collection_metadata: Optional[dict] = None,
collection_configuration: Optional[CreateCollectionConfiguration] = None,
*,
ssl: bool = False,
**kwargs: Any,
) -> Chroma:
"""
Create a Chroma vector store from a list of texts.
Creates the vector store instance and adds all provided texts in batch operations
for efficient initialization.
Parameters:
- texts: List of text strings to add to the vector store
- embedding: Embedding function for vectorizing texts
- metadatas: Optional list of metadata dictionaries for each text
- ids: Optional list of custom IDs (UUIDs generated if not provided)
- collection_name: Name for the new collection (default: "langchain")
- persist_directory: Directory to persist the collection
- host: Hostname of deployed Chroma server
- port: Connection port for Chroma server (default: 8000)
- ssl: Whether to use SSL connection (default: False)
- headers: HTTP headers for Chroma server
- chroma_cloud_api_key: API key for Chroma Cloud
- tenant: Tenant ID for Chroma Cloud
- database: Database name for Chroma Cloud
- client_settings: Custom ChromaDB client settings
- client: Pre-configured ChromaDB client
- collection_metadata: Metadata for the collection
- collection_configuration: Index configuration for the collection
- **kwargs: Additional arguments for Chroma client initialization
Returns:
Chroma instance populated with the provided texts
"""Usage Example:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
# Basic usage with texts
texts = [
"The quick brown fox jumps over the lazy dog",
"Python is a powerful programming language",
"Machine learning is transforming technology"
]
vector_store = Chroma.from_texts(
texts=texts,
embedding=OpenAIEmbeddings(),
collection_name="my_documents"
)
# With metadata and persistence
texts = ["Document 1", "Document 2", "Document 3"]
metadatas = [
{"source": "file1.txt", "author": "Alice"},
{"source": "file2.txt", "author": "Bob"},
{"source": "file3.txt", "author": "Charlie"}
]
ids = ["doc_1", "doc_2", "doc_3"]
persistent_store = Chroma.from_texts(
texts=texts,
embedding=OpenAIEmbeddings(),
metadatas=metadatas,
ids=ids,
collection_name="persistent_docs",
persist_directory="./chroma_db"
)
# With Chroma Cloud
cloud_store = Chroma.from_texts(
texts=texts,
embedding=OpenAIEmbeddings(),
collection_name="cloud_collection",
chroma_cloud_api_key="your-api-key",
tenant="your-tenant",
database="your-database"
)

Factory method to create a Chroma instance from LangChain Document objects.
@classmethod
def from_documents(
cls: type[Chroma],
documents: list[Document],
embedding: Optional[Embeddings] = None,
ids: Optional[list[str]] = None,
collection_name: str = "langchain",
persist_directory: Optional[str] = None,
host: Optional[str] = None,
port: Optional[int] = None,
headers: Optional[dict[str, str]] = None,
chroma_cloud_api_key: Optional[str] = None,
tenant: Optional[str] = None,
database: Optional[str] = None,
client_settings: Optional[chromadb.config.Settings] = None,
client: Optional[chromadb.ClientAPI] = None,
collection_metadata: Optional[dict] = None,
collection_configuration: Optional[CreateCollectionConfiguration] = None,
*,
ssl: bool = False,
**kwargs: Any,
) -> Chroma:
"""
Create a Chroma vector store from a list of Document objects.
Extracts text content and metadata from Document objects and creates
a vector store with efficient batch operations.
Parameters:
- documents: List of Document objects to add to the vector store
- embedding: Embedding function for vectorizing document content
- ids: Optional list of custom IDs (uses document.id or generates UUIDs)
- collection_name: Name for the new collection (default: "langchain")
- persist_directory: Directory to persist the collection
- host: Hostname of deployed Chroma server
- port: Connection port (default: 8000)
- ssl: Whether to use SSL connection (default: False)
- headers: HTTP headers for server connection
- chroma_cloud_api_key: API key for Chroma Cloud
- tenant: Tenant ID for Chroma Cloud
- database: Database name for Chroma Cloud
- client_settings: Custom ChromaDB client settings
- client: Pre-configured ChromaDB client
- collection_metadata: Metadata for the collection
- collection_configuration: Index configuration
- **kwargs: Additional client initialization arguments
Returns:
Chroma instance populated with the provided documents
"""Usage Example:
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
# Create documents
documents = [
Document(
page_content="First document content",
metadata={"source": "doc1", "category": "general"},
id="custom_id_1"
),
Document(
page_content="Second document content",
metadata={"source": "doc2", "category": "technical"}
),
Document(
page_content="Third document content",
metadata={"source": "doc3", "category": "general"}
)
]
# Create vector store from documents
vector_store = Chroma.from_documents(
documents=documents,
embedding=OpenAIEmbeddings(),
collection_name="document_collection",
persist_directory="./my_vector_db"
)
# With custom configuration
from chromadb.api import CreateCollectionConfiguration
configured_store = Chroma.from_documents(
documents=documents,
embedding=OpenAIEmbeddings(),
collection_name="configured_collection",
collection_configuration=CreateCollectionConfiguration({
"hnsw": {"space": "cosine", "M": 16}
}),
collection_metadata={"version": "1.0", "description": "My documents"}
)

Static utility method for encoding images to base64 strings for storage or processing.
@staticmethod
def encode_image(uri: str) -> str:
"""
Encode an image file to a base64 string.
Utility function for preparing images for storage in the vector store
or for processing with multimodal embedding functions.
Parameters:
- uri: File path to the image file
Returns:
Base64 encoded string representation of the image
Raises:
FileNotFoundError: If the image file doesn't exist
IOError: If the file cannot be read
"""Usage Example:
# Encode image for storage or processing
image_path = "/path/to/image.jpg"
encoded_image = Chroma.encode_image(image_path)
# Use encoded image with documents
image_document = Document(
page_content=encoded_image,
metadata={"type": "image", "format": "jpg", "source": image_path}
)
# Add to vector store (requires multimodal embeddings)
vector_store.add_documents([image_document])

Different ChromaDB client configurations for various deployment scenarios.
In-Memory Client (Default):
vector_store = Chroma.from_texts(
texts=texts,
embedding=embeddings,
collection_name="memory_collection"
)

Persistent Client:
vector_store = Chroma.from_texts(
texts=texts,
embedding=embeddings,
collection_name="persistent_collection",
persist_directory="/path/to/chroma/db"
)

HTTP Client (Remote Server):
vector_store = Chroma.from_texts(
texts=texts,
embedding=embeddings,
collection_name="remote_collection",
host="chroma-server.example.com",
port=8000,
ssl=True,
headers={"Authorization": "Bearer token"}
)

Chroma Cloud Client:
vector_store = Chroma.from_texts(
texts=texts,
embedding=embeddings,
collection_name="cloud_collection",
chroma_cloud_api_key="your-api-key",
tenant="your-tenant",
database="your-database"
)

Advanced collection settings for performance and behavior tuning.
from chromadb.api import CreateCollectionConfiguration
# HNSW index configuration
hnsw_config = CreateCollectionConfiguration({
"hnsw": {
"space": "cosine", # cosine, l2, or ip
"M": 16, # Number of bi-directional links
"ef_construction": 200, # Size of dynamic candidate list
"max_elements": 10000 # Maximum number of elements
}
})
vector_store = Chroma.from_texts(
texts=texts,
embedding=embeddings,
collection_configuration=hnsw_config
)

Factory methods automatically handle batch processing for large datasets.
# Large dataset - automatically batched
large_texts = ["Text {}".format(i) for i in range(10000)]
large_metadatas = [{"index": i} for i in range(10000)]
# Efficiently processes in batches
vector_store = Chroma.from_texts(
texts=large_texts,
metadatas=large_metadatas,
embedding=embeddings,
collection_name="large_collection"
)

Construction methods include error handling for common failure scenarios.
try:
vector_store = Chroma.from_texts(
texts=texts,
embedding=embeddings,
persist_directory="/invalid/path"
)
except ValueError as e:
print(f"Configuration error: {e}")
except Exception as e:
print(f"Unexpected error during construction: {e}")
# Validate before construction
if texts and embeddings:
vector_store = Chroma.from_texts(texts=texts, embedding=embeddings)
else:
print("Missing required texts or embeddings")Install with Tessl CLI
npx tessl i tessl/pypi-langchain-chroma