An integration package connecting Chroma and LangChain for vector database operations.
—
Collection-level operations for managing ChromaDB collections, including document retrieval, collection maintenance, and low-level access to the underlying ChromaDB functionality.
Retrieve documents from the collection using various criteria without performing similarity search.
def get(
    ids: Optional[Union[str, list[str]]] = None,
    where: Optional[Where] = None,
    limit: Optional[int] = None,
    offset: Optional[int] = None,
    where_document: Optional[WhereDocument] = None,
    include: Optional[list[str]] = None
) -> dict[str, Any]:
    """
    Retrieve documents from the collection without performing similarity search.

    Parameters:
    - ids: Document IDs to retrieve (a single string or a list of strings)
    - where: Metadata filter conditions
    - limit: Maximum number of documents to return
    - offset: Number of documents to skip (for pagination)
    - where_document: Document content filter conditions
    - include: Fields to include in results, any of
      ["embeddings", "metadatas", "documents"]. IDs are always included;
      defaults to ["metadatas", "documents"].

    Returns:
    Dictionary with keys "ids", "embeddings", "metadatas", "documents"
    containing the requested data
    """
def get_by_ids(ids: Sequence[str], /) -> list[Document]:
    """
    Get documents by their specific IDs.

    Parameters:
    - ids: Sequence of document IDs to retrieve (positional-only)

    Returns:
    List of Document objects. The list may be shorter than the requested
    IDs when some are not found, and its order may not match the input
    ID order - rely on the document.id field to correlate results.
    """

Usage Example:
# Get specific documents by ID
result = vector_store.get(ids=["doc_1", "doc_2", "doc_3"])
print(f"Found {len(result['ids'])} documents")
# The per-field lists in the result are index-aligned, so walk them together.
for doc_id, content, metadata in zip(
    result["ids"], result["documents"], result["metadatas"]
):
    print(f"ID: {doc_id}")
    print(f"Content: {content}")
    print(f"Metadata: {metadata}")

# Get documents with pagination
page_1 = vector_store.get(limit=10, offset=0)
page_2 = vector_store.get(limit=10, offset=10)

# Get documents matching metadata criteria.
# ChromaDB accepts exactly one top-level key in a `where` filter, so
# conditions on several fields must be combined explicitly with "$and".
filtered_docs = vector_store.get(
    where={"$and": [{"category": "science"}, {"year": "2023"}]},
    limit=50
)

# Get documents with embeddings included
docs_with_embeddings = vector_store.get(
    where={"status": "active"},
    include=["documents", "metadatas", "embeddings"]
)

# Get documents by IDs using convenience method
documents = vector_store.get_by_ids(["doc_1", "doc_2"])
for doc in documents:
    print(f"ID: {doc.id}, Content: {doc.page_content}")

Operations for maintaining and managing the underlying ChromaDB collection.
def reset_collection() -> None:
    """
    Reset the collection by deleting it and recreating an empty one.

    This operation removes all documents and starts with a fresh collection
    using the same configuration settings.
    """
def delete_collection() -> None:
    """
    Delete the entire collection and all its documents.

    After this operation, the collection no longer exists and the
    Chroma instance becomes unusable.
    """

Usage Example:
# Reset collection: removes every document and recreates an empty
# collection with the same configuration settings
vector_store.reset_collection()
print("Collection reset - all documents removed")
# Delete collection entirely: the collection no longer exists afterwards
vector_store.delete_collection()
print("Collection deleted")
# Note: After delete_collection(), the vector_store instance cannot be used
# Create a new instance if needed:
# vector_store = Chroma(collection_name="new_collection", ...)

Access to collection-level information and settings.
@property
def embeddings(self) -> Optional[Embeddings]:
    """
    Access the configured embedding function.

    Returns:
    The embedding function used by this vector store, or None if not configured
    """

Usage Example:
# Check if embeddings are configured
if vector_store.embeddings:
    print("Embedding function is configured")
    # Use the embedding function directly if needed
    query_embedding = vector_store.embeddings.embed_query("test query")
else:
    print("No embedding function configured")

Use ChromaDB's filtering syntax to query documents based on metadata.
Simple Equality:
where = {"category": "science"}
where = {"$and": [{"author": "Smith"}, {"year": 2023}]}  # conditions on multiple fields must be combined with $and

Comparison Operators:
where = {"year": {"$gte": 2020}} # Greater than or equal
where = {"score": {"$lt": 0.5}} # Less than
where = {"count": {"$ne": 0}} # Not equal

Logical Operators:
where = {
"$and": [
{"category": "science"},
{"year": {"$gte": 2020}}
]
}
where = {
"$or": [
{"category": "science"},
{"category": "technology"}
]
}

Inclusion/Exclusion:
where = {"category": {"$in": ["science", "tech", "ai"]}}
where = {"status": {"$nin": ["draft", "archived"]}}

Filter based on the actual document text content.
Text Contains:
where_document = {"$contains": "machine learning"}

Text Does Not Contain:
where_document = {"$not_contains": "deprecated"}

Complex Document Filtering:
where_document = {
"$and": [
{"$contains": "python"},
{"$not_contains": "javascript"}
]
}

Efficiently handle large result sets with pagination.
Basic Pagination:
# Get first 20 documents
batch_1 = vector_store.get(limit=20, offset=0)
# Get next 20 documents
batch_2 = vector_store.get(limit=20, offset=20)
# Get documents 100-120
batch_n = vector_store.get(limit=20, offset=100)

Filtered Pagination:
def get_documents_by_category(category: str, page_size: int = 50):
    """
    Yield successive pages of documents whose metadata "category" matches.

    Parameters:
    - category: Value the "category" metadata field must equal
    - page_size: Number of documents requested per page

    Yields:
    The result of each vector_store.get() call, one page at a time.
    Iteration ends at the first page whose "ids" list is empty.
    """
    skipped = 0
    # Fetch a page and test it in one step; an empty "ids" list ends the loop.
    while (
        page := vector_store.get(
            where={"category": category},
            limit=page_size,
            offset=skipped,
        )
    )["ids"]:
        yield page
        # Advance only after the consumer asks for the next page.
        skipped += page_size
# Use pagination generator
for batch in get_documents_by_category("science"):
    print(f"Processing {len(batch['ids'])} documents")

Collection management operations can raise various exceptions:
Common Error Scenarios:
# Retrieval: guard the lookup. IDs that are not found are not reported as
# errors by get_by_ids - the returned list may simply be shorter.
try:
    documents = vector_store.get_by_ids(["invalid_id"])
except Exception as exc:
    print(f"Error retrieving documents: {exc}")

# Deletion: guard the removal of the whole collection.
try:
    vector_store.delete_collection()
except Exception as exc:
    print(f"Error deleting collection: {exc}")
# Check if collection exists before operations
if hasattr(vector_store, '_chroma_collection') and vector_store._chroma_collection:
    # Safe to perform operations
    result = vector_store.get(limit=10)
else:
    print("Collection not initialized")

Install with Tessl CLI
npx tessl i tessl/pypi-langchain-chroma