tessl/pypi-chromadb

Chroma - the open-source embedding database

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Document Operations

Name: tessl/pypi-chromadb
Author: tessl

Document operations form the core of ChromaDB's functionality, enabling storage, retrieval, updating, and deletion of documents with embeddings, metadata, and associated data. All operations support batching for efficient processing.

Capabilities

Adding Documents

Add documents to a collection with automatic or manual embedding generation, supporting text, images, URIs, and metadata.

def add(
    ids: IDs,
    documents: Optional[Documents] = None,
    embeddings: Optional[Embeddings] = None,
    metadatas: Optional[Metadatas] = None,
    images: Optional[Images] = None,
    uris: Optional[URIs] = None
) -> None:
    """
    Add documents to the collection.

    ``ids`` is the only required argument. The optional lists are expected
    to be parallel to ``ids`` (entry ``i`` describes ``ids[i]``), as shown
    in the usage example for this method.

    Args:
        ids: List of unique document identifiers
        documents: List of document text content
        embeddings: List of embedding vectors (generated if not provided)
        metadatas: List of metadata dictionaries for each document
        images: List of image arrays
        uris: List of URIs pointing to external resources

    Returns:
        None

    Raises:
        ValueError: If document IDs already exist or invalid data provided
    """

Usage Example:

import chromadb

client = chromadb.EphemeralClient()
collection = client.create_collection("documents")

# Add documents with automatic embedding generation
collection.add(
    documents=["This is the first document", "This is the second document"],
    metadatas=[{"source": "web", "type": "article"}, {"source": "book", "type": "chapter"}],
    ids=["doc1", "doc2"],
)

# Add with custom embeddings.
# Every embedding stored in one collection must have the same dimensionality,
# so the 4-dimensional pre-computed vector goes into its own collection instead
# of being mixed with the auto-generated embeddings added above.
custom_collection = client.create_collection("custom_embeddings")
custom_collection.add(
    documents=["Custom embedding document"],
    embeddings=[[0.1, 0.2, 0.3, 0.4]],  # Your pre-computed embedding
    metadatas=[{"custom": True}],
    ids=["doc3"],
)

Querying Documents

Perform vector similarity search to find documents similar to query text, embeddings, or images with filtering and ranking.

def query(
    query_texts: Optional[Documents] = None,
    query_embeddings: Optional[Embeddings] = None,
    query_images: Optional[Images] = None,
    query_uris: Optional[URIs] = None,
    ids: Optional[IDs] = None,
    n_results: int = 10,
    where: Optional[Where] = None,
    where_document: Optional[WhereDocument] = None,
    include: Include = ["metadatas", "documents", "distances"]
) -> QueryResult:
    """
    Query the collection for similar documents.

    The query can be expressed as texts, embeddings, images, or URIs.
    Results are grouped per query: ``result["ids"][0]`` holds the matches
    for the first query, ``result["ids"][1]`` for the second, and so on.

    Args:
        query_texts: List of text queries to find similar documents
        query_embeddings: List of embedding vectors to search with
        query_images: List of image arrays to search with
        query_uris: List of URIs to load and search with
        ids: Specific document IDs to search within
        n_results: Number of results to return per query (default 10)
        where: Metadata filter conditions
        where_document: Document text filter conditions
        include: Fields to include in results
            (default: metadatas, documents, and distances)

    Returns:
        QueryResult: Search results with documents, distances, and metadata
    """

Usage Example:

# Query with text, restricted to records whose metadata has source == "web"
results = collection.query(
    query_texts=["Find documents about machine learning"],
    n_results=5,
    where={"source": "web"},
    include=["documents", "metadatas", "distances"]
)

# Query with embeddings, restricted to records whose "type" is one of the listed values
# NOTE(review): the 4-value vector is a placeholder; a query embedding presumably
# has to match the dimensionality of the embeddings stored in the collection - confirm.
results = collection.query(
    query_embeddings=[[0.1, 0.2, 0.3, 0.4]],
    n_results=3,
    where={"type": {"$in": ["article", "paper"]}}
)

# `results` was rebound above, so this reports the embedding query only.
# Results are nested per query; index 0 selects the matches for the single query sent.
print(f"Found {len(results['ids'][0])} similar documents")
for i, doc in enumerate(results['documents'][0]):
    # Show the distance next to the first 100 characters of each document
    print(f"Distance: {results['distances'][0][i]:.3f}, Doc: {doc[:100]}...")

Getting Documents

Retrieve specific documents by ID or filter criteria without similarity ranking.

def get(
    ids: Optional[IDs] = None,
    where: Optional[Where] = None,
    limit: Optional[int] = None,
    offset: Optional[int] = None,
    where_document: Optional[WhereDocument] = None,
    include: Include = ["metadatas", "documents"]
) -> GetResult:
    """
    Get documents from the collection.

    Records are selected by ID and/or filter conditions; no similarity
    ranking is applied. ``limit`` and ``offset`` together page through
    the selection.

    Args:
        ids: Specific document IDs to retrieve
        where: Metadata filter conditions
        limit: Maximum number of documents to return
        offset: Number of documents to skip
        where_document: Document text filter conditions
        include: Fields to include in results
            (default: metadatas and documents)

    Returns:
        GetResult: Retrieved documents with requested fields
    """

Usage Example:

# Get specific documents by ID
docs = collection.get(
    ids=["doc1", "doc2"],
    include=["documents", "metadatas"]
)

# Get documents with metadata filtering (at most 10, embeddings included)
docs = collection.get(
    where={"source": "web"},
    limit=10,
    include=["documents", "metadatas", "embeddings"]
)

# Get the first page of up to 100 documents; this is a single page, not the
# whole collection - repeat with offset=100, 200, ... to fetch later pages
all_docs = collection.get(limit=100, offset=0)

Peeking at Documents

Quickly preview the first few documents in a collection for inspection.

def peek(self, limit: int = 10) -> GetResult:
    """
    Peek at the first few documents in the collection.

    Args:
        limit: Number of documents to return (default 10)

    Returns:
        GetResult: First documents in the collection
    """

Usage Example:

# Preview first 5 documents
preview = collection.peek(limit=5)
# peek() returns at most `limit` records, so len(preview['ids']) is the size of
# the preview, not of the collection; count() reports the real total.
print(f"Collection contains {collection.count()} documents (showing first {len(preview['ids'])})")

Updating Documents

Modify existing documents, embeddings, or metadata while preserving document IDs.

def update(
    ids: IDs,
    documents: Optional[Documents] = None,
    embeddings: Optional[Embeddings] = None,
    metadatas: Optional[Metadatas] = None,
    images: Optional[Images] = None,
    uris: Optional[URIs] = None
) -> None:
    """
    Update existing documents in the collection.

    Only the fields that are passed are meant to change; the usage example
    shows a metadata-only update that supplies no documents or embeddings.

    Args:
        ids: List of document IDs to update
        documents: New document text content
        embeddings: New embedding vectors (regenerated when new documents
            are supplied without embeddings)
        metadatas: New metadata dictionaries
        images: New image arrays
        uris: New URIs

    Returns:
        None

    Raises:
        ValueError: If document IDs do not exist
            NOTE(review): confirm against the installed ChromaDB version;
            handling of unknown IDs may differ between releases.
    """

Usage Example:

# Replace the text and metadata of doc1 (embeddings will be regenerated)
revised_text = "This is the updated first document"
revised_metadata = {"source": "web", "type": "article", "updated": True}
collection.update(
    ids=["doc1"],
    documents=[revised_text],
    metadatas=[revised_metadata],
)

# Change only the metadata of doc2; no new text or embedding is supplied
reviewed_metadata = {"source": "book", "type": "chapter", "reviewed": True}
collection.update(ids=["doc2"], metadatas=[reviewed_metadata])

Upserting Documents

Insert new documents or update existing ones in a single operation, providing convenience for data synchronization.

def upsert(
    ids: IDs,
    documents: Optional[Documents] = None,
    embeddings: Optional[Embeddings] = None,
    metadatas: Optional[Metadatas] = None,
    images: Optional[Images] = None,
    uris: Optional[URIs] = None
) -> None:
    """
    Insert new documents or update existing ones.

    IDs that are not yet in the collection are created and IDs that already
    exist are updated, so one call can carry a mix of both.

    Args:
        ids: List of document IDs to upsert
        documents: Document text content
        embeddings: Embedding vectors (generated if not provided)
        metadatas: Metadata dictionaries
        images: Image arrays
        uris: URIs

    Returns:
        None
    """

Usage Example:

# Upsert documents (creates new or updates existing)
upsert_ids = ["new_doc", "existing_doc"]
upsert_texts = ["New document", "Updated existing document"]
upsert_metadatas = [{"source": "api"}, {"source": "user", "updated": True}]

# The three lists are parallel: position i describes upsert_ids[i]
collection.upsert(
    ids=upsert_ids,
    documents=upsert_texts,
    metadatas=upsert_metadatas,
)

Deleting Documents

Remove documents from the collection by ID or filter criteria.

def delete(
    ids: Optional[IDs] = None,
    where: Optional[Where] = None,
    where_document: Optional[WhereDocument] = None
) -> None:
    """
    Delete documents from the collection.

    Args:
        ids: Specific document IDs to delete
        where: Metadata filter conditions for deletion
        where_document: Document text filter conditions for deletion

    Returns:
        None

    Raises:
        ValueError: If none of ids, where, or where_document is provided.
            Calling delete() with no arguments does not clear the
            collection; to remove everything, delete by explicit IDs or
            drop the collection through the client.
            NOTE(review): confirm against the installed ChromaDB version.
    """

Usage Example:

# Delete specific documents
collection.delete(ids=["doc1", "doc2"])

# Delete documents matching metadata criteria
collection.delete(where={"source": "temporary"})

# Delete documents matching text criteria
collection.delete(where_document={"$contains": "delete_me"})

# Delete all documents (use with caution).
# delete() requires at least one of ids / where / where_document and raises
# ValueError when called with no arguments, so look up every ID first.
# include=[] skips documents and metadata; the IDs are always returned.
remaining_ids = collection.get(include=[])["ids"]
if remaining_ids:  # an empty ID list counts as "no selector", so skip the call
    collection.delete(ids=remaining_ids)

Types

from typing import List, Dict, Optional, Union, Any, Literal
from numpy.typing import NDArray

# Basic document types
ID = str  # Unique identifier of one record
IDs = List[ID]
Document = str  # Text content of one record
Documents = List[Document]
URI = str  # Pointer to an external resource
URIs = List[URI]
Image = NDArray[Any]  # Image array
Images = List[Image]

# Embedding types
Embedding = List[float]  # One embedding vector
Embeddings = List[Embedding]

# Metadata types
# Flat mapping from string keys to scalar values (or None)
Metadata = Dict[str, Union[str, int, float, bool, None]]
Metadatas = List[Metadata]

# Query filter types
# Where filters on metadata, e.g. {"source": "web"} or {"type": {"$in": ["article", "paper"]}}
Where = Dict[Union[str, Literal["$and", "$or"]], Any]
# WhereDocument filters on document text, e.g. {"$contains": "delete_me"}
WhereDocument = Dict[Literal["$contains", "$not_contains"], Union[str, List[Any]]]

# Include fields specification
# Names of the result fields a get()/query() call should return
Include = List[Literal["documents", "embeddings", "metadatas", "distances", "uris", "data"]]

# Result types
# GetResult lists are flat: one entry per record (e.g. len(result["ids"]))
GetResult = Dict[str, List[Any]]  # Contains ids, documents, metadatas, embeddings, etc.
# QueryResult lists are nested per query: result["ids"][0] holds the matches for the first query
QueryResult = Dict[str, List[Any]]  # Contains ids, documents, metadatas, embeddings, distances, etc.

Install with Tessl CLI