CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-llm

CLI utility and Python library for interacting with Large Language Models from multiple providers — including OpenAI, Anthropic, Google, and Meta — as well as locally installed models.

Pending
Overview
Eval results
Files

docs/embeddings.md

Embeddings

Vector database operations with similarity search, metadata storage, and efficient batch processing. This module provides comprehensive functionality for working with text embeddings, including storage, retrieval, and similarity computations.

Capabilities

Embedding Model Management

Functions to discover and work with embedding models from various providers.

def get_embedding_model(name):
    """
    Get embedding model by name or alias.
    
    Args:
        name: Model name or configured alias
        
    Returns:
        EmbeddingModel instance
        
    Raises:
        UnknownModelError: If model name/alias not found
    """

def get_embedding_models() -> List[EmbeddingModel]:
    """Get all registered embedding models."""

def get_embedding_models_with_aliases() -> List[EmbeddingModelWithAliases]:
    """Get embedding models with their configured aliases."""

def get_embedding_model_aliases() -> Dict[str, EmbeddingModel]:
    """Get mapping of all aliases to their corresponding embedding models."""

def get_default_embedding_model() -> Optional[str]:
    """Get the default embedding model name."""

def set_default_embedding_model(model: str):
    """Set the default embedding model."""

Embedding Model Hierarchy

Abstract base classes for embedding model implementations.

class EmbeddingModel(ABC):
    """Abstract base class for embedding models."""
    
    model_id: str
    batch_size: int = 100
    supports_binary: bool = False
    supports_text: bool = True
    
    @abstractmethod
    def embed(self, items: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a list of text items.
        
        Args:
            items: List of text strings to embed
            
        Returns:
            List of embedding vectors (lists of floats)
        """
    
    def embed_batch(self, items: List[str]) -> List[List[float]]:
        """Embed items in batches according to model's batch_size."""

class EmbeddingModelWithAliases:
    """Container for embedding model with its aliases."""
    
    model: EmbeddingModel
    aliases: List[str]

Collection Management

The Collection class provides vector database functionality with SQLite backend storage.

class Collection:
    """Vector database collection for embeddings storage and retrieval."""
    
    def __init__(
        self,
        name: str,
        model: Optional[EmbeddingModel] = None,
        db: Optional[Database] = None
    ):
        """
        Initialize collection.
        
        Args:
            name: Collection name
            model: Embedding model to use
            db: Optional database instance
        """
    
    def embed(
        self,
        id: str,
        value: Union[str, bytes],
        metadata: Optional[Dict[str, Any]] = None,
        store: bool = False
    ):
        """
        Embed and optionally store a single item.
        
        Args:
            id: Unique identifier for the item
            value: Text or binary content to embed
            metadata: Optional metadata dictionary
            store: Whether to store the original content
        """
    
    def embed_multi(
        self,
        entries: List[Tuple[str, Union[str, bytes]]],
        store: bool = False,
        batch_size: int = 100
    ):
        """
        Embed multiple items efficiently.
        
        Args:
            entries: List of (id, content) tuples
            store: Whether to store original content
            batch_size: Batch size for processing
        """
    
    def embed_multi_with_metadata(
        self,
        entries: List[Tuple[str, Union[str, bytes], Optional[Dict[str, Any]]]],
        store: bool = False,
        batch_size: int = 100
    ):
        """
        Embed multiple items with metadata.
        
        Args:
            entries: List of (id, content, metadata) tuples
            store: Whether to store original content
            batch_size: Batch size for processing
        """
    
    def similar(
        self,
        value: Union[str, bytes],
        number: int = 10,
        prefix: Optional[str] = None
    ) -> List[Entry]:
        """
        Find similar items by content.
        
        Args:
            value: Query content to find similar items for
            number: Maximum number of results
            prefix: Optional ID prefix filter
            
        Returns:
            List of Entry objects sorted by similarity score
        """
    
    def similar_by_id(
        self,
        id: str,
        number: int = 10,
        prefix: Optional[str] = None
    ) -> List[Entry]:
        """
        Find items similar to an existing item by ID.
        
        Args:
            id: ID of existing item to find similar items for
            number: Maximum number of results
            prefix: Optional ID prefix filter
            
        Returns:
            List of Entry objects sorted by similarity score
        """
    
    def similar_by_vector(
        self,
        vector: List[float],
        number: int = 10,
        skip_id: Optional[str] = None,
        prefix: Optional[str] = None
    ) -> List[Entry]:
        """
        Find similar items by embedding vector.
        
        Args:
            vector: Query embedding vector
            number: Maximum number of results
            skip_id: Optional ID to exclude from results
            prefix: Optional ID prefix filter
            
        Returns:
            List of Entry objects sorted by similarity score
        """
    
    def count(self) -> int:
        """Get total number of items in collection."""
    
    def delete(self):
        """Delete the collection and all its embeddings."""
    
    @classmethod
    def exists(cls, db: Database, name: str) -> bool:
        """
        Check if a collection exists in the database.
        
        Args:
            db: Database instance
            name: Collection name
            
        Returns:
            True if collection exists, False otherwise
        """
    
    name: str
    model: EmbeddingModel

Entry Objects

Entry objects represent individual items in a collection with their similarity scores.

class Entry:
    """Represents a single embedding entry with metadata."""
    
    def __init__(
        self,
        id: str,
        score: Optional[float] = None,
        content: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize entry.
        
        Args:
            id: Entry identifier
            score: Similarity score (for search results)
            content: Original text content
            metadata: Associated metadata
        """
    
    id: str
    score: Optional[float]
    content: Optional[str]
    metadata: Optional[Dict[str, Any]]

Vector Utilities

Utility functions for working with embedding vectors.

def encode(values: List[float]) -> bytes:
    """
    Encode float vector to bytes for efficient storage.
    
    Args:
        values: List of float values
        
    Returns:
        Packed binary representation
    """

def decode(binary: bytes) -> List[float]:
    """
    Decode bytes back to float vector.
    
    Args:
        binary: Packed binary data
        
    Returns:
        List of float values
    """

def cosine_similarity(a: List[float], b: List[float]) -> float:
    """
    Calculate cosine similarity between two vectors.
    
    Args:
        a: First vector
        b: Second vector
        
    Returns:
        Cosine similarity score between -1 and 1
    """

Usage Examples

Basic Collection Operations

import llm

# Get embedding model and create collection
model = llm.get_embedding_model("text-embedding-ada-002")
collection = llm.Collection("documents", model)

# Add single document
collection.embed("doc1", "Paris is the capital of France")

# Add with metadata
collection.embed(
    "doc2", 
    "London is the capital of England",
    metadata={"country": "UK", "continent": "Europe"}
)

# Search for similar documents
results = collection.similar("French capital city", number=5)
for entry in results:
    print(f"{entry.id}: {entry.content} (score: {entry.score:.3f})")
    if entry.metadata:
        print(f"  Metadata: {entry.metadata}")

Batch Operations

import llm

model = llm.get_embedding_model("text-embedding-ada-002")
collection = llm.Collection("knowledge_base", model)

# Prepare batch data
documents = [
    ("physics_1", "Einstein's theory of relativity revolutionized physics"),
    ("physics_2", "Quantum mechanics describes the behavior of matter and energy"),
    ("history_1", "The Renaissance was a period of cultural rebirth in Europe"),
    ("history_2", "The Industrial Revolution transformed manufacturing"),
]

# Batch embed for efficiency
collection.embed_multi(documents, store=True)

# Batch with metadata
documents_with_metadata = [
    ("math_1", "Calculus is fundamental to mathematics", {"subject": "mathematics"}),
    ("math_2", "Linear algebra studies vector spaces", {"subject": "mathematics"}),
    ("art_1", "The Mona Lisa is a famous painting", {"subject": "art"}),
]

collection.embed_multi_with_metadata(documents_with_metadata, store=True)

print(f"Collection now has {collection.count()} documents")

Similarity Search

import llm

model = llm.get_embedding_model("text-embedding-ada-002")
collection = llm.Collection("research_papers", model)

# Add research papers
papers = [
    ("paper1", "Deep learning applications in computer vision"),
    ("paper2", "Natural language processing with transformers"),
    ("paper3", "Reinforcement learning for robotics"),
    ("paper4", "Computer vision techniques for medical imaging"),
    ("paper5", "Machine learning for climate prediction"),
]

collection.embed_multi(papers, store=True)

# Find papers similar to a query
query = "artificial intelligence in healthcare"
similar_papers = collection.similar(query, number=3)

print(f"Papers most similar to '{query}':")
for paper in similar_papers:
    print(f"- {paper.id}: {paper.content} (similarity: {paper.score:.3f})")

# Find papers similar to an existing paper
similar_to_paper = collection.similar_by_id("paper1", number=2)
print(f"\nPapers similar to paper1:")
for paper in similar_to_paper:
    print(f"- {paper.id}: {paper.content} (similarity: {paper.score:.3f})")

Working with Vector Embeddings Directly

import llm

model = llm.get_embedding_model("text-embedding-ada-002")

# Generate embeddings directly
texts = ["Hello world", "Python programming", "Machine learning"]
embeddings = model.embed(texts)

print(f"Generated {len(embeddings)} embeddings")
print(f"Each embedding has {len(embeddings[0])} dimensions")

# Calculate similarity between embeddings
similarity = llm.cosine_similarity(embeddings[0], embeddings[1])
print(f"Similarity between '{texts[0]}' and '{texts[1]}': {similarity:.3f}")

# Encode/decode for storage
encoded = llm.encode(embeddings[0])
decoded = llm.decode(encoded)

print(f"Original vector length: {len(embeddings[0])}")
print(f"Encoded bytes length: {len(encoded)}")
print(f"Decoded vector length: {len(decoded)}")
print(f"Vectors match: {embeddings[0] == decoded}")

Collection with Filtering

import llm

model = llm.get_embedding_model("text-embedding-ada-002")
collection = llm.Collection("products", model)

# Add products with metadata
products = [
    ("prod_1", "iPhone 15 Pro smartphone", {"category": "electronics", "price": 999}),
    ("prod_2", "MacBook Air laptop computer", {"category": "electronics", "price": 1299}),
    ("prod_3", "Nike Air Jordan sneakers", {"category": "clothing", "price": 180}),
    ("prod_4", "Samsung Galaxy tablet", {"category": "electronics", "price": 499}),
]

for prod_id, description, metadata in products:
    collection.embed(prod_id, description, metadata=metadata, store=True)

# Search with prefix filtering (e.g., only electronics)
electronics = collection.similar(
    "portable computer device",
    number=10,
    prefix="prod_"  # Could filter by category prefix if IDs were structured
)

print("Similar electronic products:")
for product in electronics:
    if product.metadata and product.metadata.get("category") == "electronics":
        print(f"- {product.content} (${product.metadata['price']})")

Async Embedding Operations

import asyncio
import llm

async def async_embedding_example():
    # Note: Actual async embedding models would be needed for true async operations
    model = llm.get_embedding_model("text-embedding-ada-002")
    collection = llm.Collection("async_docs", model)
    
    # In a real async scenario, you'd batch these operations
    documents = [
        "Async programming in Python",
        "Concurrency vs parallelism",
        "Event-driven architecture",
    ]
    
    # Embed in batch for efficiency
    batch_data = [(f"doc_{i}", doc) for i, doc in enumerate(documents)]
    collection.embed_multi(batch_data, store=True)
    
    # Search
    results = collection.similar("Python concurrency", number=2)
    for result in results:
        print(f"{result.id}: {result.content} ({result.score:.3f})")

# Run async example
asyncio.run(async_embedding_example())

Collection Management

import llm
from sqlite_utils import Database

# Check if collection exists
db = Database("embeddings.db")
if llm.Collection.exists(db, "my_collection"):
    print("Collection exists")
    collection = llm.Collection("my_collection", db=db)
    print(f"Collection has {collection.count()} items")
else:
    print("Creating new collection")
    model = llm.get_embedding_model("text-embedding-ada-002")
    collection = llm.Collection("my_collection", model, db=db)

# Add some data
collection.embed("item1", "Sample text for embedding")

# Clean up - delete collection when done
# collection.delete()

This comprehensive embeddings system enables efficient semantic search, document similarity, and vector operations while providing a simple interface for complex vector database operations. The SQLite backend ensures data persistence and efficient similarity computations.

Install with Tessl CLI

npx tessl i tessl/pypi-llm

docs

configuration.md

embeddings.md

index.md

models-and-conversations.md

plugins.md

templates.md

tools-and-toolboxes.md

tile.json