CLI utility and Python library for interacting with Large Language Models from multiple providers including OpenAI, Anthropic, Google, and Meta plus locally installed models.
—
Vector database operations with similarity search, metadata storage, and efficient batch processing. This module provides comprehensive functionality for working with text embeddings, including storage, retrieval, and similarity computations.
Functions to discover and work with embedding models from various providers.
def get_embedding_model(name):
    """
    Look up a registered embedding model by its name or a configured alias.

    Args:
        name: Model name or configured alias.

    Returns:
        EmbeddingModel instance.

    Raises:
        UnknownModelError: If no model matches the given name or alias.
    """
def get_embedding_models() -> List[EmbeddingModel]:
    """Return every embedding model currently registered."""
def get_embedding_models_with_aliases() -> List[EmbeddingModelWithAliases]:
    """Return each registered embedding model paired with its configured aliases."""
def get_embedding_model_aliases() -> Dict[str, EmbeddingModel]:
    """Return a mapping from every configured alias to its embedding model."""
def get_default_embedding_model() -> Optional[str]:
    """Return the name of the default embedding model, or None when unset."""
def set_default_embedding_model(model: str):
"""Set the default embedding model."""Abstract base classes for embedding model implementations.
class EmbeddingModel(ABC):
    """Abstract base class that concrete embedding models implement."""

    # Identifier under which the model is registered.
    model_id: str
    # Number of items sent per chunk by embed_batch().
    batch_size: int = 100
    # Capability flags advertised by the implementation.
    supports_binary: bool = False
    supports_text: bool = True

    @abstractmethod
    def embed(self, items: List[str]) -> List[List[float]]:
        """
        Produce one embedding vector per input text.

        Args:
            items: Text strings to embed.

        Returns:
            List of embedding vectors (lists of floats), one per item.
        """

    def embed_batch(self, items: List[str]) -> List[List[float]]:
        """Embed the items in chunks sized by the model's batch_size."""
class EmbeddingModelWithAliases:
    """Pairs an embedding model with the aliases configured for it."""

    model: EmbeddingModel
aliases: List[str]

The Collection class provides vector database functionality with SQLite backend storage.
class Collection:
    """SQLite-backed vector collection for storing and searching embeddings."""

    def __init__(
        self,
        name: str,
        model: Optional[EmbeddingModel] = None,
        db: Optional[Database] = None
    ):
        """
        Create or open the named collection.

        Args:
            name: Collection name.
            model: Embedding model used when embedding new items.
            db: Optional database instance backing the collection.
        """

    def embed(
        self,
        id: str,
        value: Union[str, bytes],
        metadata: Optional[Dict[str, Any]] = None,
        store: bool = False
    ):
        """
        Embed a single item and optionally persist its source content.

        Args:
            id: Unique identifier for the item.
            value: Text or binary content to embed.
            metadata: Optional metadata dictionary.
            store: When True, keep the original content alongside the vector.
        """

    def embed_multi(
        self,
        entries: List[Tuple[str, Union[str, bytes]]],
        store: bool = False,
        batch_size: int = 100
    ):
        """
        Embed many items efficiently in batches.

        Args:
            entries: (id, content) tuples to embed.
            store: When True, keep the original content.
            batch_size: Number of items processed per batch.
        """

    def embed_multi_with_metadata(
        self,
        entries: List[Tuple[str, Union[str, bytes], Optional[Dict[str, Any]]]],
        store: bool = False,
        batch_size: int = 100
    ):
        """
        Embed many items, each carrying optional metadata.

        Args:
            entries: (id, content, metadata) tuples to embed.
            store: When True, keep the original content.
            batch_size: Number of items processed per batch.
        """

    def similar(
        self,
        value: Union[str, bytes],
        number: int = 10,
        prefix: Optional[str] = None
    ) -> List[Entry]:
        """
        Search the collection for items similar to the given content.

        Args:
            value: Query content to compare against.
            number: Maximum number of results.
            prefix: Optional ID prefix filter.

        Returns:
            Entry objects ordered by similarity score.
        """

    def similar_by_id(
        self,
        id: str,
        number: int = 10,
        prefix: Optional[str] = None
    ) -> List[Entry]:
        """
        Search for items similar to an item already in the collection.

        Args:
            id: Identifier of the existing item to compare against.
            number: Maximum number of results.
            prefix: Optional ID prefix filter.

        Returns:
            Entry objects ordered by similarity score.
        """

    def similar_by_vector(
        self,
        vector: List[float],
        number: int = 10,
        skip_id: Optional[str] = None,
        prefix: Optional[str] = None
    ) -> List[Entry]:
        """
        Search for items similar to a raw embedding vector.

        Args:
            vector: Query embedding vector.
            number: Maximum number of results.
            skip_id: Optional ID to leave out of the results.
            prefix: Optional ID prefix filter.

        Returns:
            Entry objects ordered by similarity score.
        """

    def count(self) -> int:
        """Return the number of items stored in the collection."""

    def delete(self):
        """Remove the collection together with all of its embeddings."""

    @classmethod
    def exists(cls, db: Database, name: str) -> bool:
        """
        Report whether a collection with this name is present in the database.

        Args:
            db: Database instance.
            name: Collection name.

        Returns:
            True if the collection exists, False otherwise.
        """

    # Declared collection attributes.
    name: str
model: EmbeddingModel

Entry objects represent individual items in a collection with their similarity scores.
class Entry:
    """A single embedding record, optionally carrying a similarity score."""

    def __init__(
        self,
        id: str,
        score: Optional[float] = None,
        content: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ):
        """
        Build an entry.

        Args:
            id: Entry identifier.
            score: Similarity score when produced by a search.
            content: Original text content, if stored.
            metadata: Associated metadata, if any.
        """

    # Declared attributes.
    id: str
    score: Optional[float]
    content: Optional[str]
metadata: Optional[Dict[str, Any]]

Utility functions for working with embedding vectors.
def encode(values: List[float]) -> bytes:
    """
    Pack a float vector into a compact binary blob for storage.

    Args:
        values: Sequence of float values.

    Returns:
        Packed binary representation.
    """
def decode(binary: bytes) -> List[float]:
    """
    Unpack a binary blob produced by encode() back into a float vector.

    Args:
        binary: Packed binary data.

    Returns:
        List of float values.
    """
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """
    Compute the cosine similarity of two vectors.

    Args:
        a: First vector.
        b: Second vector.

    Returns:
        Cosine similarity score between -1 and 1.
    """

import llm
# Example: embed individual documents, then run a similarity search.
# Get embedding model and create collection
model = llm.get_embedding_model("text-embedding-ada-002")
collection = llm.Collection("documents", model)

# Add single document
collection.embed("doc1", "Paris is the capital of France")

# Add with metadata
collection.embed(
    "doc2",
    "London is the capital of England",
    metadata={"country": "UK", "continent": "Europe"}
)

# Search for similar documents
results = collection.similar("French capital city", number=5)
for entry in results:
    print(f"{entry.id}: {entry.content} (score: {entry.score:.3f})")
    if entry.metadata:
        print(f" Metadata: {entry.metadata}")

import llm
# Example: batch embedding, with and without per-item metadata.
model = llm.get_embedding_model("text-embedding-ada-002")
collection = llm.Collection("knowledge_base", model)

# Prepare batch data
documents = [
    ("physics_1", "Einstein's theory of relativity revolutionized physics"),
    ("physics_2", "Quantum mechanics describes the behavior of matter and energy"),
    ("history_1", "The Renaissance was a period of cultural rebirth in Europe"),
    ("history_2", "The Industrial Revolution transformed manufacturing"),
]

# Batch embed for efficiency
collection.embed_multi(documents, store=True)

# Batch with metadata
documents_with_metadata = [
    ("math_1", "Calculus is fundamental to mathematics", {"subject": "mathematics"}),
    ("math_2", "Linear algebra studies vector spaces", {"subject": "mathematics"}),
    ("art_1", "The Mona Lisa is a famous painting", {"subject": "art"}),
]
collection.embed_multi_with_metadata(documents_with_metadata, store=True)

print(f"Collection now has {collection.count()} documents")

import llm
# Example: similarity search by free-text query and by existing item ID.
model = llm.get_embedding_model("text-embedding-ada-002")
collection = llm.Collection("research_papers", model)

# Add research papers
papers = [
    ("paper1", "Deep learning applications in computer vision"),
    ("paper2", "Natural language processing with transformers"),
    ("paper3", "Reinforcement learning for robotics"),
    ("paper4", "Computer vision techniques for medical imaging"),
    ("paper5", "Machine learning for climate prediction"),
]
collection.embed_multi(papers, store=True)

# Find papers similar to a query
query = "artificial intelligence in healthcare"
similar_papers = collection.similar(query, number=3)
print(f"Papers most similar to '{query}':")
for paper in similar_papers:
    print(f"- {paper.id}: {paper.content} (similarity: {paper.score:.3f})")

# Find papers similar to an existing paper
similar_to_paper = collection.similar_by_id("paper1", number=2)
print(f"\nPapers similar to paper1:")
for paper in similar_to_paper:
    print(f"- {paper.id}: {paper.content} (similarity: {paper.score:.3f})")

import llm
# Example: direct embedding generation plus encode/decode/cosine utilities.
model = llm.get_embedding_model("text-embedding-ada-002")

# Generate embeddings directly
texts = ["Hello world", "Python programming", "Machine learning"]
embeddings = model.embed(texts)
print(f"Generated {len(embeddings)} embeddings")
print(f"Each embedding has {len(embeddings[0])} dimensions")

# Calculate similarity between embeddings
similarity = llm.cosine_similarity(embeddings[0], embeddings[1])
print(f"Similarity between '{texts[0]}' and '{texts[1]}': {similarity:.3f}")

# Encode/decode for storage
encoded = llm.encode(embeddings[0])
decoded = llm.decode(encoded)
print(f"Original vector length: {len(embeddings[0])}")
print(f"Encoded bytes length: {len(encoded)}")
print(f"Decoded vector length: {len(decoded)}")
print(f"Vectors match: {embeddings[0] == decoded}")

import llm
# Example: embedding with metadata and filtering search results by metadata.
model = llm.get_embedding_model("text-embedding-ada-002")
collection = llm.Collection("products", model)

# Add products with metadata
products = [
    ("prod_1", "iPhone 15 Pro smartphone", {"category": "electronics", "price": 999}),
    ("prod_2", "MacBook Air laptop computer", {"category": "electronics", "price": 1299}),
    ("prod_3", "Nike Air Jordan sneakers", {"category": "clothing", "price": 180}),
    ("prod_4", "Samsung Galaxy tablet", {"category": "electronics", "price": 499}),
]
for prod_id, description, metadata in products:
    collection.embed(prod_id, description, metadata=metadata, store=True)

# Search with prefix filtering (e.g., only electronics)
electronics = collection.similar(
    "portable computer device",
    number=10,
    prefix="prod_"  # Could filter by category prefix if IDs were structured
)
print("Similar electronic products:")
for product in electronics:
    if product.metadata and product.metadata.get("category") == "electronics":
        print(f"- {product.content} (${product.metadata['price']})")

import asyncio
import llm

# Example: using the (synchronous) embedding API from an async entry point.
async def async_embedding_example():
    # Note: Actual async embedding models would be needed for true async operations
    model = llm.get_embedding_model("text-embedding-ada-002")
    collection = llm.Collection("async_docs", model)

    # In a real async scenario, you'd batch these operations
    documents = [
        "Async programming in Python",
        "Concurrency vs parallelism",
        "Event-driven architecture",
    ]

    # Embed in batch for efficiency
    batch_data = [(f"doc_{i}", doc) for i, doc in enumerate(documents)]
    collection.embed_multi(batch_data, store=True)

    # Search
    results = collection.similar("Python concurrency", number=2)
    for result in results:
        print(f"{result.id}: {result.content} ({result.score:.3f})")

# Run async example
asyncio.run(async_embedding_example())

import llm
from sqlite_utils import Database
# Check if collection exists
db = Database("embeddings.db")
if llm.Collection.exists(db, "my_collection"):
print("Collection exists")
collection = llm.Collection("my_collection", db=db)
print(f"Collection has {collection.count()} items")
else:
print("Creating new collection")
model = llm.get_embedding_model("text-embedding-ada-002")
collection = llm.Collection("my_collection", model, db=db)
# Add some data
collection.embed("item1", "Sample text for embedding")
# Clean up - delete collection when done
# collection.delete()This comprehensive embeddings system enables efficient semantic search, document similarity, and vector operations while providing a simple interface for complex vector database operations. The SQLite backend ensures data persistence and efficient similarity computations.
Install with Tessl CLI
npx tessl i tessl/pypi-llm