# Embeddings

Vector database operations with similarity search, metadata storage, and efficient batch processing. This module provides functionality for working with text embeddings, including storage, retrieval, and similarity computations.

## Capabilities

### Embedding Model Management

Functions to discover and work with embedding models from various providers.

```python { .api }
def get_embedding_model(name: str) -> EmbeddingModel:
    """
    Get an embedding model by name or alias.

    Args:
        name: Model name or configured alias

    Returns:
        EmbeddingModel instance

    Raises:
        UnknownModelError: If the model name/alias is not found
    """

def get_embedding_models() -> List[EmbeddingModel]:
    """Get all registered embedding models."""

def get_embedding_models_with_aliases() -> List[EmbeddingModelWithAliases]:
    """Get embedding models with their configured aliases."""

def get_embedding_model_aliases() -> Dict[str, EmbeddingModel]:
    """Get mapping of all aliases to their corresponding embedding models."""

def get_default_embedding_model() -> Optional[str]:
    """Get the default embedding model name."""

def set_default_embedding_model(model: str):
    """Set the default embedding model."""
```
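
Name/alias resolution can be pictured as two dictionary lookups: try the canonical model IDs first, then the configured aliases, and fail loudly otherwise. The sketch below is illustrative only — the registry structure, its contents, and the `resolve_model_name` helper are stand-ins, not the library's actual internals:

```python
from typing import Dict

class UnknownModelError(KeyError):
    """Raised when a model name or alias cannot be resolved."""

# Hypothetical registry: canonical model IDs plus user-configured aliases
MODELS: Dict[str, str] = {"text-embedding-ada-002": "text-embedding-ada-002"}
ALIASES: Dict[str, str] = {"ada": "text-embedding-ada-002"}

def resolve_model_name(name: str) -> str:
    """Resolve a name or alias to a canonical model ID."""
    if name in MODELS:
        return MODELS[name]
    if name in ALIASES:
        return ALIASES[name]
    raise UnknownModelError(name)

print(resolve_model_name("ada"))  # → text-embedding-ada-002
```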

### Embedding Model Hierarchy

Abstract base classes for embedding model implementations.

```python { .api }
class EmbeddingModel(ABC):
    """Abstract base class for embedding models."""

    model_id: str
    batch_size: int = 100
    supports_binary: bool = False
    supports_text: bool = True

    @abstractmethod
    def embed(self, items: List[str]) -> List[List[float]]:
        """
        Generate embeddings for a list of text items.

        Args:
            items: List of text strings to embed

        Returns:
            List of embedding vectors (lists of floats)
        """

    def embed_batch(self, items: List[str]) -> List[List[float]]:
        """Embed items in batches according to the model's batch_size."""

class EmbeddingModelWithAliases:
    """Container for an embedding model and its aliases."""

    model: EmbeddingModel
    aliases: List[str]
```
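
A custom model is built by subclassing `EmbeddingModel` and implementing `embed()`. The sketch below is self-contained: it redeclares a minimal base class matching the interface above (including a plain-Python chunking `embed_batch`, since the real base-class implementation is not shown here), and `ToyModel` returns deterministic 3-dimensional vectors from character statistics — purely illustrative, not a real embedding:

```python
from abc import ABC, abstractmethod
from typing import List

class EmbeddingModel(ABC):
    """Minimal base class matching the interface above."""
    model_id: str
    batch_size: int = 100

    @abstractmethod
    def embed(self, items: List[str]) -> List[List[float]]:
        ...

    def embed_batch(self, items: List[str]) -> List[List[float]]:
        # Process items in chunks of batch_size and concatenate the results
        results: List[List[float]] = []
        for start in range(0, len(items), self.batch_size):
            results.extend(self.embed(items[start:start + self.batch_size]))
        return results

class ToyModel(EmbeddingModel):
    """Deterministic 3-d 'embeddings' from character statistics (illustrative only)."""
    model_id = "toy-3d"
    batch_size = 2

    def embed(self, items: List[str]) -> List[List[float]]:
        return [
            [float(len(t)), float(sum(map(ord, t)) % 97), float(t.count(" "))]
            for t in items
        ]

model = ToyModel()
vectors = model.embed_batch(["hello world", "embeddings", "abc"])
print(vectors)  # three 3-d vectors, computed in chunks of two
```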

### Collection Management

The Collection class provides vector database functionality with SQLite backend storage.

```python { .api }
class Collection:
    """Vector database collection for embeddings storage and retrieval."""

    name: str
    model: EmbeddingModel

    def __init__(
        self,
        name: str,
        model: Optional[EmbeddingModel] = None,
        db: Optional[Database] = None
    ):
        """
        Initialize collection.

        Args:
            name: Collection name
            model: Embedding model to use
            db: Optional database instance
        """

    def embed(
        self,
        id: str,
        value: Union[str, bytes],
        metadata: Optional[Dict[str, Any]] = None,
        store: bool = False
    ):
        """
        Embed and optionally store a single item.

        Args:
            id: Unique identifier for the item
            value: Text or binary content to embed
            metadata: Optional metadata dictionary
            store: Whether to store the original content
        """

    def embed_multi(
        self,
        entries: List[Tuple[str, Union[str, bytes]]],
        store: bool = False,
        batch_size: int = 100
    ):
        """
        Embed multiple items efficiently.

        Args:
            entries: List of (id, content) tuples
            store: Whether to store original content
            batch_size: Batch size for processing
        """

    def embed_multi_with_metadata(
        self,
        entries: List[Tuple[str, Union[str, bytes], Optional[Dict[str, Any]]]],
        store: bool = False,
        batch_size: int = 100
    ):
        """
        Embed multiple items with metadata.

        Args:
            entries: List of (id, content, metadata) tuples
            store: Whether to store original content
            batch_size: Batch size for processing
        """

    def similar(
        self,
        value: Union[str, bytes],
        number: int = 10,
        prefix: Optional[str] = None
    ) -> List[Entry]:
        """
        Find similar items by content.

        Args:
            value: Query content to find similar items for
            number: Maximum number of results
            prefix: Optional ID prefix filter

        Returns:
            List of Entry objects sorted by similarity score
        """

    def similar_by_id(
        self,
        id: str,
        number: int = 10,
        prefix: Optional[str] = None
    ) -> List[Entry]:
        """
        Find items similar to an existing item by ID.

        Args:
            id: ID of existing item to find similar items for
            number: Maximum number of results
            prefix: Optional ID prefix filter

        Returns:
            List of Entry objects sorted by similarity score
        """

    def similar_by_vector(
        self,
        vector: List[float],
        number: int = 10,
        skip_id: Optional[str] = None,
        prefix: Optional[str] = None
    ) -> List[Entry]:
        """
        Find similar items by embedding vector.

        Args:
            vector: Query embedding vector
            number: Maximum number of results
            skip_id: Optional ID to exclude from results
            prefix: Optional ID prefix filter

        Returns:
            List of Entry objects sorted by similarity score
        """

    def count(self) -> int:
        """Get total number of items in the collection."""

    def delete(self):
        """Delete the collection and all its embeddings."""

    @classmethod
    def exists(cls, db: Database, name: str) -> bool:
        """
        Check if a collection exists in the database.

        Args:
            db: Database instance
            name: Collection name

        Returns:
            True if the collection exists, False otherwise
        """
```
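
Conceptually, `similar_by_vector` can be pictured as a brute-force scan: score every stored vector against the query with cosine similarity, apply the `skip_id` and `prefix` filters, and keep the top `number` results. The sketch below works over a plain dict of stored vectors under that assumption — the library's actual storage layout and scoring loop may differ:

```python
import math
from typing import Dict, List, Optional, Tuple

def cosine_similarity(a: List[float], b: List[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))

def similar_by_vector(
    stored: Dict[str, List[float]],
    vector: List[float],
    number: int = 10,
    skip_id: Optional[str] = None,
    prefix: Optional[str] = None,
) -> List[Tuple[str, float]]:
    """Score every stored vector and return the top `number` (id, score) pairs."""
    scored = [
        (id, cosine_similarity(vec, vector))
        for id, vec in stored.items()
        if id != skip_id and (prefix is None or id.startswith(prefix))
    ]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)[:number]

stored = {"a_1": [1.0, 0.0], "a_2": [0.6, 0.8], "b_1": [0.0, 1.0]}
# prefix="a_" excludes b_1; the top match is ("a_1", 1.0)
print(similar_by_vector(stored, [1.0, 0.0], number=2, prefix="a_"))
```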

### Entry Objects

Entry objects represent individual items in a collection with their similarity scores.

```python { .api }
class Entry:
    """Represents a single embedding entry with metadata."""

    id: str
    score: Optional[float]
    content: Optional[str]
    metadata: Optional[Dict[str, Any]]

    def __init__(
        self,
        id: str,
        score: Optional[float] = None,
        content: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ):
        """
        Initialize entry.

        Args:
            id: Entry identifier
            score: Similarity score (for search results)
            content: Original text content
            metadata: Associated metadata
        """
```

### Vector Utilities

Utility functions for working with embedding vectors.

```python { .api }
def encode(values: List[float]) -> bytes:
    """
    Encode a float vector to bytes for efficient storage.

    Args:
        values: List of float values

    Returns:
        Packed binary representation
    """

def decode(binary: bytes) -> List[float]:
    """
    Decode bytes back to a float vector.

    Args:
        binary: Packed binary data

    Returns:
        List of float values
    """

def cosine_similarity(a: List[float], b: List[float]) -> float:
    """
    Calculate the cosine similarity between two vectors.

    Args:
        a: First vector
        b: Second vector

    Returns:
        Cosine similarity score between -1 and 1
    """
```
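
These utilities can be approximated in pure Python with `struct` and `math`. The sketch below assumes little-endian 32-bit float packing — a common choice for this kind of storage, though the library's actual wire format may differ. The float32 assumption also explains why a decoded vector can differ from the original Python floats by rounding:

```python
import math
import struct
from typing import List

def encode(values: List[float]) -> bytes:
    """Pack a vector into little-endian 32-bit floats (4 bytes per dimension)."""
    return struct.pack("<" + "f" * len(values), *values)

def decode(binary: bytes) -> List[float]:
    """Unpack little-endian 32-bit floats back into a list."""
    return list(struct.unpack("<" + "f" * (len(binary) // 4), binary))

def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Dot product divided by the product of vector magnitudes."""
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))

vec = [0.25, -0.5, 1.0]  # values exactly representable in float32
assert decode(encode(vec)) == vec
print(cosine_similarity([1.0, 0.0], [0.0, 1.0]))  # orthogonal vectors → 0.0
```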

## Usage Examples

### Basic Collection Operations

```python
import llm

# Get embedding model and create collection
model = llm.get_embedding_model("text-embedding-ada-002")
collection = llm.Collection("documents", model)

# Add a single document (store=True keeps the original text so entry.content is available)
collection.embed("doc1", "Paris is the capital of France", store=True)

# Add with metadata
collection.embed(
    "doc2",
    "London is the capital of England",
    metadata={"country": "UK", "continent": "Europe"},
    store=True
)

# Search for similar documents
results = collection.similar("French capital city", number=5)
for entry in results:
    print(f"{entry.id}: {entry.content} (score: {entry.score:.3f})")
    if entry.metadata:
        print(f"  Metadata: {entry.metadata}")
```

### Batch Operations

```python
import llm

model = llm.get_embedding_model("text-embedding-ada-002")
collection = llm.Collection("knowledge_base", model)

# Prepare batch data
documents = [
    ("physics_1", "Einstein's theory of relativity revolutionized physics"),
    ("physics_2", "Quantum mechanics describes the behavior of matter and energy"),
    ("history_1", "The Renaissance was a period of cultural rebirth in Europe"),
    ("history_2", "The Industrial Revolution transformed manufacturing"),
]

# Batch embed for efficiency
collection.embed_multi(documents, store=True)

# Batch with metadata
documents_with_metadata = [
    ("math_1", "Calculus is fundamental to mathematics", {"subject": "mathematics"}),
    ("math_2", "Linear algebra studies vector spaces", {"subject": "mathematics"}),
    ("art_1", "The Mona Lisa is a famous painting", {"subject": "art"}),
]

collection.embed_multi_with_metadata(documents_with_metadata, store=True)

print(f"Collection now has {collection.count()} documents")
```

### Similarity Search

```python
import llm

model = llm.get_embedding_model("text-embedding-ada-002")
collection = llm.Collection("research_papers", model)

# Add research papers
papers = [
    ("paper1", "Deep learning applications in computer vision"),
    ("paper2", "Natural language processing with transformers"),
    ("paper3", "Reinforcement learning for robotics"),
    ("paper4", "Computer vision techniques for medical imaging"),
    ("paper5", "Machine learning for climate prediction"),
]

collection.embed_multi(papers, store=True)

# Find papers similar to a query
query = "artificial intelligence in healthcare"
similar_papers = collection.similar(query, number=3)

print(f"Papers most similar to '{query}':")
for paper in similar_papers:
    print(f"- {paper.id}: {paper.content} (similarity: {paper.score:.3f})")

# Find papers similar to an existing paper
similar_to_paper = collection.similar_by_id("paper1", number=2)
print("\nPapers similar to paper1:")
for paper in similar_to_paper:
    print(f"- {paper.id}: {paper.content} (similarity: {paper.score:.3f})")
```

### Working with Vector Embeddings Directly

```python
import llm

model = llm.get_embedding_model("text-embedding-ada-002")

# Generate embeddings directly
texts = ["Hello world", "Python programming", "Machine learning"]
embeddings = model.embed(texts)

print(f"Generated {len(embeddings)} embeddings")
print(f"Each embedding has {len(embeddings[0])} dimensions")

# Calculate similarity between embeddings
similarity = llm.cosine_similarity(embeddings[0], embeddings[1])
print(f"Similarity between '{texts[0]}' and '{texts[1]}': {similarity:.3f}")

# Encode/decode for storage
encoded = llm.encode(embeddings[0])
decoded = llm.decode(encoded)

print(f"Original vector length: {len(embeddings[0])}")
print(f"Encoded bytes length: {len(encoded)}")
print(f"Decoded vector length: {len(decoded)}")

# Packing may round values, so compare with a tolerance rather than exact equality
close = all(abs(x - y) < 1e-6 for x, y in zip(embeddings[0], decoded))
print(f"Vectors match (within tolerance): {close}")
```

### Collection with Filtering

```python
import llm

model = llm.get_embedding_model("text-embedding-ada-002")
collection = llm.Collection("products", model)

# Add products with metadata
products = [
    ("prod_1", "iPhone 15 Pro smartphone", {"category": "electronics", "price": 999}),
    ("prod_2", "MacBook Air laptop computer", {"category": "electronics", "price": 1299}),
    ("prod_3", "Nike Air Jordan sneakers", {"category": "clothing", "price": 180}),
    ("prod_4", "Samsung Galaxy tablet", {"category": "electronics", "price": 499}),
]

for prod_id, description, metadata in products:
    collection.embed(prod_id, description, metadata=metadata, store=True)

# Search with ID prefix filtering
results = collection.similar(
    "portable computer device",
    number=10,
    prefix="prod_"  # matches item IDs; structured IDs (e.g. "electronics_") would allow category filtering here
)

# The prefix filter operates on IDs only, so filter on metadata after retrieval
print("Similar electronic products:")
for product in results:
    if product.metadata and product.metadata.get("category") == "electronics":
        print(f"- {product.content} (${product.metadata['price']})")
```

### Async Embedding Operations

```python
import asyncio
import llm

async def async_embedding_example():
    # Note: these embedding calls are synchronous; a truly async model would be
    # needed for non-blocking embedding, so batch the work to minimize blocking
    model = llm.get_embedding_model("text-embedding-ada-002")
    collection = llm.Collection("async_docs", model)

    documents = [
        "Async programming in Python",
        "Concurrency vs parallelism",
        "Event-driven architecture",
    ]

    # Embed in a single batch for efficiency
    batch_data = [(f"doc_{i}", doc) for i, doc in enumerate(documents)]
    collection.embed_multi(batch_data, store=True)

    # Search
    results = collection.similar("Python concurrency", number=2)
    for result in results:
        print(f"{result.id}: {result.content} ({result.score:.3f})")

# Run async example
asyncio.run(async_embedding_example())
```
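
Until async embedding models are available, one common pattern is to push the blocking `embed()` call onto a worker thread so the event loop stays responsive. The sketch below uses a stand-in `blocking_embed` function rather than a real model:

```python
import asyncio
from typing import List

def blocking_embed(items: List[str]) -> List[List[float]]:
    """Stand-in for a synchronous model.embed() call."""
    return [[float(len(t))] for t in items]

async def embed_async(items: List[str]) -> List[List[float]]:
    # Offload the blocking call to a thread; the event loop keeps running meanwhile
    return await asyncio.to_thread(blocking_embed, items)

vectors = asyncio.run(embed_async(["hello", "world!"]))
print(vectors)  # → [[5.0], [6.0]]
```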

### Managing Stored Collections

```python
import llm
from sqlite_utils import Database

# Check if the collection exists
db = Database("embeddings.db")
if llm.Collection.exists(db, "my_collection"):
    print("Collection exists")
    collection = llm.Collection("my_collection", db=db)
    print(f"Collection has {collection.count()} items")
else:
    print("Creating new collection")
    model = llm.get_embedding_model("text-embedding-ada-002")
    collection = llm.Collection("my_collection", model, db=db)

# Add some data
collection.embed("item1", "Sample text for embedding")

# Clean up - delete the collection when done
# collection.delete()
```

This embeddings system enables semantic search, document similarity, and direct vector operations through a simple interface; the SQLite backend provides data persistence and efficient similarity computations.