or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md · embeddings.md · index.md · models-and-conversations.md · plugins.md · templates.md · tools-and-toolboxes.md

docs/embeddings.md

0

# Embeddings

1

2

Vector database operations with similarity search, metadata storage, and efficient batch processing. This module provides comprehensive functionality for working with text embeddings, including storage, retrieval, and similarity computations.

3

4

## Capabilities

5

6

### Embedding Model Management

7

8

Functions to discover and work with embedding models from various providers.

9

10

```python { .api }

11

def get_embedding_model(name):

12

"""

13

Get embedding model by name or alias.

14

15

Args:

16

name: Model name or configured alias

17

18

Returns:

19

EmbeddingModel instance

20

21

Raises:

22

UnknownModelError: If model name/alias not found

23

"""

24

25

def get_embedding_models() -> List[EmbeddingModel]:

26

"""Get all registered embedding models."""

27

28

def get_embedding_models_with_aliases() -> List[EmbeddingModelWithAliases]:

29

"""Get embedding models with their configured aliases."""

30

31

def get_embedding_model_aliases() -> Dict[str, EmbeddingModel]:

32

"""Get mapping of all aliases to their corresponding embedding models."""

33

34

def get_default_embedding_model() -> Optional[str]:

35

"""Get the default embedding model name."""

36

37

def set_default_embedding_model(model: str):

38

"""Set the default embedding model."""

39

```

40

41

### Embedding Model Hierarchy

42

43

Abstract base classes for embedding model implementations.

44

45

```python { .api }

46

class EmbeddingModel(ABC):

47

"""Abstract base class for embedding models."""

48

49

model_id: str

50

batch_size: int = 100

51

supports_binary: bool = False

52

supports_text: bool = True

53

54

@abstractmethod

55

def embed(self, items: List[str]) -> List[List[float]]:

56

"""

57

Generate embeddings for a list of text items.

58

59

Args:

60

items: List of text strings to embed

61

62

Returns:

63

List of embedding vectors (lists of floats)

64

"""

65

66

def embed_batch(self, items: List[str]) -> List[List[float]]:

67

"""Embed items in batches according to model's batch_size."""

68

69

class EmbeddingModelWithAliases:

70

"""Container for embedding model with its aliases."""

71

72

model: EmbeddingModel

73

aliases: List[str]

74

```

75

76

### Collection Management

77

78

The Collection class provides vector database functionality with SQLite backend storage.

79

80

```python { .api }

81

class Collection:

82

"""Vector database collection for embeddings storage and retrieval."""

83

84

def __init__(

85

self,

86

name: str,

87

model: Optional[EmbeddingModel] = None,

88

db: Optional[Database] = None

89

):

90

"""

91

Initialize collection.

92

93

Args:

94

name: Collection name

95

model: Embedding model to use

96

db: Optional database instance

97

"""

98

99

def embed(

100

self,

101

id: str,

102

value: Union[str, bytes],

103

metadata: Optional[Dict[str, Any]] = None,

104

store: bool = False

105

):

106

"""

107

Embed and optionally store a single item.

108

109

Args:

110

id: Unique identifier for the item

111

value: Text or binary content to embed

112

metadata: Optional metadata dictionary

113

store: Whether to store the original content

114

"""

115

116

def embed_multi(

117

self,

118

entries: List[Tuple[str, Union[str, bytes]]],

119

store: bool = False,

120

batch_size: int = 100

121

):

122

"""

123

Embed multiple items efficiently.

124

125

Args:

126

entries: List of (id, content) tuples

127

store: Whether to store original content

128

batch_size: Batch size for processing

129

"""

130

131

def embed_multi_with_metadata(

132

self,

133

entries: List[Tuple[str, Union[str, bytes], Optional[Dict[str, Any]]]],

134

store: bool = False,

135

batch_size: int = 100

136

):

137

"""

138

Embed multiple items with metadata.

139

140

Args:

141

entries: List of (id, content, metadata) tuples

142

store: Whether to store original content

143

batch_size: Batch size for processing

144

"""

145

146

def similar(

147

self,

148

value: Union[str, bytes],

149

number: int = 10,

150

prefix: Optional[str] = None

151

) -> List[Entry]:

152

"""

153

Find similar items by content.

154

155

Args:

156

value: Query content to find similar items for

157

number: Maximum number of results

158

prefix: Optional ID prefix filter

159

160

Returns:

161

List of Entry objects sorted by similarity score

162

"""

163

164

def similar_by_id(

165

self,

166

id: str,

167

number: int = 10,

168

prefix: Optional[str] = None

169

) -> List[Entry]:

170

"""

171

Find items similar to an existing item by ID.

172

173

Args:

174

id: ID of existing item to find similar items for

175

number: Maximum number of results

176

prefix: Optional ID prefix filter

177

178

Returns:

179

List of Entry objects sorted by similarity score

180

"""

181

182

def similar_by_vector(

183

self,

184

vector: List[float],

185

number: int = 10,

186

skip_id: Optional[str] = None,

187

prefix: Optional[str] = None

188

) -> List[Entry]:

189

"""

190

Find similar items by embedding vector.

191

192

Args:

193

vector: Query embedding vector

194

number: Maximum number of results

195

skip_id: Optional ID to exclude from results

196

prefix: Optional ID prefix filter

197

198

Returns:

199

List of Entry objects sorted by similarity score

200

"""

201

202

def count(self) -> int:

203

"""Get total number of items in collection."""

204

205

def delete(self):

206

"""Delete the collection and all its embeddings."""

207

208

@classmethod

209

def exists(cls, db: Database, name: str) -> bool:

210

"""

211

Check if a collection exists in the database.

212

213

Args:

214

db: Database instance

215

name: Collection name

216

217

Returns:

218

True if collection exists, False otherwise

219

"""

220

221

name: str

222

model: EmbeddingModel

223

```

224

225

### Entry Objects

226

227

Entry objects represent individual items in a collection with their similarity scores.

228

229

```python { .api }

230

class Entry:

231

"""Represents a single embedding entry with metadata."""

232

233

def __init__(

234

self,

235

id: str,

236

score: Optional[float] = None,

237

content: Optional[str] = None,

238

metadata: Optional[Dict[str, Any]] = None

239

):

240

"""

241

Initialize entry.

242

243

Args:

244

id: Entry identifier

245

score: Similarity score (for search results)

246

content: Original text content

247

metadata: Associated metadata

248

"""

249

250

id: str

251

score: Optional[float]

252

content: Optional[str]

253

metadata: Optional[Dict[str, Any]]

254

```

255

256

### Vector Utilities

257

258

Utility functions for working with embedding vectors.

259

260

```python { .api }

261

def encode(values: List[float]) -> bytes:

262

"""

263

Encode float vector to bytes for efficient storage.

264

265

Args:

266

values: List of float values

267

268

Returns:

269

Packed binary representation

270

"""

271

272

def decode(binary: bytes) -> List[float]:

273

"""

274

Decode bytes back to float vector.

275

276

Args:

277

binary: Packed binary data

278

279

Returns:

280

List of float values

281

"""

282

283

def cosine_similarity(a: List[float], b: List[float]) -> float:

284

"""

285

Calculate cosine similarity between two vectors.

286

287

Args:

288

a: First vector

289

b: Second vector

290

291

Returns:

292

Cosine similarity score between -1 and 1

293

"""

294

```

295

296

## Usage Examples

297

298

### Basic Collection Operations

299

300

```python

301

import llm

302

303

# Get embedding model and create collection

304

model = llm.get_embedding_model("text-embedding-ada-002")

305

collection = llm.Collection("documents", model)

306

307

# Add single document

308

collection.embed("doc1", "Paris is the capital of France")

309

310

# Add with metadata

311

collection.embed(

312

"doc2",

313

"London is the capital of England",

314

metadata={"country": "UK", "continent": "Europe"}

315

)

316

317

# Search for similar documents

318

results = collection.similar("French capital city", number=5)

319

for entry in results:

320

print(f"{entry.id}: {entry.content} (score: {entry.score:.3f})")

321

if entry.metadata:

322

print(f" Metadata: {entry.metadata}")

323

```

324

325

### Batch Operations

326

327

```python

328

import llm

329

330

model = llm.get_embedding_model("text-embedding-ada-002")

331

collection = llm.Collection("knowledge_base", model)

332

333

# Prepare batch data

334

documents = [

335

("physics_1", "Einstein's theory of relativity revolutionized physics"),

336

("physics_2", "Quantum mechanics describes the behavior of matter and energy"),

337

("history_1", "The Renaissance was a period of cultural rebirth in Europe"),

338

("history_2", "The Industrial Revolution transformed manufacturing"),

339

]

340

341

# Batch embed for efficiency

342

collection.embed_multi(documents, store=True)

343

344

# Batch with metadata

345

documents_with_metadata = [

346

("math_1", "Calculus is fundamental to mathematics", {"subject": "mathematics"}),

347

("math_2", "Linear algebra studies vector spaces", {"subject": "mathematics"}),

348

("art_1", "The Mona Lisa is a famous painting", {"subject": "art"}),

349

]

350

351

collection.embed_multi_with_metadata(documents_with_metadata, store=True)

352

353

print(f"Collection now has {collection.count()} documents")

354

```

355

356

### Similarity Search

357

358

```python

359

import llm

360

361

model = llm.get_embedding_model("text-embedding-ada-002")

362

collection = llm.Collection("research_papers", model)

363

364

# Add research papers

365

papers = [

366

("paper1", "Deep learning applications in computer vision"),

367

("paper2", "Natural language processing with transformers"),

368

("paper3", "Reinforcement learning for robotics"),

369

("paper4", "Computer vision techniques for medical imaging"),

370

("paper5", "Machine learning for climate prediction"),

371

]

372

373

collection.embed_multi(papers, store=True)

374

375

# Find papers similar to a query

376

query = "artificial intelligence in healthcare"

377

similar_papers = collection.similar(query, number=3)

378

379

print(f"Papers most similar to '{query}':")

380

for paper in similar_papers:

381

print(f"- {paper.id}: {paper.content} (similarity: {paper.score:.3f})")

382

383

# Find papers similar to an existing paper

384

similar_to_paper = collection.similar_by_id("paper1", number=2)

385

print(f"\nPapers similar to paper1:")

386

for paper in similar_to_paper:

387

print(f"- {paper.id}: {paper.content} (similarity: {paper.score:.3f})")

388

```

389

390

### Working with Vector Embeddings Directly

391

392

```python

393

import llm

394

395

model = llm.get_embedding_model("text-embedding-ada-002")

396

397

# Generate embeddings directly

398

texts = ["Hello world", "Python programming", "Machine learning"]

399

embeddings = model.embed(texts)

400

401

print(f"Generated {len(embeddings)} embeddings")

402

print(f"Each embedding has {len(embeddings[0])} dimensions")

403

404

# Calculate similarity between embeddings

405

similarity = llm.cosine_similarity(embeddings[0], embeddings[1])

406

print(f"Similarity between '{texts[0]}' and '{texts[1]}': {similarity:.3f}")

407

408

# Encode/decode for storage

409

encoded = llm.encode(embeddings[0])

410

decoded = llm.decode(encoded)

411

412

print(f"Original vector length: {len(embeddings[0])}")

413

print(f"Encoded bytes length: {len(encoded)}")

414

print(f"Decoded vector length: {len(decoded)}")

415

print(f"Vectors match: {embeddings[0] == decoded}")

416

```

417

418

### Collection with Filtering

419

420

```python

421

import llm

422

423

model = llm.get_embedding_model("text-embedding-ada-002")

424

collection = llm.Collection("products", model)

425

426

# Add products with metadata

427

products = [

428

("prod_1", "iPhone 15 Pro smartphone", {"category": "electronics", "price": 999}),

429

("prod_2", "MacBook Air laptop computer", {"category": "electronics", "price": 1299}),

430

("prod_3", "Nike Air Jordan sneakers", {"category": "clothing", "price": 180}),

431

("prod_4", "Samsung Galaxy tablet", {"category": "electronics", "price": 499}),

432

]

433

434

for prod_id, description, metadata in products:

435

collection.embed(prod_id, description, metadata=metadata, store=True)

436

437

# Search with prefix filtering (e.g., only electronics)

438

electronics = collection.similar(

439

"portable computer device",

440

number=10,

441

prefix="prod_" # Could filter by category prefix if IDs were structured

442

)

443

444

print("Similar electronic products:")

445

for product in electronics:

446

if product.metadata and product.metadata.get("category") == "electronics":

447

print(f"- {product.content} (${product.metadata['price']})")

448

```

449

450

### Async Embedding Operations

451

452

```python

453

import asyncio

454

import llm

455

456

async def async_embedding_example():

457

# Note: Actual async embedding models would be needed for true async operations

458

model = llm.get_embedding_model("text-embedding-ada-002")

459

collection = llm.Collection("async_docs", model)

460

461

# In a real async scenario, you'd batch these operations

462

documents = [

463

"Async programming in Python",

464

"Concurrency vs parallelism",

465

"Event-driven architecture",

466

]

467

468

# Embed in batch for efficiency

469

batch_data = [(f"doc_{i}", doc) for i, doc in enumerate(documents)]

470

collection.embed_multi(batch_data, store=True)

471

472

# Search

473

results = collection.similar("Python concurrency", number=2)

474

for result in results:

475

print(f"{result.id}: {result.content} ({result.score:.3f})")

476

477

# Run async example

478

asyncio.run(async_embedding_example())

479

```

480

481

### Collection Management

482

483

```python

484

import llm

485

from sqlite_utils import Database

486

487

# Check if collection exists

488

db = Database("embeddings.db")

489

if llm.Collection.exists(db, "my_collection"):

490

print("Collection exists")

491

collection = llm.Collection("my_collection", db=db)

492

print(f"Collection has {collection.count()} items")

493

else:

494

print("Creating new collection")

495

model = llm.get_embedding_model("text-embedding-ada-002")

496

collection = llm.Collection("my_collection", model, db=db)

497

498

# Add some data

499

collection.embed("item1", "Sample text for embedding")

500

501

# Clean up - delete collection when done

502

# collection.delete()

503

```

504

505

This comprehensive embeddings system enables efficient semantic search, document similarity, and vector operations while providing a simple interface for complex vector database operations. The SQLite backend ensures data persistence and efficient similarity computations.