or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

agent-framework.md core-framework.md document-processing.md document-stores.md evaluation.md index.md prompt-building.md retrieval.md text-embeddings.md text-generation.md

docs/text-embeddings.md

0

# Text Embeddings

1

2

Convert text and documents into vector embeddings for semantic search, retrieval, and similarity comparison. Supports multiple embedding providers, including OpenAI, Azure OpenAI, HuggingFace, and Sentence Transformers.

3

4

## Capabilities

5

6

### OpenAI Embeddings

7

8

Generate embeddings using OpenAI's text embedding models for high-quality semantic representations.

9

10

```python { .api }

11

class OpenAITextEmbedder:

12

def __init__(

13

self,

14

api_key: Secret = None,

15

model: str = "text-embedding-ada-002",

16

dimensions: Optional[int] = None,

17

api_base_url: Optional[str] = None,

18

organization: Optional[str] = None,

19

prefix: str = "",

20

suffix: str = ""

21

) -> None:

22

"""

23

Initialize OpenAI text embedder.

24

25

Args:

26

api_key: OpenAI API key

27

model: OpenAI embedding model name

28

dimensions: Number of dimensions of the output embedding (only supported by text-embedding-3 and later models)

29

api_base_url: Custom API base URL

30

organization: OpenAI organization ID

31

prefix: Text prefix to add before embedding

32

suffix: Text suffix appended to the input text before it is embedded

33

"""

34

35

def run(self, text: str) -> Dict[str, List[float]]:

36

"""

37

Generate embedding for input text.

38

39

Args:

40

text: Input text to embed

41

42

Returns:

43

Dictionary with 'embedding' key containing the vector embedding

44

"""

45

46

class OpenAIDocumentEmbedder:

47

def __init__(

48

self,

49

api_key: Secret = None,

50

model: str = "text-embedding-ada-002",

51

dimensions: Optional[int] = None,

52

api_base_url: Optional[str] = None,

53

organization: Optional[str] = None,

54

prefix: str = "",

55

suffix: str = "",

56

batch_size: int = 32,

57

progress_bar: bool = True,

58

meta_fields_to_embed: Optional[List[str]] = None,

59

embedding_separator: str = "\n"

60

) -> None:

61

"""

62

Initialize OpenAI document embedder.

63

64

Args:

65

api_key: OpenAI API key

66

model: OpenAI embedding model name

67

dimensions: Number of dimensions of the output embedding (only supported by text-embedding-3 and later models)

68

api_base_url: Custom API base URL

69

organization: OpenAI organization ID

70

prefix: Text prefix to add before embedding

71

suffix: Text suffix appended to each document's text before it is embedded

72

batch_size: Number of documents to embed in each batch

73

progress_bar: Show progress bar during embedding

74

meta_fields_to_embed: Document metadata fields to include in embedding

75

embedding_separator: Separator for joining text and metadata

76

"""

77

78

def run(self, documents: List[Document]) -> Dict[str, List[Document]]:

79

"""

80

Generate embeddings for a list of documents.

81

82

Args:

83

documents: List of Document objects to embed

84

85

Returns:

86

Dictionary with 'documents' key containing documents with embeddings

87

"""

88

89

class AzureOpenAITextEmbedder:

90

def __init__(

91

self,

92

azure_endpoint: str,

93

api_version: str,

94

api_key: Secret = None,

95

azure_ad_token: Secret = None,

96

model: str = "text-embedding-ada-002",

97

dimensions: Optional[int] = None,

98

prefix: str = "",

99

suffix: str = ""

100

) -> None:

101

"""

102

Initialize Azure OpenAI text embedder.

103

104

Args:

105

azure_endpoint: Azure OpenAI endpoint URL

106

api_version: Azure OpenAI API version

107

api_key: Azure OpenAI API key

108

azure_ad_token: Azure AD token for authentication

109

model: Deployment name of the embedding model

110

dimensions: Number of dimensions for embedding

111

prefix: Text prefix to add before embedding

112

suffix: Text suffix appended to the input text before it is embedded

113

"""

114

115

def run(self, text: str) -> Dict[str, List[float]]:

116

"""Generate embedding using Azure OpenAI."""

117

118

class AzureOpenAIDocumentEmbedder:

119

def __init__(

120

self,

121

azure_endpoint: str,

122

api_version: str,

123

api_key: Secret = None,

124

azure_ad_token: Secret = None,

125

model: str = "text-embedding-ada-002",

126

dimensions: Optional[int] = None,

127

prefix: str = "",

128

suffix: str = "",

129

batch_size: int = 32,

130

progress_bar: bool = True,

131

meta_fields_to_embed: Optional[List[str]] = None,

132

embedding_separator: str = "\n"

133

) -> None:

134

"""Initialize Azure OpenAI document embedder."""

135

136

def run(self, documents: List[Document]) -> Dict[str, List[Document]]:

137

"""Generate embeddings for documents using Azure OpenAI."""

138

```

139

140

### Sentence Transformers Embeddings

141

142

Generate embeddings locally using Sentence Transformers models; inference runs on your own hardware, so no external API is required.

143

144

```python { .api }

145

class SentenceTransformersTextEmbedder:

146

def __init__(

147

self,

148

model: str = "sentence-transformers/all-MiniLM-L6-v2",

149

device: Optional[ComponentDevice] = None,

150

token: Secret = None,

151

prefix: str = "",

152

suffix: str = "",

153

normalize_embeddings: bool = True,

154

batch_size: int = 32,

155

progress_bar: bool = True,

156

model_kwargs: Optional[Dict[str, Any]] = None,

157

tokenizer_kwargs: Optional[Dict[str, Any]] = None,

158

config_kwargs: Optional[Dict[str, Any]] = None

159

) -> None:

160

"""

161

Initialize Sentence Transformers text embedder.

162

163

Args:

164

model: Sentence Transformers model name or path

165

device: Device for model inference

166

token: HuggingFace token for private models

167

prefix: Text prefix to add before embedding

168

suffix: Text suffix appended to the input text before it is embedded

169

normalize_embeddings: Whether to normalize embeddings to unit length

170

batch_size: Batch size for inference

171

progress_bar: Show progress bar during embedding

172

model_kwargs: Additional model initialization arguments

173

tokenizer_kwargs: Additional tokenizer arguments

174

config_kwargs: Additional configuration arguments

175

"""

176

177

def run(self, text: str) -> Dict[str, List[float]]:

178

"""

179

Generate embedding for input text using Sentence Transformers.

180

181

Args:

182

text: Input text to embed

183

184

Returns:

185

Dictionary with 'embedding' key containing the vector embedding

186

"""

187

188

class SentenceTransformersDocumentEmbedder:

189

def __init__(

190

self,

191

model: str = "sentence-transformers/all-MiniLM-L6-v2",

192

device: Optional[ComponentDevice] = None,

193

token: Secret = None,

194

prefix: str = "",

195

suffix: str = "",

196

normalize_embeddings: bool = True,

197

batch_size: int = 32,

198

progress_bar: bool = True,

199

model_kwargs: Optional[Dict[str, Any]] = None,

200

tokenizer_kwargs: Optional[Dict[str, Any]] = None,

201

config_kwargs: Optional[Dict[str, Any]] = None,

202

meta_fields_to_embed: Optional[List[str]] = None,

203

embedding_separator: str = "\n"

204

) -> None:

205

"""

206

Initialize Sentence Transformers document embedder.

207

208

Args:

209

model: Sentence Transformers model name or path

210

device: Device for model inference

211

token: HuggingFace token for private models

212

prefix: Text prefix to add before embedding

213

suffix: Text suffix appended to each document's text before it is embedded

214

normalize_embeddings: Whether to normalize embeddings

215

batch_size: Batch size for inference

216

progress_bar: Show progress bar during embedding

217

model_kwargs: Additional model initialization arguments

218

tokenizer_kwargs: Additional tokenizer arguments

219

config_kwargs: Additional configuration arguments

220

meta_fields_to_embed: Document metadata fields to include in embedding

221

embedding_separator: Separator for joining text and metadata

222

"""

223

224

def run(self, documents: List[Document]) -> Dict[str, List[Document]]:

225

"""Generate embeddings for documents using Sentence Transformers."""

226

```

227

228

### HuggingFace Embeddings

229

230

Generate embeddings with transformer models hosted on HuggingFace, accessed remotely through the Serverless Inference API or Inference Endpoints.

231

232

```python { .api }

233

class HuggingFaceAPITextEmbedder:

234

def __init__(

235

self,

236

api_type: Literal["serverless_inference_api", "inference_endpoints"] = "serverless_inference_api",

237

api_url: Optional[str] = None,

238

token: Secret = None,

239

model: Optional[str] = None,

240

prefix: str = "",

241

suffix: str = "",

242

truncate: bool = True,

243

normalize: bool = False

244

) -> None:

245

"""

246

Initialize HuggingFace API text embedder.

247

248

Args:

249

api_type: Type of HuggingFace API to use

250

api_url: Custom API endpoint URL

251

token: HuggingFace API token

252

model: Model name for serverless inference

253

prefix: Text prefix to add before embedding

254

suffix: Text suffix appended to the input text before it is embedded

255

truncate: Whether to truncate input text

256

normalize: Whether to normalize embeddings

257

"""

258

259

def run(self, text: str) -> Dict[str, List[float]]:

260

"""

261

Generate embedding using HuggingFace API.

262

263

Args:

264

text: Input text to embed

265

266

Returns:

267

Dictionary with 'embedding' key containing the vector embedding

268

"""

269

270

class HuggingFaceAPIDocumentEmbedder:

271

def __init__(

272

self,

273

api_type: Literal["serverless_inference_api", "inference_endpoints"] = "serverless_inference_api",

274

api_url: Optional[str] = None,

275

token: Secret = None,

276

model: Optional[str] = None,

277

prefix: str = "",

278

suffix: str = "",

279

truncate: bool = True,

280

normalize: bool = False,

281

batch_size: int = 32,

282

progress_bar: bool = True,

283

meta_fields_to_embed: Optional[List[str]] = None,

284

embedding_separator: str = "\n"

285

) -> None:

286

"""

287

Initialize HuggingFace API document embedder.

288

289

Args:

290

api_type: Type of HuggingFace API to use

291

api_url: Custom API endpoint URL

292

token: HuggingFace API token

293

model: Model name for serverless inference

294

prefix: Text prefix to add before embedding

295

suffix: Text suffix appended to each document's text before it is embedded

296

truncate: Whether to truncate input text

297

normalize: Whether to normalize embeddings

298

batch_size: Batch size for processing

299

progress_bar: Show progress bar during embedding

300

meta_fields_to_embed: Document metadata fields to include

301

embedding_separator: Separator for joining text and metadata

302

"""

303

304

def run(self, documents: List[Document]) -> Dict[str, List[Document]]:

305

"""Generate embeddings for documents using HuggingFace API."""

306

```

307

308

### Image Embeddings

309

310

Generate embeddings for images and image content within documents.

311

312

```python { .api }

313

class SentenceTransformersDocumentImageEmbedder:

314

def __init__(

315

self,

316

model: str = "sentence-transformers/clip-ViT-B-32",

317

device: Optional[ComponentDevice] = None,

318

token: Secret = None,

319

prefix: str = "",

320

suffix: str = "",

321

normalize_embeddings: bool = True,

322

batch_size: int = 32,

323

progress_bar: bool = True,

324

model_kwargs: Optional[Dict[str, Any]] = None

325

) -> None:

326

"""

327

Initialize Sentence Transformers document image embedder.

328

329

Args:

330

model: Sentence Transformers CLIP model name

331

device: Device for model inference

332

token: HuggingFace token for private models

333

prefix: Text prefix for image descriptions

334

suffix: Text suffix for image descriptions

335

normalize_embeddings: Whether to normalize embeddings

336

batch_size: Batch size for inference

337

progress_bar: Show progress bar during embedding

338

model_kwargs: Additional model arguments

339

"""

340

341

def run(self, documents: List[Document]) -> Dict[str, List[Document]]:

342

"""

343

Generate embeddings for images in documents.

344

345

Args:

346

documents: List of documents containing ImageContent

347

348

Returns:

349

Dictionary with 'documents' key containing documents with image embeddings

350

"""

351

```

352

353

## Usage Examples

354

355

### Basic Text Embedding

356

357

```python

358

from haystack.components.embedders import OpenAITextEmbedder

359

from haystack.utils import Secret

360

361

# Initialize embedder

362

embedder = OpenAITextEmbedder(

363

api_key=Secret.from_env_var("OPENAI_API_KEY"),

364

model="text-embedding-ada-002"

365

)

366

367

# Generate embedding

368

result = embedder.run(text="Haystack is a framework for building LLM applications.")

369

embedding = result["embedding"]

370

371

print(f"Embedding dimension: {len(embedding)}")

372

print(f"First 5 values: {embedding[:5]}")

373

```

374

375

### Document Embedding with Metadata

376

377

```python

378

from haystack.components.embedders import SentenceTransformersDocumentEmbedder

379

from haystack import Document

380

381

# Initialize embedder with metadata fields

382

embedder = SentenceTransformersDocumentEmbedder(

383

model="sentence-transformers/all-MiniLM-L6-v2",

384

meta_fields_to_embed=["title", "category"],

385

embedding_separator=" | "

386

)

387

388

# Create documents with metadata

389

documents = [

390

Document(

391

content="Python is a programming language.",

392

meta={"title": "Python Overview", "category": "programming"}

393

),

394

Document(

395

content="Machine learning uses algorithms to find patterns.",

396

meta={"title": "ML Basics", "category": "artificial intelligence"}

397

)

398

]

399

400

# Embed documents

401

result = embedder.run(documents=documents)

402

embedded_docs = result["documents"]

403

404

for doc in embedded_docs:

405

print(f"Document: {doc.content[:30]}...")

406

print(f"Embedding shape: {len(doc.embedding)}")

407

print(f"Metadata: {doc.meta}")

408

print()

409

```

410

411

### Batch Processing with Progress

412

413

```python

414

from haystack.components.embedders import OpenAIDocumentEmbedder

415

from haystack import Document

416

from haystack.utils import Secret

417

418

# Create many documents

419

documents = [

420

Document(content=f"This is document number {i}")

421

for i in range(100)

422

]

423

424

# Initialize with batch processing

425

embedder = OpenAIDocumentEmbedder(

426

api_key=Secret.from_env_var("OPENAI_API_KEY"),

427

batch_size=16,

428

progress_bar=True

429

)

430

431

# Embed all documents with progress tracking

432

result = embedder.run(documents=documents)

433

embedded_docs = result["documents"]

434

435

print(f"Embedded {len(embedded_docs)} documents")

436

```

437

438

### Local vs API Embeddings

439

440

```python

441

from haystack.components.embedders import (

442

SentenceTransformersTextEmbedder,

443

HuggingFaceAPITextEmbedder

444

)

445

from haystack.utils import Secret

446

447

# Local embedding (no API required)

448

local_embedder = SentenceTransformersTextEmbedder(

449

model="sentence-transformers/all-MiniLM-L6-v2"

450

)

451

452

# API-based embedding

453

api_embedder = HuggingFaceAPITextEmbedder(

454

token=Secret.from_env_var("HUGGINGFACE_API_TOKEN"),

455

model="sentence-transformers/all-MiniLM-L6-v2"

456

)

457

458

text = "Compare local vs API embeddings"

459

460

# Generate embeddings

461

local_result = local_embedder.run(text=text)

462

api_result = api_embedder.run(text=text)

463

464

print(f"Local embedding dimension: {len(local_result['embedding'])}")

465

print(f"API embedding dimension: {len(api_result['embedding'])}")

466

```

467

468

## Types

469

470

```python { .api }

471

from typing import Optional, List, Dict, Any, Literal

472

from haystack import Document

473

from haystack.utils import Secret, ComponentDevice

474

from haystack.dataclasses import SparseEmbedding

475

476

# Embedding dimension varies by model:

477

# - OpenAI text-embedding-ada-002: 1536 dimensions

478

# - Sentence Transformers all-MiniLM-L6-v2: 384 dimensions

479

# - Sentence Transformers all-mpnet-base-v2: 768 dimensions

480

```