0
# Text Embeddings
1
2
Convert text and documents into vector embeddings for semantic search, retrieval, and similarity comparison. Supports multiple embedding providers, including OpenAI, Azure OpenAI, HuggingFace, and Sentence Transformers.
3
4
## Capabilities
5
6
### OpenAI Embeddings
7
8
Generate embeddings using OpenAI's text embedding models, either through the OpenAI API directly or through an Azure OpenAI deployment.
9
10
```python { .api }
11
class OpenAITextEmbedder:
12
def __init__(
13
self,
14
api_key: Secret = None,
15
model: str = "text-embedding-ada-002",
16
dimensions: Optional[int] = None,
17
api_base_url: Optional[str] = None,
18
organization: Optional[str] = None,
19
prefix: str = "",
20
suffix: str = ""
21
) -> None:
22
"""
23
Initialize OpenAI text embedder.
24
25
Args:
26
api_key: OpenAI API key
27
model: OpenAI embedding model name
28
dimensions: Number of dimensions for embedding (model dependent)
29
api_base_url: Custom API base URL
30
organization: OpenAI organization ID
31
prefix: String prepended to the text before it is embedded
31
suffix: String appended to the text before it is embedded
33
"""
34
35
def run(self, text: str) -> Dict[str, List[float]]:
36
"""
37
Generate embedding for input text.
38
39
Args:
40
text: Input text to embed
41
42
Returns:
43
Dictionary with 'embedding' key containing the vector embedding
44
"""
45
46
class OpenAIDocumentEmbedder:
47
def __init__(
48
self,
49
api_key: Secret = None,
50
model: str = "text-embedding-ada-002",
51
dimensions: Optional[int] = None,
52
api_base_url: Optional[str] = None,
53
organization: Optional[str] = None,
54
prefix: str = "",
55
suffix: str = "",
56
batch_size: int = 32,
57
progress_bar: bool = True,
58
meta_fields_to_embed: Optional[List[str]] = None,
59
embedding_separator: str = "\n"
60
) -> None:
61
"""
62
Initialize OpenAI document embedder.
63
64
Args:
65
api_key: OpenAI API key
66
model: OpenAI embedding model name
67
dimensions: Number of dimensions for embedding
68
api_base_url: Custom API base URL
69
organization: OpenAI organization ID
70
prefix: String prepended to each document's text before it is embedded
70
suffix: String appended to each document's text before it is embedded
72
batch_size: Number of documents to embed in each batch
73
progress_bar: Show progress bar during embedding
74
meta_fields_to_embed: Document metadata fields to include in embedding
75
embedding_separator: Separator for joining text and metadata
76
"""
77
78
def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
79
"""
80
Generate embeddings for a list of documents.
81
82
Args:
83
documents: List of Document objects to embed
84
85
Returns:
86
Dictionary with 'documents' key containing documents with embeddings
87
"""
88
89
class AzureOpenAITextEmbedder:
90
def __init__(
91
self,
92
azure_endpoint: str,
93
api_version: str,
94
api_key: Secret = None,
95
azure_ad_token: Secret = None,
96
model: str = "text-embedding-ada-002",
97
dimensions: Optional[int] = None,
98
prefix: str = "",
99
suffix: str = ""
100
) -> None:
101
"""
102
Initialize Azure OpenAI text embedder.
103
104
Args:
105
azure_endpoint: Azure OpenAI endpoint URL
106
api_version: Azure OpenAI API version
107
api_key: Azure OpenAI API key
108
azure_ad_token: Azure AD token for authentication
109
model: Deployment name of the embedding model
110
dimensions: Number of dimensions for embedding
111
prefix: String prepended to the text before it is embedded
111
suffix: String appended to the text before it is embedded
113
"""
114
115
def run(self, text: str) -> Dict[str, List[float]]:
116
"""Generate embedding using Azure OpenAI."""
117
118
class AzureOpenAIDocumentEmbedder:
119
def __init__(
120
self,
121
azure_endpoint: str,
122
api_version: str,
123
api_key: Secret = None,
124
azure_ad_token: Secret = None,
125
model: str = "text-embedding-ada-002",
126
dimensions: Optional[int] = None,
127
prefix: str = "",
128
suffix: str = "",
129
batch_size: int = 32,
130
progress_bar: bool = True,
131
meta_fields_to_embed: Optional[List[str]] = None,
132
embedding_separator: str = "\n"
133
) -> None:
134
"""Initialize Azure OpenAI document embedder."""
135
136
def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
137
"""Generate embeddings for documents using Azure OpenAI."""
138
```
139
140
### Sentence Transformers Embeddings
141
142
Generate embeddings with Sentence Transformers models, which run inference locally rather than calling a remote API.
143
144
```python { .api }
145
class SentenceTransformersTextEmbedder:
146
def __init__(
147
self,
148
model: str = "sentence-transformers/all-MiniLM-L6-v2",
149
device: Optional[ComponentDevice] = None,
150
token: Secret = None,
151
prefix: str = "",
152
suffix: str = "",
153
normalize_embeddings: bool = True,
154
batch_size: int = 32,
155
progress_bar: bool = True,
156
model_kwargs: Optional[Dict[str, Any]] = None,
157
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
158
config_kwargs: Optional[Dict[str, Any]] = None
159
) -> None:
160
"""
161
Initialize Sentence Transformers text embedder.
162
163
Args:
164
model: Sentence Transformers model name or path
165
device: Device for model inference
166
token: HuggingFace token for private models
167
prefix: String prepended to the text before it is embedded
167
suffix: String appended to the text before it is embedded
169
normalize_embeddings: Whether to normalize embeddings to unit length
170
batch_size: Batch size for inference
171
progress_bar: Show progress bar during embedding
172
model_kwargs: Additional model initialization arguments
173
tokenizer_kwargs: Additional tokenizer arguments
174
config_kwargs: Additional configuration arguments
175
"""
176
177
def run(self, text: str) -> Dict[str, List[float]]:
178
"""
179
Generate embedding for input text using Sentence Transformers.
180
181
Args:
182
text: Input text to embed
183
184
Returns:
185
Dictionary with 'embedding' key containing the vector embedding
186
"""
187
188
class SentenceTransformersDocumentEmbedder:
189
def __init__(
190
self,
191
model: str = "sentence-transformers/all-MiniLM-L6-v2",
192
device: Optional[ComponentDevice] = None,
193
token: Secret = None,
194
prefix: str = "",
195
suffix: str = "",
196
normalize_embeddings: bool = True,
197
batch_size: int = 32,
198
progress_bar: bool = True,
199
model_kwargs: Optional[Dict[str, Any]] = None,
200
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
201
config_kwargs: Optional[Dict[str, Any]] = None,
202
meta_fields_to_embed: Optional[List[str]] = None,
203
embedding_separator: str = "\n"
204
) -> None:
205
"""
206
Initialize Sentence Transformers document embedder.
207
208
Args:
209
model: Sentence Transformers model name or path
210
device: Device for model inference
211
token: HuggingFace token for private models
212
prefix: String prepended to each document's text before it is embedded
212
suffix: String appended to each document's text before it is embedded
214
normalize_embeddings: Whether to normalize embeddings
215
batch_size: Batch size for inference
216
progress_bar: Show progress bar during embedding
217
model_kwargs: Additional model initialization arguments
218
tokenizer_kwargs: Additional tokenizer arguments
219
config_kwargs: Additional configuration arguments
220
meta_fields_to_embed: Document metadata fields to include in embedding
221
embedding_separator: Separator for joining text and metadata
222
"""
223
224
def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
225
"""Generate embeddings for documents using Sentence Transformers."""
226
```
227
228
### HuggingFace Embeddings
229
230
Generate embeddings by calling models hosted on HuggingFace, through either the Serverless Inference API or dedicated Inference Endpoints.
231
232
```python { .api }
233
class HuggingFaceAPITextEmbedder:
234
def __init__(
235
self,
236
api_type: Literal["serverless_inference_api", "inference_endpoints"] = "serverless_inference_api",
237
api_url: Optional[str] = None,
238
token: Secret = None,
239
model: Optional[str] = None,
240
prefix: str = "",
241
suffix: str = "",
242
truncate: bool = True,
243
normalize: bool = False
244
) -> None:
245
"""
246
Initialize HuggingFace API text embedder.
247
248
Args:
249
api_type: Type of HuggingFace API to use
250
api_url: Custom API endpoint URL
251
token: HuggingFace API token
252
model: Model name for serverless inference
253
prefix: String prepended to the text before it is embedded
253
suffix: String appended to the text before it is embedded
255
truncate: Whether to truncate input text
256
normalize: Whether to normalize embeddings
257
"""
258
259
def run(self, text: str) -> Dict[str, List[float]]:
260
"""
261
Generate embedding using HuggingFace API.
262
263
Args:
264
text: Input text to embed
265
266
Returns:
267
Dictionary with 'embedding' key containing the vector embedding
268
"""
269
270
class HuggingFaceAPIDocumentEmbedder:
271
def __init__(
272
self,
273
api_type: Literal["serverless_inference_api", "inference_endpoints"] = "serverless_inference_api",
274
api_url: Optional[str] = None,
275
token: Secret = None,
276
model: Optional[str] = None,
277
prefix: str = "",
278
suffix: str = "",
279
truncate: bool = True,
280
normalize: bool = False,
281
batch_size: int = 32,
282
progress_bar: bool = True,
283
meta_fields_to_embed: Optional[List[str]] = None,
284
embedding_separator: str = "\n"
285
) -> None:
286
"""
287
Initialize HuggingFace API document embedder.
288
289
Args:
290
api_type: Type of HuggingFace API to use
291
api_url: Custom API endpoint URL
292
token: HuggingFace API token
293
model: Model name for serverless inference
294
prefix: String prepended to each document's text before it is embedded
294
suffix: String appended to each document's text before it is embedded
296
truncate: Whether to truncate input text
297
normalize: Whether to normalize embeddings
298
batch_size: Batch size for processing
299
progress_bar: Show progress bar during embedding
300
meta_fields_to_embed: Document metadata fields to include
301
embedding_separator: Separator for joining text and metadata
302
"""
303
304
def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
305
"""Generate embeddings for documents using HuggingFace API."""
306
```
307
308
### Image Embeddings
309
310
Generate embeddings for images and image content within documents.
311
312
```python { .api }
313
class SentenceTransformersDocumentImageEmbedder:
314
def __init__(
315
self,
316
model: str = "sentence-transformers/clip-ViT-B-32",
317
device: Optional[ComponentDevice] = None,
318
token: Secret = None,
319
prefix: str = "",
320
suffix: str = "",
321
normalize_embeddings: bool = True,
322
batch_size: int = 32,
323
progress_bar: bool = True,
324
model_kwargs: Optional[Dict[str, Any]] = None
325
) -> None:
326
"""
327
Initialize Sentence Transformers document image embedder.
328
329
Args:
330
model: Sentence Transformers CLIP model name
331
device: Device for model inference
332
token: HuggingFace token for private models
333
prefix: Text prefix for image descriptions
334
suffix: Text suffix for image descriptions
335
normalize_embeddings: Whether to normalize embeddings
336
batch_size: Batch size for inference
337
progress_bar: Show progress bar during embedding
338
model_kwargs: Additional model arguments
339
"""
340
341
def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
342
"""
343
Generate embeddings for images in documents.
344
345
Args:
346
documents: List of documents containing ImageContent
347
348
Returns:
349
Dictionary with 'documents' key containing documents with image embeddings
350
"""
351
```
352
353
## Usage Examples
354
355
### Basic Text Embedding
356
357
```python
358
from haystack.components.embedders import OpenAITextEmbedder
359
from haystack.utils import Secret
360
361
# Initialize embedder
362
embedder = OpenAITextEmbedder(
363
api_key=Secret.from_env_var("OPENAI_API_KEY"),
364
model="text-embedding-ada-002"
365
)
366
367
# Generate embedding
368
result = embedder.run(text="Haystack is a framework for building LLM applications.")
369
embedding = result["embedding"]
370
371
print(f"Embedding dimension: {len(embedding)}")
372
print(f"First 5 values: {embedding[:5]}")
373
```
374
375
### Document Embedding with Metadata
376
377
```python
378
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
379
from haystack import Document
380
381
# Initialize embedder with metadata fields
382
embedder = SentenceTransformersDocumentEmbedder(
383
model="sentence-transformers/all-MiniLM-L6-v2",
384
meta_fields_to_embed=["title", "category"],
385
embedding_separator=" | "
386
)
387
388
# Create documents with metadata
389
documents = [
390
Document(
391
content="Python is a programming language.",
392
meta={"title": "Python Overview", "category": "programming"}
393
),
394
Document(
395
content="Machine learning uses algorithms to find patterns.",
396
meta={"title": "ML Basics", "category": "artificial intelligence"}
397
)
398
]
399
400
# Embed documents
401
result = embedder.run(documents=documents)
402
embedded_docs = result["documents"]
403
404
for doc in embedded_docs:
405
print(f"Document: {doc.content[:30]}...")
406
print(f"Embedding shape: {len(doc.embedding)}")
407
print(f"Metadata: {doc.meta}")
408
print()
409
```
410
411
### Batch Processing with Progress
412
413
```python
414
from haystack.components.embedders import OpenAIDocumentEmbedder
415
from haystack import Document
416
from haystack.utils import Secret
417
418
# Create many documents
419
documents = [
420
Document(content=f"This is document number {i}")
421
for i in range(100)
422
]
423
424
# Initialize with batch processing
425
embedder = OpenAIDocumentEmbedder(
426
api_key=Secret.from_env_var("OPENAI_API_KEY"),
427
batch_size=16,
428
progress_bar=True
429
)
430
431
# Embed all documents with progress tracking
432
result = embedder.run(documents=documents)
433
embedded_docs = result["documents"]
434
435
print(f"Embedded {len(embedded_docs)} documents")
436
```
437
438
### Local vs API Embeddings
439
440
```python
441
from haystack.components.embedders import (
442
SentenceTransformersTextEmbedder,
443
HuggingFaceAPITextEmbedder
444
)
445
from haystack.utils import Secret
446
447
# Local embedding (no API required)
448
local_embedder = SentenceTransformersTextEmbedder(
449
model="sentence-transformers/all-MiniLM-L6-v2"
450
)
451
452
# API-based embedding
453
api_embedder = HuggingFaceAPITextEmbedder(
454
token=Secret.from_env_var("HUGGINGFACE_API_TOKEN"),
455
model="sentence-transformers/all-MiniLM-L6-v2"
456
)
457
458
text = "Compare local vs API embeddings"
459
460
# Generate embeddings
461
local_result = local_embedder.run(text=text)
462
api_result = api_embedder.run(text=text)
463
464
print(f"Local embedding dimension: {len(local_result['embedding'])}")
465
print(f"API embedding dimension: {len(api_result['embedding'])}")
466
```
467
468
## Types
469
470
```python { .api }
471
from typing import Optional, List, Dict, Any, Literal
472
from haystack import Document
473
from haystack.utils import Secret, ComponentDevice
474
from haystack.dataclasses import SparseEmbedding
475
476
# Embedding dimension varies by model:
477
# - OpenAI text-embedding-ada-002: 1536 dimensions
478
# - Sentence Transformers all-MiniLM-L6-v2: 384 dimensions
479
# - Sentence Transformers all-mpnet-base-v2: 768 dimensions
480
```