Haystack is an LLM framework for building customizable, production-ready LLM applications.
Convert text and documents into vector embeddings for semantic search, retrieval, and similarity comparison. Supports multiple embedding providers including OpenAI, HuggingFace, and Sentence Transformers.
Generate embeddings using OpenAI's text embedding models for high-quality semantic representations.
class OpenAITextEmbedder:
    """Embed a single text string with OpenAI's embedding models."""

    def __init__(
        self,
        api_key: Optional[Secret] = None,  # annotation widened: default is None
        model: str = "text-embedding-ada-002",
        dimensions: Optional[int] = None,
        api_base_url: Optional[str] = None,
        organization: Optional[str] = None,
        prefix: str = "",
        suffix: str = "",
    ) -> None:
        """
        Initialize OpenAI text embedder.

        Args:
            api_key: OpenAI API key; None presumably defers to environment
                resolution — confirm against the installed package.
            model: OpenAI embedding model name.
            dimensions: Number of dimensions for embedding (model dependent).
            api_base_url: Custom API base URL.
            organization: OpenAI organization ID.
            prefix: Text prefix to add before embedding.
            suffix: Text suffix to add after embedding.
        """

    def run(self, text: str) -> Dict[str, List[float]]:
        """
        Generate embedding for input text.

        Args:
            text: Input text to embed.

        Returns:
            Dictionary with 'embedding' key containing the vector embedding.
        """
class OpenAIDocumentEmbedder:
    """Embed a list of Documents in batches with OpenAI's embedding models."""

    def __init__(
        self,
        api_key: Optional[Secret] = None,  # annotation widened: default is None
        model: str = "text-embedding-ada-002",
        dimensions: Optional[int] = None,
        api_base_url: Optional[str] = None,
        organization: Optional[str] = None,
        prefix: str = "",
        suffix: str = "",
        batch_size: int = 32,
        progress_bar: bool = True,
        meta_fields_to_embed: Optional[List[str]] = None,
        embedding_separator: str = "\n",
    ) -> None:
        """
        Initialize OpenAI document embedder.

        Args:
            api_key: OpenAI API key.
            model: OpenAI embedding model name.
            dimensions: Number of dimensions for embedding.
            api_base_url: Custom API base URL.
            organization: OpenAI organization ID.
            prefix: Text prefix to add before embedding.
            suffix: Text suffix to add after embedding.
            batch_size: Number of documents to embed in each batch.
            progress_bar: Show progress bar during embedding.
            meta_fields_to_embed: Document metadata fields to include in embedding.
            embedding_separator: Separator for joining text and metadata.
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Generate embeddings for a list of documents.

        Args:
            documents: List of Document objects to embed.

        Returns:
            Dictionary with 'documents' key containing documents with embeddings.
        """
class AzureOpenAITextEmbedder:
    """Embed a single text string with an Azure OpenAI deployment."""

    def __init__(
        self,
        azure_endpoint: str,
        api_version: str,
        api_key: Optional[Secret] = None,  # annotation widened: default is None
        azure_ad_token: Optional[Secret] = None,  # annotation widened: default is None
        model: str = "text-embedding-ada-002",
        dimensions: Optional[int] = None,
        prefix: str = "",
        suffix: str = "",
    ) -> None:
        """
        Initialize Azure OpenAI text embedder.

        Args:
            azure_endpoint: Azure OpenAI endpoint URL.
            api_version: Azure OpenAI API version.
            api_key: Azure OpenAI API key.
            azure_ad_token: Azure AD token for authentication.
            model: Deployment name of the embedding model.
            dimensions: Number of dimensions for embedding.
            prefix: Text prefix to add before embedding.
            suffix: Text suffix to add after embedding.
        """

    def run(self, text: str) -> Dict[str, List[float]]:
        """Generate embedding using Azure OpenAI."""
class AzureOpenAIDocumentEmbedder:
def __init__(
self,
azure_endpoint: str,
api_version: str,
api_key: Secret = None,
azure_ad_token: Secret = None,
model: str = "text-embedding-ada-002",
dimensions: Optional[int] = None,
prefix: str = "",
suffix: str = "",
batch_size: int = 32,
progress_bar: bool = True,
meta_fields_to_embed: Optional[List[str]] = None,
embedding_separator: str = "\n"
) -> None:
"""Initialize Azure OpenAI document embedder."""
def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
"""Generate embeddings for documents using Azure OpenAI."""Generate embeddings using Sentence Transformers models for high-quality semantic representations with local inference.
class SentenceTransformersTextEmbedder:
    """Embed a single text string locally with a Sentence Transformers model."""

    def __init__(
        self,
        model: str = "sentence-transformers/all-MiniLM-L6-v2",
        device: Optional[ComponentDevice] = None,
        token: Optional[Secret] = None,  # annotation widened: default is None
        prefix: str = "",
        suffix: str = "",
        normalize_embeddings: bool = True,
        batch_size: int = 32,
        progress_bar: bool = True,
        model_kwargs: Optional[Dict[str, Any]] = None,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        config_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Initialize Sentence Transformers text embedder.

        Args:
            model: Sentence Transformers model name or path.
            device: Device for model inference.
            token: HuggingFace token for private models.
            prefix: Text prefix to add before embedding.
            suffix: Text suffix to add after embedding.
            normalize_embeddings: Whether to normalize embeddings to unit length.
            batch_size: Batch size for inference.
            progress_bar: Show progress bar during embedding.
            model_kwargs: Additional model initialization arguments.
            tokenizer_kwargs: Additional tokenizer arguments.
            config_kwargs: Additional configuration arguments.
        """

    def run(self, text: str) -> Dict[str, List[float]]:
        """
        Generate embedding for input text using Sentence Transformers.

        Args:
            text: Input text to embed.

        Returns:
            Dictionary with 'embedding' key containing the vector embedding.
        """
class SentenceTransformersDocumentEmbedder:
def __init__(
self,
model: str = "sentence-transformers/all-MiniLM-L6-v2",
device: Optional[ComponentDevice] = None,
token: Secret = None,
prefix: str = "",
suffix: str = "",
normalize_embeddings: bool = True,
batch_size: int = 32,
progress_bar: bool = True,
model_kwargs: Optional[Dict[str, Any]] = None,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
config_kwargs: Optional[Dict[str, Any]] = None,
meta_fields_to_embed: Optional[List[str]] = None,
embedding_separator: str = "\n"
) -> None:
"""
Initialize Sentence Transformers document embedder.
Args:
model: Sentence Transformers model name or path
device: Device for model inference
token: HuggingFace token for private models
prefix: Text prefix to add before embedding
suffix: Text suffix to add after embedding
normalize_embeddings: Whether to normalize embeddings
batch_size: Batch size for inference
progress_bar: Show progress bar during embedding
model_kwargs: Additional model initialization arguments
tokenizer_kwargs: Additional tokenizer arguments
config_kwargs: Additional configuration arguments
meta_fields_to_embed: Document metadata fields to include in embedding
embedding_separator: Separator for joining text and metadata
"""
def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
"""Generate embeddings for documents using Sentence Transformers."""Generate embeddings using HuggingFace models via API for various transformer models.
class HuggingFaceAPITextEmbedder:
    """Embed a single text string via a HuggingFace inference API."""

    def __init__(
        self,
        api_type: Literal["serverless_inference_api", "inference_endpoints"] = "serverless_inference_api",
        api_url: Optional[str] = None,
        token: Optional[Secret] = None,  # annotation widened: default is None
        model: Optional[str] = None,
        prefix: str = "",
        suffix: str = "",
        truncate: bool = True,
        normalize: bool = False,
    ) -> None:
        """
        Initialize HuggingFace API text embedder.

        Args:
            api_type: Type of HuggingFace API to use.
            api_url: Custom API endpoint URL.
            token: HuggingFace API token.
            model: Model name for serverless inference.
            prefix: Text prefix to add before embedding.
            suffix: Text suffix to add after embedding.
            truncate: Whether to truncate input text.
            normalize: Whether to normalize embeddings.
        """

    def run(self, text: str) -> Dict[str, List[float]]:
        """
        Generate embedding using HuggingFace API.

        Args:
            text: Input text to embed.

        Returns:
            Dictionary with 'embedding' key containing the vector embedding.
        """
class HuggingFaceAPIDocumentEmbedder:
def __init__(
self,
api_type: Literal["serverless_inference_api", "inference_endpoints"] = "serverless_inference_api",
api_url: Optional[str] = None,
token: Secret = None,
model: Optional[str] = None,
prefix: str = "",
suffix: str = "",
truncate: bool = True,
normalize: bool = False,
batch_size: int = 32,
progress_bar: bool = True,
meta_fields_to_embed: Optional[List[str]] = None,
embedding_separator: str = "\n"
) -> None:
"""
Initialize HuggingFace API document embedder.
Args:
api_type: Type of HuggingFace API to use
api_url: Custom API endpoint URL
token: HuggingFace API token
model: Model name for serverless inference
prefix: Text prefix to add before embedding
suffix: Text suffix to add after embedding
truncate: Whether to truncate input text
normalize: Whether to normalize embeddings
batch_size: Batch size for processing
progress_bar: Show progress bar during embedding
meta_fields_to_embed: Document metadata fields to include
embedding_separator: Separator for joining text and metadata
"""
def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
"""Generate embeddings for documents using HuggingFace API."""Generate embeddings for images and image content within documents.
class SentenceTransformersDocumentImageEmbedder:
    """Embed image content in Documents with a Sentence Transformers CLIP model."""

    def __init__(
        self,
        model: str = "sentence-transformers/clip-ViT-B-32",
        device: Optional[ComponentDevice] = None,
        token: Optional[Secret] = None,  # annotation widened: default is None
        prefix: str = "",
        suffix: str = "",
        normalize_embeddings: bool = True,
        batch_size: int = 32,
        progress_bar: bool = True,
        model_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Initialize Sentence Transformers document image embedder.

        Args:
            model: Sentence Transformers CLIP model name.
            device: Device for model inference.
            token: HuggingFace token for private models.
            prefix: Text prefix for image descriptions.
            suffix: Text suffix for image descriptions.
            normalize_embeddings: Whether to normalize embeddings.
            batch_size: Batch size for inference.
            progress_bar: Show progress bar during embedding.
            model_kwargs: Additional model arguments.
        """

    def run(self, documents: List[Document]) -> Dict[str, List[Document]]:
        """
        Generate embeddings for images in documents.

        Args:
            documents: List of documents containing ImageContent.

        Returns:
            Dictionary with 'documents' key containing documents with image embeddings.
        """

from haystack.components.embedders import OpenAITextEmbedder
from haystack.utils import Secret

# Initialize embedder (reads the API key from the environment)
embedder = OpenAITextEmbedder(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="text-embedding-ada-002",
)

# Generate embedding for a single string
result = embedder.run(text="Haystack is a framework for building LLM applications.")
embedding = result["embedding"]
print(f"Embedding dimension: {len(embedding)}")
print(f"First 5 values: {embedding[:5]}")

from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack import Document

# Initialize embedder that also embeds selected metadata fields
embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",
    meta_fields_to_embed=["title", "category"],
    embedding_separator=" | ",
)

# Create documents with metadata
documents = [
    Document(
        content="Python is a programming language.",
        meta={"title": "Python Overview", "category": "programming"},
    ),
    Document(
        content="Machine learning uses algorithms to find patterns.",
        meta={"title": "ML Basics", "category": "artificial intelligence"},
    ),
]

# Embed documents
result = embedder.run(documents=documents)
embedded_docs = result["documents"]
for doc in embedded_docs:
    print(f"Document: {doc.content[:30]}...")
    print(f"Embedding shape: {len(doc.embedding)}")
    print(f"Metadata: {doc.meta}")
    print()

from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack import Document
from haystack.utils import Secret
# Create many documents
documents = [
Document(content=f"This is document number {i}")
for i in range(100)
]
# Initialize with batch processing
embedder = OpenAIDocumentEmbedder(
api_key=Secret.from_env_var("OPENAI_API_KEY"),
batch_size=16,
progress_bar=True
)
# Embed all documents with progress tracking
result = embedder.run(documents=documents)
embedded_docs = result["documents"]
print(f"Embedded {len(embedded_docs)} documents")from haystack.components.embedders import (
SentenceTransformersTextEmbedder,
HuggingFaceAPITextEmbedder
)
from haystack.utils import Secret
# Local embedding (no API required)
local_embedder = SentenceTransformersTextEmbedder(
model="sentence-transformers/all-MiniLM-L6-v2"
)
# API-based embedding
api_embedder = HuggingFaceAPITextEmbedder(
token=Secret.from_env_var("HUGGINGFACE_API_TOKEN"),
model="sentence-transformers/all-MiniLM-L6-v2"
)
text = "Compare local vs API embeddings"
# Generate embeddings
local_result = local_embedder.run(text=text)
api_result = api_embedder.run(text=text)
print(f"Local embedding dimension: {len(local_result['embedding'])}")
print(f"API embedding dimension: {len(api_result['embedding'])}")from typing import Optional, List, Dict, Any, Literal
from haystack import Document
from haystack.utils import Secret, ComponentDevice
from haystack.dataclasses import SparseEmbedding
# Embedding dimension varies by model:
# - OpenAI text-embedding-ada-002: 1536 dimensions
# - Sentence Transformers all-MiniLM-L6-v2: 384 dimensions
# - Sentence Transformers all-mpnet-base-v2: 768 dimensionsInstall with Tessl CLI
npx tessl i tessl/pypi-haystack-ai