VoyageAI embeddings integration for LangChain providing cutting-edge embedding models through the Voyage AI API
npx @tessl/cli install tessl/pypi-langchain-community@0.0.0

VoyageAI embeddings integration for LangChain providing access to cutting-edge embedding models through the Voyage AI API. This integration implements the LangChain Embeddings interface to enable seamless text embedding generation for semantic search, document similarity, and vector-based retrieval systems.
pip install langchain-community

from langchain_community.embeddings import VoyageEmbeddings

Alternative imports:
from langchain_community.embeddings.voyageai import VoyageEmbeddings

From the main langchain package (re-exports from community):
from langchain.embeddings import VoyageEmbeddings
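If code has to run across LangChain versions with either layout, a defensive import is a common pattern. This is an illustrative sketch only; it uses nothing beyond the two import paths shown above.

# Illustrative pattern: prefer the dedicated community package and fall
# back to the legacy langchain re-export on older installations.
try:
    from langchain_community.embeddings import VoyageEmbeddings
except ImportError:
    from langchain.embeddings import VoyageEmbeddings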
from langchain_community.embeddings import VoyageEmbeddings

# Initialize with API key from environment (VOYAGE_API_KEY)
embeddings = VoyageEmbeddings()
# Or provide API key explicitly
embeddings = VoyageEmbeddings(voyage_api_key="your-api-key-here")
# Embed a single query
query = "What is machine learning?"
query_embedding = embeddings.embed_query(query)
print(f"Query embedding dimension: {len(query_embedding)}")
# Embed multiple documents
documents = [
"Machine learning is a subset of artificial intelligence.",
"Deep learning uses neural networks with multiple layers.",
"Natural language processing deals with text and speech."
]
doc_embeddings = embeddings.embed_documents(documents)
print(f"Document embeddings: {len(doc_embeddings)} vectors")Main class for generating embeddings using Voyage AI models. Supports batch processing, retry logic, and configurable parameters for optimal performance.
class VoyageEmbeddings(BaseModel, Embeddings):
"""
Voyage embedding models integration for LangChain.
Inherits from:
BaseModel: Pydantic model for configuration and validation
Embeddings: LangChain embeddings interface
Attributes:
model (str): Voyage AI model name (default: "voyage-01")
voyage_api_base (str): API endpoint URL (default: "https://api.voyageai.com/v1/embeddings")
voyage_api_key (Optional[SecretStr]): API key (loaded from VOYAGE_API_KEY env var if not provided)
batch_size (int): Maximum texts per API request (default: 8)
max_retries (int): Maximum retry attempts (default: 6)
request_timeout (Optional[Union[float, Tuple[float, float]]]): Request timeout in seconds
show_progress_bar (bool): Show progress for large batches (default: False, requires tqdm)
"""Embeds a single text query using the "query" input type, optimized for search and retrieval scenarios.
def embed_query(self, text: str) -> List[float]:
"""
Embed a single query text.
Args:
text (str): The text to embed
Returns:
List[float]: Embedding vector
"""Usage example:
# Embed a search query
query = "python machine learning libraries"
query_vector = embeddings.embed_query(query)

Embeds multiple documents using the "document" input type, optimized for indexing and storage scenarios with automatic batching.
def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""
Embed multiple documents.
Args:
texts (List[str]): List of texts to embed
Returns:
List[List[float]]: List of embedding vectors
"""Usage example:
# Embed documents for indexing
documents = [
"Python is a versatile programming language",
"Machine learning requires large datasets",
"Neural networks process information in layers"
]
doc_vectors = embeddings.embed_documents(documents)

Embeds texts with a configurable input type for flexible use cases beyond the query/document distinction.
def embed_general_texts(
self,
texts: List[str],
*,
input_type: Optional[str] = None
) -> List[List[float]]:
"""
Embed texts with configurable input type.
Args:
texts (List[str]): List of texts to embed
input_type (str, optional): "query", "document", or None for unspecified
Returns:
List[List[float]]: List of embedding vectors
Raises:
ValueError: If input_type is not None, "query", or "document"
"""Usage example:
# Embed with explicit input type
texts = ["text classification", "sentiment analysis"]
vectors = embeddings.embed_general_texts(texts, input_type="query")
# Embed without specifying type
vectors = embeddings.embed_general_texts(texts)

Utility function providing exponential backoff retry logic for robust API interaction.
def embed_with_retry(embeddings: VoyageEmbeddings, **kwargs: Any) -> Any:
"""
Execute embedding with retry logic using exponential backoff.
Args:
embeddings (VoyageEmbeddings): Embeddings instance (used for max_retries config)
**kwargs: Additional arguments passed to requests.post()
Returns:
dict: API response data containing "data" field with embeddings
Raises:
RuntimeError: If API response lacks "data" field
"""# Custom API endpoint and model
# Custom API endpoint and model
embeddings = VoyageEmbeddings(
model="voyage-01",
voyage_api_base="https://api.voyageai.com/v1/embeddings",
voyage_api_key="your-key"
)

# Configure batching and retries
embeddings = VoyageEmbeddings(
batch_size=16, # Larger batches for better throughput
max_retries=10, # More retries for unstable connections
request_timeout=30.0, # 30-second timeout
show_progress_bar=True # Show progress for large datasets
)
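Because the request is ultimately sent with requests.post(), the request_timeout value presumably follows the requests convention, where a tuple means separate connect and read timeouts (this reading is an assumption based on the Tuple[float, float] type hint):

# Assumes the requests-style (connect, read) timeout convention implied
# by the type hint; adjust if your version behaves differently.
embeddings = VoyageEmbeddings(
    request_timeout=(5.0, 60.0),  # 5 s to establish the connection, 60 s to read
    max_retries=6,
)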
For large embedding tasks, enable progress tracking:

# Requires: pip install tqdm
embeddings = VoyageEmbeddings(show_progress_bar=True)
# Process large document set with progress bar
large_documents = ["doc " + str(i) for i in range(1000)]
embeddings_result = embeddings.embed_documents(large_documents)

Supported input type values for embed_general_texts:
"query": Optimized for search queries and questions"document": Optimized for documents and content to be indexedNone: Unspecified input type (default behavior)All embedding methods return vectors with consistent dimensionality:
List[float] (dimensions depend on model used)List[List[float]] where each inner list contains the same number of dimensionstry:
    embeddings = VoyageEmbeddings(show_progress_bar=True)
    result = embeddings.embed_general_texts(["test"], input_type="invalid")
except ImportError as e:
    # tqdm not installed but show_progress_bar=True
    print("Install tqdm: pip install tqdm")
except RuntimeError as e:
    # API error or malformed response (missing "data" field)
    print(f"API error: {e}")
except ValueError as e:
    # Invalid input_type parameter (must be None, "query", or "document")
    print(f"Invalid parameter: {e}")

import os
# Set API key via environment variable (recommended)
os.environ["VOYAGE_API_KEY"] = "your-api-key"
embeddings = VoyageEmbeddings()
# Or pass directly (less secure)
embeddings = VoyageEmbeddings(voyage_api_key="your-api-key")

from langchain_community.embeddings import VoyageEmbeddings
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Initialize embeddings
embeddings = VoyageEmbeddings()
# Create document embeddings
documents = [
"Python is a programming language",
"Machine learning uses algorithms",
"Data science involves statistics"
]
doc_embeddings = embeddings.embed_documents(documents)
# Search with a query
query = "programming languages"
query_embedding = embeddings.embed_query(query)
# Calculate similarities
similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
best_match_idx = np.argmax(similarities)
print(f"Best match: {documents[best_match_idx]}")
print(f"Similarity: {similarities[best_match_idx]:.3f}")from langchain.retrievers import KNNRetriever
from langchain.retrievers import KNNRetriever
from langchain_community.embeddings import VoyageEmbeddings
# Create retriever with VoyageAI embeddings
embeddings = VoyageEmbeddings()
documents = ["doc1", "doc2", "doc3"]
retriever = KNNRetriever.from_texts(documents, embeddings)
results = retriever.get_relevant_documents("search query")
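Because VoyageEmbeddings implements the standard Embeddings interface, it also plugs into LangChain vector stores. A minimal sketch using the FAISS wrapper, assuming the faiss-cpu package is installed; the example texts are placeholders:

# Requires: pip install faiss-cpu
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import VoyageEmbeddings

embeddings = VoyageEmbeddings()
texts = [
    "Voyage AI provides embedding models",
    "FAISS performs fast vector similarity search",
    "LangChain connects embeddings and vector stores",
]
# Build an in-memory index and run a similarity search
vectorstore = FAISS.from_texts(texts, embeddings)
for doc in vectorstore.similarity_search("vector search libraries", k=2):
    print(doc.page_content)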