LLM framework to build customizable, production-ready LLM applications.
—
Search and retrieve relevant documents using various retrieval strategies including vector search, keyword search, filtering, and advanced retrieval techniques. Haystack provides comprehensive retrieval components for building robust information retrieval systems.
Retrieve documents using vector similarity search with embeddings stored in memory.
class InMemoryEmbeddingRetriever:
def __init__(
self,
document_store: InMemoryDocumentStore,
filters: Optional[Dict[str, Any]] = None,
top_k: int = 10,
scale_score: bool = False,
return_embedding: bool = False
) -> None:
"""
Initialize in-memory embedding retriever.
Args:
document_store: Document store containing embedded documents
filters: Filters to apply to documents during retrieval
top_k: Number of documents to retrieve
scale_score: Whether to scale similarity scores to [0,1] range
return_embedding: Whether to return document embeddings in results
"""
def run(
self,
query_embedding: List[float],
filters: Optional[Dict[str, Any]] = None,
top_k: Optional[int] = None,
scale_score: Optional[bool] = None,
return_embedding: Optional[bool] = None
) -> Dict[str, List[Document]]:
"""
Retrieve documents using embedding similarity search.
Args:
query_embedding: Vector embedding of the query
filters: Optional filters to apply during retrieval
top_k: Number of documents to retrieve
scale_score: Whether to scale similarity scores
return_embedding: Whether to return document embeddings
Returns:
Dictionary with 'documents' key containing list of retrieved documents
"""Perform keyword-based retrieval using BM25 scoring algorithm.
class InMemoryBM25Retriever:
def __init__(
self,
document_store: InMemoryDocumentStore,
filters: Optional[Dict[str, Any]] = None,
top_k: int = 10,
scale_score: bool = False
) -> None:
"""
Initialize in-memory BM25 retriever.
Args:
document_store: Document store containing documents
filters: Filters to apply to documents during retrieval
top_k: Number of documents to retrieve
scale_score: Whether to scale BM25 scores to [0,1] range
"""
def run(
self,
query: str,
filters: Optional[Dict[str, Any]] = None,
top_k: Optional[int] = None,
scale_score: Optional[bool] = None
) -> Dict[str, List[Document]]:
"""
Retrieve documents using BM25 keyword search.
Args:
query: Search query text
filters: Optional filters to apply during retrieval
top_k: Number of documents to retrieve
scale_score: Whether to scale BM25 scores
Returns:
Dictionary with 'documents' key containing list of retrieved documents
"""Retrieve documents based on metadata filters without scoring.
class FilterRetriever:
def __init__(
self,
document_store: InMemoryDocumentStore,
filters: Optional[Dict[str, Any]] = None
) -> None:
"""
Initialize filter-based retriever.
Args:
document_store: Document store containing documents
filters: Default filters to apply during retrieval
"""
def run(
self,
filters: Optional[Dict[str, Any]] = None
) -> Dict[str, List[Document]]:
"""
Retrieve documents using metadata filters.
Args:
filters: Filters to apply for document selection
Returns:
Dictionary with 'documents' key containing list of filtered documents
"""Advanced retrieval strategy that automatically merges smaller document chunks with their parent documents based on relevance.
class AutoMergingRetriever:
def __init__(
self,
document_store: InMemoryDocumentStore,
retriever: Union[InMemoryEmbeddingRetriever, InMemoryBM25Retriever],
threshold: float = 0.8,
top_k: int = 10
) -> None:
"""
Initialize auto-merging retriever.
Args:
document_store: Document store containing hierarchical documents
retriever: Base retriever to use for initial search
threshold: Similarity threshold for merging child documents
top_k: Number of documents to retrieve
"""
def run(
self,
query: Optional[str] = None,
query_embedding: Optional[List[float]] = None,
filters: Optional[Dict[str, Any]] = None,
top_k: Optional[int] = None
) -> Dict[str, List[Document]]:
"""
Retrieve documents with auto-merging of related chunks.
Args:
query: Search query text (for BM25-based retrieval)
query_embedding: Query embedding (for embedding-based retrieval)
filters: Optional filters to apply during retrieval
top_k: Number of documents to retrieve
Returns:
Dictionary with 'documents' key containing merged documents
"""Retrieve documents with expanded context windows around the matching sentences.
class SentenceWindowRetriever:
    """Wrap a base retriever and expand each match with surrounding sentences for context."""

    def __init__(
        self,
        document_store: InMemoryDocumentStore,
        retriever: Union[InMemoryEmbeddingRetriever, InMemoryBM25Retriever],
        window_size: int = 3,
        top_k: int = 10
    ) -> None:
        """
        Initialize the sentence window retriever.

        Args:
            document_store: Document store containing documents with sentence metadata.
            retriever: Base retriever used for the initial search.
            window_size: Number of sentences to include before and after a match.
            top_k: Number of documents to retrieve.
        """

    def run(
        self,
        query: Optional[str] = None,
        query_embedding: Optional[List[float]] = None,
        filters: Optional[Dict[str, Any]] = None,
        top_k: Optional[int] = None,
        window_size: Optional[int] = None
    ) -> Dict[str, List[Document]]:
        """
        Retrieve documents with expanded sentence windows.

        Args:
            query: Search query text (for BM25-based retrieval).
            query_embedding: Query embedding (for embedding-based retrieval).
            filters: Optional filters to apply during retrieval.
            top_k: Number of documents to retrieve; overrides the init-time value.
            window_size: Context window size in sentences; overrides the init-time value.

        Returns:
            Dictionary with a 'documents' key containing documents with expanded context.
        """
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack import Document

# Sample documents to index.
documents = [
    Document(content="Python is a programming language."),
    Document(content="Berlin is the capital of Germany."),
    Document(content="Machine learning uses algorithms to find patterns.")
]

# Embed the documents and write them to an in-memory store.
document_store = InMemoryDocumentStore()
doc_embedder = OpenAIDocumentEmbedder()
embedded_docs = doc_embedder.run(documents=documents)
document_store.write_documents(embedded_docs["documents"])

# Configure the embedding retriever over the populated store.
retriever = InMemoryEmbeddingRetriever(
    document_store=document_store,
    top_k=2
)

# Embed the query text with the matching text embedder.
text_embedder = OpenAITextEmbedder()
query_result = text_embedder.run(text="What is Python?")
query_embedding = query_result["embedding"]

# Run the similarity search and show scored results.
result = retriever.run(query_embedding=query_embedding)
for doc in result["documents"]:
    print(f"Score: {doc.score:.3f} - {doc.content}")
from haystack.components.retrievers import InMemoryBM25Retriever
# Configure a BM25 keyword retriever over the same store.
bm25_retriever = InMemoryBM25Retriever(
    document_store=document_store,
    top_k=3,
    scale_score=True
)

# Run the keyword search and show scored results.
result = bm25_retriever.run(query="programming language Python")
for doc in result["documents"]:
    print(f"BM25 Score: {doc.score:.3f} - {doc.content}")
from haystack.components.retrievers import FilterRetriever
# Documents carrying metadata used for filtering.
documents_with_meta = [
    Document(content="Python tutorial", meta={"language": "en", "type": "tutorial"}),
    Document(content="Java guide", meta={"language": "en", "type": "guide"}),
    Document(content="Tutorial de Python", meta={"language": "es", "type": "tutorial"})
]
document_store.write_documents(documents_with_meta)

# Select documents purely by their metadata — no scoring involved.
filter_retriever = FilterRetriever(document_store=document_store)
result = filter_retriever.run(
    filters={"language": "en", "type": "tutorial"}
)
for doc in result["documents"]:
    print(f"Content: {doc.content} - Meta: {doc.meta}")
from haystack.components.retrievers import AutoMergingRetriever
# Parent/child documents forming a simple hierarchy via meta fields.
parent_doc = Document(
    content="Complete guide to machine learning algorithms",
    meta={"doc_id": "ml_guide", "level": "parent"}
)
child_docs = [
    Document(
        content="Linear regression is a supervised learning algorithm",
        meta={"doc_id": "ml_guide_1", "parent_id": "ml_guide", "level": "child"}
    ),
    Document(
        content="Decision trees split data based on feature values",
        meta={"doc_id": "ml_guide_2", "parent_id": "ml_guide", "level": "child"}
    )
]
document_store.write_documents([parent_doc] + child_docs)

# Wrap a base embedding retriever so matched child chunks merge into parents.
base_retriever = InMemoryEmbeddingRetriever(document_store=document_store)
auto_merger = AutoMergingRetriever(
    document_store=document_store,
    retriever=base_retriever,
    threshold=0.7
)
result = auto_merger.run(query_embedding=query_embedding)
for doc in result["documents"]:
    print(f"Merged doc: {doc.content[:100]}...")
from haystack.components.retrievers import SentenceWindowRetriever
# A document annotated with per-sentence metadata for window expansion.
sentence_docs = [
    Document(
        content="First sentence. Second sentence. Third sentence.",
        meta={"sentences": ["First sentence.", "Second sentence.", "Third sentence."]}
    )
]
document_store.write_documents(sentence_docs)

# Expand each match with one sentence of context on either side.
window_retriever = SentenceWindowRetriever(
    document_store=document_store,
    retriever=InMemoryEmbeddingRetriever(document_store=document_store),
    window_size=1
)
result = window_retriever.run(query_embedding=query_embedding)
for doc in result["documents"]:
    print(f"Expanded context: {doc.content}")
from haystack import Pipeline
from haystack.components.joiners import DocumentJoiner

# Hybrid retrieval: run embedding and BM25 retrievers in one pipeline
# and merge their outputs with a DocumentJoiner.
retrieval_pipeline = Pipeline()
retrieval_pipeline.add_component("embedding_retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=5))
retrieval_pipeline.add_component("bm25_retriever", InMemoryBM25Retriever(document_store=document_store, top_k=5))
retrieval_pipeline.add_component("document_joiner", DocumentJoiner(join_mode="merge"))

# Both retrievers feed the joiner's documents input.
retrieval_pipeline.connect("embedding_retriever.documents", "document_joiner.documents")
retrieval_pipeline.connect("bm25_retriever.documents", "document_joiner.documents")

result = retrieval_pipeline.run({
    "embedding_retriever": {"query_embedding": query_embedding},
    "bm25_retriever": {"query": "Python programming"}
})
combined_docs = result["document_joiner"]["documents"]
print(f"Retrieved {len(combined_docs)} documents using hybrid approach")
from typing import Optional, Dict, Any, List, Union
from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
class RetrievalResult:
    # Describes one retrieval call: the documents returned together with the
    # query text, query embedding, and filters that produced them.
    documents: List[Document]
    query: Optional[str]
    query_embedding: Optional[List[float]]
    filters: Optional[Dict[str, Any]]
class SimilarityFunction:
COSINE = "cosine"
DOT_PRODUCT = "dot_product"
EUCLIDEAN = "euclidean"Install with Tessl CLI
npx tessl i tessl/pypi-haystack-ai