or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

agent-framework.md, core-framework.md, document-processing.md, document-stores.md, evaluation.md, index.md, prompt-building.md, retrieval.md, text-embeddings.md, text-generation.md

retrieval.mddocs/

0

# Retrieval

1

2

Search and retrieve relevant documents using various retrieval strategies including vector search, keyword search, filtering, and advanced retrieval techniques. Haystack provides comprehensive retrieval components for building robust information retrieval systems.

3

4

## Capabilities

5

6

### In-Memory Embedding Retrieval

7

8

Retrieve documents using vector similarity search with embeddings stored in memory.

9

10

```python { .api }

11

class InMemoryEmbeddingRetriever:

12

def __init__(

13

self,

14

document_store: InMemoryDocumentStore,

15

filters: Optional[Dict[str, Any]] = None,

16

top_k: int = 10,

17

scale_score: bool = False,

18

return_embedding: bool = False

19

) -> None:

20

"""

21

Initialize in-memory embedding retriever.

22

23

Args:

24

document_store: Document store containing embedded documents

25

filters: Filters to apply to documents during retrieval

26

top_k: Number of documents to retrieve

27

scale_score: Whether to scale similarity scores to [0,1] range

28

return_embedding: Whether to return document embeddings in results

29

"""

30

31

def run(

32

self,

33

query_embedding: List[float],

34

filters: Optional[Dict[str, Any]] = None,

35

top_k: Optional[int] = None,

36

scale_score: Optional[bool] = None,

37

return_embedding: Optional[bool] = None

38

) -> Dict[str, List[Document]]:

39

"""

40

Retrieve documents using embedding similarity search.

41

42

Args:

43

query_embedding: Vector embedding of the query

44

filters: Optional filters to apply during retrieval

45

top_k: Number of documents to retrieve

46

scale_score: Whether to scale similarity scores

47

return_embedding: Whether to return document embeddings

48

49

Returns:

50

Dictionary with 'documents' key containing list of retrieved documents

51

"""

52

```

53

54

### In-Memory BM25 Retrieval

55

56

Perform keyword-based retrieval using the BM25 scoring algorithm.

57

58

```python { .api }

59

class InMemoryBM25Retriever:

60

def __init__(

61

self,

62

document_store: InMemoryDocumentStore,

63

filters: Optional[Dict[str, Any]] = None,

64

top_k: int = 10,

65

scale_score: bool = False

66

) -> None:

67

"""

68

Initialize in-memory BM25 retriever.

69

70

Args:

71

document_store: Document store containing documents

72

filters: Filters to apply to documents during retrieval

73

top_k: Number of documents to retrieve

74

scale_score: Whether to scale BM25 scores to [0,1] range

75

"""

76

77

def run(

78

self,

79

query: str,

80

filters: Optional[Dict[str, Any]] = None,

81

top_k: Optional[int] = None,

82

scale_score: Optional[bool] = None

83

) -> Dict[str, List[Document]]:

84

"""

85

Retrieve documents using BM25 keyword search.

86

87

Args:

88

query: Search query text

89

filters: Optional filters to apply during retrieval

90

top_k: Number of documents to retrieve

91

scale_score: Whether to scale BM25 scores

92

93

Returns:

94

Dictionary with 'documents' key containing list of retrieved documents

95

"""

96

```

97

98

### Filter-Based Retrieval

99

100

Retrieve documents based on metadata filters without scoring.

101

102

```python { .api }

103

class FilterRetriever:

104

def __init__(

105

self,

106

document_store: InMemoryDocumentStore,

107

filters: Optional[Dict[str, Any]] = None

108

) -> None:

109

"""

110

Initialize filter-based retriever.

111

112

Args:

113

document_store: Document store containing documents

114

filters: Default filters to apply during retrieval

115

"""

116

117

def run(

118

self,

119

filters: Optional[Dict[str, Any]] = None

120

) -> Dict[str, List[Document]]:

121

"""

122

Retrieve documents using metadata filters.

123

124

Args:

125

filters: Filters to apply for document selection

126

127

Returns:

128

Dictionary with 'documents' key containing list of filtered documents

129

"""

130

```

131

132

### Auto-Merging Retrieval

133

134

An advanced retrieval strategy that automatically merges smaller document chunks into their parent documents when enough of a parent's chunks match the query, as controlled by the `threshold` parameter.

135

136

```python { .api }

137

class AutoMergingRetriever:

138

def __init__(

139

self,

140

document_store: InMemoryDocumentStore,

141

retriever: Union[InMemoryEmbeddingRetriever, InMemoryBM25Retriever],

142

threshold: float = 0.8,

143

top_k: int = 10

144

) -> None:

145

"""

146

Initialize auto-merging retriever.

147

148

Args:

149

document_store: Document store containing hierarchical documents

150

retriever: Base retriever to use for initial search

151

threshold: Similarity threshold for merging child documents

152

top_k: Number of documents to retrieve

153

"""

154

155

def run(

156

self,

157

query: Optional[str] = None,

158

query_embedding: Optional[List[float]] = None,

159

filters: Optional[Dict[str, Any]] = None,

160

top_k: Optional[int] = None

161

) -> Dict[str, List[Document]]:

162

"""

163

Retrieve documents with auto-merging of related chunks.

164

165

Args:

166

query: Search query text (for BM25-based retrieval)

167

query_embedding: Query embedding (for embedding-based retrieval)

168

filters: Optional filters to apply during retrieval

169

top_k: Number of documents to retrieve

170

171

Returns:

172

Dictionary with 'documents' key containing merged documents

173

"""

174

```

175

176

### Sentence Window Retrieval

177

178

Retrieve documents with expanded context windows around the matching sentences.

179

180

```python { .api }

181

class SentenceWindowRetriever:

182

def __init__(

183

self,

184

document_store: InMemoryDocumentStore,

185

retriever: Union[InMemoryEmbeddingRetriever, InMemoryBM25Retriever],

186

window_size: int = 3,

187

top_k: int = 10

188

) -> None:

189

"""

190

Initialize sentence window retriever.

191

192

Args:

193

document_store: Document store containing documents with sentence metadata

194

retriever: Base retriever to use for initial search

195

window_size: Number of sentences to include before and after match

196

top_k: Number of documents to retrieve

197

"""

198

199

def run(

200

self,

201

query: Optional[str] = None,

202

query_embedding: Optional[List[float]] = None,

203

filters: Optional[Dict[str, Any]] = None,

204

top_k: Optional[int] = None,

205

window_size: Optional[int] = None

206

) -> Dict[str, List[Document]]:

207

"""

208

Retrieve documents with expanded sentence windows.

209

210

Args:

211

query: Search query text (for BM25-based retrieval)

212

query_embedding: Query embedding (for embedding-based retrieval)

213

filters: Optional filters to apply during retrieval

214

top_k: Number of documents to retrieve

215

window_size: Context window size in sentences

216

217

Returns:

218

Dictionary with 'documents' key containing documents with expanded context

219

"""

220

```

221

222

## Usage Examples

223

224

### Basic Embedding Retrieval

225

226

```python

227

from haystack.document_stores.in_memory import InMemoryDocumentStore

228

from haystack.components.retrievers import InMemoryEmbeddingRetriever

229

from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder

230

from haystack import Document

231

232

# Create documents and embed them

233

documents = [

234

Document(content="Python is a programming language."),

235

Document(content="Berlin is the capital of Germany."),

236

Document(content="Machine learning uses algorithms to find patterns.")

237

]

238

239

# Initialize document store

240

document_store = InMemoryDocumentStore()

241

242

# Embed documents

243

doc_embedder = OpenAIDocumentEmbedder()

244

embedded_docs = doc_embedder.run(documents=documents)

245

document_store.write_documents(embedded_docs["documents"])

246

247

# Set up retriever

248

retriever = InMemoryEmbeddingRetriever(

249

document_store=document_store,

250

top_k=2

251

)

252

253

# Create query embedding

254

text_embedder = OpenAITextEmbedder()

255

query_result = text_embedder.run(text="What is Python?")

256

query_embedding = query_result["embedding"]

257

258

# Retrieve relevant documents

259

result = retriever.run(query_embedding=query_embedding)

260

for doc in result["documents"]:

261

print(f"Score: {doc.score:.3f} - {doc.content}")

262

```

263

264

### BM25 Keyword Search

265

266

```python

267

from haystack.components.retrievers import InMemoryBM25Retriever

268

269

# Initialize BM25 retriever

270

bm25_retriever = InMemoryBM25Retriever(

271

document_store=document_store,

272

top_k=3,

273

scale_score=True

274

)

275

276

# Perform keyword search

277

result = bm25_retriever.run(query="programming language Python")

278

for doc in result["documents"]:

279

print(f"BM25 Score: {doc.score:.3f} - {doc.content}")

280

```

281

282

### Filter-Based Retrieval

283

284

```python

285

from haystack.components.retrievers import FilterRetriever

286

287

# Add documents with metadata

288

documents_with_meta = [

289

Document(content="Python tutorial", meta={"language": "en", "type": "tutorial"}),

290

Document(content="Java guide", meta={"language": "en", "type": "guide"}),

291

Document(content="Tutorial de Python", meta={"language": "es", "type": "tutorial"})

292

]

293

294

document_store.write_documents(documents_with_meta)

295

296

# Initialize filter retriever

297

filter_retriever = FilterRetriever(document_store=document_store)

298

299

# Retrieve documents by metadata

300

result = filter_retriever.run(

301

filters={"language": "en", "type": "tutorial"}

302

)

303

304

for doc in result["documents"]:

305

print(f"Content: {doc.content} - Meta: {doc.meta}")

306

```

307

308

### Advanced Auto-Merging Retrieval

309

310

```python

311

from haystack.components.retrievers import AutoMergingRetriever

312

313

# Create hierarchical documents (parent-child relationships)

314

parent_doc = Document(

315

content="Complete guide to machine learning algorithms",

316

meta={"doc_id": "ml_guide", "level": "parent"}

317

)

318

319

child_docs = [

320

Document(

321

content="Linear regression is a supervised learning algorithm",

322

meta={"doc_id": "ml_guide_1", "parent_id": "ml_guide", "level": "child"}

323

),

324

Document(

325

content="Decision trees split data based on feature values",

326

meta={"doc_id": "ml_guide_2", "parent_id": "ml_guide", "level": "child"}

327

)

328

]

329

330

# Store hierarchical documents

331

document_store.write_documents([parent_doc] + child_docs)

332

333

# Create base retriever

334

base_retriever = InMemoryEmbeddingRetriever(document_store=document_store)

335

336

# Initialize auto-merging retriever

337

auto_merger = AutoMergingRetriever(

338

document_store=document_store,

339

retriever=base_retriever,

340

threshold=0.7

341

)

342

343

# Retrieve with auto-merging

344

result = auto_merger.run(query_embedding=query_embedding)

345

for doc in result["documents"]:

346

print(f"Merged doc: {doc.content[:100]}...")

347

```

348

349

### Sentence Window Retrieval

350

351

```python

352

from haystack.components.retrievers import SentenceWindowRetriever

353

354

# Documents with sentence-level metadata

355

sentence_docs = [

356

Document(

357

content="First sentence. Second sentence. Third sentence.",

358

meta={"sentences": ["First sentence.", "Second sentence.", "Third sentence."]}

359

)

360

]

361

362

document_store.write_documents(sentence_docs)

363

364

# Initialize sentence window retriever

365

window_retriever = SentenceWindowRetriever(

366

document_store=document_store,

367

retriever=InMemoryEmbeddingRetriever(document_store=document_store),

368

window_size=1 # Include 1 sentence before and after

369

)

370

371

# Retrieve with expanded context

372

result = window_retriever.run(query_embedding=query_embedding)

373

for doc in result["documents"]:

374

print(f"Expanded context: {doc.content}")

375

```

376

377

### Combining Multiple Retrieval Strategies

378

379

```python

380

from haystack import Pipeline

381

from haystack.components.joiners import DocumentJoiner

382

383

# Create a pipeline that combines multiple retrieval strategies

384

retrieval_pipeline = Pipeline()

385

386

# Add multiple retrievers

387

retrieval_pipeline.add_component("embedding_retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=5))

388

retrieval_pipeline.add_component("bm25_retriever", InMemoryBM25Retriever(document_store=document_store, top_k=5))

389

retrieval_pipeline.add_component("document_joiner", DocumentJoiner(join_mode="merge"))

390

391

# Connect retrievers to joiner

392

retrieval_pipeline.connect("embedding_retriever.documents", "document_joiner.documents")

393

retrieval_pipeline.connect("bm25_retriever.documents", "document_joiner.documents")

394

395

# Run hybrid retrieval

396

result = retrieval_pipeline.run({

397

"embedding_retriever": {"query_embedding": query_embedding},

398

"bm25_retriever": {"query": "Python programming"}

399

})

400

401

combined_docs = result["document_joiner"]["documents"]

402

print(f"Retrieved {len(combined_docs)} documents using hybrid approach")

403

```

404

405

## Types

406

407

```python { .api }

408

from typing import Optional, Dict, Any, List, Union

409

from haystack import Document

410

from haystack.document_stores.in_memory import InMemoryDocumentStore

411

412

class RetrievalResult:

413

documents: List[Document]

414

query: Optional[str]

415

query_embedding: Optional[List[float]]

416

filters: Optional[Dict[str, Any]]

417

418

class SimilarityFunction:

419

COSINE = "cosine"

420

DOT_PRODUCT = "dot_product"

421

EUCLIDEAN = "euclidean"

422

```