# Document Processing and Retrieval

Tools and components for loading, processing, splitting, embedding, and retrieving documents to enable retrieval-augmented generation (RAG) workflows. This enables AI applications to work with external knowledge sources and large document collections.

## Capabilities

### Document Retrieval

Base classes and implementations for retrieving relevant documents based on queries.

```python { .api }
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document

class BaseRetriever:
    """Base class for document retrievers."""

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Retrieve documents relevant to query."""

    def invoke(self, input: str) -> List[Document]:
        """Invoke retriever with input string."""

    def batch(self, inputs: List[str]) -> List[List[Document]]:
        """Process multiple queries in batch."""

class VectorStoreRetriever(BaseRetriever):
    """Retriever backed by vector store."""

    def __init__(
        self,
        vectorstore: VectorStore,
        search_type: str = "similarity",
        search_kwargs: Optional[dict] = None
    ): ...

    def get_relevant_documents(self, query: str) -> List[Document]: ...
```

### Advanced Retrieval Strategies

Sophisticated retrieval methods that enhance basic similarity search with additional processing and filtering.

```python { .api }
class MultiQueryRetriever(BaseRetriever):
    """Generate multiple queries for more comprehensive retrieval."""

    @classmethod
    def from_llm(
        cls,
        retriever: BaseRetriever,
        llm: BaseLanguageModel,
        prompt: Optional[BasePromptTemplate] = None,
        **kwargs: Any
    ) -> "MultiQueryRetriever": ...

class ContextualCompressionRetriever(BaseRetriever):
    """Compress retrieved documents based on query context."""

    def __init__(
        self,
        base_compressor: BaseDocumentCompressor,
        base_retriever: BaseRetriever
    ): ...

class EnsembleRetriever(BaseRetriever):
    """Combine multiple retrievers with weighted results."""

    def __init__(
        self,
        retrievers: List[BaseRetriever],
        weights: Optional[List[float]] = None,
        **kwargs: Any
    ): ...

class ParentDocumentRetriever(BaseRetriever):
    """Retrieve parent documents from child document matches."""

    def __init__(
        self,
        vectorstore: VectorStore,
        docstore: BaseStore,
        child_splitter: TextSplitter,
        parent_splitter: Optional[TextSplitter] = None,
        **kwargs: Any
    ): ...

class SelfQueryRetriever(BaseRetriever):
    """Retriever that can filter based on metadata using natural language."""

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        vectorstore: VectorStore,
        document_contents: str,
        metadata_field_info: List[AttributeInfo],
        **kwargs: Any
    ) -> "SelfQueryRetriever": ...

class TimeWeightedVectorStoreRetriever(BaseRetriever):
    """Retriever with time-based weighting of documents."""

    def __init__(
        self,
        vectorstore: VectorStore,
        decay_rate: float = -0.0001,
        **kwargs: Any
    ): ...
```

### Retriever Utilities

Helper classes and functions for retriever processing and management.

```python { .api }
class MergerRetriever(BaseRetriever):
    """Merge and deduplicate results from multiple retrievers."""

    def __init__(
        self,
        retrievers: List[BaseRetriever],
        **kwargs: Any
    ): ...

class RePhraseQueryRetriever(BaseRetriever):
    """Rephrase queries before retrieval for better results."""

    def __init__(
        self,
        retriever: BaseRetriever,
        llm_chain: LLMChain
    ): ...
```

### Vector Store Integration

Integration with vector databases for similarity-based document retrieval.

```python { .api }
from langchain_core.vectorstores import VectorStore

class VectorStore:
    """Base vector store class for similarity search."""

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any
    ) -> List[Document]:
        """Search for similar documents."""

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Search with similarity scores."""

    def as_retriever(self, **kwargs: Any) -> VectorStoreRetriever:
        """Convert vector store to retriever."""

    @classmethod
    def from_documents(
        cls,
        documents: List[Document],
        embedding: Embeddings,
        **kwargs: Any
    ) -> "VectorStore":
        """Create vector store from documents."""
```

### Document Processing Chains

Chains specifically designed for document processing workflows within retrieval systems.

```python { .api }
def create_retrieval_chain(
    retriever: BaseRetriever,
    combine_docs_chain: Runnable
) -> Runnable:
    """
    Create a retrieval chain combining retriever and document processing.

    Parameters:
    - retriever: Document retriever for finding relevant content
    - combine_docs_chain: Chain to process and combine retrieved documents

    Returns:
    Runnable chain that retrieves and processes documents
    """

def create_history_aware_retriever(
    llm: BaseLanguageModel,
    retriever: BaseRetriever,
    prompt: BasePromptTemplate
) -> Runnable:
    """
    Create retriever that incorporates conversation history.

    Parameters:
    - llm: Language model for processing history
    - retriever: Base document retriever
    - prompt: Template for combining history with query

    Returns:
    History-aware retriever runnable
    """
```

### Document Loaders and Text Splitters

**Note**: Document loaders and text splitters have been moved to specialized packages:

```python { .api }
# Document loaders moved to langchain_community
from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
    CSVLoader,
    JSONLoader,
    WebBaseLoader,
    DirectoryLoader
)

# Text splitters moved to langchain_text_splitters
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
    SpacyTextSplitter
)
```

### Embeddings Integration

**Note**: Embedding models have been moved to provider-specific packages:

```python { .api }
# Core embeddings interface
from langchain_core.embeddings import Embeddings

class Embeddings:
    """Base embeddings class."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed multiple documents."""

    def embed_query(self, text: str) -> List[float]:
        """Embed single query."""

# Provider-specific embeddings
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import CohereEmbeddings

# Cached embeddings
from langchain.embeddings import CacheBackedEmbeddings

class CacheBackedEmbeddings:
    """Embeddings with caching support."""

    def __init__(
        self,
        underlying_embeddings: Embeddings,
        document_embedding_cache: BaseStore,
        **kwargs: Any
    ): ...
```

## Usage Examples

### Basic Retrieval Setup

```python
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

# Load and split documents
loader = TextLoader("document.txt")
documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Create embeddings and vector store
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(texts, embeddings)

# Create retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Retrieve documents
docs = retriever.get_relevant_documents("What is the main topic?")
```

### Multi-Query Retrieval

```python
from langchain.retrievers import MultiQueryRetriever
from langchain_openai import OpenAI

# Create multi-query retriever
llm = OpenAI(temperature=0)
multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(),
    llm=llm
)

# This generates multiple query variations for better retrieval
docs = multi_query_retriever.get_relevant_documents(
    "What are the benefits of renewable energy?"
)
```

### Contextual Compression

```python
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Create compressor
compressor = LLMChainExtractor.from_llm(llm)

# Create compression retriever
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectorstore.as_retriever()
)

# Retrieves and compresses documents based on query
docs = compression_retriever.get_relevant_documents(
    "What are the environmental impacts?"
)
```

### Self-Query Retrieval

```python
from langchain.retrievers import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Define metadata fields
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The source of the document",
        type="string"
    ),
    AttributeInfo(
        name="date",
        description="The date the document was created",
        type="string"
    )
]

# Create self-query retriever
self_query_retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents="Research papers on climate change",
    metadata_field_info=metadata_field_info
)

# Can filter based on metadata using natural language
docs = self_query_retriever.get_relevant_documents(
    "Papers from 2023 about solar energy"
)
```

### Ensemble Retrieval

```python
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

# Create different types of retrievers
bm25_retriever = BM25Retriever.from_documents(texts)
faiss_retriever = vectorstore.as_retriever()

# Combine retrievers with ensemble
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5]
)

# Gets results from both retrievers and combines them
docs = ensemble_retriever.get_relevant_documents(
    "machine learning applications"
)
```

### Complete RAG Pipeline

```python
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Create prompt for QA
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise.\n\n{context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

# Create document processing chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Create full RAG chain
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Use the complete pipeline
response = rag_chain.invoke({"input": "What are the key findings?"})
print(response["answer"])
```