
# Document Processing and Retrieval

Tools and components for loading, processing, splitting, embedding, and retrieving documents to enable retrieval-augmented generation (RAG) workflows. This enables AI applications to work with external knowledge sources and large document collections.

## Capabilities

### Document Retrieval

Base classes and implementations for retrieving relevant documents based on queries.

```python { .api }
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document

class BaseRetriever:
    """Base class for document retrievers."""

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Retrieve documents relevant to query."""

    def invoke(self, input: str) -> List[Document]:
        """Invoke retriever with input string."""

    def batch(self, inputs: List[str]) -> List[List[Document]]:
        """Process multiple queries in batch."""

class VectorStoreRetriever(BaseRetriever):
    """Retriever backed by vector store."""

    def __init__(
        self,
        vectorstore: VectorStore,
        search_type: str = "similarity",
        search_kwargs: Optional[dict] = None
    ): ...

    def get_relevant_documents(self, query: str) -> List[Document]: ...
```

### Advanced Retrieval Strategies

Sophisticated retrieval methods that enhance basic similarity search with additional processing and filtering.

```python { .api }
class MultiQueryRetriever(BaseRetriever):
    """Generate multiple queries for more comprehensive retrieval."""

    @classmethod
    def from_llm(
        cls,
        retriever: BaseRetriever,
        llm: BaseLanguageModel,
        prompt: Optional[BasePromptTemplate] = None,
        **kwargs: Any
    ) -> "MultiQueryRetriever": ...

class ContextualCompressionRetriever(BaseRetriever):
    """Compress retrieved documents based on query context."""

    def __init__(
        self,
        base_compressor: BaseDocumentCompressor,
        base_retriever: BaseRetriever
    ): ...

class EnsembleRetriever(BaseRetriever):
    """Combine multiple retrievers with weighted results."""

    def __init__(
        self,
        retrievers: List[BaseRetriever],
        weights: Optional[List[float]] = None,
        **kwargs: Any
    ): ...

class ParentDocumentRetriever(BaseRetriever):
    """Retrieve parent documents from child document matches."""

    def __init__(
        self,
        vectorstore: VectorStore,
        docstore: BaseStore,
        child_splitter: TextSplitter,
        parent_splitter: Optional[TextSplitter] = None,
        **kwargs: Any
    ): ...

class SelfQueryRetriever(BaseRetriever):
    """Retriever that can filter based on metadata using natural language."""

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        vectorstore: VectorStore,
        document_contents: str,
        metadata_field_info: List[AttributeInfo],
        **kwargs: Any
    ) -> "SelfQueryRetriever": ...

class TimeWeightedVectorStoreRetriever(BaseRetriever):
    """Retriever with time-based weighting of documents."""

    def __init__(
        self,
        vectorstore: VectorStore,
        decay_rate: float = -0.0001,
        **kwargs: Any
    ): ...
```

### Retriever Utilities

Helper classes and functions for retriever processing and management.

```python { .api }
class MergerRetriever(BaseRetriever):
    """Merge and deduplicate results from multiple retrievers."""

    def __init__(
        self,
        retrievers: List[BaseRetriever],
        **kwargs: Any
    ): ...

class RePhraseQueryRetriever(BaseRetriever):
    """Rephrase queries before retrieval for better results."""

    def __init__(
        self,
        retriever: BaseRetriever,
        llm_chain: LLMChain
    ): ...
```

### Vector Store Integration

Integration with vector databases for similarity-based document retrieval.

```python { .api }
from langchain_core.vectorstores import VectorStore

class VectorStore:
    """Base vector store class for similarity search."""

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any
    ) -> List[Document]:
        """Search for similar documents."""

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Search with similarity scores."""

    def as_retriever(self, **kwargs: Any) -> VectorStoreRetriever:
        """Convert vector store to retriever."""

    @classmethod
    def from_documents(
        cls,
        documents: List[Document],
        embedding: Embeddings,
        **kwargs: Any
    ) -> "VectorStore":
        """Create vector store from documents."""
```

### Document Processing Chains

Chains specifically designed for document processing workflows within retrieval systems.

```python { .api }
def create_retrieval_chain(
    retriever: BaseRetriever,
    combine_docs_chain: Runnable
) -> Runnable:
    """
    Create a retrieval chain combining retriever and document processing.

    Parameters:
    - retriever: Document retriever for finding relevant content
    - combine_docs_chain: Chain to process and combine retrieved documents

    Returns:
    Runnable chain that retrieves and processes documents
    """

def create_history_aware_retriever(
    llm: BaseLanguageModel,
    retriever: BaseRetriever,
    prompt: BasePromptTemplate
) -> Runnable:
    """
    Create retriever that incorporates conversation history.

    Parameters:
    - llm: Language model for processing history
    - retriever: Base document retriever
    - prompt: Template for combining history with query

    Returns:
    History-aware retriever runnable
    """
```

### Document Loaders and Text Splitters

**Note**: Document loaders and text splitters have been moved to specialized packages:

```python { .api }
# Document loaders moved to langchain_community
from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
    CSVLoader,
    JSONLoader,
    WebBaseLoader,
    DirectoryLoader
)

# Text splitters moved to langchain_text_splitters
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
    SpacyTextSplitter
)
```

### Embeddings Integration

**Note**: Embedding models have been moved to provider-specific packages:

```python { .api }
# Core embeddings interface
from langchain_core.embeddings import Embeddings

class Embeddings:
    """Base embeddings class."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed multiple documents."""

    def embed_query(self, text: str) -> List[float]:
        """Embed single query."""

# Provider-specific embeddings
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import CohereEmbeddings

# Cached embeddings
from langchain.embeddings import CacheBackedEmbeddings

class CacheBackedEmbeddings:
    """Embeddings with caching support."""

    def __init__(
        self,
        underlying_embeddings: Embeddings,
        document_embedding_cache: BaseStore,
        **kwargs: Any
    ): ...
```

## Usage Examples

### Basic Retrieval Setup

```python
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter

# Load and split documents
loader = TextLoader("document.txt")
documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Create embeddings and vector store
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(texts, embeddings)

# Create retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

# Retrieve documents
docs = retriever.get_relevant_documents("What is the main topic?")
```

### Multi-Query Retrieval

```python
from langchain.retrievers import MultiQueryRetriever
from langchain_openai import OpenAI

# Create multi-query retriever
llm = OpenAI(temperature=0)
multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(),
    llm=llm
)

# This generates multiple query variations for better retrieval
docs = multi_query_retriever.get_relevant_documents(
    "What are the benefits of renewable energy?"
)
```

### Contextual Compression

```python
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Create compressor
compressor = LLMChainExtractor.from_llm(llm)

# Create compression retriever
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectorstore.as_retriever()
)

# Retrieves and compresses documents based on query
docs = compression_retriever.get_relevant_documents(
    "What are the environmental impacts?"
)
```

### Self-Query Retrieval

```python
from langchain.retrievers import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Define metadata fields
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The source of the document",
        type="string"
    ),
    AttributeInfo(
        name="date",
        description="The date the document was created",
        type="string"
    )
]

# Create self-query retriever
self_query_retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectorstore,
    document_contents="Research papers on climate change",
    metadata_field_info=metadata_field_info
)

# Can filter based on metadata using natural language
docs = self_query_retriever.get_relevant_documents(
    "Papers from 2023 about solar energy"
)
```

373

374

### Ensemble Retrieval

375

376

```python

377

from langchain.retrievers import EnsembleRetriever

378

from langchain_community.retrievers import BM25Retriever

379

380

# Create different types of retrievers

381

bm25_retriever = BM25Retriever.from_documents(texts)

382

faiss_retriever = vectorstore.as_retriever()

383

384

# Combine retrievers with ensemble

385

ensemble_retriever = EnsembleRetriever(

386

retrievers=[bm25_retriever, faiss_retriever],

387

weights=[0.5, 0.5]

388

)

389

390

# Gets results from both retrievers and combines them

391

docs = ensemble_retriever.get_relevant_documents(

392

"machine learning applications"

393

)

394

```

### Complete RAG Pipeline

```python
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Create prompt for QA
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise.\n\n{context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

# Create document processing chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Create full RAG chain
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Use the complete pipeline
response = rag_chain.invoke({"input": "What are the key findings?"})
print(response["answer"])
```