# Document Processing

Document loading, parsing, and chunking functionality for various file formats with intelligent text splitting strategies and node creation.

## Capabilities

### Document Loading

Load documents from various sources and file formats.

```python { .api }
class SimpleDirectoryReader:
    """
    Simple directory reader for loading documents from filesystem.

    Args:
        input_dir: Directory path to read documents from
        input_files: List of specific files to load
        exclude_hidden: Whether to exclude hidden files
        errors: How to handle errors ('ignore', 'strict')
        recursive: Whether to read directories recursively
        encoding: Text encoding to use
        filename_as_id: Use filename as document ID
        required_exts: List of required file extensions
        file_extractor: Custom file extractors
        num_files_limit: Maximum number of files to load
        **kwargs: Additional arguments
    """
    def __init__(
        self,
        input_dir=None,
        input_files=None,
        exclude_hidden=True,
        errors="ignore",
        recursive=True,
        encoding="utf-8",
        filename_as_id=False,
        required_exts=None,
        file_extractor=None,
        num_files_limit=None,
        **kwargs
    ): ...

    def load_data(self, show_progress=False):
        """
        Load documents from specified sources.

        Args:
            show_progress: Whether to show loading progress

        Returns:
            List[Document]: List of loaded documents
        """

    def iter_data(self, show_progress=False):
        """Iterate over documents without loading all into memory."""

def download_loader(loader_class):
    """
    Download a community loader from LlamaHub.

    Args:
        loader_class: Name of the loader class to download

    Returns:
        class: Loader class ready for instantiation
    """
```

### Document Schema

Core document and node representations.

```python { .api }
class Document:
    """
    Document object for storing text and metadata.

    Args:
        text: Document text content
        metadata: Dictionary of metadata
        excluded_embed_metadata_keys: Keys to exclude from embedding
        excluded_llm_metadata_keys: Keys to exclude from LLM context
        relationships: Relationships to other documents
        **kwargs: Additional arguments
    """
    def __init__(
        self,
        text=None,
        metadata=None,
        excluded_embed_metadata_keys=None,
        excluded_llm_metadata_keys=None,
        relationships=None,
        **kwargs
    ): ...

    @property
    def text(self):
        """Document text content."""

    @property
    def metadata(self):
        """Document metadata dictionary."""

    def get_content(self, metadata_mode="all"):
        """
        Get document content with optional metadata.

        Args:
            metadata_mode: How to include metadata ("all", "embed", "llm", "none")

        Returns:
            str: Document content with metadata
        """

class TextNode:
    """
    Text node for chunked document content.

    Args:
        text: Node text content
        metadata: Node metadata
        relationships: Relationships to other nodes
        **kwargs: Additional arguments
    """
    def __init__(self, text=None, metadata=None, relationships=None, **kwargs): ...

    @property
    def text(self):
        """Node text content."""

    @property
    def metadata(self):
        """Node metadata."""

    def get_content(self, metadata_mode="all"):
        """Get node content with metadata."""

class ImageNode:
    """
    Node for image content.

    Args:
        image: Image data or path
        image_path: Path to image file
        image_url: URL to image
        text: Optional text description
        **kwargs: Additional arguments
    """
    def __init__(self, image=None, image_path=None, image_url=None, text=None, **kwargs): ...

class IndexNode:
    """
    Node that references other indices.

    Args:
        text: Node text content
        index_id: ID of referenced index
        **kwargs: Additional arguments
    """
    def __init__(self, text=None, index_id=None, **kwargs): ...
```

### Text Splitting

Various text splitting strategies for creating chunks from documents.

```python { .api }
class SentenceSplitter:
    """
    Sentence-based text splitter.

    Args:
        chunk_size: Maximum chunk size in tokens
        chunk_overlap: Overlap between chunks in tokens
        separator: Sentence separator pattern
        paragraph_separator: Paragraph separator
        chunking_tokenizer_fn: Custom tokenizer function
        secondary_chunking_regex: Secondary chunking pattern
    """
    def __init__(
        self,
        chunk_size=1024,
        chunk_overlap=200,
        separator=" ",
        paragraph_separator="\\n\\n\\n",
        chunking_tokenizer_fn=None,
        secondary_chunking_regex="[^,.;。?!]+[,.;。?!]?",
    ): ...

    def split_text(self, text):
        """
        Split text into chunks.

        Args:
            text: Text to split

        Returns:
            List[str]: List of text chunks
        """

    def split_texts(self, texts):
        """Split multiple texts."""

class TokenTextSplitter:
    """
    Token-based text splitter.

    Args:
        chunk_size: Maximum chunk size in tokens
        chunk_overlap: Overlap between chunks in tokens
        separator: Token separator
        backup_separators: Fallback separators
        tokenizer: Tokenizer to use
    """
    def __init__(
        self,
        chunk_size=1024,
        chunk_overlap=200,
        separator=" ",
        backup_separators=["\\n"],
        tokenizer=None,
    ): ...

    def split_text(self, text):
        """Split text using token counting."""

class CodeSplitter:
    """
    Code-aware text splitter that respects code structure.

    Args:
        language: Programming language for syntax awareness
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap between chunks
        max_chars: Maximum characters per chunk
    """
    def __init__(self, language="python", chunk_size=1024, chunk_overlap=200, max_chars=1500): ...

    def split_text(self, text):
        """Split code text preserving structure."""

class SemanticSplitterNodeParser:
    """
    Semantic-based text splitter using embeddings.

    Args:
        buffer_size: Buffer size for semantic analysis
        breakpoint_percentile_threshold: Threshold for breakpoint detection
        embed_model: Embedding model for semantic analysis
    """
    def __init__(self, buffer_size=1, breakpoint_percentile_threshold=95, embed_model=None): ...

    def get_nodes_from_documents(self, documents, show_progress=False):
        """
        Create nodes from documents using semantic splitting.

        Args:
            documents: List of documents to process
            show_progress: Whether to show progress

        Returns:
            List[TextNode]: List of semantic text nodes
        """

class SentenceWindowNodeParser:
    """
    Sentence window splitter for context-aware chunking.

    Args:
        window_size: Number of sentences per window
        window_metadata_key: Metadata key for window info
        original_text_metadata_key: Metadata key for original text
    """
    def __init__(self, window_size=3, window_metadata_key="window", original_text_metadata_key="original_text"): ...

    def get_nodes_from_documents(self, documents, show_progress=False):
        """Create windowed nodes with sentence context."""
```

### Node Processing

Transform and process nodes after creation.

```python { .api }
class NodeParser:
    """Base class for node parsers."""
    def get_nodes_from_documents(self, documents, show_progress=False):
        """
        Parse documents into nodes.

        Args:
            documents: List of documents
            show_progress: Whether to show progress

        Returns:
            List[BaseNode]: List of processed nodes
        """

class SimpleNodeParser(NodeParser):
    """
    Simple node parser with basic text splitting.

    Args:
        text_splitter: Text splitter to use
        include_metadata: Whether to include metadata
        include_prev_next_rel: Whether to include previous/next relationships
    """
    def __init__(self, text_splitter=None, include_metadata=True, include_prev_next_rel=True): ...

class HierarchicalNodeParser(NodeParser):
    """
    Hierarchical node parser for multi-level document structure.

    Args:
        node_parsers: List of node parsers for different levels
    """
    def __init__(self, node_parsers=None): ...
```

## Node Postprocessors

Process and filter nodes after retrieval.

```python { .api }
class SimilarityPostprocessor:
    """
    Filter nodes by similarity threshold.

    Args:
        similarity_cutoff: Minimum similarity score
    """
    def __init__(self, similarity_cutoff=0.7): ...

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Filter nodes by similarity."""

class KeywordNodePostprocessor:
    """
    Filter nodes by keyword presence.

    Args:
        required_keywords: Keywords that must be present
        exclude_keywords: Keywords that must not be present
    """
    def __init__(self, required_keywords=None, exclude_keywords=None): ...

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Filter nodes by keyword criteria."""

class LLMRerank:
    """
    Rerank nodes using LLM-based scoring.

    Args:
        llm: LLM to use for reranking
        top_n: Number of top nodes to return
    """
    def __init__(self, llm=None, top_n=5): ...

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Rerank nodes using LLM scoring."""
```

## Types

```python { .api }
from enum import Enum

class MetadataMode(str, Enum):
    """Metadata inclusion modes."""
    ALL = "all"
    EMBED = "embed"
    LLM = "llm"
    NONE = "none"

class NodeRelationship(str, Enum):
    """Node relationship types."""
    SOURCE = "SOURCE"
    PREVIOUS = "PREVIOUS"
    NEXT = "NEXT"
    PARENT = "PARENT"
    CHILD = "CHILD"
```