# Document Processing

Document loading, parsing, and chunking functionality for various file formats with intelligent text splitting strategies and node creation.

## Capabilities

### Document Loading

Load documents from various sources and file formats.

```python { .api }
class SimpleDirectoryReader:
    """
    Simple directory reader for loading documents from filesystem.

    Args:
        input_dir: Directory path to read documents from
        input_files: List of specific files to load
        exclude_hidden: Whether to exclude hidden files
        errors: How to handle errors ('ignore', 'strict')
        recursive: Whether to read directories recursively
        encoding: Text encoding to use
        filename_as_id: Use filename as document ID
        required_exts: List of required file extensions
        file_extractor: Custom file extractors
        num_files_limit: Maximum number of files to load
        **kwargs: Additional arguments
    """
    def __init__(
        self,
        input_dir=None,
        input_files=None,
        exclude_hidden=True,
        errors="ignore",
        recursive=True,
        encoding="utf-8",
        filename_as_id=False,
        required_exts=None,
        file_extractor=None,
        num_files_limit=None,
        **kwargs
    ): ...

    def load_data(self, show_progress=False):
        """
        Load documents from specified sources.

        Args:
            show_progress: Whether to show loading progress

        Returns:
            List[Document]: List of loaded documents
        """

    def iter_data(self, show_progress=False):
        """Iterate over documents without loading all into memory."""

def download_loader(loader_class):
    """
    Download a community loader from LlamaHub.

    Args:
        loader_class: Name of the loader class to download

    Returns:
        class: Loader class ready for instantiation
    """
```

### Document Schema

Core document and node representations.

```python { .api }
class Document:
    """
    Document object for storing text and metadata.

    Args:
        text: Document text content
        metadata: Dictionary of metadata
        excluded_embed_metadata_keys: Keys to exclude from embedding
        excluded_llm_metadata_keys: Keys to exclude from LLM context
        relationships: Relationships to other documents
        **kwargs: Additional arguments
    """
    def __init__(
        self,
        text=None,
        metadata=None,
        excluded_embed_metadata_keys=None,
        excluded_llm_metadata_keys=None,
        relationships=None,
        **kwargs
    ): ...

    @property
    def text(self):
        """Document text content."""

    @property
    def metadata(self):
        """Document metadata dictionary."""

    def get_content(self, metadata_mode="all"):
        """
        Get document content with optional metadata.

        Args:
            metadata_mode: How to include metadata ("all", "embed", "llm", "none")

        Returns:
            str: Document content with metadata
        """

class TextNode:
    """
    Text node for chunked document content.

    Args:
        text: Node text content
        metadata: Node metadata
        relationships: Relationships to other nodes
        **kwargs: Additional arguments
    """
    def __init__(self, text=None, metadata=None, relationships=None, **kwargs): ...

    @property
    def text(self):
        """Node text content."""

    @property
    def metadata(self):
        """Node metadata."""

    def get_content(self, metadata_mode="all"):
        """Get node content with metadata."""

class ImageNode:
    """
    Node for image content.

    Args:
        image: Image data or path
        image_path: Path to image file
        image_url: URL to image
        text: Optional text description
        **kwargs: Additional arguments
    """
    def __init__(self, image=None, image_path=None, image_url=None, text=None, **kwargs): ...

class IndexNode:
    """
    Node that references other indices.

    Args:
        text: Node text content
        index_id: ID of referenced index
        **kwargs: Additional arguments
    """
    def __init__(self, text=None, index_id=None, **kwargs): ...
```

### Text Splitting

Various text splitting strategies for creating chunks from documents.

```python { .api }
class SentenceSplitter:
    """
    Sentence-based text splitter.

    Args:
        chunk_size: Maximum chunk size in tokens
        chunk_overlap: Overlap between chunks in tokens
        separator: Sentence separator pattern
        paragraph_separator: Paragraph separator
        chunking_tokenizer_fn: Custom tokenizer function
        secondary_chunking_regex: Secondary chunking pattern
    """
    def __init__(
        self,
        chunk_size=1024,
        chunk_overlap=200,
        separator=" ",
        paragraph_separator="\n\n\n",
        chunking_tokenizer_fn=None,
        secondary_chunking_regex="[^,.;。?!]+[,.;。?!]?",
    ): ...

    def split_text(self, text):
        """
        Split text into chunks.

        Args:
            text: Text to split

        Returns:
            List[str]: List of text chunks
        """

    def split_texts(self, texts):
        """Split multiple texts."""

class TokenTextSplitter:
    """
    Token-based text splitter.

    Args:
        chunk_size: Maximum chunk size in tokens
        chunk_overlap: Overlap between chunks in tokens
        separator: Token separator
        backup_separators: Fallback separators
        tokenizer: Tokenizer to use
    """
    def __init__(
        self,
        chunk_size=1024,
        chunk_overlap=200,
        separator=" ",
        backup_separators=["\n"],
        tokenizer=None,
    ): ...

    def split_text(self, text):
        """Split text using token counting."""

class CodeSplitter:
    """
    Code-aware text splitter that respects code structure.

    Args:
        language: Programming language for syntax awareness
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap between chunks
        max_chars: Maximum characters per chunk
    """
    def __init__(self, language="python", chunk_size=1024, chunk_overlap=200, max_chars=1500): ...

    def split_text(self, text):
        """Split code text preserving structure."""

class SemanticSplitterNodeParser:
    """
    Semantic-based text splitter using embeddings.

    Args:
        buffer_size: Buffer size for semantic analysis
        breakpoint_percentile_threshold: Threshold for breakpoint detection
        embed_model: Embedding model for semantic analysis
    """
    def __init__(self, buffer_size=1, breakpoint_percentile_threshold=95, embed_model=None): ...

    def get_nodes_from_documents(self, documents, show_progress=False):
        """
        Create nodes from documents using semantic splitting.

        Args:
            documents: List of documents to process
            show_progress: Whether to show progress

        Returns:
            List[TextNode]: List of semantic text nodes
        """

class SentenceWindowNodeParser:
    """
    Sentence window splitter for context-aware chunking.

    Args:
        window_size: Number of sentences per window
        window_metadata_key: Metadata key for window info
        original_text_metadata_key: Metadata key for original text
    """
    def __init__(self, window_size=3, window_metadata_key="window", original_text_metadata_key="original_text"): ...

    def get_nodes_from_documents(self, documents, show_progress=False):
        """Create windowed nodes with sentence context."""
```

### Node Processing

Transform and process nodes after creation.

```python { .api }
class NodeParser:
    """Base class for node parsers."""
    def get_nodes_from_documents(self, documents, show_progress=False):
        """
        Parse documents into nodes.

        Args:
            documents: List of documents
            show_progress: Whether to show progress

        Returns:
            List[BaseNode]: List of processed nodes
        """

class SimpleNodeParser(NodeParser):
    """
    Simple node parser with basic text splitting.

    Args:
        text_splitter: Text splitter to use
        include_metadata: Whether to include metadata
        include_prev_next_rel: Whether to include previous/next relationships
    """
    def __init__(self, text_splitter=None, include_metadata=True, include_prev_next_rel=True): ...

class HierarchicalNodeParser(NodeParser):
    """
    Hierarchical node parser for multi-level document structure.

    Args:
        node_parsers: List of node parsers for different levels
    """
    def __init__(self, node_parsers=None): ...
```

## Node Postprocessors

Process and filter nodes after retrieval.

```python { .api }
class SimilarityPostprocessor:
    """
    Filter nodes by similarity threshold.

    Args:
        similarity_cutoff: Minimum similarity score
    """
    def __init__(self, similarity_cutoff=0.7): ...

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Filter nodes by similarity."""

class KeywordNodePostprocessor:
    """
    Filter nodes by keyword presence.

    Args:
        required_keywords: Keywords that must be present
        exclude_keywords: Keywords that must not be present
    """
    def __init__(self, required_keywords=None, exclude_keywords=None): ...

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Filter nodes by keyword criteria."""

class LLMRerank:
    """
    Rerank nodes using LLM-based scoring.

    Args:
        llm: LLM to use for reranking
        top_n: Number of top nodes to return
    """
    def __init__(self, llm=None, top_n=5): ...

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Rerank nodes using LLM scoring."""
```

## Types

```python { .api }
from enum import Enum

class MetadataMode(str, Enum):
    """Metadata inclusion modes."""
    ALL = "all"
    EMBED = "embed"
    LLM = "llm"
    NONE = "none"

class NodeRelationship(str, Enum):
    """Node relationship types."""
    SOURCE = "SOURCE"
    PREVIOUS = "PREVIOUS"
    NEXT = "NEXT"
    PARENT = "PARENT"
    CHILD = "CHILD"
```