or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

agents-tools.mddocuments-nodes.mdevaluation.mdindex.mdindices.mdllms-embeddings.mdnode-parsers.mdpostprocessors.mdprompts.mdquery-engines.mdretrievers.mdsettings.mdstorage.md

node-parsers.mddocs/

0

# Node Parsers

1

2

Comprehensive text splitting, parsing, and preprocessing capabilities for transforming documents into nodes. Node parsers handle various content types including plain text, code, markdown, HTML, and JSON while supporting semantic chunking, hierarchical structures, and metadata preservation.

3

4

## Capabilities

5

6

### Base Parser Interfaces

7

8

Foundation interfaces for all node parsing operations, providing standardized document processing and node generation.

9

10

```python { .api }

11

class NodeParser:

12

"""

13

Base interface for node parsing operations.

14

15

Parameters:

16

- include_metadata: bool, whether to include metadata in parsed nodes

17

- include_prev_next_rel: bool, whether to include previous/next relationships

18

- callback_manager: Optional[CallbackManager], callback management system

19

"""

20

def __init__(

21

self,

22

include_metadata: bool = True,

23

include_prev_next_rel: bool = True,

24

callback_manager: Optional[CallbackManager] = None,

25

**kwargs

26

): ...

27

28

def get_nodes_from_documents(

29

self,

30

documents: Sequence[Document],

31

show_progress: bool = False,

32

**kwargs

33

) -> List[BaseNode]:

34

"""

35

Parse documents into nodes.

36

37

Parameters:

38

- documents: Sequence[Document], documents to parse

39

- show_progress: bool, whether to show parsing progress

40

41

Returns:

42

- List[BaseNode], parsed nodes from documents

43

"""

44

45

class TextSplitter:

46

"""

47

Base interface for text splitting operations.

48

49

Parameters:

50

- chunk_size: int, target size for text chunks

51

- chunk_overlap: int, overlap between adjacent chunks

52

- separator: str, separator used for splitting

53

- backup_separators: Optional[List[str]], fallback separators

54

"""

55

def __init__(

56

self,

57

chunk_size: int = 1024,

58

chunk_overlap: int = 200,

59

separator: str = " ",

60

backup_separators: Optional[List[str]] = None,

61

**kwargs

62

): ...

63

64

def split_text(self, text: str) -> List[str]:

65

"""

66

Split text into chunks.

67

68

Parameters:

69

- text: str, input text to split

70

71

Returns:

72

- List[str], list of text chunks

73

"""

74

75

def split_text_metadata_aware(self, text: str, metadata_str: str) -> List[str]:

76

"""

77

Split text while considering metadata length.

78

79

Parameters:

80

- text: str, input text to split

81

- metadata_str: str, metadata string to account for

82

83

Returns:

84

- List[str], list of text chunks accounting for metadata

85

"""

86

87

class MetadataAwareTextSplitter(TextSplitter):

88

"""

89

Text splitter that considers metadata length in chunk calculations.

90

"""

91

pass

92

```

93

94

### Sentence-Based Splitting

95

96

Advanced sentence-aware text splitting with configurable chunk sizes and overlap strategies.

97

98

```python { .api }

99

class SentenceSplitter(MetadataAwareTextSplitter):

100

"""

101

Sentence-aware text splitter for natural text boundaries.

102

103

Parameters:

104

- chunk_size: int, target chunk size in tokens/characters

105

- chunk_overlap: int, overlap between chunks in tokens/characters

106

- separator: str, primary separator for splitting

107

- paragraph_separator: str, separator for paragraphs

108

- secondary_chunking_regex: str, regex for secondary chunking

109

- tokenizer: Optional[Callable], tokenizer function for token counting

110

- chunking_tokenizer_fn: Optional[Callable], function for chunking tokenization

111

- split_long_sentences: bool, whether to split sentences longer than chunk_size

112

"""

113

def __init__(

114

self,

115

chunk_size: int = 1024,

116

chunk_overlap: int = 200,

117

separator: str = " ",

118

paragraph_separator: str = "\n\n\n",

119

secondary_chunking_regex: str = "[^,.;。?!]+[,.;。?!]?",

120

tokenizer: Optional[Callable] = None,

121

chunking_tokenizer_fn: Optional[Callable] = None,

122

split_long_sentences: bool = False,

123

**kwargs

124

): ...

125

```

126

127

### Token-Based Splitting

128

129

Precise token-level text splitting for applications requiring exact token count control.

130

131

```python { .api }

132

class TokenTextSplitter(MetadataAwareTextSplitter):

133

"""

134

Token-based text splitter for precise token count control.

135

136

Parameters:

137

- chunk_size: int, target chunk size in tokens

138

- chunk_overlap: int, overlap between chunks in tokens

139

- separator: str, separator for text splitting

140

- backup_separators: List[str], fallback separators

141

- tokenizer: Optional[Callable], tokenizer function

142

"""

143

def __init__(

144

self,

145

chunk_size: int = 1024,

146

chunk_overlap: int = 200,

147

separator: str = " ",

148

backup_separators: Optional[List[str]] = None,

149

tokenizer: Optional[Callable] = None,

150

**kwargs

151

): ...

152

```

153

154

### Semantic Splitting

155

156

Embedding-based semantic chunking that creates coherent content boundaries using similarity analysis.

157

158

```python { .api }

159

class SemanticSplitterNodeParser(NodeParser):

160

"""

161

Semantic-based node parser using embedding similarity for chunk boundaries.

162

163

Parameters:

164

- buffer_size: int, number of sentences in rolling window

165

- breakpoint_percentile_threshold: int, percentile threshold for breakpoints

166

- embed_model: Optional[BaseEmbedding], embedding model for similarity computation

167

- sentence_splitter: Optional[SentenceSplitter], sentence splitter for preprocessing

168

- original_text_metadata_key: str, metadata key for storing original text

169

"""

170

def __init__(

171

self,

172

buffer_size: int = 1,

173

breakpoint_percentile_threshold: int = 95,

174

embed_model: Optional[BaseEmbedding] = None,

175

sentence_splitter: Optional[SentenceSplitter] = None,

176

original_text_metadata_key: str = "original_text",

177

**kwargs

178

): ...

179

180

class SemanticDoubleMergingSplitterNodeParser(NodeParser):

181

"""

182

Advanced semantic splitter with double merging for optimal chunk coherence.

183

184

Parameters:

185

- max_chunk_size: int, maximum size for merged chunks

186

- merging_threshold: float, threshold for merging adjacent chunks

187

- embed_model: Optional[BaseEmbedding], embedding model for similarity

188

"""

189

def __init__(

190

self,

191

max_chunk_size: int = 2048,

192

merging_threshold: float = 0.5,

193

embed_model: Optional[BaseEmbedding] = None,

194

**kwargs

195

): ...

196

```

197

198

### Code-Aware Splitting

199

200

Specialized parser for source code with language-specific splitting and structure preservation.

201

202

```python { .api }

203

class CodeSplitter(TextSplitter):

204

"""

205

Code-aware text splitter supporting multiple programming languages.

206

207

Parameters:

208

- language: str, programming language (python, javascript, java, etc.)

209

- chunk_lines: int, target number of lines per chunk

210

- chunk_lines_overlap: int, overlap between chunks in lines

211

- max_chars: int, maximum characters per chunk

212

"""

213

def __init__(

214

self,

215

language: str = "python",

216

chunk_lines: int = 40,

217

chunk_lines_overlap: int = 15,

218

max_chars: int = 1500,

219

**kwargs

220

): ...

221

222

@classmethod

223

def get_separators_for_language(cls, language: str) -> List[str]:

224

"""Get language-specific separators for code splitting."""

225

```

226

227

### Sentence Window Parser

228

229

Parser that creates nodes with surrounding sentence context for enhanced retrieval accuracy.

230

231

```python { .api }

232

class SentenceWindowNodeParser(NodeParser):

233

"""

234

Parser creating nodes with configurable sentence window context.

235

236

Parameters:

237

- sentence_splitter: Optional[SentenceSplitter], sentence splitter for preprocessing

238

- window_size: int, number of sentences before and after target sentence

239

- window_metadata_key: str, metadata key for storing window content

240

- original_text_metadata_key: str, metadata key for original text

241

"""

242

def __init__(

243

self,

244

sentence_splitter: Optional[SentenceSplitter] = None,

245

window_size: int = 3,

246

window_metadata_key: str = "window",

247

original_text_metadata_key: str = "original_text",

248

**kwargs

249

): ...

250

```

251

252

### File Format Parsers

253

254

Specialized parsers for various file formats with structure-aware processing.

255

256

```python { .api }

257

class SimpleFileNodeParser(NodeParser):

258

"""

259

Simple file-based node parser for basic document processing.

260

261

Parameters:

262

- text_splitter: Optional[TextSplitter], text splitter for chunking

263

"""

264

def __init__(

265

self,

266

text_splitter: Optional[TextSplitter] = None,

267

**kwargs

268

): ...

269

270

class HTMLNodeParser(NodeParser):

271

"""

272

HTML document parser with tag-aware processing.

273

274

Parameters:

275

- tags: List[str], HTML tags to extract content from

276

- text_splitter: Optional[TextSplitter], text splitter for chunking

277

"""

278

def __init__(

279

self,

280

tags: Optional[List[str]] = None,

281

text_splitter: Optional[TextSplitter] = None,

282

**kwargs

283

): ...

284

285

class MarkdownNodeParser(NodeParser):

286

"""

287

Markdown document parser preserving structure and hierarchy.

288

289

Parameters:

290

- text_splitter: Optional[TextSplitter], text splitter for chunking

291

"""

292

def __init__(

293

self,

294

text_splitter: Optional[TextSplitter] = None,

295

**kwargs

296

): ...

297

298

class JSONNodeParser(NodeParser):

299

"""

300

JSON document parser for structured data processing.

301

302

Parameters:

303

- text_splitter: Optional[TextSplitter], text splitter for text fields

304

"""

305

def __init__(

306

self,

307

text_splitter: Optional[TextSplitter] = None,

308

**kwargs

309

): ...

310

```

311

312

### Hierarchical Parsing

313

314

Advanced parsers for creating hierarchical node structures with parent-child relationships.

315

316

```python { .api }

317

class HierarchicalNodeParser(NodeParser):

318

"""

319

Parser creating hierarchical node structures with configurable levels.

320

321

Parameters:

322

- node_parser: Optional[NodeParser], base parser for node creation

323

- hierarchical_separator: str, separator defining hierarchy levels

324

- get_windows_from_nodes: Optional[Callable], function to extract windows from nodes

325

- window_metadata_key: str, metadata key for window content

326

"""

327

def __init__(

328

self,

329

node_parser: Optional[NodeParser] = None,

330

hierarchical_separator: str = "\n\n",

331

get_windows_from_nodes: Optional[Callable] = None,

332

window_metadata_key: str = "window",

333

**kwargs

334

): ...

335

336

class MarkdownElementNodeParser(NodeParser):

337

"""

338

Markdown parser creating nodes based on document elements and structure.

339

340

Parameters:

341

- llm: Optional[LLM], language model for element classification

342

- num_workers: int, number of worker processes for parallel processing

343

"""

344

def __init__(

345

self,

346

llm: Optional[LLM] = None,

347

num_workers: int = 4,

348

**kwargs

349

): ...

350

351

class UnstructuredElementNodeParser(NodeParser):

352

"""

353

Parser for unstructured documents using element detection and classification.

354

355

Parameters:

356

- api_key: Optional[str], API key for unstructured service

357

- url: Optional[str], URL for unstructured service endpoint

358

- fast_mode: bool, whether to use fast processing mode

359

"""

360

def __init__(

361

self,

362

api_key: Optional[str] = None,

363

url: Optional[str] = None,

364

fast_mode: bool = True,

365

**kwargs

366

): ...

367

```

368

369

### Integration Parsers

370

371

Parsers for integrating with external services and third-party tools.

372

373

```python { .api }

374

class LlamaParseJsonNodeParser(NodeParser):

375

"""

376

Node parser integrating with LlamaParse service for advanced document processing.

377

378

Parameters:

379

- api_key: str, API key for LlamaParse service

380

- base_url: Optional[str], base URL for LlamaParse API

381

- verbose: bool, whether to enable verbose logging

382

"""

383

def __init__(

384

self,

385

api_key: str,

386

base_url: Optional[str] = None,

387

verbose: bool = True,

388

**kwargs

389

): ...

390

391

class LangchainNodeParser(NodeParser):

392

"""

393

Integration wrapper for Langchain text splitter compatibility.

394

395

Parameters:

396

- lc_splitter: Any, Langchain text splitter instance

397

"""

398

def __init__(self, lc_splitter: Any, **kwargs): ...

399

```

400

401

### Language Configuration

402

403

Configuration system for language-specific parsing behavior and optimization.

404

405

```python { .api }

406

class LanguageConfig:

407

"""

408

Language-specific configuration for parsing operations.

409

410

Parameters:

411

- language: str, language identifier (en, es, fr, etc.)

412

- spacy_model: Optional[str], spaCy model name for language

413

- punkt_model: Optional[str], NLTK Punkt model for sentence segmentation

414

"""

415

def __init__(

416

self,

417

language: str = "en",

418

spacy_model: Optional[str] = None,

419

punkt_model: Optional[str] = None

420

): ...

421

```

422

423

### Utility Functions

424

425

Helper functions for working with hierarchical node structures and relationships.

426

427

```python { .api }

428

def get_leaf_nodes(nodes: List[BaseNode]) -> List[BaseNode]:

429

"""

430

Extract leaf nodes from a hierarchical node structure.

431

432

Parameters:

433

- nodes: List[BaseNode], hierarchical node list

434

435

Returns:

436

- List[BaseNode], leaf nodes without children

437

"""

438

439

def get_root_nodes(nodes: List[BaseNode]) -> List[BaseNode]:

440

"""

441

Extract root nodes from a hierarchical node structure.

442

443

Parameters:

444

- nodes: List[BaseNode], hierarchical node list

445

446

Returns:

447

- List[BaseNode], root nodes without parents

448

"""

449

450

def get_child_nodes(

451

nodes: List[BaseNode],

452

all_nodes: List[BaseNode]

453

) -> Dict[str, List[BaseNode]]:

454

"""

455

Get mapping of parent nodes to their children.

456

457

Parameters:

458

- nodes: List[BaseNode], parent nodes

459

- all_nodes: List[BaseNode], complete node collection

460

461

Returns:

462

- Dict[str, List[BaseNode]], mapping of parent ID to child nodes

463

"""

464

465

def get_deeper_nodes(

466

nodes: List[BaseNode],

467

depth: int = 1

468

) -> List[BaseNode]:

469

"""

470

Get nodes at specified depth level in hierarchy.

471

472

Parameters:

473

- nodes: List[BaseNode], node collection

474

- depth: int, target depth level

475

476

Returns:

477

- List[BaseNode], nodes at specified depth

478

"""

479

```

480

481

## Usage Examples

482

483

### Basic Text Splitting

484

485

```python

486

from llama_index.core.node_parser import SentenceSplitter

487

from llama_index.core import Document

488

489

# Create documents

490

documents = [

491

Document(text="Machine learning is a subset of artificial intelligence. It focuses on algorithms that learn from data. Deep learning uses neural networks with multiple layers."),

492

Document(text="Natural language processing helps computers understand human language. It involves tokenization, parsing, and semantic analysis.")

493

]

494

495

# Initialize sentence splitter

496

splitter = SentenceSplitter(

497

chunk_size=512,

498

chunk_overlap=50,

499

separator=" "

500

)

501

502

# Parse documents into nodes

503

nodes = splitter.get_nodes_from_documents(documents, show_progress=True)

504

505

print(f"Created {len(nodes)} nodes")

506

for i, node in enumerate(nodes):
    print(f"Node {i}: {len(node.text)} characters")

508

```

509

510

### Semantic Chunking

511

512

```python

513

from llama_index.core.node_parser import SemanticSplitterNodeParser

514

from llama_index.core.embeddings import MockEmbedding

515

516

# Initialize semantic splitter with embedding model

517

embed_model = MockEmbedding(embed_dim=384)

518

semantic_splitter = SemanticSplitterNodeParser(

519

buffer_size=1,

520

breakpoint_percentile_threshold=95,

521

embed_model=embed_model

522

)

523

524

# Parse with semantic boundaries

525

nodes = semantic_splitter.get_nodes_from_documents(documents)

526

527

print("Semantic chunks:")

528

for i, node in enumerate(nodes):
    print(f"Chunk {i}: {node.text[:100]}...")

530

```

531

532

### Code Splitting

533

534

```python

535

from llama_index.core.node_parser import CodeSplitter

536

537

# Python code document

538

code_doc = Document(text="""
def factorial(n):
    if n <= 1:
        return 1
    return n * factorial(n - 1)

class Calculator:
    def add(self, a, b):
        return a + b

    def multiply(self, a, b):
        return a * b

def main():
    calc = Calculator()
    print(calc.add(5, 3))
    print(factorial(5))

if __name__ == "__main__":
    main()
""")

559

560

# Code-aware splitter

561

code_splitter = CodeSplitter(

562

language="python",

563

chunk_lines=10,

564

chunk_lines_overlap=2,

565

max_chars=500

566

)

567

568

# Parse code into structured chunks

569

code_nodes = code_splitter.get_nodes_from_documents([code_doc])

570

571

print("Code chunks:")

572

for i, node in enumerate(code_nodes):
    print(f"Chunk {i}:\n{node.text}\n{'-'*40}")

574

```

575

576

### Markdown Processing

577

578

```python

579

from llama_index.core.node_parser import MarkdownNodeParser

580

581

# Markdown document

582

markdown_doc = Document(text="""

583

# Machine Learning Guide

584

585

## Introduction

586

Machine learning is a powerful subset of artificial intelligence.

587

588

### Supervised Learning

589

- Classification

590

- Regression

591

592

### Unsupervised Learning

593

- Clustering

594

- Dimensionality Reduction

595

596

## Deep Learning

597

Deep learning uses neural networks with multiple layers.

598

599

### Neural Networks

600

Neural networks are inspired by biological neurons.

601

""")

602

603

# Markdown-aware parser

604

markdown_parser = MarkdownNodeParser()

605

markdown_nodes = markdown_parser.get_nodes_from_documents([markdown_doc])

606

607

print("Markdown nodes:")

608

for i, node in enumerate(markdown_nodes):
    print(f"Node {i}: {node.text[:50]}...")
    print(f"Metadata: {node.metadata}")

611

```

612

613

### Hierarchical Parsing

614

615

```python

616

from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes, get_root_nodes

617

618

# Initialize hierarchical parser

619

hierarchical_parser = HierarchicalNodeParser(

620

node_parser=SentenceSplitter(chunk_size=256),

621

hierarchical_separator="\n\n"

622

)

623

624

# Create hierarchical structure

625

hierarchical_nodes = hierarchical_parser.get_nodes_from_documents(documents)

626

627

# Extract different levels

628

leaf_nodes = get_leaf_nodes(hierarchical_nodes)

629

root_nodes = get_root_nodes(hierarchical_nodes)

630

631

print(f"Total nodes: {len(hierarchical_nodes)}")

632

print(f"Leaf nodes: {len(leaf_nodes)}")

633

print(f"Root nodes: {len(root_nodes)}")

634

```

635

636

### Sentence Window Context

637

638

```python

639

from llama_index.core.node_parser import SentenceWindowNodeParser

640

641

# Initialize sentence window parser

642

window_parser = SentenceWindowNodeParser(

643

window_size=3,

644

window_metadata_key="window",

645

original_text_metadata_key="original_text"

646

)

647

648

# Parse with sentence context

649

windowed_nodes = window_parser.get_nodes_from_documents(documents)

650

651

print("Windowed nodes:")

652

for i, node in enumerate(windowed_nodes):
    print(f"Node {i}:")
    print(f" Text: {node.text}")
    print(f" Window: {node.metadata.get('window', 'N/A')}")
    print(f" Original: {node.metadata.get('original_text', 'N/A')[:50]}...")

657

```

658

659

## Types & Configuration

660

661

```python { .api }

662

# Legacy alias for backward compatibility

663

SimpleNodeParser = SentenceSplitter

664

665

# Language configuration options

666

SUPPORTED_LANGUAGES = [

667

"python", "javascript", "typescript", "java", "cpp", "c",

668

"csharp", "php", "ruby", "go", "rust", "kotlin", "swift"

669

]

670

671

# Metadata keys used by parsers

672

DEFAULT_WINDOW_METADATA_KEY = "window"

673

DEFAULT_ORIGINAL_TEXT_METADATA_KEY = "original_text"

674

DEFAULT_SUB_DOCS_KEY = "sub_docs"

675

```