CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-llama-index-core

Interface between LLMs and your data

Pending
Overview
Eval results
Files

node-parsers.mddocs/

Node Parsers

Comprehensive text splitting, parsing, and preprocessing capabilities for transforming documents into nodes. Node parsers handle various content types including plain text, code, markdown, HTML, and JSON while supporting semantic chunking, hierarchical structures, and metadata preservation.

Capabilities

Base Parser Interfaces

Foundation interfaces for all node parsing operations, providing standardized document processing and node generation.

class NodeParser:
    """
    Abstract base for all node parsing operations.

    Parameters:
    - include_metadata: bool, propagate document metadata onto parsed nodes
    - include_prev_next_rel: bool, record previous/next relationships between nodes
    - callback_manager: Optional[CallbackManager], hook manager for parsing events
    """
    def __init__(
        self,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        callback_manager: Optional[CallbackManager] = None,
        **kwargs
    ): ...
    
    def get_nodes_from_documents(
        self,
        documents: Sequence[Document],
        show_progress: bool = False,
        **kwargs
    ) -> List[BaseNode]:
        """
        Transform a sequence of documents into nodes.
        
        Parameters:
        - documents: Sequence[Document], input documents to process
        - show_progress: bool, display a progress indicator while parsing
        
        Returns:
        - List[BaseNode], the nodes produced from the documents
        """

class TextSplitter:
    """
    Abstract base for splitting raw text into chunks.

    Parameters:
    - chunk_size: int, target size of each produced chunk
    - chunk_overlap: int, amount of content shared by adjacent chunks
    - separator: str, primary separator used when splitting
    - backup_separators: Optional[List[str]], separators tried when the primary fails
    """
    def __init__(
        self,
        chunk_size: int = 1024,
        chunk_overlap: int = 200,
        separator: str = " ",
        backup_separators: Optional[List[str]] = None,
        **kwargs
    ): ...
    
    def split_text(self, text: str) -> List[str]:
        """
        Break a string into chunks.
        
        Parameters:
        - text: str, the text to break up
        
        Returns:
        - List[str], the resulting chunks
        """
        
    def split_text_metadata_aware(self, text: str, metadata_str: str) -> List[str]:
        """
        Break a string into chunks, reserving room for its metadata.
        
        Parameters:
        - text: str, the text to break up
        - metadata_str: str, serialized metadata whose length is budgeted for
        
        Returns:
        - List[str], chunks sized with the metadata length taken into account
        """

class MetadataAwareTextSplitter(TextSplitter):
    """
    Text splitter whose chunk-size budget accounts for serialized metadata length.
    """
    pass

Sentence-Based Splitting

Advanced sentence-aware text splitting with configurable chunk sizes and overlap strategies.

class SentenceSplitter(MetadataAwareTextSplitter):
    """
    Sentence-aware text splitter for natural text boundaries.
    
    Parameters:
    - chunk_size: int, target chunk size in tokens/characters
    - chunk_overlap: int, overlap between chunks in tokens/characters
    - separator: str, primary separator for splitting
    - paragraph_separator: str, separator for paragraphs (default: three newlines)
    - secondary_chunking_regex: str, regex for secondary chunking
    - tokenizer: Optional[Callable], tokenizer function for token counting
    - chunking_tokenizer_fn: Optional[Callable], function for chunking tokenization
    - split_long_sentences: bool, whether to split sentences longer than chunk_size
    """
    def __init__(
        self,
        chunk_size: int = 1024,
        chunk_overlap: int = 200,
        separator: str = " ",
        # Real newline characters: the doubled backslashes ("\\n\\n\\n") in the
        # previous revision denoted literal backslash-n text, not blank lines.
        paragraph_separator: str = "\n\n\n",
        secondary_chunking_regex: str = "[^,.;。?!]+[,.;。?!]?",
        tokenizer: Optional[Callable] = None,
        chunking_tokenizer_fn: Optional[Callable] = None,
        split_long_sentences: bool = False,
        **kwargs
    ): ...

Token-Based Splitting

Precise token-level text splitting for applications requiring exact token count control.

class TokenTextSplitter(MetadataAwareTextSplitter):
    """
    Splitter that divides text strictly by token count.

    Parameters:
    - chunk_size: int, number of tokens per chunk
    - chunk_overlap: int, tokens shared by adjacent chunks
    - separator: str, primary separator used when splitting
    - backup_separators: List[str], separators tried when the primary fails
    - tokenizer: Optional[Callable], token-counting function
    """
    def __init__(
        self,
        chunk_size: int = 1024,
        chunk_overlap: int = 200,
        separator: str = " ",
        backup_separators: Optional[List[str]] = None,
        tokenizer: Optional[Callable] = None,
        **kwargs
    ): ...

Semantic Splitting

Embedding-based semantic chunking that creates coherent content boundaries using similarity analysis.

class SemanticSplitterNodeParser(NodeParser):
    """
    Node parser that places chunk boundaries where embedding similarity drops.

    Parameters:
    - buffer_size: int, sentences grouped per rolling window
    - breakpoint_percentile_threshold: int, dissimilarity percentile marking a breakpoint
    - embed_model: Optional[BaseEmbedding], model used to embed sentence groups
    - sentence_splitter: Optional[SentenceSplitter], splitter used to segment sentences first
    - original_text_metadata_key: str, metadata key holding the unmodified source text
    """
    def __init__(
        self,
        buffer_size: int = 1,
        breakpoint_percentile_threshold: int = 95,
        embed_model: Optional[BaseEmbedding] = None,
        sentence_splitter: Optional[SentenceSplitter] = None,
        original_text_metadata_key: str = "original_text",
        **kwargs
    ): ...

class SemanticDoubleMergingSplitterNodeParser(NodeParser):
    """
    Semantic splitter that runs a second merge pass to maximize chunk coherence.

    Parameters:
    - max_chunk_size: int, upper bound on the size of a merged chunk
    - merging_threshold: float, similarity score required to merge neighboring chunks
    - embed_model: Optional[BaseEmbedding], model supplying the similarity scores
    """
    def __init__(
        self,
        max_chunk_size: int = 2048,
        merging_threshold: float = 0.5,
        embed_model: Optional[BaseEmbedding] = None,
        **kwargs
    ): ...

Code-Aware Splitting

Specialized parser for source code with language-specific splitting and structure preservation.

class CodeSplitter(TextSplitter):
    """
    Splitter for source code that respects language structure.

    Parameters:
    - language: str, source language identifier (e.g. "python", "javascript", "java")
    - chunk_lines: int, lines targeted per chunk
    - chunk_lines_overlap: int, lines shared between consecutive chunks
    - max_chars: int, hard character cap for any single chunk
    """
    def __init__(
        self,
        language: str = "python",
        chunk_lines: int = 40,
        chunk_lines_overlap: int = 15,
        max_chars: int = 1500,
        **kwargs
    ): ...
    
    @classmethod
    def get_separators_for_language(cls, language: str) -> List[str]:
        """Return the separator strings appropriate for the given language."""

Sentence Window Parser

Parser that creates nodes with surrounding sentence context for enhanced retrieval accuracy.

class SentenceWindowNodeParser(NodeParser):
    """
    Parser that emits sentence-level nodes, attaching neighboring sentences as context.

    Parameters:
    - sentence_splitter: Optional[SentenceSplitter], splitter used to segment sentences
    - window_size: int, sentences captured on each side of the target sentence
    - window_metadata_key: str, metadata key under which the window text is stored
    - original_text_metadata_key: str, metadata key under which the source sentence is stored
    """
    def __init__(
        self,
        sentence_splitter: Optional[SentenceSplitter] = None,
        window_size: int = 3,
        window_metadata_key: str = "window",
        original_text_metadata_key: str = "original_text",
        **kwargs
    ): ...

File Format Parsers

Specialized parsers for various file formats with structure-aware processing.

class SimpleFileNodeParser(NodeParser):
    """
    Minimal file-oriented parser for straightforward document processing.

    Parameters:
    - text_splitter: Optional[TextSplitter], splitter applied when chunking content
    """
    def __init__(
        self,
        text_splitter: Optional[TextSplitter] = None,
        **kwargs
    ): ...

class HTMLNodeParser(NodeParser):
    """
    Parser for HTML documents that extracts content from selected tags.

    Parameters:
    - tags: List[str], HTML tags whose content should be extracted
    - text_splitter: Optional[TextSplitter], splitter applied when chunking content
    """
    def __init__(
        self,
        tags: Optional[List[str]] = None,
        text_splitter: Optional[TextSplitter] = None,
        **kwargs
    ): ...

class MarkdownNodeParser(NodeParser):
    """
    Parser for Markdown documents that keeps heading structure and hierarchy intact.

    Parameters:
    - text_splitter: Optional[TextSplitter], splitter applied when chunking content
    """
    def __init__(
        self,
        text_splitter: Optional[TextSplitter] = None,
        **kwargs
    ): ...

class JSONNodeParser(NodeParser):
    """
    Parser for JSON documents containing structured data.

    Parameters:
    - text_splitter: Optional[TextSplitter], splitter applied to textual fields
    """
    def __init__(
        self,
        text_splitter: Optional[TextSplitter] = None,
        **kwargs
    ): ...

Hierarchical Parsing

Advanced parsers for creating hierarchical node structures with parent-child relationships.

class HierarchicalNodeParser(NodeParser):
    """
    Parser creating hierarchical node structures with configurable levels.
    
    Parameters:
    - node_parser: Optional[NodeParser], base parser for node creation
    - hierarchical_separator: str, separator defining hierarchy levels
    - get_windows_from_nodes: Optional[Callable], function to extract windows from nodes
    - window_metadata_key: str, metadata key for window content

    NOTE(review): upstream llama-index configures HierarchicalNodeParser via
    chunk sizes / per-level parsers; the window-related parameters here look
    copied from SentenceWindowNodeParser — confirm against the installed version.
    """
    def __init__(
        self,
        node_parser: Optional[NodeParser] = None,
        # Real blank-line separator: the doubled backslashes ("\\n\\n") in the
        # previous revision denoted literal backslash-n text, not newlines.
        hierarchical_separator: str = "\n\n",
        get_windows_from_nodes: Optional[Callable] = None,
        window_metadata_key: str = "window",
        **kwargs
    ): ...

class MarkdownElementNodeParser(NodeParser):
    """
    Parser that splits Markdown into element-level nodes based on document structure.

    Parameters:
    - llm: Optional[LLM], language model used to classify elements
    - num_workers: int, worker processes used for parallel element processing
    """
    def __init__(
        self,
        llm: Optional[LLM] = None,
        num_workers: int = 4,
        **kwargs
    ): ...

class UnstructuredElementNodeParser(NodeParser):
    """
    Parser for unstructured documents backed by element detection and classification.

    Parameters:
    - api_key: Optional[str], credential for the unstructured service
    - url: Optional[str], endpoint of the unstructured service
    - fast_mode: bool, use the faster processing mode when True
    """
    def __init__(
        self,
        api_key: Optional[str] = None,
        url: Optional[str] = None,
        fast_mode: bool = True,
        **kwargs
    ): ...

Integration Parsers

Parsers for integrating with external services and third-party tools.

class LlamaParseJsonNodeParser(NodeParser):
    """
    Node parser backed by the LlamaParse service for advanced document processing.

    Parameters:
    - api_key: str, LlamaParse credential (required)
    - base_url: Optional[str], override for the LlamaParse API endpoint
    - verbose: bool, emit detailed logging when True
    """
    def __init__(
        self,
        api_key: str,
        base_url: Optional[str] = None,
        verbose: bool = True,
        **kwargs
    ): ...

class LangchainNodeParser(NodeParser):
    """
    Adapter exposing a Langchain text splitter behind the NodeParser interface.

    Parameters:
    - lc_splitter: Any, the Langchain splitter instance to delegate to
    """
    def __init__(self, lc_splitter: Any, **kwargs): ...

Language Configuration

Configuration system for language-specific parsing behavior and optimization.

class LanguageConfig:
    """
    Per-language settings controlling sentence-segmentation backends.

    Parameters:
    - language: str, language code such as "en", "es", or "fr"
    - spacy_model: Optional[str], spaCy model name to load for this language
    - punkt_model: Optional[str], NLTK Punkt model used for sentence boundaries
    """
    def __init__(
        self,
        language: str = "en",
        spacy_model: Optional[str] = None,
        punkt_model: Optional[str] = None
    ): ...

Utility Functions

Helper functions for working with hierarchical node structures and relationships.

def get_leaf_nodes(nodes: List[BaseNode]) -> List[BaseNode]:
    """
    Return the nodes that have no child relationships.

    Parameters:
    - nodes: List[BaseNode], nodes forming a hierarchy

    Returns:
    - List[BaseNode], nodes with no children (the leaves)
    """

def get_root_nodes(nodes: List[BaseNode]) -> List[BaseNode]:
    """
    Return the nodes that have no parent relationship.

    Parameters:
    - nodes: List[BaseNode], nodes forming a hierarchy

    Returns:
    - List[BaseNode], top-level (root) nodes
    """

def get_child_nodes(
    nodes: List[BaseNode],
    all_nodes: List[BaseNode]
) -> Dict[str, List[BaseNode]]:
    """
    Map each parent node's ID to the children it owns.

    Parameters:
    - nodes: List[BaseNode], candidate parent nodes
    - all_nodes: List[BaseNode], full node collection searched for children

    Returns:
    - Dict[str, List[BaseNode]], parent node ID -> that parent's child nodes
    """

def get_deeper_nodes(
    nodes: List[BaseNode],
    depth: int = 1
) -> List[BaseNode]:
    """
    Return the nodes found at the given depth of the hierarchy.

    Parameters:
    - nodes: List[BaseNode], nodes forming a hierarchy
    - depth: int, target depth level to select

    Returns:
    - List[BaseNode], the nodes sitting at that depth
    """

Usage Examples

Basic Text Splitting

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document

# Create documents
# (this `documents` list is reused by the semantic, hierarchical, and
# sentence-window examples later on this page)
documents = [
    Document(text="Machine learning is a subset of artificial intelligence. It focuses on algorithms that learn from data. Deep learning uses neural networks with multiple layers."),
    Document(text="Natural language processing helps computers understand human language. It involves tokenization, parsing, and semantic analysis.")
]

# Initialize sentence splitter
# chunk_size/chunk_overlap are in tokens/characters (see SentenceSplitter above);
# overlap must stay smaller than chunk_size
splitter = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=50,
    separator=" "
)

# Parse documents into nodes
nodes = splitter.get_nodes_from_documents(documents, show_progress=True)

print(f"Created {len(nodes)} nodes")
for i, node in enumerate(nodes):
    print(f"Node {i}: {len(node.text)} characters")

Semantic Chunking

from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.embeddings import MockEmbedding

# Initialize semantic splitter with embedding model
# MockEmbedding (a mock, per its name) stands in for a real embedding model here;
# substitute a production embedding model for meaningful semantic boundaries
embed_model = MockEmbedding(embed_dim=384)
semantic_splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model
)

# Parse with semantic boundaries
# (`documents` is defined in the Basic Text Splitting example above)
nodes = semantic_splitter.get_nodes_from_documents(documents)

print("Semantic chunks:")
for i, node in enumerate(nodes):
    print(f"Chunk {i}: {node.text[:100]}...")

Code Splitting

from llama_index.core.node_parser import CodeSplitter
from llama_index.core import Document  # imported here so the example is self-contained

# Python code document
code_doc = Document(text="""
def factorial(n):
    if n <= 1:
        return 1
    return n * factorial(n - 1)

class Calculator:
    def add(self, a, b):
        return a + b
    
    def multiply(self, a, b):
        return a * b

def main():
    calc = Calculator()
    print(calc.add(5, 3))
    print(factorial(5))

if __name__ == "__main__":
    main()
""")

# Code-aware splitter
code_splitter = CodeSplitter(
    language="python",
    chunk_lines=10,
    chunk_lines_overlap=2,
    max_chars=500
)

# Parse code into structured chunks
code_nodes = code_splitter.get_nodes_from_documents([code_doc])

print("Code chunks:")
for i, node in enumerate(code_nodes):
    # "\n" (not the escaped "\\n" from the previous revision) so the chunk
    # text and divider actually print on separate lines
    print(f"Chunk {i}:\n{node.text}\n{'-'*40}")

Markdown Processing

from llama_index.core.node_parser import MarkdownNodeParser

# Markdown document
# (`Document` comes from the `from llama_index.core import Document` import
# in the Basic Text Splitting example above)
markdown_doc = Document(text="""
# Machine Learning Guide

## Introduction
Machine learning is a powerful subset of artificial intelligence.

### Supervised Learning
- Classification
- Regression

### Unsupervised Learning
- Clustering
- Dimensionality Reduction

## Deep Learning
Deep learning uses neural networks with multiple layers.

### Neural Networks
Neural networks are inspired by biological neurons.
""")

# Markdown-aware parser
markdown_parser = MarkdownNodeParser()
markdown_nodes = markdown_parser.get_nodes_from_documents([markdown_doc])

print("Markdown nodes:")
for i, node in enumerate(markdown_nodes):
    print(f"Node {i}: {node.text[:50]}...")
    # node.metadata is printed to show what the parser records per node
    print(f"Metadata: {node.metadata}")

Hierarchical Parsing

# SentenceSplitter added to the import so this snippet stands on its own
from llama_index.core.node_parser import HierarchicalNodeParser, SentenceSplitter, get_leaf_nodes, get_root_nodes

# Initialize hierarchical parser
hierarchical_parser = HierarchicalNodeParser(
    node_parser=SentenceSplitter(chunk_size=256),
    # real blank-line separator — the escaped "\\n\\n" of the previous
    # revision would have matched literal backslash-n text
    hierarchical_separator="\n\n"
)

# Create hierarchical structure
# (`documents` is defined in the Basic Text Splitting example above)
hierarchical_nodes = hierarchical_parser.get_nodes_from_documents(documents)

# Extract different levels
leaf_nodes = get_leaf_nodes(hierarchical_nodes)
root_nodes = get_root_nodes(hierarchical_nodes)

print(f"Total nodes: {len(hierarchical_nodes)}")
print(f"Leaf nodes: {len(leaf_nodes)}")
print(f"Root nodes: {len(root_nodes)}")

Sentence Window Context

from llama_index.core.node_parser import SentenceWindowNodeParser

# Initialize sentence window parser
# window_size=3 captures three sentences on each side of the target sentence;
# the window and original sentence are stored under the given metadata keys
window_parser = SentenceWindowNodeParser(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text"
)

# Parse with sentence context
# (`documents` is defined in the Basic Text Splitting example above)
windowed_nodes = window_parser.get_nodes_from_documents(documents)

print("Windowed nodes:")
for i, node in enumerate(windowed_nodes):
    print(f"Node {i}:")
    print(f"  Text: {node.text}")
    print(f"  Window: {node.metadata.get('window', 'N/A')}")
    print(f"  Original: {node.metadata.get('original_text', 'N/A')[:50]}...")

Types & Configuration

# Legacy alias for backward compatibility
SimpleNodeParser = SentenceSplitter

# Programming languages accepted by CodeSplitter's `language` parameter
# (distinct from the natural-language codes used by LanguageConfig)
SUPPORTED_LANGUAGES = [
    "python", "javascript", "typescript", "java", "cpp", "c", 
    "csharp", "php", "ruby", "go", "rust", "kotlin", "swift"
]

# Metadata keys used by parsers
DEFAULT_WINDOW_METADATA_KEY = "window"
DEFAULT_ORIGINAL_TEXT_METADATA_KEY = "original_text"
DEFAULT_SUB_DOCS_KEY = "sub_docs"

Install with Tessl CLI

npx tessl i tessl/pypi-llama-index-core

docs

agents-tools.md

documents-nodes.md

evaluation.md

index.md

indices.md

llms-embeddings.md

node-parsers.md

postprocessors.md

prompts.md

query-engines.md

retrievers.md

settings.md

storage.md

tile.json