# Interface between LLMs and your data for building retrieval-augmented
# generation (RAG) applications.
#
# This module provides document loading, parsing, and chunking for various
# file formats, with intelligent text-splitting strategies and node creation.
# The readers below load documents from various sources and file formats.
class SimpleDirectoryReader:
    """Simple directory reader for loading documents from the filesystem.

    Args:
        input_dir: Directory path to read documents from.
        input_files: List of specific files to load.
        exclude_hidden: Whether to exclude hidden files.
        errors: How to handle errors ("ignore", "strict").
        recursive: Whether to read directories recursively.
        encoding: Text encoding to use when decoding files.
        filename_as_id: Use the filename as the document ID.
        required_exts: List of required file extensions.
        file_extractor: Custom file extractors (presumably keyed by file
            extension — verify against the implementation).
        num_files_limit: Maximum number of files to load.
        **kwargs: Additional arguments.
    """

    def __init__(
        self,
        input_dir=None,
        input_files=None,
        exclude_hidden=True,
        errors="ignore",
        recursive=True,
        encoding="utf-8",
        filename_as_id=False,
        required_exts=None,
        file_extractor=None,
        num_files_limit=None,
        **kwargs
    ): ...

    def load_data(self, show_progress=False):
        """Load documents from the configured sources.

        Args:
            show_progress: Whether to show loading progress.

        Returns:
            List[Document]: List of loaded documents.
        """

    def iter_data(self, show_progress=False):
        """Iterate over documents lazily, without loading all into memory.

        Args:
            show_progress: Whether to show loading progress.
        """
def download_loader(loader_class):
    """Download a community loader from LlamaHub.

    Args:
        loader_class: Name of the loader class to download.

    Returns:
        class: Loader class ready for instantiation.
    """


# Core document and node representations.
class Document:
    """Document object for storing text and metadata.

    Args:
        text: Document text content.
        metadata: Dictionary of metadata.
        excluded_embed_metadata_keys: Metadata keys to exclude when embedding.
        excluded_llm_metadata_keys: Metadata keys to exclude from LLM context.
        relationships: Relationships to other documents.
        **kwargs: Additional arguments.
    """

    def __init__(
        self,
        text=None,
        metadata=None,
        excluded_embed_metadata_keys=None,
        excluded_llm_metadata_keys=None,
        relationships=None,
        **kwargs
    ): ...

    @property
    def text(self):
        """Document text content."""

    @property
    def metadata(self):
        """Document metadata dictionary."""

    def get_content(self, metadata_mode="all"):
        """Get document content with optional metadata included.

        Args:
            metadata_mode: How to include metadata
                ("all", "embed", "llm", "none").

        Returns:
            str: Document content with metadata.
        """
class TextNode:
    """Text node representing a chunk of document content.

    Args:
        text: Node text content.
        metadata: Node metadata dictionary.
        relationships: Relationships to other nodes.
        **kwargs: Additional arguments.
    """

    def __init__(self, text=None, metadata=None, relationships=None, **kwargs): ...

    @property
    def text(self):
        """Node text content."""

    @property
    def metadata(self):
        """Node metadata dictionary."""

    def get_content(self, metadata_mode="all"):
        """Get node content with metadata included per ``metadata_mode``."""
class ImageNode:
    """Node for image content.

    Args:
        image: Image data or path.
        image_path: Path to an image file.
        image_url: URL to an image.
        text: Optional text description of the image.
        **kwargs: Additional arguments.
    """

    def __init__(self, image=None, image_path=None, image_url=None, text=None, **kwargs): ...
class IndexNode:
    """Node that references another index.

    Args:
        text: Node text content.
        index_id: ID of the referenced index.
        **kwargs: Additional arguments.
    """

    def __init__(self, text=None, index_id=None, **kwargs): ...


# Various text splitting strategies for creating chunks from documents.
class SentenceSplitter:
    """Sentence-based text splitter.

    Args:
        chunk_size: Maximum chunk size in tokens.
        chunk_overlap: Overlap between consecutive chunks, in tokens.
        separator: Sentence separator pattern.
        paragraph_separator: Paragraph separator.
        chunking_tokenizer_fn: Custom tokenizer function.
        secondary_chunking_regex: Secondary chunking pattern (splits on
            CJK and Western sentence punctuation).
    """

    def __init__(
        self,
        chunk_size=1024,
        chunk_overlap=200,
        separator=" ",
        # NOTE(review): the doubled backslashes below look like a
        # doc-extraction artifact of the real default ("\n\n\n") — confirm
        # against the upstream implementation.
        paragraph_separator="\\n\\n\\n",
        chunking_tokenizer_fn=None,
        secondary_chunking_regex="[^,.;。?!]+[,.;。?!]?",
    ): ...

    def split_text(self, text):
        """Split text into chunks.

        Args:
            text: Text to split.

        Returns:
            List[str]: List of text chunks.
        """

    def split_texts(self, texts):
        """Split multiple texts."""
class TokenTextSplitter:
    """Token-based text splitter.

    Args:
        chunk_size: Maximum chunk size in tokens.
        chunk_overlap: Overlap between consecutive chunks, in tokens.
        separator: Token separator.
        backup_separators: Fallback separators tried when ``separator``
            cannot produce a small enough chunk.
        tokenizer: Tokenizer to use.
    """

    def __init__(
        self,
        chunk_size=1024,
        chunk_overlap=200,
        separator=" ",
        # NOTE(review): mutable default argument (shared list across calls);
        # also "\\n" may be an extraction artifact of "\n" — confirm both.
        backup_separators=["\\n"],
        tokenizer=None,
    ): ...

    def split_text(self, text):
        """Split text into chunks using token counting."""
class CodeSplitter:
    """Code-aware text splitter that respects code structure.

    Args:
        language: Programming language for syntax awareness.
        chunk_size: Maximum chunk size.
        chunk_overlap: Overlap between consecutive chunks.
        max_chars: Maximum characters per chunk.
    """

    def __init__(self, language="python", chunk_size=1024, chunk_overlap=200, max_chars=1500): ...

    def split_text(self, text):
        """Split code text while preserving its structure."""
class SemanticSplitterNodeParser:
    """Semantic text splitter that uses embeddings to find breakpoints.

    Args:
        buffer_size: Buffer size for semantic analysis.
        breakpoint_percentile_threshold: Percentile threshold for
            breakpoint detection.
        embed_model: Embedding model used for semantic analysis.
    """

    def __init__(self, buffer_size=1, breakpoint_percentile_threshold=95, embed_model=None): ...

    def get_nodes_from_documents(self, documents, show_progress=False):
        """Create nodes from documents using semantic splitting.

        Args:
            documents: List of documents to process.
            show_progress: Whether to show progress.

        Returns:
            List[TextNode]: List of semantic text nodes.
        """
class SentenceWindowNodeParser:
    """Sentence-window splitter for context-aware chunking.

    Args:
        window_size: Number of sentences per window.
        window_metadata_key: Metadata key under which window info is stored.
        original_text_metadata_key: Metadata key under which the original
            text is stored.
    """

    def __init__(self, window_size=3, window_metadata_key="window", original_text_metadata_key="original_text"): ...

    def get_nodes_from_documents(self, documents, show_progress=False):
        """Create windowed nodes with surrounding sentence context."""


# Transform and process nodes after creation.
class NodeParser:
    """Base class for node parsers."""

    def get_nodes_from_documents(self, documents, show_progress=False):
        """Parse documents into nodes.

        Args:
            documents: List of documents.
            show_progress: Whether to show progress.

        Returns:
            List[BaseNode]: List of processed nodes.
        """
class SimpleNodeParser(NodeParser):
    """Simple node parser using basic text splitting.

    Args:
        text_splitter: Text splitter to use.
        include_metadata: Whether to include metadata on nodes.
        include_prev_next_rel: Whether to include previous/next
            relationships between nodes.
    """

    def __init__(self, text_splitter=None, include_metadata=True, include_prev_next_rel=True): ...
class HierarchicalNodeParser(NodeParser):
    """Hierarchical node parser for multi-level document structure.

    Args:
        node_parsers: List of node parsers, one per hierarchy level.
    """

    def __init__(self, node_parsers=None): ...


# Process and filter nodes after retrieval.
class SimilarityPostprocessor:
    """Filter retrieved nodes by a similarity threshold.

    Args:
        similarity_cutoff: Minimum similarity score a node must have
            to be kept.
    """

    def __init__(self, similarity_cutoff=0.7): ...

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Filter nodes by similarity score."""
class KeywordNodePostprocessor:
    """Filter retrieved nodes by keyword presence.

    Args:
        required_keywords: Keywords that must be present in a node.
        exclude_keywords: Keywords that must not be present in a node.
    """

    def __init__(self, required_keywords=None, exclude_keywords=None): ...

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Filter nodes by the keyword criteria."""
class LLMRerank:
    """Rerank retrieved nodes using LLM-based scoring.

    Args:
        llm: LLM to use for reranking.
        top_n: Number of top-scored nodes to return.
    """

    def __init__(self, llm=None, top_n=5): ...

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Rerank nodes using LLM scoring."""


# NOTE(review): mid-file import recovered from the extraction; it belongs
# at the top of the file with the other imports.
from enum import Enum
class MetadataMode(str, Enum):
    """Metadata inclusion modes (values accepted by ``get_content``)."""

    ALL = "all"      # include all metadata
    EMBED = "embed"  # metadata as seen by the embedding model
    LLM = "llm"      # metadata as seen by the LLM
    NONE = "none"    # no metadata, text only
class NodeRelationship(str, Enum):
    """Node relationship types."""

    SOURCE = "SOURCE"      # the originating document
    PREVIOUS = "PREVIOUS"  # preceding sibling node
    NEXT = "NEXT"          # following sibling node
    PARENT = "PARENT"      # parent in a hierarchy
    CHILD = "CHILD"        # child in a hierarchy


# Install with Tessl CLI:
#   npx tessl i tessl/pypi-llama-index