# Interface between LLMs and your data for building retrieval-augmented
# generation (RAG) applications.
#
# This module provides document loading, parsing, and chunking for various
# file formats, with intelligent text-splitting strategies and node creation.
# The readers below load documents from various sources and file formats.
class SimpleDirectoryReader:
    """Simple directory reader for loading documents from the filesystem.

    Args:
        input_dir: Directory path to read documents from.
        input_files: List of specific files to load.
        exclude_hidden: Whether to exclude hidden files.
        errors: How to handle errors ("ignore", "strict").
        recursive: Whether to read directories recursively.
        encoding: Text encoding to use when decoding files.
        filename_as_id: Use the filename as the document ID.
        required_exts: List of required file extensions.
        file_extractor: Custom file extractors (presumably keyed by file
            extension — verify against the implementation).
        num_files_limit: Maximum number of files to load.
        **kwargs: Additional arguments.
    """

    def __init__(
        self,
        input_dir=None,
        input_files=None,
        exclude_hidden=True,
        errors="ignore",
        recursive=True,
        encoding="utf-8",
        filename_as_id=False,
        required_exts=None,
        file_extractor=None,
        num_files_limit=None,
        **kwargs
    ): ...

    def load_data(self, show_progress=False):
        """Load documents from the configured sources.

        Args:
            show_progress: Whether to show loading progress.

        Returns:
            List[Document]: List of loaded documents.
        """

    def iter_data(self, show_progress=False):
        """Iterate over documents lazily, without loading all into memory.

        Args:
            show_progress: Whether to show loading progress.
        """
def download_loader(loader_class):
    """Download a community loader from LlamaHub.

    Args:
        loader_class: Name of the loader class to download.

    Returns:
        class: Loader class ready for instantiation.
    """


# Core document and node representations.
class Document:
    """Document object for storing text and metadata.

    Args:
        text: Document text content.
        metadata: Dictionary of metadata.
        excluded_embed_metadata_keys: Metadata keys to exclude when embedding.
        excluded_llm_metadata_keys: Metadata keys to exclude from LLM context.
        relationships: Relationships to other documents.
        **kwargs: Additional arguments.
    """

    def __init__(
        self,
        text=None,
        metadata=None,
        excluded_embed_metadata_keys=None,
        excluded_llm_metadata_keys=None,
        relationships=None,
        **kwargs
    ): ...

    @property
    def text(self):
        """Document text content."""

    @property
    def metadata(self):
        """Document metadata dictionary."""

    def get_content(self, metadata_mode="all"):
        """Get document content with optional metadata included.

        Args:
            metadata_mode: How to include metadata
                ("all", "embed", "llm", "none").

        Returns:
            str: Document content with metadata.
        """
class TextNode:
    """Text node representing a chunk of document content.

    Args:
        text: Node text content.
        metadata: Node metadata dictionary.
        relationships: Relationships to other nodes.
        **kwargs: Additional arguments.
    """

    def __init__(self, text=None, metadata=None, relationships=None, **kwargs): ...

    @property
    def text(self):
        """Node text content."""

    @property
    def metadata(self):
        """Node metadata dictionary."""

    def get_content(self, metadata_mode="all"):
        """Get node content with metadata included per ``metadata_mode``."""
class ImageNode:
    """Node for image content.

    Args:
        image: Image data or path.
        image_path: Path to an image file.
        image_url: URL to an image.
        text: Optional text description of the image.
        **kwargs: Additional arguments.
    """

    def __init__(self, image=None, image_path=None, image_url=None, text=None, **kwargs): ...
class IndexNode:
    """Node that references another index.

    Args:
        text: Node text content.
        index_id: ID of the referenced index.
        **kwargs: Additional arguments.
    """

    def __init__(self, text=None, index_id=None, **kwargs): ...


# Various text splitting strategies for creating chunks from documents.
class SentenceSplitter:
    """Sentence-based text splitter.

    Args:
        chunk_size: Maximum chunk size in tokens.
        chunk_overlap: Overlap between consecutive chunks, in tokens.
        separator: Sentence separator pattern.
        paragraph_separator: Paragraph separator.
        chunking_tokenizer_fn: Custom tokenizer function.
        secondary_chunking_regex: Secondary chunking pattern (splits on
            CJK and Western sentence punctuation).
    """

    def __init__(
        self,
        chunk_size=1024,
        chunk_overlap=200,
        separator=" ",
        # NOTE(review): the doubled backslashes below look like a
        # doc-extraction artifact of the real default ("\n\n\n") — confirm
        # against the upstream implementation.
        paragraph_separator="\\n\\n\\n",
        chunking_tokenizer_fn=None,
        secondary_chunking_regex="[^,.;。?!]+[,.;。?!]?",
    ): ...

    def split_text(self, text):
        """Split text into chunks.

        Args:
            text: Text to split.

        Returns:
            List[str]: List of text chunks.
        """

    def split_texts(self, texts):
        """Split multiple texts."""
class TokenTextSplitter:
    """Token-based text splitter.

    Args:
        chunk_size: Maximum chunk size in tokens.
        chunk_overlap: Overlap between consecutive chunks, in tokens.
        separator: Token separator.
        backup_separators: Fallback separators tried when ``separator``
            cannot produce a small enough chunk.
        tokenizer: Tokenizer to use.
    """

    def __init__(
        self,
        chunk_size=1024,
        chunk_overlap=200,
        separator=" ",
        # NOTE(review): mutable default argument (shared list across calls);
        # also "\\n" may be an extraction artifact of "\n" — confirm both.
        backup_separators=["\\n"],
        tokenizer=None,
    ): ...

    def split_text(self, text):
        """Split text into chunks using token counting."""
class CodeSplitter:
    """Code-aware text splitter that respects code structure.

    Args:
        language: Programming language for syntax awareness.
        chunk_size: Maximum chunk size.
        chunk_overlap: Overlap between consecutive chunks.
        max_chars: Maximum characters per chunk.
    """

    def __init__(self, language="python", chunk_size=1024, chunk_overlap=200, max_chars=1500): ...

    def split_text(self, text):
        """Split code text while preserving its structure."""
class SemanticSplitterNodeParser:
    """Semantic text splitter that uses embeddings to find breakpoints.

    Args:
        buffer_size: Buffer size for semantic analysis.
        breakpoint_percentile_threshold: Percentile threshold for
            breakpoint detection.
        embed_model: Embedding model used for semantic analysis.
    """

    def __init__(self, buffer_size=1, breakpoint_percentile_threshold=95, embed_model=None): ...

    def get_nodes_from_documents(self, documents, show_progress=False):
        """Create nodes from documents using semantic splitting.

        Args:
            documents: List of documents to process.
            show_progress: Whether to show progress.

        Returns:
            List[TextNode]: List of semantic text nodes.
        """
class SentenceWindowNodeParser:
    """Sentence-window splitter for context-aware chunking.

    Args:
        window_size: Number of sentences per window.
        window_metadata_key: Metadata key under which window info is stored.
        original_text_metadata_key: Metadata key under which the original
            text is stored.
    """

    def __init__(self, window_size=3, window_metadata_key="window", original_text_metadata_key="original_text"): ...

    def get_nodes_from_documents(self, documents, show_progress=False):
        """Create windowed nodes with surrounding sentence context."""


# Transform and process nodes after creation.
class NodeParser:
    """Base class for node parsers."""

    def get_nodes_from_documents(self, documents, show_progress=False):
        """Parse documents into nodes.

        Args:
            documents: List of documents.
            show_progress: Whether to show progress.

        Returns:
            List[BaseNode]: List of processed nodes.
        """
class SimpleNodeParser(NodeParser):
    """Simple node parser using basic text splitting.

    Args:
        text_splitter: Text splitter to use.
        include_metadata: Whether to include metadata on nodes.
        include_prev_next_rel: Whether to include previous/next
            relationships between nodes.
    """

    def __init__(self, text_splitter=None, include_metadata=True, include_prev_next_rel=True): ...
class HierarchicalNodeParser(NodeParser):
    """Hierarchical node parser for multi-level document structure.

    Args:
        node_parsers: List of node parsers, one per hierarchy level.
    """

    def __init__(self, node_parsers=None): ...


# Process and filter nodes after retrieval.
class SimilarityPostprocessor:
    """Filter retrieved nodes by a similarity threshold.

    Args:
        similarity_cutoff: Minimum similarity score a node must have
            to be kept.
    """

    def __init__(self, similarity_cutoff=0.7): ...

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Filter nodes by similarity score."""
class KeywordNodePostprocessor:
    """Filter retrieved nodes by keyword presence.

    Args:
        required_keywords: Keywords that must be present in a node.
        exclude_keywords: Keywords that must not be present in a node.
    """

    def __init__(self, required_keywords=None, exclude_keywords=None): ...

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Filter nodes by the keyword criteria."""
class LLMRerank:
    """Rerank retrieved nodes using LLM-based scoring.

    Args:
        llm: LLM to use for reranking.
        top_n: Number of top-scored nodes to return.
    """

    def __init__(self, llm=None, top_n=5): ...

    def postprocess_nodes(self, nodes, query_bundle=None):
        """Rerank nodes using LLM scoring."""


# NOTE(review): mid-file import recovered from the extraction; it belongs
# at the top of the file with the other imports.
from enum import Enum
class MetadataMode(str, Enum):
    """Metadata inclusion modes (values accepted by ``get_content``)."""

    ALL = "all"      # include all metadata
    EMBED = "embed"  # metadata as seen by the embedding model
    LLM = "llm"      # metadata as seen by the LLM
    NONE = "none"    # no metadata, text only
class NodeRelationship(str, Enum):
    """Node relationship types."""

    SOURCE = "SOURCE"      # the originating document
    PREVIOUS = "PREVIOUS"  # preceding sibling node
    NEXT = "NEXT"          # following sibling node
    PARENT = "PARENT"      # parent in a hierarchy
    CHILD = "CHILD"        # child in a hierarchy


# Install with Tessl CLI:
#   npx tessl i tessl/pypi-llama-index