or run

tessl search
Log in

Version

Workspace
tessl
Visibility
Public
Created
Last updated
Describes
pypipkg:pypi/chonkie@1.5.x

docs

advanced-features.mdchunkers.mdcore-types.mddata-processing.mdembeddings.mdexport.mdindex.mdlogging.mdpipeline.mdrefineries.mdtokenizers.mdvector-databases.md
tile.json

tessl/pypi-chonkie

tessl install tessl/pypi-chonkie@1.5.0

The lightweight ingestion library for fast, efficient and robust RAG pipelines

data-processing.mddocs/

Data Processing

File loading and text preprocessing components including Fetchers for data loading and Chefs for parsing and preprocessing various file formats.

Capabilities

Fetchers

Components for loading data from various sources.

BaseFetcher

Abstract base class for all fetcher implementations.

from abc import ABC, abstractmethod
from typing import Any

class BaseFetcher(ABC):
    """
    Base class for all fetcher implementations.

    Concrete subclasses (e.g. ``FileFetcher``) implement ``fetch`` to
    load data from a particular source.
    """
    def __init__(self): ...

    @abstractmethod
    def fetch(self, *args: Any, **kwargs: Any) -> Any:
        """
        Fetches data from source.

        Args:
            *args: Positional arguments, interpreted by the concrete fetcher
            **kwargs: Keyword arguments, interpreted by the concrete fetcher

        Returns:
            Fetched data; the concrete type depends on the subclass
            (``FileFetcher``, for example, returns ``Path`` objects)
        """
        ...

FileFetcher

Fetches files from the local filesystem.

from pathlib import Path
from typing import Optional, Union

class FileFetcher(BaseFetcher):
    """
    Fetches files from the local filesystem.

    Either a single file (``path``) or the contents of a directory
    (``dir``, optionally filtered by extension) can be fetched.
    """
    def __init__(self): ...

    def fetch(
        self,
        path: Optional[str] = None,
        dir: Optional[str] = None,
        ext: Optional[list[str]] = None
    ) -> Union[Path, list[Path]]:
        """
        Fetches a single file or multiple files from a directory.

        Exactly one of ``path`` and ``dir`` must be given.

        Args:
            path: Path to a specific file (mutually exclusive with dir)
            dir: Directory to search for files (mutually exclusive with path)
            ext: List of file extensions to filter (e.g., ['.txt', '.md']);
                only meaningful together with dir

        Returns:
            Single Path if path is provided, list of Paths if dir is provided

        Raises:
            ValueError: If both path and dir are provided, or neither is provided
        """
        ...

    def fetch_file(self, dir: str, name: str) -> Path:
        """
        Fetches a specific file from a directory.

        Args:
            dir: Directory path
            name: File name (including extension)

        Returns:
            Path to the file
        """
        ...

    def __call__(
        self,
        path: Optional[str] = None,
        dir: Optional[str] = None,
        ext: Optional[list[str]] = None
    ) -> Union[Path, list[Path]]:
        """
        Allows calling the fetcher as a function; mirrors the ``fetch``
        signature.

        Args:
            path: Path to a specific file
            dir: Directory to search
            ext: List of file extensions to filter

        Returns:
            Single Path or list of Paths
        """
        ...

Usage example:

from chonkie import FileFetcher

fetcher = FileFetcher()

# Fetch single file
file_path = fetcher.fetch(path="document.txt")

# Fetch all files from directory
all_files = fetcher.fetch(dir="./documents")

# Fetch files with specific extensions
text_files = fetcher.fetch(dir="./documents", ext=[".txt", ".md"])

# Use in pipeline
from chonkie import Pipeline

pipe = (
    Pipeline()
    .fetch_from("file", dir="./documents", ext=[".txt", ".md"])
    .process_with("text")
    .chunk_with("recursive")
)

docs = pipe.run()

Chefs

Components for parsing and preprocessing text from various file formats.

BaseChef

Abstract base class for all chef implementations.

from abc import ABC, abstractmethod
from pathlib import Path
from typing import Union

class BaseChef(ABC):
    """
    Base class for all chef implementations that process files into documents.

    Subclasses implement ``process`` (file path -> Document) and ``parse``
    (raw text -> Document); ``process_batch``, ``read`` and ``__call__``
    are shared conveniences.

    NOTE(review): ``Document`` is referenced but not imported in this
    snippet; it is assumed to come from the package's core types.
    """
    def __init__(self): ...

    @abstractmethod
    def process(self, path: Union[str, Path]) -> Document:
        """
        Processes a file into a document.

        Args:
            path: File path

        Returns:
            Document object
        """
        ...

    @abstractmethod
    def parse(self, text: str) -> Document:
        """
        Parses raw text into a document.

        Args:
            text: Raw text content

        Returns:
            Document object
        """
        ...

    def process_batch(
        self,
        paths: Union[list[str], list[Path]]
    ) -> list[Document]:
        """
        Processes multiple files in batch.

        Args:
            paths: List of file paths

        Returns:
            List of Document objects, one per input path
        """
        ...

    def read(self, path: Union[str, Path]) -> str:
        """
        Reads file content.

        Args:
            path: File path

        Returns:
            File content as string
        """
        ...

    def __call__(self, path: Union[str, Path]) -> Document:
        """
        Allows calling the chef as a function; mirrors ``process``.

        Args:
            path: File path

        Returns:
            Document object
        """
        ...

TextChef

Processes plain text files into documents.

class TextChef(BaseChef):
    """
    Processes plain text files into documents.

    Unlike ``MarkdownChef``, no structural elements (tables, code
    blocks, images) are extracted.
    """

    def process(self, path: Union[str, Path]) -> Document:
        """
        Processes a text file into a document.

        Args:
            path: Path to text file

        Returns:
            Document with file content and metadata
        """
        ...

    def parse(self, text: str) -> Document:
        """
        Parses plain text into a document.

        Args:
            text: Plain text content

        Returns:
            Document with text content
        """
        ...

Usage example:

from chonkie import TextChef

chef = TextChef()

# Process file
doc = chef.process("document.txt")
print(doc.content)
print(doc.metadata)

# Parse text directly
doc = chef.parse("This is some text content.")

# Use in pipeline
from chonkie import Pipeline

pipe = (
    Pipeline()
    .fetch_from("file", dir="./documents")
    .process_with("text")
    .chunk_with("recursive")
)

MarkdownChef

Processes markdown files, extracting tables, code blocks, and images.

class MarkdownChef(BaseChef):
    """
    Processes markdown files, extracting tables, code blocks, and images.

    Args:
        tokenizer: Tokenizer instance or identifier (default: 'character')

    NOTE(review): ``MarkdownDocument``, ``MarkdownTable``, ``MarkdownCode``
    and ``TokenizerProtocol`` are not imported in this snippet; they are
    assumed to come from the package's core types.
    """
    def __init__(self, tokenizer: Union[TokenizerProtocol, str] = "character"): ...

    def process(self, path: Union[str, Path]) -> MarkdownDocument:
        """
        Processes a markdown file into a document.

        Args:
            path: Path to markdown file

        Returns:
            MarkdownDocument with extracted elements (tables, code, images)
        """
        ...

    def parse(self, text: str) -> MarkdownDocument:
        """
        Parses markdown text into a document.

        Args:
            text: Markdown content

        Returns:
            MarkdownDocument with extracted elements
        """
        ...

    def prepare_tables(self, markdown: str) -> list[MarkdownTable]:
        """
        Extracts tables from markdown.

        Args:
            markdown: Markdown content

        Returns:
            List of MarkdownTable objects
        """
        ...

    def prepare_code(self, markdown: str) -> list[MarkdownCode]:
        """
        Extracts code blocks from markdown.

        Args:
            markdown: Markdown content

        Returns:
            List of MarkdownCode objects
        """
        ...

Usage example:

from chonkie import MarkdownChef

chef = MarkdownChef()

# Process markdown file
doc = chef.process("README.md")

# Access extracted elements
for table in doc.tables:
    print(f"Table at {table.start_index}: {table.content}")

for code in doc.code:
    print(f"Code block ({code.language}): {code.content}")

for image in doc.images:
    print(f"Image: {image.alias} -> {image.link}")

# Parse markdown text
markdown_text = """
# Title

Some text here.

```python
print("Hello")
```

| Col1 | Col2 |
|------|------|
| A    | B    |

![Alt text](image.png)
"""

doc = chef.parse(markdown_text)

# Use in pipeline

from chonkie import Pipeline

pipe = (
    Pipeline()
    .fetch_from("file", dir="./docs", ext=[".md"])
    .process_with("markdown")
    .chunk_with("recursive", recipe="markdown")
)

TableChef

Specialized chef for processing tabular data.
class TableChef(BaseChef):
    """
    Specialized chef for processing tabular data (e.g. CSV files or
    markdown-format tables).

    Args:
        tokenizer: Tokenizer instance or identifier (default: 'character')
    """
    def __init__(self, tokenizer: Union[TokenizerProtocol, str] = "character"): ...

    def process(self, path: Union[str, Path]) -> Document:
        """
        Processes a table file into a document.

        Args:
            path: Path to table file

        Returns:
            Document with table content
        """
        ...

    def parse(self, text: str) -> Document:
        """
        Parses table text into a document.

        Args:
            text: Table content (markdown table format)

        Returns:
            Document with table data
        """
        ...

Usage example:

from chonkie import TableChef

chef = TableChef()

# Process table file
doc = chef.process("data.csv")

# Parse table text
table_text = """
| Name  | Age | City     |
|-------|-----|----------|
| Alice | 30  | New York |
| Bob   | 25  | London   |
"""

doc = chef.parse(table_text)

# Use with TableChunker
from chonkie import TableChunker

chunker = TableChunker(chunk_size=3)  # 3 rows per chunk
chunks = chunker(doc.content)

Imports

All data processing components are available from the main package:

from chonkie import (
    BaseFetcher,
    FileFetcher,
    BaseChef,
    TextChef,
    MarkdownChef,
    TableChef,
)

Pipeline Usage

Data processing components are used at the beginning of pipelines:

from chonkie import Pipeline

# Fetch and process workflow
pipe = (
    Pipeline()
    .fetch_from("file", dir="./documents", ext=[".txt", ".md"])
    .process_with("markdown")
    .chunk_with("recursive", recipe="markdown")
    .refine_with("embeddings")
)

docs = pipe.run()

Component aliases:

  • Fetchers: file
  • Chefs: text, markdown, table

Custom Components

Custom Fetcher

from chonkie import BaseFetcher
from chonkie.pipeline import fetcher

@fetcher("custom")
class CustomFetcher(BaseFetcher):
    def fetch(self, url: str):
        # Fetch from custom source
        return data

# Use in pipeline
pipe = Pipeline().fetch_from("custom", url="https://example.com")

Custom Chef

from chonkie import BaseChef, Document
from chonkie.pipeline import chef

@chef("custom")
class CustomChef(BaseChef):
    def process(self, path):
        text = self.read(path)
        return self.parse(text)

    def parse(self, text):
        # Custom parsing logic
        return Document(content=processed_text)

# Use in pipeline
pipe = Pipeline().process_with("custom")

Complete Workflow Example

from chonkie import MarkdownDocument, Pipeline

# End-to-end document processing pipeline
pipe = (
    Pipeline()
    # Load markdown files
    .fetch_from("file", dir="./documentation", ext=[".md"])
    # Parse markdown content
    .process_with("markdown")
    # Chunk using markdown-aware rules
    .chunk_with("recursive", recipe="markdown", chunk_size=1024)
    # Add semantic refinement
    .chunk_with("semantic", chunk_size=512, threshold=0.75)
    # Add context overlap
    .refine_with("overlap", context_size=128)
    # Add embeddings
    .refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
    # Store in vector database
    .store_in("chroma", collection_name="docs")
    # Export backup
    .export_with("json", file="chunks_backup.jsonl")
)

# Execute pipeline
documents = pipe.run()

# Access results
for doc in documents:
    print(f"Processed: {doc.metadata.get('source')}")
    print(f"Chunks: {len(doc.chunks)}")
    if isinstance(doc, MarkdownDocument):
        print(f"Tables: {len(doc.tables)}")
        print(f"Code blocks: {len(doc.code)}")