or run

tessl search
Log in

Version

Workspace
tessl
Visibility
Public
Created
Last updated
Describes
pypipkg:pypi/chonkie@1.5.x

docs

advanced-features.md, chunkers.md, core-types.md, data-processing.md, embeddings.md, export.md, index.md, logging.md, pipeline.md, refineries.md, tokenizers.md, vector-databases.md
tile.json

tessl/pypi-chonkie

tessl install tessl/pypi-chonkie@1.5.0

The lightweight ingestion library for fast, efficient and robust RAG pipelines

docs/export.md

Export and Utilities

Export chunks to various formats including JSON, JSONL, and HuggingFace datasets, with visualization and hub access utilities.

Capabilities

Porters

Components for exporting chunks to various file formats.

BasePorter

Abstract base class for all porter implementations.

from abc import ABC, abstractmethod
from typing import Any

class BasePorter(ABC):
    """
    Base class for exporting chunks to various formats.
    """
    def __init__(self): ...

    @abstractmethod
    def export(self, chunks: list[Chunk], **kwargs: dict[str, Any]) -> None:
        """
        Exports chunks to a format.

        Args:
            chunks: List of chunks to export
            **kwargs: Additional export parameters
        """
        ...

    def __call__(self, chunks: list[Chunk], **kwargs: dict[str, Any]) -> None:
        """
        Allows calling porter as a function. Calls export(chunks) internally.

        Note: kwargs are not passed to export() - use export() method directly if you need to pass additional parameters.

        Args:
            chunks: List of chunks to export
            **kwargs: Accepted but not used (for compatibility)
        """
        ...

JSONPorter

Exports chunks to JSON or JSONL format.

class JSONPorter(BasePorter):
    """
    Exports chunks to JSON or JSONL format.

    Args:
        lines: If True, export as JSONL (one JSON object per line); if False, export as JSON array
            (default: True)
    """
    def __init__(self, lines: bool = True): ...

    def export(self, chunks: list[Chunk], file: str = "chunks.jsonl") -> None:
        """
        Exports chunks to JSON/JSONL file.

        Args:
            chunks: List of chunks to export
            file: Output file path (default: 'chunks.jsonl')
        """
        ...

Usage example:

from chonkie import TokenChunker, JSONPorter

# Create chunks
chunker = TokenChunker(chunk_size=512)
chunks = chunker("Your text here...")

# Export as JSONL (default)
porter = JSONPorter(lines=True)
porter.export(chunks, file="output.jsonl")

# Export as JSON array
porter = JSONPorter(lines=False)
porter.export(chunks, file="output.json")

# Use in pipeline
from chonkie import Pipeline

pipe = (
    Pipeline()
    .chunk_with("recursive")
    .export_with("json", file="chunks.jsonl")
)

doc = pipe.run("Your text...")

DatasetsPorter

Exports chunks to HuggingFace Datasets format.

from datasets import Dataset
from typing import Any

class DatasetsPorter(BasePorter):
    """
    Exports chunks to HuggingFace Datasets format.
    """
    def __init__(self): ...

    def export(
        self,
        chunks: list[Chunk],
        save_to_disk: bool = True,
        path: str = "chunks",
        **kwargs: dict[str, Any]
    ) -> Dataset:
        """
        Exports chunks as a HuggingFace Dataset.

        Args:
            chunks: List of chunks to export
            save_to_disk: If True, saves the dataset to disk (default: True)
            path: Directory path to save dataset (default: 'chunks')
            **kwargs: Additional arguments for Dataset.save_to_disk()

        Returns:
            HuggingFace Dataset object
        """
        ...

Usage example:

from chonkie import TokenChunker, DatasetsPorter

# Create chunks
chunker = TokenChunker(chunk_size=512)
chunks = chunker("Your text here...")

porter = DatasetsPorter()

# Export as Dataset and save to disk (default behavior)
dataset = porter.export(chunks, path="./my_dataset")
print(dataset)

# Export without saving to disk
dataset = porter.export(chunks, save_to_disk=False)

# The returned dataset can be pushed to HuggingFace Hub
dataset.push_to_hub("username/my-chunks-dataset")

# Or saved later
dataset.save_to_disk("./my_dataset")

# Use in pipeline
from chonkie import Pipeline

pipe = (
    Pipeline()
    .chunk_with("recursive")
    .refine_with("embeddings")
    .export_with("datasets", path="./output_dataset")
)

Utilities

Visualizer

Visualizes chunks with color-coded highlighting in terminal or HTML.

from typing import Optional, Union

class Visualizer:
    """
    Visualizes chunks with color-coded highlighting.

    Args:
        theme: Color theme name or list of colors (default: 'pastel')
            Available themes:
            - Light: 'pastel', 'tiktokenizer', 'ocean_breeze'
            - Dark: 'tiktokenizer_dark', 'pastel_dark', 'midnight'
    """
    def __init__(self, theme: Union[str, list[str]] = "pastel"): ...

    def print(
        self,
        chunks: list[Chunk],
        full_text: Optional[str] = None
    ) -> None:
        """
        Prints colored chunks to terminal.

        Args:
            chunks: List of chunks to visualize
            full_text: Optional full text (if not provided, reconstructed from chunks)
        """
        ...

    def save(
        self,
        filename: str,
        chunks: list[Chunk],
        full_text: Optional[str] = None,
        title: str = "Chunk Visualization"
    ) -> None:
        """
        Saves visualization as HTML file.

        Args:
            filename: Output HTML file path
            chunks: List of chunks to visualize
            full_text: Optional full text
            title: HTML page title
        """
        ...

    def __call__(
        self,
        chunks: list[Chunk],
        full_text: Optional[str] = None
    ) -> None:
        """
        Prints chunks (same as print method).

        Args:
            chunks: List of chunks to visualize
            full_text: Optional full text
        """
        ...

Usage example:

from chonkie import RecursiveChunker, Visualizer

# Create chunks
chunker = RecursiveChunker(chunk_size=100)
text = "This is a sample text for visualization. It will be split into chunks."
chunks = chunker(text)

# Visualize in terminal
viz = Visualizer(theme="pastel")
viz.print(chunks, full_text=text)

# Or use directly
viz(chunks, full_text=text)

# Save as HTML
viz.save("chunks_visualization.html", chunks, full_text=text, title="My Chunks")

# Try different themes
viz_dark = Visualizer(theme="midnight")
viz_dark.print(chunks, full_text=text)

viz_ocean = Visualizer(theme="ocean_breeze")
viz_ocean.print(chunks, full_text=text)

# Custom colors
viz_custom = Visualizer(theme=["#FF5733", "#33FF57", "#3357FF", "#F333FF"])
viz_custom.print(chunks, full_text=text)

Hubbie

Manager for accessing recipes and schemas from the Chonkie Hub on HuggingFace.

from typing import Optional

class Hubbie:
    """
    Manager for accessing recipes and schemas from the Chonkie Hub.
    """
    SCHEMA_VERSION: str = "v1"

    def __init__(self): ...

    def get_recipe(
        self,
        name: Optional[str] = "default",
        lang: Optional[str] = "en",
        path: Optional[str] = None
    ) -> dict:
        """
        Gets a recipe from the hub.

        Args:
            name: Recipe name (e.g., 'default', 'markdown', 'code')
            lang: Language code for the recipe
            path: Optional path to a local recipe file

        Returns:
            Recipe dictionary
        """
        ...

    def get_pipeline_recipe(
        self,
        name: str,
        path: Optional[str] = None
    ) -> dict:
        """
        Gets a pipeline recipe from the hub.

        Args:
            name: Pipeline recipe name
            path: Optional path to a local recipe file

        Returns:
            Pipeline recipe dictionary
        """
        ...

    def get_recipe_schema(self) -> dict:
        """
        Gets the current recipe schema.

        Returns:
            Recipe schema dictionary
        """
        ...

    def _validate_recipe(self, recipe: dict) -> Optional[bool]:
        """
        Validates a recipe against the schema.

        Args:
            recipe: Recipe dictionary to validate

        Returns:
            True if valid, raises exception if invalid
        """
        ...

Usage example:

from chonkie.utils import Hubbie

hubbie = Hubbie()

# Get a chunking recipe
recipe = hubbie.get_recipe(name="markdown", lang="en")
print(recipe)

# Get a pipeline recipe
pipeline_recipe = hubbie.get_pipeline_recipe(name="default")

# Get schema
schema = hubbie.get_recipe_schema()

# Use with RecursiveChunker
from chonkie import RecursiveChunker, RecursiveRules

rules = RecursiveRules.from_recipe(name="markdown")
chunker = RecursiveChunker(rules=rules)

# Use with Pipeline
from chonkie import Pipeline

pipe = Pipeline.from_recipe(name="default")

Imports

All export and utility components are available from the main package:

from chonkie import (
    BasePorter,
    JSONPorter,
    DatasetsPorter,
    Visualizer,
    Hubbie,
)

Pipeline Usage

Porters are used in pipelines via the export_with() method:

from chonkie import Pipeline

pipe = (
    Pipeline()
    .chunk_with("recursive")
    .refine_with("embeddings")
    .export_with("json", file="output.jsonl")
    .export_with("datasets", path="./chunks")
)

Porter aliases:

  • json - JSONPorter
  • datasets - DatasetsPorter

Complete Example

from chonkie import (
    Pipeline,
    Visualizer,
)

# Build pipeline with visualization and export
text = """
Long document text here...
Multiple paragraphs...
And more content...
"""

# Process and visualize
pipe = (
    Pipeline()
    .chunk_with("recursive", chunk_size=512)
    .refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
    .store_in("chroma", collection_name="docs")
    .export_with("json", file="chunks.jsonl")
    .export_with("datasets", path="./dataset")
)

doc = pipe.run(text)

# Visualize the chunks
viz = Visualizer(theme="pastel")
viz.print(doc.chunks, full_text=doc.content)

# Save visualization
viz.save("chunks.html", doc.chunks, full_text=doc.content)

# Load dataset for further processing
from datasets import load_from_disk

dataset = load_from_disk("./dataset")
print(dataset)

Custom Porters

Create custom porters by extending BasePorter:

from chonkie import BasePorter, Chunk
from chonkie.pipeline import porter

@porter("csv")
class CSVPorter(BasePorter):
    def export(self, chunks: list[Chunk], file: str = "chunks.csv"):
        import csv
        with open(file, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['id', 'text', 'token_count'])
            for chunk in chunks:
                writer.writerow([chunk.id, chunk.text, chunk.token_count])

# Use in pipeline
from chonkie import Pipeline

pipe = Pipeline().export_with("csv", file="output.csv")

Visualization Themes

Available built-in themes:

from chonkie import Visualizer

# Light themes
viz1 = Visualizer(theme="pastel")
viz2 = Visualizer(theme="tiktokenizer")
viz3 = Visualizer(theme="ocean_breeze")

# Dark themes
viz4 = Visualizer(theme="tiktokenizer_dark")
viz5 = Visualizer(theme="pastel_dark")
viz6 = Visualizer(theme="midnight")

# Custom theme with hex colors
viz7 = Visualizer(theme=[
    "#FF6B6B",  # Red
    "#4ECDC4",  # Teal
    "#45B7D1",  # Blue
    "#FFA07A",  # Orange
    "#98D8C8",  # Mint
])