tessl install tessl/pypi-chonkie@1.5.0

Chonkie: the lightweight ingestion library for fast, efficient, and robust RAG pipelines.

Export chunks to various formats including JSON, JSONL, and HuggingFace datasets, with visualization and hub access utilities.

Components for exporting chunks to various file formats.

Abstract base class for all porter implementations.
from abc import ABC, abstractmethod
from typing import Any
class BasePorter(ABC):
    """
    Base class for exporting chunks to various formats.

    Subclasses implement export(); a porter instance is also callable,
    which calls export(chunks) without forwarding keyword arguments.
    """

    def __init__(self): ...

    @abstractmethod
    def export(self, chunks: list[Chunk], **kwargs: dict[str, Any]) -> None:
        """
        Exports chunks to a format.

        Args:
            chunks: List of chunks to export
            **kwargs: Additional export parameters
        """
        ...

    def __call__(self, chunks: list[Chunk], **kwargs: dict[str, Any]) -> None:
        """
        Allows calling porter as a function. Calls export(chunks) internally.

        Note: kwargs are not passed to export() - use export() method directly
        if you need to pass additional parameters.

        Args:
            chunks: List of chunks to export
            **kwargs: Accepted but not used (for compatibility)
        """
        ...


# Exports chunks to JSON or JSONL format.
class JSONPorter(BasePorter):
    """
    Exports chunks to JSON or JSONL format.

    Args:
        lines: If True, export as JSONL (one JSON object per line);
            if False, export as JSON array (default: True)
    """

    def __init__(self, lines: bool = True): ...

    def export(self, chunks: list[Chunk], file: str = "chunks.jsonl") -> None:
        """
        Exports chunks to JSON/JSONL file.

        Args:
            chunks: List of chunks to export
            file: Output file path (default: 'chunks.jsonl')
        """
        ...


# Usage example:
from chonkie import TokenChunker, JSONPorter

# Create chunks
chunker = TokenChunker(chunk_size=512)
chunks = chunker("Your text here...")

# Export as JSONL (default)
porter = JSONPorter(lines=True)
porter.export(chunks, file="output.jsonl")

# Export as JSON array
porter = JSONPorter(lines=False)
porter.export(chunks, file="output.json")

# Use in pipeline
from chonkie import Pipeline

pipe = (
    Pipeline()
    .chunk_with("recursive")
    .export_with("json", file="chunks.jsonl")
)
doc = pipe.run("Your text...")

# Exports chunks to HuggingFace Datasets format.
from datasets import Dataset
from typing import Any
class DatasetsPorter(BasePorter):
    """
    Exports chunks to HuggingFace Datasets format.
    """

    def __init__(self): ...

    def export(
        self,
        chunks: list[Chunk],
        save_to_disk: bool = True,
        path: str = "chunks",
        **kwargs: dict[str, Any]
    ) -> Dataset:
        """
        Exports chunks as a HuggingFace Dataset.

        Args:
            chunks: List of chunks to export
            save_to_disk: If True, saves the dataset to disk (default: True)
            path: Directory path to save dataset (default: 'chunks')
            **kwargs: Additional arguments for Dataset.save_to_disk()

        Returns:
            HuggingFace Dataset object
        """
        ...


# Usage example:
from chonkie import TokenChunker, DatasetsPorter

# Create chunks
chunker = TokenChunker(chunk_size=512)
chunks = chunker("Your text here...")

porter = DatasetsPorter()

# Export as Dataset and save to disk (default behavior)
dataset = porter.export(chunks, path="./my_dataset")
print(dataset)

# Export without saving to disk
dataset = porter.export(chunks, save_to_disk=False)

# The returned dataset can be pushed to HuggingFace Hub
dataset.push_to_hub("username/my-chunks-dataset")

# Or saved later
dataset.save_to_disk("./my_dataset")

# Use in pipeline
from chonkie import Pipeline

pipe = (
    Pipeline()
    .chunk_with("recursive")
    .refine_with("embeddings")
    .export_with("datasets", path="./output_dataset")
)

# Visualizes chunks with color-coded highlighting in terminal or HTML.
from typing import Optional, Union
class Visualizer:
    """
    Visualizes chunks with color-coded highlighting.

    Args:
        theme: Color theme name or list of colors (default: 'pastel')

    Available themes:
        - Light: 'pastel', 'tiktokenizer', 'ocean_breeze'
        - Dark: 'tiktokenizer_dark', 'pastel_dark', 'midnight'
    """

    def __init__(self, theme: Union[str, list[str]] = "pastel"): ...

    def print(
        self,
        chunks: list[Chunk],
        full_text: Optional[str] = None
    ) -> None:
        """
        Prints colored chunks to terminal.

        Args:
            chunks: List of chunks to visualize
            full_text: Optional full text (if not provided, reconstructed from chunks)
        """
        ...

    def save(
        self,
        filename: str,
        chunks: list[Chunk],
        full_text: Optional[str] = None,
        title: str = "Chunk Visualization"
    ) -> None:
        """
        Saves visualization as HTML file.

        Args:
            filename: Output HTML file path
            chunks: List of chunks to visualize
            full_text: Optional full text
            title: HTML page title
        """
        ...

    def __call__(
        self,
        chunks: list[Chunk],
        full_text: Optional[str] = None
    ) -> None:
        """
        Prints chunks (same as print method).

        Args:
            chunks: List of chunks to visualize
            full_text: Optional full text
        """
        ...


# Usage example:
from chonkie import RecursiveChunker, Visualizer

# Create chunks
chunker = RecursiveChunker(chunk_size=100)
text = "This is a sample text for visualization. It will be split into chunks."
chunks = chunker(text)

# Visualize in terminal
viz = Visualizer(theme="pastel")
viz.print(chunks, full_text=text)

# Or use directly
viz(chunks, full_text=text)

# Save as HTML
viz.save("chunks_visualization.html", chunks, full_text=text, title="My Chunks")

# Try different themes
viz_dark = Visualizer(theme="midnight")
viz_dark.print(chunks, full_text=text)
viz_ocean = Visualizer(theme="ocean_breeze")
viz_ocean.print(chunks, full_text=text)

# Custom colors
viz_custom = Visualizer(theme=["#FF5733", "#33FF57", "#3357FF", "#F333FF"])
viz_custom.print(chunks, full_text=text)

# Manager for accessing recipes and schemas from the Chonkie Hub on HuggingFace.
from typing import Optional
class Hubbie:
    """
    Manager for accessing recipes and schemas from the Chonkie Hub.
    """

    # Current recipe-schema version used when fetching from the hub.
    SCHEMA_VERSION: str = "v1"

    def __init__(self): ...

    def get_recipe(
        self,
        name: Optional[str] = "default",
        lang: Optional[str] = "en",
        path: Optional[str] = None
    ) -> dict:
        """
        Gets a recipe from the hub.

        Args:
            name: Recipe name (e.g., 'default', 'markdown', 'code')
            lang: Language code for the recipe
            path: Optional path to a local recipe file

        Returns:
            Recipe dictionary
        """
        ...

    def get_pipeline_recipe(
        self,
        name: str,
        path: Optional[str] = None
    ) -> dict:
        """
        Gets a pipeline recipe from the hub.

        Args:
            name: Pipeline recipe name
            path: Optional path to a local recipe file

        Returns:
            Pipeline recipe dictionary
        """
        ...

    def get_recipe_schema(self) -> dict:
        """
        Gets the current recipe schema.

        Returns:
            Recipe schema dictionary
        """
        ...

    def _validate_recipe(self, recipe: dict) -> Optional[bool]:
        """
        Validates a recipe against the schema.

        Args:
            recipe: Recipe dictionary to validate

        Returns:
            True if valid, raises exception if invalid
        """
        ...


# Usage example:
from chonkie.utils import Hubbie

hubbie = Hubbie()

# Get a chunking recipe
recipe = hubbie.get_recipe(name="markdown", lang="en")
print(recipe)

# Get a pipeline recipe
pipeline_recipe = hubbie.get_pipeline_recipe(name="default")

# Get schema
schema = hubbie.get_recipe_schema()

# Use with RecursiveChunker
from chonkie import RecursiveChunker, RecursiveRules

rules = RecursiveRules.from_recipe(name="markdown")
chunker = RecursiveChunker(rules=rules)

# Use with Pipeline
from chonkie import Pipeline

pipe = Pipeline.from_recipe(name="default")

# All export and utility components are available from the main package:
from chonkie import (
    BasePorter,
    JSONPorter,
    DatasetsPorter,
    Visualizer,
    Hubbie,
)

# Porters are used in pipelines via the export_with() method:
from chonkie import Pipeline

pipe = (
    Pipeline()
    .chunk_with("recursive")
    .refine_with("embeddings")
    .export_with("json", file="output.jsonl")
    .export_with("datasets", repo_id="username/chunks")
)

# Porter aliases:
#   json     - JSONPorter
#   datasets - DatasetsPorter

from chonkie import (
    Pipeline,
    Visualizer,
)
# Build pipeline with visualization and export
text = """
Long document text here...
Multiple paragraphs...
And more content...
"""

# Process and visualize
pipe = (
    Pipeline()
    .chunk_with("recursive", chunk_size=512)
    .refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
    .store_in("chroma", collection_name="docs")
    .export_with("json", file="chunks.jsonl")
    .export_with("datasets", path="./dataset")
)
doc = pipe.run(text)

# Visualize the chunks
viz = Visualizer(theme="pastel")
viz.print(doc.chunks, full_text=doc.content)

# Save visualization
viz.save("chunks.html", doc.chunks, full_text=doc.content)

# Load dataset for further processing
from datasets import load_from_disk

dataset = load_from_disk("./dataset")
print(dataset)

# Create custom porters by extending BasePorter:
from chonkie import BasePorter, Chunk
from chonkie.pipeline import porter


@porter("csv")
class CSVPorter(BasePorter):
    """Exports chunks to a CSV file with id, text, and token_count columns."""

    def export(self, chunks: list[Chunk], file: str = "chunks.csv"):
        import csv

        with open(file, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['id', 'text', 'token_count'])
            for chunk in chunks:
                writer.writerow([chunk.id, chunk.text, chunk.token_count])


# Use in pipeline
from chonkie import Pipeline

pipe = Pipeline().export_with("csv", file="output.csv")

# Available built-in themes:
from chonkie import Visualizer

# Built-in light themes
viz1 = Visualizer(theme="pastel")
viz2 = Visualizer(theme="tiktokenizer")
viz3 = Visualizer(theme="ocean_breeze")

# Built-in dark themes
viz4 = Visualizer(theme="tiktokenizer_dark")
viz5 = Visualizer(theme="pastel_dark")
viz6 = Visualizer(theme="midnight")

# A theme can also be a custom list of hex colors
# (red, teal, blue, orange, mint)
viz7 = Visualizer(theme=[
    "#FF6B6B",
    "#4ECDC4",
    "#45B7D1",
    "#FFA07A",
    "#98D8C8",
])