```bash
tessl install tessl/pypi-chonkie@1.5.0
```

The lightweight ingestion library for fast, efficient, and robust RAG pipelines.
File loading and text preprocessing components, including Fetchers for data loading and Chefs for parsing and preprocessing various file formats.
### Fetchers

Components for loading data from various sources.

#### BaseFetcher

Abstract base class for all fetcher implementations.

```python { .api }
from abc import ABC, abstractmethod
from typing import Any
class BaseFetcher(ABC):
"""
Base class for all fetcher implementations.
"""
def __init__(self): ...
@abstractmethod
def fetch(self, *args: Any, **kwargs: Any) -> Any:
"""
Fetches data from source.
Args:
*args: Positional arguments
**kwargs: Keyword arguments
Returns:
Fetched data
"""
...Fetches files from the local filesystem.
from pathlib import Path
from typing import Optional, Union
class FileFetcher(BaseFetcher):
"""
Fetches files from local filesystem.
"""
def __init__(self): ...
def fetch(
self,
path: Optional[str] = None,
dir: Optional[str] = None,
ext: Optional[list[str]] = None
) -> Union[Path, list[Path]]:
"""
Fetches single file or multiple files from directory.
Args:
path: Path to a specific file (mutually exclusive with dir)
dir: Directory to search for files (mutually exclusive with path)
ext: List of file extensions to filter (e.g., ['.txt', '.md'])
Returns:
Single Path if path is provided, list of Paths if dir is provided
Raises:
ValueError: If both path and dir are provided, or neither is provided
"""
...
def fetch_file(self, dir: str, name: str) -> Path:
"""
Fetches a specific file from a directory.
Args:
dir: Directory path
name: File name
Returns:
Path to the file
"""
...
def __call__(
self,
path: Optional[str] = None,
dir: Optional[str] = None,
ext: Optional[list[str]] = None
) -> Union[Path, list[Path]]:
"""
Allows calling fetcher as a function.
Args:
path: Path to a specific file
dir: Directory to search
ext: List of file extensions to filter
Returns:
Single Path or list of Paths
"""
...Usage example:
from chonkie import FileFetcher
fetcher = FileFetcher()
# Fetch single file
file_path = fetcher.fetch(path="document.txt")
# Fetch all files from directory
all_files = fetcher.fetch(dir="./documents")
# Fetch files with specific extensions
text_files = fetcher.fetch(dir="./documents", ext=[".txt", ".md"])
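
# Fetch a specific named file from a directory
readme = fetcher.fetch_file(dir="./documents", name="notes.txt")

# Fetchers are also callable, with the same signature as fetch()
more_files = fetcher(dir="./documents", ext=[".txt", ".md"])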
# Use in pipeline
from chonkie import Pipeline
pipe = (
    Pipeline()
    .fetch_from("file", dir="./documents", ext=[".txt", ".md"])
    .process_with("text")
    .chunk_with("recursive")
)
docs = pipe.run()
```

### Chefs

Components for parsing and preprocessing text from various file formats.

#### BaseChef

Abstract base class for all chef implementations.

```python { .api }
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Union

from chonkie import Document
class BaseChef(ABC):
"""
Base class for all chef implementations that process files into documents.
"""
def __init__(self): ...
@abstractmethod
def process(self, path: Union[str, Path]) -> Document:
"""
Processes a file into a document.
Args:
path: File path
Returns:
Document object
"""
...
@abstractmethod
def parse(self, text: str) -> Document:
"""
Parses text into a document.
Args:
text: Raw text content
Returns:
Document object
"""
...
def process_batch(
self,
paths: Union[list[str], list[Path]]
) -> list[Document]:
"""
Processes multiple files in batch.
Args:
paths: List of file paths
Returns:
List of Document objects
"""
...
def read(self, path: Union[str, Path]) -> str:
"""
Reads file content.
Args:
path: File path
Returns:
File content as string
"""
...
def __call__(self, path: Union[str, Path]) -> Document:
"""
Allows calling chef as a function.
Args:
path: File path
Returns:
Document object
"""
...Processes plain text files into documents.
class TextChef(BaseChef):
"""
Processes plain text files into documents.
"""
def process(self, path: Union[str, Path]) -> Document:
"""
Processes a text file into a document.
Args:
path: Path to text file
Returns:
Document with file content and metadata
"""
...
def parse(self, text: str) -> Document:
"""
Parses text into a document.
Args:
text: Plain text content
Returns:
Document with text content
"""
...Usage example:
from chonkie import TextChef
chef = TextChef()
# Process file
doc = chef.process("document.txt")
print(doc.content)
print(doc.metadata)
# Parse text directly
doc = chef.parse("This is some text content.")
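
# process_batch handles multiple files in one call
docs = chef.process_batch(["intro.txt", "chapter1.txt"])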
# Use in pipeline
from chonkie import Pipeline
pipe = (
    Pipeline()
    .fetch_from("file", dir="./documents")
    .process_with("text")
    .chunk_with("recursive")
)
```

#### MarkdownChef

Processes markdown files, extracting tables, code blocks, and images.

```python { .api }
class MarkdownChef(BaseChef):
"""
Processes markdown files extracting tables, code blocks, and images.
Args:
tokenizer: Tokenizer instance or identifier (default: 'character')
"""
def __init__(self, tokenizer: Union[TokenizerProtocol, str] = "character"): ...
def process(self, path: Union[str, Path]) -> MarkdownDocument:
"""
Processes a markdown file into a document.
Args:
path: Path to markdown file
Returns:
MarkdownDocument with extracted elements
"""
...
def parse(self, text: str) -> MarkdownDocument:
"""
Parses markdown text into a document.
Args:
text: Markdown content
Returns:
MarkdownDocument with extracted elements
"""
...
def prepare_tables(self, markdown: str) -> list[MarkdownTable]:
"""
Extracts tables from markdown.
Args:
markdown: Markdown content
Returns:
List of MarkdownTable objects
"""
...
def prepare_code(self, markdown: str) -> list[MarkdownCode]:
"""
Extracts code blocks from markdown.
Args:
markdown: Markdown content
Returns:
List of MarkdownCode objects
"""
...Usage example:
from chonkie import MarkdownChef
chef = MarkdownChef()
# Process markdown file
doc = chef.process("README.md")
# Access extracted elements
for table in doc.tables:
    print(f"Table at {table.start_index}: {table.content}")

for code in doc.code:
    print(f"Code block ({code.language}): {code.content}")

for image in doc.images:
    print(f"Image: {image.alias} -> {image.link}")
# Parse markdown text
markdown_text = """
# Title

Some text here.

```python
print("Hello")
```

| Col1 | Col2 |
|---|---|
| A | B |
"""
doc = chef.parse(markdown_text)
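
# Extraction helpers can also be called directly on raw markdown
tables = chef.prepare_tables(markdown_text)
code_blocks = chef.prepare_code(markdown_text)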
# Use in pipeline
from chonkie import Pipeline

pipe = (
    Pipeline()
    .fetch_from("file", dir="./docs", ext=[".md"])
    .process_with("markdown")
    .chunk_with("recursive", recipe="markdown")
)
````
#### TableChef
Specialized chef for processing tabular data.
```python { .api }
class TableChef(BaseChef):
"""
Specialized chef for processing tabular data.
Args:
tokenizer: Tokenizer instance or identifier (default: 'character')
"""
def __init__(self, tokenizer: Union[TokenizerProtocol, str] = "character"): ...
def process(self, path: Union[str, Path]) -> Document:
"""
Processes a table file into a document.
Args:
path: Path to table file
Returns:
Document with table content
"""
...
def parse(self, text: str) -> Document:
"""
Parses table text into a document.
Args:
text: Table content (markdown table format)
Returns:
Document with table data
"""
...Usage example:
from chonkie import TableChef
chef = TableChef()
# Process table file
doc = chef.process("data.csv")
# Parse table text
table_text = """
| Name | Age | City |
|-------|-----|----------|
| Alice | 30 | New York |
| Bob | 25 | London |
"""
doc = chef.parse(table_text)
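
# The parsed table is exposed as the document's content
print(doc.content)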
# Use with TableChunker
from chonkie import TableChunker
chunker = TableChunker(chunk_size=3) # 3 rows per chunk
chunks = chunker(doc.content)
```

All data processing components are available from the main package:

```python
from chonkie import (
    BaseFetcher,
    FileFetcher,
    BaseChef,
    TextChef,
    MarkdownChef,
    TableChef,
)
```

Data processing components are used at the beginning of pipelines:

```python
from chonkie import Pipeline
# Fetch and process workflow
pipe = (
    Pipeline()
    .fetch_from("file", dir="./documents", ext=[".txt", ".md"])
    .process_with("markdown")
    .chunk_with("recursive", recipe="markdown")
    .refine_with("embeddings")
)

docs = pipe.run()
```

Component aliases:

- Fetchers: `file`
- Chefs: `text`, `markdown`, `table`

Custom fetchers can be registered with the `@fetcher` decorator:

```python
from chonkie import BaseFetcher, Pipeline
from chonkie.pipeline import fetcher
@fetcher("custom")
class CustomFetcher(BaseFetcher):
    def fetch(self, url: str) -> str:
        # Minimal sketch: pull raw text from a custom source over HTTP;
        # swap in whatever retrieval logic your source needs
        import urllib.request
        with urllib.request.urlopen(url) as response:
            return response.read().decode("utf-8")

# Use in pipeline
pipe = Pipeline().fetch_from("custom", url="https://example.com")
```

Custom chefs can be registered with the `@chef` decorator:

```python
from chonkie import BaseChef, Document, Pipeline
from chonkie.pipeline import chef
@chef("custom")
class CustomChef(BaseChef):
    def process(self, path):
        text = self.read(path)
        return self.parse(text)

    def parse(self, text):
        # Custom parsing logic (sketch: normalize whitespace before wrapping)
        processed_text = text.strip()
        return Document(content=processed_text)

# Use in pipeline
pipe = Pipeline().process_with("custom")
```

A complete end-to-end pipeline:

```python
from chonkie import MarkdownDocument, Pipeline
# End-to-end document processing pipeline
pipe = (
    Pipeline()
    # Load markdown files
    .fetch_from("file", dir="./documentation", ext=[".md"])
    # Parse markdown content
    .process_with("markdown")
    # Chunk using markdown-aware rules
    .chunk_with("recursive", recipe="markdown", chunk_size=1024)
    # Re-chunk semantically
    .chunk_with("semantic", chunk_size=512, threshold=0.75)
    # Add context overlap
    .refine_with("overlap", context_size=128)
    # Add embeddings
    .refine_with("embeddings", embedding_model="all-MiniLM-L6-v2")
    # Store in vector database
    .store_in("chroma", collection_name="docs")
    # Export backup
    .export_with("json", file="chunks_backup.jsonl")
)

# Execute pipeline
documents = pipe.run()

# Access results
for doc in documents:
    print(f"Processed: {doc.metadata.get('source')}")
    print(f"Chunks: {len(doc.chunks)}")
    if isinstance(doc, MarkdownDocument):
        print(f"Tables: {len(doc.tables)}")
        print(f"Code blocks: {len(doc.code)}")
```