Python Bidi layout wrapping the Rust crate unicode-bidi
npx @tessl/cli install tessl/pypi-python-bidi@0.6.0

Python BiDi provides bi-directional (BiDi) text layout support for Python applications, enabling correct display of mixed left-to-right and right-to-left text (such as Arabic or Hebrew mixed with English). The library offers two implementations: a high-performance Rust-based implementation (default) and a pure Python implementation for compatibility.
Python-bidi uses a dual-implementation approach to provide both performance and compatibility:
- Rust implementation (default): wraps the unicode-bidi Rust crate, compiled as a Python extension module (.bidi). Implements a more recent version of the Unicode BiDi algorithm.
- Python implementation: a pure Python implementation exposing the same core functions (get_display, get_base_level) with identical behavior for standard use cases.
- The default import (from bidi import) uses the Rust implementation, while the Python implementation is explicitly accessible via from bidi.algorithm import.

pip install python-bidi

Main API (Rust-based implementation):
from bidi import get_display, get_base_level

Pure Python implementation:
from bidi.algorithm import get_display, get_base_level

from bidi import get_display
# Hebrew text example
hebrew_text = "שלום"
display_text = get_display(hebrew_text)
print(display_text) # Outputs correctly ordered text for display
# Mixed text with numbers
mixed_text = "1 2 3 ניסיון"
display_text = get_display(mixed_text)
print(display_text) # "ןויסינ 3 2 1"
# Working with bytes and encoding
hebrew_bytes = "שלם".encode('utf-8')
display_bytes = get_display(hebrew_bytes, encoding='utf-8')
print(display_bytes.decode('utf-8'))
# Override base direction
text = "hello world"
rtl_display = get_display(text, base_dir='R')
print(rtl_display)
# Debug mode to see algorithm steps
debug_output = get_display("hello שלום", debug=True)
# Outputs algorithm steps to stderr

Converts logical text order to visual display order according to the Unicode BiDi algorithm.
def get_display(
str_or_bytes: StrOrBytes,
encoding: str = "utf-8",
base_dir: Optional[str] = None,
debug: bool = False
) -> StrOrBytes:
"""
Convert text from logical order to visual display order.
Args:
str_or_bytes: Input text as string or bytes
encoding: Encoding to use if input is bytes (default: "utf-8")
base_dir: Override base direction ('L' for LTR, 'R' for RTL)
debug: Enable debug output to stderr (default: False)
Returns:
Processed text in same type as input (str or bytes)
"""

Determines the base paragraph direction of text.
def get_base_level(text: str) -> int:
"""
Get the base embedding level of the first paragraph in text.
Args:
text: Input text string
Returns:
Base level (0 for LTR, 1 for RTL)
"""

For compatibility, or when the Rust implementation is not available, use the pure Python implementation.
# From bidi.algorithm module
def get_display(
str_or_bytes: StrOrBytes,
encoding: str = "utf-8",
upper_is_rtl: bool = False,
base_dir: Optional[str] = None,
debug: bool = False
) -> StrOrBytes:
"""
Pure Python implementation of BiDi text layout.
Args:
str_or_bytes: Input text as string or bytes
encoding: Encoding to use if input is bytes (default: "utf-8")
upper_is_rtl: Treat uppercase chars as strong RTL for debugging (default: False)
base_dir: Override base direction ('L' for LTR, 'R' for RTL)
debug: Enable debug output to stderr (default: False)
Returns:
Processed text in same type as input (str or bytes)
"""
def get_base_level(text, upper_is_rtl: bool = False) -> int:
"""
Get base embedding level using Python implementation.
Args:
text: Input text string
upper_is_rtl: Treat uppercase chars as strong RTL for debugging (default: False)
Returns:
Base level (0 for LTR, 1 for RTL)
"""

For advanced usage, the Python implementation exposes internal algorithm functions.
def get_empty_storage() -> dict:
"""
Return empty storage skeleton for testing and advanced usage.
Returns:
Dictionary with keys: base_level, base_dir, chars, runs
"""
def get_embedding_levels(text, storage, upper_is_rtl: bool = False, debug: bool = False):
"""
Get paragraph embedding levels and populate storage with character data.
Args:
text: Input text string
storage: Storage dictionary from get_empty_storage()
upper_is_rtl: Treat uppercase chars as strong RTL (default: False)
debug: Enable debug output (default: False)
"""
def debug_storage(storage, base_info: bool = False, chars: bool = True, runs: bool = False):
"""
Display debug information for storage object.
Args:
storage: Storage dictionary
base_info: Show base level and direction info (default: False)
chars: Show character data (default: True)
runs: Show level runs (default: False)
"""

Access to Unicode character mirroring data.
from bidi.mirror import MIRRORED
# MIRRORED is a dictionary mapping characters to their mirrored versions
# Example: MIRRORED['('] == ')'

Use the pybidi command for text processing from the command line.
# Basic usage
pybidi "your text here"
# Read from stdin
echo "your text here" | pybidi
# Use Rust implementation (default is Python)
pybidi -r "your text here"
# Override base direction
pybidi -b R "your text here"
# Enable debug output
pybidi -d "your text here"
# Specify encoding
pybidi -e utf-8 "your text here"
# For Python implementation, treat uppercase as RTL (debugging)
pybidi -u "Your Text HERE"

Access version information for the package:
from bidi import VERSION, VERSION_TUPLE
# VERSION is a string like "0.6.0"
# VERSION_TUPLE is a tuple like (0, 6, 0)

The package provides a main function for command-line usage:
from bidi import main
def main():
"""
Command-line interface function for pybidi.
Processes command line arguments and applies BiDi algorithm to input text.
Used by the pybidi console script. Reads from arguments or stdin,
supports all CLI options (encoding, base direction, debug, etc.).
Returns:
None (outputs processed text to stdout)
"""

from typing import Union, Optional, List, Dict, Any
from collections import deque
# Type aliases used in the API
StrOrBytes = Union[str, bytes]
# Storage structure (Python implementation)
Storage = Dict[str, Any] # Contains:
# {
# "base_level": int, # Base embedding level (0 for LTR, 1 for RTL)
# "base_dir": str, # Base direction ('L' or 'R')
# "chars": List[Dict], # Character data with level, type, original type
# "runs": deque # Level runs for processing
# }
# Character object structure (within Storage["chars"])
Character = Dict[str, Union[str, int]] # Contains:
# {
# "ch": str, # The character
# "level": int, # Embedding level
# "type": str, # BiDi character type
# "orig": str # Original BiDi character type
# }

Rust implementation (default):
- from bidi import get_display, get_base_level (uses compiled .bidi module)
- No upper_is_rtl parameter

Python implementation:
- from bidi.algorithm import get_display, get_base_level
- Supports the upper_is_rtl parameter for debugging

Both implementations handle common error cases gracefully:
- Encoding errors: UnicodeDecodeError or UnicodeEncodeError
- ValueError
- Invalid base_dir values: Rust implementation raises ValueError for values other than 'L', 'R', or None
- get_base_level_inner() raises ValueError for text with no paragraphs
- ValueError with message "base_dir can be 'L', 'R' or None"
- AssertionError for invalid character types
- Debug output is written to sys.stderr, does not raise exceptions

Supports any encoding that Python's str.encode() and bytes.decode() support, including:
from bidi import get_display
# English with Hebrew
text = "Hello שלום World"
display = get_display(text)
print(display) # Correctly ordered for display
# Numbers with RTL text
text = "הספר עולה 25 שקל"
display = get_display(text)
print(display) # Numbers maintain LTR order within RTL text

from bidi import get_display
# Hebrew text in different encoding
hebrew_cp1255 = "שלום".encode('cp1255')
display = get_display(hebrew_cp1255, encoding='cp1255')
print(display.decode('cp1255'))

from bidi.algorithm import get_display, debug_storage, get_empty_storage, get_embedding_levels
# Enable debug output
text = "Hello שלום"
display = get_display(text, debug=True)
# Outputs detailed algorithm steps to stderr
# Manual debugging with storage
storage = get_empty_storage()
get_embedding_levels(text, storage)
debug_storage(storage, base_info=True, chars=True, runs=True)