Pure Python module to hyphenate text using existing Hunspell hyphenation dictionaries
npx @tessl/cli install tessl/pypi-pyphen@0.17.0

A pure Python text hyphenation library that uses existing Hunspell hyphenation dictionaries to provide automatic word breaking at syllable boundaries. Pyphen offers comprehensive hyphenation capabilities through multiple methods including position detection, word wrapping with width constraints, and hyphen insertion, supporting over 40 languages through included LibreOffice dictionaries.
pip install pyphen

import pyphen

Access to all public functionality:

from pyphen import Pyphen, LANGUAGES, language_fallback

import pyphen
# Create hyphenation instance for a language
dic = pyphen.Pyphen(lang='en_US')
# Get word with hyphens inserted at all valid positions
hyphenated = dic.inserted('hyphenation')
print(hyphenated) # 'hy-phen-ation'
# Iterate through all possible hyphenation splits
for first_part, second_part in dic.iterate('hyphenation'):
    print(f"{first_part} | {second_part}")
# Output (longest first part first; matches 'hy-phen-ation' above):
# hyphen | ation
# hy | phenation
# Wrap word to fit within specific width
wrapped = dic.wrap('hyphenation', width=8)
if wrapped:
    first, second = wrapped
    print(f"{first} {second}") # 'hyphen- ation'
# Find all valid hyphenation positions
positions = dic.positions('hyphenation')
print(positions) # [2, 6] (DataInt objects)

Discover available languages and find appropriate fallback languages using Unicode locale inheritance.
LANGUAGES: dict[str, Path] # { .api }

Dictionary mapping language codes to hyphenation dictionary file paths. Contains both full codes (e.g., 'en_US', 'de_DE') and short codes (e.g., 'en', 'de') for over 40 supported languages.
def language_fallback(language: str) -> str | None:
"""
Get a fallback language available in dictionaries using Unicode locale inheritance.
Args:
language (str): Language code with potential region/script variants (e.g., 'en-US', 'sr-Latn')
Returns:
str | None: Available language code in LANGUAGES, or None if no fallback found
"""

Main interface for text hyphenation with configurable parameters and multiple hyphenation methods.
class Pyphen:
def __init__(
self,
filename: str | Path | None = None,
lang: str | None = None,
left: int = 2,
right: int = 2,
cache: bool = True
):
"""
Create hyphenation instance for specified language or dictionary file.
Args:
filename (str | Path, optional): Path to custom hyph_*.dic file
lang (str, optional): Language code for built-in dictionary
left (int): Minimum characters in first syllable (default: 2)
right (int): Minimum characters in last syllable (default: 2)
cache (bool): Whether to cache hyphenation patterns (default: True)
Raises:
KeyError: If specified language is not available in LANGUAGES
"""

Find where words can be hyphenated and access position metadata.
def positions(self, word: str) -> list[DataInt]:
"""
Get valid hyphenation positions in word, respecting left/right constraints.
Args:
word (str): Word to find hyphenation positions for
Returns:
list[DataInt]: Positions where word can be hyphenated. Each DataInt may contain
tuple (change, index, cut) for non-standard hyphenation patterns.
"""

Iterate through all possible hyphenation splits of a word.
def iterate(self, word: str) -> Generator[tuple[str, str], None, None]:
"""
Iterate over all hyphenation possibilities, longest first part first.
Args:
word (str): Word to hyphenate
Yields:
tuple[str, str]: (first_part, second_part) for each valid hyphenation point
"""
def __call__(self, word: str) -> Generator[tuple[str, str], None, None]:
"""Alias for iterate() method - makes Pyphen instances callable."""

Wrap words to fit within specific width constraints with hyphenation.
def wrap(self, word: str, width: int, hyphen: str = '-') -> tuple[str, str] | None:
"""
Get longest possible first part and remaining part that fits within width.
Args:
word (str): Word to wrap
width (int): Maximum length for first part including hyphen
hyphen (str): Hyphen character to use (default: '-')
Returns:
tuple[str, str] | None: (first_part_with_hyphen, remaining_part) or None
if no valid hyphenation fits within width
"""

Insert hyphens at all valid hyphenation points in a word.
def inserted(self, word: str, hyphen: str = '-') -> str:
"""
Get word with all possible hyphens inserted at valid positions.
Args:
word (str): Word to hyphenate
hyphen (str): Hyphen character to insert (default: '-')
Returns:
str: Word with hyphens inserted at all valid hyphenation points
"""

class DataInt(int):
"""
Integer with additional data attribute for hyphenation metadata.
Attributes:
data (tuple[str, int, int] | None): Non-standard hyphenation data containing
(change, index, cut) for character substitutions
"""
def __new__(cls, value: int, data: tuple[str, int, int] | None = None, reference: DataInt | None = None) -> DataInt:
"""
Create DataInt with optional hyphenation metadata.
Args:
value (int): Integer value (hyphenation position)
data (tuple, optional): Hyphenation metadata (change, index, cut)
reference (DataInt, optional): Copy data from another DataInt
"""

Pyphen includes hyphenation dictionaries for 40+ languages sourced from LibreOffice:
Major Languages: en_US, en_GB, fr, de, es, it, pt_PT, pt_BR, ru_RU, nl_NL, sv, da_DK, no_NO, pl_PL, cs_CZ, hu_HU, el_GR, tr_TR, fi_FI
Regional Variants: de_DE, de_AT, de_CH, pt_BR vs pt_PT, en_US vs en_GB, sr_Latn vs sr (Cyrillic)
Script Variants: Serbian Latin (sr_Latn) and Cyrillic (sr), multiple Chinese variants
Access all available languages:
import pyphen
# List all available language codes
print(list(pyphen.LANGUAGES.keys()))
# Check if language is available
if 'de_DE' in pyphen.LANGUAGES:
    dic = pyphen.Pyphen(lang='de_DE')
# Use fallback for unavailable variants
fallback = pyphen.language_fallback('de-AT-x-variant') # Returns 'de_AT'

Load hyphenation patterns from custom dictionary files:
from pathlib import Path
import pyphen
# Load from file path
custom_dict = Path('/path/to/custom_hyph.dic')
dic = pyphen.Pyphen(filename=custom_dict)
# Use existing dictionary path
dic = pyphen.Pyphen(filename=pyphen.LANGUAGES['fr'])

Control minimum syllable lengths for hyphenation:
import pyphen
# Default: minimum 2 characters on each side
dic = pyphen.Pyphen(lang='en_US')
print(dic.inserted('automatic')) # 'au-to-mat-ic'
# Require 4 characters at start, 3 at end
dic = pyphen.Pyphen(lang='en_US', left=4, right=3)
print(dic.inserted('automatic')) # 'auto-matic'
# No constraints (left=1, right=1)
dic = pyphen.Pyphen(lang='en_US', left=1, right=1)
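print(dic.inserted('automatic')) # interior breaks unchanged ('au-to-mat-ic'); left/right only filter positions near the word edges and never add a break between every letter

Some languages use character substitution during hyphenation: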
import pyphen
# Hungarian example with character changes
dic = pyphen.Pyphen(lang='hu', left=1, right=1)
word = 'kulissza'
# Standard positions show metadata
for pos in dic.positions(word):
    if pos.data:
        change, index, cut = pos.data
        print(f"Position {pos}: change '{change}' at index {index}, cut {cut}")
# Iteration handles substitutions automatically
for first, second in dic.iterate(word):
    print(f"{first} | {second}")
# Output:
# kulisz | sza # 'ssz' becomes 'sz' + 'sz' across the break
# ku | lissza
# Inserted form shows final result
print(dic.inserted(word)) # 'ku-lisz-sza'

Dictionary loading is cached by default for better performance:
import pyphen
# First instance loads and caches dictionary
dic1 = pyphen.Pyphen(lang='en_US') # Loads dictionary
# Subsequent instances reuse cached dictionary
dic2 = pyphen.Pyphen(lang='en_US') # Uses cached dictionary
# Disable caching if needed (e.g., for memory-constrained environments)
dic3 = pyphen.Pyphen(lang='en_US', cache=False) # Reloads dictionary

import pyphen
# Handle unavailable languages
try:
    dic = pyphen.Pyphen(lang='unknown_language')
except KeyError as e:
    print(f"Language not available: {e}")
    # Use fallback or default language
    dic = pyphen.Pyphen(lang='en_US')
# Handle custom dictionary file errors
try:
    dic = pyphen.Pyphen(filename='/nonexistent/path.dic')
except (FileNotFoundError, PermissionError, UnicodeDecodeError) as e:
    print(f"Dictionary file error: {e}")
    # Fallback to built-in dictionary
    dic = pyphen.Pyphen(lang='en_US')