Comprehensive Chinese character to Pinyin conversion library with intelligent word segmentation and multiple output styles
—
This document covers extended functionality: custom converters, tone sandhi processing, segmentation control, and specialized mixins for advanced pinyin-processing scenarios.
The main Pinyin class provides configurable converter backends for advanced customization.
class Pinyin:
"""Main pinyin conversion class with configurable converter backend."""
def __init__(self, converter=None):
"""
Initialize Pinyin converter.
Parameters:
- converter: Custom converter instance (default: DefaultConverter)
"""
def pinyin(self, hans, style=Style.TONE, heteronym=False, errors='default', strict=True):
"""Convert Chinese characters to pinyin using configured converter."""
def lazy_pinyin(self, hans, style=Style.NORMAL, errors='default', strict=True):
"""Convert Chinese characters to pinyin (lazy mode) using configured converter."""from pypinyin.core import Pinyin
from pypinyin.converter import DefaultConverter, UltimateConverter
from pypinyin import Style
# Use default converter
pinyin_converter = Pinyin()
result = pinyin_converter.pinyin('中国')
print(result) # [['zhōng'], ['guó']]
# Use advanced converter
ultimate_converter = UltimateConverter()
pinyin_converter = Pinyin(converter=ultimate_converter)
result = pinyin_converter.pinyin('中国')
print(result) # Enhanced conversion with ultimate converter
# Custom converter configuration
custom_converter = DefaultConverter()
# Configure custom converter settings...
pinyin_converter = Pinyin(converter=custom_converter)Pluggable converter implementations providing different processing backends.
class DefaultConverter:
"""Basic pinyin converter implementation."""
def __init__(self):
"""Initialize default converter with standard settings."""
def convert(self, han, style, errors, strict):
"""
Convert single character to pinyin.
Parameters:
- han (str): Chinese character to convert
- style (Style): Output style
- errors (str): Error handling strategy
- strict (bool): Strict mode
Returns:
list: Pinyin pronunciations for the character
"""class UltimateConverter:
"""Enhanced converter with advanced processing options."""
def __init__(self):
"""Initialize ultimate converter with enhanced features."""
def convert(self, han, style, errors, strict):
"""Convert single character with enhanced processing."""from pypinyin.converter import DefaultConverter, UltimateConverter
from pypinyin.core import Pinyin
from pypinyin import Style
# Compare converter outputs
text = '重庆'
# Default converter
default_conv = DefaultConverter()
pinyin_default = Pinyin(converter=default_conv)
result1 = pinyin_default.pinyin(text)
print(f"Default: {result1}")
# Ultimate converter
ultimate_conv = UltimateConverter()
pinyin_ultimate = Pinyin(converter=ultimate_conv)
result2 = pinyin_ultimate.pinyin(text)
print(f"Ultimate: {result2}")
# Custom converter subclass
class CustomConverter(DefaultConverter):
    """Template subclass showing where custom conversion logic can be hooked in.

    As written it behaves exactly like ``DefaultConverter``: ``convert``
    delegates to the parent and returns the parent's result unchanged.
    """

    def convert(self, han, style, errors, strict):
        """Convert ``han`` by delegating to ``DefaultConverter.convert``.

        Parameters:
        - han: text to convert, forwarded unchanged to the parent
        - style: output style, forwarded unchanged
        - errors: error handling strategy, forwarded unchanged
        - strict: strict-mode flag, forwarded unchanged

        Returns:
        Whatever the parent ``convert`` returns.
        """
        # NOTE(review): confirm this signature against the installed pypinyin
        # version -- the upstream converter may take an extra ``heteronym``
        # argument between ``style`` and ``errors``.
        # Custom processing logic
        result = super().convert(han, style, errors, strict)
        # Post-process result...
        return result
custom_conv = CustomConverter()
pinyin_custom = Pinyin(converter=custom_conv)
result3 = pinyin_custom.pinyin(text)
print(f"Custom: {result3}")Extended processing capabilities through contrib mixins and modules.
# pypinyin.contrib.tone_sandhi
class ToneSandhiMixin:
"""Mixin providing tone sandhi rule processing."""
def pre_handle_tone_sandhi(self, han_list):
"""Apply tone sandhi rules to character sequence."""Tone sandhi automatically applies tone change rules for natural pronunciation:
from pypinyin.contrib.tone_sandhi import ToneSandhiMixin
from pypinyin import lazy_pinyin, Style

# Tone sandhi only shows up in styles that carry tone information.
# lazy_pinyin defaults to the toneless Style.NORMAL, so Style.TONE is
# requested explicitly; otherwise the outputs below would have no tone marks.

# Enable tone sandhi in lazy_pinyin
result = lazy_pinyin('一个', style=Style.TONE, tone_sandhi=True)
print(result)  # ['yí', 'gè']  # 一 changes from tone 1 to tone 2
result = lazy_pinyin('不用', style=Style.TONE, tone_sandhi=True)
print(result)  # ['bú', 'yòng']  # 不 changes from tone 4 to tone 2

# Common tone sandhi patterns: (text, expected pinyin)
examples = [
    ('一天', ['yì', 'tiān']),  # 一 + 1st tone -> 4th tone
    ('一个', ['yí', 'gè']),    # 一 + 4th tone -> 2nd tone
    ('一些', ['yì', 'xiē']),   # 一 + 1st tone -> 4th tone
    ('不对', ['bú', 'duì']),   # 不 + 4th tone -> 2nd tone
    ('不好', ['bù', 'hǎo']),   # 不 + 3rd tone -> unchanged (stays 4th tone)
]
for text, expected in examples:
    result = lazy_pinyin(text, style=Style.TONE, tone_sandhi=True)
    # Show the expected value next to the actual one so mismatches are visible.
    print(f"{text}: {result} (expected: {expected})")
# pypinyin.contrib.uv
class V2UMixin:
"""Mixin handling v/ü character conversion."""
def pre_handle_v_to_u(self, han_list):
"""Convert 'v' characters to 'ü' in output."""from pypinyin import lazy_pinyin, Style
# Standard output with 'v'
result = lazy_pinyin('女', style=Style.TONE2)
print(result) # ['nv3']
# Convert 'v' to 'ü'
result = lazy_pinyin('女', style=Style.TONE2, v_to_u=True)
print(result) # ['nü3']
# Works with different styles
result = lazy_pinyin('绿', style=Style.NORMAL, v_to_u=True)
print(result) # ['lü'] instead of ['lv']# pypinyin.contrib.neutral_tone
class NeutralToneWith5Mixin:
"""Mixin for neutral tone handling with number 5."""
def pre_handle_neutral_tone_with_5(self, han_list):
"""Use '5' for neutral tone in numeric styles."""from pypinyin import lazy_pinyin, Style
# Standard neutral tone representation
result = lazy_pinyin('的', style=Style.TONE3)
print(result) # ['de'] (no tone number for neutral tone)
# Use '5' for neutral tone
result = lazy_pinyin('的', style=Style.TONE3, neutral_tone_with_five=True)
print(result) # ['de5']
# Examples with neutral tone particles
particles = ['的', '了', '着', '过']
for particle in particles:
standard = lazy_pinyin(particle, style=Style.TONE3)
with_five = lazy_pinyin(particle, style=Style.TONE3, neutral_tone_with_five=True)
print(f"{particle}: {standard} -> {with_five}")Word boundary detection modules for accurate pronunciation through proper segmentation.
# pypinyin.seg.mmseg
def seg(hans):
"""
Segment Chinese text using MMSeg algorithm.
Parameters:
- hans (str): Chinese text to segment
Returns:
list: List of segmented words
"""from pypinyin.seg.mmseg import seg
from pypinyin import lazy_pinyin
# Compare with and without segmentation
text = '研究生命的起源'
# Without proper segmentation (character by character)
result1 = lazy_pinyin(text)
print(f"Character-by-character: {result1}")
# With MMSeg segmentation
# NOTE(review): upstream pypinyin appears to expose this segmenter as
# ``seg.cut(text)``; confirm the call form against the installed version.
# list() materialises the result so it can be printed and reused even if the
# segmenter yields words lazily.
segments = list(seg(text))
print(f"Segments: {segments}")  # Better word boundaries
# Apply segmentation for better pronunciation.
# Pass the word list itself: lazy_pinyin accepts a list of pre-segmented words.
# Joining the words with spaces and passing a string would put the separator
# characters into the pinyin output.
result2 = lazy_pinyin(segments)
print(f"Segmented: {result2}")# pypinyin.seg.simpleseg
def seg(hans):
"""
Simple character-by-character segmentation.
Parameters:
- hans (str): Chinese text to segment
Returns:
list: List of individual characters
"""from pypinyin.seg.simpleseg import seg
text = '中华人民共和国'
segments = seg(text)
print(segments) # ['中', '华', '人', '民', '共', '和', '国']Direct tone style conversion functions for format transformation.
# pypinyin.contrib.tone_convert
def tone_to_tone2(tone_pinyin):
"""Convert tone marks to tone2 format."""
def tone2_to_tone(tone2_pinyin):
"""Convert tone2 format to tone marks."""
def tone_to_tone3(tone_pinyin):
"""Convert tone marks to tone3 format."""
def tone3_to_tone(tone3_pinyin):
"""Convert tone3 format to tone marks."""
# Additional conversion functions for all style pairs...from pypinyin.contrib.tone_convert import (
tone_to_tone2, tone2_to_tone,
tone_to_tone3, tone3_to_tone
)
# Convert between tone formats
original = 'zhōng guó'


def _convert_each(text, convert):
    """Apply a single-syllable converter to each space-separated syllable.

    The tone_convert helpers are documented for one pinyin syllable at a
    time, so a multi-syllable string is split, converted piecewise and
    re-joined with single spaces.
    """
    return ' '.join(convert(syllable) for syllable in text.split())


# To tone2 (numbers after vowels)
tone2_result = _convert_each(original, tone_to_tone2)
print(f"Tone2: {tone2_result}")  # zho1ng guo2
# To tone3 (numbers after pinyin)
tone3_result = _convert_each(original, tone_to_tone3)
print(f"Tone3: {tone3_result}")  # zhong1 guo2
# Back to tone marks
back_to_tone = _convert_each(tone3_result, tone3_to_tone)
print(f"Back to tone: {back_to_tone}")  # zhōng guó
# Chain conversions (reusing the values computed above)
conversion_chain = [
    ('Original', original),
    ('Tone2', tone2_result),
    ('Tone3', tone3_result),
    ('Back', back_to_tone),
]
for label, result in conversion_chain:
print(f"{label}: {result}")Creating specialized converters for domain-specific needs:
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin
from pypinyin import Style
class DomainSpecificConverter(DefaultConverter):
    """Custom converter for domain-specific pronunciation.

    Characters listed in the selected domain's override table use the
    table's pronunciation; everything else falls back to
    ``DefaultConverter.convert``.
    """

    def __init__(self, domain='general'):
        """Create a converter for ``domain``.

        Parameters:
        - domain (str): key of the override table to use. Domains without a
          table (including the default ``'general'``) get an empty table, so
          every conversion falls through to the default behaviour.
        """
        super().__init__()
        self.domain = domain
        self.domain_dict = self._load_domain_dict()

    def _load_domain_dict(self):
        """Load domain-specific pronunciation mappings.

        Returns:
        dict: character -> list of pronunciations for ``self.domain``, or an
        empty dict when the domain has no overrides.
        """
        domain_mappings = {
            'medical': {
                '症': ['zhèng'],  # Medical symptom context
                '脉': ['mài'],    # Pulse context
            },
            'legal': {
                '法': ['fǎ'],  # Law context
                '案': ['àn'],  # Legal case context
            },
        }
        return domain_mappings.get(self.domain, {})

    def convert(self, han, style, errors, strict):
        """Convert with domain-specific rules.

        Parameters:
        - han (str): text to convert
        - style: requested output style (only honoured on the fallback path)
        - errors (str): error handling strategy, forwarded to the fallback
        - strict (bool): strict-mode flag, forwarded to the fallback

        Returns:
        list: the domain override for ``han`` when one exists, otherwise the
        result of ``DefaultConverter.convert``.
        """
        # NOTE(review): confirm this signature against the installed pypinyin
        # version -- the upstream converter may take an extra ``heteronym``
        # argument.
        # Check domain dictionary first (single lookup instead of `in` + index)
        domain_pronunciations = self.domain_dict.get(han)
        if domain_pronunciations is not None:
            # TODO: format according to the requested style; overrides are
            # currently returned tone-marked regardless of ``style``.
            # Return a copy so callers that mutate the result cannot corrupt
            # the override table.
            return list(domain_pronunciations)
        # Fall back to default conversion
        return super().convert(han, style, errors, strict)
# Use custom converter
medical_converter = DomainSpecificConverter(domain='medical')
medical_pinyin = Pinyin(converter=medical_converter)
medical_text = '症状分析'
result = medical_pinyin.pinyin(medical_text)
print(f"Medical context: {result}")Integrating multiple advanced features for comprehensive processing:
from pypinyin import lazy_pinyin, Style
from pypinyin.seg.mmseg import seg as mmseg_seg
from pypinyin.contrib.tone_convert import tone_to_tone3
def advanced_processing_pipeline(text):
    """Comprehensive processing with multiple advanced features.

    Parameters:
    - text (str): Chinese text to process

    Returns:
    dict with three keys:
    - 'segments': list of words produced by the MMSeg segmenter
    - 'pinyin_advanced': list of tone-marked syllables from ``lazy_pinyin``
    - 'tone3_format': the same syllables converted to tone3, space-joined
    """
    # Step 1: Intelligent segmentation
    # list() materialises the result so it prints as words and can be returned
    # safely even if the segmenter yields lazily.
    segments = list(mmseg_seg(text))
    print(f"Segments: {segments}")
    # Step 2: Pinyin conversion with tone sandhi
    pinyin_result = lazy_pinyin(
        text,
        style=Style.TONE,
        tone_sandhi=True,
        v_to_u=True,
        neutral_tone_with_five=True,
    )
    print(f"Pinyin with advanced features: {pinyin_result}")
    # Step 3: Format conversion
    # tone_to_tone3 is documented for a single syllable, so convert each
    # syllable separately rather than one space-joined string.
    tone3_format = ' '.join(tone_to_tone3(syllable) for syllable in pinyin_result)
    print(f"Tone3 format: {tone3_format}")
    return {
        'segments': segments,
        'pinyin_advanced': pinyin_result,
        'tone3_format': tone3_format,
    }
# Example usage
text = '一个不错的研究生'
results = advanced_processing_pipeline(text)
# Access different processing results
for key, value in results.items():
print(f"{key}: {value}")Optimizing advanced feature usage for production scenarios:
from functools import lru_cache
from pypinyin.core import Pinyin
from pypinyin.converter import DefaultConverter
class OptimizedConverter(DefaultConverter):
    """Performance-optimized converter with caching.

    Each instance owns a single LRU cache, sized by ``cache_size``, around
    the parent class's ``convert``. Both ``convert`` and ``convert_cached``
    go through that one cache.
    """

    def __init__(self, cache_size=1000):
        """Create a converter with a per-instance LRU cache.

        Parameters:
        - cache_size (int): maximum number of distinct argument combinations
          kept in the cache
        """
        super().__init__()
        self.cache_size = cache_size
        # Wrap the parent's implementation once per instance. A class-level
        # @lru_cache on a method would key on ``self``, ignore ``cache_size``
        # and keep every instance alive for the lifetime of the class.
        self._memoized_convert = lru_cache(maxsize=cache_size)(super().convert)

    @staticmethod
    def _fresh_copy(cached):
        """Return a copy of a cached result (one level of nesting deep)."""
        if not isinstance(cached, list):
            return cached
        return [list(item) if isinstance(item, list) else item for item in cached]

    def convert(self, han, style, errors, strict):
        """Convert ``han``, reusing the cached result for repeated arguments.

        All arguments must be hashable because they form the cache key.
        A copy is returned so callers mutating the result cannot alter what
        later cache hits receive.
        """
        # NOTE(review): confirm this signature against the installed pypinyin
        # version -- the upstream converter may take an extra ``heteronym``
        # argument.
        return self._fresh_copy(self._memoized_convert(han, style, errors, strict))

    def convert_cached(self, han, style, errors, strict):
        """Cached conversion for performance.

        Kept for backward compatibility; equivalent to ``convert``.
        """
        return self.convert(han, style, errors, strict)
# Batch processing with optimized converter
def batch_process_optimized(texts):
    """Convert every text in ``texts`` using one shared caching converter.

    A single ``Pinyin`` processor backed by an ``OptimizedConverter`` with a
    5000-entry cache is built once and reused for the whole batch.

    Parameters:
    - texts: iterable of texts to convert

    Returns:
    list: one ``lazy_pinyin`` result per input text, in input order
    """
    processor = Pinyin(converter=OptimizedConverter(cache_size=5000))
    return [processor.lazy_pinyin(item) for item in texts]
# Example with large dataset
large_dataset = ['中国', '美国', '英国'] * 1000 # Repeated texts
results = batch_process_optimized(large_dataset)
print(f"Processed {len(results)} texts efficiently")Install with Tessl CLI
npx tessl i tessl/pypi-pypinyin