tessl/pypi-pypinyin

Comprehensive Chinese character to Pinyin conversion library with intelligent word segmentation and multiple output styles.

—

Pending

Overview

Eval results

Files

Advanced Features

Name: tessl/pypi-pypinyin
Author: tessl

Extended functionality including custom converters, tone sandhi processing, segmentation control, and specialized mixins for advanced pinyin processing scenarios.

Capabilities

Core Pinyin Class

The main Pinyin class provides configurable converter backends for advanced customization.

class Pinyin:
    """Main pinyin conversion class with configurable converter backend.

    The converter passed to the constructor is the object that performs the
    actual conversion; pinyin() and lazy_pinyin() both use it.
    """

    def __init__(self, converter=None):
        """
        Initialize Pinyin converter.

        Parameters:
        - converter: Custom converter instance (default: DefaultConverter).
          Passing None selects the default.
        """

    def pinyin(self, hans, style=Style.TONE, heteronym=False, errors='default', strict=True):
        """
        Convert Chinese characters to pinyin using configured converter.

        Parameters:
        - hans: Chinese text to convert
        - style (Style): Output style (default: Style.TONE)
        - heteronym (bool): presumably whether to return every reading of a
          polyphonic character -- TODO confirm against the pypinyin docs
        - errors (str): Error handling strategy (default: 'default')
        - strict (bool): Strict mode (default: True)

        Returns:
        list: Nested list of readings, e.g. [['zhōng'], ['guó']] for '中国'
        """

    def lazy_pinyin(self, hans, style=Style.NORMAL, errors='default', strict=True):
        """
        Convert Chinese characters to pinyin (lazy mode) using configured converter.

        Takes the same arguments as pinyin() except that there is no
        heteronym option and the default style is Style.NORMAL.
        """

Usage Examples

from pypinyin.core import Pinyin
from pypinyin.converter import DefaultConverter, UltimateConverter
from pypinyin import Style

# Default backend: construct Pinyin without arguments.
default_engine = Pinyin()
print(default_engine.pinyin('中国'))  # [['zhōng'], ['guó']]

# UltimateConverter backend, passed in explicitly.
ultimate_engine = Pinyin(converter=UltimateConverter())
print(ultimate_engine.pinyin('中国'))  # Output produced via the UltimateConverter backend

# A DefaultConverter instance created up front so it can be customised
# before being handed to Pinyin.
base_converter = DefaultConverter()
# Configure custom converter settings...
custom_engine = Pinyin(converter=base_converter)

Converter Classes

Pluggable converter implementations providing different processing backends.

DefaultConverter

class DefaultConverter:
    """Basic pinyin converter implementation.

    NOTE(review): in the pypinyin library, convert() appears to take an
    additional ``heteronym`` argument between ``style`` and ``errors`` (plus
    ``**kwargs``). Confirm the signature below against the installed version
    before overriding it in a subclass.
    """

    def __init__(self):
        """Initialize default converter with standard settings."""

    def convert(self, han, style, errors, strict):
        """
        Convert single character to pinyin.

        Parameters:
        - han (str): Chinese character to convert
        - style (Style): Output style
        - errors (str): Error handling strategy
        - strict (bool): Strict mode

        Returns:
        list: Pinyin pronunciations for the character
        """

UltimateConverter

class UltimateConverter:
    """Enhanced converter with advanced processing options.

    Exposes the same convert() parameter list as DefaultConverter, so either
    class can be passed as the ``converter`` argument of Pinyin.
    """

    def __init__(self):
        """Initialize ultimate converter with enhanced features."""

    def convert(self, han, style, errors, strict):
        """
        Convert single character with enhanced processing.

        Parameters:
        - han (str): Chinese character to convert
        - style (Style): Output style
        - errors (str): Error handling strategy
        - strict (bool): Strict mode
        """

Usage Examples

from pypinyin.converter import DefaultConverter, UltimateConverter
from pypinyin.core import Pinyin
from pypinyin import Style

# Compare converter outputs
text = '重庆'

# Run the same text through each backend in turn and label the output.
# Each converter is instantiated only when its turn comes.
for label, converter_cls in (('Default', DefaultConverter), ('Ultimate', UltimateConverter)):
    backend = Pinyin(converter=converter_cls())
    print(f"{label}: {backend.pinyin(text)}")

# Custom converter subclass
class CustomConverter(DefaultConverter):
    """DefaultConverter subclass with a hook for post-processing results."""

    def convert(self, han, style, heteronym, errors, strict, **kwargs):
        """
        Convert `han` with the inherited logic, then post-process the result.

        The parameter list mirrors pypinyin's DefaultConverter.convert,
        including `heteronym`: Pinyin.pinyin() passes all of these arguments
        to the configured converter, so an override that omits one cannot be
        called by it.

        Parameters:
        - han (str): Chinese text segment to convert
        - style (Style): Output style
        - heteronym (bool): Whether multiple readings are requested
        - errors: Error handling strategy
        - strict (bool): Strict mode
        - **kwargs: Forwarded unchanged to the parent implementation

        Returns:
        list: Result of the parent conversion (after any post-processing)
        """
        # Custom processing logic
        result = super().convert(han, style, heteronym, errors, strict, **kwargs)
        # Post-process result...
        return result

# Plug the subclass into Pinyin and convert the same sample text.
pinyin_custom = Pinyin(converter=CustomConverter())
print(f"Custom: {pinyin_custom.pinyin(text)}")

Contrib Modules - Advanced Processing

Extended processing capabilities through contrib mixins and modules.

Tone Sandhi Processing

# pypinyin.contrib.tone_sandhi
class ToneSandhiMixin:
    """Mixin providing tone sandhi rule processing.

    NOTE(review): verify the hook name below against
    pypinyin.contrib.tone_sandhi in the installed version before relying on
    it in a subclass.
    """

    def pre_handle_tone_sandhi(self, han_list):
        """Apply tone sandhi rules to character sequence.

        Parameters:
        - han_list: presumably the sequence of characters being converted
          -- TODO confirm
        """

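Tone sandhi automatically applies Mandarin tone-change rules so that the output reflects natural pronunciation. The changes are only visible in tone-bearing styles such as Style.TONE: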

from pypinyin.contrib.tone_sandhi import ToneSandhiMixin
from pypinyin import lazy_pinyin, Style

# lazy_pinyin defaults to Style.NORMAL, which strips tone marks, so a
# tone-bearing style has to be requested to see the sandhi result.
result = lazy_pinyin('一个', style=Style.TONE, tone_sandhi=True)
print(result)  # ['yí', 'gè']  # 一 changes from tone 1 to tone 2

result = lazy_pinyin('不用', style=Style.TONE, tone_sandhi=True)
print(result)  # ['bú', 'yòng']  # 不 changes from tone 4 to tone 2

# Common tone sandhi patterns
examples = [
    ('一天', ['yì', 'tiān']),      # 一 + 1st tone -> 4th tone
    ('一个', ['yí', 'gè']),        # 一 + 4th tone -> 2nd tone
    ('一些', ['yì', 'xiē']),       # 一 + 1st tone -> 4th tone
    ('不对', ['bú', 'duì']),       # 不 + 4th tone -> 2nd tone
    ('不好', ['bù', 'hǎo']),       # 不 + 3rd tone -> keeps its 4th tone
]

for text, expected in examples:
    result = lazy_pinyin(text, style=Style.TONE, tone_sandhi=True)
    # Show the expected reading next to the actual one so a mismatch is visible.
    print(f"{text}: {result} (expected {expected})")

Character Variant Handling

# pypinyin.contrib.uv
class V2UMixin:
    """Mixin handling v/ü character conversion.

    NOTE(review): verify the hook name below against pypinyin.contrib.uv in
    the installed version before relying on it in a subclass.
    """

    def pre_handle_v_to_u(self, han_list):
        """Convert 'v' characters to 'ü' in output.

        Parameters:
        - han_list: presumably the sequence of characters being converted
          -- TODO confirm
        """

from pypinyin import lazy_pinyin, Style

# Each entry is (text, lazy_pinyin options); the trailing comment shows the
# output documented for that call.
demo_calls = (
    ('女', {'style': Style.TONE2}),                    # ['nv3']  (standard output with 'v')
    ('女', {'style': Style.TONE2, 'v_to_u': True}),    # ['nü3']  ('v' converted to 'ü')
    ('绿', {'style': Style.NORMAL, 'v_to_u': True}),   # ['lü'] instead of ['lv']
)

for hans, options in demo_calls:
    print(lazy_pinyin(hans, **options))

Neutral Tone Handling

# pypinyin.contrib.neutral_tone
class NeutralToneWith5Mixin:
    """Mixin for neutral tone handling with number 5.

    NOTE(review): verify the hook name below against
    pypinyin.contrib.neutral_tone in the installed version before relying on
    it in a subclass.
    """

    def pre_handle_neutral_tone_with_5(self, han_list):
        """Use '5' for neutral tone in numeric styles.

        Parameters:
        - han_list: presumably the sequence of characters being converted
          -- TODO confirm
        """

from pypinyin import lazy_pinyin, Style

def _tone3(hans, **options):
    """Return lazy_pinyin output for `hans` in TONE3 style."""
    return lazy_pinyin(hans, style=Style.TONE3, **options)


# Standard neutral tone representation
print(_tone3('的'))  # ['de'] (no tone number for neutral tone)

# Use '5' for neutral tone
print(_tone3('的', neutral_tone_with_five=True))  # ['de5']

# Examples with neutral tone particles
for particle in ['的', '了', '着', '过']:
    standard = _tone3(particle)
    with_five = _tone3(particle, neutral_tone_with_five=True)
    print(f"{particle}: {standard} -> {with_five}")

Segmentation Modules

Word boundary detection modules for accurate pronunciation through proper segmentation.

MMSeg Segmentation

# pypinyin.seg.mmseg
def seg(hans):
    """
    Segment Chinese text using MMSeg algorithm.

    NOTE(review): in the pypinyin library the name exported by
    pypinyin.seg.mmseg appears to be a segmenter object used as
    ``seg.cut(text)`` (yielding words lazily) rather than a plain function
    -- confirm against the installed version.

    Parameters:
    - hans (str): Chinese text to segment

    Returns:
    list: List of segmented words
    """

from pypinyin.seg.mmseg import seg
from pypinyin import lazy_pinyin

# Compare plain string input with explicit segmentation
text = '研究生命的起源'

# Plain string input: pypinyin applies its own built-in segmentation
result1 = lazy_pinyin(text)
print(f"Plain string: {result1}")

# Explicit MMSeg segmentation. `seg` is a segmenter object, not a function:
# its cut() method yields the words lazily, so collect them into a list.
segments = list(seg.cut(text))
print(f"Segments: {segments}")  # Word boundaries chosen by MMSeg

# Pass the pre-segmented words as a list. Joining them with spaces would make
# the spaces themselves show up as entries in the pinyin output.
result2 = lazy_pinyin(segments)
print(f"Segmented: {result2}")

Simple Segmentation

# pypinyin.seg.simpleseg  
def seg(hans):
    """
    Simple character-by-character segmentation.

    NOTE(review): pypinyin.seg.simpleseg may keep dictionary phrases together
    rather than always splitting into single characters -- confirm the
    granularity against the installed version.

    Parameters:
    - hans (str): Chinese text to segment

    Returns:
    list: List of individual characters
    """

from pypinyin.seg.simpleseg import seg

# Documented output: ['中', '华', '人', '民', '共', '和', '国']
# NOTE(review): confirm against the installed pypinyin -- simpleseg may keep
# dictionary phrases together instead of splitting every character.
print(seg('中华人民共和国'))

Tone Conversion Utilities

Functions that convert a pinyin string directly from one tone notation to another (tone marks, TONE2 numbers after the vowel, TONE3 numbers after the syllable).

# pypinyin.contrib.tone_convert
def tone_to_tone2(tone_pinyin):
    """Convert tone marks to tone2 format.

    Parameters:
    - tone_pinyin (str): Pinyin written with tone marks, e.g. 'zhōng'

    Returns:
    str: Pinyin with the tone number placed after the vowel, e.g. 'zho1ng'
    """

def tone2_to_tone(tone2_pinyin):
    """Convert tone2 format to tone marks.

    Parameters:
    - tone2_pinyin (str): Pinyin with the tone number after the vowel,
      e.g. 'zho1ng'

    Returns:
    str: Pinyin written with tone marks, e.g. 'zhōng'
    """

def tone_to_tone3(tone_pinyin):
    """Convert tone marks to tone3 format.

    Parameters:
    - tone_pinyin (str): Pinyin written with tone marks, e.g. 'zhōng'

    Returns:
    str: Pinyin with the tone number placed after the syllable, e.g. 'zhong1'
    """

def tone3_to_tone(tone3_pinyin):
    """Convert tone3 format to tone marks.

    Parameters:
    - tone3_pinyin (str): Pinyin with the tone number after the syllable,
      e.g. 'zhong1'

    Returns:
    str: Pinyin written with tone marks, e.g. 'zhōng'
    """

# Additional conversion functions for all style pairs...

Usage Examples

from pypinyin.contrib.tone_convert import (
    tone_to_tone2, tone2_to_tone, 
    tone_to_tone3, tone3_to_tone
)

def convert_each(converter, phrase):
    """Apply a single-syllable tone converter to every syllable of `phrase`.

    The tone_convert helpers are documented for one pinyin syllable at a
    time, so a space-separated phrase is split, converted piece by piece and
    joined back with single spaces.
    """
    return ' '.join(converter(syllable) for syllable in phrase.split())


# Convert between tone formats
original = 'zhōng guó'

# To tone2 (numbers after vowels)
tone2_result = convert_each(tone_to_tone2, original)
print(f"Tone2: {tone2_result}")  # zho1ng guo2

# To tone3 (numbers after pinyin)
tone3_result = convert_each(tone_to_tone3, original)
print(f"Tone3: {tone3_result}")  # zhong1 guo2

# Back to tone marks
back_to_tone = convert_each(tone3_to_tone, tone3_result)
print(f"Back to tone: {back_to_tone}")  # zhōng guó

# Chain conversions (reusing the results computed above)
conversion_chain = [
    ('Original', original),
    ('Tone2', tone2_result),
    ('Tone3', tone3_result),
    ('Back', back_to_tone),
]

for label, result in conversion_chain:
    print(f"{label}: {result}")

Advanced Integration Patterns

Custom Converter Development

Creating specialized converters for domain-specific needs:

from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin
from pypinyin import Style

class DomainSpecificConverter(DefaultConverter):
    """Custom converter that prefers domain-specific pronunciations.

    Readings listed for the selected domain take precedence; everything else
    is handled by the inherited DefaultConverter logic.
    """

    def __init__(self, domain='general'):
        """
        Parameters:
        - domain (str): Key selecting the pronunciation table ('medical' or
          'legal'); any other value yields an empty table.
        """
        super().__init__()
        self.domain = domain
        self.domain_dict = self._load_domain_dict()

    def _load_domain_dict(self):
        """Return the pronunciation mapping for self.domain ({} if unknown)."""
        domain_mappings = {
            'medical': {
                '症': ['zhèng'],  # Medical symptom context
                '脉': ['mài'],    # Pulse context
            },
            'legal': {
                '法': ['fǎ'],     # Law context
                '案': ['àn'],     # Legal case context
            }
        }
        return domain_mappings.get(self.domain, {})

    def convert(self, han, style, heteronym, errors, strict, **kwargs):
        """
        Convert with domain-specific rules.

        The parameter list mirrors pypinyin's DefaultConverter.convert
        (including `heteronym`), which is how Pinyin.pinyin() calls the
        configured converter.

        Parameters:
        - han (str): Text segment to convert. Only segments that exactly match
          a key of the domain table are overridden.
          NOTE(review): the segmenter may hand over multi-character words, in
          which case single-character keys will not match -- confirm.
        - style (Style): Output style
        - heteronym (bool): When False, only the first domain reading is used
        - errors: Error handling strategy (used by the fallback path)
        - strict (bool): Strict mode
        - **kwargs: Forwarded unchanged to the parent implementation

        Returns:
        list: List of per-character reading lists, e.g. [['zhèng']]
        """
        readings = self.domain_dict.get(han)
        if not readings:
            # Fall back to default conversion
            return super().convert(han, style, heteronym, errors, strict, **kwargs)

        if not heteronym:
            readings = readings[:1]
        # The table stores tone-marked pinyin; format each reading according
        # to the requested style, and wrap the readings in an outer list to
        # match the list-of-lists shape produced by the parent class.
        return [[self.convert_style(han, reading, style, strict) for reading in readings]]

# Build a Pinyin processor backed by the medical-domain converter.
medical_pinyin = Pinyin(converter=DomainSpecificConverter(domain='medical'))

medical_text = '症状分析'
print(f"Medical context: {medical_pinyin.pinyin(medical_text)}")

Combining Advanced Features

Integrating multiple advanced features for comprehensive processing:

from pypinyin import lazy_pinyin, Style
from pypinyin.seg.mmseg import seg as mmseg_seg
from pypinyin.contrib.tone_convert import tone_to_tone3

def advanced_processing_pipeline(text):
    """
    Comprehensive processing with multiple advanced features.

    Parameters:
    - text (str): Chinese text to process

    Returns:
    dict: Keys 'segments' (list of words), 'pinyin_advanced' (list of
    tone-marked pinyin strings) and 'tone3_format' (space-joined TONE3
    string). Each intermediate result is also printed.
    """

    # Step 1: Intelligent segmentation. `mmseg_seg` is a segmenter object
    # whose cut() method yields words lazily, so collect them into a list.
    segments = list(mmseg_seg.cut(text))
    print(f"Segments: {segments}")

    # Step 2: Pinyin conversion with tone sandhi
    pinyin_result = lazy_pinyin(
        text,
        style=Style.TONE,
        tone_sandhi=True,
        v_to_u=True,
        neutral_tone_with_five=True
    )
    print(f"Pinyin with advanced features: {pinyin_result}")

    # Step 3: Format conversion. tone_to_tone3 is documented for a single
    # syllable, so convert each item separately and join afterwards.
    tone3_format = ' '.join(tone_to_tone3(syllable) for syllable in pinyin_result)
    print(f"Tone3 format: {tone3_format}")

    return {
        'segments': segments,
        'pinyin_advanced': pinyin_result,
        'tone3_format': tone3_format
    }

# Run the pipeline on a sample sentence.
results = advanced_processing_pipeline('一个不错的研究生')

# Print every entry of the returned dictionary as "key: value".
for entry in results.items():
    print("{}: {}".format(*entry))

Performance Optimization

Optimizing advanced feature usage for production scenarios:

from functools import lru_cache
from pypinyin.core import Pinyin
from pypinyin.converter import DefaultConverter

class OptimizedConverter(DefaultConverter):
    """Performance-optimized converter with a bounded per-instance cache."""

    def __init__(self, cache_size=1000):
        """
        Parameters:
        - cache_size (int): Maximum number of conversions kept in the LRU
          cache of this instance.
        """
        super().__init__()
        self.cache_size = cache_size
        # The cache lives on the instance instead of decorating a method at
        # class level: a class-level lru_cache keys on `self` and keeps every
        # instance alive for as long as the cache exists.
        self._cached_convert = lru_cache(maxsize=cache_size)(self._convert_frozen)

    def _convert_frozen(self, han, style, heteronym, errors, strict):
        """Run the inherited conversion and freeze the result for caching."""
        converted = super().convert(han, style, heteronym, errors, strict)
        return tuple(tuple(readings) for readings in converted)

    def convert(self, han, style, heteronym, errors, strict, **kwargs):
        """
        Convert `han`, reusing a cached result when one is available.

        The parameter list mirrors pypinyin's DefaultConverter.convert. Calls
        that pass extra keyword arguments bypass the cache, because those
        arguments are not part of the cache key.

        Returns:
        list: A fresh list of reading lists, so callers may mutate the result
        without corrupting cached entries.
        """
        if kwargs:
            return super().convert(han, style, heteronym, errors, strict, **kwargs)
        frozen = self._cached_convert(han, style, heteronym, errors, strict)
        return [list(readings) for readings in frozen]

    def convert_cached(self, han, style, errors, strict, heteronym=False):
        """
        Cached conversion for performance.

        Kept for backward compatibility; delegates to convert(), which is
        already cached. `heteronym` is an optional trailing argument so that
        existing four-argument calls keep working.
        """
        return self.convert(han, style, heteronym, errors, strict)

# Batch processing with optimized converter
def batch_process_optimized(texts):
    """
    Convert every text in `texts` with one shared, caching converter.

    Parameters:
    - texts: Iterable of strings to convert

    Returns:
    list: One lazy_pinyin result per input text, in input order
    """
    pinyin_processor = Pinyin(converter=OptimizedConverter(cache_size=5000))
    return [pinyin_processor.lazy_pinyin(text) for text in texts]

# Build a large input by repeating three sample texts 1000 times.
sample_texts = ['中国', '美国', '英国']
large_dataset = sample_texts * 1000

results = batch_process_optimized(large_dataset)
print(f"Processed {len(results)} texts efficiently")

Install with Tessl CLI