CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-pypinyin

Comprehensive Chinese character to Pinyin conversion library with intelligent word segmentation and multiple output styles

Pending
Overview
Eval results
Files

docs/dictionary-customization.md

Dictionary Customization

Functions for loading custom pronunciation dictionaries to override default pinyin mappings for specific characters or phrases, enabling accurate pronunciation for specialized domains, proper nouns, and regional variations.

Capabilities

Single Character Dictionary Loading

Load custom pronunciations for individual Chinese characters to override default mappings.

def load_single_dict(pinyin_dict, style='default'):
    """
    Load custom dictionary for single character pinyin pronunciation corrections.

    Parameters:
    - pinyin_dict (dict): Dictionary mapping Unicode code points to pinyin
      Format: {ord(character): 'pinyin'}, or for several readings
      {ord(character): 'pinyin1,pinyin2'} (one comma-separated string,
      preferred reading first), e.g. {0x963F: 'ā,ē'}
    - style (str): Notation used by the pinyin values in pinyin_dict:
      'default' (tone marks, e.g. 'zhōng') or 'tone2' (tone numbers,
      e.g. 'zho1ng'). It is not a label for a separate dictionary set.

    Returns:
    None: Dictionary is loaded globally and affects all subsequent conversions
    """

Usage Examples

from pypinyin import load_single_dict, lazy_pinyin

# Load custom pronunciations for specific characters.
# Keys must be Unicode code points (ord(char)), not the characters
# themselves; values are pinyin strings.
custom_dict = {
    ord('朴'): 'pǔ',     # Override default pronunciation
    ord('任'): 'rén',    # Ensure specific pronunciation for surnames
    ord('华'): 'huá',    # Set preferred pronunciation
}

# Load the custom dictionary
load_single_dict(custom_dict)

# Test the custom pronunciations
result = lazy_pinyin('朴素')
print(result)  # ['pu', 'su']

# Multiple pronunciation format: one comma-separated string per character,
# with the preferred reading first
custom_dict_multi = {
    ord('银'): 'yín,yǐn',   # Multiple possible pronunciations
    ord('行'): 'xíng,háng',
}

load_single_dict(custom_dict_multi)

Phrase Dictionary Loading

Load custom pronunciations for multi-character phrases to ensure accurate pronunciation through proper word segmentation.

def load_phrases_dict(phrases_dict, style='default'):
    """
    Load custom dictionary for phrase pinyin pronunciation corrections.

    Parameters:
    - phrases_dict (dict): Dictionary mapping phrases to pinyin pronunciations
      Format: {phrase: [[pinyin_for_char1], [pinyin_for_char2], ...]}
      (one inner list per character of the phrase),
      e.g. {'阿爸': [['ā'], ['bà']]}
    - style (str): Notation used by the pinyin values in phrases_dict:
      'default' (tone marks, e.g. 'zhōng') or 'tone2' (tone numbers,
      e.g. 'zho1ng'). It is not a label for a separate dictionary set.

    Returns:
    None: Dictionary is loaded globally and affects all subsequent conversions
    """

Usage Examples

from pypinyin import lazy_pinyin, load_phrases_dict

# Custom phrase pronunciations: each phrase maps to one inner list
# of pinyin per character
phrase_dict = {
    # Proper pronunciation for city name
    '重庆': [['chóng'], ['qìng']],
    # Historical place name
    '长安': [['cháng'], ['ān']],
    # Financial institution context
    '银行': [['yín'], ['háng']],
    # Tone sandhi correction
    '一个': [['yí'], ['gè']],
}

# Register the phrases globally
load_phrases_dict(phrase_dict)

# Convert sample texts containing the custom phrases (重庆, then 银行)
for text in ('重庆市', '我在银行工作'):
    result = lazy_pinyin(text)
    print(result)

Dictionary Style Management

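Both dictionary loading functions accept a style parameter. It declares the notation of the pinyin values being loaded: 'default' for tone marks (e.g. 'zhōng') or 'tone2' for tone numbers (e.g. 'zho1ng'). It does not create separate, named dictionary sets: any other label (such as 'medical') is handled like 'default', and all entries are merged into the same global dictionaries.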

from pypinyin import load_single_dict, load_phrases_dict

# `style` names the notation of the values being loaded. Only 'default'
# (tone marks) and 'tone2' (tone numbers) are supported; it does not create
# separate dictionary sets, and every entry lands in the same global
# dictionary.

# Single characters written with tone marks (style='default').
# Keys are code points (ord(char)); values are pinyin strings.
medical_chars = {
    ord('症'): 'zhèng',  # Medical context
    ord('脉'): 'mài',    # Traditional medicine
}
load_single_dict(medical_chars, style='default')

# Legal terminology phrases written with tone numbers (style='tone2');
# the digit follows the vowel that carries the tone
legal_phrases = {
    '法院': [['fa3'], ['yua4n']],
    '起诉': [['qi3'], ['su4']],
}
load_phrases_dict(legal_phrases, style='tone2')

Custom Dictionary Formats

Single Character Dictionary Format

Character dictionaries map individual characters to their pronunciations:

# In every single-character dictionary the key is the character's Unicode
# code point (ord(char)) and the value is a pinyin string.

# Simple string format (single pronunciation)
single_dict = {
    ord('张'): 'zhāng',
    ord('李'): 'lǐ',
    ord('王'): 'wáng',
}

# Comma-separated format (multiple pronunciations, preferred reading first)
multi_dict = {
    ord('中'): 'zhōng,zhòng',  # Different pronunciations in different contexts
    ord('的'): 'de,dí,dì',     # Multiple grammatical uses
    ord('行'): 'xíng,háng',    # Different meanings
}

# Mixed format
mixed_dict = {
    ord('朴'): 'pǔ',           # Single pronunciation
    ord('任'): 'rén,rèn',      # Multiple pronunciations
    ord('华'): 'huá',          # Single pronunciation
}

Phrase Dictionary Format

Phrase dictionaries map multi-character strings to pinyin arrays:

# Standard phrase format:
# keys are the phrase strings themselves; each value is a list holding
# one inner pinyin list per character, in phrase order
phrase_dict = {
    # Each phrase maps to list of lists (one list per character)
    '北京': [['běi'], ['jīng']],
    '上海': [['shàng'], ['hǎi']],
    
    # Phrases with tone sandhi corrections
    '一个': [['yí'], ['gè']],      # 一 changes tone before 4th tone
    '不用': [['bú'], ['yòng']],    # 不 changes tone before 4th tone
    
    # Proper nouns with specific pronunciations  
    '西安': [['xī'], ['ān']],      # City name
    '长城': [['cháng'], ['chéng']], # Great Wall
    
    # Technical terms
    '数据': [['shù'], ['jù']],     # Data/statistics context
    '银行': [['yín'], ['háng']],   # Banking context
}

Dictionary Loading Strategies

Domain-Specific Dictionaries

Load different dictionaries for different application domains:

from pypinyin import load_single_dict, load_phrases_dict, lazy_pinyin

# Medical terminology
def load_medical_dict():
    """Load medical single-character and phrase pronunciations globally.

    Single characters are keyed by code point (ord(char)) with pinyin
    strings as values; phrases are keyed by the phrase text with one inner
    pinyin list per character. All values use tone marks, so the loaders'
    default style applies.
    """
    medical_chars = {
        ord('症'): 'zhèng',   # Symptom
        ord('脉'): 'mài',     # Pulse
        ord('药'): 'yào',     # Medicine
    }

    medical_phrases = {
        '感冒': [['gǎn'], ['mào']],
        '发烧': [['fā'], ['shāo']],
        '头痛': [['tóu'], ['tòng']],
    }

    # No style label is passed: `style` only describes the pinyin notation
    # ('default' or 'tone2'), it does not name a dictionary set.
    load_single_dict(medical_chars)
    load_phrases_dict(medical_phrases)

# Geographic names
def load_geographic_dict():
    """Load pronunciations for geographic names globally.

    Each phrase maps to one inner pinyin list per character, written with
    tone marks, so the loader's default style applies.
    """
    geo_phrases = {
        '黑龙江': [['hēi'], ['lóng'], ['jiāng']],
        '内蒙古': [['nèi'], ['měng'], ['gǔ']],
        '新疆': [['xīn'], ['jiāng']],
    }

    # No style label is passed: `style` only describes the pinyin notation
    # ('default' or 'tone2'), it does not name a dictionary set.
    load_phrases_dict(geo_phrases)

# Load domain-specific dictionaries
# (both helpers are defined earlier in this example)
load_medical_dict()
load_geographic_dict()

# Test with domain-specific text
medical_text = "患者出现发烧症状"
result = lazy_pinyin(medical_text)
print(result)  # Conversion reflects the entries loaded above

Personal Name Handling

Special handling for Chinese personal names and surnames:

from pypinyin import lazy_pinyin, load_single_dict, load_phrases_dict

# Common surname pronunciations.
# Keys are code points (ord(char)); values are pinyin strings.
surname_dict = {
    ord('朴'): 'piáo',    # Korean-origin surname
    ord('任'): 'rén',     # Surname context
    ord('华'): 'huá',     # Given name context
    ord('单'): 'shàn',    # Surname (not dān)
    ord('种'): 'chóng',   # Surname (not zhǒng)
}

# Famous person names
famous_names = {
    '孔子': [['kǒng'], ['zǐ']],
    '老子': [['lǎo'], ['zǐ']],
    '庄子': [['zhuāng'], ['zǐ']],
}

# No style label is passed: `style` only describes the pinyin notation
# ('default' or 'tone2'), it does not name a dictionary set.
load_single_dict(surname_dict)
load_phrases_dict(famous_names)

# Test name pronunciation
names = ['朴志明', '任小华', '孔子']
for name in names:
    result = lazy_pinyin(name)
    print(f"{name}: {' '.join(result)}")

Dictionary Integration Patterns

Layered Dictionary Loading

Build comprehensive pronunciation systems by layering multiple dictionaries:

def setup_comprehensive_dict():
    """Load base, regional and technical pronunciations in three layers.

    Every layer is merged into the same global dictionaries; an entry
    loaded later replaces an earlier entry with the same key. All values
    use tone marks, so the loaders' default style applies.
    """
    # Base corrections for common issues.
    # Keys are code points (ord(char)); several readings go into one
    # comma-separated string, preferred reading first.
    base_corrections = {
        ord('一'): 'yī,yí,yì',  # Context-dependent tone changes
        ord('不'): 'bù,bú',     # Tone sandhi variations
    }
    load_single_dict(base_corrections)

    # Regional pronunciation preferences
    regional_prefs = {
        '什么': [['shén'], ['me']],  # Northern pronunciation
        '这样': [['zhè'], ['yàng']],  # Standard pronunciation
    }
    load_phrases_dict(regional_prefs)

    # Specialized terminology
    tech_terms = {
        '数据库': [['shù'], ['jù'], ['kù']],
        '算法': [['suàn'], ['fǎ']],
    }
    load_phrases_dict(tech_terms)

# Initialize comprehensive dictionary system
# (runs the three loading steps of setup_comprehensive_dict in order)
setup_comprehensive_dict()

Dynamic Dictionary Updates

Update dictionaries based on context or user preferences:

from pypinyin import load_single_dict, load_phrases_dict

def update_context_dict(context='general'):
    """Update pronunciation dictionary based on context.

    Parameters:
    - context (str): One of 'business', 'education' or 'travel'. Any other
      value (including the default 'general') loads nothing.

    Returns:
    None: Matching phrases are loaded globally via load_phrases_dict
    """
    context_phrases = {
        'business': {
            '银行': [['yín'], ['háng']],    # Banking context
            '股票': [['gǔ'], ['piào']],     # Stock market
            '公司': [['gōng'], ['sī']],     # Company
        },
        'education': {
            '学校': [['xué'], ['xiào']],
            '老师': [['lǎo'], ['shī']],
            '学生': [['xué'], ['shēng']],
        },
        'travel': {
            '飞机': [['fēi'], ['jī']],
            '火车': [['huǒ'], ['chē']],
            '酒店': [['jiǔ'], ['diàn']],
        },
    }

    phrases = context_phrases.get(context)
    if phrases is None:
        # Unknown or general context: nothing to load
        return

    # No style label is passed: `style` only describes the pinyin notation
    # ('default' or 'tone2'), it does not name a dictionary set.
    load_phrases_dict(phrases)

# lazy_pinyin is not part of this example's import line, so bring it in here
from pypinyin import lazy_pinyin

# Use context-specific dictionaries
update_context_dict('business')
business_text = "去银行办理股票账户"
result = lazy_pinyin(business_text)
print(' '.join(result))

Install with Tessl CLI

npx tessl i tessl/pypi-pypinyin

docs

advanced-features.md

command-line-tools.md

core-functions.md

dictionary-customization.md

index.md

styles-formatting.md

tile.json