0
# Advanced Features
1
2
Extended functionality including custom converters, tone sandhi processing, segmentation control, and specialized mixins for advanced pinyin processing scenarios.
3
4
## Capabilities
5
6
### Core Pinyin Class
7
8
The main Pinyin class provides configurable converter backends for advanced customization.
9
10
```python { .api }
11
class Pinyin:
12
"""Main pinyin conversion class with configurable converter backend."""
13
14
def __init__(self, converter=None):
15
"""
16
Initialize Pinyin converter.
17
18
Parameters:
19
- converter: Custom converter instance (default: DefaultConverter)
20
"""
21
22
def pinyin(self, hans, style=Style.TONE, heteronym=False, errors='default', strict=True):
23
"""Convert Chinese characters to pinyin using configured converter."""
24
25
def lazy_pinyin(self, hans, style=Style.NORMAL, errors='default', strict=True):
26
"""Convert Chinese characters to pinyin (lazy mode) using configured converter."""
27
```
28
29
#### Usage Examples
30
31
```python
32
from pypinyin.core import Pinyin
33
from pypinyin.converter import DefaultConverter, UltimateConverter
34
from pypinyin import Style
35
36
# Use default converter
37
pinyin_converter = Pinyin()
38
result = pinyin_converter.pinyin('中国')
39
print(result) # [['zhōng'], ['guó']]
40
41
# Use advanced converter
42
ultimate_converter = UltimateConverter()
43
pinyin_converter = Pinyin(converter=ultimate_converter)
44
result = pinyin_converter.pinyin('中国')
45
print(result) # Enhanced conversion with ultimate converter
46
47
# Custom converter configuration
48
custom_converter = DefaultConverter()
49
# Configure custom converter settings...
50
pinyin_converter = Pinyin(converter=custom_converter)
51
```
52
53
### Converter Classes
54
55
Pluggable converter implementations providing different processing backends.
56
57
#### DefaultConverter
58
59
```python { .api }
60
class DefaultConverter:
61
"""Basic pinyin converter implementation."""
62
63
def __init__(self):
64
"""Initialize default converter with standard settings."""
65
66
def convert(self, han, style, errors, strict):
67
"""
68
Convert single character to pinyin.
69
70
Parameters:
71
- han (str): Chinese character to convert
72
- style (Style): Output style
73
- errors (str): Error handling strategy
74
- strict (bool): Strict mode
75
76
Returns:
77
list: Pinyin pronunciations for the character
78
"""
79
```
80
81
#### UltimateConverter
82
83
```python { .api }
84
class UltimateConverter:
85
"""Enhanced converter with advanced processing options."""
86
87
def __init__(self):
88
"""Initialize ultimate converter with enhanced features."""
89
90
def convert(self, han, style, errors, strict):
91
"""Convert single character with enhanced processing."""
92
```
93
94
#### Usage Examples
95
96
```python
97
from pypinyin.converter import DefaultConverter, UltimateConverter
98
from pypinyin.core import Pinyin
99
from pypinyin import Style
100
101
# Compare converter outputs
102
text = '重庆'
103
104
# Default converter
105
default_conv = DefaultConverter()
106
pinyin_default = Pinyin(converter=default_conv)
107
result1 = pinyin_default.pinyin(text)
108
print(f"Default: {result1}")
109
110
# Ultimate converter
111
ultimate_conv = UltimateConverter()
112
pinyin_ultimate = Pinyin(converter=ultimate_conv)
113
result2 = pinyin_ultimate.pinyin(text)
114
print(f"Ultimate: {result2}")
115
116
# Custom converter subclass
117
class CustomConverter(DefaultConverter):
118
def convert(self, han, style, errors, strict):
119
# Custom processing logic
120
result = super().convert(han, style, errors, strict)
121
# Post-process result...
122
return result
123
124
custom_conv = CustomConverter()
125
pinyin_custom = Pinyin(converter=custom_conv)
126
result3 = pinyin_custom.pinyin(text)
127
print(f"Custom: {result3}")
128
```
129
130
### Contrib Modules - Advanced Processing
131
132
Extended processing capabilities through contrib mixins and modules.
133
134
#### Tone Sandhi Processing
135
136
```python { .api }
137
# pypinyin.contrib.tone_sandhi
138
class ToneSandhiMixin:
139
"""Mixin providing tone sandhi rule processing."""
140
141
def pre_handle_tone_sandhi(self, han_list):
142
"""Apply tone sandhi rules to character sequence."""
143
```
144
145
Tone sandhi automatically applies tone change rules for natural pronunciation:
146
147
```python
148
from pypinyin.contrib.tone_sandhi import ToneSandhiMixin
from pypinyin import lazy_pinyin, Style

# lazy_pinyin defaults to Style.NORMAL, which strips tone marks, so request
# Style.TONE explicitly to make the sandhi changes visible in the output.
result = lazy_pinyin('一个', style=Style.TONE, tone_sandhi=True)
print(result)  # ['yí', 'gè']  # 一 changes from tone 1 to tone 2

result = lazy_pinyin('不用', style=Style.TONE, tone_sandhi=True)
print(result)  # ['bú', 'yòng']  # 不 changes from tone 4 to tone 2

# Common tone sandhi patterns
examples = [
    ('一天', ['yì', 'tiān']),  # 一 + 1st tone -> 4th tone
    ('一个', ['yí', 'gè']),    # 一 + 4th tone -> 2nd tone
    ('一些', ['yì', 'xiē']),   # 一 + 1st tone -> 4th tone
    ('不对', ['bú', 'duì']),   # 不 + 4th tone -> 2nd tone
    ('不好', ['bù', 'hǎo']),   # 不 + 3rd tone -> unchanged (stays 4th tone)
]

for text, expected in examples:
    result = lazy_pinyin(text, style=Style.TONE, tone_sandhi=True)
    print(f"{text}: {result}")
170
```
171
172
#### Character Variant Handling
173
174
```python { .api }
175
# pypinyin.contrib.uv
176
class V2UMixin:
177
"""Mixin handling v/ü character conversion."""
178
179
def pre_handle_v_to_u(self, han_list):
180
"""Convert 'v' characters to 'ü' in output."""
181
```
182
183
```python
184
from pypinyin import lazy_pinyin, Style
185
186
# Standard output with 'v'
187
result = lazy_pinyin('女', style=Style.TONE2)
188
print(result) # ['nv3']
189
190
# Convert 'v' to 'ü'
191
result = lazy_pinyin('女', style=Style.TONE2, v_to_u=True)
192
print(result) # ['nü3']
193
194
# Works with different styles
195
result = lazy_pinyin('绿', style=Style.NORMAL, v_to_u=True)
196
print(result) # ['lü'] instead of ['lv']
197
```
198
199
#### Neutral Tone Handling
200
201
```python { .api }
202
# pypinyin.contrib.neutral_tone
203
class NeutralToneWith5Mixin:
204
"""Mixin for neutral tone handling with number 5."""
205
206
def pre_handle_neutral_tone_with_5(self, han_list):
207
"""Use '5' for neutral tone in numeric styles."""
208
```
209
210
```python
211
from pypinyin import lazy_pinyin, Style
212
213
# Standard neutral tone representation
214
result = lazy_pinyin('的', style=Style.TONE3)
215
print(result) # ['de'] (no tone number for neutral tone)
216
217
# Use '5' for neutral tone
218
result = lazy_pinyin('的', style=Style.TONE3, neutral_tone_with_five=True)
219
print(result) # ['de5']
220
221
# Examples with neutral tone particles
222
particles = ['的', '了', '着', '过']
223
for particle in particles:
224
standard = lazy_pinyin(particle, style=Style.TONE3)
225
with_five = lazy_pinyin(particle, style=Style.TONE3, neutral_tone_with_five=True)
226
print(f"{particle}: {standard} -> {with_five}")
227
```
228
229
### Segmentation Modules
230
231
Word boundary detection modules for accurate pronunciation through proper segmentation.
232
233
#### MMSeg Segmentation
234
235
```python { .api }
236
# pypinyin.seg.mmseg
237
def seg(hans):
238
"""
239
Segment Chinese text using MMSeg algorithm.
240
241
Parameters:
242
- hans (str): Chinese text to segment
243
244
Returns:
245
list: List of segmented words
246
"""
247
```
248
249
```python
250
from pypinyin.seg.mmseg import seg
251
from pypinyin import lazy_pinyin
252
253
# Compare with and without segmentation
254
text = '研究生命的起源'

# Without proper segmentation (character by character)
result1 = lazy_pinyin(text)
print(f"Character-by-character: {result1}")

# With MMSeg segmentation
# NOTE(review): confirm that `seg` is directly callable in the installed
# pypinyin version. list() materializes the result in case it is lazy.
segments = list(seg(text))
print(f"Segments: {segments}")  # Better word boundaries

# Apply segmentation for better pronunciation.
# Pass the word list itself: lazy_pinyin accepts pre-segmented input, whereas
# joining the words with spaces would add ' ' entries to the result and
# throw away the word boundaries that were just computed.
result2 = lazy_pinyin(segments)
print(f"Segmented: {result2}")
268
```
269
270
#### Simple Segmentation
271
272
```python { .api }
273
# pypinyin.seg.simpleseg
274
def seg(hans):
275
"""
276
Simple character-by-character segmentation.
277
278
Parameters:
279
- hans (str): Chinese text to segment
280
281
Returns:
282
list: List of individual characters
283
"""
284
```
285
286
```python
287
from pypinyin.seg.simpleseg import seg
288
289
text = '中华人民共和国'
290
segments = seg(text)
291
print(segments) # ['中', '华', '人', '民', '共', '和', '国']
292
```
293
294
### Tone Conversion Utilities
295
296
Direct tone style conversion functions for format transformation.
297
298
```python { .api }
299
# pypinyin.contrib.tone_convert
300
def tone_to_tone2(tone_pinyin):
301
"""Convert tone marks to tone2 format."""
302
303
def tone2_to_tone(tone2_pinyin):
304
"""Convert tone2 format to tone marks."""
305
306
def tone_to_tone3(tone_pinyin):
307
"""Convert tone marks to tone3 format."""
308
309
def tone3_to_tone(tone3_pinyin):
310
"""Convert tone3 format to tone marks."""
311
312
# Additional conversion functions for all style pairs...
313
```
314
315
#### Usage Examples
316
317
```python
318
from pypinyin.contrib.tone_convert import (
319
tone_to_tone2, tone2_to_tone,
320
tone_to_tone3, tone3_to_tone
321
)
322
323
# Convert between tone formats
324
original = 'zhōng guó'


def convert_phrase(convert, phrase):
    """Apply a single-syllable tone converter to each syllable of a phrase.

    Parameters:
    - convert: One of the tone_convert functions (e.g. tone_to_tone3)
    - phrase (str): Whitespace-separated pinyin syllables

    Returns:
    str: The converted syllables joined by single spaces
    """
    # Convert syllable by syllable rather than relying on how the helpers
    # treat a string that contains several syllables and spaces.
    return ' '.join(convert(syllable) for syllable in phrase.split())


# To tone2 (numbers after vowels)
tone2_result = convert_phrase(tone_to_tone2, original)
print(f"Tone2: {tone2_result}")  # zho1ng guo2

# To tone3 (numbers after pinyin)
tone3_result = convert_phrase(tone_to_tone3, original)
print(f"Tone3: {tone3_result}")  # zhong1 guo2

# Back to tone marks
back_to_tone = convert_phrase(tone3_to_tone, tone3_result)
print(f"Back to tone: {back_to_tone}")  # zhōng guó

# Chain conversions (reuse the results computed above instead of
# converting the same phrase again)
conversion_chain = [
    ('Original', original),
    ('Tone2', tone2_result),
    ('Tone3', tone3_result),
    ('Back', back_to_tone),
]

for label, result in conversion_chain:
    print(f"{label}: {result}")
348
```
349
350
## Advanced Integration Patterns
351
352
### Custom Converter Development
353
354
Creating specialized converters for domain-specific needs:
355
356
```python
357
from pypinyin.converter import DefaultConverter
358
from pypinyin.core import Pinyin
359
from pypinyin import Style
360
361
class DomainSpecificConverter(DefaultConverter):
    """Custom converter that overrides pronunciations for one domain.

    Characters listed for the selected domain are answered from the domain
    table; every other character falls back to DefaultConverter.
    """

    # Domain name -> {character: [pronunciations]}. Defined once at class
    # level so the table is not rebuilt for every converter instance.
    _DOMAIN_MAPPINGS = {
        'medical': {
            '症': ['zhèng'],  # Medical symptom context
            '脉': ['mài'],    # Pulse context
        },
        'legal': {
            '法': ['fǎ'],  # Law context
            '案': ['àn'],  # Legal case context
        },
    }

    def __init__(self, domain='general'):
        """
        Initialize the converter for a domain.

        Parameters:
        - domain (str): Key into the domain table; unknown domains
          (including the default 'general') get no overrides
        """
        super().__init__()
        self.domain = domain
        self.domain_dict = self._load_domain_dict()

    def _load_domain_dict(self):
        """Load domain-specific pronunciation mappings.

        Returns:
        dict: Character -> list of pronunciations for ``self.domain``,
        or an empty dict when the domain is not in the table
        """
        # Copy so per-instance edits to domain_dict never touch the
        # shared class-level table.
        return dict(self._DOMAIN_MAPPINGS.get(self.domain, {}))

    def convert(self, han, style, errors, strict):
        """Convert with domain-specific rules.

        Parameters:
        - han (str): Chinese character to convert
        - style (Style): Output style
        - errors (str): Error handling strategy
        - strict (bool): Strict mode

        Returns:
        list: The domain override when one exists, otherwise the result
        of the default conversion
        """
        # Check domain dictionary first
        if han in self.domain_dict:
            # Overrides are returned as stored (tone-marked); they are not
            # reformatted to `style`. Return a copy so a caller mutating
            # the result cannot corrupt the override table.
            return list(self.domain_dict[han])

        # Fall back to default conversion
        return super().convert(han, style, errors, strict)
393
394
# Use custom converter
395
medical_converter = DomainSpecificConverter(domain='medical')
396
medical_pinyin = Pinyin(converter=medical_converter)
397
398
medical_text = '症状分析'
399
result = medical_pinyin.pinyin(medical_text)
400
print(f"Medical context: {result}")
401
```
402
403
### Combining Advanced Features
404
405
Integrating multiple advanced features for comprehensive processing:
406
407
```python
408
from pypinyin import lazy_pinyin, Style
409
from pypinyin.seg.mmseg import seg as mmseg_seg
410
from pypinyin.contrib.tone_convert import tone_to_tone3
411
412
def advanced_processing_pipeline(text):
    """Comprehensive processing with multiple advanced features.

    Parameters:
    - text (str): Chinese text to process

    Returns:
    dict: 'segments' (list of words), 'pinyin_advanced' (list of
    tone-marked syllables) and 'tone3_format' (space-separated string of
    the same syllables in tone3 notation)
    """
    # Step 1: Intelligent segmentation
    # list() materializes the result in case the segmenter is lazy, so it
    # can be printed and returned safely.
    segments = list(mmseg_seg(text))
    print(f"Segments: {segments}")

    # Step 2: Pinyin conversion with tone sandhi
    pinyin_result = lazy_pinyin(
        text,
        style=Style.TONE,
        tone_sandhi=True,
        v_to_u=True,
        neutral_tone_with_five=True,
    )
    print(f"Pinyin with advanced features: {pinyin_result}")

    # Step 3: Format conversion
    # Convert one syllable at a time and join afterwards, rather than
    # relying on how tone_to_tone3 treats a string containing several
    # syllables and spaces.
    tone3_format = ' '.join(tone_to_tone3(syllable) for syllable in pinyin_result)
    print(f"Tone3 format: {tone3_format}")

    return {
        'segments': segments,
        'pinyin_advanced': pinyin_result,
        'tone3_format': tone3_format,
    }
439
440
# Example usage
441
text = '一个不错的研究生'
442
results = advanced_processing_pipeline(text)
443
444
# Access different processing results
445
for key, value in results.items():
446
print(f"{key}: {value}")
447
```
448
449
### Performance Optimization
450
451
Optimizing advanced feature usage for production scenarios:
452
453
```python
454
from functools import lru_cache
455
from pypinyin.core import Pinyin
456
from pypinyin.converter import DefaultConverter
457
458
class OptimizedConverter(DefaultConverter):
    """Performance-optimized converter with a per-instance LRU cache."""

    def __init__(self, cache_size=1000):
        """
        Initialize the converter and its cache.

        Parameters:
        - cache_size (int): Maximum number of distinct conversions to keep
        """
        super().__init__()
        self.cache_size = cache_size
        # Build the cache per instance. Decorating a method with @lru_cache
        # at class level would share one cache between all instances, key
        # every entry on `self`, ignore `cache_size`, and keep each
        # instance alive for as long as the cache exists.
        self._cached_convert = lru_cache(maxsize=cache_size)(
            self._convert_uncached
        )

    def _convert_uncached(self, han, style, errors, strict):
        """Run the real (uncached) conversion via DefaultConverter."""
        return super().convert(han, style, errors, strict)

    def convert(self, han, style, errors, strict):
        """Convert using the cache.

        All arguments must be hashable because they form the cache key.

        Returns:
        list: A shallow copy of the cached result, so a caller that
        appends to or reorders the list cannot corrupt the cache
        """
        return list(self._cached_convert(han, style, errors, strict))

    def convert_cached(self, han, style, errors, strict):
        """Cached conversion for performance.

        Kept for backward compatibility; equivalent to ``convert``.
        """
        return self.convert(han, style, errors, strict)
471
472
# Batch processing with optimized converter
473
def batch_process_optimized(texts):
    """Convert every text in ``texts`` through one shared cached converter.

    Parameters:
    - texts: Iterable of Chinese strings

    Returns:
    list: One lazy_pinyin result per input text, in input order
    """
    # A single processor is shared by all texts so repeated input
    # benefits from the converter's cache.
    pinyin_processor = Pinyin(converter=OptimizedConverter(cache_size=5000))
    return [pinyin_processor.lazy_pinyin(text) for text in texts]
484
485
# Example with large dataset
486
large_dataset = ['中国', '美国', '英国'] * 1000 # Repeated texts
487
results = batch_process_optimized(large_dataset)
488
print(f"Processed {len(results)} texts efficiently")
489
```