or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced-features.md command-line-tools.md core-functions.md dictionary-customization.md index.md styles-formatting.md

docs/advanced-features.md

0

# Advanced Features

1

2

Extended functionality including custom converters, tone sandhi processing, segmentation control, and specialized mixins for advanced pinyin processing scenarios.

3

4

## Capabilities

5

6

### Core Pinyin Class

7

8

The main Pinyin class provides configurable converter backends for advanced customization.

9

10

```python { .api }

11

class Pinyin:

12

"""Main pinyin conversion class with configurable converter backend."""

13

14

def __init__(self, converter=None):

15

"""

16

Initialize Pinyin converter.

17

18

Parameters:

19

- converter: Custom converter instance (default: DefaultConverter)

20

"""

21

22

def pinyin(self, hans, style=Style.TONE, heteronym=False, errors='default', strict=True):

23

"""Convert Chinese characters to pinyin using configured converter."""

24

25

def lazy_pinyin(self, hans, style=Style.NORMAL, errors='default', strict=True):

26

"""Convert Chinese characters to pinyin (lazy mode) using configured converter."""

27

```

28

29

#### Usage Examples

30

31

```python

32

from pypinyin.core import Pinyin

33

from pypinyin.converter import DefaultConverter, UltimateConverter

34

from pypinyin import Style

35

36

# Use default converter

37

pinyin_converter = Pinyin()

38

result = pinyin_converter.pinyin('中国')

39

print(result) # [['zhōng'], ['guó']]

40

41

# Use advanced converter

42

ultimate_converter = UltimateConverter()

43

pinyin_converter = Pinyin(converter=ultimate_converter)

44

result = pinyin_converter.pinyin('中国')

45

print(result) # Enhanced conversion with ultimate converter

46

47

# Custom converter configuration

48

custom_converter = DefaultConverter()

49

# Configure custom converter settings...

50

pinyin_converter = Pinyin(converter=custom_converter)

51

```

52

53

### Converter Classes

54

55

Pluggable converter implementations providing different processing backends.

56

57

#### DefaultConverter

58

59

```python { .api }

60

class DefaultConverter:

61

"""Basic pinyin converter implementation."""

62

63

def __init__(self):

64

"""Initialize default converter with standard settings."""

65

66

def convert(self, han, style, errors, strict):

67

"""

68

Convert single character to pinyin.

69

70

Parameters:

71

- han (str): Chinese character to convert

72

- style (Style): Output style

73

- errors (str): Error handling strategy

74

- strict (bool): Strict mode

75

76

Returns:

77

list: Pinyin pronunciations for the character

78

"""

79

```

80

81

#### UltimateConverter

82

83

```python { .api }

84

class UltimateConverter:

85

"""Enhanced converter with advanced processing options."""

86

87

def __init__(self):

88

"""Initialize ultimate converter with enhanced features."""

89

90

def convert(self, han, style, errors, strict):

91

"""Convert single character with enhanced processing."""

92

```

93

94

#### Usage Examples

95

96

```python

97

from pypinyin.converter import DefaultConverter, UltimateConverter

98

from pypinyin.core import Pinyin

99

from pypinyin import Style

100

101

# Compare converter outputs

102

text = '重庆'

103

104

# Default converter

105

default_conv = DefaultConverter()

106

pinyin_default = Pinyin(converter=default_conv)

107

result1 = pinyin_default.pinyin(text)

108

print(f"Default: {result1}")

109

110

# Ultimate converter

111

ultimate_conv = UltimateConverter()

112

pinyin_ultimate = Pinyin(converter=ultimate_conv)

113

result2 = pinyin_ultimate.pinyin(text)

114

print(f"Ultimate: {result2}")

115

116

# Custom converter subclass

117

class CustomConverter(DefaultConverter):

118

def convert(self, han, style, errors, strict):

119

# Custom processing logic

120

result = super().convert(han, style, errors, strict)

121

# Post-process result...

122

return result

123

124

custom_conv = CustomConverter()

125

pinyin_custom = Pinyin(converter=custom_conv)

126

result3 = pinyin_custom.pinyin(text)

127

print(f"Custom: {result3}")

128

```

129

130

### Contrib Modules - Advanced Processing

131

132

Extended processing capabilities through contrib mixins and modules.

133

134

#### Tone Sandhi Processing

135

136

```python { .api }

137

# pypinyin.contrib.tone_sandhi

138

class ToneSandhiMixin:

139

"""Mixin providing tone sandhi rule processing."""

140

141

def pre_handle_tone_sandhi(self, han_list):

142

"""Apply tone sandhi rules to character sequence."""

143

```

144

145

Tone sandhi automatically applies tone change rules for natural pronunciation:

146

147

```python

148

from pypinyin.contrib.tone_sandhi import ToneSandhiMixin

149

from pypinyin import lazy_pinyin

150

151

# Enable tone sandhi in lazy_pinyin

152

result = lazy_pinyin('一个', tone_sandhi=True)

153

print(result) # ['yi', 'ge'] with the default Style.NORMAL; with style=Style.TONE: ['yí', 'gè'] (一 changes from tone 1 to tone 2)

154

155

result = lazy_pinyin('不用', tone_sandhi=True)

156

print(result) # ['bu', 'yong'] with the default Style.NORMAL; with style=Style.TONE: ['bú', 'yòng'] (不 changes from tone 4 to tone 2)

157

158

# Common tone sandhi patterns

159

examples = [

160

('一天', ['yì', 'tiān']), # 一 + 1st tone -> 4th tone

161

('一个', ['yí', 'gè']), # 一 + 4th tone -> 2nd tone

162

('一些', ['yì', 'xiē']), # 一 + 1st tone -> 4th tone

163

('不对', ['bú', 'duì']), # 不 + 4th tone -> 2nd tone

164

('不好', ['bù', 'hǎo']), # 不 + 3rd tone -> no change, 不 keeps its 4th tone

165

]

166

167

for text, expected in examples:

168

result = lazy_pinyin(text, tone_sandhi=True)

169

print(f"{text}: {result}")

170

```

171

172

#### Character Variant Handling

173

174

```python { .api }

175

# pypinyin.contrib.uv

176

class V2UMixin:

177

"""Mixin handling v/ü character conversion."""

178

179

def pre_handle_v_to_u(self, han_list):

180

"""Convert 'v' characters to 'ü' in output."""

181

```

182

183

```python

184

from pypinyin import lazy_pinyin, Style

185

186

# Standard output with 'v'

187

result = lazy_pinyin('女', style=Style.TONE2)

188

print(result) # ['nv3']

189

190

# Convert 'v' to 'ü'

191

result = lazy_pinyin('女', style=Style.TONE2, v_to_u=True)

192

print(result) # ['nü3']

193

194

# Works with different styles

195

result = lazy_pinyin('绿', style=Style.NORMAL, v_to_u=True)

196

print(result) # ['lü'] instead of ['lv']

197

```

198

199

#### Neutral Tone Handling

200

201

```python { .api }

202

# pypinyin.contrib.neutral_tone

203

class NeutralToneWith5Mixin:

204

"""Mixin for neutral tone handling with number 5."""

205

206

def pre_handle_neutral_tone_with_5(self, han_list):

207

"""Use '5' for neutral tone in numeric styles."""

208

```

209

210

```python

211

from pypinyin import lazy_pinyin, Style

212

213

# Standard neutral tone representation

214

result = lazy_pinyin('的', style=Style.TONE3)

215

print(result) # ['de'] (no tone number for neutral tone)

216

217

# Use '5' for neutral tone

218

result = lazy_pinyin('的', style=Style.TONE3, neutral_tone_with_five=True)

219

print(result) # ['de5']

220

221

# Examples with neutral tone particles

222

particles = ['的', '了', '着', '过']

223

for particle in particles:

224

standard = lazy_pinyin(particle, style=Style.TONE3)

225

with_five = lazy_pinyin(particle, style=Style.TONE3, neutral_tone_with_five=True)

226

print(f"{particle}: {standard} -> {with_five}")

227

```

228

229

### Segmentation Modules

230

231

Word boundary detection modules for accurate pronunciation through proper segmentation.

232

233

#### MMSeg Segmentation

234

235

```python { .api }

236

# pypinyin.seg.mmseg

237

def seg(hans):

238

"""

239

Segment Chinese text using MMSeg algorithm.

240

241

Parameters:

242

- hans (str): Chinese text to segment

243

244

Returns:

245

list: List of segmented words

246

"""

247

```

248

249

```python

250

from pypinyin.seg.mmseg import seg

251

from pypinyin import lazy_pinyin

252

253

# Compare with and without segmentation

254

text = '研究生命的起源'

255

256

# Without proper segmentation (character by character)

257

result1 = lazy_pinyin(text)

258

print(f"Character-by-character: {result1}")

259

260

# With MMSeg segmentation

261

segments = seg(text)

262

print(f"Segments: {segments}") # Better word boundaries

263

264

# Apply segmentation for better pronunciation

265

segmented_text = ' '.join(segments)

266

result2 = lazy_pinyin(segmented_text)

267

print(f"Segmented: {result2}")

268

```

269

270

#### Simple Segmentation

271

272

```python { .api }

273

# pypinyin.seg.simpleseg

274

def seg(hans):

275

"""

276

Simple character-by-character segmentation.

277

278

Parameters:

279

- hans (str): Chinese text to segment

280

281

Returns:

282

list: List of individual characters

283

"""

284

```

285

286

```python

287

from pypinyin.seg.simpleseg import seg

288

289

text = '中华人民共和国'

290

segments = seg(text)

291

print(segments) # ['中', '华', '人', '民', '共', '和', '国']

292

```

293

294

### Tone Conversion Utilities

295

296

Direct tone style conversion functions for format transformation.

297

298

```python { .api }

299

# pypinyin.contrib.tone_convert

300

def tone_to_tone2(tone_pinyin):

301

"""Convert tone marks to tone2 format."""

302

303

def tone2_to_tone(tone2_pinyin):

304

"""Convert tone2 format to tone marks."""

305

306

def tone_to_tone3(tone_pinyin):

307

"""Convert tone marks to tone3 format."""

308

309

def tone3_to_tone(tone3_pinyin):

310

"""Convert tone3 format to tone marks."""

311

312

# Additional conversion functions for all style pairs...

313

```

314

315

#### Usage Examples

316

317

```python

318

from pypinyin.contrib.tone_convert import (

319

tone_to_tone2, tone2_to_tone,

320

tone_to_tone3, tone3_to_tone

321

)

322

323

# Convert between tone formats

324

original = 'zhōng guó'

325

326

# To tone2 (numbers after vowels)

327

tone2_result = tone_to_tone2(original)

328

print(f"Tone2: {tone2_result}") # zho1ng guo2

329

330

# To tone3 (numbers after pinyin)

331

tone3_result = tone_to_tone3(original)

332

print(f"Tone3: {tone3_result}") # zhong1 guo2

333

334

# Back to tone marks

335

back_to_tone = tone3_to_tone(tone3_result)

336

print(f"Back to tone: {back_to_tone}") # zhōng guó

337

338

# Chain conversions

339

conversion_chain = [

340

('Original', 'zhōng guó'),

341

('Tone2', tone_to_tone2('zhōng guó')),

342

('Tone3', tone_to_tone3('zhōng guó')),

343

('Back', tone3_to_tone(tone_to_tone3('zhōng guó')))

344

]

345

346

for label, result in conversion_chain:

347

print(f"{label}: {result}")

348

```

349

350

## Advanced Integration Patterns

351

352

### Custom Converter Development

353

354

Creating specialized converters for domain-specific needs:

355

356

```python

357

from pypinyin.converter import DefaultConverter

358

from pypinyin.core import Pinyin

359

from pypinyin import Style

360

361

class DomainSpecificConverter(DefaultConverter):

362

"""Custom converter for domain-specific pronunciation."""

363

364

def __init__(self, domain='general'):

365

super().__init__()

366

self.domain = domain

367

self.domain_dict = self._load_domain_dict()

368

369

def _load_domain_dict(self):

370

"""Load domain-specific pronunciation mappings."""

371

domain_mappings = {

372

'medical': {

373

'症': ['zhèng'], # Medical symptom context

374

'脉': ['mài'], # Pulse context

375

},

376

'legal': {

377

'法': ['fǎ'], # Law context

378

'案': ['àn'], # Legal case context

379

}

380

}

381

return domain_mappings.get(self.domain, {})

382

383

def convert(self, han, style, errors, strict):

384

"""Convert with domain-specific rules."""

385

# Check domain dictionary first

386

if han in self.domain_dict:

387

domain_pronunciations = self.domain_dict[han]

388

# Format according to requested style...

389

return domain_pronunciations

390

391

# Fall back to default conversion

392

return super().convert(han, style, errors, strict)

393

394

# Use custom converter

395

medical_converter = DomainSpecificConverter(domain='medical')

396

medical_pinyin = Pinyin(converter=medical_converter)

397

398

medical_text = '症状分析'

399

result = medical_pinyin.pinyin(medical_text)

400

print(f"Medical context: {result}")

401

```

402

403

### Combining Advanced Features

404

405

Integrating multiple advanced features for comprehensive processing:

406

407

```python

408

from pypinyin import lazy_pinyin, Style

409

from pypinyin.seg.mmseg import seg as mmseg_seg

410

from pypinyin.contrib.tone_convert import tone_to_tone3

411

412

def advanced_processing_pipeline(text):

413

"""Comprehensive processing with multiple advanced features."""

414

415

# Step 1: Intelligent segmentation

416

segments = mmseg_seg(text)

417

print(f"Segments: {segments}")

418

419

# Step 2: Pinyin conversion with tone sandhi

420

pinyin_result = lazy_pinyin(

421

text,

422

style=Style.TONE,

423

tone_sandhi=True,

424

v_to_u=True,

425

neutral_tone_with_five=True

426

)

427

print(f"Pinyin with advanced features: {pinyin_result}")

428

429

# Step 3: Format conversion

430

tone_marked = ' '.join(pinyin_result)

431

tone3_format = tone_to_tone3(tone_marked)

432

print(f"Tone3 format: {tone3_format}")

433

434

return {

435

'segments': segments,

436

'pinyin_advanced': pinyin_result,

437

'tone3_format': tone3_format

438

}

439

440

# Example usage

441

text = '一个不错的研究生'

442

results = advanced_processing_pipeline(text)

443

444

# Access different processing results

445

for key, value in results.items():

446

print(f"{key}: {value}")

447

```

448

449

### Performance Optimization

450

451

Optimizing advanced feature usage for production scenarios:

452

453

```python

454

from functools import lru_cache

455

from pypinyin.core import Pinyin

456

from pypinyin.converter import DefaultConverter

457

458

class OptimizedConverter(DefaultConverter):

459

"""Performance-optimized converter with caching."""

460

461

def __init__(self, cache_size=1000):

462

super().__init__()

463

self.cache_size = cache_size

464

# Use LRU cache for frequent conversions

465

self.convert = lru_cache(maxsize=cache_size)(self.convert)

466

467

@lru_cache(maxsize=1000)

468

def convert_cached(self, han, style, errors, strict):

469

"""Cached conversion for performance."""

470

return super().convert(han, style, errors, strict)

471

472

# Batch processing with optimized converter

473

def batch_process_optimized(texts):

474

"""Process multiple texts with performance optimization."""

475

optimized_converter = OptimizedConverter(cache_size=5000)

476

pinyin_processor = Pinyin(converter=optimized_converter)

477

478

results = []

479

for text in texts:

480

result = pinyin_processor.lazy_pinyin(text)

481

results.append(result)

482

483

return results

484

485

# Example with large dataset

486

large_dataset = ['中国', '美国', '英国'] * 1000 # Repeated texts

487

results = batch_process_optimized(large_dataset)

488

print(f"Processed {len(results)} texts efficiently")

489

```