# Voice Management

## Overview

Voice management in the Google Cloud Text-to-Speech API involves discovering, selecting, and configuring voices for speech synthesis. The API provides access to hundreds of voices across multiple languages, including standard voices, high-quality WaveNet neural voices, and custom voice models.

## Voice Discovery

### Listing All Available Voices

```api { .api }
11
from google.cloud import texttospeech
12
13
# Initialize client
14
client = texttospeech.TextToSpeechClient()
15
16
# List all voices
17
response = client.list_voices()
18
19
# Iterate through available voices
20
for voice in response.voices:
21
print(f"Voice Name: {voice.name}")
22
print(f"Language Codes: {voice.language_codes}")
23
print(f"Gender: {voice.ssml_gender}")
24
print(f"Natural Sample Rate: {voice.natural_sample_rate_hertz} Hz")
25
print("---")
26
```

### Filtering Voices by Language

```api { .api }
31
from google.cloud.texttospeech import ListVoicesRequest
32
33
# List voices for specific language
34
request = ListVoicesRequest(language_code="en-US")
35
response = client.list_voices(request=request)
36
37
print(f"Found {len(response.voices)} voices for en-US:")
38
for voice in response.voices:
39
print(f"- {voice.name} ({voice.ssml_gender.name})")
40
41
# List voices for multiple languages
42
languages = ["en-US", "es-ES", "fr-FR", "de-DE"]
43
for lang in languages:
44
request = ListVoicesRequest(language_code=lang)
45
response = client.list_voices(request=request)
46
print(f"{lang}: {len(response.voices)} voices")
47
```

### Voice Information Analysis

```api { .api }
52
def analyze_voice_capabilities():
    """Fetch every available voice and bucket it by language, type, and gender."""
    client = texttospeech.TextToSpeechClient()
    response = client.list_voices()

    voice_analysis = {
        'by_language': {},
        'by_type': {'wavenet': [], 'neural2': [], 'standard': [], 'other': []},
        'by_gender': {'MALE': [], 'FEMALE': [], 'NEUTRAL': []}
    }

    for voice in response.voices:
        # A voice may advertise several language codes; index it under each one.
        for lang_code in voice.language_codes:
            voice_analysis['by_language'].setdefault(lang_code, []).append(voice.name)

        # The voice family is encoded in the voice name itself.
        if 'Wavenet' in voice.name:
            voice_analysis['by_type']['wavenet'].append(voice.name)
        elif 'Neural2' in voice.name:
            voice_analysis['by_type']['neural2'].append(voice.name)
        elif 'Standard' in voice.name:
            voice_analysis['by_type']['standard'].append(voice.name)
        else:
            voice_analysis['by_type']['other'].append(voice.name)

        # Genders outside the three tracked buckets (e.g. UNSPECIFIED) are skipped.
        gender = voice.ssml_gender.name
        if gender in voice_analysis['by_gender']:
            voice_analysis['by_gender'][gender].append(voice.name)

    return voice_analysis
87
88
# Usage
89
voice_stats = analyze_voice_capabilities()
90
print(f"WaveNet voices: {len(voice_stats['by_type']['wavenet'])}")
91
print(f"Neural2 voices: {len(voice_stats['by_type']['neural2'])}")
92
print(f"Standard voices: {len(voice_stats['by_type']['standard'])}")
93
```

## Voice Types and Models

### Voice Class Properties

```api { .api }
100
from google.cloud.texttospeech import Voice, SsmlVoiceGender
101
102
# Voice object contains:
103
# - name: str - Unique voice identifier (e.g., "en-US-Wavenet-A")
104
# - language_codes: List[str] - Supported language codes
105
# - ssml_gender: SsmlVoiceGender - Voice gender
106
# - natural_sample_rate_hertz: int - Optimal sample rate
107
108
# Access voice properties
109
def print_voice_details(voice: Voice):
    """Print the key attributes of a single Voice object."""
    languages = ', '.join(voice.language_codes)
    print(f"Name: {voice.name}")
    print(f"Languages: {languages}")
    print(f"Gender: {voice.ssml_gender.name}")
    print(f"Sample Rate: {voice.natural_sample_rate_hertz} Hz")
114
115
# Example voice categorization
116
def categorize_voice(voice_name: str) -> str:
    """Categorize voice by type based on name."""
    # Family markers never co-occur in a voice name, so the scan order
    # only mirrors the original readability ordering.
    categories = (
        ("Wavenet", "WaveNet Neural Voice (High Quality)"),
        ("Neural2", "Neural2 Voice (Premium Quality)"),
        ("Standard", "Standard Voice (Basic Quality)"),
        ("Studio", "Studio Voice (Premium)"),
        ("Polyglot", "Polyglot Voice (Multi-language)"),
    )
    for marker, label in categories:
        if marker in voice_name:
            return label
    return "Custom or Special Voice"
130
```

### Voice Quality Comparison

```api { .api }
135
# Voice quality hierarchy (best to standard). Dict insertion order is
# significant: tiers are scanned top-down when selecting a voice.
VOICE_QUALITY_TIERS = {
    "premium": ["Neural2", "Studio", "Journey"],
    "high": ["Wavenet"],
    "standard": ["Standard"],
    "custom": ["Custom"]
}

def get_best_voice_for_language(language_code: str, gender_preference=None):
    """Return the highest-quality available voice for *language_code*.

    When *gender_preference* is given, only voices of that gender are
    considered. Falls back to the first available voice when no name
    matches a known tier marker; returns None when nothing matches.
    """
    client = texttospeech.TextToSpeechClient()
    request = texttospeech.ListVoicesRequest(language_code=language_code)
    response = client.list_voices(request=request)

    voices = response.voices
    if gender_preference:
        voices = [v for v in voices if v.ssml_gender == gender_preference]

    # Walk the tiers best-to-worst; the first marker match wins.
    for tier_names in VOICE_QUALITY_TIERS.values():
        for marker in tier_names:
            for candidate in voices:
                if marker in candidate.name:
                    return candidate

    # No tier marker matched any name: fall back to the first voice, if any.
    return voices[0] if voices else None
163
164
# Usage
165
best_voice = get_best_voice_for_language(
166
"en-US",
167
texttospeech.SsmlVoiceGender.FEMALE
168
)
169
if best_voice:
170
print(f"Best voice: {best_voice.name}")
171
```

## Voice Selection

### VoiceSelectionParams Configuration

```api { .api }
178
from google.cloud.texttospeech import VoiceSelectionParams, SsmlVoiceGender
179
180
# Basic voice selection by language and gender
181
voice_params = VoiceSelectionParams(
182
language_code="en-US", # Required: BCP-47 language code
183
ssml_gender=SsmlVoiceGender.FEMALE # Optional: gender preference
184
)
185
186
# Specific voice selection by name
187
voice_params = VoiceSelectionParams(
188
language_code="en-US",
189
name="en-US-Wavenet-D" # Exact voice model name
190
)
191
192
# Voice selection with custom pronunciations
193
voice_params = VoiceSelectionParams(
194
language_code="en-US",
195
name="en-US-Neural2-A",
196
custom_pronunciations=texttospeech.CustomPronunciations(
197
pronunciations=[
198
texttospeech.CustomPronunciationParams(
199
phrase="API",
200
ipa="ˌeɪ piː ˈaɪ",
201
phonetic_encoding=texttospeech.CustomPronunciationParams.PhoneticEncoding.IPA
202
)
203
]
204
)
205
)
206
```

### Advanced Voice Selection

```api { .api }
211
from google.cloud.texttospeech import (
212
VoiceSelectionParams,
213
AdvancedVoiceOptions,
214
CustomVoiceParams,
215
VoiceCloneParams
216
)
217
218
# Voice with advanced options
219
voice_params = VoiceSelectionParams(
220
language_code="en-US",
221
name="en-US-Neural2-C",
222
advanced_voice_options=AdvancedVoiceOptions(
223
low_latency_journey_synthesis=True # Enable low-latency mode
224
)
225
)
226
227
# Custom voice model
228
voice_params = VoiceSelectionParams(
229
language_code="en-US",
230
custom_voice=CustomVoiceParams(
231
model="projects/your-project/locations/us-central1/models/custom-voice-model"
232
)
233
)
234
235
# Voice cloning
236
voice_params = VoiceSelectionParams(
237
language_code="en-US",
238
voice_clone=VoiceCloneParams(
239
voice_clone_key="your-voice-clone-key"
240
)
241
)
242
```

## Gender and Language Options

### SsmlVoiceGender Enum

```api { .api }
249
from google.cloud.texttospeech import SsmlVoiceGender
250
251
# Available gender options
252
MALE = SsmlVoiceGender.MALE # Male voice
253
FEMALE = SsmlVoiceGender.FEMALE # Female voice
254
NEUTRAL = SsmlVoiceGender.NEUTRAL # Gender-neutral voice
255
UNSPECIFIED = SsmlVoiceGender.SSML_VOICE_GENDER_UNSPECIFIED # No preference
256
257
# Usage in voice selection
258
def create_voice_by_gender(language: str, gender: SsmlVoiceGender):
259
return VoiceSelectionParams(
260
language_code=language,
261
ssml_gender=gender
262
)
263
264
# Examples
265
male_voice = create_voice_by_gender("en-US", SsmlVoiceGender.MALE)
266
female_voice = create_voice_by_gender("fr-FR", SsmlVoiceGender.FEMALE)
267
neutral_voice = create_voice_by_gender("de-DE", SsmlVoiceGender.NEUTRAL)
268
```

### Language Code Examples

```api { .api }
273
# Common language codes for voice selection (BCP-47).
# NOTE: Google Cloud TTS serves Norwegian under the Bokmål tag "nb-NO";
# a plain "no-NO" filter returns no voices. Some Mandarin voice families
# are listed under "cmn-CN"/"cmn-TW" rather than "zh-*" — verify against
# the live list_voices() output for the voices you target.
SUPPORTED_LANGUAGES = {
    "en-US": "English (United States)",
    "en-GB": "English (United Kingdom)",
    "en-AU": "English (Australia)",
    "es-ES": "Spanish (Spain)",
    "es-MX": "Spanish (Mexico)",
    "fr-FR": "French (France)",
    "fr-CA": "French (Canada)",
    "de-DE": "German (Germany)",
    "it-IT": "Italian (Italy)",
    "pt-BR": "Portuguese (Brazil)",
    "pt-PT": "Portuguese (Portugal)",
    "ja-JP": "Japanese (Japan)",
    "ko-KR": "Korean (South Korea)",
    "zh-CN": "Chinese (Mainland)",
    "zh-TW": "Chinese (Taiwan)",
    "hi-IN": "Hindi (India)",
    "ar-SA": "Arabic (Saudi Arabia)",
    "ru-RU": "Russian (Russia)",
    "nl-NL": "Dutch (Netherlands)",
    "sv-SE": "Swedish (Sweden)",
    "da-DK": "Danish (Denmark)",
    "nb-NO": "Norwegian Bokmål (Norway)",
    "fi-FI": "Finnish (Finland)",
}

def get_voices_for_languages(language_codes: list):
    """Return {language_code: [voice names]} for each requested language.

    Issues one list_voices call per language code.
    """
    client = texttospeech.TextToSpeechClient()
    results = {}

    for lang_code in language_codes:
        request = texttospeech.ListVoicesRequest(language_code=lang_code)
        response = client.list_voices(request=request)
        results[lang_code] = [voice.name for voice in response.voices]

    return results
311
```

## Custom Pronunciations

### CustomPronunciationParams Configuration

```api { .api }
318
from google.cloud.texttospeech import (
319
CustomPronunciations,
320
CustomPronunciationParams
321
)
322
323
# IPA pronunciation
324
ipa_pronunciation = CustomPronunciationParams(
325
phrase="nuclear",
326
ipa="ˈnuːkliər",
327
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
328
)
329
330
# X-SAMPA pronunciation
331
xsampa_pronunciation = CustomPronunciationParams(
332
phrase="often",
333
ipa="Q:ft@n", # X-SAMPA notation
334
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.X_SAMPA
335
)
336
337
# Collection of custom pronunciations
338
custom_pronunciations = CustomPronunciations(
339
pronunciations=[
340
CustomPronunciationParams(
341
phrase="GitHub",
342
ipa="ˈɡɪt hʌb",
343
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
344
),
345
CustomPronunciationParams(
346
phrase="API",
347
ipa="ˌeɪ piː ˈaɪ",
348
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
349
),
350
CustomPronunciationParams(
351
phrase="OAuth",
352
ipa="ˈoʊ ɔːθ",
353
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
354
)
355
]
356
)
357
```

### Using Custom Pronunciations

```api { .api }
362
def create_voice_with_custom_pronunciations(language_code: str, pronunciations_dict: dict):
    """Build VoiceSelectionParams from a {phrase: IPA string} mapping.

    Each dictionary entry becomes one CustomPronunciationParams with
    IPA phonetic encoding.
    """
    pronunciation_params = [
        CustomPronunciationParams(
            phrase=phrase,
            ipa=ipa_value,
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA,
        )
        for phrase, ipa_value in pronunciations_dict.items()
    ]

    return VoiceSelectionParams(
        language_code=language_code,
        custom_pronunciations=CustomPronunciations(
            pronunciations=pronunciation_params
        ),
    )
385
386
# Usage example
387
tech_pronunciations = {
388
"JSON": "ˈdʒeɪ sɒn",
389
"SQL": "ˈsiː kwəl",
390
"HTTP": "ˌeɪtʃ tiː tiː ˈpiː",
391
"URL": "ˌjuː ɑːr ˈɛl",
392
"CSS": "ˌsiː ɛs ˈɛs"
393
}
394
395
tech_voice = create_voice_with_custom_pronunciations("en-US", tech_pronunciations)
396
397
# Use in synthesis request
398
request = texttospeech.SynthesizeSpeechRequest(
399
input=texttospeech.SynthesisInput(
400
text="We'll use JSON data via HTTP API calls and style with CSS."
401
),
402
voice=tech_voice,
403
audio_config=texttospeech.AudioConfig(
404
audio_encoding=texttospeech.AudioEncoding.MP3
405
)
406
)
407
```

## Voice Filtering and Selection Helpers

### Voice Filtering Functions

```api { .api }
414
def filter_voices_by_criteria(language_code: str = None, gender: SsmlVoiceGender = None,
                              voice_type: str = None):
    """List voices, optionally narrowed by language, gender, and type marker."""
    client = texttospeech.TextToSpeechClient()

    # Language filtering happens server-side when a code is given;
    # otherwise fetch the full catalog.
    if language_code:
        response = client.list_voices(
            request=texttospeech.ListVoicesRequest(language_code=language_code)
        )
    else:
        response = client.list_voices()

    filtered_voices = response.voices

    # Gender and voice-type filters are applied client-side.
    if gender:
        filtered_voices = [v for v in filtered_voices if v.ssml_gender == gender]
    if voice_type:
        filtered_voices = [v for v in filtered_voices if voice_type in v.name]

    return filtered_voices
437
438
# Usage examples
439
wavenet_female_voices = filter_voices_by_criteria(
440
language_code="en-US",
441
gender=SsmlVoiceGender.FEMALE,
442
voice_type="Wavenet"
443
)
444
445
neural2_voices = filter_voices_by_criteria(voice_type="Neural2")
446
male_spanish_voices = filter_voices_by_criteria(
447
language_code="es-ES",
448
gender=SsmlVoiceGender.MALE
449
)
450
```

### Voice Recommendation System

```api { .api }
455
class VoiceRecommender:
    """Intelligent voice recommendation system.

    Wraps a TextToSpeechClient and caches list_voices results so that
    repeated recommendations for the same language avoid extra API calls.
    """

    def __init__(self):
        self.client = texttospeech.TextToSpeechClient()
        # Cache of list_voices results keyed by language code, or "all"
        # when no language filter was used.
        self._voice_cache = {}

    def get_cached_voices(self, language_code: str = None):
        """Get voices with caching for performance."""
        cache_key = language_code or "all"

        # Only hit the API on a cache miss.
        if cache_key not in self._voice_cache:
            if language_code:
                request = texttospeech.ListVoicesRequest(language_code=language_code)
                response = self.client.list_voices(request=request)
            else:
                response = self.client.list_voices()
            self._voice_cache[cache_key] = response.voices

        return self._voice_cache[cache_key]

    def recommend_voice(self, language_code: str, preferences: dict = None):
        """Recommend best voice based on preferences.

        Recognized preference keys: "gender" (SsmlVoiceGender),
        "sample_rate" (int, Hz), "voice_name" (substring of voice name).
        Returns None when no voices exist for the language.
        """
        preferences = preferences or {}

        voices = self.get_cached_voices(language_code)
        if not voices:
            return None

        # Scoring system: each voice accumulates points; highest total wins.
        scored_voices = []
        for voice in voices:
            score = 0

            # Quality scoring (family inferred from the voice name)
            if "Neural2" in voice.name:
                score += 100
            elif "Wavenet" in voice.name:
                score += 80
            elif "Standard" in voice.name:
                score += 60

            # Gender preference
            if preferences.get("gender") == voice.ssml_gender:
                score += 50

            # Sample rate preference
            preferred_rate = preferences.get("sample_rate")
            if preferred_rate and voice.natural_sample_rate_hertz == preferred_rate:
                score += 30

            # Name preference (if specific voice requested) — outweighs all
            # other criteria combined.
            if preferences.get("voice_name") and preferences["voice_name"] in voice.name:
                score += 200

            scored_voices.append((voice, score))

        # Return highest scored voice (stable sort: ties keep API order).
        scored_voices.sort(key=lambda x: x[1], reverse=True)
        return scored_voices[0][0] if scored_voices else None

    def get_voice_alternatives(self, primary_voice_name: str, count: int = 3):
        """Get alternative voices similar to the primary voice.

        Returns up to *count* same-gender voices for the primary voice's
        language, with same-family voices ordered first. Returns [] when
        the name has no parseable language prefix.
        """
        # Extract language from primary voice name (e.g. "en-US-Wavenet-D").
        lang_parts = primary_voice_name.split("-")
        if len(lang_parts) >= 2:
            language_code = f"{lang_parts[0]}-{lang_parts[1]}"
        else:
            return []

        voices = self.get_cached_voices(language_code)

        # Find similar voices (same type and gender if possible)
        primary_voice = next((v for v in voices if v.name == primary_voice_name), None)
        if not primary_voice:
            # Unknown primary: just return the first few voices as-is.
            return voices[:count]

        similar_voices = []
        for voice in voices:
            if (voice.name != primary_voice_name and
                voice.ssml_gender == primary_voice.ssml_gender):

                # Prefer same voice type: front-insert same-family matches
                # so they survive the final truncation.
                if any(vtype in voice.name and vtype in primary_voice_name
                for vtype in ["Neural2", "Wavenet", "Standard"]):
                    similar_voices.insert(0, voice)
                else:
                    similar_voices.append(voice)

        return similar_voices[:count]
545
546
# Usage
547
recommender = VoiceRecommender()
548
549
# Get recommendation with preferences
550
preferences = {
551
"gender": SsmlVoiceGender.FEMALE,
552
"sample_rate": 24000
553
}
554
recommended_voice = recommender.recommend_voice("en-US", preferences)
555
556
# Get alternatives to a specific voice
557
alternatives = recommender.get_voice_alternatives("en-US-Wavenet-D", count=5)
558
```

## Voice Testing and Comparison

### Voice Comparison Tool

```api { .api }
565
def compare_voices(text: str, voice_names: list, output_dir: str = "voice_comparison"):
    """Synthesize *text* with each named voice and save MP3 samples.

    Returns one result dict per voice: on success the saved file path and
    audio size, on failure the error message. A failing voice does not
    stop the remaining comparisons.
    """
    import os

    client = texttospeech.TextToSpeechClient()
    os.makedirs(output_dir, exist_ok=True)

    results = []

    for voice_name in voice_names:
        # Derive the language code from the voice name ("en-US-..." ->
        # "en-US"), defaulting to en-US when there is no language prefix.
        lang_parts = voice_name.split("-")
        language_code = f"{lang_parts[0]}-{lang_parts[1]}" if len(lang_parts) >= 2 else "en-US"

        try:
            response = client.synthesize_speech(
                request=texttospeech.SynthesizeSpeechRequest(
                    input=texttospeech.SynthesisInput(text=text),
                    voice=VoiceSelectionParams(
                        language_code=language_code,
                        name=voice_name,
                    ),
                    audio_config=texttospeech.AudioConfig(
                        audio_encoding=texttospeech.AudioEncoding.MP3
                    ),
                )
            )

            # Save audio file
            filename = f"{voice_name.replace('-', '_')}.mp3"
            filepath = os.path.join(output_dir, filename)
            with open(filepath, "wb") as f:
                f.write(response.audio_content)

            results.append({
                "voice_name": voice_name,
                "file_path": filepath,
                "success": True,
                "audio_size": len(response.audio_content),
            })

        except Exception as e:
            # Record the failure and keep comparing the remaining voices.
            results.append({
                "voice_name": voice_name,
                "file_path": None,
                "success": False,
                "error": str(e),
            })

    return results
616
617
# Usage
618
test_voices = [
619
"en-US-Neural2-A",
620
"en-US-Neural2-C",
621
"en-US-Wavenet-A",
622
"en-US-Wavenet-D",
623
"en-US-Standard-A"
624
]
625
626
comparison_results = compare_voices(
627
"Hello, this is a test of different voice qualities and characteristics.",
628
test_voices
629
)
630
631
for result in comparison_results:
632
if result["success"]:
633
print(f"✓ {result['voice_name']}: {result['audio_size']} bytes")
634
else:
635
print(f"✗ {result['voice_name']}: {result['error']}")
636
```

### Voice Quality Assessment

```api { .api }
641
def assess_voice_quality(voice_name: str) -> dict:
    """Assess voice quality characteristics based on name and properties.

    Returns a dict with quality_tier, naturalness, recommended_use,
    latency, and cost. Names matching no known family keep the defaults.
    """
    # Per-family overrides; the first marker found in the name wins.
    # Family markers never co-occur, so the order matches the original
    # if/elif chain.
    family_profiles = (
        ("Neural2", {
            "quality_tier": "premium",
            "naturalness": "very_high",
            "recommended_use": "professional_content",
            "latency": "medium",
            "cost": "high",
        }),
        ("Wavenet", {
            "quality_tier": "high",
            "naturalness": "high",
            "recommended_use": "content_creation",
            "latency": "medium",
            "cost": "medium_high",
        }),
        ("Standard", {
            "quality_tier": "basic",
            "naturalness": "medium",
            "recommended_use": "notifications",
            "latency": "low",
            "cost": "low",
        }),
        ("Studio", {
            "quality_tier": "premium",
            "naturalness": "very_high",
            "recommended_use": "audiobooks",
            "latency": "high",
            "cost": "high",
        }),
    )

    # Defaults used when the name matches no known voice family.
    quality_assessment = {
        "voice_name": voice_name,
        "quality_tier": "unknown",
        "naturalness": "medium",
        "recommended_use": "general",
        "latency": "medium",
        "cost": "medium",
    }

    for marker, overrides in family_profiles:
        if marker in voice_name:
            quality_assessment.update(overrides)
            break

    return quality_assessment
688
689
# Assess multiple voices
690
voice_assessments = [
691
assess_voice_quality("en-US-Neural2-A"),
692
assess_voice_quality("en-US-Wavenet-D"),
693
assess_voice_quality("en-US-Standard-B")
694
]
695
696
for assessment in voice_assessments:
697
print(f"{assessment['voice_name']}: {assessment['quality_tier']} quality, "
698
f"{assessment['naturalness']} naturalness, {assessment['cost']} cost")
699
```