0
# Configuration Types
1
2
## Overview
3
4
The Google Cloud Text-to-Speech API provides extensive configuration options through various classes and types. These configuration objects control voice selection, audio output, input formatting, and advanced features like custom pronunciations and multi-speaker synthesis.
5
6
## Core Configuration Classes
7
8
### SynthesisInput
9
10
```api { .api }
11
from google.cloud.texttospeech import SynthesisInput, MultiSpeakerMarkup
12
13
# Plain text input
14
text_input = SynthesisInput(
15
text="Convert this plain text to speech"
16
)
17
18
# SSML input
19
ssml_input = SynthesisInput(
20
ssml='<speak>Convert this <emphasis level="strong">SSML</emphasis> to speech</speak>'
21
)
22
23
# Multi-speaker markup input
24
multi_speaker_input = SynthesisInput(
25
multi_speaker_markup=MultiSpeakerMarkup(
26
ssml='''
27
<speak>
28
<voice name="en-US-Neural2-A">Hello from speaker one.</voice>
29
<voice name="en-US-Neural2-C">And greetings from speaker two.</voice>
30
</speak>
31
'''
32
)
33
)
34
35
# SynthesisInput only accepts ONE of: text, ssml, or multi_speaker_markup
36
# Using multiple will raise an error
37
```
38
39
### VoiceSelectionParams
40
41
```api { .api }
42
from google.cloud.texttospeech import (
43
VoiceSelectionParams,
44
SsmlVoiceGender,
45
CustomPronunciations,
46
CustomPronunciationParams,
47
AdvancedVoiceOptions,
48
CustomVoiceParams,
49
VoiceCloneParams
50
)
51
52
# Basic voice selection
53
basic_voice = VoiceSelectionParams(
54
language_code="en-US", # Required: BCP-47 language code
55
ssml_gender=SsmlVoiceGender.FEMALE # Optional: voice gender preference
56
)
57
58
# Specific voice selection
59
specific_voice = VoiceSelectionParams(
60
language_code="en-US",
61
name="en-US-Wavenet-D" # Exact voice model name
62
)
63
64
# Voice with custom pronunciations
65
voice_with_pronunciations = VoiceSelectionParams(
66
language_code="en-US",
67
name="en-US-Neural2-A",
68
custom_pronunciations=CustomPronunciations(
69
pronunciations=[
70
CustomPronunciationParams(
71
phrase="GitHub",
72
ipa="ˈɡɪt hʌb",
73
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
74
),
75
CustomPronunciationParams(
76
phrase="API",
77
ipa="ˌeɪ piː ˈaɪ",
78
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
79
)
80
]
81
)
82
)
83
84
# Voice with advanced options
85
advanced_voice = VoiceSelectionParams(
86
language_code="en-US",
87
name="en-US-Neural2-C",
88
advanced_voice_options=AdvancedVoiceOptions(
89
low_latency_journey_synthesis=True # Enable low-latency processing
90
)
91
)
92
93
# Custom voice model
94
custom_voice = VoiceSelectionParams(
95
language_code="en-US",
96
custom_voice=CustomVoiceParams(
97
model="projects/your-project/locations/us-central1/models/custom-model"
98
)
99
)
100
101
# Voice cloning
102
cloned_voice = VoiceSelectionParams(
103
language_code="en-US",
104
voice_clone=VoiceCloneParams(
105
voice_clone_key="your-voice-clone-key"
106
)
107
)
108
```
109
110
### AudioConfig
111
112
```api { .api }
113
from google.cloud.texttospeech import AudioConfig, AudioEncoding
114
115
# Basic audio configuration
116
basic_audio = AudioConfig(
117
audio_encoding=AudioEncoding.MP3, # Required: output format
118
sample_rate_hertz=22050 # Optional: sample rate (Hz)
119
)
120
121
# Complete audio configuration
122
complete_audio = AudioConfig(
123
audio_encoding=AudioEncoding.LINEAR16, # Audio format
124
sample_rate_hertz=24000, # Sample rate
125
speaking_rate=1.0, # Speech rate (0.25-4.0)
126
pitch=0.0, # Pitch adjustment (-20.0 to 20.0)
127
volume_gain_db=0.0, # Volume gain (-96.0 to 16.0)
128
effects_profile_id=["large-home-entertainment-class-device"] # Audio effects
129
)
130
131
# High-quality audio configuration
132
high_quality_audio = AudioConfig(
133
audio_encoding=AudioEncoding.LINEAR16,
134
sample_rate_hertz=48000,
135
speaking_rate=0.95,
136
pitch=1.0,
137
volume_gain_db=2.0
138
)
139
140
# Compressed audio for streaming
141
streaming_audio = AudioConfig(
142
audio_encoding=AudioEncoding.OGG_OPUS,
143
sample_rate_hertz=48000,
144
speaking_rate=1.1,
145
effects_profile_id=["wearable-class-device"]
146
)
147
148
# Telephony optimized audio
149
telephony_audio = AudioConfig(
150
audio_encoding=AudioEncoding.MULAW,
151
sample_rate_hertz=8000,
152
speaking_rate=1.2,
153
effects_profile_id=["telephony-class-application"]
154
)
155
```
156
157
### Voice
158
159
```api { .api }
160
from google.cloud.texttospeech import Voice, SsmlVoiceGender
161
162
# Voice object (returned by list_voices())
163
# Contains voice information and capabilities
164
165
def analyze_voice_properties(voice: Voice):
    """Print the key attributes of a Voice object and classify its tier by name."""
    print(f"Name: {voice.name}")  # e.g., "en-US-Wavenet-A"
    print(f"Language Codes: {voice.language_codes}")  # e.g., ["en-US"]
    print(f"SSML Gender: {voice.ssml_gender}")  # SsmlVoiceGender enum
    print(f"Natural Sample Rate: {voice.natural_sample_rate_hertz} Hz")  # e.g., 24000

    # Map a substring of the voice name to a human-readable tier label.
    # Order matters: first match wins, mirroring the tier precedence.
    tiers = (
        ("Neural2", "Premium Neural Voice"),
        ("Wavenet", "High-Quality Neural Voice"),
        ("Standard", "Standard Voice"),
        ("Studio", "Studio Voice"),
    )
    label = next(
        (tier for marker, tier in tiers if marker in voice.name),
        "Custom or Special Voice",
    )
    print(f"Type: {label}")
184
185
# Example usage with actual Voice objects
186
# voices_response = client.list_voices()
187
# for voice in voices_response.voices:
188
# analyze_voice_properties(voice)
189
```
190
191
## Streaming Configuration Classes
192
193
### StreamingAudioConfig
194
195
```api { .api }
196
from google.cloud.texttospeech import StreamingAudioConfig, AudioEncoding
197
198
# Basic streaming audio configuration
199
streaming_basic = StreamingAudioConfig(
200
audio_encoding=AudioEncoding.LINEAR16, # Required: audio format
201
sample_rate_hertz=22050 # Required: sample rate
202
)
203
204
# Advanced streaming audio configuration
205
streaming_advanced = StreamingAudioConfig(
206
audio_encoding=AudioEncoding.OGG_OPUS, # Compressed format
207
sample_rate_hertz=48000, # High sample rate
208
speaking_rate=1.0, # Normal speech rate
209
pitch=0.0, # Neutral pitch
210
volume_gain_db=1.0, # Slight volume boost
211
effects_profile_id=["small-bluetooth-speaker-class-device"] # Audio effects
212
)
213
214
# Low-latency streaming configuration
215
streaming_low_latency = StreamingAudioConfig(
216
audio_encoding=AudioEncoding.LINEAR16,
217
sample_rate_hertz=16000, # Lower rate for speed
218
speaking_rate=1.1 # Slightly faster
219
)
220
221
# High-quality streaming configuration
222
streaming_high_quality = StreamingAudioConfig(
223
audio_encoding=AudioEncoding.LINEAR16,
224
sample_rate_hertz=48000,
225
speaking_rate=0.9, # Slightly slower
226
pitch=-0.5, # Lower pitch
227
volume_gain_db=2.0 # Volume boost
228
)
229
```
230
231
### StreamingSynthesizeConfig
232
233
```api { .api }
234
from google.cloud.texttospeech import (
235
StreamingSynthesizeConfig,
236
VoiceSelectionParams,
237
StreamingAudioConfig
238
)
239
240
# Complete streaming synthesis configuration
241
streaming_config = StreamingSynthesizeConfig(
242
voice=VoiceSelectionParams(
243
language_code="en-US",
244
name="en-US-Neural2-A",
245
ssml_gender=SsmlVoiceGender.FEMALE
246
),
247
audio_config=StreamingAudioConfig(
248
audio_encoding=AudioEncoding.LINEAR16,
249
sample_rate_hertz=22050,
250
speaking_rate=1.0,
251
pitch=0.0,
252
volume_gain_db=0.0
253
)
254
)
255
256
# Low-latency streaming configuration
257
low_latency_streaming = StreamingSynthesizeConfig(
258
voice=VoiceSelectionParams(
259
language_code="en-US",
260
name="en-US-Standard-B", # Standard voice for speed
261
advanced_voice_options=AdvancedVoiceOptions(
262
low_latency_journey_synthesis=True
263
)
264
),
265
audio_config=StreamingAudioConfig(
266
audio_encoding=AudioEncoding.LINEAR16,
267
sample_rate_hertz=16000 # Lower sample rate
268
)
269
)
270
271
# Multi-language streaming configuration
272
multilang_streaming = StreamingSynthesizeConfig(
273
voice=VoiceSelectionParams(
274
language_code="en-US",
275
name="en-US-Polyglot-1" # Polyglot voice if available
276
),
277
audio_config=StreamingAudioConfig(
278
audio_encoding=AudioEncoding.MP3,
279
sample_rate_hertz=24000
280
)
281
)
282
```
283
284
### StreamingSynthesisInput
285
286
```api { .api }
287
from google.cloud.texttospeech import StreamingSynthesisInput
288
289
# Text input for streaming
290
text_stream_input = StreamingSynthesisInput(
291
text="This text will be streamed to the synthesis service."
292
)
293
294
# SSML input for streaming
295
ssml_stream_input = StreamingSynthesisInput(
296
ssml='<speak>This <emphasis level="moderate">SSML content</emphasis> will be streamed.</speak>'
297
)
298
299
# Note: StreamingSynthesisInput accepts either text OR ssml, not both
300
# Each streaming request should contain one input chunk
301
```
302
303
## Advanced Configuration Classes
304
305
### AdvancedVoiceOptions
306
307
```api { .api }
308
from google.cloud.texttospeech import AdvancedVoiceOptions
309
310
# Advanced voice configuration
311
advanced_options = AdvancedVoiceOptions(
312
low_latency_journey_synthesis=True # Enable low-latency processing
313
)
314
315
# Usage in voice selection
316
voice_with_advanced = VoiceSelectionParams(
317
language_code="en-US",
318
name="en-US-Neural2-A",
319
advanced_voice_options=advanced_options
320
)
321
322
# Direct configuration
323
direct_advanced_voice = VoiceSelectionParams(
324
language_code="en-US",
325
name="en-US-Neural2-C",
326
advanced_voice_options=AdvancedVoiceOptions(
327
low_latency_journey_synthesis=True
328
)
329
)
330
```
331
332
### CustomPronunciations and CustomPronunciationParams
333
334
```api { .api }
335
from google.cloud.texttospeech import (
336
CustomPronunciations,
337
CustomPronunciationParams
338
)
339
340
# Individual pronunciation parameter
341
pronunciation_param = CustomPronunciationParams(
342
phrase="PyTorch", # Word or phrase to customize
343
ipa="ˈpaɪ tɔrʧ", # IPA pronunciation
344
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA # Encoding type
345
)
346
347
# X-SAMPA encoding example
348
xsampa_param = CustomPronunciationParams(
349
phrase="neural",
350
ipa="n\"jU@r@l", # X-SAMPA notation
351
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.X_SAMPA
352
)
353
354
# Collection of custom pronunciations
355
custom_pronunciations = CustomPronunciations(
356
pronunciations=[
357
CustomPronunciationParams(
358
phrase="TensorFlow",
359
ipa="ˈtɛnsər floʊ",
360
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
361
),
362
CustomPronunciationParams(
363
phrase="Kubernetes",
364
ipa="ˌkubərˈnɛtɪs",
365
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
366
),
367
CustomPronunciationParams(
368
phrase="OAuth",
369
ipa="ˈoʊ ɔːθ",
370
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
371
),
372
CustomPronunciationParams(
373
phrase="JSON",
374
ipa="ˈdʒeɪ sɒn",
375
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
376
)
377
]
378
)
379
380
# Technical terms pronunciations
381
tech_pronunciations = CustomPronunciations(
382
pronunciations=[
383
CustomPronunciationParams(
384
phrase="API", ipa="ˌeɪ piː ˈaɪ",
385
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
386
),
387
CustomPronunciationParams(
388
phrase="HTTP", ipa="ˌeɪʧ tiː tiː ˈpiː",
389
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
390
),
391
CustomPronunciationParams(
392
phrase="URL", ipa="ˌjuː ɑːr ˈɛl",
393
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
394
),
395
CustomPronunciationParams(
396
phrase="SQL", ipa="ˈsiː kwəl",
397
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
398
)
399
]
400
)
401
```
402
403
### MultiSpeakerMarkup
404
405
```api { .api }
406
from google.cloud.texttospeech import MultiSpeakerMarkup
407
408
# Basic multi-speaker configuration
409
multi_speaker = MultiSpeakerMarkup(
410
ssml='''
411
<speak>
412
<voice name="en-US-Neural2-A">
413
Hello, I'm the first speaker in this conversation.
414
</voice>
415
<voice name="en-US-Neural2-C">
416
And I'm the second speaker responding to you.
417
</voice>
418
</speak>
419
'''
420
)
421
422
# Complex multi-speaker conversation
423
conversation_markup = MultiSpeakerMarkup(
424
ssml='''
425
<speak>
426
<voice name="en-US-Neural2-A">
427
<prosody rate="medium" pitch="normal">
428
Welcome to our technical presentation.
429
</prosody>
430
</voice>
431
432
<break time="1s"/>
433
434
<voice name="en-US-Neural2-C">
435
<prosody rate="slow" pitch="+2st">
436
Today we'll discuss advanced AI concepts.
437
</prosody>
438
</voice>
439
440
<break time="2s"/>
441
442
<voice name="en-US-Wavenet-D">
443
<prosody rate="fast" pitch="-1st">
444
Let's start with the technical implementation details.
445
</prosody>
446
</voice>
447
</speak>
448
'''
449
)
450
451
# Dialogue with emotions and pacing
452
dialogue_markup = MultiSpeakerMarkup(
453
ssml='''
454
<speak>
455
<voice name="en-US-Neural2-A">
456
<prosody rate="medium" pitch="normal" volume="loud">
457
I have exciting news to share!
458
</prosody>
459
</voice>
460
461
<voice name="en-US-Neural2-C">
462
<prosody rate="slow" pitch="low" volume="soft">
463
Please, tell me more about it.
464
</prosody>
465
</voice>
466
467
<voice name="en-US-Neural2-A">
468
<prosody rate="fast" pitch="high" volume="loud">
469
We've achieved a breakthrough in our research!
470
</prosody>
471
</voice>
472
</speak>
473
'''
474
)
475
```
476
477
### CustomVoiceParams
478
479
```api { .api }
480
from google.cloud.texttospeech import CustomVoiceParams
481
482
# Custom voice model configuration
483
custom_voice_params = CustomVoiceParams(
484
model="projects/your-project-id/locations/us-central1/models/your-custom-voice-model"
485
)
486
487
# Usage with voice selection
488
voice_with_custom_model = VoiceSelectionParams(
489
language_code="en-US",
490
custom_voice=custom_voice_params
491
)
492
493
# Complete custom voice configuration
494
complete_custom_voice = VoiceSelectionParams(
495
language_code="en-US",
496
custom_voice=CustomVoiceParams(
497
model="projects/your-project-id/locations/us-central1/models/custom-narrator-voice"
498
),
499
custom_pronunciations=CustomPronunciations(
500
pronunciations=[
501
CustomPronunciationParams(
502
phrase="company_name",
503
ipa="ˈkʌmpəni neɪm",
504
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
505
)
506
]
507
)
508
)
509
```
510
511
### VoiceCloneParams
512
513
```api { .api }
514
from google.cloud.texttospeech import VoiceCloneParams
515
516
# Voice cloning configuration
517
voice_clone_params = VoiceCloneParams(
518
voice_clone_key="your-voice-clone-key-from-console"
519
)
520
521
# Usage with voice selection
522
cloned_voice_selection = VoiceSelectionParams(
523
language_code="en-US",
524
voice_clone=voice_clone_params
525
)
526
527
# Complete cloned voice setup
528
complete_cloned_voice = VoiceSelectionParams(
529
language_code="en-US",
530
voice_clone=VoiceCloneParams(
531
voice_clone_key="abcd-1234-efgh-5678"
532
),
533
advanced_voice_options=AdvancedVoiceOptions(
534
low_latency_journey_synthesis=True
535
)
536
)
537
```
538
539
## Enums and Constants
540
541
### AudioEncoding
542
543
```api { .api }
544
from google.cloud.texttospeech import AudioEncoding
545
546
# Available audio encoding formats
547
LINEAR16 = AudioEncoding.LINEAR16 # 16-bit PCM with WAV header (lossless)
548
MP3 = AudioEncoding.MP3 # MP3 at 32kbps (compressed)
549
OGG_OPUS = AudioEncoding.OGG_OPUS # Opus in Ogg container (compressed)
550
MULAW = AudioEncoding.MULAW # 8-bit G.711 PCMU/mu-law (telephony)
551
ALAW = AudioEncoding.ALAW # 8-bit G.711 PCMA/A-law (telephony)
552
PCM = AudioEncoding.PCM # 16-bit PCM without header (raw)
553
M4A = AudioEncoding.M4A # M4A format (compressed)
554
UNSPECIFIED = AudioEncoding.AUDIO_ENCODING_UNSPECIFIED # Not specified
555
556
# Usage in audio configuration
557
high_quality_config = AudioConfig(
558
audio_encoding=AudioEncoding.LINEAR16, # Best quality
559
sample_rate_hertz=48000
560
)
561
562
compressed_config = AudioConfig(
563
audio_encoding=AudioEncoding.MP3, # Good compression
564
sample_rate_hertz=22050
565
)
566
567
telephony_config = AudioConfig(
568
audio_encoding=AudioEncoding.MULAW, # Telephony standard
569
sample_rate_hertz=8000
570
)
571
```
572
573
### SsmlVoiceGender
574
575
```api { .api }
576
from google.cloud.texttospeech import SsmlVoiceGender
577
578
# Available gender options
579
MALE = SsmlVoiceGender.MALE # Male voice
580
FEMALE = SsmlVoiceGender.FEMALE # Female voice
581
NEUTRAL = SsmlVoiceGender.NEUTRAL # Gender-neutral voice
582
UNSPECIFIED = SsmlVoiceGender.SSML_VOICE_GENDER_UNSPECIFIED # No preference
583
584
# Usage in voice selection
585
male_voice = VoiceSelectionParams(
586
language_code="en-US",
587
ssml_gender=SsmlVoiceGender.MALE
588
)
589
590
female_voice = VoiceSelectionParams(
591
language_code="en-US",
592
ssml_gender=SsmlVoiceGender.FEMALE
593
)
594
595
neutral_voice = VoiceSelectionParams(
596
language_code="en-US",
597
ssml_gender=SsmlVoiceGender.NEUTRAL
598
)
599
```
600
601
### PhoneticEncoding
602
603
```api { .api }
604
from google.cloud.texttospeech import CustomPronunciationParams
605
606
# Available phonetic encoding options
607
IPA = CustomPronunciationParams.PhoneticEncoding.IPA # International Phonetic Alphabet
608
X_SAMPA = CustomPronunciationParams.PhoneticEncoding.X_SAMPA # X-SAMPA notation
609
UNSPECIFIED = CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_UNSPECIFIED
610
611
# Usage in pronunciation parameters
612
ipa_pronunciation = CustomPronunciationParams(
613
phrase="example",
614
ipa="ɪɡˈzæmpəl",
615
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
616
)
617
618
xsampa_pronunciation = CustomPronunciationParams(
619
phrase="example",
620
ipa="Ig\"z{mp@l",
621
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.X_SAMPA
622
)
623
```
624
625
## Configuration Validation and Helpers
626
627
### Configuration Validation
628
629
```api { .api }
630
def validate_audio_config(audio_config: AudioConfig) -> tuple[bool, list[str]]:
    """Validate audio configuration parameters.

    Returns:
        (is_valid, errors): ``is_valid`` is True when no problems were
        found; ``errors`` holds a human-readable message per violation.
    """
    errors: list[str] = []

    # audio_encoding is the only mandatory field.
    if not getattr(audio_config, 'audio_encoding', None):
        errors.append("audio_encoding is required")

    # Sample rate must be one of the rates the API accepts.
    sample_rate = getattr(audio_config, 'sample_rate_hertz', None)
    if sample_rate:
        valid_rates = [8000, 16000, 22050, 24000, 32000, 44100, 48000]
        if sample_rate not in valid_rates:
            errors.append(f"sample_rate_hertz must be one of {valid_rates}, got {sample_rate}")

    # Range checks for the optional prosody fields. Zero/unset values are
    # skipped, matching the proto3 "unset means server default" convention.
    rate = getattr(audio_config, 'speaking_rate', None)
    if rate and not (0.25 <= rate <= 4.0):
        errors.append(f"speaking_rate must be between 0.25 and 4.0, got {rate}")

    pitch = getattr(audio_config, 'pitch', None)
    if pitch and not (-20.0 <= pitch <= 20.0):
        errors.append(f"pitch must be between -20.0 and 20.0, got {pitch}")

    volume = getattr(audio_config, 'volume_gain_db', None)
    if volume and not (-96.0 <= volume <= 16.0):
        errors.append(f"volume_gain_db must be between -96.0 and 16.0, got {volume}")

    return not errors, errors
664
665
def validate_voice_selection(voice: VoiceSelectionParams) -> tuple[bool, list[str]]:
    """Validate voice selection parameters.

    Returns:
        (is_valid, errors): ``is_valid`` is True when no problems were
        found; ``errors`` holds a human-readable message per violation.
    """
    errors: list[str] = []

    lang_code = getattr(voice, 'language_code', None)
    if not lang_code:
        errors.append("language_code is required")
    elif lang_code.count('-') < 1 or len(lang_code) < 2:
        # Loose sanity check for BCP-47 form such as "en-US"; not a full parser.
        errors.append(f"language_code should be in BCP-47 format (e.g., 'en-US'), got '{lang_code}'")

    # A request may name at most one concrete voice source.
    voice_sources = ('name', 'custom_voice', 'voice_clone')
    specified_count = sum(1 for attr in voice_sources if getattr(voice, attr, None))
    if specified_count > 1:
        errors.append("Only one of 'name', 'custom_voice', or 'voice_clone' should be specified")

    return not errors, errors
689
690
# Usage examples
691
audio_config = AudioConfig(
692
audio_encoding=AudioEncoding.MP3,
693
sample_rate_hertz=22050,
694
speaking_rate=1.5,
695
pitch=2.0
696
)
697
698
is_valid, validation_errors = validate_audio_config(audio_config)
699
if not is_valid:
700
print(f"Audio config validation errors: {validation_errors}")
701
```
702
703
### Configuration Builders
704
705
```api { .api }
706
class ConfigurationBuilder:
    """Helper class for building complex configurations."""

    @staticmethod
    def build_high_quality_config() -> AudioConfig:
        """Build high-quality audio configuration."""
        # Lossless format at the highest supported rate; slightly slower
        # speech and a mild volume boost for clarity.
        return AudioConfig(
            audio_encoding=AudioEncoding.LINEAR16,
            sample_rate_hertz=48000,
            speaking_rate=0.95,
            pitch=0.0,
            volume_gain_db=1.0,
        )

    @staticmethod
    def build_streaming_config() -> AudioConfig:
        """Build streaming-optimized audio configuration."""
        # Opus compresses well for network delivery at a moderate rate.
        return AudioConfig(
            audio_encoding=AudioEncoding.OGG_OPUS,
            sample_rate_hertz=24000,
            speaking_rate=1.1,
            volume_gain_db=0.0,
        )

    @staticmethod
    def build_mobile_config() -> AudioConfig:
        """Build mobile-optimized audio configuration."""
        return AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=16000,
            speaking_rate=1.2,
            effects_profile_id=["handset-class-device"],
        )

    @staticmethod
    def build_tech_voice_with_pronunciations(language_code: str = "en-US") -> VoiceSelectionParams:
        """Build voice configuration optimized for technical content."""
        # (phrase, IPA) pairs for acronyms TTS engines often mispronounce.
        term_ipa = [
            ("API", "ˌeɪ piː ˈaɪ"),
            ("JSON", "ˈdʒeɪ sɒn"),
            ("HTTP", "ˌeɪʧ tiː tiː ˈpiː"),
            ("SQL", "ˈsiː kwəl"),
        ]
        tech_pronunciations = CustomPronunciations(
            pronunciations=[
                CustomPronunciationParams(
                    phrase=phrase,
                    ipa=ipa,
                    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA,
                )
                for phrase, ipa in term_ipa
            ]
        )

        return VoiceSelectionParams(
            language_code=language_code,
            name=f"{language_code}-Neural2-A",
            custom_pronunciations=tech_pronunciations,
        )

    @staticmethod
    def build_conversation_voices() -> list[VoiceSelectionParams]:
        """Build multiple voices for conversation synthesis."""
        # (voice name, gender) for three distinct conversation participants.
        speakers = [
            ("en-US-Neural2-A", SsmlVoiceGender.FEMALE),   # Female voice
            ("en-US-Neural2-C", SsmlVoiceGender.MALE),     # Male voice
            ("en-US-Neural2-F", SsmlVoiceGender.NEUTRAL),  # Neutral voice
        ]
        return [
            VoiceSelectionParams(
                language_code="en-US",
                name=voice_name,
                ssml_gender=gender,
            )
            for voice_name, gender in speakers
        ]
791
792
# Usage examples
793
high_quality_audio = ConfigurationBuilder.build_high_quality_config()
794
streaming_audio = ConfigurationBuilder.build_streaming_config()
795
mobile_audio = ConfigurationBuilder.build_mobile_config()
796
tech_voice = ConfigurationBuilder.build_tech_voice_with_pronunciations("en-US")
797
conversation_voices = ConfigurationBuilder.build_conversation_voices()
798
```
799
800
### Configuration Templates
801
802
```api { .api }
803
class ConfigurationTemplates:
    """Pre-defined configuration templates for common use cases.

    Each template is a dict with a 'voice' (VoiceSelectionParams) and an
    'audio' (AudioConfig) entry, tuned for a specific delivery medium.
    """

    AUDIOBOOK = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Wavenet-A"
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=22050,
            speaking_rate=0.9,        # Slower pace for long-form listening
            volume_gain_db=2.0
        )
    }

    PODCAST = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-C"
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=44100,
            speaking_rate=1.0,
            effects_profile_id=["large-home-entertainment-class-device"]
        )
    }

    NEWS_BROADCAST = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-D",
            ssml_gender=SsmlVoiceGender.MALE
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.LINEAR16,
            sample_rate_hertz=24000,
            speaking_rate=1.1,        # Brisk newsreader pace
            pitch=-1.0
        )
    }

    EDUCATIONAL = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-A"
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=22050,
            speaking_rate=0.95,
            pitch=1.0
        )
    }

    TELEPHONY = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Standard-C"
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.MULAW,   # G.711 telephony standard
            sample_rate_hertz=8000,
            speaking_rate=1.2,
            effects_profile_id=["telephony-class-application"]
        )
    }

    @classmethod
    def get_template(cls, template_name: str) -> dict:
        """Get configuration template by name (case-insensitive).

        Unknown names fall back to the AUDIOBOOK template.
        """
        template_map = {
            'audiobook': cls.AUDIOBOOK,
            'podcast': cls.PODCAST,
            'news': cls.NEWS_BROADCAST,
            'educational': cls.EDUCATIONAL,
            'telephony': cls.TELEPHONY
        }

        return template_map.get(template_name.lower(), cls.AUDIOBOOK)

    @classmethod
    def create_request_from_template(cls, template_name: str, text: str) -> 'SynthesizeSpeechRequest':
        """Create a synthesis request for *text* from the named template."""
        template = cls.get_template(template_name)

        # BUG FIX: this previously called texttospeech.SynthesizeSpeechRequest,
        # but no `texttospeech` module is imported here — only individual class
        # names are. SynthesizeSpeechRequest must be imported from
        # google.cloud.texttospeech alongside the other classes.
        return SynthesizeSpeechRequest(
            input=SynthesisInput(text=text),
            voice=template['voice'],
            audio_config=template['audio']
        )
895
896
# Usage examples
897
audiobook_config = ConfigurationTemplates.get_template('audiobook')
898
podcast_request = ConfigurationTemplates.create_request_from_template(
899
'podcast',
900
"Welcome to our technology podcast!"
901
)
902
```
903
904
## Best Practices for Configuration
905
906
### Configuration Guidelines
907
908
```api { .api }
909
class ConfigurationBestPractices:
    """Best practices for Text-to-Speech configuration."""

    @staticmethod
    def recommend_sample_rate(audio_encoding: AudioEncoding, use_case: str) -> int:
        """Recommend optimal sample rate for encoding and use case.

        Args:
            audio_encoding: Target output format.
            use_case: One of 'high_quality', 'standard', 'streaming',
                'mobile', or 'telephony' (telephony applies only to
                MULAW/ALAW).

        Returns:
            Sample rate in Hz; 22050 for unknown combinations.
        """
        recommendations = {
            AudioEncoding.LINEAR16: {
                'high_quality': 48000,
                'standard': 24000,
                'streaming': 22050,
                'mobile': 16000
            },
            AudioEncoding.MP3: {
                'high_quality': 44100,
                'standard': 22050,
                'streaming': 22050,
                'mobile': 16000
            },
            AudioEncoding.OGG_OPUS: {
                'high_quality': 48000,
                'standard': 24000,
                'streaming': 24000,
                'mobile': 16000
            },
            # G.711 codecs (mu-law / A-law) are defined at 8 kHz only.
            AudioEncoding.MULAW: {
                'telephony': 8000
            },
            AudioEncoding.ALAW: {
                'telephony': 8000
            }
        }

        encoding_rec = recommendations.get(audio_encoding, {})
        return encoding_rec.get(use_case, 22050)  # Default fallback

    @staticmethod
    def optimize_for_latency(voice_config: VoiceSelectionParams,
                             audio_config: AudioConfig) -> tuple[VoiceSelectionParams, AudioConfig]:
        """Optimize configuration for minimal latency.

        Returns a new (voice, audio) pair; the inputs are not modified.
        """
        # Standard-tier voices synthesize faster than Neural2/Wavenet.
        # BUG FIX: the previous code used
        # language_code.replace('-', '-Standard-A'), which turned "en-US"
        # into the invalid name "en-Standard-AUS". The voice name is built
        # by appending the tier suffix to the language code.
        optimized_voice = VoiceSelectionParams(
            language_code=voice_config.language_code,
            name=f"{voice_config.language_code}-Standard-A",
            advanced_voice_options=AdvancedVoiceOptions(
                low_latency_journey_synthesis=True
            )
        )

        # Compressed format and a low sample rate cut bytes on the wire.
        optimized_audio = AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=16000,
            speaking_rate=1.1
        )

        return optimized_voice, optimized_audio

    @staticmethod
    def optimize_for_quality(voice_config: VoiceSelectionParams,
                             audio_config: AudioConfig) -> tuple[VoiceSelectionParams, AudioConfig]:
        """Optimize configuration for maximum quality.

        Returns a new (voice, audio) pair; the inputs are not modified.
        """
        # Keep an already-premium voice; otherwise default to Neural2.
        # getattr/`or ''` guards against a missing or None voice name.
        current_name = getattr(voice_config, 'name', '') or ''
        if 'Neural2' in current_name or 'Wavenet' in current_name:
            voice_name = current_name
        else:
            voice_name = voice_config.language_code + '-Neural2-A'  # Default to Neural2

        optimized_voice = VoiceSelectionParams(
            language_code=voice_config.language_code,
            name=voice_name
        )

        # Uncompressed format with high sample rate for best fidelity.
        optimized_audio = AudioConfig(
            audio_encoding=AudioEncoding.LINEAR16,
            sample_rate_hertz=48000,
            speaking_rate=0.95,   # Slightly slower for clarity
            volume_gain_db=1.0
        )

        return optimized_voice, optimized_audio
995
996
# Usage examples
997
# Optimize for latency
998
original_voice = VoiceSelectionParams(language_code="en-US")
999
original_audio = AudioConfig(audio_encoding=AudioEncoding.LINEAR16)
1000
1001
fast_voice, fast_audio = ConfigurationBestPractices.optimize_for_latency(
1002
original_voice, original_audio
1003
)
1004
1005
# Optimize for quality
1006
quality_voice, quality_audio = ConfigurationBestPractices.optimize_for_quality(
1007
original_voice, original_audio
1008
)
1009
1010
# Get recommended sample rate
1011
recommended_rate = ConfigurationBestPractices.recommend_sample_rate(
1012
AudioEncoding.MP3, 'streaming'
1013
)
1014
```