0
# Configuration Types
1
2
## Overview
3
4
The Google Cloud Text-to-Speech API provides extensive configuration options through various classes and types. These configuration objects control voice selection, audio output, input formatting, and advanced features like custom pronunciations and multi-speaker synthesis.
5
6
## Core Configuration Classes
7
8
### SynthesisInput
9
10
```api { .api }
11
from google.cloud.texttospeech import SynthesisInput, MultiSpeakerMarkup
12
13
# Plain text input
14
text_input = SynthesisInput(
15
text="Convert this plain text to speech"
16
)
17
18
# SSML input
19
ssml_input = SynthesisInput(
20
ssml='<speak>Convert this <emphasis level="strong">SSML</emphasis> to speech</speak>'
21
)
22
23
# Multi-speaker markup input
24
multi_speaker_input = SynthesisInput(
25
multi_speaker_markup=MultiSpeakerMarkup(
26
ssml='''
27
<speak>
28
<voice name="en-US-Neural2-A">Hello from speaker one.</voice>
29
<voice name="en-US-Neural2-C">And greetings from speaker two.</voice>
30
</speak>
31
'''
32
)
33
)
34
35
# SynthesisInput only accepts ONE of: text, ssml, or multi_speaker_markup
36
# Using multiple will raise an error
37
```
38
39
### VoiceSelectionParams
40
41
```api { .api }
42
from google.cloud.texttospeech import (
43
VoiceSelectionParams,
44
SsmlVoiceGender,
45
CustomPronunciations,
46
CustomPronunciationParams,
47
AdvancedVoiceOptions,
48
CustomVoiceParams,
49
VoiceCloneParams
50
)
51
52
# Basic voice selection
53
basic_voice = VoiceSelectionParams(
54
language_code="en-US", # Required: BCP-47 language code
55
ssml_gender=SsmlVoiceGender.FEMALE # Optional: voice gender preference
56
)
57
58
# Specific voice selection
59
specific_voice = VoiceSelectionParams(
60
language_code="en-US",
61
name="en-US-Wavenet-D" # Exact voice model name
62
)
63
64
# Voice with custom pronunciations
65
voice_with_pronunciations = VoiceSelectionParams(
66
language_code="en-US",
67
name="en-US-Neural2-A",
68
custom_pronunciations=CustomPronunciations(
69
pronunciations=[
70
CustomPronunciationParams(
71
phrase="GitHub",
72
ipa="ˈɡɪt hʌb",
73
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
74
),
75
CustomPronunciationParams(
76
phrase="API",
77
ipa="ˌeɪ piː ˈaɪ",
78
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
79
)
80
]
81
)
82
)
83
84
# Voice with advanced options
85
advanced_voice = VoiceSelectionParams(
86
language_code="en-US",
87
name="en-US-Neural2-C",
88
advanced_voice_options=AdvancedVoiceOptions(
89
low_latency_journey_synthesis=True # Enable low-latency processing
90
)
91
)
92
93
# Custom voice model
94
custom_voice = VoiceSelectionParams(
95
language_code="en-US",
96
custom_voice=CustomVoiceParams(
97
model="projects/your-project/locations/us-central1/models/custom-model"
98
)
99
)
100
101
# Voice cloning
102
cloned_voice = VoiceSelectionParams(
103
language_code="en-US",
104
voice_clone=VoiceCloneParams(
105
voice_clone_key="your-voice-clone-key"
106
)
107
)
108
```
109
110
### AudioConfig
111
112
```api { .api }
113
from google.cloud.texttospeech import AudioConfig, AudioEncoding
114
115
# Basic audio configuration
116
basic_audio = AudioConfig(
117
audio_encoding=AudioEncoding.MP3, # Required: output format
118
sample_rate_hertz=22050 # Optional: sample rate (Hz)
119
)
120
121
# Complete audio configuration
122
complete_audio = AudioConfig(
123
audio_encoding=AudioEncoding.LINEAR16, # Audio format
124
sample_rate_hertz=24000, # Sample rate
125
speaking_rate=1.0, # Speech rate (0.25-4.0)
126
pitch=0.0, # Pitch adjustment (-20.0 to 20.0)
127
volume_gain_db=0.0, # Volume gain (-96.0 to 16.0)
128
effects_profile_id=["large-home-entertainment-class-device"] # Audio effects
129
)
130
131
# High-quality audio configuration
132
high_quality_audio = AudioConfig(
133
audio_encoding=AudioEncoding.LINEAR16,
134
sample_rate_hertz=48000,
135
speaking_rate=0.95,
136
pitch=1.0,
137
volume_gain_db=2.0
138
)
139
140
# Compressed audio for streaming
141
streaming_audio = AudioConfig(
142
audio_encoding=AudioEncoding.OGG_OPUS,
143
sample_rate_hertz=48000,
144
speaking_rate=1.1,
145
effects_profile_id=["wearable-class-device"]
146
)
147
148
# Telephony optimized audio
149
telephony_audio = AudioConfig(
150
audio_encoding=AudioEncoding.MULAW,
151
sample_rate_hertz=8000,
152
speaking_rate=1.2,
153
effects_profile_id=["telephony-class-application"]
154
)
155
```
156
157
### Voice
158
159
```api { .api }
160
from google.cloud.texttospeech import Voice, SsmlVoiceGender
161
162
# Voice object (returned by list_voices())
163
# Contains voice information and capabilities
164
165
def analyze_voice_properties(voice: Voice):
    """Print the key attributes of a Voice object and classify its tier by name."""
    print(f"Name: {voice.name}")  # e.g., "en-US-Wavenet-A"
    print(f"Language Codes: {voice.language_codes}")  # e.g., ["en-US"]
    print(f"SSML Gender: {voice.ssml_gender}")  # SsmlVoiceGender enum
    print(f"Natural Sample Rate: {voice.natural_sample_rate_hertz} Hz")  # e.g., 24000

    # Map a substring of the voice name to a human-readable tier label.
    # Order matters: first match wins, mirroring the tier precedence.
    tiers = (
        ("Neural2", "Premium Neural Voice"),
        ("Wavenet", "High-Quality Neural Voice"),
        ("Standard", "Standard Voice"),
        ("Studio", "Studio Voice"),
    )
    label = next(
        (tier for marker, tier in tiers if marker in voice.name),
        "Custom or Special Voice",
    )
    print(f"Type: {label}")
184
185
# Example usage with actual Voice objects
186
# voices_response = client.list_voices()
187
# for voice in voices_response.voices:
188
# analyze_voice_properties(voice)
189
```
190
191
## Streaming Configuration Classes
192
193
### StreamingAudioConfig
194
195
```api { .api }
196
from google.cloud.texttospeech import StreamingAudioConfig, AudioEncoding
197
198
# Basic streaming audio configuration
199
streaming_basic = StreamingAudioConfig(
200
audio_encoding=AudioEncoding.LINEAR16, # Required: audio format
201
sample_rate_hertz=22050 # Required: sample rate
202
)
203
204
# Advanced streaming audio configuration
205
streaming_advanced = StreamingAudioConfig(
206
audio_encoding=AudioEncoding.OGG_OPUS, # Compressed format
207
sample_rate_hertz=48000, # High sample rate
208
speaking_rate=1.0, # Normal speech rate
209
pitch=0.0, # Neutral pitch
210
volume_gain_db=1.0, # Slight volume boost
211
effects_profile_id=["small-bluetooth-speaker-class-device"] # Audio effects
212
)
213
214
# Low-latency streaming configuration
215
streaming_low_latency = StreamingAudioConfig(
216
audio_encoding=AudioEncoding.LINEAR16,
217
sample_rate_hertz=16000, # Lower rate for speed
218
speaking_rate=1.1 # Slightly faster
219
)
220
221
# High-quality streaming configuration
222
streaming_high_quality = StreamingAudioConfig(
223
audio_encoding=AudioEncoding.LINEAR16,
224
sample_rate_hertz=48000,
225
speaking_rate=0.9, # Slightly slower
226
pitch=-0.5, # Lower pitch
227
volume_gain_db=2.0 # Volume boost
228
)
229
```
230
231
### StreamingSynthesizeConfig
232
233
```api { .api }
234
from google.cloud.texttospeech import (
235
StreamingSynthesizeConfig,
236
VoiceSelectionParams,
237
StreamingAudioConfig
238
)
239
240
# Complete streaming synthesis configuration
241
streaming_config = StreamingSynthesizeConfig(
242
voice=VoiceSelectionParams(
243
language_code="en-US",
244
name="en-US-Neural2-A",
245
ssml_gender=SsmlVoiceGender.FEMALE
246
),
247
audio_config=StreamingAudioConfig(
248
audio_encoding=AudioEncoding.LINEAR16,
249
sample_rate_hertz=22050,
250
speaking_rate=1.0,
251
pitch=0.0,
252
volume_gain_db=0.0
253
)
254
)
255
256
# Low-latency streaming configuration
257
low_latency_streaming = StreamingSynthesizeConfig(
258
voice=VoiceSelectionParams(
259
language_code="en-US",
260
name="en-US-Standard-B", # Standard voice for speed
261
advanced_voice_options=AdvancedVoiceOptions(
262
low_latency_journey_synthesis=True
263
)
264
),
265
audio_config=StreamingAudioConfig(
266
audio_encoding=AudioEncoding.LINEAR16,
267
sample_rate_hertz=16000 # Lower sample rate
268
)
269
)
270
271
# Multi-language streaming configuration
272
multilang_streaming = StreamingSynthesizeConfig(
273
voice=VoiceSelectionParams(
274
language_code="en-US",
275
name="en-US-Polyglot-1" # Polyglot voice if available
276
),
277
audio_config=StreamingAudioConfig(
278
audio_encoding=AudioEncoding.MP3,
279
sample_rate_hertz=24000
280
)
281
)
282
```
283
284
### StreamingSynthesisInput
285
286
```api { .api }
287
from google.cloud.texttospeech import StreamingSynthesisInput
288
289
# Text input for streaming
290
text_stream_input = StreamingSynthesisInput(
291
text="This text will be streamed to the synthesis service."
292
)
293
294
# SSML input for streaming
295
ssml_stream_input = StreamingSynthesisInput(
296
ssml='<speak>This <emphasis level="moderate">SSML content</emphasis> will be streamed.</speak>'
297
)
298
299
# Note: StreamingSynthesisInput accepts either text OR ssml, not both
300
# Each streaming request should contain one input chunk
301
```
302
303
## Advanced Configuration Classes
304
305
### AdvancedVoiceOptions
306
307
```api { .api }
308
from google.cloud.texttospeech import AdvancedVoiceOptions
309
310
# Advanced voice configuration
311
advanced_options = AdvancedVoiceOptions(
312
low_latency_journey_synthesis=True # Enable low-latency processing
313
)
314
315
# Usage in voice selection
316
voice_with_advanced = VoiceSelectionParams(
317
language_code="en-US",
318
name="en-US-Neural2-A",
319
advanced_voice_options=advanced_options
320
)
321
322
# Direct configuration
323
direct_advanced_voice = VoiceSelectionParams(
324
language_code="en-US",
325
name="en-US-Neural2-C",
326
advanced_voice_options=AdvancedVoiceOptions(
327
low_latency_journey_synthesis=True
328
)
329
)
330
```
331
332
### CustomPronunciations and CustomPronunciationParams
333
334
```api { .api }
335
from google.cloud.texttospeech import (
336
CustomPronunciations,
337
CustomPronunciationParams
338
)
339
340
# Individual pronunciation parameter
341
pronunciation_param = CustomPronunciationParams(
342
phrase="PyTorch", # Word or phrase to customize
343
ipa="ˈpaɪ tɔrʧ", # IPA pronunciation
344
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA # Encoding type
345
)
346
347
# X-SAMPA encoding example
348
xsampa_param = CustomPronunciationParams(
349
phrase="neural",
350
ipa="n\"jU@r@l", # X-SAMPA notation
351
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.X_SAMPA
352
)
353
354
# Collection of custom pronunciations
355
custom_pronunciations = CustomPronunciations(
356
pronunciations=[
357
CustomPronunciationParams(
358
phrase="TensorFlow",
359
ipa="ˈtɛnsər floʊ",
360
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
361
),
362
CustomPronunciationParams(
363
phrase="Kubernetes",
364
ipa="ˌkubərˈnɛtɪs",
365
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
366
),
367
CustomPronunciationParams(
368
phrase="OAuth",
369
ipa="ˈoʊ ɔːθ",
370
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
371
),
372
CustomPronunciationParams(
373
phrase="JSON",
374
ipa="ˈdʒeɪ sɒn",
375
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
376
)
377
]
378
)
379
380
# Technical terms pronunciations
381
tech_pronunciations = CustomPronunciations(
382
pronunciations=[
383
CustomPronunciationParams(
384
phrase="API", ipa="ˌeɪ piː ˈaɪ",
385
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
386
),
387
CustomPronunciationParams(
388
phrase="HTTP", ipa="ˌeɪʧ tiː tiː ˈpiː",
389
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
390
),
391
CustomPronunciationParams(
392
phrase="URL", ipa="ˌjuː ɑːr ˈɛl",
393
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
394
),
395
CustomPronunciationParams(
396
phrase="SQL", ipa="ˈsiː kwəl",
397
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
398
)
399
]
400
)
401
```
402
403
### MultiSpeakerMarkup
404
405
```api { .api }
406
from google.cloud.texttospeech import MultiSpeakerMarkup
407
408
# Basic multi-speaker configuration
409
multi_speaker = MultiSpeakerMarkup(
410
ssml='''
411
<speak>
412
<voice name="en-US-Neural2-A">
413
Hello, I'm the first speaker in this conversation.
414
</voice>
415
<voice name="en-US-Neural2-C">
416
And I'm the second speaker responding to you.
417
</voice>
418
</speak>
419
'''
420
)
421
422
# Complex multi-speaker conversation
423
conversation_markup = MultiSpeakerMarkup(
424
ssml='''
425
<speak>
426
<voice name="en-US-Neural2-A">
427
<prosody rate="medium" pitch="normal">
428
Welcome to our technical presentation.
429
</prosody>
430
</voice>
431
432
<break time="1s"/>
433
434
<voice name="en-US-Neural2-C">
435
<prosody rate="slow" pitch="+2st">
436
Today we'll discuss advanced AI concepts.
437
</prosody>
438
</voice>
439
440
<break time="2s"/>
441
442
<voice name="en-US-Wavenet-D">
443
<prosody rate="fast" pitch="-1st">
444
Let's start with the technical implementation details.
445
</prosody>
446
</voice>
447
</speak>
448
'''
449
)
450
451
# Dialogue with emotions and pacing
452
dialogue_markup = MultiSpeakerMarkup(
453
ssml='''
454
<speak>
455
<voice name="en-US-Neural2-A">
456
<prosody rate="medium" pitch="normal" volume="loud">
457
I have exciting news to share!
458
</prosody>
459
</voice>
460
461
<voice name="en-US-Neural2-C">
462
<prosody rate="slow" pitch="low" volume="soft">
463
Please, tell me more about it.
464
</prosody>
465
</voice>
466
467
<voice name="en-US-Neural2-A">
468
<prosody rate="fast" pitch="high" volume="loud">
469
We've achieved a breakthrough in our research!
470
</prosody>
471
</voice>
472
</speak>
473
'''
474
)
475
```
476
477
### CustomVoiceParams
478
479
```api { .api }
480
from google.cloud.texttospeech import CustomVoiceParams
481
482
# Custom voice model configuration
483
custom_voice_params = CustomVoiceParams(
484
model="projects/your-project-id/locations/us-central1/models/your-custom-voice-model"
485
)
486
487
# Usage with voice selection
488
voice_with_custom_model = VoiceSelectionParams(
489
language_code="en-US",
490
custom_voice=custom_voice_params
491
)
492
493
# Complete custom voice configuration
494
complete_custom_voice = VoiceSelectionParams(
495
language_code="en-US",
496
custom_voice=CustomVoiceParams(
497
model="projects/your-project-id/locations/us-central1/models/custom-narrator-voice"
498
),
499
custom_pronunciations=CustomPronunciations(
500
pronunciations=[
501
CustomPronunciationParams(
502
phrase="company_name",
503
ipa="ˈkʌmpəni neɪm",
504
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
505
)
506
]
507
)
508
)
509
```
510
511
### VoiceCloneParams
512
513
```api { .api }
514
from google.cloud.texttospeech import VoiceCloneParams
515
516
# Voice cloning configuration
517
voice_clone_params = VoiceCloneParams(
518
voice_clone_key="your-voice-clone-key-from-console"
519
)
520
521
# Usage with voice selection
522
cloned_voice_selection = VoiceSelectionParams(
523
language_code="en-US",
524
voice_clone=voice_clone_params
525
)
526
527
# Complete cloned voice setup
528
complete_cloned_voice = VoiceSelectionParams(
529
language_code="en-US",
530
voice_clone=VoiceCloneParams(
531
voice_clone_key="abcd-1234-efgh-5678"
532
),
533
advanced_voice_options=AdvancedVoiceOptions(
534
low_latency_journey_synthesis=True
535
)
536
)
537
```
538
539
## Enums and Constants
540
541
### AudioEncoding
542
543
```api { .api }
544
from google.cloud.texttospeech import AudioEncoding
545
546
# Available audio encoding formats
547
LINEAR16 = AudioEncoding.LINEAR16 # 16-bit PCM with WAV header (lossless)
548
MP3 = AudioEncoding.MP3 # MP3 at 32kbps (compressed)
549
OGG_OPUS = AudioEncoding.OGG_OPUS # Opus in Ogg container (compressed)
550
MULAW = AudioEncoding.MULAW # 8-bit G.711 PCMU/mu-law (telephony)
551
ALAW = AudioEncoding.ALAW # 8-bit G.711 PCMA/A-law (telephony)
552
PCM = AudioEncoding.PCM # 16-bit PCM without header (raw)
553
M4A = AudioEncoding.M4A # M4A format (compressed)
554
UNSPECIFIED = AudioEncoding.AUDIO_ENCODING_UNSPECIFIED # Not specified
555
556
# Usage in audio configuration
557
high_quality_config = AudioConfig(
558
audio_encoding=AudioEncoding.LINEAR16, # Best quality
559
sample_rate_hertz=48000
560
)
561
562
compressed_config = AudioConfig(
563
audio_encoding=AudioEncoding.MP3, # Good compression
564
sample_rate_hertz=22050
565
)
566
567
telephony_config = AudioConfig(
568
audio_encoding=AudioEncoding.MULAW, # Telephony standard
569
sample_rate_hertz=8000
570
)
571
```
572
573
### SsmlVoiceGender
574
575
```api { .api }
576
from google.cloud.texttospeech import SsmlVoiceGender
577
578
# Available gender options
579
MALE = SsmlVoiceGender.MALE # Male voice
580
FEMALE = SsmlVoiceGender.FEMALE # Female voice
581
NEUTRAL = SsmlVoiceGender.NEUTRAL # Gender-neutral voice
582
UNSPECIFIED = SsmlVoiceGender.SSML_VOICE_GENDER_UNSPECIFIED # No preference
583
584
# Usage in voice selection
585
male_voice = VoiceSelectionParams(
586
language_code="en-US",
587
ssml_gender=SsmlVoiceGender.MALE
588
)
589
590
female_voice = VoiceSelectionParams(
591
language_code="en-US",
592
ssml_gender=SsmlVoiceGender.FEMALE
593
)
594
595
neutral_voice = VoiceSelectionParams(
596
language_code="en-US",
597
ssml_gender=SsmlVoiceGender.NEUTRAL
598
)
599
```
600
601
### PhoneticEncoding
602
603
```api { .api }
604
from google.cloud.texttospeech import CustomPronunciationParams
605
606
# Available phonetic encoding options
607
IPA = CustomPronunciationParams.PhoneticEncoding.IPA # International Phonetic Alphabet
608
X_SAMPA = CustomPronunciationParams.PhoneticEncoding.X_SAMPA # X-SAMPA notation
609
UNSPECIFIED = CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_UNSPECIFIED
610
611
# Usage in pronunciation parameters
612
ipa_pronunciation = CustomPronunciationParams(
613
phrase="example",
614
ipa="ɪɡˈzæmpəl",
615
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
616
)
617
618
xsampa_pronunciation = CustomPronunciationParams(
619
phrase="example",
620
ipa="Ig\"z{mp@l",
621
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.X_SAMPA
622
)
623
```
624
625
## Configuration Validation and Helpers
626
627
### Configuration Validation
628
629
```api { .api }
630
def validate_audio_config(audio_config: AudioConfig) -> tuple[bool, list[str]]:
    """Validate audio configuration parameters.

    Returns:
        (is_valid, errors): ``is_valid`` is True when no problems were
        found; ``errors`` holds a human-readable message per violation.
    """
    errors: list[str] = []

    # audio_encoding is the only mandatory field.
    if not getattr(audio_config, 'audio_encoding', None):
        errors.append("audio_encoding is required")

    # Sample rate must be one of the rates the API accepts.
    sample_rate = getattr(audio_config, 'sample_rate_hertz', None)
    if sample_rate:
        valid_rates = [8000, 16000, 22050, 24000, 32000, 44100, 48000]
        if sample_rate not in valid_rates:
            errors.append(f"sample_rate_hertz must be one of {valid_rates}, got {sample_rate}")

    # Range checks for the optional prosody fields. Zero/unset values are
    # skipped, matching the proto3 "unset means server default" convention.
    rate = getattr(audio_config, 'speaking_rate', None)
    if rate and not (0.25 <= rate <= 4.0):
        errors.append(f"speaking_rate must be between 0.25 and 4.0, got {rate}")

    pitch = getattr(audio_config, 'pitch', None)
    if pitch and not (-20.0 <= pitch <= 20.0):
        errors.append(f"pitch must be between -20.0 and 20.0, got {pitch}")

    volume = getattr(audio_config, 'volume_gain_db', None)
    if volume and not (-96.0 <= volume <= 16.0):
        errors.append(f"volume_gain_db must be between -96.0 and 16.0, got {volume}")

    return not errors, errors
664
665
def validate_voice_selection(voice: VoiceSelectionParams) -> tuple[bool, list[str]]:
    """Validate voice selection parameters.

    Returns:
        (is_valid, errors): ``is_valid`` is True when no problems were
        found; ``errors`` holds a human-readable message per violation.
    """
    errors: list[str] = []

    lang_code = getattr(voice, 'language_code', None)
    if not lang_code:
        errors.append("language_code is required")
    elif lang_code.count('-') < 1 or len(lang_code) < 2:
        # Loose sanity check for BCP-47 form such as "en-US"; not a full parser.
        errors.append(f"language_code should be in BCP-47 format (e.g., 'en-US'), got '{lang_code}'")

    # A request may name at most one concrete voice source.
    voice_sources = ('name', 'custom_voice', 'voice_clone')
    specified_count = sum(1 for attr in voice_sources if getattr(voice, attr, None))
    if specified_count > 1:
        errors.append("Only one of 'name', 'custom_voice', or 'voice_clone' should be specified")

    return not errors, errors
689
690
# Usage examples
691
audio_config = AudioConfig(
692
audio_encoding=AudioEncoding.MP3,
693
sample_rate_hertz=22050,
694
speaking_rate=1.5,
695
pitch=2.0
696
)
697
698
is_valid, validation_errors = validate_audio_config(audio_config)
699
if not is_valid:
700
print(f"Audio config validation errors: {validation_errors}")
701
```
702
703
### Configuration Builders
704
705
```api { .api }
706
class ConfigurationBuilder:
    """Helper class for building complex configurations."""

    @staticmethod
    def build_high_quality_config() -> AudioConfig:
        """Build high-quality audio configuration."""
        # Lossless format at the highest supported rate; slightly slower
        # speech and a mild volume boost for clarity.
        return AudioConfig(
            audio_encoding=AudioEncoding.LINEAR16,
            sample_rate_hertz=48000,
            speaking_rate=0.95,
            pitch=0.0,
            volume_gain_db=1.0,
        )

    @staticmethod
    def build_streaming_config() -> AudioConfig:
        """Build streaming-optimized audio configuration."""
        # Opus compresses well for network delivery at a moderate rate.
        return AudioConfig(
            audio_encoding=AudioEncoding.OGG_OPUS,
            sample_rate_hertz=24000,
            speaking_rate=1.1,
            volume_gain_db=0.0,
        )

    @staticmethod
    def build_mobile_config() -> AudioConfig:
        """Build mobile-optimized audio configuration."""
        return AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=16000,
            speaking_rate=1.2,
            effects_profile_id=["handset-class-device"],
        )

    @staticmethod
    def build_tech_voice_with_pronunciations(language_code: str = "en-US") -> VoiceSelectionParams:
        """Build voice configuration optimized for technical content."""
        # (phrase, IPA) pairs for acronyms TTS engines often mispronounce.
        term_ipa = [
            ("API", "ˌeɪ piː ˈaɪ"),
            ("JSON", "ˈdʒeɪ sɒn"),
            ("HTTP", "ˌeɪʧ tiː tiː ˈpiː"),
            ("SQL", "ˈsiː kwəl"),
        ]
        tech_pronunciations = CustomPronunciations(
            pronunciations=[
                CustomPronunciationParams(
                    phrase=phrase,
                    ipa=ipa,
                    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA,
                )
                for phrase, ipa in term_ipa
            ]
        )

        return VoiceSelectionParams(
            language_code=language_code,
            name=f"{language_code}-Neural2-A",
            custom_pronunciations=tech_pronunciations,
        )

    @staticmethod
    def build_conversation_voices() -> list[VoiceSelectionParams]:
        """Build multiple voices for conversation synthesis."""
        # (voice name, gender) for three distinct conversation participants.
        speakers = [
            ("en-US-Neural2-A", SsmlVoiceGender.FEMALE),   # Female voice
            ("en-US-Neural2-C", SsmlVoiceGender.MALE),     # Male voice
            ("en-US-Neural2-F", SsmlVoiceGender.NEUTRAL),  # Neutral voice
        ]
        return [
            VoiceSelectionParams(
                language_code="en-US",
                name=voice_name,
                ssml_gender=gender,
            )
            for voice_name, gender in speakers
        ]
791
792
# Usage examples
793
high_quality_audio = ConfigurationBuilder.build_high_quality_config()
794
streaming_audio = ConfigurationBuilder.build_streaming_config()
795
mobile_audio = ConfigurationBuilder.build_mobile_config()
796
tech_voice = ConfigurationBuilder.build_tech_voice_with_pronunciations("en-US")
797
conversation_voices = ConfigurationBuilder.build_conversation_voices()
798
```
799
800
### Configuration Templates
801
802
```api { .api }
803
class ConfigurationTemplates:
    """Pre-defined configuration templates for common use cases.

    Each template is a dict with a 'voice' (VoiceSelectionParams) and an
    'audio' (AudioConfig) entry, tuned for a specific delivery medium.
    """

    AUDIOBOOK = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Wavenet-A"
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=22050,
            speaking_rate=0.9,        # Slower pace for long-form listening
            volume_gain_db=2.0
        )
    }

    PODCAST = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-C"
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=44100,
            speaking_rate=1.0,
            effects_profile_id=["large-home-entertainment-class-device"]
        )
    }

    NEWS_BROADCAST = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-D",
            ssml_gender=SsmlVoiceGender.MALE
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.LINEAR16,
            sample_rate_hertz=24000,
            speaking_rate=1.1,        # Brisk newsreader pace
            pitch=-1.0
        )
    }

    EDUCATIONAL = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-A"
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=22050,
            speaking_rate=0.95,
            pitch=1.0
        )
    }

    TELEPHONY = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Standard-C"
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.MULAW,   # G.711 telephony standard
            sample_rate_hertz=8000,
            speaking_rate=1.2,
            effects_profile_id=["telephony-class-application"]
        )
    }

    @classmethod
    def get_template(cls, template_name: str) -> dict:
        """Get configuration template by name (case-insensitive).

        Unknown names fall back to the AUDIOBOOK template.
        """
        template_map = {
            'audiobook': cls.AUDIOBOOK,
            'podcast': cls.PODCAST,
            'news': cls.NEWS_BROADCAST,
            'educational': cls.EDUCATIONAL,
            'telephony': cls.TELEPHONY
        }

        return template_map.get(template_name.lower(), cls.AUDIOBOOK)

    @classmethod
    def create_request_from_template(cls, template_name: str, text: str) -> 'SynthesizeSpeechRequest':
        """Create a synthesis request for *text* from the named template."""
        template = cls.get_template(template_name)

        # BUG FIX: this previously called texttospeech.SynthesizeSpeechRequest,
        # but no `texttospeech` module is imported here — only individual class
        # names are. SynthesizeSpeechRequest must be imported from
        # google.cloud.texttospeech alongside the other classes.
        return SynthesizeSpeechRequest(
            input=SynthesisInput(text=text),
            voice=template['voice'],
            audio_config=template['audio']
        )
895
896
# Usage examples
897
audiobook_config = ConfigurationTemplates.get_template('audiobook')
898
podcast_request = ConfigurationTemplates.create_request_from_template(
899
'podcast',
900
"Welcome to our technology podcast!"
901
)
902
```
903
904
## Best Practices for Configuration
905
906
### Configuration Guidelines
907
908
```api { .api }
909
class ConfigurationBestPractices:
    """Best practices for Text-to-Speech configuration."""

    @staticmethod
    def recommend_sample_rate(audio_encoding: AudioEncoding, use_case: str) -> int:
        """Recommend optimal sample rate for encoding and use case.

        Args:
            audio_encoding: Target output format.
            use_case: One of 'high_quality', 'standard', 'streaming',
                'mobile', or 'telephony' (telephony applies only to
                MULAW/ALAW).

        Returns:
            Sample rate in Hz; 22050 for unknown combinations.
        """
        recommendations = {
            AudioEncoding.LINEAR16: {
                'high_quality': 48000,
                'standard': 24000,
                'streaming': 22050,
                'mobile': 16000
            },
            AudioEncoding.MP3: {
                'high_quality': 44100,
                'standard': 22050,
                'streaming': 22050,
                'mobile': 16000
            },
            AudioEncoding.OGG_OPUS: {
                'high_quality': 48000,
                'standard': 24000,
                'streaming': 24000,
                'mobile': 16000
            },
            # G.711 codecs (mu-law / A-law) are defined at 8 kHz only.
            AudioEncoding.MULAW: {
                'telephony': 8000
            },
            AudioEncoding.ALAW: {
                'telephony': 8000
            }
        }

        encoding_rec = recommendations.get(audio_encoding, {})
        return encoding_rec.get(use_case, 22050)  # Default fallback

    @staticmethod
    def optimize_for_latency(voice_config: VoiceSelectionParams,
                             audio_config: AudioConfig) -> tuple[VoiceSelectionParams, AudioConfig]:
        """Optimize configuration for minimal latency.

        Returns a new (voice, audio) pair; the inputs are not modified.
        """
        # Standard-tier voices synthesize faster than Neural2/Wavenet.
        # BUG FIX: the previous code used
        # language_code.replace('-', '-Standard-A'), which turned "en-US"
        # into the invalid name "en-Standard-AUS". The voice name is built
        # by appending the tier suffix to the language code.
        optimized_voice = VoiceSelectionParams(
            language_code=voice_config.language_code,
            name=f"{voice_config.language_code}-Standard-A",
            advanced_voice_options=AdvancedVoiceOptions(
                low_latency_journey_synthesis=True
            )
        )

        # Compressed format and a low sample rate cut bytes on the wire.
        optimized_audio = AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=16000,
            speaking_rate=1.1
        )

        return optimized_voice, optimized_audio

    @staticmethod
    def optimize_for_quality(voice_config: VoiceSelectionParams,
                             audio_config: AudioConfig) -> tuple[VoiceSelectionParams, AudioConfig]:
        """Optimize configuration for maximum quality.

        Returns a new (voice, audio) pair; the inputs are not modified.
        """
        # Keep an already-premium voice; otherwise default to Neural2.
        # getattr/`or ''` guards against a missing or None voice name.
        current_name = getattr(voice_config, 'name', '') or ''
        if 'Neural2' in current_name or 'Wavenet' in current_name:
            voice_name = current_name
        else:
            voice_name = voice_config.language_code + '-Neural2-A'  # Default to Neural2

        optimized_voice = VoiceSelectionParams(
            language_code=voice_config.language_code,
            name=voice_name
        )

        # Uncompressed format with high sample rate for best fidelity.
        optimized_audio = AudioConfig(
            audio_encoding=AudioEncoding.LINEAR16,
            sample_rate_hertz=48000,
            speaking_rate=0.95,   # Slightly slower for clarity
            volume_gain_db=1.0
        )

        return optimized_voice, optimized_audio
995
996
# Usage examples
997
# Optimize for latency
998
original_voice = VoiceSelectionParams(language_code="en-US")
999
original_audio = AudioConfig(audio_encoding=AudioEncoding.LINEAR16)
1000
1001
fast_voice, fast_audio = ConfigurationBestPractices.optimize_for_latency(
1002
original_voice, original_audio
1003
)
1004
1005
# Optimize for quality
1006
quality_voice, quality_audio = ConfigurationBestPractices.optimize_for_quality(
1007
original_voice, original_audio
1008
)
1009
1010
# Get recommended sample rate
1011
recommended_rate = ConfigurationBestPractices.recommend_sample_rate(
1012
AudioEncoding.MP3, 'streaming'
1013
)
1014
```