0
# Speech Synthesis
1
2
## Overview
3
4
Speech synthesis is the core functionality of the Google Cloud Text-to-Speech API, converting text input into natural-sounding speech audio. The API supports both plain text and SSML (Speech Synthesis Markup Language) input with extensive configuration options for voice selection and audio output.
5
6
## Core Synthesis Operations
7
8
### Basic Text Synthesis
9
10
```api { .api }
11
from google.cloud import texttospeech
12
13
# Initialize client
14
client = texttospeech.TextToSpeechClient()
15
16
# Create synthesis request
17
request = texttospeech.SynthesizeSpeechRequest(
18
input=texttospeech.SynthesisInput(text="Hello, this is a text-to-speech demo"),
19
voice=texttospeech.VoiceSelectionParams(
20
language_code="en-US",
21
ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
22
),
23
audio_config=texttospeech.AudioConfig(
24
audio_encoding=texttospeech.AudioEncoding.MP3
25
)
26
)
27
28
# Perform synthesis
29
response = client.synthesize_speech(request=request)
30
31
# Access audio data
32
audio_content = response.audio_content # bytes
33
```
34
35
### SSML Synthesis
36
37
```api { .api }
38
from google.cloud import texttospeech
39
40
# SSML input with markup
41
ssml_text = """
42
<speak>
43
<prosody rate="slow" pitch="+2st">
44
Hello, this is spoken slowly with higher pitch.
45
</prosody>
46
<break time="1s"/>
47
<prosody rate="fast" pitch="-2st">
48
And this is spoken quickly with lower pitch.
49
</prosody>
50
</speak>
51
"""
52
53
request = texttospeech.SynthesizeSpeechRequest(
54
input=texttospeech.SynthesisInput(ssml=ssml_text),
55
voice=texttospeech.VoiceSelectionParams(
56
language_code="en-US",
57
name="en-US-Wavenet-D" # Specific voice model
58
),
59
audio_config=texttospeech.AudioConfig(
60
audio_encoding=texttospeech.AudioEncoding.LINEAR16,
61
sample_rate_hertz=24000
62
)
63
)
64
65
response = client.synthesize_speech(request=request)
66
```
67
68
## Input Configuration
69
70
### SynthesisInput Class
71
72
```api { .api }
73
from google.cloud.texttospeech import SynthesisInput
74
75
# Plain text input
76
text_input = SynthesisInput(text="Plain text to synthesize")
77
78
# SSML input
79
ssml_input = SynthesisInput(
80
ssml='<speak>SSML <emphasis level="strong">markup</emphasis> text</speak>'
81
)
82
83
# Multi-speaker input (conversation turns)
multi_speaker_input = SynthesisInput(
    multi_speaker_markup=texttospeech.MultiSpeakerMarkup(
        turns=[
            texttospeech.MultiSpeakerMarkup.Turn(speaker="R", text="Hello"),
            texttospeech.MultiSpeakerMarkup.Turn(speaker="S", text="World"),
        ]
    )
)
89
```
90
91
### Advanced Input Options
92
93
```api { .api }
94
# Custom pronunciations with synthesis input
95
from google.cloud.texttospeech import (
96
SynthesisInput,
97
CustomPronunciations,
98
CustomPronunciationParams
99
)
100
101
# Define custom pronunciations
102
custom_pronunciations = CustomPronunciations(
103
pronunciations=[
104
CustomPronunciationParams(
105
phrase="Anthropic",
106
ipa="ˌænθrəˈpɪk",
107
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
108
),
109
CustomPronunciationParams(
110
phrase="Claude",
111
ipa="klɔːd",
112
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
113
)
114
]
115
)
116
117
# Use with synthesis
118
request = texttospeech.SynthesizeSpeechRequest(
119
input=SynthesisInput(text="Hello from Anthropic's Claude AI assistant"),
120
voice=texttospeech.VoiceSelectionParams(
121
language_code="en-US",
122
custom_pronunciations=custom_pronunciations
123
),
124
audio_config=texttospeech.AudioConfig(
125
audio_encoding=texttospeech.AudioEncoding.MP3
126
)
127
)
128
```
129
130
## Voice Selection
131
132
### VoiceSelectionParams Class
133
134
```api { .api }
135
from google.cloud.texttospeech import VoiceSelectionParams, SsmlVoiceGender
136
137
# Basic voice selection
138
voice = VoiceSelectionParams(
139
language_code="en-US", # Required: BCP-47 language code
140
ssml_gender=SsmlVoiceGender.MALE # Optional: voice gender
141
)
142
143
# Specific voice model selection
144
voice = VoiceSelectionParams(
145
language_code="en-US",
146
name="en-US-Wavenet-A" # Specific voice name
147
)
148
149
# Custom voice model
150
voice = VoiceSelectionParams(
151
language_code="en-US",
152
custom_voice=texttospeech.CustomVoiceParams(
153
model="projects/your-project/locations/us-central1/models/your-model"
154
)
155
)
156
```
157
158
### Advanced Voice Configuration
159
160
```api { .api }
161
from google.cloud.texttospeech import (
162
VoiceSelectionParams,
163
AdvancedVoiceOptions,
164
VoiceCloneParams
165
)
166
167
# Advanced voice options
168
voice = VoiceSelectionParams(
169
language_code="en-US",
170
name="en-US-Wavenet-A",
171
advanced_voice_options=AdvancedVoiceOptions(
172
low_latency_journey_synthesis=True
173
)
174
)
175
176
# Voice cloning parameters
177
voice = VoiceSelectionParams(
178
language_code="en-US",
179
voice_clone=VoiceCloneParams(
180
voice_clone_key="your-voice-clone-key"
181
)
182
)
183
```
184
185
## Audio Configuration
186
187
### AudioConfig Class
188
189
```api { .api }
190
from google.cloud.texttospeech import AudioConfig, AudioEncoding
191
192
# Basic audio configuration
193
audio_config = AudioConfig(
194
audio_encoding=AudioEncoding.MP3, # Required: output format
195
sample_rate_hertz=22050, # Optional: sample rate
196
speaking_rate=1.0, # Optional: speech rate (0.25-4.0)
197
pitch=0.0, # Optional: pitch (-20.0 to 20.0)
198
volume_gain_db=0.0 # Optional: volume gain (-96.0 to 16.0)
199
)
200
201
# High-quality linear PCM
202
audio_config = AudioConfig(
203
audio_encoding=AudioEncoding.LINEAR16,
204
sample_rate_hertz=48000,
205
speaking_rate=0.9,
206
pitch=2.0
207
)
208
209
# OGG Opus for streaming
210
audio_config = AudioConfig(
211
audio_encoding=AudioEncoding.OGG_OPUS,
212
sample_rate_hertz=48000
213
)
214
```
215
216
### Audio Effects and Processing
217
218
```api { .api }
219
from google.cloud.texttospeech import AudioConfig, AudioEncoding
220
221
# Audio with effects profile
222
audio_config = AudioConfig(
223
audio_encoding=AudioEncoding.MP3,
224
effects_profile_id=["telephony-class-application"], # Audio effects
225
speaking_rate=1.2,
226
pitch=-2.0,
227
volume_gain_db=3.0
228
)
229
230
# Multiple effects profiles
231
audio_config = AudioConfig(
232
audio_encoding=AudioEncoding.LINEAR16,
233
effects_profile_id=[
234
"wearable-class-device",
235
"handset-class-device"
236
],
237
sample_rate_hertz=16000
238
)
239
```
240
241
## Request and Response Types
242
243
### SynthesizeSpeechRequest Class
244
245
```api { .api }
246
from google.cloud import texttospeech
from google.cloud.texttospeech import (
    SynthesizeSpeechRequest,
    SynthesisInput,
    VoiceSelectionParams,
    AudioConfig,
    AudioEncoding
)
252
253
# Complete request configuration
254
request = SynthesizeSpeechRequest(
255
input=SynthesisInput(text="Text to synthesize"),
256
voice=VoiceSelectionParams(
257
language_code="en-US",
258
ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
259
),
260
audio_config=AudioConfig(
261
audio_encoding=texttospeech.AudioEncoding.LINEAR16,
262
sample_rate_hertz=22050
263
)
264
)
265
266
# Request with advanced features
267
request = SynthesizeSpeechRequest(
268
input=SynthesisInput(
269
ssml='<speak>Hello <mark name="greeting"/>world!</speak>'
270
),
271
voice=VoiceSelectionParams(
272
language_code="en-US",
273
name="en-US-Neural2-A"
274
),
275
audio_config=AudioConfig(
276
audio_encoding=AudioEncoding.MP3,
277
effects_profile_id=["small-bluetooth-speaker-class-device"]
278
)
279
)
280
```
281
282
### SynthesizeSpeechResponse Class
283
284
```api { .api }
285
from google.cloud.texttospeech import SynthesizeSpeechResponse
286
287
# Standard response
288
response = client.synthesize_speech(request=request)
289
290
# Access response data
291
audio_content = response.audio_content # bytes: synthesized audio data
292
293
# Response provides audio as bytes
294
with open("output.mp3", "wb") as audio_file:
295
audio_file.write(response.audio_content)
296
297
# Get audio length and properties
298
audio_size = len(response.audio_content)
299
print(f"Generated {audio_size} bytes of audio")
300
```
301
302
## Multi-Speaker Synthesis
303
304
### MultiSpeakerMarkup Configuration
305
306
```api { .api }
307
from google.cloud.texttospeech import (
308
SynthesisInput,
309
MultiSpeakerMarkup,
310
VoiceSelectionParams
311
)
312
313
# Multi-speaker dialogue expressed as ordered turns.
# MultiSpeakerMarkup takes a list of Turn messages, each with a
# speaker label and plain text — not raw SSML.
conversation_turns = [
    MultiSpeakerMarkup.Turn(speaker="R", text="Hello, I'm the first speaker."),
    MultiSpeakerMarkup.Turn(speaker="S", text="And I'm the second speaker."),
    MultiSpeakerMarkup.Turn(speaker="R", text="Together we create a conversation."),
]

# Configure multi-speaker input
multi_speaker_input = SynthesisInput(
    multi_speaker_markup=MultiSpeakerMarkup(
        turns=conversation_turns
    )
)
334
335
# Create synthesis request
336
request = texttospeech.SynthesizeSpeechRequest(
337
input=multi_speaker_input,
338
voice=VoiceSelectionParams(
339
language_code="en-US" # Base language for multi-speaker
340
),
341
audio_config=texttospeech.AudioConfig(
342
audio_encoding=texttospeech.AudioEncoding.LINEAR16
343
)
344
)
345
```
346
347
## Practical Examples
348
349
### File Processing
350
351
```api { .api }
352
import os
353
from google.cloud import texttospeech
354
355
def text_file_to_speech(input_file_path, output_file_path, voice_name=None):
    """Read a UTF-8 text file and write its synthesized speech to an audio file.

    Args:
        input_file_path: Path of the text file to read.
        output_file_path: Destination path for the MP3 audio.
        voice_name: Optional specific voice model name; defaults to
            "en-US-Neural2-A" when not given.
    """
    client = texttospeech.TextToSpeechClient()

    # Load the source text.
    with open(input_file_path, 'r', encoding='utf-8') as source:
        text_to_speak = source.read()

    # Build the full synthesis request in one expression.
    synthesis_request = texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text_to_speak),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name=voice_name or "en-US-Neural2-A",
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
        ),
    )

    result = client.synthesize_speech(request=synthesis_request)

    # Persist the synthesized audio bytes.
    with open(output_file_path, "wb") as sink:
        sink.write(result.audio_content)

    print(f"Audio content written to '{output_file_path}'")
387
388
# Usage
389
text_file_to_speech("input.txt", "output.mp3", "en-US-Wavenet-D")
390
```
391
392
### Batch Processing
393
394
```api { .api }
395
from google.cloud import texttospeech
396
import concurrent.futures
397
398
def synthesize_text_batch(texts, output_dir="outputs"):
    """Synthesize multiple texts in parallel.

    Args:
        texts: Iterable of strings to synthesize.
        output_dir: Directory the MP3 files are written to (created if missing).

    Returns:
        List of output file paths, in the same order as `texts`.
    """
    import os

    client = texttospeech.TextToSpeechClient()
    # Ensure the destination directory exists before any worker writes to it.
    os.makedirs(output_dir, exist_ok=True)

    def synthesize_single(text_data):
        text, filename = text_data

        request = texttospeech.SynthesizeSpeechRequest(
            input=texttospeech.SynthesisInput(text=text),
            voice=texttospeech.VoiceSelectionParams(
                language_code="en-US",
                ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
            ),
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3
            )
        )

        response = client.synthesize_speech(request=request)

        # BUG FIX: the path previously interpolated a garbled "(unknown)"
        # placeholder instead of the per-item filename.
        output_path = f"{output_dir}/{filename}.mp3"
        with open(output_path, "wb") as f:
            f.write(response.audio_content)

        return output_path

    # Pair each text with a unique, ordered filename.
    text_data = [(text, f"output_{i}") for i, text in enumerate(texts)]

    # Fan requests out over a small thread pool — synthesis is I/O-bound,
    # so threads overlap the network waits.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(synthesize_single, text_data))

    return results
432
433
# Usage
434
texts = [
435
"First text to synthesize",
436
"Second text to synthesize",
437
"Third text to synthesize"
438
]
439
output_files = synthesize_text_batch(texts)
440
```
441
442
### SSML Template Processing
443
444
```api { .api }
445
from google.cloud import texttospeech
446
447
def synthesize_with_ssml_template(content_parts, template_path="ssml_template.xml"):
    """Use SSML template for consistent speech formatting.

    Args:
        content_parts: Mapping with "title", "content" and "conclusion" keys,
            substituted into the inline SSML template below.
        template_path: NOTE(review): currently unused — presumably intended
            for loading the template from a file; confirm before relying on it.

    Returns:
        The SynthesizeSpeechResponse containing the rendered audio.
    """
    # SSML template with placeholders
    ssml_template = """
    <speak>
        <prosody rate="medium" pitch="normal">
            <emphasis level="moderate">{title}</emphasis>
        </prosody>
        <break time="1s"/>
        <prosody rate="slow">
            {content}
        </prosody>
        <break time="2s"/>
        <prosody rate="fast" pitch="+1st">
            {conclusion}
        </prosody>
    </speak>
    """

    # Substitute the caller-supplied parts into the template.
    rendered_ssml = ssml_template.format(**content_parts)

    tts_client = texttospeech.TextToSpeechClient()

    synthesis_request = texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(ssml=rendered_ssml),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-A",
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16,
            speaking_rate=0.9,
            pitch=1.0,
        ),
    )

    return tts_client.synthesize_speech(request=synthesis_request)
486
487
# Usage
488
content = {
489
"title": "Welcome to our presentation",
490
"content": "This is the main content of our speech synthesis example.",
491
"conclusion": "Thank you for listening!"
492
}
493
response = synthesize_with_ssml_template(content)
494
```
495
496
## Error Handling
497
498
### Synthesis-Specific Errors
499
500
```api { .api }
501
from google.api_core import exceptions
502
from google.cloud import texttospeech
503
504
def safe_synthesize_speech(text, language_code="en-US"):
    """Synthesize speech with comprehensive error handling.

    Returns the MP3 audio bytes on success, or None after printing a
    diagnostic message for any recognized API failure mode.
    """
    try:
        tts_client = texttospeech.TextToSpeechClient()

        synthesis_request = texttospeech.SynthesizeSpeechRequest(
            input=texttospeech.SynthesisInput(text=text),
            voice=texttospeech.VoiceSelectionParams(language_code=language_code),
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3
            ),
        )

        result = tts_client.synthesize_speech(request=synthesis_request)
        return result.audio_content

    # Each handler prints a targeted diagnostic; all of them fall through
    # to the single `return None` below.
    except exceptions.InvalidArgument as e:
        print(f"Invalid request parameters: {e}")
    except exceptions.OutOfRange as e:
        print(f"Parameter out of valid range: {e}")
    except exceptions.FailedPrecondition as e:
        print(f"Failed precondition: {e}")
    except exceptions.ResourceExhausted as e:
        print(f"Quota exceeded or rate limited: {e}")
    except exceptions.Unauthenticated as e:
        print(f"Authentication failed: {e}")
    except exceptions.PermissionDenied as e:
        print(f"Permission denied: {e}")
    except Exception as e:
        # Last-resort catch so callers always get None rather than a crash.
        print(f"Unexpected error: {e}")
    return None
541
542
# Usage with error handling
543
audio_data = safe_synthesize_speech("Hello world", "en-US")
544
if audio_data:
545
with open("safe_output.mp3", "wb") as f:
546
f.write(audio_data)
547
```
548
549
## Performance Optimization
550
551
### Request Optimization
552
553
```api { .api }
554
from google.cloud import texttospeech
555
556
# Optimize for latency
557
def create_low_latency_request(text):
    """Build a synthesis request tuned for fast turnaround.

    Uses a Standard voice, compressed MP3 output, and a reduced sample
    rate to minimize synthesis latency.
    """
    fast_voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Standard-A",  # Standard voices are faster
        advanced_voice_options=texttospeech.AdvancedVoiceOptions(
            low_latency_journey_synthesis=True
        ),
    )
    compact_audio = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,  # MP3 is compressed
        sample_rate_hertz=16000,  # Lower sample rate for faster processing
    )
    return texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text),
        voice=fast_voice,
        audio_config=compact_audio,
    )
572
573
# Optimize for quality
574
def create_high_quality_request(text):
    """Build a synthesis request tuned for maximum audio quality.

    Uses a WaveNet voice with uncompressed LINEAR16 output at a high
    sample rate.
    """
    premium_voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Wavenet-A",  # WaveNet for higher quality
    )
    lossless_audio = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,  # Uncompressed
        sample_rate_hertz=48000,  # High sample rate
    )
    return texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text),
        voice=premium_voice,
        audio_config=lossless_audio,
    )
586
```