0
# Speech Synthesis
1
2
## Overview
3
4
Speech synthesis is the core functionality of the Google Cloud Text-to-Speech API, converting text input into natural-sounding speech audio. The API supports both plain text and SSML (Speech Synthesis Markup Language) input with extensive configuration options for voice selection and audio output.
5
6
## Core Synthesis Operations
7
8
### Basic Text Synthesis
9
10
```api { .api }
11
from google.cloud import texttospeech
12
13
# Initialize client
14
client = texttospeech.TextToSpeechClient()
15
16
# Create synthesis request
17
request = texttospeech.SynthesizeSpeechRequest(
18
input=texttospeech.SynthesisInput(text="Hello, this is a text-to-speech demo"),
19
voice=texttospeech.VoiceSelectionParams(
20
language_code="en-US",
21
ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
22
),
23
audio_config=texttospeech.AudioConfig(
24
audio_encoding=texttospeech.AudioEncoding.MP3
25
)
26
)
27
28
# Perform synthesis
29
response = client.synthesize_speech(request=request)
30
31
# Access audio data
32
audio_content = response.audio_content # bytes
33
```
34
35
### SSML Synthesis
36
37
```api { .api }
38
from google.cloud import texttospeech
39
40
# SSML input with markup
41
ssml_text = """
42
<speak>
43
<prosody rate="slow" pitch="+2st">
44
Hello, this is spoken slowly with higher pitch.
45
</prosody>
46
<break time="1s"/>
47
<prosody rate="fast" pitch="-2st">
48
And this is spoken quickly with lower pitch.
49
</prosody>
50
</speak>
51
"""
52
53
request = texttospeech.SynthesizeSpeechRequest(
54
input=texttospeech.SynthesisInput(ssml=ssml_text),
55
voice=texttospeech.VoiceSelectionParams(
56
language_code="en-US",
57
name="en-US-Wavenet-D" # Specific voice model
58
),
59
audio_config=texttospeech.AudioConfig(
60
audio_encoding=texttospeech.AudioEncoding.LINEAR16,
61
sample_rate_hertz=24000
62
)
63
)
64
65
response = client.synthesize_speech(request=request)
66
```
67
68
## Input Configuration
69
70
### SynthesisInput Class
71
72
```api { .api }
73
from google.cloud.texttospeech import SynthesisInput
74
75
# Plain text input
76
text_input = SynthesisInput(text="Plain text to synthesize")
77
78
# SSML input
79
ssml_input = SynthesisInput(
80
ssml='<speak>SSML <emphasis level="strong">markup</emphasis> text</speak>'
81
)
82
83
# Multi-speaker input (conversation turns)
multi_speaker_input = SynthesisInput(
    multi_speaker_markup=texttospeech.MultiSpeakerMarkup(
        turns=[
            texttospeech.MultiSpeakerMarkup.Turn(speaker="R", text="Hello"),
            texttospeech.MultiSpeakerMarkup.Turn(speaker="S", text="World"),
        ]
    )
)
89
```
90
91
### Advanced Input Options
92
93
```api { .api }
94
# Custom pronunciations with synthesis input
95
from google.cloud.texttospeech import (
96
SynthesisInput,
97
CustomPronunciations,
98
CustomPronunciationParams
99
)
100
101
# Define custom pronunciations
102
custom_pronunciations = CustomPronunciations(
103
pronunciations=[
104
CustomPronunciationParams(
105
phrase="Anthropic",
106
ipa="ˌænθrəˈpɪk",
107
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
108
),
109
CustomPronunciationParams(
110
phrase="Claude",
111
ipa="klɔːd",
112
phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
113
)
114
]
115
)
116
117
# Use with synthesis
118
request = texttospeech.SynthesizeSpeechRequest(
119
input=SynthesisInput(text="Hello from Anthropic's Claude AI assistant"),
120
voice=texttospeech.VoiceSelectionParams(
121
language_code="en-US",
122
custom_pronunciations=custom_pronunciations
123
),
124
audio_config=texttospeech.AudioConfig(
125
audio_encoding=texttospeech.AudioEncoding.MP3
126
)
127
)
128
```
129
130
## Voice Selection
131
132
### VoiceSelectionParams Class
133
134
```api { .api }
135
from google.cloud.texttospeech import VoiceSelectionParams, SsmlVoiceGender
136
137
# Basic voice selection
138
voice = VoiceSelectionParams(
139
language_code="en-US", # Required: BCP-47 language code
140
ssml_gender=SsmlVoiceGender.MALE # Optional: voice gender
141
)
142
143
# Specific voice model selection
144
voice = VoiceSelectionParams(
145
language_code="en-US",
146
name="en-US-Wavenet-A" # Specific voice name
147
)
148
149
# Custom voice model
150
voice = VoiceSelectionParams(
151
language_code="en-US",
152
custom_voice=texttospeech.CustomVoiceParams(
153
model="projects/your-project/locations/us-central1/models/your-model"
154
)
155
)
156
```
157
158
### Advanced Voice Configuration
159
160
```api { .api }
161
from google.cloud.texttospeech import (
162
VoiceSelectionParams,
163
AdvancedVoiceOptions,
164
VoiceCloneParams
165
)
166
167
# Advanced voice options
168
voice = VoiceSelectionParams(
169
language_code="en-US",
170
name="en-US-Wavenet-A",
171
advanced_voice_options=AdvancedVoiceOptions(
172
low_latency_journey_synthesis=True
173
)
174
)
175
176
# Voice cloning parameters
177
voice = VoiceSelectionParams(
178
language_code="en-US",
179
voice_clone=VoiceCloneParams(
180
voice_clone_key="your-voice-clone-key"
181
)
182
)
183
```
184
185
## Audio Configuration
186
187
### AudioConfig Class
188
189
```api { .api }
190
from google.cloud.texttospeech import AudioConfig, AudioEncoding
191
192
# Basic audio configuration
193
audio_config = AudioConfig(
194
audio_encoding=AudioEncoding.MP3, # Required: output format
195
sample_rate_hertz=22050, # Optional: sample rate
196
speaking_rate=1.0, # Optional: speech rate (0.25-4.0)
197
pitch=0.0, # Optional: pitch (-20.0 to 20.0)
198
volume_gain_db=0.0 # Optional: volume gain (-96.0 to 16.0)
199
)
200
201
# High-quality linear PCM
202
audio_config = AudioConfig(
203
audio_encoding=AudioEncoding.LINEAR16,
204
sample_rate_hertz=48000,
205
speaking_rate=0.9,
206
pitch=2.0
207
)
208
209
# OGG Opus for streaming
210
audio_config = AudioConfig(
211
audio_encoding=AudioEncoding.OGG_OPUS,
212
sample_rate_hertz=48000
213
)
214
```
215
216
### Audio Effects and Processing
217
218
```api { .api }
219
from google.cloud.texttospeech import AudioConfig, AudioEncoding
220
221
# Audio with effects profile
222
audio_config = AudioConfig(
223
audio_encoding=AudioEncoding.MP3,
224
effects_profile_id=["telephony-class-application"], # Audio effects
225
speaking_rate=1.2,
226
pitch=-2.0,
227
volume_gain_db=3.0
228
)
229
230
# Multiple effects profiles
231
audio_config = AudioConfig(
232
audio_encoding=AudioEncoding.LINEAR16,
233
effects_profile_id=[
234
"wearable-class-device",
235
"handset-class-device"
236
],
237
sample_rate_hertz=16000
238
)
239
```
240
241
## Request and Response Types
242
243
### SynthesizeSpeechRequest Class
244
245
```api { .api }
246
from google.cloud import texttospeech
from google.cloud.texttospeech import (
    SynthesizeSpeechRequest,
    SynthesisInput,
    VoiceSelectionParams,
    AudioConfig,
    AudioEncoding
)
252
253
# Complete request configuration
254
request = SynthesizeSpeechRequest(
255
input=SynthesisInput(text="Text to synthesize"),
256
voice=VoiceSelectionParams(
257
language_code="en-US",
258
ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
259
),
260
audio_config=AudioConfig(
261
audio_encoding=texttospeech.AudioEncoding.LINEAR16,
262
sample_rate_hertz=22050
263
)
264
)
265
266
# Request with advanced features
267
request = SynthesizeSpeechRequest(
268
input=SynthesisInput(
269
ssml='<speak>Hello <mark name="greeting"/>world!</speak>'
270
),
271
voice=VoiceSelectionParams(
272
language_code="en-US",
273
name="en-US-Neural2-A"
274
),
275
audio_config=AudioConfig(
276
audio_encoding=AudioEncoding.MP3,
277
effects_profile_id=["small-bluetooth-speaker-class-device"]
278
)
279
)
280
```
281
282
### SynthesizeSpeechResponse Class
283
284
```api { .api }
285
from google.cloud.texttospeech import SynthesizeSpeechResponse
286
287
# Standard response
288
response = client.synthesize_speech(request=request)
289
290
# Access response data
291
audio_content = response.audio_content # bytes: synthesized audio data
292
293
# Response provides audio as bytes
294
with open("output.mp3", "wb") as audio_file:
295
audio_file.write(response.audio_content)
296
297
# Get audio length and properties
298
audio_size = len(response.audio_content)
299
print(f"Generated {audio_size} bytes of audio")
300
```
301
302
## Multi-Speaker Synthesis
303
304
### MultiSpeakerMarkup Configuration
305
306
```api { .api }
307
from google.cloud.texttospeech import (
308
SynthesisInput,
309
MultiSpeakerMarkup,
310
VoiceSelectionParams
311
)
312
313
# Multi-speaker dialogue expressed as ordered turns.
# MultiSpeakerMarkup takes a list of Turn messages, each with a
# speaker label and plain text — not raw SSML.
conversation_turns = [
    MultiSpeakerMarkup.Turn(speaker="R", text="Hello, I'm the first speaker."),
    MultiSpeakerMarkup.Turn(speaker="S", text="And I'm the second speaker."),
    MultiSpeakerMarkup.Turn(speaker="R", text="Together we create a conversation."),
]

# Configure multi-speaker input
multi_speaker_input = SynthesisInput(
    multi_speaker_markup=MultiSpeakerMarkup(
        turns=conversation_turns
    )
)
334
335
# Create synthesis request
336
request = texttospeech.SynthesizeSpeechRequest(
337
input=multi_speaker_input,
338
voice=VoiceSelectionParams(
339
language_code="en-US" # Base language for multi-speaker
340
),
341
audio_config=texttospeech.AudioConfig(
342
audio_encoding=texttospeech.AudioEncoding.LINEAR16
343
)
344
)
345
```
346
347
## Practical Examples
348
349
### File Processing
350
351
```api { .api }
352
import os
353
from google.cloud import texttospeech
354
355
def text_file_to_speech(input_file_path, output_file_path, voice_name=None):
    """Read a UTF-8 text file and write its synthesized speech to an audio file.

    Args:
        input_file_path: Path of the text file to read.
        output_file_path: Destination path for the MP3 audio.
        voice_name: Optional specific voice model name; defaults to
            "en-US-Neural2-A" when not given.
    """
    client = texttospeech.TextToSpeechClient()

    # Load the source text.
    with open(input_file_path, 'r', encoding='utf-8') as source:
        text_to_speak = source.read()

    # Build the full synthesis request in one expression.
    synthesis_request = texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text_to_speak),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name=voice_name or "en-US-Neural2-A",
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
        ),
    )

    result = client.synthesize_speech(request=synthesis_request)

    # Persist the synthesized audio bytes.
    with open(output_file_path, "wb") as sink:
        sink.write(result.audio_content)

    print(f"Audio content written to '{output_file_path}'")
387
388
# Usage
389
text_file_to_speech("input.txt", "output.mp3", "en-US-Wavenet-D")
390
```
391
392
### Batch Processing
393
394
```api { .api }
395
from google.cloud import texttospeech
396
import concurrent.futures
397
398
def synthesize_text_batch(texts, output_dir="outputs"):
    """Synthesize multiple texts in parallel.

    Args:
        texts: Iterable of strings to synthesize.
        output_dir: Directory the MP3 files are written to (created if missing).

    Returns:
        List of output file paths, in the same order as `texts`.
    """
    import os

    client = texttospeech.TextToSpeechClient()
    # Ensure the destination directory exists before any worker writes to it.
    os.makedirs(output_dir, exist_ok=True)

    def synthesize_single(text_data):
        text, filename = text_data

        request = texttospeech.SynthesizeSpeechRequest(
            input=texttospeech.SynthesisInput(text=text),
            voice=texttospeech.VoiceSelectionParams(
                language_code="en-US",
                ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
            ),
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3
            )
        )

        response = client.synthesize_speech(request=request)

        # BUG FIX: the path previously interpolated a garbled "(unknown)"
        # placeholder instead of the per-item filename.
        output_path = f"{output_dir}/{filename}.mp3"
        with open(output_path, "wb") as f:
            f.write(response.audio_content)

        return output_path

    # Pair each text with a unique, ordered filename.
    text_data = [(text, f"output_{i}") for i, text in enumerate(texts)]

    # Fan requests out over a small thread pool — synthesis is I/O-bound,
    # so threads overlap the network waits.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(synthesize_single, text_data))

    return results
432
433
# Usage
434
texts = [
435
"First text to synthesize",
436
"Second text to synthesize",
437
"Third text to synthesize"
438
]
439
output_files = synthesize_text_batch(texts)
440
```
441
442
### SSML Template Processing
443
444
```api { .api }
445
from google.cloud import texttospeech
446
447
def synthesize_with_ssml_template(content_parts, template_path="ssml_template.xml"):
    """Use SSML template for consistent speech formatting.

    Args:
        content_parts: Mapping with "title", "content" and "conclusion" keys,
            substituted into the inline SSML template below.
        template_path: NOTE(review): currently unused — presumably intended
            for loading the template from a file; confirm before relying on it.

    Returns:
        The SynthesizeSpeechResponse containing the rendered audio.
    """
    # SSML template with placeholders
    ssml_template = """
    <speak>
        <prosody rate="medium" pitch="normal">
            <emphasis level="moderate">{title}</emphasis>
        </prosody>
        <break time="1s"/>
        <prosody rate="slow">
            {content}
        </prosody>
        <break time="2s"/>
        <prosody rate="fast" pitch="+1st">
            {conclusion}
        </prosody>
    </speak>
    """

    # Substitute the caller-supplied parts into the template.
    rendered_ssml = ssml_template.format(**content_parts)

    tts_client = texttospeech.TextToSpeechClient()

    synthesis_request = texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(ssml=rendered_ssml),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-A",
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16,
            speaking_rate=0.9,
            pitch=1.0,
        ),
    )

    return tts_client.synthesize_speech(request=synthesis_request)
486
487
# Usage
488
content = {
489
"title": "Welcome to our presentation",
490
"content": "This is the main content of our speech synthesis example.",
491
"conclusion": "Thank you for listening!"
492
}
493
response = synthesize_with_ssml_template(content)
494
```
495
496
## Error Handling
497
498
### Synthesis-Specific Errors
499
500
```api { .api }
501
from google.api_core import exceptions
502
from google.cloud import texttospeech
503
504
def safe_synthesize_speech(text, language_code="en-US"):
    """Synthesize speech with comprehensive error handling.

    Returns the MP3 audio bytes on success, or None after printing a
    diagnostic message for any recognized API failure mode.
    """
    try:
        tts_client = texttospeech.TextToSpeechClient()

        synthesis_request = texttospeech.SynthesizeSpeechRequest(
            input=texttospeech.SynthesisInput(text=text),
            voice=texttospeech.VoiceSelectionParams(language_code=language_code),
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3
            ),
        )

        result = tts_client.synthesize_speech(request=synthesis_request)
        return result.audio_content

    # Each handler prints a targeted diagnostic; all of them fall through
    # to the single `return None` below.
    except exceptions.InvalidArgument as e:
        print(f"Invalid request parameters: {e}")
    except exceptions.OutOfRange as e:
        print(f"Parameter out of valid range: {e}")
    except exceptions.FailedPrecondition as e:
        print(f"Failed precondition: {e}")
    except exceptions.ResourceExhausted as e:
        print(f"Quota exceeded or rate limited: {e}")
    except exceptions.Unauthenticated as e:
        print(f"Authentication failed: {e}")
    except exceptions.PermissionDenied as e:
        print(f"Permission denied: {e}")
    except Exception as e:
        # Last-resort catch so callers always get None rather than a crash.
        print(f"Unexpected error: {e}")
    return None
541
542
# Usage with error handling
543
audio_data = safe_synthesize_speech("Hello world", "en-US")
544
if audio_data:
545
with open("safe_output.mp3", "wb") as f:
546
f.write(audio_data)
547
```
548
549
## Performance Optimization
550
551
### Request Optimization
552
553
```api { .api }
554
from google.cloud import texttospeech
555
556
# Optimize for latency
557
def create_low_latency_request(text):
    """Build a synthesis request tuned for fast turnaround.

    Uses a Standard voice, compressed MP3 output, and a reduced sample
    rate to minimize synthesis latency.
    """
    fast_voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Standard-A",  # Standard voices are faster
        advanced_voice_options=texttospeech.AdvancedVoiceOptions(
            low_latency_journey_synthesis=True
        ),
    )
    compact_audio = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,  # MP3 is compressed
        sample_rate_hertz=16000,  # Lower sample rate for faster processing
    )
    return texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text),
        voice=fast_voice,
        audio_config=compact_audio,
    )
572
573
# Optimize for quality
574
def create_high_quality_request(text):
    """Build a synthesis request tuned for maximum audio quality.

    Uses a WaveNet voice with uncompressed LINEAR16 output at a high
    sample rate.
    """
    premium_voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Wavenet-A",  # WaveNet for higher quality
    )
    lossless_audio = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,  # Uncompressed
        sample_rate_hertz=48000,  # High sample rate
    )
    return texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text),
        voice=premium_voice,
        audio_config=lossless_audio,
    )
586
```