Google Cloud Text-to-Speech API client library for converting text to speech with multiple voices and audio formats
—
Speech synthesis is the core functionality of the Google Cloud Text-to-Speech API, converting text input into natural-sounding speech audio. The API supports both plain text and SSML (Speech Synthesis Markup Language) input with extensive configuration options for voice selection and audio output.
from google.cloud import texttospeech

# Initialize client
client = texttospeech.TextToSpeechClient()

# Create synthesis request
request = texttospeech.SynthesizeSpeechRequest(
    input=texttospeech.SynthesisInput(text="Hello, this is a text-to-speech demo"),
    voice=texttospeech.VoiceSelectionParams(
        language_code="en-US",
        ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
    ),
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
    ),
)

# Perform synthesis
response = client.synthesize_speech(request=request)

# Access audio data
audio_content = response.audio_content  # bytes

from google.cloud import texttospeech
# SSML input with markup
ssml_text = """
<speak>
<prosody rate="slow" pitch="+2st">
Hello, this is spoken slowly with higher pitch.
</prosody>
<break time="1s"/>
<prosody rate="fast" pitch="-2st">
And this is spoken quickly with lower pitch.
</prosody>
</speak>
"""

request = texttospeech.SynthesizeSpeechRequest(
    input=texttospeech.SynthesisInput(ssml=ssml_text),
    voice=texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Wavenet-D",  # Specific voice model
    ),
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,
        sample_rate_hertz=24000,
    ),
)

response = client.synthesize_speech(request=request)

from google.cloud.texttospeech import SynthesisInput
# Plain text input
text_input = SynthesisInput(text="Plain text to synthesize")

# SSML input
ssml_input = SynthesisInput(
    ssml='<speak>SSML <emphasis level="strong">markup</emphasis> text</speak>'
)

# Multi-speaker input.
# MultiSpeakerMarkup carries a list of turns (speaker + text); it has no
# ``ssml`` field, so the dialogue is expressed as one Turn per utterance.
# NOTE(review): speaker labels must be ones the chosen multi-speaker voice
# accepts ("R"/"S" are the labels used in Google's samples) - confirm.
multi_speaker_input = SynthesisInput(
    multi_speaker_markup=texttospeech.MultiSpeakerMarkup(
        turns=[
            texttospeech.MultiSpeakerMarkup.Turn(speaker="R", text="Hello"),
            texttospeech.MultiSpeakerMarkup.Turn(speaker="S", text="World"),
        ]
    )
)

# Custom pronunciations with synthesis input
from google.cloud.texttospeech import (
    SynthesisInput,
    CustomPronunciations,
    CustomPronunciationParams,
)

# Define custom pronunciations.
# Each entry maps a phrase to its phonetic spelling: the spelling goes in
# ``pronunciation`` and ``phonetic_encoding`` says which alphabet it uses.
custom_pronunciations = CustomPronunciations(
    pronunciations=[
        CustomPronunciationParams(
            phrase="Anthropic",
            pronunciation="ˌænθrəˈpɪk",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_IPA,
        ),
        CustomPronunciationParams(
            phrase="Claude",
            pronunciation="klɔːd",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_IPA,
        ),
    ]
)

# Use with synthesis.
# Pronunciation overrides are attached to the input text (SynthesisInput),
# not to the voice selection.
request = texttospeech.SynthesizeSpeechRequest(
    input=SynthesisInput(
        text="Hello from Anthropic's Claude AI assistant",
        custom_pronunciations=custom_pronunciations,
    ),
    voice=texttospeech.VoiceSelectionParams(
        language_code="en-US",
    ),
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
    ),
)

from google.cloud.texttospeech import VoiceSelectionParams, SsmlVoiceGender
# Basic voice selection
voice = VoiceSelectionParams(
    language_code="en-US",  # Required: BCP-47 language code
    ssml_gender=SsmlVoiceGender.MALE,  # Optional: voice gender
)

# Specific voice model selection
voice = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Wavenet-A",  # Specific voice name
)

# Custom voice model
voice = VoiceSelectionParams(
    language_code="en-US",
    custom_voice=texttospeech.CustomVoiceParams(
        model="projects/your-project/locations/us-central1/models/your-model"
    ),
)

from google.cloud.texttospeech import (
    VoiceSelectionParams,
    AdvancedVoiceOptions,
    VoiceCloneParams,
)
# Advanced voice options
voice = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Wavenet-A",
    advanced_voice_options=AdvancedVoiceOptions(
        low_latency_journey_synthesis=True
    ),
)

# Voice cloning parameters.
# The VoiceCloneParams field is named ``voice_cloning_key``.
voice = VoiceSelectionParams(
    language_code="en-US",
    voice_clone=VoiceCloneParams(
        voice_cloning_key="your-voice-clone-key"
    ),
)

from google.cloud.texttospeech import AudioConfig, AudioEncoding
# Basic audio configuration
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.MP3,  # Required: output format
    sample_rate_hertz=22050,  # Optional: sample rate
    speaking_rate=1.0,  # Optional: speech rate (0.25-4.0)
    pitch=0.0,  # Optional: pitch (-20.0 to 20.0)
    volume_gain_db=0.0,  # Optional: volume gain (-96.0 to 16.0)
)

# High-quality linear PCM
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,
    sample_rate_hertz=48000,
    speaking_rate=0.9,
    pitch=2.0,
)

# OGG Opus for streaming
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.OGG_OPUS,
    sample_rate_hertz=48000,
)

from google.cloud.texttospeech import AudioConfig, AudioEncoding
# Audio with effects profile
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.MP3,
    effects_profile_id=["telephony-class-application"],  # Audio effects
    speaking_rate=1.2,
    pitch=-2.0,
    volume_gain_db=3.0,
)

# Multiple effects profiles
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,
    effects_profile_id=[
        "wearable-class-device",
        "handset-class-device",
    ],
    sample_rate_hertz=16000,
)

from google.cloud.texttospeech import (
    SynthesizeSpeechRequest,
    SynthesisInput,
    VoiceSelectionParams,
    AudioConfig,
)
# Complete request configuration
request = SynthesizeSpeechRequest(
    input=SynthesisInput(text="Text to synthesize"),
    voice=VoiceSelectionParams(
        language_code="en-US",
        ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
    ),
    audio_config=AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,
        sample_rate_hertz=22050,
    ),
)

# Request with advanced features
request = SynthesizeSpeechRequest(
    input=SynthesisInput(
        ssml='<speak>Hello <mark name="greeting"/>world!</speak>'
    ),
    voice=VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Neural2-A",
    ),
    audio_config=AudioConfig(
        audio_encoding=AudioEncoding.MP3,
        effects_profile_id=["small-bluetooth-speaker-class-device"],
    ),
)

from google.cloud.texttospeech import SynthesizeSpeechResponse
# Standard response
response = client.synthesize_speech(request=request)

# Access response data
audio_content = response.audio_content  # bytes: synthesized audio data

# Response provides audio as bytes
with open("output.mp3", "wb") as audio_file:
    audio_file.write(response.audio_content)

# Report the size of the returned audio payload
audio_size = len(response.audio_content)
print(f"Generated {audio_size} bytes of audio")

from google.cloud.texttospeech import (
    SynthesisInput,
    MultiSpeakerMarkup,
    VoiceSelectionParams,
)
# Multi-speaker dialogue.
# MultiSpeakerMarkup takes a list of turns (speaker label + text) rather than
# an SSML document, so the conversation is listed one utterance per turn.
# NOTE(review): the speaker labels and the multi-speaker voice name below
# follow Google's published sample - confirm they are available in your
# project/region.
dialogue = [
    ("R", "Hello, I'm the first speaker."),
    ("S", "And I'm the second speaker."),
    ("T", "Together we create a conversation."),
]

# Configure multi-speaker input
multi_speaker_input = SynthesisInput(
    multi_speaker_markup=MultiSpeakerMarkup(
        turns=[
            MultiSpeakerMarkup.Turn(speaker=speaker, text=text)
            for speaker, text in dialogue
        ]
    )
)

# Create synthesis request
request = texttospeech.SynthesizeSpeechRequest(
    input=multi_speaker_input,
    voice=VoiceSelectionParams(
        language_code="en-US",  # Base language for multi-speaker
        name="en-US-Studio-MultiSpeaker",
    ),
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,
    ),
)

import os
from google.cloud import texttospeech
def text_file_to_speech(input_file_path, output_file_path, voice_name=None):
    """Convert text file to speech audio file.

    Args:
        input_file_path: Path of the UTF-8 text file to read.
        output_file_path: Path the MP3 audio bytes are written to.
        voice_name: Voice to synthesize with; falls back to
            "en-US-Neural2-A" when omitted or empty.
    """
    client = texttospeech.TextToSpeechClient()

    # Read text from file
    with open(input_file_path, 'r', encoding='utf-8') as file:
        text_content = file.read()

    # Configure synthesis
    voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name=voice_name or "en-US-Neural2-A",
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
    )
    request = texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text_content),
        voice=voice,
        audio_config=audio_config,
    )

    # Synthesize speech
    response = client.synthesize_speech(request=request)

    # Write audio file
    with open(output_file_path, "wb") as output_file:
        output_file.write(response.audio_content)
    print(f"Audio content written to '{output_file_path}'")
# Usage
text_file_to_speech("input.txt", "output.mp3", "en-US-Wavenet-D")

from google.cloud import texttospeech
import concurrent.futures
def synthesize_text_batch(texts, output_dir="outputs"):
    """Synthesize multiple texts in parallel.

    Args:
        texts: Iterable of strings; each one is synthesized to its own MP3.
        output_dir: Directory receiving the ``output_<index>.mp3`` files.
            It is created when it does not exist yet.

    Returns:
        List of written file paths, in the same order as ``texts``.
    """
    import os  # function-scope import keeps the snippet self-contained

    client = texttospeech.TextToSpeechClient()

    # open(..., "wb") does not create parent directories, so make sure the
    # target directory exists before any worker tries to write into it.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def synthesize_single(text_data):
        """Synthesize one (text, filename) pair and return the written path."""
        text, filename = text_data
        request = texttospeech.SynthesizeSpeechRequest(
            input=texttospeech.SynthesisInput(text=text),
            voice=texttospeech.VoiceSelectionParams(
                language_code="en-US",
                ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
            ),
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3,
            ),
        )
        response = client.synthesize_speech(request=request)
        output_path = f"{output_dir}/{filename}.mp3"
        with open(output_path, "wb") as f:
            f.write(response.audio_content)
        return output_path

    # Prepare text data
    text_data = [(text, f"output_{i}") for i, text in enumerate(texts)]

    # Process in parallel; executor.map yields results in input order
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(synthesize_single, text_data))
    return results
# Usage
texts = [
    "First text to synthesize",
    "Second text to synthesize",
    "Third text to synthesize",
]
output_files = synthesize_text_batch(texts)

from google.cloud import texttospeech
def synthesize_with_ssml_template(content_parts, template_path="ssml_template.xml"):
    """Use SSML template for consistent speech formatting.

    Args:
        content_parts: Mapping providing the plain-text values for the
            ``title``, ``content`` and ``conclusion`` placeholders.
        template_path: Accepted for interface compatibility; this
            implementation uses the built-in template and does not read it.

    Returns:
        The response returned by ``client.synthesize_speech``.

    Raises:
        KeyError: If ``content_parts`` lacks one of the placeholders.
    """
    from xml.sax.saxutils import escape

    # SSML template with placeholders
    ssml_template = """
<speak>
<prosody rate="medium" pitch="normal">
<emphasis level="moderate">{title}</emphasis>
</prosody>
<break time="1s"/>
<prosody rate="slow">
{content}
</prosody>
<break time="2s"/>
<prosody rate="fast" pitch="+1st">
{conclusion}
</prosody>
</speak>
"""

    # Fill template. The parts are plain text going into an XML document, so
    # '&', '<' and '>' are escaped to keep the resulting SSML well-formed.
    safe_parts = {key: escape(str(value)) for key, value in content_parts.items()}
    ssml_content = ssml_template.format(**safe_parts)

    client = texttospeech.TextToSpeechClient()
    request = texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(ssml=ssml_content),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-A",
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16,
            speaking_rate=0.9,
            pitch=1.0,
        ),
    )
    return client.synthesize_speech(request=request)
# Usage
content = {
    "title": "Welcome to our presentation",
    "content": "This is the main content of our speech synthesis example.",
    "conclusion": "Thank you for listening!",
}
response = synthesize_with_ssml_template(content)

from google.api_core import exceptions
from google.cloud import texttospeech
def safe_synthesize_speech(text, language_code="en-US"):
    """Synthesize speech with comprehensive error handling.

    Args:
        text: Plain text to synthesize.
        language_code: Language code passed to the voice selection.

    Returns:
        The MP3 audio bytes on success, or ``None`` when any error occurred
        (a description of the error is printed instead of being raised).
    """
    try:
        client = texttospeech.TextToSpeechClient()
        request = texttospeech.SynthesizeSpeechRequest(
            input=texttospeech.SynthesisInput(text=text),
            voice=texttospeech.VoiceSelectionParams(language_code=language_code),
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3,
            ),
        )
        response = client.synthesize_speech(request=request)
        return response.audio_content
    except exceptions.InvalidArgument as e:
        print(f"Invalid request parameters: {e}")
        return None
    except exceptions.OutOfRange as e:
        print(f"Parameter out of valid range: {e}")
        return None
    except exceptions.FailedPrecondition as e:
        print(f"Failed precondition: {e}")
        return None
    except exceptions.ResourceExhausted as e:
        print(f"Quota exceeded or rate limited: {e}")
        return None
    except exceptions.Unauthenticated as e:
        print(f"Authentication failed: {e}")
        return None
    except exceptions.PermissionDenied as e:
        print(f"Permission denied: {e}")
        return None
    except Exception as e:
        # Deliberate catch-all: this helper reports failures by returning
        # None rather than propagating them to the caller.
        print(f"Unexpected error: {e}")
        return None
# Usage with error handling
audio_data = safe_synthesize_speech("Hello world", "en-US")
if audio_data:
    with open("safe_output.mp3", "wb") as f:
        f.write(audio_data)

from google.cloud import texttospeech
# Optimize for latency
def create_low_latency_request(text):
    """Build a synthesis request configured for fast turnaround.

    Args:
        text: Plain text to synthesize.

    Returns:
        A ``SynthesizeSpeechRequest`` using a Standard voice, MP3 output
        and a 16 kHz sample rate.
    """
    return texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Standard-A",  # Standard voices are faster
            # NOTE(review): the option name suggests it targets Journey
            # voices - confirm it has an effect with a Standard voice.
            advanced_voice_options=texttospeech.AdvancedVoiceOptions(
                low_latency_journey_synthesis=True
            ),
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,  # MP3 is compressed
            sample_rate_hertz=16000,  # Lower sample rate for faster processing
        ),
    )
# Optimize for quality
def create_high_quality_request(text):
    """Build a synthesis request configured for audio quality.

    Args:
        text: Plain text to synthesize.

    Returns:
        A ``SynthesizeSpeechRequest`` using a WaveNet voice with
        uncompressed LINEAR16 output at 48 kHz.
    """
    return texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Wavenet-A",  # WaveNet for higher quality
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16,  # Uncompressed
            sample_rate_hertz=48000,  # High sample rate
        ),
    )

# Install with Tessl CLI
npx tessl i tessl/pypi-google-cloud-texttospeech