Google Cloud Text-to-Speech API client library for converting text to speech with multiple voices and audio formats
—
The Google Cloud Text-to-Speech API provides extensive configuration options through various classes and types. These configuration objects control voice selection, audio output, input formatting, and advanced features like custom pronunciations and multi-speaker synthesis.
from google.cloud.texttospeech import SynthesisInput, MultiSpeakerMarkup

# Plain text input
text_input = SynthesisInput(
    text="Convert this plain text to speech"
)

# SSML input
ssml_input = SynthesisInput(
    ssml='<speak>Convert this <emphasis level="strong">SSML</emphasis> to speech</speak>'
)

# Multi-speaker markup input
multi_speaker_input = SynthesisInput(
    multi_speaker_markup=MultiSpeakerMarkup(
        ssml='''
<speak>
<voice name="en-US-Neural2-A">Hello from speaker one.</voice>
<voice name="en-US-Neural2-C">And greetings from speaker two.</voice>
</speak>
'''
    )
)

# SynthesisInput only accepts ONE of: text, ssml, or multi_speaker_markup.
# Using multiple will raise an error.
from google.cloud.texttospeech import (
    VoiceSelectionParams,
    SsmlVoiceGender,
    CustomPronunciations,
    CustomPronunciationParams,
    AdvancedVoiceOptions,
    CustomVoiceParams,
    VoiceCloneParams
)
# Basic voice selection
basic_voice = VoiceSelectionParams(
    language_code="en-US",  # Required: BCP-47 language code
    ssml_gender=SsmlVoiceGender.FEMALE  # Optional: voice gender preference
)

# Specific voice selection
specific_voice = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Wavenet-D"  # Exact voice model name
)

# Voice with custom pronunciations
voice_with_pronunciations = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-A",
    custom_pronunciations=CustomPronunciations(
        pronunciations=[
            CustomPronunciationParams(
                phrase="GitHub",
                ipa="ˈɡɪt hʌb",
                phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
            ),
            CustomPronunciationParams(
                phrase="API",
                ipa="ˌeɪ piː ˈaɪ",
                phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
            )
        ]
    )
)

# Voice with advanced options
advanced_voice = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-C",
    advanced_voice_options=AdvancedVoiceOptions(
        low_latency_journey_synthesis=True  # Enable low-latency processing
    )
)

# Custom voice model
custom_voice = VoiceSelectionParams(
    language_code="en-US",
    custom_voice=CustomVoiceParams(
        model="projects/your-project/locations/us-central1/models/custom-model"
    )
)

# Voice cloning
cloned_voice = VoiceSelectionParams(
    language_code="en-US",
    voice_clone=VoiceCloneParams(
        voice_clone_key="your-voice-clone-key"
    )
)

from google.cloud.texttospeech import AudioConfig, AudioEncoding
# Basic audio configuration
basic_audio = AudioConfig(
    audio_encoding=AudioEncoding.MP3,  # Required: output format
    sample_rate_hertz=22050  # Optional: sample rate (Hz)
)

# Complete audio configuration
complete_audio = AudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,  # Audio format
    sample_rate_hertz=24000,  # Sample rate
    speaking_rate=1.0,  # Speech rate (0.25-4.0)
    pitch=0.0,  # Pitch adjustment (-20.0 to 20.0)
    volume_gain_db=0.0,  # Volume gain (-96.0 to 16.0)
    effects_profile_id=["large-home-entertainment-class-device"]  # Audio effects
)

# High-quality audio configuration
high_quality_audio = AudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,
    sample_rate_hertz=48000,
    speaking_rate=0.95,
    pitch=1.0,
    volume_gain_db=2.0
)

# Compressed audio for streaming
streaming_audio = AudioConfig(
    audio_encoding=AudioEncoding.OGG_OPUS,
    sample_rate_hertz=48000,
    speaking_rate=1.1,
    effects_profile_id=["wearable-class-device"]
)

# Telephony optimized audio
telephony_audio = AudioConfig(
    audio_encoding=AudioEncoding.MULAW,
    sample_rate_hertz=8000,
    speaking_rate=1.2,
    effects_profile_id=["telephony-class-application"]
)

from google.cloud.texttospeech import Voice, SsmlVoiceGender
# Voice object (returned by list_voices())
# Contains voice information and capabilities
def analyze_voice_properties(voice: "Voice"):
    """Print the key properties and the inferred tier of a Voice object.

    Args:
        voice: A ``texttospeech.Voice`` message, or any object exposing
            ``name``, ``language_codes``, ``ssml_gender`` and
            ``natural_sample_rate_hertz`` attributes.
    """
    print(f"Name: {voice.name}")  # e.g., "en-US-Wavenet-A"
    print(f"Language Codes: {voice.language_codes}")  # e.g., ["en-US"]
    print(f"SSML Gender: {voice.ssml_gender}")  # SsmlVoiceGender enum
    print(f"Natural Sample Rate: {voice.natural_sample_rate_hertz} Hz")  # e.g., 24000
    # The voice tier is encoded in the voice name (e.g. "en-US-Neural2-A"),
    # so categorize by substring match.
    if "Neural2" in voice.name:
        print("Type: Premium Neural Voice")
    elif "Wavenet" in voice.name:
        print("Type: High-Quality Neural Voice")
    elif "Standard" in voice.name:
        print("Type: Standard Voice")
    elif "Studio" in voice.name:
        print("Type: Studio Voice")
    else:
        print("Type: Custom or Special Voice")


# Example usage with actual Voice objects
# voices_response = client.list_voices()
# for voice in voices_response.voices:
#     analyze_voice_properties(voice)
from google.cloud.texttospeech import StreamingAudioConfig, AudioEncoding
# Basic streaming audio configuration
streaming_basic = StreamingAudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,  # Required: audio format
    sample_rate_hertz=22050  # Required: sample rate
)

# Advanced streaming audio configuration
streaming_advanced = StreamingAudioConfig(
    audio_encoding=AudioEncoding.OGG_OPUS,  # Compressed format
    sample_rate_hertz=48000,  # High sample rate
    speaking_rate=1.0,  # Normal speech rate
    pitch=0.0,  # Neutral pitch
    volume_gain_db=1.0,  # Slight volume boost
    effects_profile_id=["small-bluetooth-speaker-class-device"]  # Audio effects
)

# Low-latency streaming configuration
streaming_low_latency = StreamingAudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,  # Lower rate for speed
    speaking_rate=1.1  # Slightly faster
)

# High-quality streaming configuration
streaming_high_quality = StreamingAudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,
    sample_rate_hertz=48000,
    speaking_rate=0.9,  # Slightly slower
    pitch=-0.5,  # Lower pitch
    volume_gain_db=2.0  # Volume boost
)

from google.cloud.texttospeech import (
    StreamingSynthesizeConfig,
    VoiceSelectionParams,
    StreamingAudioConfig
)
# Complete streaming synthesis configuration
streaming_config = StreamingSynthesizeConfig(
    voice=VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Neural2-A",
        ssml_gender=SsmlVoiceGender.FEMALE
    ),
    audio_config=StreamingAudioConfig(
        audio_encoding=AudioEncoding.LINEAR16,
        sample_rate_hertz=22050,
        speaking_rate=1.0,
        pitch=0.0,
        volume_gain_db=0.0
    )
)

# Low-latency streaming configuration
low_latency_streaming = StreamingSynthesizeConfig(
    voice=VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Standard-B",  # Standard voice for speed
        advanced_voice_options=AdvancedVoiceOptions(
            low_latency_journey_synthesis=True
        )
    ),
    audio_config=StreamingAudioConfig(
        audio_encoding=AudioEncoding.LINEAR16,
        sample_rate_hertz=16000  # Lower sample rate
    )
)

# Multi-language streaming configuration
multilang_streaming = StreamingSynthesizeConfig(
    voice=VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Polyglot-1"  # Polyglot voice if available
    ),
    audio_config=StreamingAudioConfig(
        audio_encoding=AudioEncoding.MP3,
        sample_rate_hertz=24000
    )
)

from google.cloud.texttospeech import StreamingSynthesisInput
# Text input for streaming
text_stream_input = StreamingSynthesisInput(
    text="This text will be streamed to the synthesis service."
)

# SSML input for streaming
ssml_stream_input = StreamingSynthesisInput(
    ssml='<speak>This <emphasis level="moderate">SSML content</emphasis> will be streamed.</speak>'
)

# Note: StreamingSynthesisInput accepts either text OR ssml, not both.
# Each streaming request should contain one input chunk.
from google.cloud.texttospeech import AdvancedVoiceOptions
# Advanced voice configuration
advanced_options = AdvancedVoiceOptions(
    low_latency_journey_synthesis=True  # Enable low-latency processing
)

# Usage in voice selection
voice_with_advanced = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-A",
    advanced_voice_options=advanced_options
)

# Direct configuration
direct_advanced_voice = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-C",
    advanced_voice_options=AdvancedVoiceOptions(
        low_latency_journey_synthesis=True
    )
)

from google.cloud.texttospeech import (
    CustomPronunciations,
    CustomPronunciationParams
)
# Individual pronunciation parameter
# NOTE(review): some library versions name the pronunciation text field
# `pronunciation` instead of `ipa` -- confirm against the installed version.
pronunciation_param = CustomPronunciationParams(
    phrase="PyTorch",  # Word or phrase to customize
    ipa="ˈpaɪ tɔrʧ",  # IPA pronunciation
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA  # Encoding type
)

# X-SAMPA encoding example
xsampa_param = CustomPronunciationParams(
    phrase="neural",
    ipa="n\"jU@r@l",  # X-SAMPA notation
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.X_SAMPA
)

# Collection of custom pronunciations
custom_pronunciations = CustomPronunciations(
    pronunciations=[
        CustomPronunciationParams(
            phrase="TensorFlow",
            ipa="ˈtɛnsər floʊ",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="Kubernetes",
            ipa="ˌkubərˈnɛtɪs",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="OAuth",
            ipa="ˈoʊ ɔːθ",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="JSON",
            ipa="ˈdʒeɪ sɒn",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        )
    ]
)

# Technical terms pronunciations
tech_pronunciations = CustomPronunciations(
    pronunciations=[
        CustomPronunciationParams(
            phrase="API", ipa="ˌeɪ piː ˈaɪ",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="HTTP", ipa="ˌeɪʧ tiː tiː ˈpiː",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="URL", ipa="ˌjuː ɑːr ˈɛl",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="SQL", ipa="ˈsiː kwəl",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        )
    ]
)

from google.cloud.texttospeech import MultiSpeakerMarkup
# Basic multi-speaker configuration
# NOTE(review): current library versions model MultiSpeakerMarkup as a
# `turns` list of MultiSpeakerMarkup.Turn messages; the `ssml=` form shown
# here should be verified against the installed version.
multi_speaker = MultiSpeakerMarkup(
    ssml='''
<speak>
<voice name="en-US-Neural2-A">
Hello, I'm the first speaker in this conversation.
</voice>
<voice name="en-US-Neural2-C">
And I'm the second speaker responding to you.
</voice>
</speak>
'''
)

# Complex multi-speaker conversation
conversation_markup = MultiSpeakerMarkup(
    ssml='''
<speak>
<voice name="en-US-Neural2-A">
<prosody rate="medium" pitch="normal">
Welcome to our technical presentation.
</prosody>
</voice>
<break time="1s"/>
<voice name="en-US-Neural2-C">
<prosody rate="slow" pitch="+2st">
Today we'll discuss advanced AI concepts.
</prosody>
</voice>
<break time="2s"/>
<voice name="en-US-Wavenet-D">
<prosody rate="fast" pitch="-1st">
Let's start with the technical implementation details.
</prosody>
</voice>
</speak>
'''
)

# Dialogue with emotions and pacing
dialogue_markup = MultiSpeakerMarkup(
    ssml='''
<speak>
<voice name="en-US-Neural2-A">
<prosody rate="medium" pitch="normal" volume="loud">
I have exciting news to share!
</prosody>
</voice>
<voice name="en-US-Neural2-C">
<prosody rate="slow" pitch="low" volume="soft">
Please, tell me more about it.
</prosody>
</voice>
<voice name="en-US-Neural2-A">
<prosody rate="fast" pitch="high" volume="loud">
We've achieved a breakthrough in our research!
</prosody>
</voice>
</speak>
'''
)

from google.cloud.texttospeech import CustomVoiceParams
# Custom voice model configuration
custom_voice_params = CustomVoiceParams(
    model="projects/your-project-id/locations/us-central1/models/your-custom-voice-model"
)

# Usage with voice selection
voice_with_custom_model = VoiceSelectionParams(
    language_code="en-US",
    custom_voice=custom_voice_params
)

# Complete custom voice configuration
complete_custom_voice = VoiceSelectionParams(
    language_code="en-US",
    custom_voice=CustomVoiceParams(
        model="projects/your-project-id/locations/us-central1/models/custom-narrator-voice"
    ),
    custom_pronunciations=CustomPronunciations(
        pronunciations=[
            CustomPronunciationParams(
                phrase="company_name",
                ipa="ˈkʌmpəni neɪm",
                phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
            )
        ]
    )
)

from google.cloud.texttospeech import VoiceCloneParams
# Voice cloning configuration
voice_clone_params = VoiceCloneParams(
    voice_clone_key="your-voice-clone-key-from-console"
)

# Usage with voice selection
cloned_voice_selection = VoiceSelectionParams(
    language_code="en-US",
    voice_clone=voice_clone_params
)

# Complete cloned voice setup
complete_cloned_voice = VoiceSelectionParams(
    language_code="en-US",
    voice_clone=VoiceCloneParams(
        voice_clone_key="abcd-1234-efgh-5678"
    ),
    advanced_voice_options=AdvancedVoiceOptions(
        low_latency_journey_synthesis=True
    )
)

from google.cloud.texttospeech import AudioEncoding
# Available audio encoding formats
LINEAR16 = AudioEncoding.LINEAR16  # 16-bit PCM with WAV header (lossless)
MP3 = AudioEncoding.MP3  # MP3 at 32kbps (compressed)
OGG_OPUS = AudioEncoding.OGG_OPUS  # Opus in Ogg container (compressed)
MULAW = AudioEncoding.MULAW  # 8-bit G.711 PCMU/mu-law (telephony)
ALAW = AudioEncoding.ALAW  # 8-bit G.711 PCMA/A-law (telephony)
PCM = AudioEncoding.PCM  # 16-bit PCM without header (raw)
M4A = AudioEncoding.M4A  # M4A format (compressed)
UNSPECIFIED = AudioEncoding.AUDIO_ENCODING_UNSPECIFIED  # Not specified

# Usage in audio configuration
high_quality_config = AudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,  # Best quality
    sample_rate_hertz=48000
)
compressed_config = AudioConfig(
    audio_encoding=AudioEncoding.MP3,  # Good compression
    sample_rate_hertz=22050
)
telephony_config = AudioConfig(
    audio_encoding=AudioEncoding.MULAW,  # Telephony standard
    sample_rate_hertz=8000
)

from google.cloud.texttospeech import SsmlVoiceGender
# Available gender options
MALE = SsmlVoiceGender.MALE  # Male voice
FEMALE = SsmlVoiceGender.FEMALE  # Female voice
NEUTRAL = SsmlVoiceGender.NEUTRAL  # Gender-neutral voice
UNSPECIFIED = SsmlVoiceGender.SSML_VOICE_GENDER_UNSPECIFIED  # No preference

# Usage in voice selection
male_voice = VoiceSelectionParams(
    language_code="en-US",
    ssml_gender=SsmlVoiceGender.MALE
)
female_voice = VoiceSelectionParams(
    language_code="en-US",
    ssml_gender=SsmlVoiceGender.FEMALE
)
neutral_voice = VoiceSelectionParams(
    language_code="en-US",
    ssml_gender=SsmlVoiceGender.NEUTRAL
)

from google.cloud.texttospeech import CustomPronunciationParams
# Available phonetic encoding options
IPA = CustomPronunciationParams.PhoneticEncoding.IPA  # International Phonetic Alphabet
X_SAMPA = CustomPronunciationParams.PhoneticEncoding.X_SAMPA  # X-SAMPA notation
UNSPECIFIED = CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_UNSPECIFIED

# Usage in pronunciation parameters
ipa_pronunciation = CustomPronunciationParams(
    phrase="example",
    ipa="ɪɡˈzæmpəl",
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
)
xsampa_pronunciation = CustomPronunciationParams(
    phrase="example",
    ipa="Ig\"z{mp@l",
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.X_SAMPA
)


def validate_audio_config(audio_config: "AudioConfig") -> tuple[bool, list[str]]:
    """Validate audio configuration parameters.

    Returns:
        (is_valid, errors) where ``errors`` lists every problem found.

    Note: zero/unset numeric fields are skipped by the truthiness checks
    below; zero is a valid (neutral) value for pitch and volume_gain_db,
    so skipping it never hides an error.
    """
    errors = []
    # audio_encoding is the only required field.
    if not hasattr(audio_config, 'audio_encoding') or not audio_config.audio_encoding:
        errors.append("audio_encoding is required")
    # Sample rate must be one of the rates supported by the API.
    if hasattr(audio_config, 'sample_rate_hertz') and audio_config.sample_rate_hertz:
        sample_rate = audio_config.sample_rate_hertz
        valid_rates = [8000, 16000, 22050, 24000, 32000, 44100, 48000]
        if sample_rate not in valid_rates:
            errors.append(f"sample_rate_hertz must be one of {valid_rates}, got {sample_rate}")
    # speaking_rate: documented API range is 0.25-4.0.
    if hasattr(audio_config, 'speaking_rate') and audio_config.speaking_rate:
        rate = audio_config.speaking_rate
        if not (0.25 <= rate <= 4.0):
            errors.append(f"speaking_rate must be between 0.25 and 4.0, got {rate}")
    # pitch: documented API range is -20.0 to 20.0 semitones.
    if hasattr(audio_config, 'pitch') and audio_config.pitch:
        pitch = audio_config.pitch
        if not (-20.0 <= pitch <= 20.0):
            errors.append(f"pitch must be between -20.0 and 20.0, got {pitch}")
    # volume_gain_db: documented API range is -96.0 to 16.0 dB.
    if hasattr(audio_config, 'volume_gain_db') and audio_config.volume_gain_db:
        volume = audio_config.volume_gain_db
        if not (-96.0 <= volume <= 16.0):
            errors.append(f"volume_gain_db must be between -96.0 and 16.0, got {volume}")
    return len(errors) == 0, errors
def validate_voice_selection(voice: "VoiceSelectionParams") -> tuple[bool, list[str]]:
    """Validate voice selection parameters.

    Returns:
        (is_valid, errors) where ``errors`` lists every problem found.
    """
    errors = []
    # language_code is the only required field.
    if not hasattr(voice, 'language_code') or not voice.language_code:
        errors.append("language_code is required")
    else:
        # Basic shape check for BCP-47 (e.g. 'en-US'); not a full validation.
        lang_code = voice.language_code
        if '-' not in lang_code or len(lang_code) < 2:
            errors.append(f"language_code should be in BCP-47 format (e.g., 'en-US'), got '{lang_code}'")
    # name, custom_voice and voice_clone are mutually exclusive ways
    # of choosing a voice.
    specified_count = sum([
        bool(getattr(voice, 'name', None)),
        bool(getattr(voice, 'custom_voice', None)),
        bool(getattr(voice, 'voice_clone', None))
    ])
    if specified_count > 1:
        errors.append("Only one of 'name', 'custom_voice', or 'voice_clone' should be specified")
    return len(errors) == 0, errors
# Usage examples
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.MP3,
    sample_rate_hertz=22050,
    speaking_rate=1.5,
    pitch=2.0
)
is_valid, validation_errors = validate_audio_config(audio_config)
if not is_valid:
    print(f"Audio config validation errors: {validation_errors}")


class ConfigurationBuilder:
    """Helper class for building complex configurations."""

    @staticmethod
    def build_high_quality_config() -> AudioConfig:
        """Build high-quality audio configuration."""
        return AudioConfig(
            audio_encoding=AudioEncoding.LINEAR16,
            sample_rate_hertz=48000,
            speaking_rate=0.95,
            pitch=0.0,
            volume_gain_db=1.0
        )

    @staticmethod
    def build_streaming_config() -> AudioConfig:
        """Build streaming-optimized audio configuration."""
        return AudioConfig(
            audio_encoding=AudioEncoding.OGG_OPUS,
            sample_rate_hertz=24000,
            speaking_rate=1.1,
            volume_gain_db=0.0
        )

    @staticmethod
    def build_mobile_config() -> AudioConfig:
        """Build mobile-optimized audio configuration."""
        return AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=16000,
            speaking_rate=1.2,
            effects_profile_id=["handset-class-device"]
        )

    @staticmethod
    def build_tech_voice_with_pronunciations(language_code: str = "en-US") -> VoiceSelectionParams:
        """Build voice configuration optimized for technical content."""
        tech_pronunciations = CustomPronunciations(
            pronunciations=[
                CustomPronunciationParams(
                    phrase="API", ipa="ˌeɪ piː ˈaɪ",
                    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
                ),
                CustomPronunciationParams(
                    phrase="JSON", ipa="ˈdʒeɪ sɒn",
                    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
                ),
                CustomPronunciationParams(
                    phrase="HTTP", ipa="ˌeɪʧ tiː tiː ˈpiː",
                    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
                ),
                CustomPronunciationParams(
                    phrase="SQL", ipa="ˈsiː kwəl",
                    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
                )
            ]
        )
        return VoiceSelectionParams(
            language_code=language_code,
            name=f"{language_code}-Neural2-A",
            custom_pronunciations=tech_pronunciations
        )

    @staticmethod
    def build_conversation_voices() -> list[VoiceSelectionParams]:
        """Build multiple voices for conversation synthesis."""
        return [
            VoiceSelectionParams(
                language_code="en-US",
                name="en-US-Neural2-A",  # Female voice
                ssml_gender=SsmlVoiceGender.FEMALE
            ),
            VoiceSelectionParams(
                language_code="en-US",
                name="en-US-Neural2-C",  # Male voice
                ssml_gender=SsmlVoiceGender.MALE
            ),
            VoiceSelectionParams(
                language_code="en-US",
                name="en-US-Neural2-F",  # Neutral voice
                ssml_gender=SsmlVoiceGender.NEUTRAL
            )
        ]


# Usage examples
high_quality_audio = ConfigurationBuilder.build_high_quality_config()
streaming_audio = ConfigurationBuilder.build_streaming_config()
mobile_audio = ConfigurationBuilder.build_mobile_config()
tech_voice = ConfigurationBuilder.build_tech_voice_with_pronunciations("en-US")
conversation_voices = ConfigurationBuilder.build_conversation_voices()class ConfigurationTemplates:
"""Pre-defined configuration templates for common use cases."""
AUDIOBOOK = {
'voice': VoiceSelectionParams(
language_code="en-US",
name="en-US-Wavenet-A"
),
'audio': AudioConfig(
audio_encoding=AudioEncoding.MP3,
sample_rate_hertz=22050,
speaking_rate=0.9,
volume_gain_db=2.0
)
}
PODCAST = {
'voice': VoiceSelectionParams(
language_code="en-US",
name="en-US-Neural2-C"
),
'audio': AudioConfig(
audio_encoding=AudioEncoding.MP3,
sample_rate_hertz=44100,
speaking_rate=1.0,
effects_profile_id=["large-home-entertainment-class-device"]
)
}
NEWS_BROADCAST = {
'voice': VoiceSelectionParams(
language_code="en-US",
name="en-US-Neural2-D",
ssml_gender=SsmlVoiceGender.MALE
),
'audio': AudioConfig(
audio_encoding=AudioEncoding.LINEAR16,
sample_rate_hertz=24000,
speaking_rate=1.1,
pitch=-1.0
)
}
EDUCATIONAL = {
'voice': VoiceSelectionParams(
language_code="en-US",
name="en-US-Neural2-A"
),
'audio': AudioConfig(
audio_encoding=AudioEncoding.MP3,
sample_rate_hertz=22050,
speaking_rate=0.95,
pitch=1.0
)
}
TELEPHONY = {
'voice': VoiceSelectionParams(
language_code="en-US",
name="en-US-Standard-C"
),
'audio': AudioConfig(
audio_encoding=AudioEncoding.MULAW,
sample_rate_hertz=8000,
speaking_rate=1.2,
effects_profile_id=["telephony-class-application"]
)
}
@classmethod
def get_template(cls, template_name: str) -> dict:
"""Get configuration template by name."""
template_map = {
'audiobook': cls.AUDIOBOOK,
'podcast': cls.PODCAST,
'news': cls.NEWS_BROADCAST,
'educational': cls.EDUCATIONAL,
'telephony': cls.TELEPHONY
}
return template_map.get(template_name.lower(), cls.AUDIOBOOK)
@classmethod
def create_request_from_template(cls, template_name: str, text: str) -> 'SynthesizeSpeechRequest':
"""Create synthesis request from template."""
template = cls.get_template(template_name)
return texttospeech.SynthesizeSpeechRequest(
input=SynthesisInput(text=text),
voice=template['voice'],
audio_config=template['audio']
)
# Usage examples
audiobook_config = ConfigurationTemplates.get_template('audiobook')
podcast_request = ConfigurationTemplates.create_request_from_template(
'podcast',
"Welcome to our technology podcast!"
)class ConfigurationBestPractices:
"""Best practices for Text-to-Speech configuration."""
@staticmethod
def recommend_sample_rate(audio_encoding: AudioEncoding, use_case: str) -> int:
"""Recommend optimal sample rate for encoding and use case."""
recommendations = {
AudioEncoding.LINEAR16: {
'high_quality': 48000,
'standard': 24000,
'streaming': 22050,
'mobile': 16000
},
AudioEncoding.MP3: {
'high_quality': 44100,
'standard': 22050,
'streaming': 22050,
'mobile': 16000
},
AudioEncoding.OGG_OPUS: {
'high_quality': 48000,
'standard': 24000,
'streaming': 24000,
'mobile': 16000
},
AudioEncoding.MULAW: {
'telephony': 8000
},
AudioEncoding.ALAW: {
'telephony': 8000
}
}
encoding_rec = recommendations.get(audio_encoding, {})
return encoding_rec.get(use_case, 22050) # Default fallback
@staticmethod
def optimize_for_latency(voice_config: VoiceSelectionParams,
audio_config: AudioConfig) -> tuple[VoiceSelectionParams, AudioConfig]:
"""Optimize configuration for minimal latency."""
# Use Standard voice for speed
optimized_voice = VoiceSelectionParams(
language_code=voice_config.language_code,
name=voice_config.language_code.replace('-', '-Standard-A'),
advanced_voice_options=AdvancedVoiceOptions(
low_latency_journey_synthesis=True
)
)
# Use lower sample rate and compressed format
optimized_audio = AudioConfig(
audio_encoding=AudioEncoding.MP3,
sample_rate_hertz=16000,
speaking_rate=1.1
)
return optimized_voice, optimized_audio
@staticmethod
def optimize_for_quality(voice_config: VoiceSelectionParams,
audio_config: AudioConfig) -> tuple[VoiceSelectionParams, AudioConfig]:
"""Optimize configuration for maximum quality."""
# Use Neural2 or Wavenet voice
voice_name = voice_config.language_code
if 'Neural2' not in voice_config.name and 'Wavenet' not in voice_config.name:
voice_name += '-Neural2-A' # Default to Neural2
else:
voice_name = voice_config.name
optimized_voice = VoiceSelectionParams(
language_code=voice_config.language_code,
name=voice_name
)
# Use uncompressed format with high sample rate
optimized_audio = AudioConfig(
audio_encoding=AudioEncoding.LINEAR16,
sample_rate_hertz=48000,
speaking_rate=0.95, # Slightly slower for clarity
volume_gain_db=1.0
)
return optimized_voice, optimized_audio
# Usage examples
# Optimize for latency
original_voice = VoiceSelectionParams(language_code="en-US")
original_audio = AudioConfig(audio_encoding=AudioEncoding.LINEAR16)
fast_voice, fast_audio = ConfigurationBestPractices.optimize_for_latency(
original_voice, original_audio
)
# Optimize for quality
quality_voice, quality_audio = ConfigurationBestPractices.optimize_for_quality(
original_voice, original_audio
)
# Get recommended sample rate
recommended_rate = ConfigurationBestPractices.recommend_sample_rate(
AudioEncoding.MP3, 'streaming'
)Install with Tessl CLI
npx tessl i tessl/pypi-google-cloud-texttospeech