CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-google-cloud-texttospeech

Google Cloud Text-to-Speech API client library for converting text to speech with multiple voices and audio formats

Pending
Overview
Eval results
Files

configuration-types.mddocs/

Configuration Types

Overview

The Google Cloud Text-to-Speech API provides extensive configuration options through various classes and types. These configuration objects control voice selection, audio output, input formatting, and advanced features like custom pronunciations and multi-speaker synthesis.

Core Configuration Classes

SynthesisInput

from google.cloud.texttospeech import SynthesisInput, MultiSpeakerMarkup

# Plain text input
text_input = SynthesisInput(
    text="Convert this plain text to speech"
)

# SSML input
ssml_input = SynthesisInput(
    ssml='<speak>Convert this <emphasis level="strong">SSML</emphasis> to speech</speak>'
)

# Multi-speaker markup input
# NOTE(review): published google-cloud-texttospeech releases expose a `turns`
# field (list of MultiSpeakerMarkup.Turn) on MultiSpeakerMarkup, not `ssml` —
# verify this keyword argument against the installed library version.
multi_speaker_input = SynthesisInput(
    multi_speaker_markup=MultiSpeakerMarkup(
        ssml='''
        <speak>
            <voice name="en-US-Neural2-A">Hello from speaker one.</voice>
            <voice name="en-US-Neural2-C">And greetings from speaker two.</voice>
        </speak>
        '''
    )
)

# SynthesisInput only accepts ONE of: text, ssml, or multi_speaker_markup
# Using multiple will raise an error

VoiceSelectionParams

from google.cloud.texttospeech import (
    VoiceSelectionParams,
    SsmlVoiceGender,
    CustomPronunciations,
    CustomPronunciationParams,
    AdvancedVoiceOptions,
    CustomVoiceParams,
    VoiceCloneParams
)

# NOTE: `name`, `custom_voice`, and `voice_clone` are alternative ways of
# choosing a voice — set at most one of them on a VoiceSelectionParams.

# Basic voice selection
basic_voice = VoiceSelectionParams(
    language_code="en-US",                           # Required: BCP-47 language code
    ssml_gender=SsmlVoiceGender.FEMALE              # Optional: voice gender preference
)

# Specific voice selection
specific_voice = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Wavenet-D"                         # Exact voice model name
)

# Voice with custom pronunciations
# NOTE(review): newer library versions name the pronunciation-text field
# `pronunciation` rather than `ipa` — confirm against the installed version.
voice_with_pronunciations = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-A",
    custom_pronunciations=CustomPronunciations(
        pronunciations=[
            CustomPronunciationParams(
                phrase="GitHub",
                ipa="ˈɡɪt hʌb",
                phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
            ),
            CustomPronunciationParams(
                phrase="API",
                ipa="ˌeɪ piː ˈaɪ",
                phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
            )
        ]
    )
)

# Voice with advanced options
advanced_voice = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-C",
    advanced_voice_options=AdvancedVoiceOptions(
        low_latency_journey_synthesis=True          # Enable low-latency processing
    )
)

# Custom voice model (full model resource path: projects/.../locations/.../models/...)
custom_voice = VoiceSelectionParams(
    language_code="en-US",
    custom_voice=CustomVoiceParams(
        model="projects/your-project/locations/us-central1/models/custom-model"
    )
)

# Voice cloning (key obtained from the Cloud console / voice-cloning flow)
cloned_voice = VoiceSelectionParams(
    language_code="en-US",
    voice_clone=VoiceCloneParams(
        voice_clone_key="your-voice-clone-key"
    )
)

AudioConfig

from google.cloud.texttospeech import AudioConfig, AudioEncoding

# Basic audio configuration
basic_audio = AudioConfig(
    audio_encoding=AudioEncoding.MP3,               # Required: output format
    sample_rate_hertz=22050                         # Optional: sample rate (Hz)
)

# Complete audio configuration
# Parameter ranges (per API docs): speaking_rate 0.25-4.0, pitch -20.0 to
# 20.0 semitones, volume_gain_db -96.0 to 16.0 dB.
complete_audio = AudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,          # Audio format
    sample_rate_hertz=24000,                        # Sample rate
    speaking_rate=1.0,                              # Speech rate (0.25-4.0)
    pitch=0.0,                                      # Pitch adjustment (-20.0 to 20.0)
    volume_gain_db=0.0,                            # Volume gain (-96.0 to 16.0)
    effects_profile_id=["large-home-entertainment-class-device"]  # Audio effects
)

# High-quality audio configuration
high_quality_audio = AudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,
    sample_rate_hertz=48000,
    speaking_rate=0.95,
    pitch=1.0,
    volume_gain_db=2.0
)

# Compressed audio for streaming
streaming_audio = AudioConfig(
    audio_encoding=AudioEncoding.OGG_OPUS,
    sample_rate_hertz=48000,
    speaking_rate=1.1,
    effects_profile_id=["wearable-class-device"]
)

# Telephony optimized audio (MULAW at 8 kHz matches the G.711 standard)
telephony_audio = AudioConfig(
    audio_encoding=AudioEncoding.MULAW,
    sample_rate_hertz=8000,
    speaking_rate=1.2,
    effects_profile_id=["telephony-class-application"]
)

Voice

from google.cloud.texttospeech import Voice, SsmlVoiceGender

# Voice object (returned by list_voices())
# Contains voice information and capabilities

def analyze_voice_properties(voice: Voice):
    """Print the key properties of a Voice object and classify its family."""

    print(f"Name: {voice.name}")                           # e.g., "en-US-Wavenet-A"
    print(f"Language Codes: {voice.language_codes}")       # e.g., ["en-US"]
    print(f"SSML Gender: {voice.ssml_gender}")            # SsmlVoiceGender enum
    print(f"Natural Sample Rate: {voice.natural_sample_rate_hertz} Hz")  # e.g., 24000

    # Classify the voice family from substrings of its name; first match wins,
    # anything unmatched falls through to the custom/special bucket.
    for marker, type_line in (
        ("Neural2", "Type: Premium Neural Voice"),
        ("Wavenet", "Type: High-Quality Neural Voice"),
        ("Standard", "Type: Standard Voice"),
        ("Studio", "Type: Studio Voice"),
    ):
        if marker in voice.name:
            print(type_line)
            break
    else:
        print("Type: Custom or Special Voice")

# Example usage with actual Voice objects
# voices_response = client.list_voices()
# for voice in voices_response.voices:
#     analyze_voice_properties(voice)

Streaming Configuration Classes

StreamingAudioConfig

from google.cloud.texttospeech import StreamingAudioConfig, AudioEncoding

# Basic streaming audio configuration
streaming_basic = StreamingAudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,          # Required: audio format
    sample_rate_hertz=22050                         # Sample rate in Hz (NOTE(review): verify whether this is strictly required)
)

# Advanced streaming audio configuration
streaming_advanced = StreamingAudioConfig(
    audio_encoding=AudioEncoding.OGG_OPUS,          # Compressed format
    sample_rate_hertz=48000,                        # High sample rate
    speaking_rate=1.0,                              # Normal speech rate
    pitch=0.0,                                      # Neutral pitch
    volume_gain_db=1.0,                            # Slight volume boost
    effects_profile_id=["small-bluetooth-speaker-class-device"]  # Audio effects
)

# Low-latency streaming configuration: a lower sample rate reduces the amount
# of audio data generated and transferred per second.
streaming_low_latency = StreamingAudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,                        # Lower rate for speed
    speaking_rate=1.1                               # Slightly faster
)

# High-quality streaming configuration
streaming_high_quality = StreamingAudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,
    sample_rate_hertz=48000,
    speaking_rate=0.9,                              # Slightly slower
    pitch=-0.5,                                     # Lower pitch
    volume_gain_db=2.0                             # Volume boost
)

StreamingSynthesizeConfig

# These examples also use AudioEncoding, SsmlVoiceGender and
# AdvancedVoiceOptions; the original import list omitted them, so the
# snippet raised NameError when run stand-alone. Import everything it needs.
from google.cloud.texttospeech import (
    StreamingSynthesizeConfig,
    VoiceSelectionParams,
    StreamingAudioConfig,
    AudioEncoding,
    SsmlVoiceGender,
    AdvancedVoiceOptions
)

# Complete streaming synthesis configuration
streaming_config = StreamingSynthesizeConfig(
    voice=VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Neural2-A",
        ssml_gender=SsmlVoiceGender.FEMALE
    ),
    audio_config=StreamingAudioConfig(
        audio_encoding=AudioEncoding.LINEAR16,
        sample_rate_hertz=22050,
        speaking_rate=1.0,
        pitch=0.0,
        volume_gain_db=0.0
    )
)

# Low-latency streaming configuration
low_latency_streaming = StreamingSynthesizeConfig(
    voice=VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Standard-B",                    # Standard voice for speed
        advanced_voice_options=AdvancedVoiceOptions(
            low_latency_journey_synthesis=True
        )
    ),
    audio_config=StreamingAudioConfig(
        audio_encoding=AudioEncoding.LINEAR16,
        sample_rate_hertz=16000                     # Lower sample rate
    )
)

# Multi-language streaming configuration
multilang_streaming = StreamingSynthesizeConfig(
    voice=VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Polyglot-1"                    # Polyglot voice if available
    ),
    audio_config=StreamingAudioConfig(
        audio_encoding=AudioEncoding.MP3,
        sample_rate_hertz=24000
    )
)

StreamingSynthesisInput

from google.cloud.texttospeech import StreamingSynthesisInput

# Text input for streaming
text_stream_input = StreamingSynthesisInput(
    text="This text will be streamed to the synthesis service."
)

# SSML input for streaming
ssml_stream_input = StreamingSynthesisInput(
    ssml='<speak>This <emphasis level="moderate">SSML content</emphasis> will be streamed.</speak>'
)

# Note: StreamingSynthesisInput accepts either text OR ssml, not both
# Each streaming request should contain one input chunk
# (chunks are synthesized in order within a single streaming session)

Advanced Configuration Classes

AdvancedVoiceOptions

from google.cloud.texttospeech import AdvancedVoiceOptions

# Advanced voice configuration
advanced_options = AdvancedVoiceOptions(
    low_latency_journey_synthesis=True             # Enable low-latency processing
)

# Usage in voice selection
# (VoiceSelectionParams is imported in an earlier snippet on this page)
voice_with_advanced = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-A",
    advanced_voice_options=advanced_options
)

# Direct configuration (same result, options built inline)
direct_advanced_voice = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-C",
    advanced_voice_options=AdvancedVoiceOptions(
        low_latency_journey_synthesis=True
    )
)

CustomPronunciations and CustomPronunciationParams

from google.cloud.texttospeech import (
    CustomPronunciations,
    CustomPronunciationParams
)

# NOTE(review): newer library versions name the pronunciation-text field
# `pronunciation` rather than `ipa` — confirm against the installed version.

# Individual pronunciation parameter
pronunciation_param = CustomPronunciationParams(
    phrase="PyTorch",                              # Word or phrase to customize
    ipa="ˈpaɪ tɔrʧ",                               # IPA pronunciation
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA  # Encoding type
)

# X-SAMPA encoding example
# (the X-SAMPA string is still passed via the `ipa` keyword here; the
# phonetic_encoding value tells the API how to interpret it)
xsampa_param = CustomPronunciationParams(
    phrase="neural",
    ipa="n\"jU@r@l",                               # X-SAMPA notation
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.X_SAMPA
)

# Collection of custom pronunciations
custom_pronunciations = CustomPronunciations(
    pronunciations=[
        CustomPronunciationParams(
            phrase="TensorFlow",
            ipa="ˈtɛnsər floʊ",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="Kubernetes",
            ipa="ˌkubərˈnɛtɪs",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="OAuth",
            ipa="ˈoʊ ɔːθ",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="JSON",
            ipa="ˈdʒeɪ sɒn",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        )
    ]
)

# Technical terms pronunciations
tech_pronunciations = CustomPronunciations(
    pronunciations=[
        CustomPronunciationParams(
            phrase="API", ipa="ˌeɪ piː ˈaɪ",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="HTTP", ipa="ˌeɪʧ tiː tiː ˈpiː",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="URL", ipa="ˌjuː ɑːr ˈɛl",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="SQL", ipa="ˈsiː kwəl",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        )
    ]
)

MultiSpeakerMarkup

from google.cloud.texttospeech import MultiSpeakerMarkup

# NOTE(review): published library versions expose `turns` (a list of
# MultiSpeakerMarkup.Turn) on MultiSpeakerMarkup rather than `ssml` —
# verify these constructor keywords against the installed library version.

# Basic multi-speaker configuration
multi_speaker = MultiSpeakerMarkup(
    ssml='''
    <speak>
        <voice name="en-US-Neural2-A">
            Hello, I'm the first speaker in this conversation.
        </voice>
        <voice name="en-US-Neural2-C">
            And I'm the second speaker responding to you.
        </voice>
    </speak>
    '''
)

# Complex multi-speaker conversation with prosody control and pauses
conversation_markup = MultiSpeakerMarkup(
    ssml='''
    <speak>
        <voice name="en-US-Neural2-A">
            <prosody rate="medium" pitch="normal">
                Welcome to our technical presentation.
            </prosody>
        </voice>
        
        <break time="1s"/>
        
        <voice name="en-US-Neural2-C">
            <prosody rate="slow" pitch="+2st">
                Today we'll discuss advanced AI concepts.
            </prosody>
        </voice>
        
        <break time="2s"/>
        
        <voice name="en-US-Wavenet-D">
            <prosody rate="fast" pitch="-1st">
                Let's start with the technical implementation details.
            </prosody>
        </voice>
    </speak>
    '''
)

# Dialogue with emotions and pacing
dialogue_markup = MultiSpeakerMarkup(
    ssml='''
    <speak>
        <voice name="en-US-Neural2-A">
            <prosody rate="medium" pitch="normal" volume="loud">
                I have exciting news to share!
            </prosody>
        </voice>
        
        <voice name="en-US-Neural2-C">
            <prosody rate="slow" pitch="low" volume="soft">
                Please, tell me more about it.
            </prosody>
        </voice>
        
        <voice name="en-US-Neural2-A">
            <prosody rate="fast" pitch="high" volume="loud">
                We've achieved a breakthrough in our research!
            </prosody>
        </voice>
    </speak>
    '''
)

CustomVoiceParams

from google.cloud.texttospeech import CustomVoiceParams

# Custom voice model configuration
# (model is a full AutoML/custom-voice model resource path)
custom_voice_params = CustomVoiceParams(
    model="projects/your-project-id/locations/us-central1/models/your-custom-voice-model"
)

# Usage with voice selection
voice_with_custom_model = VoiceSelectionParams(
    language_code="en-US",
    custom_voice=custom_voice_params
)

# Complete custom voice configuration: a custom model combined with
# custom pronunciations on the same VoiceSelectionParams
complete_custom_voice = VoiceSelectionParams(
    language_code="en-US",
    custom_voice=CustomVoiceParams(
        model="projects/your-project-id/locations/us-central1/models/custom-narrator-voice"
    ),
    custom_pronunciations=CustomPronunciations(
        pronunciations=[
            CustomPronunciationParams(
                phrase="company_name",
                ipa="ˈkʌmpəni neɪm",
                phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
            )
        ]
    )
)

VoiceCloneParams

from google.cloud.texttospeech import VoiceCloneParams

# Voice cloning configuration
# (the clone key is issued by the Cloud console / voice-cloning workflow)
voice_clone_params = VoiceCloneParams(
    voice_clone_key="your-voice-clone-key-from-console"
)

# Usage with voice selection
cloned_voice_selection = VoiceSelectionParams(
    language_code="en-US",
    voice_clone=voice_clone_params
)

# Complete cloned voice setup with low-latency processing enabled
complete_cloned_voice = VoiceSelectionParams(
    language_code="en-US",
    voice_clone=VoiceCloneParams(
        voice_clone_key="abcd-1234-efgh-5678"
    ),
    advanced_voice_options=AdvancedVoiceOptions(
        low_latency_journey_synthesis=True
    )
)

Enums and Constants

AudioEncoding

from google.cloud.texttospeech import AudioEncoding

# Available audio encoding formats
# NOTE(review): PCM and M4A exist only in newer library versions — confirm
# availability before relying on them.
LINEAR16 = AudioEncoding.LINEAR16                   # 16-bit PCM with WAV header (lossless)
MP3 = AudioEncoding.MP3                             # MP3 at 32kbps (compressed)
OGG_OPUS = AudioEncoding.OGG_OPUS                   # Opus in Ogg container (compressed)
MULAW = AudioEncoding.MULAW                         # 8-bit G.711 PCMU/mu-law (telephony)
ALAW = AudioEncoding.ALAW                           # 8-bit G.711 PCMA/A-law (telephony)
PCM = AudioEncoding.PCM                             # 16-bit PCM without header (raw)
M4A = AudioEncoding.M4A                             # M4A format (compressed)
UNSPECIFIED = AudioEncoding.AUDIO_ENCODING_UNSPECIFIED  # Not specified

# Usage in audio configuration
high_quality_config = AudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,          # Best quality
    sample_rate_hertz=48000
)

compressed_config = AudioConfig(
    audio_encoding=AudioEncoding.MP3,               # Good compression
    sample_rate_hertz=22050
)

telephony_config = AudioConfig(
    audio_encoding=AudioEncoding.MULAW,             # Telephony standard
    sample_rate_hertz=8000
)

SsmlVoiceGender

from google.cloud.texttospeech import SsmlVoiceGender

# Available gender options
# NOTE(review): the API documentation marks NEUTRAL as unsupported for many
# voices — verify support for your chosen voice before relying on it.
MALE = SsmlVoiceGender.MALE                         # Male voice
FEMALE = SsmlVoiceGender.FEMALE                     # Female voice
NEUTRAL = SsmlVoiceGender.NEUTRAL                   # Gender-neutral voice
UNSPECIFIED = SsmlVoiceGender.SSML_VOICE_GENDER_UNSPECIFIED  # No preference

# Usage in voice selection
male_voice = VoiceSelectionParams(
    language_code="en-US",
    ssml_gender=SsmlVoiceGender.MALE
)

female_voice = VoiceSelectionParams(
    language_code="en-US", 
    ssml_gender=SsmlVoiceGender.FEMALE
)

neutral_voice = VoiceSelectionParams(
    language_code="en-US",
    ssml_gender=SsmlVoiceGender.NEUTRAL
)

PhoneticEncoding

from google.cloud.texttospeech import CustomPronunciationParams

# Available phonetic encoding options
IPA = CustomPronunciationParams.PhoneticEncoding.IPA        # International Phonetic Alphabet
X_SAMPA = CustomPronunciationParams.PhoneticEncoding.X_SAMPA  # X-SAMPA notation
UNSPECIFIED = CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_UNSPECIFIED

# Usage in pronunciation parameters
# NOTE(review): the phonetic string is passed via the `ipa` keyword in both
# cases here; newer library versions may name this field `pronunciation` —
# confirm against the installed version.
ipa_pronunciation = CustomPronunciationParams(
    phrase="example",
    ipa="ɪɡˈzæmpəl",
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
)

xsampa_pronunciation = CustomPronunciationParams(
    phrase="example", 
    ipa="Ig\"z{mp@l",
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.X_SAMPA
)

Configuration Validation and Helpers

Configuration Validation

def validate_audio_config(audio_config: "AudioConfig") -> tuple[bool, list[str]]:
    """Validate audio configuration parameters.

    Checks the fields of an AudioConfig-like object against the ranges
    documented by the Text-to-Speech API. Zero/unset numeric fields are
    skipped (proto3 scalars default to 0, meaning "use server default").
    The AudioConfig annotation is quoted so the function also works when
    the type is not imported in the current scope.

    Args:
        audio_config: AudioConfig, or any object exposing the same attributes.

    Returns:
        (is_valid, errors): True with an empty list when every populated
        field is in range, otherwise False with one message per violation.
    """
    errors = []

    # audio_encoding is the one genuinely required field.
    if not getattr(audio_config, 'audio_encoding', None):
        errors.append("audio_encoding is required")

    # Restrict sample rate to the rates commonly accepted by the API.
    sample_rate = getattr(audio_config, 'sample_rate_hertz', None)
    if sample_rate:
        valid_rates = [8000, 16000, 22050, 24000, 32000, 44100, 48000]
        if sample_rate not in valid_rates:
            errors.append(f"sample_rate_hertz must be one of {valid_rates}, got {sample_rate}")

    # Speaking rate: documented range 0.25-4.0.
    rate = getattr(audio_config, 'speaking_rate', None)
    if rate and not (0.25 <= rate <= 4.0):
        errors.append(f"speaking_rate must be between 0.25 and 4.0, got {rate}")

    # Pitch: documented range -20.0 to 20.0 semitones.
    pitch = getattr(audio_config, 'pitch', None)
    if pitch and not (-20.0 <= pitch <= 20.0):
        errors.append(f"pitch must be between -20.0 and 20.0, got {pitch}")

    # Volume gain: documented range -96.0 to 16.0 dB.
    volume = getattr(audio_config, 'volume_gain_db', None)
    if volume and not (-96.0 <= volume <= 16.0):
        errors.append(f"volume_gain_db must be between -96.0 and 16.0, got {volume}")

    return len(errors) == 0, errors

def validate_voice_selection(voice: "VoiceSelectionParams") -> tuple[bool, list[str]]:
    """Validate voice selection parameters.

    Checks that a required BCP-47 language code is present and that at most
    one voice-selection mechanism is used. The annotation is quoted so the
    function also works when VoiceSelectionParams is not imported.

    Args:
        voice: VoiceSelectionParams, or any object exposing the same attributes.

    Returns:
        (is_valid, errors): True with an empty list on success, otherwise
        False with one message per violation.
    """
    errors = []

    # language_code is required and must look like BCP-47 (e.g. "en-US").
    lang_code = getattr(voice, 'language_code', None)
    if not lang_code:
        errors.append("language_code is required")
    elif '-' not in lang_code or len(lang_code) < 2:
        errors.append(f"language_code should be in BCP-47 format (e.g., 'en-US'), got '{lang_code}'")

    # name, custom_voice and voice_clone are mutually exclusive selectors.
    specified_count = sum(
        bool(getattr(voice, field, None))
        for field in ('name', 'custom_voice', 'voice_clone')
    )
    if specified_count > 1:
        errors.append("Only one of 'name', 'custom_voice', or 'voice_clone' should be specified")

    return len(errors) == 0, errors

# Usage examples
# (speaking_rate=1.5 and pitch=2.0 are both within the documented ranges,
# so this configuration passes validation)
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.MP3,
    sample_rate_hertz=22050,
    speaking_rate=1.5,
    pitch=2.0
)

is_valid, validation_errors = validate_audio_config(audio_config)
if not is_valid:
    print(f"Audio config validation errors: {validation_errors}")

Configuration Builders

class ConfigurationBuilder:
    """Helper class for building complex configurations.

    Each builder is a static factory returning a ready-to-use configuration
    object tuned for a common scenario (quality, streaming, mobile,
    technical narration, multi-voice conversation).
    """

    @staticmethod
    def build_high_quality_config() -> AudioConfig:
        """Build high-quality audio configuration (lossless, 48 kHz)."""
        return AudioConfig(
            audio_encoding=AudioEncoding.LINEAR16,
            sample_rate_hertz=48000,
            speaking_rate=0.95,
            pitch=0.0,
            volume_gain_db=1.0,
        )

    @staticmethod
    def build_streaming_config() -> AudioConfig:
        """Build streaming-optimized audio configuration (compressed Opus)."""
        return AudioConfig(
            audio_encoding=AudioEncoding.OGG_OPUS,
            sample_rate_hertz=24000,
            speaking_rate=1.1,
            volume_gain_db=0.0,
        )

    @staticmethod
    def build_mobile_config() -> AudioConfig:
        """Build mobile-optimized audio configuration (small MP3, handset profile)."""
        return AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=16000,
            speaking_rate=1.2,
            effects_profile_id=["handset-class-device"],
        )

    @staticmethod
    def build_tech_voice_with_pronunciations(language_code: str = "en-US") -> VoiceSelectionParams:
        """Build voice configuration optimized for technical content.

        Attaches IPA pronunciations for common technical acronyms to a
        Neural2 voice derived from the given language code.
        """
        ipa_by_phrase = (
            ("API", "ˌeɪ piː ˈaɪ"),
            ("JSON", "ˈdʒeɪ sɒn"),
            ("HTTP", "ˌeɪʧ tiː tiː ˈpiː"),
            ("SQL", "ˈsiː kwəl"),
        )
        tech_pronunciations = CustomPronunciations(
            pronunciations=[
                CustomPronunciationParams(
                    phrase=phrase,
                    ipa=ipa,
                    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA,
                )
                for phrase, ipa in ipa_by_phrase
            ]
        )

        return VoiceSelectionParams(
            language_code=language_code,
            name=f"{language_code}-Neural2-A",
            custom_pronunciations=tech_pronunciations,
        )

    @staticmethod
    def build_conversation_voices() -> list[VoiceSelectionParams]:
        """Build multiple voices for conversation synthesis."""
        speaker_specs = (
            ("en-US-Neural2-A", SsmlVoiceGender.FEMALE),   # female voice
            ("en-US-Neural2-C", SsmlVoiceGender.MALE),     # male voice
            ("en-US-Neural2-F", SsmlVoiceGender.NEUTRAL),  # neutral voice
        )
        return [
            VoiceSelectionParams(
                language_code="en-US",
                name=voice_name,
                ssml_gender=gender,
            )
            for voice_name, gender in speaker_specs
        ]

# Usage examples — each builder returns an independent configuration object
high_quality_audio = ConfigurationBuilder.build_high_quality_config()
streaming_audio = ConfigurationBuilder.build_streaming_config()
mobile_audio = ConfigurationBuilder.build_mobile_config()
tech_voice = ConfigurationBuilder.build_tech_voice_with_pronunciations("en-US")
conversation_voices = ConfigurationBuilder.build_conversation_voices()

Configuration Templates

class ConfigurationTemplates:
    """Pre-defined configuration templates for common use cases.

    Each template is a dict with a 'voice' (VoiceSelectionParams) and an
    'audio' (AudioConfig) entry; get_template() resolves a template by
    name, falling back to AUDIOBOOK for unknown names.
    """
    
    AUDIOBOOK = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Wavenet-A"
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=22050,
            speaking_rate=0.9,
            volume_gain_db=2.0
        )
    }
    
    PODCAST = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-C"
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=44100,
            speaking_rate=1.0,
            effects_profile_id=["large-home-entertainment-class-device"]
        )
    }
    
    NEWS_BROADCAST = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-D",
            ssml_gender=SsmlVoiceGender.MALE
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.LINEAR16,
            sample_rate_hertz=24000,
            speaking_rate=1.1,
            pitch=-1.0
        )
    }
    
    EDUCATIONAL = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-A"
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=22050,
            speaking_rate=0.95,
            pitch=1.0
        )
    }
    
    TELEPHONY = {
        'voice': VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Standard-C"
        ),
        'audio': AudioConfig(
            audio_encoding=AudioEncoding.MULAW,
            sample_rate_hertz=8000,
            speaking_rate=1.2,
            effects_profile_id=["telephony-class-application"]
        )
    }
    
    @classmethod
    def get_template(cls, template_name: str) -> dict:
        """Get configuration template by name (case-insensitive).

        Unknown names fall back to the AUDIOBOOK template.
        """
        template_map = {
            'audiobook': cls.AUDIOBOOK,
            'podcast': cls.PODCAST, 
            'news': cls.NEWS_BROADCAST,
            'educational': cls.EDUCATIONAL,
            'telephony': cls.TELEPHONY
        }
        
        return template_map.get(template_name.lower(), cls.AUDIOBOOK)
    
    @classmethod
    def create_request_from_template(cls, template_name: str, text: str) -> 'SynthesizeSpeechRequest':
        """Create synthesis request from template.

        Args:
            template_name: One of the names accepted by get_template().
            text: Plain text to synthesize.

        Returns:
            A SynthesizeSpeechRequest populated from the template.
        """
        # BUG FIX: the original referenced `texttospeech.SynthesizeSpeechRequest`
        # but no `texttospeech` module is ever imported on this page; import the
        # request class explicitly instead.
        from google.cloud.texttospeech import SynthesizeSpeechRequest

        template = cls.get_template(template_name)
        
        return SynthesizeSpeechRequest(
            input=SynthesisInput(text=text),
            voice=template['voice'],
            audio_config=template['audio']
        )

# Usage examples
# 'audiobook' resolves to ConfigurationTemplates.AUDIOBOOK; unknown names
# also fall back to the audiobook template.
audiobook_config = ConfigurationTemplates.get_template('audiobook')
podcast_request = ConfigurationTemplates.create_request_from_template(
    'podcast', 
    "Welcome to our technology podcast!"
)

Best Practices for Configuration

Configuration Guidelines

class ConfigurationBestPractices:
    """Best practices for Text-to-Speech configuration.

    Static helpers that recommend sample rates per encoding/use case and
    rewrite (voice, audio) configuration pairs for either minimal latency
    or maximum quality.
    """
    
    @staticmethod
    def recommend_sample_rate(audio_encoding: "AudioEncoding", use_case: str) -> int:
        """Recommend optimal sample rate for encoding and use case.

        Args:
            audio_encoding: AudioEncoding enum value.
            use_case: One of 'high_quality', 'standard', 'streaming',
                'mobile', or 'telephony' (availability varies by encoding).

        Returns:
            Recommended sample rate in Hz; 22050 when no recommendation
            exists for the (encoding, use_case) pair.
        """
        recommendations = {
            AudioEncoding.LINEAR16: {
                'high_quality': 48000,
                'standard': 24000,
                'streaming': 22050,
                'mobile': 16000
            },
            AudioEncoding.MP3: {
                'high_quality': 44100,
                'standard': 22050,
                'streaming': 22050,
                'mobile': 16000
            },
            AudioEncoding.OGG_OPUS: {
                'high_quality': 48000,
                'standard': 24000, 
                'streaming': 24000,
                'mobile': 16000
            },
            AudioEncoding.MULAW: {
                'telephony': 8000
            },
            AudioEncoding.ALAW: {
                'telephony': 8000
            }
        }
        
        encoding_rec = recommendations.get(audio_encoding, {})
        return encoding_rec.get(use_case, 22050)  # Default fallback
    
    @staticmethod
    def optimize_for_latency(voice_config: "VoiceSelectionParams", 
                           audio_config: "AudioConfig") -> tuple["VoiceSelectionParams", "AudioConfig"]:
        """Optimize configuration for minimal latency.

        Returns a new (voice, audio) pair using a Standard voice with
        low-latency synthesis enabled and a compressed, low-sample-rate
        audio configuration. The inputs are not mutated.
        """
        
        # BUG FIX: the original built the voice name with
        # language_code.replace('-', '-Standard-A'), which turns "en-US"
        # into "en-Standard-AUS". Append the Standard suffix instead.
        optimized_voice = VoiceSelectionParams(
            language_code=voice_config.language_code,
            name=f"{voice_config.language_code}-Standard-A",
            advanced_voice_options=AdvancedVoiceOptions(
                low_latency_journey_synthesis=True
            )
        )
        
        # Use lower sample rate and compressed format
        optimized_audio = AudioConfig(
            audio_encoding=AudioEncoding.MP3,
            sample_rate_hertz=16000,
            speaking_rate=1.1
        )
        
        return optimized_voice, optimized_audio
    
    @staticmethod
    def optimize_for_quality(voice_config: "VoiceSelectionParams",
                           audio_config: "AudioConfig") -> tuple["VoiceSelectionParams", "AudioConfig"]:
        """Optimize configuration for maximum quality.

        Upgrades the voice to a Neural2 variant unless it is already a
        Neural2/Wavenet voice, and pairs it with lossless high-sample-rate
        audio. The inputs are not mutated.
        """
        
        # Guard against a missing/None name (the original crashed on
        # `'Neural2' not in voice_config.name` when name was None).
        current_name = voice_config.name or ""
        if 'Neural2' not in current_name and 'Wavenet' not in current_name:
            voice_name = voice_config.language_code + '-Neural2-A'  # Default to Neural2
        else:
            voice_name = current_name
        
        optimized_voice = VoiceSelectionParams(
            language_code=voice_config.language_code,
            name=voice_name
        )
        
        # Use uncompressed format with high sample rate
        optimized_audio = AudioConfig(
            audio_encoding=AudioEncoding.LINEAR16,
            sample_rate_hertz=48000,
            speaking_rate=0.95,  # Slightly slower for clarity
            volume_gain_db=1.0
        )
        
        return optimized_voice, optimized_audio

# Usage examples
# Optimize for latency
# NOTE(review): original_voice has no `name` set; optimize_for_quality reads
# voice_config.name, which for a proto message defaults to "" — confirm this
# holds for the installed library version.
original_voice = VoiceSelectionParams(language_code="en-US")
original_audio = AudioConfig(audio_encoding=AudioEncoding.LINEAR16)

fast_voice, fast_audio = ConfigurationBestPractices.optimize_for_latency(
    original_voice, original_audio
)

# Optimize for quality
quality_voice, quality_audio = ConfigurationBestPractices.optimize_for_quality(
    original_voice, original_audio
)

# Get recommended sample rate
recommended_rate = ConfigurationBestPractices.recommend_sample_rate(
    AudioEncoding.MP3, 'streaming'
)

Install with Tessl CLI

npx tessl i tessl/pypi-google-cloud-texttospeech

docs

async-clients.md

configuration-types.md

index.md

long-audio-synthesis.md

speech-synthesis.md

streaming-synthesis.md

voice-management.md

tile.json