CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-google-cloud-texttospeech

Google Cloud Text-to-Speech API client library for converting text to speech with multiple voices and audio formats

Pending
Overview
Eval results
Files

voice-management.mddocs/

Voice Management

Overview

Voice management in the Google Cloud Text-to-Speech API involves discovering, selecting, and configuring voices for speech synthesis. The API provides access to hundreds of voices across multiple languages, including standard voices, high-quality WaveNet neural voices, and custom voice models.

Voice Discovery

Listing All Available Voices

from google.cloud import texttospeech

# Build the synthesis client (authenticates via application default credentials)
client = texttospeech.TextToSpeechClient()

# Fetch the complete voice catalog from the service
voices_response = client.list_voices()

# Emit a short summary for every voice the service offers
for v in voices_response.voices:
    print(f"Voice Name: {v.name}")
    print(f"Language Codes: {v.language_codes}")
    print(f"Gender: {v.ssml_gender}")
    print(f"Natural Sample Rate: {v.natural_sample_rate_hertz} Hz")
    print("---")

Filtering Voices by Language

from google.cloud.texttospeech import ListVoicesRequest

# Ask the server to restrict the listing to a single language
en_request = ListVoicesRequest(language_code="en-US")
response = client.list_voices(request=en_request)

print(f"Found {len(response.voices)} voices for en-US:")
for v in response.voices:
    print(f"- {v.name} ({v.ssml_gender.name})")

# Repeat the listing for several languages and report a count per language
for lang in ["en-US", "es-ES", "fr-FR", "de-DE"]:
    response = client.list_voices(request=ListVoicesRequest(language_code=lang))
    print(f"{lang}: {len(response.voices)} voices")

Voice Information Analysis

def analyze_voice_capabilities():
    """Analyze and categorize available voices.

    Returns a dict with three groupings of voice names:
    'by_language' (per BCP-47 code), 'by_type' (wavenet/neural2/
    standard/other, based on the voice name), and 'by_gender'
    (MALE/FEMALE/NEUTRAL only).
    """
    client = texttospeech.TextToSpeechClient()
    catalog = client.list_voices()

    # Bucket layout is part of the function's contract — keys unchanged
    analysis = {
        'by_language': {},
        'by_type': {'wavenet': [], 'neural2': [], 'standard': [], 'other': []},
        'by_gender': {'MALE': [], 'FEMALE': [], 'NEUTRAL': []}
    }

    # Name substrings mapped to their type bucket, checked in priority order
    type_markers = (('Wavenet', 'wavenet'), ('Neural2', 'neural2'), ('Standard', 'standard'))

    for voice in catalog.voices:
        # A voice may serve several language codes; record it under each
        for code in voice.language_codes:
            analysis['by_language'].setdefault(code, []).append(voice.name)

        # First matching marker wins; anything unrecognized goes to 'other'
        bucket = next((key for marker, key in type_markers if marker in voice.name), 'other')
        analysis['by_type'][bucket].append(voice.name)

        # Genders outside the three tracked values are silently skipped
        gender_key = voice.ssml_gender.name
        if gender_key in analysis['by_gender']:
            analysis['by_gender'][gender_key].append(voice.name)

    return analysis

# Usage: report how many voices fall into each quality family
voice_stats = analyze_voice_capabilities()
print(f"WaveNet voices: {len(voice_stats['by_type']['wavenet'])}")
print(f"Neural2 voices: {len(voice_stats['by_type']['neural2'])}")
print(f"Standard voices: {len(voice_stats['by_type']['standard'])}")

Voice Types and Models

Voice Class Properties

from google.cloud.texttospeech import Voice, SsmlVoiceGender

# A Voice message exposes:
# - name: str - Unique voice identifier (e.g., "en-US-Wavenet-A")
# - language_codes: List[str] - Supported language codes
# - ssml_gender: SsmlVoiceGender - Voice gender
# - natural_sample_rate_hertz: int - Optimal sample rate

def print_voice_details(voice: Voice):
    """Print the identifying attributes of a single Voice object."""
    summary = (
        f"Name: {voice.name}",
        f"Languages: {', '.join(voice.language_codes)}",
        f"Gender: {voice.ssml_gender.name}",
        f"Sample Rate: {voice.natural_sample_rate_hertz} Hz",
    )
    for line in summary:
        print(line)

# Example voice categorization
def categorize_voice(voice_name: str) -> str:
    """Categorize voice by type based on name.

    Checks name markers in priority order; the first marker found in
    the voice name determines the label.
    """
    categories = (
        ("Wavenet", "WaveNet Neural Voice (High Quality)"),
        ("Neural2", "Neural2 Voice (Premium Quality)"),
        ("Standard", "Standard Voice (Basic Quality)"),
        ("Studio", "Studio Voice (Premium)"),
        ("Polyglot", "Polyglot Voice (Multi-language)"),
    )
    for marker, label in categories:
        if marker in voice_name:
            return label
    # No known marker — treat as a custom or special model
    return "Custom or Special Voice"

Voice Quality Comparison

# Voice quality hierarchy (best to standard)
# Keys are ordered best-first; the string values are substrings matched
# against voice names (e.g. "Neural2" matches "en-US-Neural2-A"), so
# consumers must iterate the dict in insertion order.
VOICE_QUALITY_TIERS = {
    "premium": ["Neural2", "Studio", "Journey"],
    "high": ["Wavenet"],
    "standard": ["Standard"],
    "custom": ["Custom"]
}

def get_best_voice_for_language(language_code: str, gender_preference=None):
    """Find the best available voice for a language.

    Walks VOICE_QUALITY_TIERS best-first and returns the first catalog
    voice whose name contains a tier marker; falls back to the first
    listed voice, or None when the language has no voices.
    """
    client = texttospeech.TextToSpeechClient()
    request = texttospeech.ListVoicesRequest(language_code=language_code)
    response = client.list_voices(request=request)

    # Optionally narrow the candidates to the requested gender
    candidates = list(response.voices)
    if gender_preference:
        candidates = [v for v in candidates if v.ssml_gender == gender_preference]

    # Tiers are checked in dict insertion order, i.e. best quality first
    for tier_markers in VOICE_QUALITY_TIERS.values():
        for marker in tier_markers:
            match = next((v for v in candidates if marker in v.name), None)
            if match is not None:
                return match

    # No tier marker matched — return any remaining candidate
    return candidates[0] if candidates else None

# Usage: ask for the best female en-US voice; result may be None
best_voice = get_best_voice_for_language(
    "en-US", 
    texttospeech.SsmlVoiceGender.FEMALE
)
if best_voice:
    print(f"Best voice: {best_voice.name}")

Voice Selection

VoiceSelectionParams Configuration

from google.cloud.texttospeech import VoiceSelectionParams, SsmlVoiceGender

# Basic voice selection by language and gender
voice_params = VoiceSelectionParams(
    language_code="en-US",                    # Required: BCP-47 language code
    ssml_gender=SsmlVoiceGender.FEMALE       # Optional: gender preference
)

# Specific voice selection by name
voice_params = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Wavenet-D"                   # Exact voice model name
)

# Voice selection with custom pronunciations.
# FIX: the CustomPronunciationParams message field is `pronunciation`
# (there is no `ipa` field), and proto-plus enum members use their full
# proto names, e.g. PHONETIC_ENCODING_IPA rather than IPA.
# NOTE(review): some client versions accept custom_pronunciations on
# SynthesisInput rather than VoiceSelectionParams — confirm for your version.
voice_params = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-A",
    custom_pronunciations=texttospeech.CustomPronunciations(
        pronunciations=[
            texttospeech.CustomPronunciationParams(
                phrase="API",
                pronunciation="ˌeɪ piː ˈaɪ",
                phonetic_encoding=texttospeech.CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_IPA
            )
        ]
    )
)

Advanced Voice Selection

from google.cloud.texttospeech import (
    VoiceSelectionParams,
    AdvancedVoiceOptions,
    CustomVoiceParams,
    VoiceCloneParams
)

# Voice with advanced options
# NOTE(review): low_latency_journey_synthesis presumably applies to
# Journey-class voices only — confirm against the current API reference.
voice_params = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-C",
    advanced_voice_options=AdvancedVoiceOptions(
        low_latency_journey_synthesis=True     # Enable low-latency mode
    )
)

# Custom voice model
# The model path identifies a trained custom voice:
# projects/{project}/locations/{location}/models/{model}
voice_params = VoiceSelectionParams(
    language_code="en-US",
    custom_voice=CustomVoiceParams(
        model="projects/your-project/locations/us-central1/models/custom-voice-model"
    )
)

# Voice cloning
# voice_clone_key is the opaque key obtained when the clone was created
voice_params = VoiceSelectionParams(
    language_code="en-US",
    voice_clone=VoiceCloneParams(
        voice_clone_key="your-voice-clone-key"
    )
)

Gender and Language Options

SsmlVoiceGender Enum

from google.cloud.texttospeech import SsmlVoiceGender

# Available gender options
MALE = SsmlVoiceGender.MALE                     # Male voice
FEMALE = SsmlVoiceGender.FEMALE                 # Female voice  
NEUTRAL = SsmlVoiceGender.NEUTRAL               # Gender-neutral voice
UNSPECIFIED = SsmlVoiceGender.SSML_VOICE_GENDER_UNSPECIFIED  # No preference

# Usage in voice selection
def create_voice_by_gender(language: str, gender: SsmlVoiceGender):
    """Build VoiceSelectionParams for a language with the requested gender."""
    selection = VoiceSelectionParams(
        language_code=language,
        ssml_gender=gender
    )
    return selection

# Examples
male_voice = create_voice_by_gender("en-US", SsmlVoiceGender.MALE)
female_voice = create_voice_by_gender("fr-FR", SsmlVoiceGender.FEMALE)
neutral_voice = create_voice_by_gender("de-DE", SsmlVoiceGender.NEUTRAL)

Language Code Examples

# Common language codes for voice selection (BCP-47: language-REGION).
# NOTE: not exhaustive — call list_voices() for the authoritative set.
SUPPORTED_LANGUAGES = {
    "en-US": "English (United States)",
    "en-GB": "English (United Kingdom)", 
    "en-AU": "English (Australia)",
    "es-ES": "Spanish (Spain)",
    "es-MX": "Spanish (Mexico)",
    "fr-FR": "French (France)",
    "fr-CA": "French (Canada)",
    "de-DE": "German (Germany)",
    "it-IT": "Italian (Italy)",
    "pt-BR": "Portuguese (Brazil)",
    "pt-PT": "Portuguese (Portugal)",
    "ja-JP": "Japanese (Japan)",
    "ko-KR": "Korean (South Korea)",
    "zh-CN": "Chinese (Mainland)",
    "zh-TW": "Chinese (Taiwan)",
    "hi-IN": "Hindi (India)",
    "ar-SA": "Arabic (Saudi Arabia)",
    "ru-RU": "Russian (Russia)",
    "nl-NL": "Dutch (Netherlands)",
    "sv-SE": "Swedish (Sweden)",
    "da-DK": "Danish (Denmark)",
    "no-NO": "Norwegian (Norway)",
    "fi-FI": "Finnish (Finland)",
}

def get_voices_for_languages(language_codes: list):
    """Get available voices for multiple languages.

    Issues one ListVoices call per code and returns a mapping of
    language code -> list of voice names.
    """
    client = texttospeech.TextToSpeechClient()

    def _names_for(code):
        # Server-side filtering: only voices for this language are returned
        req = texttospeech.ListVoicesRequest(language_code=code)
        return [v.name for v in client.list_voices(request=req).voices]

    return {code: _names_for(code) for code in language_codes}

Custom Pronunciations

CustomPronunciationParams Configuration

from google.cloud.texttospeech import (
    CustomPronunciations,
    CustomPronunciationParams
)

# IPA pronunciation.
# FIX: the message field is `pronunciation` — the proto has no `ipa` field —
# and proto-plus enum members use full names (PHONETIC_ENCODING_IPA, not IPA).
ipa_pronunciation = CustomPronunciationParams(
    phrase="nuclear",
    pronunciation="ˈnuːkliər",
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_IPA
)

# X-SAMPA pronunciation — same `pronunciation` field, different encoding enum
xsampa_pronunciation = CustomPronunciationParams(
    phrase="often",
    pronunciation="Q:ft@n",  # X-SAMPA notation
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_X_SAMPA
)

# Collection of custom pronunciations
custom_pronunciations = CustomPronunciations(
    pronunciations=[
        CustomPronunciationParams(
            phrase="GitHub",
            pronunciation="ˈɡɪt hʌb",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_IPA
        ),
        CustomPronunciationParams(
            phrase="API",
            pronunciation="ˌeɪ piː ˈaɪ",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_IPA
        ),
        CustomPronunciationParams(
            phrase="OAuth",
            pronunciation="ˈoʊ ɔːθ",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_IPA
        )
    ]
)

Using Custom Pronunciations

def create_voice_with_custom_pronunciations(language_code: str, pronunciations_dict: dict):
    """Create voice selection with custom pronunciations from dictionary.

    Args:
        language_code: BCP-47 language code for the voice.
        pronunciations_dict: mapping of phrase -> IPA pronunciation string.

    Returns:
        VoiceSelectionParams carrying the custom pronunciations.
    """
    # FIX: the message field is `pronunciation` (the proto has no `ipa`
    # field) and the enum member's full name is PHONETIC_ENCODING_IPA.
    pronunciation_params = [
        CustomPronunciationParams(
            phrase=phrase,
            pronunciation=ipa_string,
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_IPA
        )
        for phrase, ipa_string in pronunciations_dict.items()
    ]

    # Bundle the per-phrase entries into a single collection message
    custom_pronunciations = CustomPronunciations(
        pronunciations=pronunciation_params
    )

    # NOTE(review): some client versions accept custom_pronunciations on
    # SynthesisInput rather than VoiceSelectionParams — confirm for your version.
    return VoiceSelectionParams(
        language_code=language_code,
        custom_pronunciations=custom_pronunciations
    )

# Usage example
tech_pronunciations = {
    "JSON": "ˈdʒeɪ sɒn",
    "SQL": "ˈsiː kwəl", 
    "HTTP": "ˌeɪtʃ tiː tiː ˈpiː",
    "URL": "ˌjuː ɑːr ˈɛl",
    "CSS": "ˌsiː ɛs ˈɛs"
}

tech_voice = create_voice_with_custom_pronunciations("en-US", tech_pronunciations)

# Use in synthesis request
# The voice carries the custom pronunciations; phrases in the text that
# match a configured entry are spoken with the supplied phonetic form.
request = texttospeech.SynthesizeSpeechRequest(
    input=texttospeech.SynthesisInput(
        text="We'll use JSON data via HTTP API calls and style with CSS."
    ),
    voice=tech_voice,
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )
)

Voice Filtering and Selection Helpers

Voice Filtering Functions

def filter_voices_by_criteria(language_code: str = None, gender: SsmlVoiceGender = None, 
                             voice_type: str = None):
    """Filter voices by multiple criteria.

    Args:
        language_code: BCP-47 code; when given, the server restricts the
            listing to that language. None lists voices for all languages.
        gender: keep only voices whose ssml_gender equals this value.
        voice_type: substring matched against voice names (e.g. "Wavenet").

    Returns:
        A list of Voice objects satisfying every supplied criterion.
    """
    client = texttospeech.TextToSpeechClient()
    
    # Get voices for language (server-side filter) or all voices
    if language_code:
        request = texttospeech.ListVoicesRequest(language_code=language_code)
        response = client.list_voices(request=request)
    else:
        response = client.list_voices()
    
    filtered_voices = response.voices
    
    # Filter by gender — truthiness check, so a falsy gender value
    # (None or, presumably, the zero-valued UNSPECIFIED) means no filter
    if gender:
        filtered_voices = [v for v in filtered_voices if v.ssml_gender == gender]
    
    # Filter by voice type (simple substring match on the voice name)
    if voice_type:
        filtered_voices = [v for v in filtered_voices if voice_type in v.name]
    
    return filtered_voices

# Usage examples
wavenet_female_voices = filter_voices_by_criteria(
    language_code="en-US",
    gender=SsmlVoiceGender.FEMALE,
    voice_type="Wavenet"
)

neural2_voices = filter_voices_by_criteria(voice_type="Neural2")
male_spanish_voices = filter_voices_by_criteria(
    language_code="es-ES",
    gender=SsmlVoiceGender.MALE
)

Voice Recommendation System

class VoiceRecommender:
    """Intelligent voice recommendation system.

    Wraps a TextToSpeechClient and caches list_voices() results per
    language so repeated recommendations avoid extra API calls.
    """
    
    def __init__(self):
        # One shared client; the voice cache lives for the object's lifetime
        self.client = texttospeech.TextToSpeechClient()
        self._voice_cache = {}
    
    def get_cached_voices(self, language_code: str = None):
        """Get voices with caching for performance.

        Results are cached per language code ("all" when None) and are
        never invalidated for the lifetime of this recommender.
        """
        # NOTE: an empty-string language_code also maps to the "all" key
        cache_key = language_code or "all"
        
        if cache_key not in self._voice_cache:
            if language_code:
                request = texttospeech.ListVoicesRequest(language_code=language_code)
                response = self.client.list_voices(request=request)
            else:
                response = self.client.list_voices()
            self._voice_cache[cache_key] = response.voices
        
        return self._voice_cache[cache_key]
    
    def recommend_voice(self, language_code: str, preferences: dict = None):
        """Recommend best voice based on preferences.

        Recognized preference keys: "gender" (SsmlVoiceGender),
        "sample_rate" (int, exact Hz match), "voice_name" (substring of
        a voice name). Returns None when the language has no voices.
        """
        preferences = preferences or {}
        
        voices = self.get_cached_voices(language_code)
        if not voices:
            return None
        
        # Scoring system: additive weights; an exact-name match dominates
        scored_voices = []
        for voice in voices:
            score = 0
            
            # Quality scoring (by voice family marker in the name)
            if "Neural2" in voice.name:
                score += 100
            elif "Wavenet" in voice.name:
                score += 80
            elif "Standard" in voice.name:
                score += 60
            
            # Gender preference
            if preferences.get("gender") == voice.ssml_gender:
                score += 50
            
            # Sample rate preference (exact match only)
            preferred_rate = preferences.get("sample_rate")
            if preferred_rate and voice.natural_sample_rate_hertz == preferred_rate:
                score += 30
            
            # Name preference (if specific voice requested)
            if preferences.get("voice_name") and preferences["voice_name"] in voice.name:
                score += 200
            
            scored_voices.append((voice, score))
        
        # Return highest scored voice (stable sort keeps catalog order on ties)
        scored_voices.sort(key=lambda x: x[1], reverse=True)
        return scored_voices[0][0] if scored_voices else None
    
    def get_voice_alternatives(self, primary_voice_name: str, count: int = 3):
        """Get alternative voices similar to the primary voice.

        Returns up to `count` same-language voices, preferring those with
        the same gender and voice family as the primary voice. Returns []
        when the name has no "xx-YY" language prefix to extract.
        """
        # Extract language from primary voice name, e.g. "en-US" from "en-US-Wavenet-D"
        lang_parts = primary_voice_name.split("-")
        if len(lang_parts) >= 2:
            language_code = f"{lang_parts[0]}-{lang_parts[1]}"
        else:
            return []
        
        voices = self.get_cached_voices(language_code)
        
        # Find similar voices (same type and gender if possible)
        primary_voice = next((v for v in voices if v.name == primary_voice_name), None)
        if not primary_voice:
            return voices[:count]
        
        similar_voices = []
        for voice in voices:
            if (voice.name != primary_voice_name and 
                voice.ssml_gender == primary_voice.ssml_gender):
                
                # Prefer same voice type: same-family voices go to the front
                if any(vtype in voice.name and vtype in primary_voice_name 
                      for vtype in ["Neural2", "Wavenet", "Standard"]):
                    similar_voices.insert(0, voice)
                else:
                    similar_voices.append(voice)
        
        return similar_voices[:count]

# Usage
recommender = VoiceRecommender()

# Get recommendation with preferences
# (a gender match adds 50 points, an exact sample-rate match adds 30)
preferences = {
    "gender": SsmlVoiceGender.FEMALE,
    "sample_rate": 24000
}
recommended_voice = recommender.recommend_voice("en-US", preferences)

# Get alternatives to a specific voice (same language, similar gender/type)
alternatives = recommender.get_voice_alternatives("en-US-Wavenet-D", count=5)

Voice Testing and Comparison

Voice Comparison Tool

def compare_voices(text: str, voice_names: list, output_dir: str = "voice_comparison"):
    """Generate audio samples for voice comparison.

    Synthesizes `text` once per voice in `voice_names`, writing one MP3
    per voice into `output_dir` (created if missing).

    Returns:
        A list of dicts, one per voice: on success keys voice_name,
        file_path, success=True, audio_size (bytes); on failure
        success=False plus the stringified error.
    """
    import os
    
    client = texttospeech.TextToSpeechClient()
    os.makedirs(output_dir, exist_ok=True)
    
    results = []
    
    for voice_name in voice_names:
        # Extract language code from voice name (e.g. "en-US" from
        # "en-US-Wavenet-A"); fall back to en-US for unprefixed names
        lang_parts = voice_name.split("-")
        language_code = f"{lang_parts[0]}-{lang_parts[1]}" if len(lang_parts) >= 2 else "en-US"
        
        try:
            request = texttospeech.SynthesizeSpeechRequest(
                input=texttospeech.SynthesisInput(text=text),
                voice=VoiceSelectionParams(
                    language_code=language_code,
                    name=voice_name
                ),
                audio_config=texttospeech.AudioConfig(
                    audio_encoding=texttospeech.AudioEncoding.MP3
                )
            )
            
            response = client.synthesize_speech(request=request)
            
            # Save audio file; dashes replaced so names stay filesystem-friendly
            filename = f"{voice_name.replace('-', '_')}.mp3"
            filepath = os.path.join(output_dir, filename)
            
            with open(filepath, "wb") as f:
                f.write(response.audio_content)
            
            results.append({
                "voice_name": voice_name,
                "file_path": filepath,
                "success": True,
                "audio_size": len(response.audio_content)
            })
            
        except Exception as e:
            # Best-effort comparison: record the failure and continue with
            # the remaining voices rather than aborting the whole run
            results.append({
                "voice_name": voice_name,
                "file_path": None,
                "success": False,
                "error": str(e)
            })
    
    return results

# Usage: synthesize the same sentence with several voice families
test_voices = [
    "en-US-Neural2-A",
    "en-US-Neural2-C",
    "en-US-Wavenet-A",
    "en-US-Wavenet-D",
    "en-US-Standard-A"
]

comparison_results = compare_voices(
    "Hello, this is a test of different voice qualities and characteristics.",
    test_voices
)

# One line per voice: a check mark with MP3 size, or a cross with the error
for result in comparison_results:
    if result["success"]:
        print(f"✓ {result['voice_name']}: {result['audio_size']} bytes")
    else:
        print(f"✗ {result['voice_name']}: {result['error']}")

Voice Quality Assessment

def assess_voice_quality(voice_name: str) -> dict:
    """Assess voice quality characteristics based on name and properties.

    Starts from a neutral baseline profile and overrides it with the
    profile of the first voice-family marker found in the name.
    """
    # Baseline used when no known family marker appears in the name
    assessment = {
        "voice_name": voice_name,
        "quality_tier": "unknown",
        "naturalness": "medium",
        "recommended_use": "general",
        "latency": "medium",
        "cost": "medium"
    }

    # Per-family overrides, checked in the same priority order as before
    profiles = (
        ("Neural2", {
            "quality_tier": "premium",
            "naturalness": "very_high",
            "recommended_use": "professional_content",
            "latency": "medium",
            "cost": "high"
        }),
        ("Wavenet", {
            "quality_tier": "high",
            "naturalness": "high",
            "recommended_use": "content_creation",
            "latency": "medium",
            "cost": "medium_high"
        }),
        ("Standard", {
            "quality_tier": "basic",
            "naturalness": "medium",
            "recommended_use": "notifications",
            "latency": "low",
            "cost": "low"
        }),
        ("Studio", {
            "quality_tier": "premium",
            "naturalness": "very_high",
            "recommended_use": "audiobooks",
            "latency": "high",
            "cost": "high"
        }),
    )
    for marker, overrides in profiles:
        if marker in voice_name:
            assessment.update(overrides)
            break  # mirror the original if/elif chain: first match wins

    return assessment

# Assess multiple voices and print a one-line summary for each
voice_assessments = [
    assess_voice_quality("en-US-Neural2-A"),
    assess_voice_quality("en-US-Wavenet-D"),
    assess_voice_quality("en-US-Standard-B")
]

for assessment in voice_assessments:
    print(f"{assessment['voice_name']}: {assessment['quality_tier']} quality, "
          f"{assessment['naturalness']} naturalness, {assessment['cost']} cost")

Install with Tessl CLI

npx tessl i tessl/pypi-google-cloud-texttospeech

docs

async-clients.md

configuration-types.md

index.md

long-audio-synthesis.md

speech-synthesis.md

streaming-synthesis.md

voice-management.md

tile.json