Google Cloud Text-to-Speech API client library for converting text to speech with multiple voices and audio formats.
—
Voice management in the Google Cloud Text-to-Speech API involves discovering, selecting, and configuring voices for speech synthesis. The API provides access to hundreds of voices across multiple languages, including standard voices, high-quality WaveNet neural voices, and custom voice models.
from google.cloud import texttospeech

# Initialize client
client = texttospeech.TextToSpeechClient()

# List all voices
response = client.list_voices()

# Iterate through available voices
for voice in response.voices:
    print(f"Voice Name: {voice.name}")
    print(f"Language Codes: {voice.language_codes}")
    print(f"Gender: {voice.ssml_gender}")
    print(f"Natural Sample Rate: {voice.natural_sample_rate_hertz} Hz")
    print("---")

from google.cloud.texttospeech import ListVoicesRequest

# List voices for specific language
request = ListVoicesRequest(language_code="en-US")
response = client.list_voices(request=request)
print(f"Found {len(response.voices)} voices for en-US:")
for voice in response.voices:
    print(f"- {voice.name} ({voice.ssml_gender.name})")

# List voices for multiple languages
languages = ["en-US", "es-ES", "fr-FR", "de-DE"]
for lang in languages:
    request = ListVoicesRequest(language_code=lang)
    response = client.list_voices(request=request)
    print(f"{lang}: {len(response.voices)} voices")


def analyze_voice_capabilities():
    """Analyze and categorize available voices.

    Fetches every voice from the Text-to-Speech API and groups them.

    Returns:
        dict with three groupings:
            'by_language': language code -> list of voice names
            'by_type': 'wavenet'/'neural2'/'standard'/'other' -> voice names
                       (type inferred from the substring in the voice name)
            'by_gender': 'MALE'/'FEMALE'/'NEUTRAL' -> voice names
                         (voices with an unspecified gender are skipped)
    """
    client = texttospeech.TextToSpeechClient()
    response = client.list_voices()

    voice_analysis = {
        'by_language': {},
        'by_type': {'wavenet': [], 'neural2': [], 'standard': [], 'other': []},
        'by_gender': {'MALE': [], 'FEMALE': [], 'NEUTRAL': []}
    }

    for voice in response.voices:
        # Group by language (one voice may support several language codes)
        for lang_code in voice.language_codes:
            voice_analysis['by_language'].setdefault(lang_code, []).append(voice.name)

        # Group by voice type, inferred from the voice name
        if 'Wavenet' in voice.name:
            voice_analysis['by_type']['wavenet'].append(voice.name)
        elif 'Neural2' in voice.name:
            voice_analysis['by_type']['neural2'].append(voice.name)
        elif 'Standard' in voice.name:
            voice_analysis['by_type']['standard'].append(voice.name)
        else:
            voice_analysis['by_type']['other'].append(voice.name)

        # Group by gender; only the three listed genders are collected
        gender = voice.ssml_gender.name
        if gender in voice_analysis['by_gender']:
            voice_analysis['by_gender'][gender].append(voice.name)

    return voice_analysis
# Usage
voice_stats = analyze_voice_capabilities()
print(f"WaveNet voices: {len(voice_stats['by_type']['wavenet'])}")
print(f"Neural2 voices: {len(voice_stats['by_type']['neural2'])}")
print(f"Standard voices: {len(voice_stats['by_type']['standard'])}")

from google.cloud.texttospeech import Voice, SsmlVoiceGender

# Voice object contains:
# - name: str - Unique voice identifier (e.g., "en-US-Wavenet-A")
# - language_codes: List[str] - Supported language codes
# - ssml_gender: SsmlVoiceGender - Voice gender
# - natural_sample_rate_hertz: int - Optimal sample rate


# Access voice properties
def print_voice_details(voice: Voice) -> None:
    """Print the key attributes of a Voice object to stdout."""
    print(f"Name: {voice.name}")
    print(f"Languages: {', '.join(voice.language_codes)}")
    print(f"Gender: {voice.ssml_gender.name}")
    print(f"Sample Rate: {voice.natural_sample_rate_hertz} Hz")


# Example voice categorization
def categorize_voice(voice_name: str) -> str:
    """Categorize a voice by type based on its name.

    Args:
        voice_name: Voice model identifier, e.g. "en-US-Wavenet-A".

    Returns:
        Human-readable description of the voice's quality tier.
        Note: "Wavenet" is checked first, so it wins over other matches.
    """
    if "Wavenet" in voice_name:
        return "WaveNet Neural Voice (High Quality)"
    elif "Neural2" in voice_name:
        return "Neural2 Voice (Premium Quality)"
    elif "Standard" in voice_name:
        return "Standard Voice (Basic Quality)"
    elif "Studio" in voice_name:
        return "Studio Voice (Premium)"
    elif "Polyglot" in voice_name:
        return "Polyglot Voice (Multi-language)"
    else:
        return "Custom or Special Voice"


# Voice quality hierarchy (best to standard); insertion order matters —
# consumers iterate the dict from the best tier down.
VOICE_QUALITY_TIERS = {
    "premium": ["Neural2", "Studio", "Journey"],
    "high": ["Wavenet"],
    "standard": ["Standard"],
    "custom": ["Custom"]
}
def get_best_voice_for_language(language_code: str, gender_preference=None):
    """Find the best available voice for a language.

    Args:
        language_code: BCP-47 language code, e.g. "en-US".
        gender_preference: Optional SsmlVoiceGender to filter by.

    Returns:
        The highest-quality matching Voice (walking VOICE_QUALITY_TIERS
        from best to worst), the first available voice if no tier
        matches, or None if no voices are available at all.
    """
    client = texttospeech.TextToSpeechClient()
    request = texttospeech.ListVoicesRequest(language_code=language_code)
    response = client.list_voices(request=request)

    # Filter by gender if specified
    voices = response.voices
    if gender_preference:
        voices = [v for v in voices if v.ssml_gender == gender_preference]

    # Walk quality tiers best-first; dict insertion order gives the ranking
    for tier_names in VOICE_QUALITY_TIERS.values():
        for tier_name in tier_names:
            for voice in voices:
                if tier_name in voice.name:
                    return voice

    # Return first available if no premium voices found
    return voices[0] if voices else None


# Usage
best_voice = get_best_voice_for_language(
    "en-US",
    texttospeech.SsmlVoiceGender.FEMALE
)
if best_voice:
    print(f"Best voice: {best_voice.name}")

from google.cloud.texttospeech import VoiceSelectionParams, SsmlVoiceGender

# Basic voice selection by language and gender
voice_params = VoiceSelectionParams(
    language_code="en-US",              # Required: BCP-47 language code
    ssml_gender=SsmlVoiceGender.FEMALE  # Optional: gender preference
)

# Specific voice selection by name
voice_params = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Wavenet-D"  # Exact voice model name
)

# Voice selection with custom pronunciations
voice_params = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-A",
    custom_pronunciations=texttospeech.CustomPronunciations(
        pronunciations=[
            texttospeech.CustomPronunciationParams(
                phrase="API",
                ipa="ˌeɪ piː ˈaɪ",
                phonetic_encoding=texttospeech.CustomPronunciationParams.PhoneticEncoding.IPA
            )
        ]
    )
)

from google.cloud.texttospeech import (
    VoiceSelectionParams,
    AdvancedVoiceOptions,
    CustomVoiceParams,
    VoiceCloneParams
)

# Voice with advanced options
voice_params = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-C",
    advanced_voice_options=AdvancedVoiceOptions(
        low_latency_journey_synthesis=True  # Enable low-latency mode
    )
)
# Custom voice model
voice_params = VoiceSelectionParams(
    language_code="en-US",
    custom_voice=CustomVoiceParams(
        model="projects/your-project/locations/us-central1/models/custom-voice-model"
    )
)

# Voice cloning
voice_params = VoiceSelectionParams(
    language_code="en-US",
    voice_clone=VoiceCloneParams(
        voice_clone_key="your-voice-clone-key"
    )
)

from google.cloud.texttospeech import SsmlVoiceGender

# Available gender options
MALE = SsmlVoiceGender.MALE          # Male voice
FEMALE = SsmlVoiceGender.FEMALE      # Female voice
NEUTRAL = SsmlVoiceGender.NEUTRAL    # Gender-neutral voice
UNSPECIFIED = SsmlVoiceGender.SSML_VOICE_GENDER_UNSPECIFIED  # No preference


# Usage in voice selection
def create_voice_by_gender(language: str, gender: SsmlVoiceGender):
    """Build VoiceSelectionParams for a language and gender preference."""
    return VoiceSelectionParams(
        language_code=language,
        ssml_gender=gender
    )


# Examples
male_voice = create_voice_by_gender("en-US", SsmlVoiceGender.MALE)
female_voice = create_voice_by_gender("fr-FR", SsmlVoiceGender.FEMALE)
neutral_voice = create_voice_by_gender("de-DE", SsmlVoiceGender.NEUTRAL)

# Common language codes for voice selection
SUPPORTED_LANGUAGES = {
    "en-US": "English (United States)",
    "en-GB": "English (United Kingdom)",
    "en-AU": "English (Australia)",
    "es-ES": "Spanish (Spain)",
    "es-MX": "Spanish (Mexico)",
    "fr-FR": "French (France)",
    "fr-CA": "French (Canada)",
    "de-DE": "German (Germany)",
    "it-IT": "Italian (Italy)",
    "pt-BR": "Portuguese (Brazil)",
    "pt-PT": "Portuguese (Portugal)",
    "ja-JP": "Japanese (Japan)",
    "ko-KR": "Korean (South Korea)",
    "zh-CN": "Chinese (Mainland)",
    "zh-TW": "Chinese (Taiwan)",
    "hi-IN": "Hindi (India)",
    "ar-SA": "Arabic (Saudi Arabia)",
    "ru-RU": "Russian (Russia)",
    "nl-NL": "Dutch (Netherlands)",
    "sv-SE": "Swedish (Sweden)",
    "da-DK": "Danish (Denmark)",
    "no-NO": "Norwegian (Norway)",
    "fi-FI": "Finnish (Finland)",
}
def get_voices_for_languages(language_codes: list):
    """Get available voices for multiple languages.

    Args:
        language_codes: Iterable of BCP-47 language codes.

    Returns:
        dict mapping each language code to a list of voice names.
    """
    client = texttospeech.TextToSpeechClient()
    results = {}
    for lang_code in language_codes:
        request = texttospeech.ListVoicesRequest(language_code=lang_code)
        response = client.list_voices(request=request)
        results[lang_code] = [voice.name for voice in response.voices]
    return results


from google.cloud.texttospeech import (
    CustomPronunciations,
    CustomPronunciationParams
)

# IPA pronunciation
ipa_pronunciation = CustomPronunciationParams(
    phrase="nuclear",
    ipa="ˈnuːkliər",
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
)

# X-SAMPA pronunciation
xsampa_pronunciation = CustomPronunciationParams(
    phrase="often",
    ipa="Q:ft@n",  # X-SAMPA notation
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.X_SAMPA
)

# Collection of custom pronunciations
custom_pronunciations = CustomPronunciations(
    pronunciations=[
        CustomPronunciationParams(
            phrase="GitHub",
            ipa="ˈɡɪt hʌb",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="API",
            ipa="ˌeɪ piː ˈaɪ",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="OAuth",
            ipa="ˈoʊ ɔːθ",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        )
    ]
)


def create_voice_with_custom_pronunciations(language_code: str, pronunciations_dict: dict):
    """Create voice selection with custom pronunciations from a dictionary.

    Args:
        language_code: BCP-47 language code.
        pronunciations_dict: Mapping of phrase -> IPA pronunciation string.

    Returns:
        VoiceSelectionParams carrying the custom pronunciations.
    """
    # Convert the dictionary into CustomPronunciationParams (IPA encoding)
    pronunciation_params = [
        CustomPronunciationParams(
            phrase=phrase,
            ipa=ipa_text,
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        )
        for phrase, ipa_text in pronunciations_dict.items()
    ]

    # Return voice selection with the custom pronunciations attached
    return VoiceSelectionParams(
        language_code=language_code,
        custom_pronunciations=CustomPronunciations(
            pronunciations=pronunciation_params
        )
    )
# Usage example
tech_pronunciations = {
    "JSON": "ˈdʒeɪ sɒn",
    "SQL": "ˈsiː kwəl",
    "HTTP": "ˌeɪtʃ tiː tiː ˈpiː",
    "URL": "ˌjuː ɑːr ˈɛl",
    "CSS": "ˌsiː ɛs ˈɛs"
}
tech_voice = create_voice_with_custom_pronunciations("en-US", tech_pronunciations)

# Use in synthesis request
request = texttospeech.SynthesizeSpeechRequest(
    input=texttospeech.SynthesisInput(
        text="We'll use JSON data via HTTP API calls and style with CSS."
    ),
    voice=tech_voice,
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )
)


def filter_voices_by_criteria(language_code: str = None, gender: SsmlVoiceGender = None,
                              voice_type: str = None):
    """Filter voices by multiple optional criteria.

    Args:
        language_code: BCP-47 code to restrict the listing; None lists all.
        gender: SsmlVoiceGender to keep; None keeps every gender.
        voice_type: Substring matched against the voice name
            (e.g. "Wavenet", "Neural2"); None keeps every type.

    Returns:
        List of Voice objects matching every provided criterion.
    """
    client = texttospeech.TextToSpeechClient()

    # Get voices for language or all voices
    if language_code:
        request = texttospeech.ListVoicesRequest(language_code=language_code)
        response = client.list_voices(request=request)
    else:
        response = client.list_voices()
    filtered_voices = response.voices

    # Filter by gender
    if gender:
        filtered_voices = [v for v in filtered_voices if v.ssml_gender == gender]

    # Filter by voice type (substring of the voice name)
    if voice_type:
        filtered_voices = [v for v in filtered_voices if voice_type in v.name]

    return filtered_voices
# Usage examples
wavenet_female_voices = filter_voices_by_criteria(
    language_code="en-US",
    gender=SsmlVoiceGender.FEMALE,
    voice_type="Wavenet"
)
neural2_voices = filter_voices_by_criteria(voice_type="Neural2")
male_spanish_voices = filter_voices_by_criteria(
    language_code="es-ES",
    gender=SsmlVoiceGender.MALE
)


class VoiceRecommender:
    """Intelligent voice recommendation system.

    Caches list_voices results per language to avoid repeated API calls.
    """

    def __init__(self):
        self.client = texttospeech.TextToSpeechClient()
        self._voice_cache = {}  # cache key ("all" or language code) -> voices

    def get_cached_voices(self, language_code: str = None):
        """Get voices with caching for performance.

        Args:
            language_code: Optional BCP-47 code; None fetches all voices.

        Returns:
            Cached list of Voice objects for the requested scope.
        """
        cache_key = language_code or "all"
        if cache_key not in self._voice_cache:
            if language_code:
                request = texttospeech.ListVoicesRequest(language_code=language_code)
                response = self.client.list_voices(request=request)
            else:
                response = self.client.list_voices()
            self._voice_cache[cache_key] = response.voices
        return self._voice_cache[cache_key]

    def recommend_voice(self, language_code: str, preferences: dict = None):
        """Recommend the best voice based on preferences.

        Args:
            language_code: BCP-47 language code to search within.
            preferences: Optional dict with any of the keys
                "gender" (SsmlVoiceGender), "sample_rate" (int Hz),
                "voice_name" (substring of a specific voice name).

        Returns:
            The highest-scoring Voice, or None if no voices are available.
        """
        preferences = preferences or {}
        voices = self.get_cached_voices(language_code)
        if not voices:
            return None

        # Score every candidate, then pick the maximum
        scored_voices = []
        for voice in voices:
            score = 0
            # Quality scoring by voice family
            if "Neural2" in voice.name:
                score += 100
            elif "Wavenet" in voice.name:
                score += 80
            elif "Standard" in voice.name:
                score += 60
            # Gender preference
            if preferences.get("gender") == voice.ssml_gender:
                score += 50
            # Sample rate preference
            preferred_rate = preferences.get("sample_rate")
            if preferred_rate and voice.natural_sample_rate_hertz == preferred_rate:
                score += 30
            # Name preference (if a specific voice was requested) dominates
            if preferences.get("voice_name") and preferences["voice_name"] in voice.name:
                score += 200
            scored_voices.append((voice, score))

        # Return highest scored voice
        scored_voices.sort(key=lambda x: x[1], reverse=True)
        return scored_voices[0][0] if scored_voices else None

    def get_voice_alternatives(self, primary_voice_name: str, count: int = 3):
        """Get alternative voices similar to the primary voice.

        Args:
            primary_voice_name: Voice name, e.g. "en-US-Wavenet-D".
            count: Maximum number of alternatives to return.

        Returns:
            Up to `count` voices with the same gender; same-family voices
            (Neural2/Wavenet/Standard) are listed first. Empty list when
            the language cannot be derived from the name.
        """
        # Extract language code from the name ("en-US-Wavenet-D" -> "en-US")
        lang_parts = primary_voice_name.split("-")
        if len(lang_parts) >= 2:
            language_code = f"{lang_parts[0]}-{lang_parts[1]}"
        else:
            return []

        voices = self.get_cached_voices(language_code)

        # Find similar voices (same type and gender if possible)
        primary_voice = next((v for v in voices if v.name == primary_voice_name), None)
        if not primary_voice:
            return voices[:count]

        similar_voices = []
        for voice in voices:
            if (voice.name != primary_voice_name and
                    voice.ssml_gender == primary_voice.ssml_gender):
                # Prefer same voice family by placing those at the front
                if any(vtype in voice.name and vtype in primary_voice_name
                       for vtype in ["Neural2", "Wavenet", "Standard"]):
                    similar_voices.insert(0, voice)
                else:
                    similar_voices.append(voice)
        return similar_voices[:count]
# Usage
recommender = VoiceRecommender()
# Get recommendation with preferences
preferences = {
"gender": SsmlVoiceGender.FEMALE,
"sample_rate": 24000
}
recommended_voice = recommender.recommend_voice("en-US", preferences)
# Get alternatives to a specific voice
alternatives = recommender.get_voice_alternatives("en-US-Wavenet-D", count=5)def compare_voices(text: str, voice_names: list, output_dir: str = "voice_comparison"):
"""Generate audio samples for voice comparison."""
import os
client = texttospeech.TextToSpeechClient()
os.makedirs(output_dir, exist_ok=True)
results = []
for voice_name in voice_names:
# Extract language code from voice name
lang_parts = voice_name.split("-")
language_code = f"{lang_parts[0]}-{lang_parts[1]}" if len(lang_parts) >= 2 else "en-US"
try:
request = texttospeech.SynthesizeSpeechRequest(
input=texttospeech.SynthesisInput(text=text),
voice=VoiceSelectionParams(
language_code=language_code,
name=voice_name
),
audio_config=texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.MP3
)
)
response = client.synthesize_speech(request=request)
# Save audio file
filename = f"{voice_name.replace('-', '_')}.mp3"
filepath = os.path.join(output_dir, filename)
with open(filepath, "wb") as f:
f.write(response.audio_content)
results.append({
"voice_name": voice_name,
"file_path": filepath,
"success": True,
"audio_size": len(response.audio_content)
})
except Exception as e:
results.append({
"voice_name": voice_name,
"file_path": None,
"success": False,
"error": str(e)
})
return results
# Usage
test_voices = [
"en-US-Neural2-A",
"en-US-Neural2-C",
"en-US-Wavenet-A",
"en-US-Wavenet-D",
"en-US-Standard-A"
]
comparison_results = compare_voices(
"Hello, this is a test of different voice qualities and characteristics.",
test_voices
)
for result in comparison_results:
if result["success"]:
print(f"✓ {result['voice_name']}: {result['audio_size']} bytes")
else:
print(f"✗ {result['voice_name']}: {result['error']}")def assess_voice_quality(voice_name: str) -> dict:
"""Assess voice quality characteristics based on name and properties."""
quality_assessment = {
"voice_name": voice_name,
"quality_tier": "unknown",
"naturalness": "medium",
"recommended_use": "general",
"latency": "medium",
"cost": "medium"
}
# Assess based on voice type
if "Neural2" in voice_name:
quality_assessment.update({
"quality_tier": "premium",
"naturalness": "very_high",
"recommended_use": "professional_content",
"latency": "medium",
"cost": "high"
})
elif "Wavenet" in voice_name:
quality_assessment.update({
"quality_tier": "high",
"naturalness": "high",
"recommended_use": "content_creation",
"latency": "medium",
"cost": "medium_high"
})
elif "Standard" in voice_name:
quality_assessment.update({
"quality_tier": "basic",
"naturalness": "medium",
"recommended_use": "notifications",
"latency": "low",
"cost": "low"
})
elif "Studio" in voice_name:
quality_assessment.update({
"quality_tier": "premium",
"naturalness": "very_high",
"recommended_use": "audiobooks",
"latency": "high",
"cost": "high"
})
return quality_assessment
# Assess multiple voices
voice_assessments = [
    assess_voice_quality("en-US-Neural2-A"),
    assess_voice_quality("en-US-Wavenet-D"),
    assess_voice_quality("en-US-Standard-B")
]
for assessment in voice_assessments:
    print(f"{assessment['voice_name']}: {assessment['quality_tier']} quality, "
          f"{assessment['naturalness']} naturalness, {assessment['cost']} cost")

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-google-cloud-texttospeech