Google Cloud Texttospeech API client library for converting text to speech with multiple voices and audio formats
npx @tessl/cli install tessl/pypi-google-cloud-texttospeech@2.29.0The Google Cloud Text-to-Speech API provides advanced text-to-speech capabilities that convert text into natural-sounding speech. The API supports over 380 voices across more than 50 languages and variants, offering both standard and WaveNet neural voices for high-quality audio synthesis.
Key Features:
# Installation
pip install google-cloud-texttospeech
# Package: google-cloud-texttospeech
# Version: 2.29.0
# Main Module: google.cloud.texttospeechfrom google.cloud import texttospeech
# Main client classes
client = texttospeech.TextToSpeechClient()
async_client = texttospeech.TextToSpeechAsyncClient()# Stable API (v1)
from google.cloud import texttospeech_v1
# Beta API (v1beta1) - includes timepoint features
from google.cloud import texttospeech_v1beta1from google.cloud.texttospeech import (
TextToSpeechClient,
AudioConfig,
AudioEncoding,
SynthesisInput,
VoiceSelectionParams,
SsmlVoiceGender,
SynthesizeSpeechRequest,
SynthesizeSpeechResponse
)from google.cloud import texttospeech
# Initialize the client
client = texttospeech.TextToSpeechClient()

# Configure the synthesis input
synthesis_input = texttospeech.SynthesisInput(text="Hello, World!")

# Select voice parameters
voice = texttospeech.VoiceSelectionParams(
    language_code="en-US",
    ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
)

# Configure audio output
audio_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3,
)

# Create synthesis request
request = texttospeech.SynthesizeSpeechRequest(
    input=synthesis_input,
    voice=voice,
    audio_config=audio_config,
)

# Perform the text-to-speech synthesis
response = client.synthesize_speech(request=request)

# Save the synthesized audio to a file (opened in binary mode because
# response.audio_content is written as-is, without text encoding)
with open("output.mp3", "wb") as out:
    out.write(response.audio_content)
print("Audio content written to file 'output.mp3'")The API provides four main client classes for different use cases:
Basic text-to-speech synthesis with support for plain text and SSML input.
# Quick synthesis example
response = client.synthesize_speech(
input=texttospeech.SynthesisInput(text="Convert this text to speech"),
voice=texttospeech.VoiceSelectionParams(language_code="en-US"),
audio_config=texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.LINEAR16
)
)See: Speech Synthesis for complete synthesis operations documentation.
List and select from available voices with filtering by language and characteristics.
# List all available voices
voices_response = client.list_voices()
for voice in voices_response.voices:
    # Each entry exposes its name and the language codes it is listed under
    print(f"Voice: {voice.name}, Language: {voice.language_codes}")

# List voices for specific language
request = texttospeech.ListVoicesRequest(language_code="en-US")
response = client.list_voices(request=request)See: Voice Management for voice discovery and selection.
Real-time bidirectional streaming for interactive applications.
# Streaming synthesis configuration
config = texttospeech.StreamingSynthesizeConfig(
voice=texttospeech.VoiceSelectionParams(language_code="en-US"),
audio_config=texttospeech.StreamingAudioConfig(
audio_encoding=texttospeech.AudioEncoding.LINEAR16,
sample_rate_hertz=22050
)
)See: Streaming Synthesis for real-time streaming operations.
Generate extended audio content using long-running operations.
from google.cloud.texttospeech_v1.services import text_to_speech_long_audio_synthesize
# Long audio client
long_client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()
# Create long audio request
request = texttospeech.SynthesizeLongAudioRequest(
parent="projects/your-project-id/locations/us-central1",
input=texttospeech.SynthesisInput(text="Very long text content..."),
audio_config=texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.LINEAR16
),
voice=texttospeech.VoiceSelectionParams(language_code="en-US"),
output_gcs_uri="gs://your-bucket/output.wav"
)See: Long Audio Synthesis for extended audio operations.
Comprehensive configuration options for voice selection, audio output, and advanced features.
# Advanced voice configuration
advanced_voice = texttospeech.AdvancedVoiceOptions(
low_latency_journey_synthesis=True
)
# Custom pronunciations
custom_pronunciations = texttospeech.CustomPronunciations(
pronunciations=[
texttospeech.CustomPronunciationParams(
phrase="example",
ipa="ɪɡˈzæmpəl",
phonetic_encoding=texttospeech.CustomPronunciationParams.PhoneticEncoding.IPA
)
]
)See: Configuration Types for all configuration classes and options.
Full async/await support for all Text-to-Speech operations.
import asyncio
from google.cloud import texttospeech
async def synthesize_async() -> bytes:
    """Synthesize a fixed phrase with the async client and return the audio.

    Returns:
        The ``audio_content`` of the synthesis response (MP3-encoded, as
        requested in the audio config below).
    """
    async_client = texttospeech.TextToSpeechAsyncClient()

    request = texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text="Async synthesis"),
        voice=texttospeech.VoiceSelectionParams(language_code="en-US"),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
        ),
    )

    # The async client's call is a coroutine and must be awaited
    response = await async_client.synthesize_speech(request=request)
    return response.audio_content
# Run async operation
audio_data = asyncio.run(synthesize_async())See: Async Clients for asynchronous operation patterns.
# Available audio encoding formats
from google.cloud.texttospeech import AudioEncoding

LINEAR16 = AudioEncoding.LINEAR16  # 16-bit PCM with WAV header
MP3 = AudioEncoding.MP3            # MP3 at 32kbps
OGG_OPUS = AudioEncoding.OGG_OPUS  # Opus in Ogg container
MULAW = AudioEncoding.MULAW        # 8-bit G.711 PCMU/mu-law
ALAW = AudioEncoding.ALAW          # 8-bit G.711 PCMA/A-law
PCM = AudioEncoding.PCM            # 16-bit PCM without header
M4A = AudioEncoding.M4A # M4A formatfrom google.api_core import exceptions
from google.cloud import texttospeech

# `request` is a SynthesizeSpeechRequest built beforehand (see the basic
# usage example earlier in this document).
try:
    client = texttospeech.TextToSpeechClient()
    response = client.synthesize_speech(request=request)
except exceptions.InvalidArgument as e:
    print(f"Invalid request parameters: {e}")
except exceptions.PermissionDenied as e:
    print(f"Permission denied: {e}")
except exceptions.ResourceExhausted as e:
    print(f"Quota exceeded: {e}")
except Exception as e:
    # Catch-all for anything not covered by the specific handlers above
    print(f"Unexpected error: {e}")

# Using beta API for timepoint information
from google.cloud import texttospeech_v1beta1
client = texttospeech_v1beta1.TextToSpeechClient()
request = texttospeech_v1beta1.SynthesizeSpeechRequest(
input=texttospeech_v1beta1.SynthesisInput(
ssml='<speak>Hello <mark name="greeting"/> world!</speak>'
),
voice=texttospeech_v1beta1.VoiceSelectionParams(language_code="en-US"),
audio_config=texttospeech_v1beta1.AudioConfig(
audio_encoding=texttospeech_v1beta1.AudioEncoding.LINEAR16
),
enable_time_pointing=[
texttospeech_v1beta1.SynthesizeSpeechRequest.TimepointType.SSML_MARK
]
)
response = client.synthesize_speech(request=request)
# Response includes timepoints field with timestamp information