CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-google-cloud-texttospeech

Google Cloud Text-to-Speech API client library for converting text to speech with multiple voices and audio formats

Pending
Overview
Eval results
Files

docs/speech-synthesis.md

Speech Synthesis

Overview

Speech synthesis is the core functionality of the Google Cloud Text-to-Speech API, converting text input into natural-sounding speech audio. The API supports both plain text and SSML (Speech Synthesis Markup Language) input with extensive configuration options for voice selection and audio output.

Core Synthesis Operations

Basic Text Synthesis

from google.cloud import texttospeech

# Create the API client (uses Application Default Credentials).
client = texttospeech.TextToSpeechClient()

# A synthesis request has three parts: what to say, which voice to use,
# and how to encode the resulting audio.
synthesis_input = texttospeech.SynthesisInput(text="Hello, this is a text-to-speech demo")
voice_params = texttospeech.VoiceSelectionParams(
    language_code="en-US",
    ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
)
mp3_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3,
)

request = texttospeech.SynthesizeSpeechRequest(
    input=synthesis_input,
    voice=voice_params,
    audio_config=mp3_config,
)

# Call the API; the response carries the synthesized audio.
response = client.synthesize_speech(request=request)

# Access audio data
audio_content = response.audio_content  # bytes

SSML Synthesis

from google.cloud import texttospeech

# SSML input with markup
ssml_text = """
<speak>
    <prosody rate="slow" pitch="+2st">
        Hello, this is spoken slowly with higher pitch.
    </prosody>
    <break time="1s"/>
    <prosody rate="fast" pitch="-2st">
        And this is spoken quickly with lower pitch.
    </prosody>
</speak>
"""

# Build the request from SSML instead of plain text; pin an explicit
# voice model and ask for uncompressed 24 kHz PCM output.
voice_params = texttospeech.VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Wavenet-D",  # Specific voice model
)
pcm_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.LINEAR16,
    sample_rate_hertz=24000,
)
request = texttospeech.SynthesizeSpeechRequest(
    input=texttospeech.SynthesisInput(ssml=ssml_text),
    voice=voice_params,
    audio_config=pcm_config,
)

response = client.synthesize_speech(request=request)

Input Configuration

SynthesisInput Class

from google.cloud.texttospeech import SynthesisInput

# Plain text input
text_input = SynthesisInput(text="Plain text to synthesize")

# SSML input
ssml_input = SynthesisInput(
    ssml='<speak>SSML <emphasis level="strong">markup</emphasis> text</speak>'
)

# Multi-speaker input: MultiSpeakerMarkup is a list of dialogue turns
# (speaker id + text), NOT raw SSML.  Speakers "R", "S", "T" and "U"
# are the aliases supported by the en-US multi-speaker voice.
multi_speaker_input = SynthesisInput(
    multi_speaker_markup=texttospeech.MultiSpeakerMarkup(
        turns=[
            texttospeech.MultiSpeakerMarkup.Turn(text="Hello", speaker="R"),
            texttospeech.MultiSpeakerMarkup.Turn(text="World", speaker="S"),
        ]
    )
)

Advanced Input Options

# Custom pronunciations with synthesis input
from google.cloud.texttospeech import (
    SynthesisInput, 
    CustomPronunciations, 
    CustomPronunciationParams
)

# Define custom pronunciations
custom_pronunciations = CustomPronunciations(
    pronunciations=[
        CustomPronunciationParams(
            phrase="Anthropic",
            ipa="ˌænθrəˈpɪk",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="Claude",
            ipa="klɔːd",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        )
    ]
)

# Use with synthesis
request = texttospeech.SynthesizeSpeechRequest(
    input=SynthesisInput(text="Hello from Anthropic's Claude AI assistant"),
    voice=texttospeech.VoiceSelectionParams(
        language_code="en-US",
        custom_pronunciations=custom_pronunciations
    ),
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )
)

Voice Selection

VoiceSelectionParams Class

from google.cloud.texttospeech import VoiceSelectionParams, SsmlVoiceGender

# Let the API pick any male en-US voice (language_code is the only
# required field; it is a BCP-47 code).
voice = VoiceSelectionParams(language_code="en-US", ssml_gender=SsmlVoiceGender.MALE)

# Pin an exact voice model by name instead of selecting by gender.
voice = VoiceSelectionParams(language_code="en-US", name="en-US-Wavenet-A")

# Point at a trained custom voice model hosted in your project.
voice = VoiceSelectionParams(
    language_code="en-US",
    custom_voice=texttospeech.CustomVoiceParams(
        model="projects/your-project/locations/us-central1/models/your-model"
    ),
)

Advanced Voice Configuration

from google.cloud.texttospeech import (
    VoiceSelectionParams, 
    AdvancedVoiceOptions,
    VoiceCloneParams
)

# Opt in to low-latency synthesis via the advanced options bundle.
low_latency = AdvancedVoiceOptions(low_latency_journey_synthesis=True)
voice = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Wavenet-A",
    advanced_voice_options=low_latency,
)

# Synthesize with a cloned voice, identified by its clone key.
voice = VoiceSelectionParams(
    language_code="en-US",
    voice_clone=VoiceCloneParams(voice_clone_key="your-voice-clone-key"),
)

Audio Configuration

AudioConfig Class

from google.cloud.texttospeech import AudioConfig, AudioEncoding

# MP3 output with every commonly tuned knob spelled out.
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.MP3,  # Required: output format
    sample_rate_hertz=22050,  # Optional: sample rate
    speaking_rate=1.0,  # Optional: speech rate (0.25-4.0)
    pitch=0.0,  # Optional: pitch (-20.0 to 20.0)
    volume_gain_db=0.0,  # Optional: volume gain (-96.0 to 16.0)
)

# Uncompressed 48 kHz PCM, slightly slower and higher-pitched.
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,
    sample_rate_hertz=48000,
    speaking_rate=0.9,
    pitch=2.0,
)

# Ogg/Opus: a compact encoding well suited to streaming playback.
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.OGG_OPUS,
    sample_rate_hertz=48000,
)

Audio Effects and Processing

from google.cloud.texttospeech import AudioConfig, AudioEncoding

# Telephony post-processing profile plus rate/pitch/volume adjustments.
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.MP3,
    effects_profile_id=["telephony-class-application"],  # Audio effects
    speaking_rate=1.2,
    pitch=-2.0,
    volume_gain_db=3.0,
)

# effects_profile_id accepts more than one device profile at a time.
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,
    effects_profile_id=[
        "wearable-class-device",
        "handset-class-device",
    ],
    sample_rate_hertz=16000,
)

Request and Response Types

SynthesizeSpeechRequest Class

from google.cloud.texttospeech import (
    SynthesizeSpeechRequest,
    SynthesisInput,
    VoiceSelectionParams,
    AudioConfig
)

# Minimal request: gender-based voice choice, 22.05 kHz PCM output.
request = SynthesizeSpeechRequest(
    input=SynthesisInput(text="Text to synthesize"),
    voice=VoiceSelectionParams(
        language_code="en-US",
        ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
    ),
    audio_config=AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,
        sample_rate_hertz=22050,
    ),
)

# Richer request: SSML containing a timing <mark/>, a Neural2 voice,
# and a device-specific effects profile.
marked_input = SynthesisInput(
    ssml='<speak>Hello <mark name="greeting"/>world!</speak>'
)
request = SynthesizeSpeechRequest(
    input=marked_input,
    voice=VoiceSelectionParams(language_code="en-US", name="en-US-Neural2-A"),
    audio_config=AudioConfig(
        audio_encoding=AudioEncoding.MP3,
        effects_profile_id=["small-bluetooth-speaker-class-device"],
    ),
)

SynthesizeSpeechResponse Class

from google.cloud.texttospeech import SynthesizeSpeechResponse

# Execute the request; the call blocks until synthesis completes.
response = client.synthesize_speech(request=request)

# The synthesized audio arrives as raw bytes on the response object.
audio_content = response.audio_content    # bytes: synthesized audio data

# Persist the bytes directly; the file format matches the AudioConfig.
with open("output.mp3", "wb") as audio_file:
    audio_file.write(response.audio_content)

# Inspect how much audio came back.
audio_size = len(response.audio_content)
print(f"Generated {audio_size} bytes of audio")

Multi-Speaker Synthesis

MultiSpeakerMarkup Configuration

from google.cloud.texttospeech import (
    SynthesisInput,
    MultiSpeakerMarkup,
    VoiceSelectionParams
)

# MultiSpeakerMarkup models a conversation as an ordered list of turns,
# each with a speaker alias and that speaker's line.  It does NOT accept
# raw SSML; <voice> tags belong in a plain `ssml` input instead.
conversation_turns = [
    MultiSpeakerMarkup.Turn(speaker="R", text="Hello, I'm the first speaker."),
    MultiSpeakerMarkup.Turn(speaker="S", text="And I'm the second speaker."),
    MultiSpeakerMarkup.Turn(speaker="T", text="Together we create a conversation."),
]

# Configure multi-speaker input
multi_speaker_input = SynthesisInput(
    multi_speaker_markup=MultiSpeakerMarkup(turns=conversation_turns)
)

# Create synthesis request; multi-speaker input requires the dedicated
# multi-speaker voice model.
request = texttospeech.SynthesizeSpeechRequest(
    input=multi_speaker_input,
    voice=VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Studio-MultiSpeaker"
    ),
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16
    )
)

Practical Examples

File Processing

import os
from google.cloud import texttospeech

def text_file_to_speech(input_file_path, output_file_path, voice_name=None):
    """Read a UTF-8 text file and write its synthesized speech to disk.

    Args:
        input_file_path: Path of the text file to read.
        output_file_path: Path the MP3 audio is written to.
        voice_name: Optional specific voice model; falls back to
            "en-US-Neural2-A" when omitted.
    """
    client = texttospeech.TextToSpeechClient()

    # Load the entire source text.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        text_content = file.read()

    # Build the request inline: text input, chosen voice, MP3 output.
    request = texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text_content),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name=voice_name or "en-US-Neural2-A",
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
        ),
    )

    response = client.synthesize_speech(request=request)

    # Persist the synthesized audio bytes.
    with open(output_file_path, "wb") as output_file:
        output_file.write(response.audio_content)

    print(f"Audio content written to '{output_file_path}'")

# Usage
text_file_to_speech("input.txt", "output.mp3", "en-US-Wavenet-D")

Batch Processing

from google.cloud import texttospeech
import concurrent.futures

def synthesize_text_batch(texts, output_dir="outputs"):
    """Synthesize multiple texts in parallel.

    Args:
        texts: Iterable of plain-text strings to synthesize.
        output_dir: Directory the MP3 files are written into; created on
            demand if it does not already exist.

    Returns:
        List of written file paths, in the same order as `texts`.
    """
    import os  # local import keeps this snippet self-contained

    client = texttospeech.TextToSpeechClient()

    # Create the output directory up front so parallel workers never
    # race to write into a missing directory.
    os.makedirs(output_dir, exist_ok=True)

    def synthesize_single(text_data):
        # Worker: synthesize one (text, filename) pair to an MP3 file.
        text, filename = text_data

        request = texttospeech.SynthesizeSpeechRequest(
            input=texttospeech.SynthesisInput(text=text),
            voice=texttospeech.VoiceSelectionParams(
                language_code="en-US",
                ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
            ),
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3
            )
        )

        response = client.synthesize_speech(request=request)

        # Bug fix: the path previously contained a literal "(unknown)"
        # and ignored `filename`, so every worker clobbered one file.
        output_path = f"{output_dir}/{filename}.mp3"
        with open(output_path, "wb") as f:
            f.write(response.audio_content)

        return output_path

    # Prepare (text, filename) pairs with stable, index-based names.
    text_data = [(text, f"output_{i}") for i, text in enumerate(texts)]

    # Fan out across a small thread pool; synthesis is I/O-bound, so
    # threads overlap the network waits.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(synthesize_single, text_data))

    return results

# Usage
texts = [
    "First text to synthesize",
    "Second text to synthesize", 
    "Third text to synthesize"
]
output_files = synthesize_text_batch(texts)

SSML Template Processing

from google.cloud import texttospeech

def synthesize_with_ssml_template(content_parts, template_path="ssml_template.xml"):
    """Use SSML template for consistent speech formatting.

    Args:
        content_parts: Mapping providing "title", "content" and
            "conclusion" values for the template placeholders.
        template_path: NOTE(review): currently unused — the template is
            inlined below; either load it from this path or drop the
            parameter.

    Returns:
        The SynthesizeSpeechResponse holding the rendered audio.
    """
    # SSML template with placeholders
    ssml_template = """
    <speak>
        <prosody rate="medium" pitch="normal">
            <emphasis level="moderate">{title}</emphasis>
        </prosody>
        <break time="1s"/>
        <prosody rate="slow">
            {content}
        </prosody>
        <break time="2s"/>
        <prosody rate="fast" pitch="+1st">
            {conclusion}
        </prosody>
    </speak>
    """

    # Substitute the caller's content into the placeholders.
    ssml_content = ssml_template.format(**content_parts)

    client = texttospeech.TextToSpeechClient()
    request = texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(ssml=ssml_content),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-A",
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16,
            speaking_rate=0.9,
            pitch=1.0,
        ),
    )
    return client.synthesize_speech(request=request)

# Usage
content = {
    "title": "Welcome to our presentation",
    "content": "This is the main content of our speech synthesis example.",
    "conclusion": "Thank you for listening!"
}
response = synthesize_with_ssml_template(content)

Error Handling

Synthesis-Specific Errors

from google.api_core import exceptions
from google.cloud import texttospeech

def safe_synthesize_speech(text, language_code="en-US"):
    """Synthesize speech with comprehensive error handling.

    Args:
        text: Plain text to synthesize.
        language_code: BCP-47 code used for voice selection.

    Returns:
        The synthesized audio as bytes, or None if any error occurred
        (the error is printed rather than raised).
    """
    try:
        client = texttospeech.TextToSpeechClient()
        synthesis_request = texttospeech.SynthesizeSpeechRequest(
            input=texttospeech.SynthesisInput(text=text),
            voice=texttospeech.VoiceSelectionParams(language_code=language_code),
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3
            ),
        )
        return client.synthesize_speech(request=synthesis_request).audio_content
    # Most specific API errors first; each falls through to the shared
    # "print and return None" outcome below.
    except exceptions.InvalidArgument as e:
        print(f"Invalid request parameters: {e}")
    except exceptions.OutOfRange as e:
        print(f"Parameter out of valid range: {e}")
    except exceptions.FailedPrecondition as e:
        print(f"Failed precondition: {e}")
    except exceptions.ResourceExhausted as e:
        print(f"Quota exceeded or rate limited: {e}")
    except exceptions.Unauthenticated as e:
        print(f"Authentication failed: {e}")
    except exceptions.PermissionDenied as e:
        print(f"Permission denied: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    return None

# Usage with error handling
audio_data = safe_synthesize_speech("Hello world", "en-US")
if audio_data:
    with open("safe_output.mp3", "wb") as f:
        f.write(audio_data)

Performance Optimization

Request Optimization

from google.cloud import texttospeech

# Optimize for latency
def create_low_latency_request(text):
    """Build a synthesis request tuned for response speed over fidelity."""
    fast_voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Standard-A",  # Standard voices are faster
        advanced_voice_options=texttospeech.AdvancedVoiceOptions(
            low_latency_journey_synthesis=True
        ),
    )
    compact_audio = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,  # MP3 is compressed
        sample_rate_hertz=16000,  # Lower sample rate for faster processing
    )
    return texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text),
        voice=fast_voice,
        audio_config=compact_audio,
    )

# Optimize for quality
def create_high_quality_request(text):
    """Build a synthesis request tuned for audio fidelity over speed."""
    hq_voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Wavenet-A",  # WaveNet for higher quality
    )
    hq_audio = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,  # Uncompressed
        sample_rate_hertz=48000,  # High sample rate
    )
    return texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text),
        voice=hq_voice,
        audio_config=hq_audio,
    )

Install with Tessl CLI

npx tessl i tessl/pypi-google-cloud-texttospeech

docs

async-clients.md

configuration-types.md

index.md

long-audio-synthesis.md

speech-synthesis.md

streaming-synthesis.md

voice-management.md

tile.json