CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-google-cloud-texttospeech

Google Cloud Text-to-Speech API client library for converting text to speech with multiple voices and audio formats

Pending
Overview
Eval results
Files

docs/long-audio-synthesis.md

Long Audio Synthesis

Overview

Long audio synthesis is designed for generating extended audio content that exceeds the limits of standard synthesis operations. It uses Google Cloud's long-running operations (LRO) pattern to handle large-scale text-to-speech generation asynchronously, with output delivered to Google Cloud Storage.

Key Features:

  • Supports very large text inputs (up to several hours of audio)
  • Asynchronous processing with operation monitoring
  • Direct output to Google Cloud Storage
  • Progress tracking and metadata
  • Suitable for audiobooks, long documents, and batch processing

Client Setup

Long Audio Synthesis Clients

from google.cloud.texttospeech_v1.services import text_to_speech_long_audio_synthesize

# Synchronous long audio client
long_client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()

# Asynchronous long audio client  
async_long_client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeAsyncClient()

# Alternative import paths
from google.cloud import texttospeech_v1

# Through main module
long_client = texttospeech_v1.services.text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()

Authentication and Project Setup

import os
from google.cloud.texttospeech_v1.services import text_to_speech_long_audio_synthesize

# Set up authentication (if not using default credentials)
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/path/to/service-account-key.json'

# Initialize with explicit project
client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()

# Project and location information
PROJECT_ID = "your-project-id"
LOCATION = "us-central1"  # or other supported location
PARENT = f"projects/{PROJECT_ID}/locations/{LOCATION}"

Core Long Audio Operations

Basic Long Audio Synthesis

from google.cloud import texttospeech_v1
from google.cloud.texttospeech_v1.services import text_to_speech_long_audio_synthesize

# Initialize client
client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()

# Create long audio synthesis request
request = texttospeech_v1.SynthesizeLongAudioRequest(
    parent="projects/your-project-id/locations/us-central1",
    input=texttospeech_v1.SynthesisInput(
        text="This is a very long text that will be converted to audio. " * 100
    ),
    audio_config=texttospeech_v1.AudioConfig(
        audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16,
        sample_rate_hertz=22050
    ),
    voice=texttospeech_v1.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Wavenet-A"
    ),
    output_gcs_uri="gs://your-bucket-name/output-audio.wav"
)

# Start long-running operation
operation = client.synthesize_long_audio(request=request)

print(f"Operation name: {operation.name}")
print("Long audio synthesis started...")

# Wait for completion
result = operation.result()  # Blocks until complete

print("Long audio synthesis completed!")
print(f"Result: {result}")

SSML Long Audio Synthesis

from google.cloud import texttospeech_v1

# Prepare long SSML content
long_ssml_content = """
<speak>
    <p>
        <s>Welcome to this long audio demonstration.</s>
        <s>This content will be processed as a long-running operation.</s>
    </p>
    
    <break time="2s"/>
    
    <p>
        <s>Here we have multiple paragraphs with various SSML features.</s>
        <s><prosody rate="slow">This part is spoken slowly.</prosody></s>
        <s><prosody rate="fast">While this part is much faster.</prosody></s>
    </p>
    
    <break time="3s"/>
    
    <p>
        <s><emphasis level="strong">This is emphasized text.</emphasis></s>
        <s>And this concludes our long audio sample.</s>
    </p>
</speak>
"""

# Create request with SSML
request = texttospeech_v1.SynthesizeLongAudioRequest(
    parent="projects/your-project-id/locations/us-central1",
    input=texttospeech_v1.SynthesisInput(ssml=long_ssml_content),
    audio_config=texttospeech_v1.AudioConfig(
        audio_encoding=texttospeech_v1.AudioEncoding.MP3,
        speaking_rate=1.0,
        pitch=0.0,
        volume_gain_db=0.0
    ),
    voice=texttospeech_v1.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Neural2-A"
    ),
    output_gcs_uri="gs://your-bucket-name/long-ssml-output.mp3"
)

client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()
operation = client.synthesize_long_audio(request=request)

Request and Response Types

SynthesizeLongAudioRequest

from google.cloud.texttospeech_v1 import (
    SynthesizeLongAudioRequest,
    SynthesisInput,
    AudioConfig,
    VoiceSelectionParams,
    AudioEncoding
)

# Complete long audio request configuration
request = SynthesizeLongAudioRequest(
    parent="projects/your-project-id/locations/us-central1",  # Required: parent resource
    
    input=SynthesisInput(
        text="Long text content to synthesize..."  # or ssml="<speak>...</speak>"
    ),
    
    audio_config=AudioConfig(
        audio_encoding=AudioEncoding.LINEAR16,      # Audio format
        sample_rate_hertz=24000,                    # Sample rate
        speaking_rate=1.0,                          # Speech rate
        pitch=0.0,                                  # Pitch adjustment
        volume_gain_db=0.0,                         # Volume gain
        effects_profile_id=["large-home-entertainment-class-device"]  # Audio effects
    ),
    
    voice=VoiceSelectionParams(
        language_code="en-US",                      # Required: language
        name="en-US-Wavenet-D",                     # Specific voice
        ssml_gender=texttospeech_v1.SsmlVoiceGender.FEMALE
    ),
    
    output_gcs_uri="gs://your-bucket-name/path/output.wav"  # Required: GCS output location
)

# Request with custom pronunciations
request_with_pronunciations = SynthesizeLongAudioRequest(
    parent="projects/your-project-id/locations/us-central1",
    input=SynthesisInput(text="Text with custom pronunciations for API and JSON terms."),
    audio_config=AudioConfig(
        audio_encoding=AudioEncoding.MP3,
        sample_rate_hertz=22050
    ),
    voice=VoiceSelectionParams(
        language_code="en-US", 
        name="en-US-Neural2-A",
        custom_pronunciations=texttospeech_v1.CustomPronunciations(
            pronunciations=[
                texttospeech_v1.CustomPronunciationParams(
                    phrase="API",
                    ipa="ˌeɪ piː ˈaɪ",
                    phonetic_encoding=texttospeech_v1.CustomPronunciationParams.PhoneticEncoding.IPA
                )
            ]
        )
    ),
    output_gcs_uri="gs://your-bucket-name/custom-pronunciation-output.mp3"
)

SynthesizeLongAudioResponse and Metadata

from google.cloud.texttospeech_v1 import SynthesizeLongAudioResponse, SynthesizeLongAudioMetadata

# Response object (returned when operation completes)
# SynthesizeLongAudioResponse is typically empty - the audio is written to GCS

# Metadata object (available during operation)
def process_operation_metadata(operation):
    """Print progress details carried in a long-running operation's metadata.

    Returns the raw operation metadata (may be None/empty when the server
    has not attached any yet).
    """
    raw = operation.metadata
    if raw:
        # Unpack the protobuf Any payload into a typed metadata message.
        # NOTE(review): assumes `metadata` is a packed Any; api_core
        # Operation objects may deserialize it already - confirm.
        meta = SynthesizeLongAudioMetadata()
        raw.Unpack(meta)

        print(f"Progress: {meta.progress_percentage}%")
        print(f"Start time: {meta.start_time}")

        if meta.last_update_time:
            print(f"Last update: {meta.last_update_time}")

    return raw

# Access operation result
def get_operation_result(operation):
    """Return the result of a finished operation, or None otherwise.

    Prints a status line for each outcome: still running, failed, or done.
    """
    # Guard clause: nothing to fetch while the operation is in flight.
    if not operation.done():
        print(f"Operation still running: {operation.name}")
        return None

    if operation.error:
        print(f"Operation failed: {operation.error}")
        return None

    result = operation.result()
    print("Operation completed successfully")
    # Result is typically empty - check GCS for output file
    return result

Operation Management

Monitoring Long-Running Operations

import time
from google.api_core import operation
from google.cloud.texttospeech_v1.services import text_to_speech_long_audio_synthesize

def monitor_long_audio_operation(operation_name: str, check_interval: int = 30) -> bool:
    """Poll a long-running audio synthesis operation until it finishes.

    Args:
        operation_name: Fully qualified operation resource name.
        check_interval: Seconds to sleep between status checks.

    Returns:
        True when the operation completed successfully, False on error.
    """
    # Snippet-local import: this example's module-level imports do not
    # include texttospeech_v1, so referencing it raised NameError before.
    from google.cloud import texttospeech_v1

    client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()

    # Look the operation up by its resource name.
    # NOTE(review): get_operation may return a raw Operation proto where
    # `done` is a field rather than a method - confirm against the client.
    op = client.get_operation(request={"name": operation_name})

    print(f"Monitoring operation: {operation_name}")

    while not op.done():
        # Surface progress metadata when the server has attached any.
        if op.metadata:
            try:
                metadata = texttospeech_v1.SynthesizeLongAudioMetadata()
                op.metadata.Unpack(metadata)

                progress = getattr(metadata, 'progress_percentage', 0)
                print(f"Progress: {progress}%")

                if hasattr(metadata, 'start_time') and metadata.start_time:
                    print(f"Started at: {metadata.start_time}")

            except Exception as e:
                print(f"Could not parse metadata: {e}")

        print(f"Operation still running. Checking again in {check_interval} seconds...")
        time.sleep(check_interval)

        # Refresh operation status
        op = client.get_operation(request={"name": operation_name})

    # Operation completed
    if op.error:
        print(f"Operation failed: {op.error}")
        return False
    else:
        print("Operation completed successfully!")
        # The output location comes from the original request's
        # output_gcs_uri; this function does not know it.
        print("Output should be available at the specified GCS URI")
        return True

# Usage
# operation_name = "projects/your-project/locations/us-central1/operations/long-operation-id"
# success = monitor_long_audio_operation(operation_name)

Cancelling Operations

def cancel_long_audio_operation(operation_name: str):
    """Request cancellation of a running long audio synthesis operation.

    Returns True when cancellation was achieved or is still in progress,
    False when the operation finished first or the request itself failed.
    """
    client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()

    try:
        # Ask the service to cancel, then re-fetch to see what happened.
        client.cancel_operation(request={"name": operation_name})
        print(f"Cancellation requested for operation: {operation_name}")

        op = client.get_operation(request={"name": operation_name})

        if not op.done():
            print("Cancellation in progress...")
            return True

        if op.cancelled():
            print("Operation successfully cancelled")
            return True

        print("Operation completed before cancellation")
        return False

    except Exception as e:
        print(f"Failed to cancel operation: {e}")
        return False

# Usage
# cancel_long_audio_operation("projects/your-project/locations/us-central1/operations/op-id")

Listing Operations

def list_long_audio_operations(project_id: str, location: str = "us-central1"):
    """List all long audio synthesis operations for a project location.

    Args:
        project_id: GCP project id.
        location: Region, e.g. "us-central1".

    Returns:
        The operations listing on success, or [] when the call fails.
    """
    # Snippet-local import: this example's surrounding imports do not
    # include texttospeech_v1, so referencing it raised NameError before.
    from google.cloud import texttospeech_v1

    client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()

    parent = f"projects/{project_id}/locations/{location}"

    try:
        # List operations
        # NOTE(review): list_operations yields Operation protos where
        # `done` may be a field rather than a method - confirm.
        operations = client.list_operations(request={"name": parent})

        print(f"Operations in {parent}:")

        for op in operations:
            print(f"\nOperation: {op.name}")
            print(f"Done: {op.done()}")

            if op.done():
                if op.error:
                    print(f"Error: {op.error}")
                else:
                    print("Status: Completed successfully")
            else:
                print("Status: Running")

                # Try to get metadata
                if op.metadata:
                    try:
                        metadata = texttospeech_v1.SynthesizeLongAudioMetadata()
                        op.metadata.Unpack(metadata)
                        progress = getattr(metadata, 'progress_percentage', 0)
                        print(f"Progress: {progress}%")
                    except Exception:  # was a bare `except:` - don't swallow SystemExit etc.
                        print("Progress: Unknown")

        return operations

    except Exception as e:
        print(f"Failed to list operations: {e}")
        return []

# Usage
# operations = list_long_audio_operations("your-project-id")

Practical Examples

Audiobook Generation

import os
from google.cloud import storage
from google.cloud.texttospeech_v1.services import text_to_speech_long_audio_synthesize

class AudiobookGenerator:
    """Generate audiobooks from long text content.

    Wraps the long-audio synthesis client and a Cloud Storage client:
    start a synthesis operation, wait for it to finish, then download
    the resulting audio file from GCS.
    """

    def __init__(self, project_id: str, bucket_name: str, location: str = "us-central1"):
        self.project_id = project_id
        self.bucket_name = bucket_name
        self.location = location
        # Parent resource name required by SynthesizeLongAudioRequest.
        self.parent = f"projects/{project_id}/locations/{location}"

        # Initialize clients
        self.tts_client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()
        self.storage_client = storage.Client()

    def generate_audiobook(self, text_content: str, output_filename: str,
                          voice_name: str = "en-US-Wavenet-A",
                          language_code: str = "en-US"):
        """Start audiobook synthesis for *text_content*.

        Returns a dict with 'operation', 'operation_name' and 'output_uri',
        or None when the output bucket could not be prepared.
        """
        # Snippet-local import: the surrounding example does not import
        # texttospeech_v1 at module level, which raised NameError before.
        from google.cloud import texttospeech_v1

        # Ensure the GCS output bucket exists (create it when missing).
        try:
            bucket = self.storage_client.bucket(self.bucket_name)
            if not bucket.exists():
                bucket = self.storage_client.create_bucket(self.bucket_name)
                print(f"Created bucket: {self.bucket_name}")
        except Exception as e:
            print(f"Bucket setup error: {e}")
            return None

        # Configure audiobook synthesis
        gcs_uri = f"gs://{self.bucket_name}/{output_filename}"

        request = texttospeech_v1.SynthesizeLongAudioRequest(
            parent=self.parent,
            input=texttospeech_v1.SynthesisInput(text=text_content),
            audio_config=texttospeech_v1.AudioConfig(
                audio_encoding=texttospeech_v1.AudioEncoding.MP3,
                sample_rate_hertz=22050,
                speaking_rate=0.9,  # Slightly slower for audiobooks
                volume_gain_db=2.0   # Boost volume
            ),
            voice=texttospeech_v1.VoiceSelectionParams(
                language_code=language_code,
                name=voice_name
            ),
            output_gcs_uri=gcs_uri
        )

        print("Starting audiobook generation...")
        print(f"Output will be saved to: {gcs_uri}")

        # Start synthesis (long-running operation; this returns immediately).
        operation = self.tts_client.synthesize_long_audio(request=request)

        return {
            'operation': operation,
            'operation_name': operation.name,
            'output_uri': gcs_uri
        }

    def wait_for_audiobook(self, operation, check_interval: int = 60):
        """Block until the synthesis operation completes.

        Returns True on success, False on failure.
        """
        # Snippet-local imports: `time` and texttospeech_v1 are not
        # imported by this example's module-level imports.
        import time
        from google.cloud import texttospeech_v1

        print("Waiting for audiobook generation to complete...")

        while not operation.done():
            # Report progress from operation metadata when available.
            if operation.metadata:
                try:
                    metadata = texttospeech_v1.SynthesizeLongAudioMetadata()
                    operation.metadata.Unpack(metadata)
                    progress = getattr(metadata, 'progress_percentage', 0)
                    print(f"Progress: {progress}%")
                except Exception:  # was a bare `except:`
                    print("Checking progress...")

            time.sleep(check_interval)

            # Refresh operation status.
            # NOTE(review): get_operation may return a raw Operation proto
            # where `done` is a field, not a method - confirm.
            operation = self.tts_client.get_operation(
                request={"name": operation.name}
            )

        if operation.error:
            print(f"Audiobook generation failed: {operation.error}")
            return False
        else:
            print("Audiobook generation completed successfully!")
            return True

    def download_audiobook(self, gcs_uri: str, local_filename: str):
        """Download the generated audiobook from GCS to a local file.

        Raises:
            ValueError: if *gcs_uri* is not of the form gs://bucket/object.
        """
        # Parse GCS URI; require both a bucket and an object path.
        # (The old split-based parsing raised IndexError for "gs://bucket".)
        if not gcs_uri.startswith("gs://"):
            raise ValueError("Invalid GCS URI")

        path_parts = gcs_uri[5:].split("/", 1)
        if len(path_parts) != 2 or not path_parts[0] or not path_parts[1]:
            raise ValueError("Invalid GCS URI")
        bucket_name, blob_name = path_parts

        # Download file
        bucket = self.storage_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)

        blob.download_to_filename(local_filename)
        print(f"Audiobook downloaded to: {local_filename}")

        # Report file size in MB as a quick sanity check.
        file_size = os.path.getsize(local_filename)
        print(f"File size: {file_size / (1024*1024):.2f} MB")

        return local_filename

# Usage example
def generate_sample_audiobook():
    """Generate a sample audiobook end to end: start, wait, download."""

    # Sample long text (could be loaded from file); repeated to make the
    # input long enough to be a realistic long-audio job.
    sample_text = """
    Chapter 1: Introduction
    
    Welcome to this sample audiobook demonstration. This text will be converted
    into high-quality speech using Google Cloud Text-to-Speech long audio synthesis.
    
    The long audio synthesis feature is specifically designed for content like this,
    where the text is too long for standard synthesis operations. It processes the
    content asynchronously and delivers the results to Google Cloud Storage.
    
    Chapter 2: Features
    
    Long audio synthesis supports all the same features as standard synthesis,
    including SSML markup, custom voices, and audio configuration options.
    The main difference is that it can handle much larger amounts of text
    and processes them as long-running operations.
    
    This makes it ideal for generating audiobooks, processing long documents,
    or creating extended audio content for podcasts and presentations.
    
    Chapter 3: Conclusion
    
    Thank you for listening to this sample audiobook. The long audio synthesis
    feature provides a powerful way to convert large amounts of text into
    natural-sounding speech.
    """ * 5  # Repeat to make it longer

    # Kick off the synthesis job.
    generator = AudiobookGenerator(
        project_id="your-project-id",
        bucket_name="your-audiobook-bucket"
    )

    result = generator.generate_audiobook(
        text_content=sample_text,
        output_filename="sample_audiobook.mp3",
        voice_name="en-US-Wavenet-A"
    )

    if not result:
        return None

    # Block until the operation finishes, then fetch the audio file.
    if generator.wait_for_audiobook(result['operation']):
        generator.download_audiobook(
            result['output_uri'],
            "local_audiobook.mp3"
        )
        print("Audiobook generation complete!")

    return result

# Run the example
# audiobook_result = generate_sample_audiobook()

Batch Document Processing

import concurrent.futures
from typing import List, Dict

class BatchDocumentProcessor:
    """Process multiple documents for long audio synthesis.

    Fans synthesis requests out over a thread pool, then polls all the
    resulting long-running operations until every one finishes.
    """

    def __init__(self, project_id: str, bucket_name: str, location: str = "us-central1"):
        self.project_id = project_id
        self.bucket_name = bucket_name
        self.location = location
        # Parent resource name required by SynthesizeLongAudioRequest.
        self.parent = f"projects/{project_id}/locations/{location}"

        self.client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()

    def process_document_batch(self, documents: List[Dict], max_workers: int = 5):
        """Start synthesis for each document in parallel.

        Each document dict needs 'name' and 'content'; optional 'voice'
        and 'audio' dicts override the defaults. Returns one result dict
        per document with 'success' plus operation/output info or 'error'.
        """
        # Snippet-local import: the surrounding example does not import
        # texttospeech_v1 at module level, which raised NameError before.
        from google.cloud import texttospeech_v1

        def process_single_document(doc_info):
            """Start synthesis for one document; never raises."""
            try:
                doc_name = doc_info['name']
                text_content = doc_info['content']
                voice_config = doc_info.get('voice', {})
                audio_config = doc_info.get('audio', {})

                # Default configurations
                voice_name = voice_config.get('name', 'en-US-Wavenet-A')
                language_code = voice_config.get('language_code', 'en-US')

                audio_encoding = audio_config.get('encoding', texttospeech_v1.AudioEncoding.MP3)
                sample_rate = audio_config.get('sample_rate', 22050)

                # Create request
                output_uri = f"gs://{self.bucket_name}/batch/{doc_name}.mp3"

                request = texttospeech_v1.SynthesizeLongAudioRequest(
                    parent=self.parent,
                    input=texttospeech_v1.SynthesisInput(text=text_content),
                    audio_config=texttospeech_v1.AudioConfig(
                        audio_encoding=audio_encoding,
                        sample_rate_hertz=sample_rate
                    ),
                    voice=texttospeech_v1.VoiceSelectionParams(
                        language_code=language_code,
                        name=voice_name
                    ),
                    output_gcs_uri=output_uri
                )

                # Start synthesis (returns a long-running operation handle).
                operation = self.client.synthesize_long_audio(request=request)

                return {
                    'document': doc_name,
                    'operation_name': operation.name,
                    'output_uri': output_uri,
                    'success': True,
                    'operation': operation
                }

            except Exception as e:
                return {
                    'document': doc_info['name'],
                    'operation_name': None,
                    'output_uri': None,
                    'success': False,
                    'error': str(e)
                }

        # Fan out over a thread pool; starting synthesis is I/O-bound.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(process_single_document, documents))

        return results

    def monitor_batch_operations(self, operation_results: List[Dict],
                                check_interval: int = 30):
        """Poll all started operations until every one finishes.

        Annotates each result dict with 'final_status' ('completed' or
        'failed') and returns the list of finished results.
        """
        # Snippet-local imports: `time` and texttospeech_v1 are not
        # imported by this example's module-level imports.
        import time
        from google.cloud import texttospeech_v1

        pending_operations = [r for r in operation_results if r['success']]
        completed_operations = []

        print(f"Monitoring {len(pending_operations)} operations...")

        while pending_operations:
            still_pending = []

            for op_result in pending_operations:
                try:
                    # Check operation status
                    operation = self.client.get_operation(
                        request={"name": op_result['operation_name']}
                    )

                    if operation.done():
                        if operation.error:
                            op_result['final_status'] = 'failed'
                            op_result['error'] = str(operation.error)
                            print(f"❌ {op_result['document']}: Failed")
                        else:
                            op_result['final_status'] = 'completed'
                            print(f"✅ {op_result['document']}: Completed")

                        completed_operations.append(op_result)
                    else:
                        # Still running - try to report progress.
                        if operation.metadata:
                            try:
                                metadata = texttospeech_v1.SynthesizeLongAudioMetadata()
                                operation.metadata.Unpack(metadata)
                                progress = getattr(metadata, 'progress_percentage', 0)
                                print(f"⏳ {op_result['document']}: {progress}%")
                            except Exception:  # was a bare `except:`
                                print(f"⏳ {op_result['document']}: In progress...")

                        still_pending.append(op_result)

                except Exception as e:
                    # Keep the operation pending and retry on the next round.
                    print(f"Error checking {op_result['document']}: {e}")
                    still_pending.append(op_result)

            pending_operations = still_pending

            if pending_operations:
                print(f"\n{len(pending_operations)} operations still running. "
                      f"Checking again in {check_interval} seconds...\n")
                time.sleep(check_interval)

        print("\nBatch processing complete!")
        print(f"Completed: {len([op for op in completed_operations if op.get('final_status') == 'completed'])}")
        print(f"Failed: {len([op for op in completed_operations if op.get('final_status') == 'failed'])}")

        return completed_operations

# Usage example
def batch_process_example():
    """Example of batch processing multiple documents."""

    # (name, sentence, voice, encoding, sample rate) for each sample document.
    specs = [
        ('document1', 'This is the first document content. ', 'en-US-Neural2-A',
         texttospeech_v1.AudioEncoding.MP3, 22050),
        ('document2', 'This is the second document content. ', 'en-US-Wavenet-D',
         texttospeech_v1.AudioEncoding.LINEAR16, 24000),
        ('document3', 'This is the third document content. ', 'en-US-Standard-B',
         texttospeech_v1.AudioEncoding.OGG_OPUS, 48000),
    ]

    documents = [
        {
            'name': name,
            'content': sentence * 100,
            'voice': {'name': voice, 'language_code': 'en-US'},
            'audio': {'encoding': encoding, 'sample_rate': rate}
        }
        for name, sentence, voice, encoding, rate in specs
    ]

    # Kick off all syntheses, then poll until every operation finishes.
    processor = BatchDocumentProcessor(
        project_id="your-project-id",
        bucket_name="your-batch-bucket"
    )

    started = processor.process_document_batch(documents, max_workers=3)

    return processor.monitor_batch_operations(started)

# Run batch processing
# batch_results = batch_process_example()

Error Handling and Best Practices

Comprehensive Error Handling

from google.api_core import exceptions
import logging

def robust_long_audio_synthesis(text_content: str, output_gcs_uri: str, 
                               project_id: str, location: str = "us-central1"):
    """Long audio synthesis with comprehensive error handling.

    Args:
        text_content: Plain text to synthesize (must be non-empty).
        output_gcs_uri: Destination; must start with "gs://".
        project_id: GCP project id.
        location: Region for the parent resource.

    Returns:
        {'success': True, 'operation': ..., 'operation_name': ...} on success,
        {'success': False, 'error': <category>, 'details': <msg>} on failure.
    """
    parent = f"projects/{project_id}/locations/{location}"

    try:
        # Validate inputs BEFORE constructing the client so bad arguments
        # fail fast with no credential/transport setup. Validation errors
        # still surface as a failure dict via the handlers below.
        if not text_content or not text_content.strip():
            raise ValueError("Text content cannot be empty")

        if not output_gcs_uri.startswith("gs://"):
            raise ValueError("Output URI must be a valid GCS URI (gs://...)")

        # Client construction inside the try: credential problems are also
        # reported through the failure-dict contract instead of escaping.
        client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()

        # Create request
        request = texttospeech_v1.SynthesizeLongAudioRequest(
            parent=parent,
            input=texttospeech_v1.SynthesisInput(text=text_content),
            audio_config=texttospeech_v1.AudioConfig(
                audio_encoding=texttospeech_v1.AudioEncoding.MP3,
                sample_rate_hertz=22050
            ),
            voice=texttospeech_v1.VoiceSelectionParams(
                language_code="en-US",
                name="en-US-Neural2-A"
            ),
            output_gcs_uri=output_gcs_uri
        )

        # Start the long-running operation (returns immediately).
        operation = client.synthesize_long_audio(request=request)

        return {
            'success': True,
            'operation': operation,
            'operation_name': operation.name
        }

    except exceptions.InvalidArgument as e:
        logging.error(f"Invalid request parameters: {e}")
        return {'success': False, 'error': 'Invalid parameters', 'details': str(e)}

    except exceptions.PermissionDenied as e:
        logging.error(f"Permission denied: {e}")
        return {'success': False, 'error': 'Permission denied', 'details': str(e)}

    except exceptions.ResourceExhausted as e:
        logging.error(f"Quota exceeded: {e}")
        return {'success': False, 'error': 'Quota exceeded', 'details': str(e)}

    except exceptions.FailedPrecondition as e:
        logging.error(f"Failed precondition: {e}")
        return {'success': False, 'error': 'Precondition failed', 'details': str(e)}

    except exceptions.NotFound as e:
        logging.error(f"Resource not found: {e}")
        return {'success': False, 'error': 'Resource not found', 'details': str(e)}

    except Exception as e:
        logging.error(f"Unexpected error: {e}")
        return {'success': False, 'error': 'Unexpected error', 'details': str(e)}

# Usage with error handling
# Start synthesis; on failure the dict carries an error category + details.
result = robust_long_audio_synthesis(
    text_content="Long text content...",
    output_gcs_uri="gs://your-bucket/output.mp3",
    project_id="your-project-id"
)

# Branch on the 'success' flag rather than catching exceptions - the
# helper converts every failure into a result dict.
if result['success']:
    print(f"Operation started: {result['operation_name']}")
else:
    print(f"Error: {result['error']} - {result['details']}")

Best Practices for Long Audio Synthesis

class LongAudioBestPractices:
    """Best practices for long audio synthesis: input validation, text
    cleanup, and voice/audio-config selection helpers."""
    
    @staticmethod
    def validate_text_length(text: str) -> bool:
        """Return True when *text* is within the recommended size limit.

        NOTE(review): Google documents the SynthesizeLongAudio input limit
        in bytes; this check counts characters as an approximation.
        """
        # Recommended maximum: ~1 million characters
        MAX_CHARS = 1_000_000
        
        if len(text) > MAX_CHARS:
            print(f"Warning: Text length ({len(text)}) exceeds recommended maximum ({MAX_CHARS})")
            return False
        
        return True
    
    @staticmethod  
    def optimize_text_for_synthesis(text: str) -> str:
        """Normalize whitespace and punctuation for cleaner synthesis.

        Collapses runs of spaces/tabs, ensures a space after sentence-ending
        punctuation, and normalizes paragraph breaks to one blank line.
        (The previous version collapsed ALL whitespace - including newlines -
        first, which made the paragraph-break step below unreachable.)
        """
        import re
        
        # Collapse horizontal whitespace only, preserving line structure.
        text = re.sub(r'[ \t]+', ' ', text)
        
        # Add a space after sentence-ending punctuation (same line only,
        # so paragraph breaks between sentences are not swallowed).
        text = re.sub(r'([.!?])[ \t]*([A-Z])', r'\1 \2', text)
        
        # Normalize paragraph breaks to exactly one blank line.
        text = re.sub(r'\n\s*\n', '\n\n', text)
        
        return text.strip()
    
    @staticmethod
    def choose_optimal_voice(content_type: str, language: str = "en-US") -> str:
        """Return a recommended voice name for *content_type*.

        Unknown content types fall back to the Neural2-A voice.
        """
        voice_recommendations = {
            "audiobook": f"{language}-Wavenet-A",      # Clear, pleasant for long listening
            "news": f"{language}-Neural2-C",           # Authoritative
            "educational": f"{language}-Neural2-A",    # Clear, engaging
            "documentation": f"{language}-Standard-A", # Clear, efficient
            "narrative": f"{language}-Wavenet-D"       # Expressive
        }
        
        return voice_recommendations.get(content_type, f"{language}-Neural2-A")
    
    @staticmethod
    def create_optimal_audio_config(use_case: str) -> "texttospeech_v1.AudioConfig":
        """Return a preset AudioConfig for *use_case* (default: audiobook).

        The return annotation is a string so it is not evaluated at
        class-definition time (avoids NameError when texttospeech_v1 is
        not imported in the enclosing module).
        """
        configs = {
            "audiobook": texttospeech_v1.AudioConfig(
                audio_encoding=texttospeech_v1.AudioEncoding.MP3,
                sample_rate_hertz=22050,
                speaking_rate=0.9,
                volume_gain_db=2.0
            ),
            "podcast": texttospeech_v1.AudioConfig(
                audio_encoding=texttospeech_v1.AudioEncoding.MP3,
                sample_rate_hertz=44100,
                speaking_rate=1.0,
                volume_gain_db=1.0,
                effects_profile_id=["large-home-entertainment-class-device"]
            ),
            "telephony": texttospeech_v1.AudioConfig(
                audio_encoding=texttospeech_v1.AudioEncoding.MULAW,
                sample_rate_hertz=8000,
                speaking_rate=1.1,
                effects_profile_id=["telephony-class-application"]
            ),
            "archive": texttospeech_v1.AudioConfig(
                audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16,
                sample_rate_hertz=48000,
                speaking_rate=1.0
            )
        }
        
        return configs.get(use_case, configs["audiobook"])

# Apply best practices
def create_optimized_long_audio_request(text_content: str, output_uri: str, 
                                       content_type: str = "audiobook"):
    """Build a SynthesizeLongAudioRequest using the best-practice helpers.

    Validates length, cleans up the text, and picks a voice plus audio
    config suited to *content_type* before assembling the request.
    """
    # Warn when the text exceeds the recommended size; synthesis may still work.
    if not LongAudioBestPractices.validate_text_length(text_content):
        print("Consider breaking content into smaller chunks")

    cleaned_text = LongAudioBestPractices.optimize_text_for_synthesis(text_content)

    # Voice and audio profile tailored to the content type.
    selected_voice = LongAudioBestPractices.choose_optimal_voice(content_type)
    selected_config = LongAudioBestPractices.create_optimal_audio_config(content_type)

    return texttospeech_v1.SynthesizeLongAudioRequest(
        parent="projects/your-project-id/locations/us-central1",
        input=texttospeech_v1.SynthesisInput(text=cleaned_text),
        audio_config=selected_config,
        voice=texttospeech_v1.VoiceSelectionParams(
            language_code="en-US",
            name=selected_voice
        ),
        output_gcs_uri=output_uri
    )

Install with Tessl CLI

npx tessl i tessl/pypi-google-cloud-texttospeech

docs

async-clients.md

configuration-types.md

index.md

long-audio-synthesis.md

speech-synthesis.md

streaming-synthesis.md

voice-management.md

tile.json