# Streaming Synthesis

## Overview

Streaming synthesis enables real-time, bidirectional audio generation where text can be sent incrementally and audio is received as it's generated. This is ideal for interactive applications like chatbots, live assistants, and real-time communication systems where low latency is crucial.

## Core Streaming Operations

### Basic Streaming Setup

```api { .api }
from google.cloud import texttospeech

# Initialize client for streaming
client = texttospeech.TextToSpeechClient()

# Configure streaming synthesis
config = texttospeech.StreamingSynthesizeConfig(
    voice=texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Neural2-A"
    ),
    streaming_audio_config=texttospeech.StreamingAudioConfig(
        audio_encoding=texttospeech.AudioEncoding.PCM,  # Streaming supports PCM, ALAW, MULAW, OGG_OPUS
        sample_rate_hertz=22050
    )
)

# Create streaming request iterator
def create_streaming_requests():
    # First request must carry the configuration
    yield texttospeech.StreamingSynthesizeRequest(streaming_config=config)

    # Subsequent requests carry the input text
    yield texttospeech.StreamingSynthesizeRequest(
        input=texttospeech.StreamingSynthesisInput(text="Hello, ")
    )
    yield texttospeech.StreamingSynthesizeRequest(
        input=texttospeech.StreamingSynthesisInput(text="this is streaming synthesis.")
    )

# Perform streaming synthesis
streaming_responses = client.streaming_synthesize(create_streaming_requests())

# Process responses
for response in streaming_responses:
    if response.audio_content:
        # Handle audio chunks as they arrive
        print(f"Received audio chunk: {len(response.audio_content)} bytes")
        # Process or play audio chunk immediately
```
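
The loop above only prints chunk sizes; in an interactive application the chunks are typically played as they arrive. Below is a minimal playback sketch, assuming the third-party `pyaudio` package (not part of the Text-to-Speech library) and the 16-bit mono PCM output configured above; `create_streaming_requests` is the generator from the previous example.

```api { .api }
import pyaudio
from google.cloud import texttospeech

client = texttospeech.TextToSpeechClient()

# Open a raw 16-bit mono output stream matching the synthesis config above
pa = pyaudio.PyAudio()
player = pa.open(format=pyaudio.paInt16, channels=1, rate=22050, output=True)

try:
    for response in client.streaming_synthesize(create_streaming_requests()):
        if response.audio_content:
            player.write(response.audio_content)  # blocks while the chunk plays
finally:
    player.stop_stream()
    player.close()
    pa.terminate()
```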

### Streaming with SSML

```api { .api }
from google.cloud import texttospeech
from google.cloud.texttospeech import (
    StreamingSynthesizeRequest,
    StreamingSynthesizeConfig,
    StreamingSynthesisInput,
    StreamingAudioConfig
)

def streaming_ssml_synthesis():
    """Stream SSML content with markup."""
    client = texttospeech.TextToSpeechClient()

    # Configure for markup streaming.
    # Note: per StreamingSynthesisInput (documented below), the markup field
    # is supported by HD voices only; choose a voice that accepts markup input.
    config = StreamingSynthesizeConfig(
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Wavenet-D"
        ),
        streaming_audio_config=StreamingAudioConfig(
            audio_encoding=texttospeech.AudioEncoding.PCM,
            sample_rate_hertz=24000
        )
    )

    def request_generator():
        # Configuration request
        yield StreamingSynthesizeRequest(streaming_config=config)

        # SSML input chunks
        ssml_parts = [
            '<speak><prosody rate="slow">Hello there!</prosody>',
            '<break time="1s"/>',
            '<prosody pitch="+5st">This is exciting!</prosody>',
            '</speak>'
        ]

        for ssml_part in ssml_parts:
            yield StreamingSynthesizeRequest(
                input=StreamingSynthesisInput(markup=ssml_part)
            )

    # Stream and collect audio
    responses = client.streaming_synthesize(request_generator())

    audio_chunks = []
    for response in responses:
        if response.audio_content:
            audio_chunks.append(response.audio_content)

    return b''.join(audio_chunks)

# Usage
streaming_audio = streaming_ssml_synthesis()
```
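
If the selected voice rejects markup (per StreamingSynthesisInput below, markup is limited to HD voices), the same dialogue can be streamed as plain text. A sketch under that assumption, sending complete sentences as the input type recommends:

```api { .api }
def streaming_text_fallback(client, config, sentences):
    """Stream plain-text sentences for voices that do not accept markup."""
    def request_generator():
        yield StreamingSynthesizeRequest(streaming_config=config)
        for sentence in sentences:
            # Complete sentences give the synthesizer context for natural prosody
            yield StreamingSynthesizeRequest(
                input=StreamingSynthesisInput(text=sentence)
            )

    return b''.join(
        response.audio_content
        for response in client.streaming_synthesize(request_generator())
        if response.audio_content
    )
```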

## Configuration Classes

### StreamingSynthesizeConfig

```api { .api }
from google.cloud import texttospeech
from google.cloud.texttospeech import (
    StreamingSynthesizeConfig,
    VoiceSelectionParams,
    StreamingAudioConfig,
    AudioEncoding
)

# Complete streaming configuration
streaming_config = StreamingSynthesizeConfig(
    voice=VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Neural2-C",
        ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
    ),
    streaming_audio_config=StreamingAudioConfig(
        audio_encoding=AudioEncoding.PCM,
        sample_rate_hertz=22050,
        speaking_rate=1.1  # Optional: speech rate
    )
)

# Streaming config with advanced voice options
streaming_config = StreamingSynthesizeConfig(
    voice=VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Neural2-A",
        advanced_voice_options=texttospeech.AdvancedVoiceOptions(
            low_latency_journey_synthesis=True  # Enable low latency
        )
    ),
    streaming_audio_config=StreamingAudioConfig(
        audio_encoding=AudioEncoding.PCM,
        sample_rate_hertz=16000  # Lower rate for reduced latency
    )
)
```

### StreamingAudioConfig

```api { .api }
class StreamingAudioConfig:
    """Description of the desired output audio data for streaming.

    Parameters:
    - audio_encoding (AudioEncoding): Required. Format of the audio byte stream.
      Streaming supports PCM, ALAW, MULAW and OGG_OPUS only.
    - sample_rate_hertz (int): Optional. Synthesis sample rate in hertz.
    - speaking_rate (float): Optional. Speaking rate/speed in the range [0.25, 2.0].
      1.0 is normal speed, 2.0 is twice as fast, 0.5 is half speed.
    """
    def __init__(self, audio_encoding, sample_rate_hertz=None, speaking_rate=None): ...
```

```api { .api }
from google.cloud.texttospeech import StreamingAudioConfig, AudioEncoding

# Basic streaming audio configuration
audio_config = StreamingAudioConfig(
    audio_encoding=AudioEncoding.PCM,  # Required: audio format
    sample_rate_hertz=22050            # Optional: sample rate
)

# Advanced streaming audio configuration
audio_config = StreamingAudioConfig(
    audio_encoding=AudioEncoding.OGG_OPUS,  # Compressed format for streaming
    sample_rate_hertz=48000,
    speaking_rate=0.9                       # Slightly slower speech
)

# Low-latency configuration
low_latency_config = StreamingAudioConfig(
    audio_encoding=AudioEncoding.PCM,
    sample_rate_hertz=16000,  # Lower sample rate
    speaking_rate=1.0         # Normal rate
)
```
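
For uncompressed PCM, chunk sizes map directly to playback time: 16-bit mono audio carries two bytes per sample, so a chunk lasts `len(chunk) / (2 * sample_rate_hertz)` seconds. A small illustrative helper (the function name is ours) for reasoning about buffer sizes:

```api { .api }
def pcm_chunk_duration_seconds(chunk: bytes, sample_rate_hertz: int) -> float:
    """Playback duration of a raw 16-bit mono PCM chunk."""
    bytes_per_second = 2 * sample_rate_hertz  # 2 bytes per 16-bit sample
    return len(chunk) / bytes_per_second

# At 16000 Hz, a 32000-byte chunk is exactly one second of audio
assert pcm_chunk_duration_seconds(b"\x00" * 32000, 16000) == 1.0
```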

### StreamingSynthesisInput

```api { .api }
class StreamingSynthesisInput:
    """Input to be synthesized in streaming requests.

    This uses oneof fields - only one can be set at a time.

    Parameters:
    - text (str): Raw text to be synthesized. Recommended to use complete sentences.
    - markup (str): Markup for HD voices specifically. Cannot be used with other voices.
    - prompt (str): System instruction for controllable voice models only.
    """
    def __init__(self, text=None, markup=None, prompt=None): ...
```

```api { .api }
from google.cloud.texttospeech import StreamingSynthesisInput

# Text input for streaming
text_input = StreamingSynthesisInput(
    text="This is a chunk of text to be synthesized."
)

# Markup input for streaming (HD voices only)
markup_input = StreamingSynthesisInput(
    markup="Markup content for HD voices specifically."
)

# Prompt input for controllable voice models
prompt_input = StreamingSynthesisInput(
    prompt="System instruction for controllable voice models."
)

# Note: StreamingSynthesisInput uses oneof fields - only one can be set at a time
```
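
Since the fields form a oneof, setting more than one is a programming error. An illustrative guard, assuming only the three fields documented above (the helper is ours, not part of the library):

```api { .api }
from google.cloud.texttospeech import StreamingSynthesisInput

def make_streaming_input(**kwargs) -> StreamingSynthesisInput:
    """Build a StreamingSynthesisInput, enforcing the oneof constraint up front."""
    allowed = {"text", "markup", "prompt"}
    set_fields = {k: v for k, v in kwargs.items() if k in allowed and v is not None}
    if len(set_fields) != 1:
        raise ValueError(
            f"Exactly one of {sorted(allowed)} must be set, got {sorted(set_fields)}"
        )
    return StreamingSynthesisInput(**set_fields)

# Usage
chunk = make_streaming_input(text="Hello, streaming world.")
```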

## Request and Response Types

### StreamingSynthesizeRequest

```api { .api }
class StreamingSynthesizeRequest:
    """Request message for the StreamingSynthesize method.

    Uses oneof fields - only one can be set at a time.
    The first message must contain streaming_config; subsequent messages contain input.

    Parameters:
    - streaming_config (StreamingSynthesizeConfig): Configuration, first request only.
    - input (StreamingSynthesisInput): Input text/markup for subsequent requests.
    """
    def __init__(self, streaming_config=None, input=None): ...
```

```api { .api }
from google.cloud import texttospeech
from google.cloud.texttospeech import (
    StreamingSynthesizeRequest,
    StreamingSynthesizeConfig,
    StreamingSynthesisInput
)

# Configuration request (first request in stream)
config_request = StreamingSynthesizeRequest(
    streaming_config=StreamingSynthesizeConfig(
        voice=texttospeech.VoiceSelectionParams(language_code="en-US"),
        streaming_audio_config=texttospeech.StreamingAudioConfig(
            audio_encoding=texttospeech.AudioEncoding.PCM,
            sample_rate_hertz=22050
        )
    )
)

# Input request (subsequent requests)
input_request = StreamingSynthesizeRequest(
    input=StreamingSynthesisInput(text="Text to synthesize")
)

# Markup input request (for HD voices)
markup_request = StreamingSynthesizeRequest(
    input=StreamingSynthesisInput(
        markup='Markup content with specific formatting for HD voices'
    )
)
```
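
The config-first ordering can be captured once in a generator so call sites cannot get it wrong. A sketch of such a helper (our naming, not a library function):

```api { .api }
from typing import Iterable, Iterator
from google.cloud import texttospeech

def synthesis_requests(
    config: texttospeech.StreamingSynthesizeConfig,
    texts: Iterable[str],
) -> Iterator[texttospeech.StreamingSynthesizeRequest]:
    """Yield a config request first, then one input request per text chunk."""
    yield texttospeech.StreamingSynthesizeRequest(streaming_config=config)
    for text in texts:
        yield texttospeech.StreamingSynthesizeRequest(
            input=texttospeech.StreamingSynthesisInput(text=text)
        )

# Usage:
# responses = client.streaming_synthesize(synthesis_requests(config, ["Hello, ", "world."]))
```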

### StreamingSynthesizeResponse

```api { .api }
from google.cloud.texttospeech import StreamingSynthesizeResponse

# Response processing
def process_streaming_response(response: StreamingSynthesizeResponse):
    """Process an individual streaming response."""

    # Check for audio content
    if response.audio_content:
        audio_size = len(response.audio_content)
        print(f"Received audio chunk: {audio_size} bytes")
        return response.audio_content

    # Stream-level errors are raised as exceptions by the response
    # iterator rather than delivered as response fields.
    return None

# Example response handling
def handle_streaming_responses(response_iterator):
    """Handle a complete streaming response sequence."""
    audio_chunks = []
    total_chunks = 0
    total_bytes = 0

    for response in response_iterator:
        audio_chunk = process_streaming_response(response)
        if audio_chunk:
            audio_chunks.append(audio_chunk)
            total_chunks += 1
            total_bytes += len(audio_chunk)

    print(f"Streaming complete: {total_chunks} chunks, {total_bytes} bytes total")
    return b''.join(audio_chunks)
```
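
The joined bytes are raw audio with no container header (PCM in particular is headerless), so saving them as a playable `.wav` file requires adding one. A minimal sketch using the standard-library `wave` module, assuming 16-bit mono output at the configured sample rate:

```api { .api }
import wave

def save_pcm_as_wav(pcm_bytes: bytes, path: str, sample_rate_hertz: int = 22050):
    """Wrap raw 16-bit mono PCM bytes in a WAV container."""
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)  # mono
        wav_file.setsampwidth(2)  # 16-bit samples
        wav_file.setframerate(sample_rate_hertz)
        wav_file.writeframes(pcm_bytes)

# Usage
# audio = handle_streaming_responses(responses)
# save_pcm_as_wav(audio, "streamed_output.wav", sample_rate_hertz=22050)
```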

## Practical Streaming Examples

### Real-Time Text Processing

```api { .api }
import threading
import queue
import wave
from google.cloud import texttospeech

class RealTimeTextToSpeech:
    """Real-time text-to-speech streaming processor."""

    def __init__(self, language_code="en-US", voice_name=None):
        self.client = texttospeech.TextToSpeechClient()
        self.text_queue = queue.Queue()
        self.audio_queue = queue.Queue()
        self.is_running = False

        # Configure streaming
        self.config = texttospeech.StreamingSynthesizeConfig(
            voice=texttospeech.VoiceSelectionParams(
                language_code=language_code,
                name=voice_name or "en-US-Neural2-A",
                advanced_voice_options=texttospeech.AdvancedVoiceOptions(
                    low_latency_journey_synthesis=True
                )
            ),
            streaming_audio_config=texttospeech.StreamingAudioConfig(
                audio_encoding=texttospeech.AudioEncoding.PCM,
                sample_rate_hertz=16000  # Lower rate for real-time
            )
        )

    def start_streaming(self):
        """Start the streaming synthesis thread."""
        self.is_running = True
        self.streaming_thread = threading.Thread(target=self._stream_worker)
        self.streaming_thread.start()

    def stop_streaming(self):
        """End the input stream and wait for remaining audio to be queued."""
        self.text_queue.put(None)  # Sentinel ends the request stream
        if hasattr(self, 'streaming_thread'):
            self.streaming_thread.join()
        self.is_running = False

    def add_text(self, text: str):
        """Add text to the synthesis queue."""
        if self.is_running:
            self.text_queue.put(text)

    def get_audio(self, timeout: float = 1.0):
        """Get a synthesized audio chunk, or None if none is available."""
        try:
            return self.audio_queue.get(timeout=timeout)
        except queue.Empty:
            return None

    def _stream_worker(self):
        """Background streaming worker."""
        def request_generator():
            # Send configuration first
            yield texttospeech.StreamingSynthesizeRequest(
                streaming_config=self.config
            )

            # Send text inputs as they arrive
            while True:
                text = self.text_queue.get()
                if text is None:  # Sentinel ends the stream
                    break
                yield texttospeech.StreamingSynthesizeRequest(
                    input=texttospeech.StreamingSynthesisInput(text=text)
                )

        try:
            # Start streaming
            responses = self.client.streaming_synthesize(request_generator())

            # Queue audio even after a stop was requested, so no chunks are lost
            for response in responses:
                if response.audio_content:
                    self.audio_queue.put(response.audio_content)

        except Exception as e:
            print(f"Streaming error: {e}")
        finally:
            self.audio_queue.put(None)  # Signal end of audio

# Usage example
tts_stream = RealTimeTextToSpeech()
tts_stream.start_streaming()

# Add text for synthesis
tts_stream.add_text("Hello, this is real-time synthesis.")
tts_stream.add_text("Each text chunk is processed immediately.")
tts_stream.add_text("Great for interactive applications!")

# End the input stream, then drain the audio queue until the end sentinel
tts_stream.stop_streaming()

audio_chunks = []
while True:
    audio_chunk = tts_stream.get_audio()
    if audio_chunk is None:
        break
    audio_chunks.append(audio_chunk)
    print(f"Got audio chunk: {len(audio_chunk)} bytes")

# Combine all audio and wrap the raw PCM in a WAV container
complete_audio = b''.join(audio_chunks)
with wave.open("realtime_output.wav", "wb") as f:
    f.setnchannels(1)      # mono
    f.setsampwidth(2)      # 16-bit samples
    f.setframerate(16000)  # matches the streaming config
    f.writeframes(complete_audio)
```

### Interactive Conversation Streaming

```api { .api }
from google.cloud import texttospeech

class ConversationSynthesizer:
    """Interactive conversation streaming synthesis."""

    def __init__(self):
        self.client = texttospeech.TextToSpeechClient()

    def synthesize_conversation(self, conversation_parts: list, output_file: str):
        """Synthesize a conversation with different voices for different speakers."""

        # Voice configurations for different speakers
        speaker_configs = {
            "speaker1": texttospeech.StreamingSynthesizeConfig(
                voice=texttospeech.VoiceSelectionParams(
                    language_code="en-US",
                    name="en-US-Neural2-A"  # First speaker's voice
                ),
                streaming_audio_config=texttospeech.StreamingAudioConfig(
                    audio_encoding=texttospeech.AudioEncoding.PCM,
                    sample_rate_hertz=22050
                )
            ),
            "speaker2": texttospeech.StreamingSynthesizeConfig(
                voice=texttospeech.VoiceSelectionParams(
                    language_code="en-US",
                    name="en-US-Neural2-C"  # Second speaker's voice
                ),
                streaming_audio_config=texttospeech.StreamingAudioConfig(
                    audio_encoding=texttospeech.AudioEncoding.PCM,
                    sample_rate_hertz=22050
                )
            )
        }

        # Synthesize each part in conversation order so the dialogue stays
        # interleaved; each part uses its speaker's voice configuration.
        final_audio = []
        for part in conversation_parts:
            config = speaker_configs.get(part.get('speaker'))
            if config is None:
                continue

            def request_generator(cfg=config, text=part['text']):
                yield texttospeech.StreamingSynthesizeRequest(streaming_config=cfg)
                yield texttospeech.StreamingSynthesizeRequest(
                    input=texttospeech.StreamingSynthesisInput(text=text)
                )

            responses = self.client.streaming_synthesize(request_generator())
            for response in responses:
                if response.audio_content:
                    final_audio.append(response.audio_content)

        # Save the complete conversation (raw PCM; add a WAV header for playback)
        with open(output_file, "wb") as f:
            f.write(b''.join(final_audio))

        return output_file

# Usage example
conversation = [
    {"speaker": "speaker1", "text": "Hello! How are you today?"},
    {"speaker": "speaker2", "text": "I'm doing great, thanks for asking!"},
    {"speaker": "speaker1", "text": "That's wonderful to hear. What are your plans?"},
    {"speaker": "speaker2", "text": "I'm planning to work on some exciting projects."}
]

synthesizer = ConversationSynthesizer()
output_file = synthesizer.synthesize_conversation(conversation, "conversation.wav")
print(f"Conversation saved to {output_file}")
```

### Chunked Text Streaming

```api { .api }
import re
from google.cloud import texttospeech

def stream_long_text(text: str, chunk_size: int = 100):
    """Stream long text by breaking it into manageable chunks."""
    client = texttospeech.TextToSpeechClient()

    # Configure streaming for long content
    config = texttospeech.StreamingSynthesizeConfig(
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Wavenet-A"
        ),
        streaming_audio_config=texttospeech.StreamingAudioConfig(
            audio_encoding=texttospeech.AudioEncoding.PCM,
            sample_rate_hertz=22050
        )
    )

    # Smart text chunking (respect sentence boundaries)
    def smart_chunk_text(text: str, max_size: int):
        """Break text into chunks at sentence boundaries when possible."""
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            if len(current_chunk + sentence) <= max_size:
                current_chunk += sentence + " "
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + " "

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    # Create text chunks
    text_chunks = smart_chunk_text(text, chunk_size)

    def request_generator():
        # Configuration request
        yield texttospeech.StreamingSynthesizeRequest(streaming_config=config)

        # Send text chunks
        for i, chunk in enumerate(text_chunks):
            print(f"Streaming chunk {i+1}/{len(text_chunks)}: {len(chunk)} chars")
            yield texttospeech.StreamingSynthesizeRequest(
                input=texttospeech.StreamingSynthesisInput(text=chunk)
            )

    # Stream and collect results
    responses = client.streaming_synthesize(request_generator())

    audio_chunks = []
    chunk_count = 0

    for response in responses:
        if response.audio_content:
            chunk_count += 1
            audio_chunks.append(response.audio_content)
            print(f"Received audio chunk {chunk_count}: {len(response.audio_content)} bytes")

    return b''.join(audio_chunks)

# Usage with long text
long_text = """
This is a very long piece of text that demonstrates streaming synthesis
with automatic chunking. The system will break this text into smaller
pieces and stream them to the Text-to-Speech API. This approach is useful
for processing long documents, articles, or books where you want to start
receiving audio output before the entire text is processed. The streaming
approach also helps manage memory usage and provides better user experience
for real-time applications.
"""

audio_data = stream_long_text(long_text, chunk_size=80)
with open("streamed_long_text.wav", "wb") as f:
    f.write(audio_data)  # Raw PCM bytes; add a WAV header for playback
```
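
Joining every chunk in memory partly defeats the purpose for very long inputs. The consuming side can instead write audio to disk as it arrives; a sketch using the standard-library `wave` module (16-bit mono PCM assumed) that keeps memory usage flat:

```api { .api }
import wave

def write_responses_to_wav(responses, path: str, sample_rate_hertz: int = 22050):
    """Write streaming audio chunks to a WAV file as they arrive."""
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)  # mono
        wav_file.setsampwidth(2)  # 16-bit PCM
        wav_file.setframerate(sample_rate_hertz)
        for response in responses:
            if response.audio_content:
                wav_file.writeframes(response.audio_content)  # header fixed on close

# Usage (with a request generator like the one in stream_long_text):
# responses = client.streaming_synthesize(request_generator())
# write_responses_to_wav(responses, "streamed_long_text.wav")
```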

## Performance Optimization

### Low-Latency Streaming

```api { .api }
import time
from google.cloud import texttospeech

def create_low_latency_stream_config():
    """Create an optimized configuration for minimal latency."""

    return texttospeech.StreamingSynthesizeConfig(
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Standard-A",  # Standard voices have lower latency
            advanced_voice_options=texttospeech.AdvancedVoiceOptions(
                low_latency_journey_synthesis=True
            )
        ),
        streaming_audio_config=texttospeech.StreamingAudioConfig(
            audio_encoding=texttospeech.AudioEncoding.PCM,  # Uncompressed
            sample_rate_hertz=16000,  # Lower sample rate
            speaking_rate=1.1         # Slightly faster speech
        )
    )

def optimized_streaming_synthesis(text_parts: list):
    """Optimized streaming for real-time applications."""
    client = texttospeech.TextToSpeechClient()

    config = create_low_latency_stream_config()

    def fast_request_generator():
        yield texttospeech.StreamingSynthesizeRequest(streaming_config=config)

        for text in text_parts:
            # Send smaller chunks for faster processing
            if len(text) > 50:
                # Break into smaller pieces
                words = text.split()
                chunk_size = 10  # words per chunk
                for i in range(0, len(words), chunk_size):
                    chunk = " ".join(words[i:i + chunk_size])
                    yield texttospeech.StreamingSynthesizeRequest(
                        input=texttospeech.StreamingSynthesisInput(text=chunk)
                    )
            else:
                yield texttospeech.StreamingSynthesizeRequest(
                    input=texttospeech.StreamingSynthesisInput(text=text)
                )

    # Process with timing
    start_time = time.time()

    responses = client.streaming_synthesize(fast_request_generator())
    first_response_time = None
    audio_chunks = []

    for response in responses:
        if response.audio_content:
            if first_response_time is None:
                first_response_time = time.time()
                print(f"First audio received in: {first_response_time - start_time:.2f}s")

            audio_chunks.append(response.audio_content)

    total_time = time.time() - start_time
    print(f"Total streaming time: {total_time:.2f}s")

    return b''.join(audio_chunks)
```
692
693
### Error Handling for Streaming
694
695
```api { .api }
696
from google.api_core import exceptions
697
import logging
698
699
def robust_streaming_synthesis(text_parts: list, max_retries: int = 3):
700
"""Streaming synthesis with comprehensive error handling."""
701
702
client = texttospeech.TextToSpeechClient()
703
704
config = texttospeech.StreamingSynthesizeConfig(
705
voice=texttospeech.VoiceSelectionParams(language_code="en-US"),
706
audio_config=texttospeech.StreamingAudioConfig(
707
audio_encoding=texttospeech.AudioEncoding.LINEAR16,
708
sample_rate_hertz=22050
709
)
710
)
711
712
for attempt in range(max_retries):
713
try:
714
def request_generator():
715
yield texttospeech.StreamingSynthesizeRequest(streaming_config=config)
716
717
for text in text_parts:
718
yield texttospeech.StreamingSynthesizeRequest(
719
input=texttospeech.StreamingSynthesisInput(text=text)
720
)
721
722
# Attempt streaming
723
responses = client.streaming_synthesize(request_generator())
724
725
audio_chunks = []
726
for response in responses:
727
if response.audio_content:
728
audio_chunks.append(response.audio_content)
729
730
return b''.join(audio_chunks)
731
732
except exceptions.DeadlineExceeded as e:
733
logging.warning(f"Streaming timeout (attempt {attempt + 1}): {e}")
734
if attempt == max_retries - 1:
735
raise
736
737
except exceptions.ResourceExhausted as e:
738
logging.warning(f"Rate limit exceeded (attempt {attempt + 1}): {e}")
739
if attempt == max_retries - 1:
740
raise
741
# Wait before retry
742
import time
743
time.sleep(2 ** attempt) # Exponential backoff
744
745
except exceptions.ServiceUnavailable as e:
746
logging.warning(f"Service unavailable (attempt {attempt + 1}): {e}")
747
if attempt == max_retries - 1:
748
raise
749
import time
750
time.sleep(1)
751
752
except Exception as e:
753
logging.error(f"Unexpected streaming error: {e}")
754
raise
755
756
raise RuntimeError(f"Streaming failed after {max_retries} attempts")
757
758
# Usage with error handling
759
try:
760
text_parts = [
761
"This is the first part of the streaming text.",
762
"Here's the second part with more content.",
763
"And finally, this is the conclusion."
764
]
765
766
audio_result = robust_streaming_synthesis(text_parts)
767
print(f"Successfully generated {len(audio_result)} bytes of audio")
768
769
except Exception as e:
770
print(f"Streaming synthesis failed: {e}")
771
```