# Long Audio Synthesis

## Overview

Long audio synthesis is designed for generating extended audio content that exceeds the limits of standard synthesis operations. It uses Google Cloud's long-running operations (LRO) pattern to handle large-scale text-to-speech generation asynchronously, with output delivered to Google Cloud Storage.

**Key Features:**

- Supports very large text inputs (up to several hours of audio)
- Asynchronous processing with operation monitoring
- Direct output to Google Cloud Storage
- Progress tracking and metadata
- Suitable for audiobooks, long documents, and batch processing
## Client Setup
14
15
### Long Audio Synthesis Clients
16
17
```api { .api }
18
from google.cloud.texttospeech_v1.services import text_to_speech_long_audio_synthesize
19
20
# Synchronous long audio client
21
long_client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()
22
23
# Asynchronous long audio client
24
async_long_client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeAsyncClient()
25
26
# Alternative import paths
27
from google.cloud import texttospeech_v1
28
29
# Through main module
30
long_client = texttospeech_v1.services.text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()
31
```
32
33
### Authentication and Project Setup
34
35
```api { .api }
36
import os
37
from google.cloud.texttospeech_v1.services import text_to_speech_long_audio_synthesize
38
39
# Set up authentication (if not using default credentials)
40
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/path/to/service-account-key.json'
41
42
# Initialize with explicit project
43
client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()
44
45
# Project and location information
46
PROJECT_ID = "your-project-id"
47
LOCATION = "us-central1" # or other supported location
48
PARENT = f"projects/{PROJECT_ID}/locations/{LOCATION}"
49
```
50
51
## Core Long Audio Operations
52
53
### Basic Long Audio Synthesis
54
55
```api { .api }
56
from google.cloud import texttospeech_v1
57
from google.cloud.texttospeech_v1.services import text_to_speech_long_audio_synthesize
58
59
# Initialize client
60
client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()
61
62
# Create long audio synthesis request
63
request = texttospeech_v1.SynthesizeLongAudioRequest(
64
parent="projects/your-project-id/locations/us-central1",
65
input=texttospeech_v1.SynthesisInput(
66
text="This is a very long text that will be converted to audio. " * 100
67
),
68
audio_config=texttospeech_v1.AudioConfig(
69
audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16,
70
sample_rate_hertz=22050
71
),
72
voice=texttospeech_v1.VoiceSelectionParams(
73
language_code="en-US",
74
name="en-US-Wavenet-A"
75
),
76
output_gcs_uri="gs://your-bucket-name/output-audio.wav"
77
)
78
79
# Start long-running operation
80
operation = client.synthesize_long_audio(request=request)
81
82
print(f"Operation name: {operation.name}")
83
print("Long audio synthesis started...")
84
85
# Wait for completion
86
result = operation.result() # Blocks until complete
87
88
print("Long audio synthesis completed!")
89
print(f"Result: {result}")
90
```
91
92
### SSML Long Audio Synthesis
93
94
```api { .api }
95
from google.cloud import texttospeech_v1
96
97
# Prepare long SSML content
98
long_ssml_content = """
99
<speak>
100
<p>
101
<s>Welcome to this long audio demonstration.</s>
102
<s>This content will be processed as a long-running operation.</s>
103
</p>
104
105
<break time="2s"/>
106
107
<p>
108
<s>Here we have multiple paragraphs with various SSML features.</s>
109
<s><prosody rate="slow">This part is spoken slowly.</prosody></s>
110
<s><prosody rate="fast">While this part is much faster.</prosody></s>
111
</p>
112
113
<break time="3s"/>
114
115
<p>
116
<s><emphasis level="strong">This is emphasized text.</emphasis></s>
117
<s>And this concludes our long audio sample.</s>
118
</p>
119
</speak>
120
"""
121
122
# Create request with SSML
123
request = texttospeech_v1.SynthesizeLongAudioRequest(
124
parent="projects/your-project-id/locations/us-central1",
125
input=texttospeech_v1.SynthesisInput(ssml=long_ssml_content),
126
audio_config=texttospeech_v1.AudioConfig(
127
audio_encoding=texttospeech_v1.AudioEncoding.MP3,
128
speaking_rate=1.0,
129
pitch=0.0,
130
volume_gain_db=0.0
131
),
132
voice=texttospeech_v1.VoiceSelectionParams(
133
language_code="en-US",
134
name="en-US-Neural2-A"
135
),
136
output_gcs_uri="gs://your-bucket-name/long-ssml-output.mp3"
137
)
138
139
client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()
140
operation = client.synthesize_long_audio(request=request)
141
```
142
143
## Request and Response Types
144
145
### SynthesizeLongAudioRequest
146
147
```api { .api }
148
from google.cloud.texttospeech_v1 import (
149
SynthesizeLongAudioRequest,
150
SynthesisInput,
151
AudioConfig,
152
VoiceSelectionParams,
153
AudioEncoding
154
)
155
156
# Complete long audio request configuration
157
request = SynthesizeLongAudioRequest(
158
parent="projects/your-project-id/locations/us-central1", # Required: parent resource
159
160
input=SynthesisInput(
161
text="Long text content to synthesize..." # or ssml="<speak>...</speak>"
162
),
163
164
audio_config=AudioConfig(
165
audio_encoding=AudioEncoding.LINEAR16, # Audio format
166
sample_rate_hertz=24000, # Sample rate
167
speaking_rate=1.0, # Speech rate
168
pitch=0.0, # Pitch adjustment
169
volume_gain_db=0.0, # Volume gain
170
effects_profile_id=["large-home-entertainment-class-device"] # Audio effects
171
),
172
173
voice=VoiceSelectionParams(
174
language_code="en-US", # Required: language
175
name="en-US-Wavenet-D", # Specific voice
176
ssml_gender=texttospeech_v1.SsmlVoiceGender.FEMALE
177
),
178
179
output_gcs_uri="gs://your-bucket-name/path/output.wav" # Required: GCS output location
180
)
181
182
# Request with custom pronunciations
183
request_with_pronunciations = SynthesizeLongAudioRequest(
184
parent="projects/your-project-id/locations/us-central1",
185
input=SynthesisInput(text="Text with custom pronunciations for API and JSON terms."),
186
audio_config=AudioConfig(
187
audio_encoding=AudioEncoding.MP3,
188
sample_rate_hertz=22050
189
),
190
voice=VoiceSelectionParams(
191
language_code="en-US",
192
name="en-US-Neural2-A",
193
custom_pronunciations=texttospeech_v1.CustomPronunciations(
194
pronunciations=[
195
texttospeech_v1.CustomPronunciationParams(
196
phrase="API",
197
ipa="ˌeɪ piː ˈaɪ",
198
phonetic_encoding=texttospeech_v1.CustomPronunciationParams.PhoneticEncoding.IPA
199
)
200
]
201
)
202
),
203
output_gcs_uri="gs://your-bucket-name/custom-pronunciation-output.mp3"
204
)
205
```
206
207
### SynthesizeLongAudioResponse and Metadata
208
209
```api { .api }
210
from google.cloud.texttospeech_v1 import SynthesizeLongAudioResponse, SynthesizeLongAudioMetadata
211
212
# Response object (returned when operation completes)
213
# SynthesizeLongAudioResponse is typically empty - the audio is written to GCS
214
215
# Metadata object (available during operation)
216
def process_operation_metadata(operation):
    """Print progress information from a long-running synthesis operation.

    ``operation`` is the ``google.api_core.operation.Operation`` returned by
    ``synthesize_long_audio``.  Its ``metadata`` property is *already*
    deserialized into ``SynthesizeLongAudioMetadata`` by api_core, so the
    previous ``Any.Unpack`` call was unnecessary and would fail.

    Returns the metadata message, or None when no metadata is available yet.
    """
    metadata = operation.metadata
    if metadata is None:
        return None

    print(f"Progress: {metadata.progress_percentage}%")
    print(f"Start time: {metadata.start_time}")

    if metadata.last_update_time:
        print(f"Last update: {metadata.last_update_time}")

    return metadata
231
232
# Access operation result
233
def get_operation_result(operation):
    """Return the result of a completed operation, or None.

    ``operation`` is a ``google.api_core.operation.Operation`` future.
    The previous version read ``operation.error``, which does not exist on
    the api_core future; the failure (if any) is obtained via
    ``operation.exception()``.
    """
    if not operation.done():
        print(f"Operation still running: {operation.name}")
        return None

    error = operation.exception()
    if error is not None:
        print(f"Operation failed: {error}")
        return None

    result = operation.result()
    print("Operation completed successfully")
    # Result is typically empty - check GCS for the output file
    return result
248
```
249
250
## Operation Management
251
252
### Monitoring Long-Running Operations
253
254
```api { .api }
255
import time
256
from google.api_core import operation
257
from google.cloud.texttospeech_v1.services import text_to_speech_long_audio_synthesize
258
259
def monitor_long_audio_operation(operation_name: str, check_interval: int = 30) -> bool:
    """Poll a long-running synthesis operation until it finishes.

    Returns True when the operation completed successfully, False when it
    finished with an error.

    Note: ``get_operation`` returns the raw ``google.longrunning`` Operation
    *proto*, so ``done`` is a bool field (not a method) and ``error`` must be
    tested with ``HasField`` -- a protobuf message field is always truthy.
    """
    client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()

    op = client.get_operation(request={"name": operation_name})
    print(f"Monitoring operation: {operation_name}")

    while not op.done:  # proto bool field, not a method
        if op.metadata:
            try:
                # op.metadata is a packed protobuf Any.
                # NOTE(review): if SynthesizeLongAudioMetadata is a proto-plus
                # class, Unpack may need the underlying pb type -- verify.
                metadata = texttospeech_v1.SynthesizeLongAudioMetadata()
                op.metadata.Unpack(metadata)

                progress = getattr(metadata, 'progress_percentage', 0)
                print(f"Progress: {progress}%")

                if getattr(metadata, 'start_time', None):
                    print(f"Started at: {metadata.start_time}")
            except Exception as e:
                print(f"Could not parse metadata: {e}")

        print(f"Operation still running. Checking again in {check_interval} seconds...")
        time.sleep(check_interval)

        # Refresh operation status from the server.
        op = client.get_operation(request={"name": operation_name})

    # Operation completed: message fields are always truthy, so use HasField.
    if op.HasField("error"):
        print(f"Operation failed: {op.error}")
        return False

    print("Operation completed successfully!")
    print("Output should be available at the specified GCS URI")
    return True
299
300
# Usage
301
# operation_name = "projects/your-project/locations/us-central1/operations/long-operation-id"
302
# success = monitor_long_audio_operation(operation_name)
303
```
304
305
### Cancelling Operations
306
307
```api { .api }
308
def cancel_long_audio_operation(operation_name: str) -> bool:
    """Request cancellation of a running long audio synthesis operation.

    Returns True when cancellation is in progress or succeeded, False when
    the operation completed before cancellation or the request failed.

    Note: ``get_operation`` returns the raw longrunning Operation proto.
    ``done`` is a bool field (not a method) and there is no ``cancelled()``
    method -- a cancelled LRO completes with status code 1 (CANCELLED).
    """
    client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()

    try:
        # Ask the service to cancel the operation.
        client.cancel_operation(request={"name": operation_name})
        print(f"Cancellation requested for operation: {operation_name}")

        # Check whether cancellation has taken effect yet.
        op = client.get_operation(request={"name": operation_name})

        if not op.done:  # proto bool field
            print("Cancellation in progress...")
            return True

        # Finished: a cancelled operation carries error code CANCELLED (1).
        if op.HasField("error") and op.error.code == 1:
            print("Operation successfully cancelled")
            return True

        print("Operation completed before cancellation")
        return False

    except Exception as e:
        print(f"Failed to cancel operation: {e}")
        return False
335
336
# Usage
337
# cancel_long_audio_operation("projects/your-project/locations/us-central1/operations/op-id")
338
```
339
340
### Listing Operations
341
342
```api { .api }
343
def list_long_audio_operations(project_id: str, location: str = "us-central1"):
    """List all long audio synthesis operations for a project location.

    Returns the iterable of raw longrunning Operation protos from
    ``list_operations``, or an empty list on failure.

    Note: each ``op`` is a proto, so ``done`` is a bool field (not a
    method) and ``error`` must be checked with ``HasField``.
    """
    client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()

    parent = f"projects/{project_id}/locations/{location}"

    try:
        operations = client.list_operations(request={"name": parent})

        print(f"Operations in {parent}:")

        for op in operations:
            print(f"\nOperation: {op.name}")
            print(f"Done: {op.done}")  # proto bool field, not a method

            if op.done:
                if op.HasField("error"):  # message fields are always truthy
                    print(f"Error: {op.error}")
                else:
                    print("Status: Completed successfully")
            else:
                print("Status: Running")

                # Try to read progress from the packed metadata Any.
                if op.metadata:
                    try:
                        metadata = texttospeech_v1.SynthesizeLongAudioMetadata()
                        op.metadata.Unpack(metadata)
                        progress = getattr(metadata, 'progress_percentage', 0)
                        print(f"Progress: {progress}%")
                    except Exception:  # never use a bare except
                        print("Progress: Unknown")

        return operations

    except Exception as e:
        print(f"Failed to list operations: {e}")
        return []
383
384
# Usage
385
# operations = list_long_audio_operations("your-project-id")
386
```
387
388
## Practical Examples
389
390
### Audiobook Generation
391
392
```api { .api }
393
import os
394
from google.cloud import storage
395
from google.cloud.texttospeech_v1.services import text_to_speech_long_audio_synthesize
396
397
class AudiobookGenerator:
    """Generate audiobooks from long text content.

    Wraps the long-audio synthesis client and a Cloud Storage client so an
    audiobook can be requested, monitored, and downloaded in one place.
    """

    def __init__(self, project_id: str, bucket_name: str, location: str = "us-central1"):
        self.project_id = project_id
        self.bucket_name = bucket_name
        self.location = location
        self.parent = f"projects/{project_id}/locations/{location}"

        # Initialize clients
        self.tts_client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()
        self.storage_client = storage.Client()

    def generate_audiobook(self, text_content: str, output_filename: str,
                           voice_name: str = "en-US-Wavenet-A",
                           language_code: str = "en-US"):
        """Start audiobook synthesis.

        Returns a dict with the operation, its name and the output GCS URI,
        or None when the bucket could not be prepared.
        """
        # Ensure the GCS bucket exists before pointing the API at it.
        try:
            bucket = self.storage_client.bucket(self.bucket_name)
            if not bucket.exists():
                bucket = self.storage_client.create_bucket(self.bucket_name)
                print(f"Created bucket: {self.bucket_name}")
        except Exception as e:
            print(f"Bucket setup error: {e}")
            return None

        gcs_uri = f"gs://{self.bucket_name}/{output_filename}"

        request = texttospeech_v1.SynthesizeLongAudioRequest(
            parent=self.parent,
            input=texttospeech_v1.SynthesisInput(text=text_content),
            audio_config=texttospeech_v1.AudioConfig(
                audio_encoding=texttospeech_v1.AudioEncoding.MP3,
                sample_rate_hertz=22050,
                speaking_rate=0.9,  # slightly slower for audiobooks
                volume_gain_db=2.0  # boost volume
            ),
            voice=texttospeech_v1.VoiceSelectionParams(
                language_code=language_code,
                name=voice_name
            ),
            output_gcs_uri=gcs_uri
        )

        print("Starting audiobook generation...")
        print(f"Output will be saved to: {gcs_uri}")

        operation = self.tts_client.synthesize_long_audio(request=request)

        return {
            'operation': operation,
            'operation_name': operation.name,
            'output_uri': gcs_uri
        }

    def wait_for_audiobook(self, operation, check_interval: int = 60):
        """Block until generation completes; True on success, False on error.

        BUGFIX: ``operation`` is a ``google.api_core.operation.Operation``
        future whose ``done()`` re-polls the server on each call.  The old
        code replaced it mid-loop with the raw proto from ``get_operation``,
        after which calling ``done()`` (a proto *field*) crashed.
        """
        import time  # local import: the snippet header does not import time

        print("Waiting for audiobook generation to complete...")

        while not operation.done():  # done() refreshes state each call
            metadata = operation.metadata  # already deserialized by api_core
            if metadata is not None:
                progress = getattr(metadata, 'progress_percentage', 0)
                print(f"Progress: {progress}%")
            time.sleep(check_interval)

        error = operation.exception()
        if error is not None:
            print(f"Audiobook generation failed: {error}")
            return False

        print("Audiobook generation completed successfully!")
        return True

    def download_audiobook(self, gcs_uri: str, local_filename: str):
        """Download a generated audiobook from GCS to a local file.

        Raises ValueError for URIs that are not of the form
        gs://bucket/object.
        """
        if not gcs_uri.startswith("gs://"):
            raise ValueError("Invalid GCS URI")

        path_parts = gcs_uri[5:].split("/", 1)
        # Guard against a bucket-only URI such as "gs://bucket" (the old
        # version raised an opaque IndexError here).
        if len(path_parts) != 2 or not path_parts[1]:
            raise ValueError("GCS URI must include an object path")
        bucket_name, blob_name = path_parts

        # Download file
        bucket = self.storage_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)
        blob.download_to_filename(local_filename)
        print(f"Audiobook downloaded to: {local_filename}")

        # Report downloaded file size.
        file_size = os.path.getsize(local_filename)
        print(f"File size: {file_size / (1024*1024):.2f} MB")

        return local_filename
509
510
# Usage example
511
def generate_sample_audiobook():
    """Generate a sample audiobook."""

    # Sample long text (could be loaded from file)
    sample_text = """
    Chapter 1: Introduction

    Welcome to this sample audiobook demonstration. This text will be converted
    into high-quality speech using Google Cloud Text-to-Speech long audio synthesis.

    The long audio synthesis feature is specifically designed for content like this,
    where the text is too long for standard synthesis operations. It processes the
    content asynchronously and delivers the results to Google Cloud Storage.

    Chapter 2: Features

    Long audio synthesis supports all the same features as standard synthesis,
    including SSML markup, custom voices, and audio configuration options.
    The main difference is that it can handle much larger amounts of text
    and processes them as long-running operations.

    This makes it ideal for generating audiobooks, processing long documents,
    or creating extended audio content for podcasts and presentations.

    Chapter 3: Conclusion

    Thank you for listening to this sample audiobook. The long audio synthesis
    feature provides a powerful way to convert large amounts of text into
    natural-sounding speech.
    """ * 5  # Repeat to make it longer

    generator = AudiobookGenerator(
        project_id="your-project-id",
        bucket_name="your-audiobook-bucket"
    )

    # Kick off synthesis; result carries the operation and output URI.
    result = generator.generate_audiobook(
        text_content=sample_text,
        output_filename="sample_audiobook.mp3",
        voice_name="en-US-Wavenet-A"
    )

    if not result:
        return None

    # Wait for the operation, then fetch the finished file locally.
    if generator.wait_for_audiobook(result['operation']):
        generator.download_audiobook(
            result['output_uri'],
            "local_audiobook.mp3"
        )
        print("Audiobook generation complete!")

    return result
569
570
# Run the example
571
# audiobook_result = generate_sample_audiobook()
572
```
573
574
### Batch Document Processing
575
576
```api { .api }
577
import concurrent.futures
578
from typing import List, Dict
579
580
class BatchDocumentProcessor:
    """Process multiple documents for long audio synthesis.

    Starts one long-running synthesis operation per document and can poll
    the whole batch until every operation settles.
    """

    def __init__(self, project_id: str, bucket_name: str, location: str = "us-central1"):
        self.project_id = project_id
        self.bucket_name = bucket_name
        self.location = location
        self.parent = f"projects/{project_id}/locations/{location}"

        self.client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()

    def process_document_batch(self, documents: List[Dict], max_workers: int = 5):
        """Start synthesis for each document using a thread pool.

        Each document dict needs 'name' and 'content', plus optional
        'voice' and 'audio' config dicts.  Returns one result dict per
        document with a success flag, operation name and output URI.
        """

        def process_single_document(doc_info):
            """Build and send the request for a single document."""
            try:
                doc_name = doc_info['name']
                text_content = doc_info['content']
                voice_config = doc_info.get('voice', {})
                audio_config = doc_info.get('audio', {})

                # Fall back to sensible defaults when not specified.
                voice_name = voice_config.get('name', 'en-US-Wavenet-A')
                language_code = voice_config.get('language_code', 'en-US')
                audio_encoding = audio_config.get('encoding', texttospeech_v1.AudioEncoding.MP3)
                sample_rate = audio_config.get('sample_rate', 22050)

                # NOTE(review): the output object always gets an .mp3 suffix
                # even when another encoding is requested -- confirm intended.
                output_uri = f"gs://{self.bucket_name}/batch/{doc_name}.mp3"

                request = texttospeech_v1.SynthesizeLongAudioRequest(
                    parent=self.parent,
                    input=texttospeech_v1.SynthesisInput(text=text_content),
                    audio_config=texttospeech_v1.AudioConfig(
                        audio_encoding=audio_encoding,
                        sample_rate_hertz=sample_rate
                    ),
                    voice=texttospeech_v1.VoiceSelectionParams(
                        language_code=language_code,
                        name=voice_name
                    ),
                    output_gcs_uri=output_uri
                )

                operation = self.client.synthesize_long_audio(request=request)

                return {
                    'document': doc_name,
                    'operation_name': operation.name,
                    'output_uri': output_uri,
                    'success': True,
                    'operation': operation
                }

            except Exception as e:
                return {
                    'document': doc_info['name'],
                    'operation_name': None,
                    'output_uri': None,
                    'success': False,
                    'error': str(e)
                }

        # Fan out over a thread pool; map preserves document order.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(process_single_document, documents))

        return results

    def monitor_batch_operations(self, operation_results: List[Dict],
                                 check_interval: int = 30):
        """Poll all started operations until each completes or fails.

        BUGFIX: ``get_operation`` returns the raw longrunning Operation
        proto, so ``done`` is a bool field (the old ``operation.done()``
        call crashed) and ``error`` is a Status message that must be
        tested with ``HasField`` -- a message field is always truthy.
        """
        import time  # local import: the snippet header does not import time

        pending_operations = [r for r in operation_results if r['success']]
        completed_operations = []

        print(f"Monitoring {len(pending_operations)} operations...")

        while pending_operations:
            still_pending = []

            for op_result in pending_operations:
                try:
                    operation = self.client.get_operation(
                        request={"name": op_result['operation_name']}
                    )

                    if operation.done:  # proto bool field
                        if operation.HasField("error"):
                            op_result['final_status'] = 'failed'
                            op_result['error'] = str(operation.error)
                            print(f"❌ {op_result['document']}: Failed")
                        else:
                            op_result['final_status'] = 'completed'
                            print(f"✅ {op_result['document']}: Completed")

                        completed_operations.append(op_result)
                    else:
                        # Still running: try to report progress.
                        if operation.metadata:
                            try:
                                metadata = texttospeech_v1.SynthesizeLongAudioMetadata()
                                operation.metadata.Unpack(metadata)
                                progress = getattr(metadata, 'progress_percentage', 0)
                                print(f"⏳ {op_result['document']}: {progress}%")
                            except Exception:  # never use a bare except
                                print(f"⏳ {op_result['document']}: In progress...")

                        still_pending.append(op_result)

                except Exception as e:
                    print(f"Error checking {op_result['document']}: {e}")
                    still_pending.append(op_result)

            pending_operations = still_pending

            if pending_operations:
                print(f"\n{len(pending_operations)} operations still running. "
                      f"Checking again in {check_interval} seconds...\n")
                time.sleep(check_interval)

        print("\nBatch processing complete!")
        print(f"Completed: {len([op for op in completed_operations if op.get('final_status') == 'completed'])}")
        print(f"Failed: {len([op for op in completed_operations if op.get('final_status') == 'failed'])}")

        return completed_operations
710
711
# Usage example
712
def batch_process_example():
    """Example of batch processing multiple documents."""

    # One spec per document: (name, ordinal word, voice, encoding, sample rate)
    specs = [
        ('document1', 'first', 'en-US-Neural2-A',
         texttospeech_v1.AudioEncoding.MP3, 22050),
        ('document2', 'second', 'en-US-Wavenet-D',
         texttospeech_v1.AudioEncoding.LINEAR16, 24000),
        ('document3', 'third', 'en-US-Standard-B',
         texttospeech_v1.AudioEncoding.OGG_OPUS, 48000),
    ]

    # Expand the specs into the document dicts the processor expects.
    documents = [
        {
            'name': doc_name,
            'content': f'This is the {ordinal} document content. ' * 100,
            'voice': {'name': voice, 'language_code': 'en-US'},
            'audio': {'encoding': encoding, 'sample_rate': rate},
        }
        for doc_name, ordinal, voice, encoding, rate in specs
    ]

    processor = BatchDocumentProcessor(
        project_id="your-project-id",
        bucket_name="your-batch-bucket"
    )

    # Start the batch, then poll every operation until it settles.
    results = processor.process_document_batch(documents, max_workers=3)
    final_results = processor.monitor_batch_operations(results)

    return final_results
750
751
# Run batch processing
752
# batch_results = batch_process_example()
753
```
754
755
## Error Handling and Best Practices
756
757
### Comprehensive Error Handling
758
759
```api { .api }
760
from google.api_core import exceptions
761
import logging
762
763
def robust_long_audio_synthesis(text_content: str, output_gcs_uri: str,
                                project_id: str, location: str = "us-central1"):
    """Start long audio synthesis, translating failures into result dicts.

    Returns ``{'success': True, 'operation': ..., 'operation_name': ...}``
    on success, otherwise ``{'success': False, 'error': ..., 'details': ...}``
    describing the failure category.
    """
    synth_client = text_to_speech_long_audio_synthesize.TextToSpeechLongAudioSynthesizeClient()
    parent_path = f"projects/{project_id}/locations/{location}"

    try:
        # Basic input validation before touching the API.
        if not text_content or not text_content.strip():
            raise ValueError("Text content cannot be empty")
        if not output_gcs_uri.startswith("gs://"):
            raise ValueError("Output URI must be a valid GCS URI (gs://...)")

        synthesis_request = texttospeech_v1.SynthesizeLongAudioRequest(
            parent=parent_path,
            input=texttospeech_v1.SynthesisInput(text=text_content),
            audio_config=texttospeech_v1.AudioConfig(
                audio_encoding=texttospeech_v1.AudioEncoding.MP3,
                sample_rate_hertz=22050
            ),
            voice=texttospeech_v1.VoiceSelectionParams(
                language_code="en-US",
                name="en-US-Neural2-A"
            ),
            output_gcs_uri=output_gcs_uri
        )

        # Kick off the long-running operation.
        lro = synth_client.synthesize_long_audio(request=synthesis_request)
        return {
            'success': True,
            'operation': lro,
            'operation_name': lro.name
        }

    except exceptions.InvalidArgument as e:
        logging.error(f"Invalid request parameters: {e}")
        return {'success': False, 'error': 'Invalid parameters', 'details': str(e)}
    except exceptions.PermissionDenied as e:
        logging.error(f"Permission denied: {e}")
        return {'success': False, 'error': 'Permission denied', 'details': str(e)}
    except exceptions.ResourceExhausted as e:
        logging.error(f"Quota exceeded: {e}")
        return {'success': False, 'error': 'Quota exceeded', 'details': str(e)}
    except exceptions.FailedPrecondition as e:
        logging.error(f"Failed precondition: {e}")
        return {'success': False, 'error': 'Precondition failed', 'details': str(e)}
    except exceptions.NotFound as e:
        logging.error(f"Resource not found: {e}")
        return {'success': False, 'error': 'Resource not found', 'details': str(e)}
    except Exception as e:
        logging.error(f"Unexpected error: {e}")
        return {'success': False, 'error': 'Unexpected error', 'details': str(e)}
825
826
# Usage with error handling
827
result = robust_long_audio_synthesis(
828
text_content="Long text content...",
829
output_gcs_uri="gs://your-bucket/output.mp3",
830
project_id="your-project-id"
831
)
832
833
if result['success']:
834
print(f"Operation started: {result['operation_name']}")
835
else:
836
print(f"Error: {result['error']} - {result['details']}")
837
```
838
839
### Best Practices for Long Audio Synthesis
840
841
```api { .api }
842
class LongAudioBestPractices:
    """Best practices for long audio synthesis."""

    @staticmethod
    def validate_text_length(text: str) -> bool:
        """Return True when *text* is within the recommended size limit."""
        # Recommended maximum: ~1 million characters per request
        MAX_CHARS = 1_000_000

        if len(text) > MAX_CHARS:
            print(f"Warning: Text length ({len(text)}) exceeds recommended maximum ({MAX_CHARS})")
            return False

        return True

    @staticmethod
    def optimize_text_for_synthesis(text: str) -> str:
        """Normalize whitespace while preserving paragraph breaks.

        BUGFIX: the previous version collapsed ALL whitespace (including
        newlines) first, which destroyed every paragraph break and made
        the later paragraph-break regex dead code.  We now split into
        paragraphs first, normalize each one, then rejoin with blank lines.
        """
        import re

        paragraphs = []
        for raw_paragraph in re.split(r'\n\s*\n', text):
            if not raw_paragraph.strip():
                continue
            # Collapse whitespace runs within the paragraph only.
            paragraph = re.sub(r'\s+', ' ', raw_paragraph).strip()
            # Ensure a space after sentence-ending punctuation before a capital.
            paragraph = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', paragraph)
            paragraphs.append(paragraph)

        return '\n\n'.join(paragraphs)

    @staticmethod
    def choose_optimal_voice(content_type: str, language: str = "en-US") -> str:
        """Pick a voice name suited to the given content type."""
        voice_recommendations = {
            "audiobook": f"{language}-Wavenet-A",       # Clear, pleasant for long listening
            "news": f"{language}-Neural2-C",            # Authoritative
            "educational": f"{language}-Neural2-A",     # Clear, engaging
            "documentation": f"{language}-Standard-A",  # Clear, efficient
            "narrative": f"{language}-Wavenet-D"        # Expressive
        }

        return voice_recommendations.get(content_type, f"{language}-Neural2-A")

    @staticmethod
    def create_optimal_audio_config(use_case: str) -> "texttospeech_v1.AudioConfig":
        """Build an AudioConfig tuned for the given use case.

        The return annotation is quoted so the class can be defined even
        in modules where ``texttospeech_v1`` is not imported (the previous
        unquoted annotation raised NameError at class-definition time).
        """
        configs = {
            "audiobook": texttospeech_v1.AudioConfig(
                audio_encoding=texttospeech_v1.AudioEncoding.MP3,
                sample_rate_hertz=22050,
                speaking_rate=0.9,
                volume_gain_db=2.0
            ),
            "podcast": texttospeech_v1.AudioConfig(
                audio_encoding=texttospeech_v1.AudioEncoding.MP3,
                sample_rate_hertz=44100,
                speaking_rate=1.0,
                volume_gain_db=1.0,
                effects_profile_id=["large-home-entertainment-class-device"]
            ),
            "telephony": texttospeech_v1.AudioConfig(
                audio_encoding=texttospeech_v1.AudioEncoding.MULAW,
                sample_rate_hertz=8000,
                speaking_rate=1.1,
                effects_profile_id=["telephony-class-application"]
            ),
            "archive": texttospeech_v1.AudioConfig(
                audio_encoding=texttospeech_v1.AudioEncoding.LINEAR16,
                sample_rate_hertz=48000,
                speaking_rate=1.0
            )
        }

        return configs.get(use_case, configs["audiobook"])
919
920
# Apply best practices
921
def create_optimized_long_audio_request(text_content: str, output_uri: str,
                                        content_type: str = "audiobook"):
    """Create optimized long audio request following best practices."""

    # Warn when the text exceeds the recommended size.
    if not LongAudioBestPractices.validate_text_length(text_content):
        print("Consider breaking content into smaller chunks")

    optimized_text = LongAudioBestPractices.optimize_text_for_synthesis(text_content)

    # Pick a voice and audio configuration suited to the content type.
    selected_voice = LongAudioBestPractices.choose_optimal_voice(content_type)
    tuned_audio_config = LongAudioBestPractices.create_optimal_audio_config(content_type)

    return texttospeech_v1.SynthesizeLongAudioRequest(
        parent="projects/your-project-id/locations/us-central1",
        input=texttospeech_v1.SynthesisInput(text=optimized_text),
        audio_config=tuned_audio_config,
        voice=texttospeech_v1.VoiceSelectionParams(
            language_code="en-US",
            name=selected_voice
        ),
        output_gcs_uri=output_uri
    )
948
```