or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

async-clients.mdconfiguration-types.mdindex.mdlong-audio-synthesis.mdspeech-synthesis.mdstreaming-synthesis.mdvoice-management.md

configuration-types.mddocs/

0

# Configuration Types

1

2

## Overview

3

4

The Google Cloud Text-to-Speech API provides extensive configuration options through various classes and types. These configuration objects control voice selection, audio output, input formatting, and advanced features like custom pronunciations and multi-speaker synthesis.

5

6

## Core Configuration Classes

7

8

### SynthesisInput

9

10

```api { .api }

11

from google.cloud.texttospeech import SynthesisInput, MultiSpeakerMarkup

12

13

# Plain text input

14

text_input = SynthesisInput(

15

text="Convert this plain text to speech"

16

)

17

18

# SSML input

19

ssml_input = SynthesisInput(

20

ssml='<speak>Convert this <emphasis level="strong">SSML</emphasis> to speech</speak>'

21

)

22

23

# Multi-speaker markup input

24

multi_speaker_input = SynthesisInput(
    multi_speaker_markup=MultiSpeakerMarkup(
        turns=[
            MultiSpeakerMarkup.Turn(
                speaker="R",
                text="Hello from speaker one."
            ),
            MultiSpeakerMarkup.Turn(
                speaker="S",
                text="And greetings from speaker two."
            ),
        ]
    )
)

34

35

# SynthesisInput only accepts ONE of: text, ssml, or multi_speaker_markup

36

# Using multiple will raise an error

37

```

38

39

### VoiceSelectionParams

40

41

```api { .api }

42

from google.cloud.texttospeech import (

43

VoiceSelectionParams,

44

SsmlVoiceGender,

45

CustomPronunciations,

46

CustomPronunciationParams,

47

AdvancedVoiceOptions,

48

CustomVoiceParams,

49

VoiceCloneParams

50

)

51

52

# Basic voice selection

53

basic_voice = VoiceSelectionParams(

54

language_code="en-US", # Required: BCP-47 language code

55

ssml_gender=SsmlVoiceGender.FEMALE # Optional: voice gender preference

56

)

57

58

# Specific voice selection

59

specific_voice = VoiceSelectionParams(

60

language_code="en-US",

61

name="en-US-Wavenet-D" # Exact voice model name

62

)

63

64

# Voice with custom pronunciations

65

voice_with_pronunciations = VoiceSelectionParams(

66

language_code="en-US",

67

name="en-US-Neural2-A",

68

custom_pronunciations=CustomPronunciations(

69

pronunciations=[

70

CustomPronunciationParams(

71

phrase="GitHub",

72

ipa="ˈɡɪt hʌb",

73

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

74

),

75

CustomPronunciationParams(

76

phrase="API",

77

ipa="ˌeɪ piː ˈaɪ",

78

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

79

)

80

]

81

)

82

)

83

84

# Voice with advanced options

85

advanced_voice = VoiceSelectionParams(

86

language_code="en-US",

87

name="en-US-Neural2-C",

88

advanced_voice_options=AdvancedVoiceOptions(

89

low_latency_journey_synthesis=True # Enable low-latency processing

90

)

91

)

92

93

# Custom voice model

94

custom_voice = VoiceSelectionParams(

95

language_code="en-US",

96

custom_voice=CustomVoiceParams(

97

model="projects/your-project/locations/us-central1/models/custom-model"

98

)

99

)

100

101

# Voice cloning

102

cloned_voice = VoiceSelectionParams(

103

language_code="en-US",

104

voice_clone=VoiceCloneParams(

105

voice_clone_key="your-voice-clone-key"

106

)

107

)

108

```

109

110

### AudioConfig

111

112

```api { .api }

113

from google.cloud.texttospeech import AudioConfig, AudioEncoding

114

115

# Basic audio configuration

116

basic_audio = AudioConfig(

117

audio_encoding=AudioEncoding.MP3, # Required: output format

118

sample_rate_hertz=22050 # Optional: sample rate (Hz)

119

)

120

121

# Complete audio configuration

122

complete_audio = AudioConfig(

123

audio_encoding=AudioEncoding.LINEAR16, # Audio format

124

sample_rate_hertz=24000, # Sample rate

125

speaking_rate=1.0, # Speech rate (0.25-4.0)

126

pitch=0.0, # Pitch adjustment (-20.0 to 20.0)

127

volume_gain_db=0.0, # Volume gain (-96.0 to 16.0)

128

effects_profile_id=["large-home-entertainment-class-device"] # Audio effects

129

)

130

131

# High-quality audio configuration

132

high_quality_audio = AudioConfig(

133

audio_encoding=AudioEncoding.LINEAR16,

134

sample_rate_hertz=48000,

135

speaking_rate=0.95,

136

pitch=1.0,

137

volume_gain_db=2.0

138

)

139

140

# Compressed audio for streaming

141

streaming_audio = AudioConfig(

142

audio_encoding=AudioEncoding.OGG_OPUS,

143

sample_rate_hertz=48000,

144

speaking_rate=1.1,

145

effects_profile_id=["wearable-class-device"]

146

)

147

148

# Telephony optimized audio

149

telephony_audio = AudioConfig(

150

audio_encoding=AudioEncoding.MULAW,

151

sample_rate_hertz=8000,

152

speaking_rate=1.2,

153

effects_profile_id=["telephony-class-application"]

154

)

155

```

156

157

### Voice

158

159

```api { .api }

160

from google.cloud.texttospeech import Voice, SsmlVoiceGender

161

162

# Voice object (returned by list_voices())

163

# Contains voice information and capabilities

164

165

def analyze_voice_properties(voice: Voice):

166

"""Analyze properties of a Voice object."""

167

168

print(f"Name: {voice.name}") # e.g., "en-US-Wavenet-A"

169

print(f"Language Codes: {voice.language_codes}") # e.g., ["en-US"]

170

print(f"SSML Gender: {voice.ssml_gender}") # SsmlVoiceGender enum

171

print(f"Natural Sample Rate: {voice.natural_sample_rate_hertz} Hz") # e.g., 24000

172

173

# Voice categorization based on name

174

if "Neural2" in voice.name:

175

print("Type: Premium Neural Voice")

176

elif "Wavenet" in voice.name:

177

print("Type: High-Quality Neural Voice")

178

elif "Standard" in voice.name:

179

print("Type: Standard Voice")

180

elif "Studio" in voice.name:

181

print("Type: Studio Voice")

182

else:

183

print("Type: Custom or Special Voice")

184

185

# Example usage with actual Voice objects

186

# voices_response = client.list_voices()

187

# for voice in voices_response.voices:

188

# analyze_voice_properties(voice)

189

```

190

191

## Streaming Configuration Classes

192

193

### StreamingAudioConfig

194

195

```api { .api }

196

from google.cloud.texttospeech import StreamingAudioConfig, AudioEncoding

197

198

# Basic streaming audio configuration

199

streaming_basic = StreamingAudioConfig(

200

audio_encoding=AudioEncoding.LINEAR16, # Required: audio format

201

sample_rate_hertz=22050 # Required: sample rate

202

)

203

204

# Advanced streaming audio configuration

205

streaming_advanced = StreamingAudioConfig(

206

audio_encoding=AudioEncoding.OGG_OPUS, # Compressed format

207

sample_rate_hertz=48000, # High sample rate

208

speaking_rate=1.0, # Normal speech rate

209

pitch=0.0, # Neutral pitch

210

volume_gain_db=1.0, # Slight volume boost

211

effects_profile_id=["small-bluetooth-speaker-class-device"] # Audio effects

212

)

213

214

# Low-latency streaming configuration

215

streaming_low_latency = StreamingAudioConfig(

216

audio_encoding=AudioEncoding.LINEAR16,

217

sample_rate_hertz=16000, # Lower rate for speed

218

speaking_rate=1.1 # Slightly faster

219

)

220

221

# High-quality streaming configuration

222

streaming_high_quality = StreamingAudioConfig(

223

audio_encoding=AudioEncoding.LINEAR16,

224

sample_rate_hertz=48000,

225

speaking_rate=0.9, # Slightly slower

226

pitch=-0.5, # Lower pitch

227

volume_gain_db=2.0 # Volume boost

228

)

229

```

230

231

### StreamingSynthesizeConfig

232

233

```api { .api }

234

from google.cloud.texttospeech import (
    StreamingSynthesizeConfig,
    VoiceSelectionParams,
    StreamingAudioConfig,
    SsmlVoiceGender,
    AudioEncoding,
    AdvancedVoiceOptions,
)

239

240

# Complete streaming synthesis configuration

241

streaming_config = StreamingSynthesizeConfig(

242

voice=VoiceSelectionParams(

243

language_code="en-US",

244

name="en-US-Neural2-A",

245

ssml_gender=SsmlVoiceGender.FEMALE

246

),

247

audio_config=StreamingAudioConfig(

248

audio_encoding=AudioEncoding.LINEAR16,

249

sample_rate_hertz=22050,

250

speaking_rate=1.0,

251

pitch=0.0,

252

volume_gain_db=0.0

253

)

254

)

255

256

# Low-latency streaming configuration

257

low_latency_streaming = StreamingSynthesizeConfig(

258

voice=VoiceSelectionParams(

259

language_code="en-US",

260

name="en-US-Standard-B", # Standard voice for speed

261

advanced_voice_options=AdvancedVoiceOptions(

262

low_latency_journey_synthesis=True

263

)

264

),

265

audio_config=StreamingAudioConfig(

266

audio_encoding=AudioEncoding.LINEAR16,

267

sample_rate_hertz=16000 # Lower sample rate

268

)

269

)

270

271

# Multi-language streaming configuration

272

multilang_streaming = StreamingSynthesizeConfig(

273

voice=VoiceSelectionParams(

274

language_code="en-US",

275

name="en-US-Polyglot-1" # Polyglot voice if available

276

),

277

audio_config=StreamingAudioConfig(

278

audio_encoding=AudioEncoding.MP3,

279

sample_rate_hertz=24000

280

)

281

)

282

```

283

284

### StreamingSynthesisInput

285

286

```api { .api }

287

from google.cloud.texttospeech import StreamingSynthesisInput

288

289

# Text input for streaming

290

text_stream_input = StreamingSynthesisInput(

291

text="This text will be streamed to the synthesis service."

292

)

293

294

# SSML input for streaming

295

ssml_stream_input = StreamingSynthesisInput(

296

ssml='<speak>This <emphasis level="moderate">SSML content</emphasis> will be streamed.</speak>'

297

)

298

299

# Note: StreamingSynthesisInput accepts either text OR ssml, not both

300

# Each streaming request should contain one input chunk

301

```

302

303

## Advanced Configuration Classes

304

305

### AdvancedVoiceOptions

306

307

```api { .api }

308

from google.cloud.texttospeech import AdvancedVoiceOptions

309

310

# Advanced voice configuration

311

advanced_options = AdvancedVoiceOptions(

312

low_latency_journey_synthesis=True # Enable low-latency processing

313

)

314

315

# Usage in voice selection

316

voice_with_advanced = VoiceSelectionParams(

317

language_code="en-US",

318

name="en-US-Neural2-A",

319

advanced_voice_options=advanced_options

320

)

321

322

# Direct configuration

323

direct_advanced_voice = VoiceSelectionParams(

324

language_code="en-US",

325

name="en-US-Neural2-C",

326

advanced_voice_options=AdvancedVoiceOptions(

327

low_latency_journey_synthesis=True

328

)

329

)

330

```

331

332

### CustomPronunciations and CustomPronunciationParams

333

334

```api { .api }

335

from google.cloud.texttospeech import (

336

CustomPronunciations,

337

CustomPronunciationParams

338

)

339

340

# Individual pronunciation parameter
pronunciation_param = CustomPronunciationParams(
    phrase="PyTorch",  # Word or phrase to customize
    pronunciation="ˈpaɪ tɔrʧ",  # IPA pronunciation
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA  # Encoding type
)

# X-SAMPA encoding example
xsampa_param = CustomPronunciationParams(
    phrase="neural",
    pronunciation="n\"jU@r@l",  # X-SAMPA notation
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.X_SAMPA
)

353

354

# Collection of custom pronunciations

355

custom_pronunciations = CustomPronunciations(

356

pronunciations=[

357

CustomPronunciationParams(

358

phrase="TensorFlow",

359

ipa="ˈtɛnsər floʊ",

360

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

361

),

362

CustomPronunciationParams(

363

phrase="Kubernetes",

364

ipa="ˌkubərˈnɛtɪs",

365

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

366

),

367

CustomPronunciationParams(

368

phrase="OAuth",

369

ipa="ˈoʊ ɔːθ",

370

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

371

),

372

CustomPronunciationParams(

373

phrase="JSON",

374

ipa="ˈdʒeɪ sɒn",

375

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

376

)

377

]

378

)

379

380

# Technical terms pronunciations

381

tech_pronunciations = CustomPronunciations(

382

pronunciations=[

383

CustomPronunciationParams(

384

phrase="API", ipa="ˌeɪ piː ˈaɪ",

385

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

386

),

387

CustomPronunciationParams(

388

phrase="HTTP", ipa="ˌeɪʧ tiː tiː ˈpiː",

389

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

390

),

391

CustomPronunciationParams(

392

phrase="URL", ipa="ˌjuː ɑːr ˈɛl",

393

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

394

),

395

CustomPronunciationParams(

396

phrase="SQL", ipa="ˈsiː kwəl",

397

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

398

)

399

]

400

)

401

```

402

403

### MultiSpeakerMarkup

404

405

```api { .api }

406

from google.cloud.texttospeech import MultiSpeakerMarkup

407

408

# Basic multi-speaker configuration

409

multi_speaker = MultiSpeakerMarkup(

410

ssml='''

411

<speak>

412

<voice name="en-US-Neural2-A">

413

Hello, I'm the first speaker in this conversation.

414

</voice>

415

<voice name="en-US-Neural2-C">

416

And I'm the second speaker responding to you.

417

</voice>

418

</speak>

419

'''

420

)

421

422

# Complex multi-speaker conversation

423

conversation_markup = MultiSpeakerMarkup(

424

ssml='''

425

<speak>

426

<voice name="en-US-Neural2-A">

427

<prosody rate="medium" pitch="normal">

428

Welcome to our technical presentation.

429

</prosody>

430

</voice>

431

432

<break time="1s"/>

433

434

<voice name="en-US-Neural2-C">

435

<prosody rate="slow" pitch="+2st">

436

Today we'll discuss advanced AI concepts.

437

</prosody>

438

</voice>

439

440

<break time="2s"/>

441

442

<voice name="en-US-Wavenet-D">

443

<prosody rate="fast" pitch="-1st">

444

Let's start with the technical implementation details.

445

</prosody>

446

</voice>

447

</speak>

448

'''

449

)

450

451

# Dialogue with emotions and pacing

452

dialogue_markup = MultiSpeakerMarkup(

453

ssml='''

454

<speak>

455

<voice name="en-US-Neural2-A">

456

<prosody rate="medium" pitch="normal" volume="loud">

457

I have exciting news to share!

458

</prosody>

459

</voice>

460

461

<voice name="en-US-Neural2-C">

462

<prosody rate="slow" pitch="low" volume="soft">

463

Please, tell me more about it.

464

</prosody>

465

</voice>

466

467

<voice name="en-US-Neural2-A">

468

<prosody rate="fast" pitch="high" volume="loud">

469

We've achieved a breakthrough in our research!

470

</prosody>

471

</voice>

472

</speak>

473

'''

474

)

475

```

476

477

### CustomVoiceParams

478

479

```api { .api }

480

from google.cloud.texttospeech import CustomVoiceParams

481

482

# Custom voice model configuration

483

custom_voice_params = CustomVoiceParams(

484

model="projects/your-project-id/locations/us-central1/models/your-custom-voice-model"

485

)

486

487

# Usage with voice selection

488

voice_with_custom_model = VoiceSelectionParams(

489

language_code="en-US",

490

custom_voice=custom_voice_params

491

)

492

493

# Complete custom voice configuration

494

complete_custom_voice = VoiceSelectionParams(

495

language_code="en-US",

496

custom_voice=CustomVoiceParams(

497

model="projects/your-project-id/locations/us-central1/models/custom-narrator-voice"

498

),

499

custom_pronunciations=CustomPronunciations(

500

pronunciations=[

501

CustomPronunciationParams(

502

phrase="company_name",

503

ipa="ˈkʌmpəni neɪm",

504

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

505

)

506

]

507

)

508

)

509

```

510

511

### VoiceCloneParams

512

513

```api { .api }

514

from google.cloud.texttospeech import VoiceCloneParams

515

516

# Voice cloning configuration

517

voice_clone_params = VoiceCloneParams(

518

voice_clone_key="your-voice-clone-key-from-console"

519

)

520

521

# Usage with voice selection

522

cloned_voice_selection = VoiceSelectionParams(

523

language_code="en-US",

524

voice_clone=voice_clone_params

525

)

526

527

# Complete cloned voice setup

528

complete_cloned_voice = VoiceSelectionParams(

529

language_code="en-US",

530

voice_clone=VoiceCloneParams(

531

voice_clone_key="abcd-1234-efgh-5678"

532

),

533

advanced_voice_options=AdvancedVoiceOptions(

534

low_latency_journey_synthesis=True

535

)

536

)

537

```

538

539

## Enums and Constants

540

541

### AudioEncoding

542

543

```api { .api }

544

from google.cloud.texttospeech import AudioEncoding

545

546

# Available audio encoding formats

547

LINEAR16 = AudioEncoding.LINEAR16 # 16-bit PCM with WAV header (lossless)

548

MP3 = AudioEncoding.MP3 # MP3 at 32kbps (compressed)

549

OGG_OPUS = AudioEncoding.OGG_OPUS # Opus in Ogg container (compressed)

550

MULAW = AudioEncoding.MULAW # 8-bit G.711 PCMU/mu-law (telephony)

551

ALAW = AudioEncoding.ALAW # 8-bit G.711 PCMA/A-law (telephony)

552

PCM = AudioEncoding.PCM # 16-bit PCM without header (raw)

553

M4A = AudioEncoding.M4A # M4A format (compressed)

554

UNSPECIFIED = AudioEncoding.AUDIO_ENCODING_UNSPECIFIED # Not specified

555

556

# Usage in audio configuration

557

high_quality_config = AudioConfig(

558

audio_encoding=AudioEncoding.LINEAR16, # Best quality

559

sample_rate_hertz=48000

560

)

561

562

compressed_config = AudioConfig(

563

audio_encoding=AudioEncoding.MP3, # Good compression

564

sample_rate_hertz=22050

565

)

566

567

telephony_config = AudioConfig(

568

audio_encoding=AudioEncoding.MULAW, # Telephony standard

569

sample_rate_hertz=8000

570

)

571

```

572

573

### SsmlVoiceGender

574

575

```api { .api }

576

from google.cloud.texttospeech import SsmlVoiceGender

577

578

# Available gender options

579

MALE = SsmlVoiceGender.MALE # Male voice

580

FEMALE = SsmlVoiceGender.FEMALE # Female voice

581

NEUTRAL = SsmlVoiceGender.NEUTRAL # Gender-neutral voice

582

UNSPECIFIED = SsmlVoiceGender.SSML_VOICE_GENDER_UNSPECIFIED # No preference

583

584

# Usage in voice selection

585

male_voice = VoiceSelectionParams(

586

language_code="en-US",

587

ssml_gender=SsmlVoiceGender.MALE

588

)

589

590

female_voice = VoiceSelectionParams(

591

language_code="en-US",

592

ssml_gender=SsmlVoiceGender.FEMALE

593

)

594

595

neutral_voice = VoiceSelectionParams(

596

language_code="en-US",

597

ssml_gender=SsmlVoiceGender.NEUTRAL

598

)

599

```

600

601

### PhoneticEncoding

602

603

```api { .api }

604

from google.cloud.texttospeech import CustomPronunciationParams

605

606

# Available phonetic encoding options

607

IPA = CustomPronunciationParams.PhoneticEncoding.IPA # International Phonetic Alphabet

608

X_SAMPA = CustomPronunciationParams.PhoneticEncoding.X_SAMPA # X-SAMPA notation

609

UNSPECIFIED = CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_UNSPECIFIED

610

611

# Usage in pronunciation parameters

612

ipa_pronunciation = CustomPronunciationParams(

613

phrase="example",

614

ipa="ɪɡˈzæmpəl",

615

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

616

)

617

618

xsampa_pronunciation = CustomPronunciationParams(

619

phrase="example",

620

ipa="Ig\"z{mp@l",

621

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.X_SAMPA

622

)

623

```

624

625

## Configuration Validation and Helpers

626

627

### Configuration Validation

628

629

```api { .api }

630

def validate_audio_config(audio_config: AudioConfig) -> tuple[bool, list[str]]:

631

"""Validate audio configuration parameters."""

632

errors = []

633

634

# Check required fields

635

if not hasattr(audio_config, 'audio_encoding') or not audio_config.audio_encoding:

636

errors.append("audio_encoding is required")

637

638

# Validate sample rate ranges

639

if hasattr(audio_config, 'sample_rate_hertz') and audio_config.sample_rate_hertz:

640

sample_rate = audio_config.sample_rate_hertz

641

valid_rates = [8000, 16000, 22050, 24000, 32000, 44100, 48000]

642

if sample_rate not in valid_rates:

643

errors.append(f"sample_rate_hertz must be one of {valid_rates}, got {sample_rate}")

644

645

# Validate speaking rate

646

if hasattr(audio_config, 'speaking_rate') and audio_config.speaking_rate:

647

rate = audio_config.speaking_rate

648

if not (0.25 <= rate <= 4.0):

649

errors.append(f"speaking_rate must be between 0.25 and 4.0, got {rate}")

650

651

# Validate pitch

652

if hasattr(audio_config, 'pitch') and audio_config.pitch:

653

pitch = audio_config.pitch

654

if not (-20.0 <= pitch <= 20.0):

655

errors.append(f"pitch must be between -20.0 and 20.0, got {pitch}")

656

657

# Validate volume gain

658

if hasattr(audio_config, 'volume_gain_db') and audio_config.volume_gain_db:

659

volume = audio_config.volume_gain_db

660

if not (-96.0 <= volume <= 16.0):

661

errors.append(f"volume_gain_db must be between -96.0 and 16.0, got {volume}")

662

663

return len(errors) == 0, errors

664

665

def validate_voice_selection(voice: VoiceSelectionParams) -> tuple[bool, list[str]]:

666

"""Validate voice selection parameters."""

667

errors = []

668

669

# Check required fields

670

if not hasattr(voice, 'language_code') or not voice.language_code:

671

errors.append("language_code is required")

672

else:

673

# Validate language code format (basic check for BCP-47)

674

lang_code = voice.language_code

675

if not lang_code.count('-') >= 1 or len(lang_code) < 2:

676

errors.append(f"language_code should be in BCP-47 format (e.g., 'en-US'), got '{lang_code}'")

677

678

# Check conflicting voice specifications

679

specified_count = sum([

680

bool(getattr(voice, 'name', None)),

681

bool(getattr(voice, 'custom_voice', None)),

682

bool(getattr(voice, 'voice_clone', None))

683

])

684

685

if specified_count > 1:

686

errors.append("Only one of 'name', 'custom_voice', or 'voice_clone' should be specified")

687

688

return len(errors) == 0, errors

689

690

# Usage examples

691

audio_config = AudioConfig(

692

audio_encoding=AudioEncoding.MP3,

693

sample_rate_hertz=22050,

694

speaking_rate=1.5,

695

pitch=2.0

696

)

697

698

is_valid, validation_errors = validate_audio_config(audio_config)

699

if not is_valid:

700

print(f"Audio config validation errors: {validation_errors}")

701

```

702

703

### Configuration Builders

704

705

```api { .api }

706

class ConfigurationBuilder:

707

"""Helper class for building complex configurations."""

708

709

@staticmethod

710

def build_high_quality_config() -> AudioConfig:

711

"""Build high-quality audio configuration."""

712

return AudioConfig(

713

audio_encoding=AudioEncoding.LINEAR16,

714

sample_rate_hertz=48000,

715

speaking_rate=0.95,

716

pitch=0.0,

717

volume_gain_db=1.0

718

)

719

720

@staticmethod

721

def build_streaming_config() -> AudioConfig:

722

"""Build streaming-optimized audio configuration."""

723

return AudioConfig(

724

audio_encoding=AudioEncoding.OGG_OPUS,

725

sample_rate_hertz=24000,

726

speaking_rate=1.1,

727

volume_gain_db=0.0

728

)

729

730

@staticmethod

731

def build_mobile_config() -> AudioConfig:

732

"""Build mobile-optimized audio configuration."""

733

return AudioConfig(

734

audio_encoding=AudioEncoding.MP3,

735

sample_rate_hertz=16000,

736

speaking_rate=1.2,

737

effects_profile_id=["handset-class-device"]

738

)

739

740

@staticmethod

741

def build_tech_voice_with_pronunciations(language_code: str = "en-US") -> VoiceSelectionParams:

742

"""Build voice configuration optimized for technical content."""

743

744

tech_pronunciations = CustomPronunciations(

745

pronunciations=[

746

CustomPronunciationParams(

747

phrase="API", ipa="ˌeɪ piː ˈaɪ",

748

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

749

),

750

CustomPronunciationParams(

751

phrase="JSON", ipa="ˈdʒeɪ sɒn",

752

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

753

),

754

CustomPronunciationParams(

755

phrase="HTTP", ipa="ˌeɪʧ tiː tiː ˈpiː",

756

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

757

),

758

CustomPronunciationParams(

759

phrase="SQL", ipa="ˈsiː kwəl",

760

phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA

761

)

762

]

763

)

764

765

return VoiceSelectionParams(

766

language_code=language_code,

767

name=f"{language_code}-Neural2-A",

768

custom_pronunciations=tech_pronunciations

769

)

770

771

@staticmethod

772

def build_conversation_voices() -> list[VoiceSelectionParams]:

773

"""Build multiple voices for conversation synthesis."""

774

return [

775

VoiceSelectionParams(

776

language_code="en-US",

777

name="en-US-Neural2-A", # Female voice

778

ssml_gender=SsmlVoiceGender.FEMALE

779

),

780

VoiceSelectionParams(

781

language_code="en-US",

782

name="en-US-Neural2-C", # Male voice

783

ssml_gender=SsmlVoiceGender.MALE

784

),

785

VoiceSelectionParams(

786

language_code="en-US",

787

name="en-US-Neural2-F", # Neutral voice

788

ssml_gender=SsmlVoiceGender.NEUTRAL

789

)

790

]

791

792

# Usage examples

793

high_quality_audio = ConfigurationBuilder.build_high_quality_config()

794

streaming_audio = ConfigurationBuilder.build_streaming_config()

795

mobile_audio = ConfigurationBuilder.build_mobile_config()

796

tech_voice = ConfigurationBuilder.build_tech_voice_with_pronunciations("en-US")

797

conversation_voices = ConfigurationBuilder.build_conversation_voices()

798

```

799

800

### Configuration Templates

801

802

```api { .api }

803

class ConfigurationTemplates:

804

"""Pre-defined configuration templates for common use cases."""

805

806

AUDIOBOOK = {

807

'voice': VoiceSelectionParams(

808

language_code="en-US",

809

name="en-US-Wavenet-A"

810

),

811

'audio': AudioConfig(

812

audio_encoding=AudioEncoding.MP3,

813

sample_rate_hertz=22050,

814

speaking_rate=0.9,

815

volume_gain_db=2.0

816

)

817

}

818

819

PODCAST = {

820

'voice': VoiceSelectionParams(

821

language_code="en-US",

822

name="en-US-Neural2-C"

823

),

824

'audio': AudioConfig(

825

audio_encoding=AudioEncoding.MP3,

826

sample_rate_hertz=44100,

827

speaking_rate=1.0,

828

effects_profile_id=["large-home-entertainment-class-device"]

829

)

830

}

831

832

NEWS_BROADCAST = {

833

'voice': VoiceSelectionParams(

834

language_code="en-US",

835

name="en-US-Neural2-D",

836

ssml_gender=SsmlVoiceGender.MALE

837

),

838

'audio': AudioConfig(

839

audio_encoding=AudioEncoding.LINEAR16,

840

sample_rate_hertz=24000,

841

speaking_rate=1.1,

842

pitch=-1.0

843

)

844

}

845

846

EDUCATIONAL = {

847

'voice': VoiceSelectionParams(

848

language_code="en-US",

849

name="en-US-Neural2-A"

850

),

851

'audio': AudioConfig(

852

audio_encoding=AudioEncoding.MP3,

853

sample_rate_hertz=22050,

854

speaking_rate=0.95,

855

pitch=1.0

856

)

857

}

858

859

TELEPHONY = {

860

'voice': VoiceSelectionParams(

861

language_code="en-US",

862

name="en-US-Standard-C"

863

),

864

'audio': AudioConfig(

865

audio_encoding=AudioEncoding.MULAW,

866

sample_rate_hertz=8000,

867

speaking_rate=1.2,

868

effects_profile_id=["telephony-class-application"]

869

)

870

}

871

872

@classmethod

873

def get_template(cls, template_name: str) -> dict:

874

"""Get configuration template by name."""

875

template_map = {

876

'audiobook': cls.AUDIOBOOK,

877

'podcast': cls.PODCAST,

878

'news': cls.NEWS_BROADCAST,

879

'educational': cls.EDUCATIONAL,

880

'telephony': cls.TELEPHONY

881

}

882

883

return template_map.get(template_name.lower(), cls.AUDIOBOOK)

884

885

@classmethod

886

def create_request_from_template(cls, template_name: str, text: str) -> 'SynthesizeSpeechRequest':

887

"""Create synthesis request from template."""

888

template = cls.get_template(template_name)

889

890

return texttospeech.SynthesizeSpeechRequest(

891

input=SynthesisInput(text=text),

892

voice=template['voice'],

893

audio_config=template['audio']

894

)

895

896

# Usage examples

897

audiobook_config = ConfigurationTemplates.get_template('audiobook')

898

podcast_request = ConfigurationTemplates.create_request_from_template(

899

'podcast',

900

"Welcome to our technology podcast!"

901

)

902

```

903

904

## Best Practices for Configuration

905

906

### Configuration Guidelines

907

908

```api { .api }

909

class ConfigurationBestPractices:

910

"""Best practices for Text-to-Speech configuration."""

911

912

@staticmethod

913

def recommend_sample_rate(audio_encoding: AudioEncoding, use_case: str) -> int:

914

"""Recommend optimal sample rate for encoding and use case."""

915

916

recommendations = {

917

AudioEncoding.LINEAR16: {

918

'high_quality': 48000,

919

'standard': 24000,

920

'streaming': 22050,

921

'mobile': 16000

922

},

923

AudioEncoding.MP3: {

924

'high_quality': 44100,

925

'standard': 22050,

926

'streaming': 22050,

927

'mobile': 16000

928

},

929

AudioEncoding.OGG_OPUS: {

930

'high_quality': 48000,

931

'standard': 24000,

932

'streaming': 24000,

933

'mobile': 16000

934

},

935

AudioEncoding.MULAW: {

936

'telephony': 8000

937

},

938

AudioEncoding.ALAW: {

939

'telephony': 8000

940

}

941

}

942

943

encoding_rec = recommendations.get(audio_encoding, {})

944

return encoding_rec.get(use_case, 22050) # Default fallback

945

946

@staticmethod

947

def optimize_for_latency(voice_config: VoiceSelectionParams,

948

audio_config: AudioConfig) -> tuple[VoiceSelectionParams, AudioConfig]:

949

"""Optimize configuration for minimal latency."""

950

951

# Use Standard voice for speed

952

optimized_voice = VoiceSelectionParams(

953

language_code=voice_config.language_code,

954

name=f"{voice_config.language_code}-Standard-A",

955

advanced_voice_options=AdvancedVoiceOptions(

956

low_latency_journey_synthesis=True

957

)

958

)

959

960

# Use lower sample rate and compressed format

961

optimized_audio = AudioConfig(

962

audio_encoding=AudioEncoding.MP3,

963

sample_rate_hertz=16000,

964

speaking_rate=1.1

965

)

966

967

return optimized_voice, optimized_audio

968

969

@staticmethod

970

def optimize_for_quality(voice_config: VoiceSelectionParams,

971

audio_config: AudioConfig) -> tuple[VoiceSelectionParams, AudioConfig]:

972

"""Optimize configuration for maximum quality."""

973

974

# Use Neural2 or Wavenet voice

975

voice_name = voice_config.language_code

976

if 'Neural2' not in voice_config.name and 'Wavenet' not in voice_config.name:

977

voice_name += '-Neural2-A' # Default to Neural2

978

else:

979

voice_name = voice_config.name

980

981

optimized_voice = VoiceSelectionParams(

982

language_code=voice_config.language_code,

983

name=voice_name

984

)

985

986

# Use uncompressed format with high sample rate

987

optimized_audio = AudioConfig(

988

audio_encoding=AudioEncoding.LINEAR16,

989

sample_rate_hertz=48000,

990

speaking_rate=0.95, # Slightly slower for clarity

991

volume_gain_db=1.0

992

)

993

994

return optimized_voice, optimized_audio

995

996

# Usage examples

997

# Optimize for latency

998

original_voice = VoiceSelectionParams(language_code="en-US")

999

original_audio = AudioConfig(audio_encoding=AudioEncoding.LINEAR16)

1000

1001

fast_voice, fast_audio = ConfigurationBestPractices.optimize_for_latency(

1002

original_voice, original_audio

1003

)

1004

1005

# Optimize for quality

1006

quality_voice, quality_audio = ConfigurationBestPractices.optimize_for_quality(

1007

original_voice, original_audio

1008

)

1009

1010

# Get recommended sample rate

1011

recommended_rate = ConfigurationBestPractices.recommend_sample_rate(

1012

AudioEncoding.MP3, 'streaming'

1013

)

1014

```