# Voice Management

## Overview

Voice management in the Google Cloud Text-to-Speech API involves discovering, selecting, and configuring voices for speech synthesis. The API provides access to hundreds of voices across multiple languages, spanning Standard voices, higher-quality WaveNet and Neural2 voices, Studio voices, and custom voice models.
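
To see how these pieces fit together, here is a minimal end-to-end sketch that discovers the voices for a language, selects one by name, and synthesizes a short phrase. It is illustrative only: the voice name `en-US-Wavenet-D` and the output filename are placeholder assumptions, so confirm the voice appears in your own `list_voices()` output before relying on it.

```python
from google.cloud import texttospeech

client = texttospeech.TextToSpeechClient()

# Discover: list the voices available for a language
voices = client.list_voices(
    request=texttospeech.ListVoicesRequest(language_code="en-US")
).voices
print(f"{len(voices)} voices available for en-US")

# Select: pick a voice by name (placeholder; confirm it exists in the listing above)
voice = texttospeech.VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Wavenet-D",
)

# Configure and synthesize a short phrase as MP3
response = client.synthesize_speech(
    request=texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text="Hello from Text-to-Speech."),
        voice=voice,
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3
        ),
    )
)

with open("hello.mp3", "wb") as f:
    f.write(response.audio_content)
```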

## Voice Discovery

### Listing All Available Voices

```api { .api }
from google.cloud import texttospeech

# Initialize client
client = texttospeech.TextToSpeechClient()

# List all voices
response = client.list_voices()

# Iterate through available voices
for voice in response.voices:
    print(f"Voice Name: {voice.name}")
    print(f"Language Codes: {voice.language_codes}")
    print(f"Gender: {voice.ssml_gender}")
    print(f"Natural Sample Rate: {voice.natural_sample_rate_hertz} Hz")
    print("---")
```

### Filtering Voices by Language

```api { .api }
from google.cloud.texttospeech import ListVoicesRequest

# List voices for a specific language
request = ListVoicesRequest(language_code="en-US")
response = client.list_voices(request=request)

print(f"Found {len(response.voices)} voices for en-US:")
for voice in response.voices:
    print(f"- {voice.name} ({voice.ssml_gender.name})")

# List voices for multiple languages
languages = ["en-US", "es-ES", "fr-FR", "de-DE"]
for lang in languages:
    request = ListVoicesRequest(language_code=lang)
    response = client.list_voices(request=request)
    print(f"{lang}: {len(response.voices)} voices")
```

### Voice Information Analysis

```api { .api }
def analyze_voice_capabilities():
    """Analyze and categorize available voices."""
    client = texttospeech.TextToSpeechClient()
    response = client.list_voices()

    # Group voices by type, language, and gender
    voice_analysis = {
        'by_language': {},
        'by_type': {'wavenet': [], 'neural2': [], 'standard': [], 'other': []},
        'by_gender': {'MALE': [], 'FEMALE': [], 'NEUTRAL': []}
    }

    for voice in response.voices:
        # Group by language
        for lang_code in voice.language_codes:
            if lang_code not in voice_analysis['by_language']:
                voice_analysis['by_language'][lang_code] = []
            voice_analysis['by_language'][lang_code].append(voice.name)

        # Group by voice type
        if 'Wavenet' in voice.name:
            voice_analysis['by_type']['wavenet'].append(voice.name)
        elif 'Neural2' in voice.name:
            voice_analysis['by_type']['neural2'].append(voice.name)
        elif 'Standard' in voice.name:
            voice_analysis['by_type']['standard'].append(voice.name)
        else:
            voice_analysis['by_type']['other'].append(voice.name)

        # Group by gender
        gender = voice.ssml_gender.name
        if gender in voice_analysis['by_gender']:
            voice_analysis['by_gender'][gender].append(voice.name)

    return voice_analysis

# Usage
voice_stats = analyze_voice_capabilities()
print(f"WaveNet voices: {len(voice_stats['by_type']['wavenet'])}")
print(f"Neural2 voices: {len(voice_stats['by_type']['neural2'])}")
print(f"Standard voices: {len(voice_stats['by_type']['standard'])}")
```

## Voice Types and Models

### Voice Class Properties

```api { .api }
from google.cloud.texttospeech import Voice, SsmlVoiceGender

# Voice object contains:
# - name: str - Unique voice identifier (e.g., "en-US-Wavenet-A")
# - language_codes: List[str] - Supported language codes
# - ssml_gender: SsmlVoiceGender - Voice gender
# - natural_sample_rate_hertz: int - Optimal sample rate

# Access voice properties
def print_voice_details(voice: Voice):
    print(f"Name: {voice.name}")
    print(f"Languages: {', '.join(voice.language_codes)}")
    print(f"Gender: {voice.ssml_gender.name}")
    print(f"Sample Rate: {voice.natural_sample_rate_hertz} Hz")

# Example voice categorization
def categorize_voice(voice_name: str) -> str:
    """Categorize voice by type based on name."""
    if "Wavenet" in voice_name:
        return "WaveNet Neural Voice (High Quality)"
    elif "Neural2" in voice_name:
        return "Neural2 Voice (Premium Quality)"
    elif "Standard" in voice_name:
        return "Standard Voice (Basic Quality)"
    elif "Studio" in voice_name:
        return "Studio Voice (Premium)"
    elif "Polyglot" in voice_name:
        return "Polyglot Voice (Multi-language)"
    else:
        return "Custom or Special Voice"
```

### Voice Quality Comparison

```api { .api }
# Voice quality hierarchy (best to standard)
VOICE_QUALITY_TIERS = {
    "premium": ["Neural2", "Studio", "Journey"],
    "high": ["Wavenet"],
    "standard": ["Standard"],
    "custom": ["Custom"]
}

def get_best_voice_for_language(language_code: str, gender_preference=None):
    """Find the best available voice for a language."""
    client = texttospeech.TextToSpeechClient()
    request = texttospeech.ListVoicesRequest(language_code=language_code)
    response = client.list_voices(request=request)

    # Filter by gender if specified
    voices = response.voices
    if gender_preference:
        voices = [v for v in voices if v.ssml_gender == gender_preference]

    # Walk the quality tiers from best to standard
    for tier_names in VOICE_QUALITY_TIERS.values():
        for tier_name in tier_names:
            for voice in voices:
                if tier_name in voice.name:
                    return voice

    # Fall back to the first available voice if no tiered match is found
    return voices[0] if voices else None

# Usage
best_voice = get_best_voice_for_language(
    "en-US",
    texttospeech.SsmlVoiceGender.FEMALE
)
if best_voice:
    print(f"Best voice: {best_voice.name}")
```

## Voice Selection

### VoiceSelectionParams Configuration

```api { .api }
from google.cloud.texttospeech import VoiceSelectionParams, SsmlVoiceGender

# Basic voice selection by language and gender
voice_params = VoiceSelectionParams(
    language_code="en-US",              # Required: BCP-47 language code
    ssml_gender=SsmlVoiceGender.FEMALE  # Optional: gender preference
)

# Specific voice selection by name
voice_params = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Wavenet-D"  # Exact voice model name
)

# Voice selection with custom pronunciations
voice_params = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-A",
    custom_pronunciations=texttospeech.CustomPronunciations(
        pronunciations=[
            texttospeech.CustomPronunciationParams(
                phrase="API",
                ipa="ˌeɪ piː ˈaɪ",
                phonetic_encoding=texttospeech.CustomPronunciationParams.PhoneticEncoding.IPA
            )
        ]
    )
)
```

### Advanced Voice Selection

```api { .api }
from google.cloud.texttospeech import (
    VoiceSelectionParams,
    AdvancedVoiceOptions,
    CustomVoiceParams,
    VoiceCloneParams
)

# Voice with advanced options
voice_params = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Neural2-C",
    advanced_voice_options=AdvancedVoiceOptions(
        low_latency_journey_synthesis=True  # Enable low-latency mode
    )
)

# Custom voice model
voice_params = VoiceSelectionParams(
    language_code="en-US",
    custom_voice=CustomVoiceParams(
        model="projects/your-project/locations/us-central1/models/custom-voice-model"
    )
)

# Voice cloning
voice_params = VoiceSelectionParams(
    language_code="en-US",
    voice_clone=VoiceCloneParams(
        voice_clone_key="your-voice-clone-key"
    )
)
```

## Gender and Language Options

### SsmlVoiceGender Enum

```api { .api }
from google.cloud.texttospeech import SsmlVoiceGender

# Available gender options
MALE = SsmlVoiceGender.MALE                                  # Male voice
FEMALE = SsmlVoiceGender.FEMALE                              # Female voice
NEUTRAL = SsmlVoiceGender.NEUTRAL                            # Gender-neutral voice
UNSPECIFIED = SsmlVoiceGender.SSML_VOICE_GENDER_UNSPECIFIED  # No preference

# Usage in voice selection
def create_voice_by_gender(language: str, gender: SsmlVoiceGender):
    return VoiceSelectionParams(
        language_code=language,
        ssml_gender=gender
    )

# Examples
male_voice = create_voice_by_gender("en-US", SsmlVoiceGender.MALE)
female_voice = create_voice_by_gender("fr-FR", SsmlVoiceGender.FEMALE)
neutral_voice = create_voice_by_gender("de-DE", SsmlVoiceGender.NEUTRAL)
```

### Language Code Examples

```api { .api }
# Common language codes for voice selection
SUPPORTED_LANGUAGES = {
    "en-US": "English (United States)",
    "en-GB": "English (United Kingdom)",
    "en-AU": "English (Australia)",
    "es-ES": "Spanish (Spain)",
    "es-MX": "Spanish (Mexico)",
    "fr-FR": "French (France)",
    "fr-CA": "French (Canada)",
    "de-DE": "German (Germany)",
    "it-IT": "Italian (Italy)",
    "pt-BR": "Portuguese (Brazil)",
    "pt-PT": "Portuguese (Portugal)",
    "ja-JP": "Japanese (Japan)",
    "ko-KR": "Korean (South Korea)",
    "zh-CN": "Chinese (Mainland)",
    "zh-TW": "Chinese (Taiwan)",
    "hi-IN": "Hindi (India)",
    "ar-SA": "Arabic (Saudi Arabia)",
    "ru-RU": "Russian (Russia)",
    "nl-NL": "Dutch (Netherlands)",
    "sv-SE": "Swedish (Sweden)",
    "da-DK": "Danish (Denmark)",
    "no-NO": "Norwegian (Norway)",
    "fi-FI": "Finnish (Finland)",
}

def get_voices_for_languages(language_codes: list):
    """Get available voices for multiple languages."""
    client = texttospeech.TextToSpeechClient()
    results = {}

    for lang_code in language_codes:
        request = texttospeech.ListVoicesRequest(language_code=lang_code)
        response = client.list_voices(request=request)
        results[lang_code] = [voice.name for voice in response.voices]

    return results
```
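
As a usage sketch that builds only on the helper and table above, the snippet below surveys voice coverage for a few of the listed languages; the sample language codes are an arbitrary choice for illustration.

```python
# Survey voice coverage for a few of the supported languages
sample_languages = ["en-GB", "ja-JP", "hi-IN"]
inventory = get_voices_for_languages(sample_languages)

for lang_code, voice_names in inventory.items():
    label = SUPPORTED_LANGUAGES.get(lang_code, lang_code)
    print(f"{label}: {len(voice_names)} voices")
    for name in voice_names[:3]:  # show only the first few names
        print(f"  - {name}")
```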

## Custom Pronunciations

### CustomPronunciationParams Configuration

```api { .api }
from google.cloud.texttospeech import (
    CustomPronunciations,
    CustomPronunciationParams
)

# IPA pronunciation
ipa_pronunciation = CustomPronunciationParams(
    phrase="nuclear",
    ipa="ˈnuːkliər",
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
)

# X-SAMPA pronunciation
xsampa_pronunciation = CustomPronunciationParams(
    phrase="often",
    ipa="Q:ft@n",  # X-SAMPA notation
    phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.X_SAMPA
)

# Collection of custom pronunciations
custom_pronunciations = CustomPronunciations(
    pronunciations=[
        CustomPronunciationParams(
            phrase="GitHub",
            ipa="ˈɡɪt hʌb",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="API",
            ipa="ˌeɪ piː ˈaɪ",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        ),
        CustomPronunciationParams(
            phrase="OAuth",
            ipa="ˈoʊ ɔːθ",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        )
    ]
)
```

### Using Custom Pronunciations

```api { .api }
def create_voice_with_custom_pronunciations(language_code: str, pronunciations_dict: dict):
    """Create voice selection with custom pronunciations from a dictionary."""

    # Convert dictionary entries to CustomPronunciationParams
    pronunciation_params = []
    for phrase, ipa_pronunciation in pronunciations_dict.items():
        param = CustomPronunciationParams(
            phrase=phrase,
            ipa=ipa_pronunciation,
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.IPA
        )
        pronunciation_params.append(param)

    # Create the custom pronunciations collection
    custom_pronunciations = CustomPronunciations(
        pronunciations=pronunciation_params
    )

    # Return voice selection with custom pronunciations
    return VoiceSelectionParams(
        language_code=language_code,
        custom_pronunciations=custom_pronunciations
    )

# Usage example
tech_pronunciations = {
    "JSON": "ˈdʒeɪ sɒn",
    "SQL": "ˈsiː kwəl",
    "HTTP": "ˌeɪtʃ tiː tiː ˈpiː",
    "URL": "ˌjuː ɑːr ˈɛl",
    "CSS": "ˌsiː ɛs ˈɛs"
}

tech_voice = create_voice_with_custom_pronunciations("en-US", tech_pronunciations)

# Use in a synthesis request
request = texttospeech.SynthesizeSpeechRequest(
    input=texttospeech.SynthesisInput(
        text="We'll use JSON data via HTTP API calls and style with CSS."
    ),
    voice=tech_voice,
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )
)
```

## Voice Filtering and Selection Helpers

### Voice Filtering Functions

```api { .api }
def filter_voices_by_criteria(language_code: str = None, gender: SsmlVoiceGender = None,
                              voice_type: str = None):
    """Filter voices by multiple criteria."""
    client = texttospeech.TextToSpeechClient()

    # Get voices for the language, or all voices
    if language_code:
        request = texttospeech.ListVoicesRequest(language_code=language_code)
        response = client.list_voices(request=request)
    else:
        response = client.list_voices()

    filtered_voices = response.voices

    # Filter by gender
    if gender:
        filtered_voices = [v for v in filtered_voices if v.ssml_gender == gender]

    # Filter by voice type
    if voice_type:
        filtered_voices = [v for v in filtered_voices if voice_type in v.name]

    return filtered_voices

# Usage examples
wavenet_female_voices = filter_voices_by_criteria(
    language_code="en-US",
    gender=SsmlVoiceGender.FEMALE,
    voice_type="Wavenet"
)

neural2_voices = filter_voices_by_criteria(voice_type="Neural2")
male_spanish_voices = filter_voices_by_criteria(
    language_code="es-ES",
    gender=SsmlVoiceGender.MALE
)
```

### Voice Recommendation System

```api { .api }
class VoiceRecommender:
    """Intelligent voice recommendation system."""

    def __init__(self):
        self.client = texttospeech.TextToSpeechClient()
        self._voice_cache = {}

    def get_cached_voices(self, language_code: str = None):
        """Get voices with caching for performance."""
        cache_key = language_code or "all"

        if cache_key not in self._voice_cache:
            if language_code:
                request = texttospeech.ListVoicesRequest(language_code=language_code)
                response = self.client.list_voices(request=request)
            else:
                response = self.client.list_voices()
            self._voice_cache[cache_key] = response.voices

        return self._voice_cache[cache_key]

    def recommend_voice(self, language_code: str, preferences: dict = None):
        """Recommend the best voice based on preferences."""
        preferences = preferences or {}

        voices = self.get_cached_voices(language_code)
        if not voices:
            return None

        # Scoring system
        scored_voices = []
        for voice in voices:
            score = 0

            # Quality scoring
            if "Neural2" in voice.name:
                score += 100
            elif "Wavenet" in voice.name:
                score += 80
            elif "Standard" in voice.name:
                score += 60

            # Gender preference
            if preferences.get("gender") == voice.ssml_gender:
                score += 50

            # Sample rate preference
            preferred_rate = preferences.get("sample_rate")
            if preferred_rate and voice.natural_sample_rate_hertz == preferred_rate:
                score += 30

            # Name preference (if a specific voice is requested)
            if preferences.get("voice_name") and preferences["voice_name"] in voice.name:
                score += 200

            scored_voices.append((voice, score))

        # Return the highest-scored voice
        scored_voices.sort(key=lambda x: x[1], reverse=True)
        return scored_voices[0][0] if scored_voices else None

    def get_voice_alternatives(self, primary_voice_name: str, count: int = 3):
        """Get alternative voices similar to the primary voice."""
        # Extract the language code from the primary voice name
        lang_parts = primary_voice_name.split("-")
        if len(lang_parts) >= 2:
            language_code = f"{lang_parts[0]}-{lang_parts[1]}"
        else:
            return []

        voices = self.get_cached_voices(language_code)

        # Find similar voices (same type and gender if possible)
        primary_voice = next((v for v in voices if v.name == primary_voice_name), None)
        if not primary_voice:
            return voices[:count]

        similar_voices = []
        for voice in voices:
            if (voice.name != primary_voice_name and
                    voice.ssml_gender == primary_voice.ssml_gender):

                # Prefer the same voice type
                if any(vtype in voice.name and vtype in primary_voice_name
                       for vtype in ["Neural2", "Wavenet", "Standard"]):
                    similar_voices.insert(0, voice)
                else:
                    similar_voices.append(voice)

        return similar_voices[:count]

# Usage
recommender = VoiceRecommender()

# Get a recommendation with preferences
preferences = {
    "gender": SsmlVoiceGender.FEMALE,
    "sample_rate": 24000
}
recommended_voice = recommender.recommend_voice("en-US", preferences)

# Get alternatives to a specific voice
alternatives = recommender.get_voice_alternatives("en-US-Wavenet-D", count=5)
```

## Voice Testing and Comparison

### Voice Comparison Tool

```api { .api }
def compare_voices(text: str, voice_names: list, output_dir: str = "voice_comparison"):
    """Generate audio samples for voice comparison."""
    import os

    client = texttospeech.TextToSpeechClient()
    os.makedirs(output_dir, exist_ok=True)

    results = []

    for voice_name in voice_names:
        # Extract the language code from the voice name
        lang_parts = voice_name.split("-")
        language_code = f"{lang_parts[0]}-{lang_parts[1]}" if len(lang_parts) >= 2 else "en-US"

        try:
            request = texttospeech.SynthesizeSpeechRequest(
                input=texttospeech.SynthesisInput(text=text),
                voice=VoiceSelectionParams(
                    language_code=language_code,
                    name=voice_name
                ),
                audio_config=texttospeech.AudioConfig(
                    audio_encoding=texttospeech.AudioEncoding.MP3
                )
            )

            response = client.synthesize_speech(request=request)

            # Save the audio file
            filename = f"{voice_name.replace('-', '_')}.mp3"
            filepath = os.path.join(output_dir, filename)

            with open(filepath, "wb") as f:
                f.write(response.audio_content)

            results.append({
                "voice_name": voice_name,
                "file_path": filepath,
                "success": True,
                "audio_size": len(response.audio_content)
            })

        except Exception as e:
            results.append({
                "voice_name": voice_name,
                "file_path": None,
                "success": False,
                "error": str(e)
            })

    return results

# Usage
test_voices = [
    "en-US-Neural2-A",
    "en-US-Neural2-C",
    "en-US-Wavenet-A",
    "en-US-Wavenet-D",
    "en-US-Standard-A"
]

comparison_results = compare_voices(
    "Hello, this is a test of different voice qualities and characteristics.",
    test_voices
)

for result in comparison_results:
    if result["success"]:
        print(f"✓ {result['voice_name']}: {result['audio_size']} bytes")
    else:
        print(f"✗ {result['voice_name']}: {result['error']}")
```

### Voice Quality Assessment

```api { .api }
def assess_voice_quality(voice_name: str) -> dict:
    """Assess voice quality characteristics based on name and properties."""

    quality_assessment = {
        "voice_name": voice_name,
        "quality_tier": "unknown",
        "naturalness": "medium",
        "recommended_use": "general",
        "latency": "medium",
        "cost": "medium"
    }

    # Assess based on voice type
    if "Neural2" in voice_name:
        quality_assessment.update({
            "quality_tier": "premium",
            "naturalness": "very_high",
            "recommended_use": "professional_content",
            "latency": "medium",
            "cost": "high"
        })
    elif "Wavenet" in voice_name:
        quality_assessment.update({
            "quality_tier": "high",
            "naturalness": "high",
            "recommended_use": "content_creation",
            "latency": "medium",
            "cost": "medium_high"
        })
    elif "Standard" in voice_name:
        quality_assessment.update({
            "quality_tier": "basic",
            "naturalness": "medium",
            "recommended_use": "notifications",
            "latency": "low",
            "cost": "low"
        })
    elif "Studio" in voice_name:
        quality_assessment.update({
            "quality_tier": "premium",
            "naturalness": "very_high",
            "recommended_use": "audiobooks",
            "latency": "high",
            "cost": "high"
        })

    return quality_assessment

# Assess multiple voices
voice_assessments = [
    assess_voice_quality("en-US-Neural2-A"),
    assess_voice_quality("en-US-Wavenet-D"),
    assess_voice_quality("en-US-Standard-B")
]

for assessment in voice_assessments:
    print(f"{assessment['voice_name']}: {assessment['quality_tier']} quality, "
          f"{assessment['naturalness']} naturalness, {assessment['cost']} cost")
```
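
As a closing sketch that reuses only the helpers defined earlier (`filter_voices_by_criteria` and `assess_voice_quality`), the snippet below summarizes how many en-US voices fall into each quality tier; the choice of en-US is illustrative.

```python
# Summarize quality tiers across the en-US catalog using the helpers above
en_us_voices = filter_voices_by_criteria(language_code="en-US")

tier_counts = {}
for voice in en_us_voices:
    tier = assess_voice_quality(voice.name)["quality_tier"]
    tier_counts[tier] = tier_counts.get(tier, 0) + 1

for tier, count in sorted(tier_counts.items()):
    print(f"{tier}: {count} voices")
```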