# Speech Synthesis

## Overview

Speech synthesis is the core functionality of the Google Cloud Text-to-Speech API, converting text input into natural-sounding speech audio. The API supports both plain text and SSML (Speech Synthesis Markup Language) input with extensive configuration options for voice selection and audio output.

## Core Synthesis Operations

### Basic Text Synthesis

```api { .api }
from google.cloud import texttospeech

# Initialize client
client = texttospeech.TextToSpeechClient()

# Create synthesis request
request = texttospeech.SynthesizeSpeechRequest(
    input=texttospeech.SynthesisInput(text="Hello, this is a text-to-speech demo"),
    voice=texttospeech.VoiceSelectionParams(
        language_code="en-US",
        ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
    ),
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )
)

# Perform synthesis
response = client.synthesize_speech(request=request)

# Access audio data
audio_content = response.audio_content  # bytes
```

### SSML Synthesis

```api { .api }
from google.cloud import texttospeech

client = texttospeech.TextToSpeechClient()

# SSML input with markup
ssml_text = """
<speak>
  <prosody rate="slow" pitch="+2st">
    Hello, this is spoken slowly with higher pitch.
  </prosody>
  <break time="1s"/>
  <prosody rate="fast" pitch="-2st">
    And this is spoken quickly with lower pitch.
  </prosody>
</speak>
"""

request = texttospeech.SynthesizeSpeechRequest(
    input=texttospeech.SynthesisInput(ssml=ssml_text),
    voice=texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Wavenet-D"  # Specific voice model
    ),
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,
        sample_rate_hertz=24000
    )
)

response = client.synthesize_speech(request=request)
```
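
When SSML is assembled from user-supplied or external text, the embedded text must be XML-escaped or the service will usually reject the request. A minimal sketch using the standard library; `wrap_in_ssml` is an illustrative helper, not part of the client library:

```python
from xml.sax.saxutils import escape

from google.cloud import texttospeech

def wrap_in_ssml(user_text: str) -> str:
    # Escape &, < and > before embedding arbitrary text in SSML; unescaped
    # markup characters typically cause an InvalidArgument error.
    return f"<speak>{escape(user_text)}</speak>"

client = texttospeech.TextToSpeechClient()
request = texttospeech.SynthesizeSpeechRequest(
    input=texttospeech.SynthesisInput(ssml=wrap_in_ssml("Ben & Jerry's <new flavor>")),
    voice=texttospeech.VoiceSelectionParams(language_code="en-US"),
    audio_config=texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3),
)
response = client.synthesize_speech(request=request)
```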

## Input Configuration

### SynthesisInput Class

```api { .api }
from google.cloud import texttospeech
from google.cloud.texttospeech import SynthesisInput

# Plain text input
text_input = SynthesisInput(text="Plain text to synthesize")

# SSML input
ssml_input = SynthesisInput(
    ssml='<speak>SSML <emphasis level="strong">markup</emphasis> text</speak>'
)

# Multi-speaker input: the dialogue is expressed as MultiSpeakerMarkup turns,
# not SSML <voice> tags (this field may require the v1beta1 client,
# google.cloud.texttospeech_v1beta1)
multi_speaker_input = SynthesisInput(
    multi_speaker_markup=texttospeech.MultiSpeakerMarkup(
        turns=[
            texttospeech.MultiSpeakerMarkup.Turn(speaker="R", text="Hello"),
            texttospeech.MultiSpeakerMarkup.Turn(speaker="S", text="World")
        ]
    )
)
```

### Advanced Input Options

```api { .api }
# Custom pronunciations with synthesis input
from google.cloud import texttospeech
from google.cloud.texttospeech import (
    SynthesisInput,
    CustomPronunciations,
    CustomPronunciationParams
)

# Define custom pronunciations (the IPA strings are illustrative)
custom_pronunciations = CustomPronunciations(
    pronunciations=[
        CustomPronunciationParams(
            phrase="Anthropic",
            pronunciation="ˌænθrəˈpɪk",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_IPA
        ),
        CustomPronunciationParams(
            phrase="Claude",
            pronunciation="klɔːd",
            phonetic_encoding=CustomPronunciationParams.PhoneticEncoding.PHONETIC_ENCODING_IPA
        )
    ]
)

# Use with synthesis: custom pronunciations attach to the synthesis input
request = texttospeech.SynthesizeSpeechRequest(
    input=SynthesisInput(
        text="Hello from Anthropic's Claude AI assistant",
        custom_pronunciations=custom_pronunciations
    ),
    voice=texttospeech.VoiceSelectionParams(
        language_code="en-US"
    ),
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )
)
```

## Voice Selection

### VoiceSelectionParams Class

```api { .api }
from google.cloud import texttospeech
from google.cloud.texttospeech import VoiceSelectionParams, SsmlVoiceGender

# Basic voice selection
voice = VoiceSelectionParams(
    language_code="en-US",            # Required: BCP-47 language code
    ssml_gender=SsmlVoiceGender.MALE  # Optional: voice gender
)

# Specific voice model selection
voice = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Wavenet-A"  # Specific voice name
)

# Custom voice model
voice = VoiceSelectionParams(
    language_code="en-US",
    custom_voice=texttospeech.CustomVoiceParams(
        model="projects/your-project/locations/us-central1/models/your-model"
    )
)
```
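
Valid values for `name` vary by language and can be discovered at runtime with the client's `list_voices` method (covered in more detail in voice-management.md); a short sketch:

```python
from google.cloud import texttospeech

client = texttospeech.TextToSpeechClient()

# Enumerate available voices for a language to find valid `name` values
response = client.list_voices(language_code="en-US")
for voice in response.voices:
    print(voice.name, voice.ssml_gender.name, voice.natural_sample_rate_hertz)
```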

### Advanced Voice Configuration

```api { .api }
from google.cloud.texttospeech import (
    VoiceSelectionParams,
    AdvancedVoiceOptions,
    VoiceCloneParams
)

# Advanced voice options
voice = VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Wavenet-A",
    advanced_voice_options=AdvancedVoiceOptions(
        low_latency_journey_synthesis=True
    )
)

# Voice cloning parameters
voice = VoiceSelectionParams(
    language_code="en-US",
    voice_clone=VoiceCloneParams(
        voice_clone_key="your-voice-clone-key"
    )
)
```

184

185

## Audio Configuration

186

187

### AudioConfig Class

188

189

```api { .api }

190

from google.cloud.texttospeech import AudioConfig, AudioEncoding

191

192

# Basic audio configuration

193

audio_config = AudioConfig(

194

audio_encoding=AudioEncoding.MP3, # Required: output format

195

sample_rate_hertz=22050, # Optional: sample rate

196

speaking_rate=1.0, # Optional: speech rate (0.25-4.0)

197

pitch=0.0, # Optional: pitch (-20.0 to 20.0)

198

volume_gain_db=0.0 # Optional: volume gain (-96.0 to 16.0)

199

)

200

201

# High-quality linear PCM

202

audio_config = AudioConfig(

203

audio_encoding=AudioEncoding.LINEAR16,

204

sample_rate_hertz=48000,

205

speaking_rate=0.9,

206

pitch=2.0

207

)

208

209

# OGG Opus for streaming

210

audio_config = AudioConfig(

211

audio_encoding=AudioEncoding.OGG_OPUS,

212

sample_rate_hertz=48000

213

)

214

```

### Audio Effects and Processing

```api { .api }
from google.cloud.texttospeech import AudioConfig, AudioEncoding

# Audio with effects profile
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.MP3,
    effects_profile_id=["telephony-class-application"],  # Audio effects
    speaking_rate=1.2,
    pitch=-2.0,
    volume_gain_db=3.0
)

# Multiple effects profiles
audio_config = AudioConfig(
    audio_encoding=AudioEncoding.LINEAR16,
    effects_profile_id=[
        "wearable-class-device",
        "handset-class-device"
    ],
    sample_rate_hertz=16000
)
```

240

241

## Request and Response Types

242

243

### SynthesizeSpeechRequest Class

244

245

```api { .api }

246

from google.cloud.texttospeech import (

247

SynthesizeSpeechRequest,

248

SynthesisInput,

249

VoiceSelectionParams,

250

AudioConfig

251

)

252

253

# Complete request configuration

254

request = SynthesizeSpeechRequest(

255

input=SynthesisInput(text="Text to synthesize"),

256

voice=VoiceSelectionParams(

257

language_code="en-US",

258

ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL

259

),

260

audio_config=AudioConfig(

261

audio_encoding=texttospeech.AudioEncoding.LINEAR16,

262

sample_rate_hertz=22050

263

)

264

)

265

266

# Request with advanced features

267

request = SynthesizeSpeechRequest(

268

input=SynthesisInput(

269

ssml='<speak>Hello <mark name="greeting"/>world!</speak>'

270

),

271

voice=VoiceSelectionParams(

272

language_code="en-US",

273

name="en-US-Neural2-A"

274

),

275

audio_config=AudioConfig(

276

audio_encoding=AudioEncoding.MP3,

277

effects_profile_id=["small-bluetooth-speaker-class-device"]

278

)

279

)

280

```
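
The `<mark>` tag in the request above only becomes useful when timepoints are requested. At the time of writing that option is exposed on the v1beta1 request as `enable_time_pointing`; a hedged sketch:

```python
from google.cloud import texttospeech_v1beta1 as tts_beta

client = tts_beta.TextToSpeechClient()

request = tts_beta.SynthesizeSpeechRequest(
    input=tts_beta.SynthesisInput(ssml='<speak>Hello <mark name="greeting"/>world!</speak>'),
    voice=tts_beta.VoiceSelectionParams(language_code="en-US", name="en-US-Neural2-A"),
    audio_config=tts_beta.AudioConfig(audio_encoding=tts_beta.AudioEncoding.MP3),
    # Ask the service to report when each <mark> is reached in the audio
    enable_time_pointing=[tts_beta.SynthesizeSpeechRequest.TimepointType.SSML_MARK],
)

response = client.synthesize_speech(request=request)
for timepoint in response.timepoints:
    print(timepoint.mark_name, timepoint.time_seconds)
```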

### SynthesizeSpeechResponse Class

```api { .api }
from google.cloud.texttospeech import SynthesizeSpeechResponse

# Standard response
response = client.synthesize_speech(request=request)

# Access response data
audio_content = response.audio_content  # bytes: synthesized audio data

# Response provides audio as bytes
with open("output.mp3", "wb") as audio_file:
    audio_file.write(response.audio_content)

# Inspect the size of the generated audio
audio_size = len(response.audio_content)
print(f"Generated {audio_size} bytes of audio")
```

## Multi-Speaker Synthesis

### MultiSpeakerMarkup Configuration

```api { .api }
# Multi-speaker dialogue is expressed as MultiSpeakerMarkup turns rather than
# SSML <voice> tags. At the time of writing this feature is exposed through the
# v1beta1 API surface together with the "en-US-Studio-MultiSpeaker" voice.
from google.cloud import texttospeech_v1beta1 as texttospeech
from google.cloud.texttospeech_v1beta1 import (
    SynthesisInput,
    MultiSpeakerMarkup,
    VoiceSelectionParams
)

# Define the conversation as alternating speaker turns
multi_speaker_markup = MultiSpeakerMarkup(
    turns=[
        MultiSpeakerMarkup.Turn(speaker="R", text="Hello, I'm the first speaker."),
        MultiSpeakerMarkup.Turn(speaker="S", text="And I'm the second speaker."),
        MultiSpeakerMarkup.Turn(speaker="T", text="Together we create a conversation.")
    ]
)

# Configure multi-speaker input
multi_speaker_input = SynthesisInput(
    multi_speaker_markup=multi_speaker_markup
)

# Create synthesis request
client = texttospeech.TextToSpeechClient()
request = texttospeech.SynthesizeSpeechRequest(
    input=multi_speaker_input,
    voice=VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Studio-MultiSpeaker"  # Multi-speaker voice model
    ),
    audio_config=texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16
    )
)
```

## Practical Examples

### File Processing

```api { .api }
import os

from google.cloud import texttospeech

def text_file_to_speech(input_file_path, output_file_path, voice_name=None):
    """Convert text file to speech audio file."""
    client = texttospeech.TextToSpeechClient()

    # Read text from file
    with open(input_file_path, 'r', encoding='utf-8') as file:
        text_content = file.read()

    # Configure synthesis
    voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name=voice_name or "en-US-Neural2-A"
    )

    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    request = texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text_content),
        voice=voice,
        audio_config=audio_config
    )

    # Synthesize speech
    response = client.synthesize_speech(request=request)

    # Write audio file
    with open(output_file_path, "wb") as output_file:
        output_file.write(response.audio_content)

    print(f"Audio content written to '{output_file_path}'")

# Usage
text_file_to_speech("input.txt", "output.mp3", "en-US-Wavenet-D")
```

### Batch Processing

```api { .api }
import concurrent.futures
import os

from google.cloud import texttospeech

def synthesize_text_batch(texts, output_dir="outputs"):
    """Synthesize multiple texts in parallel."""
    client = texttospeech.TextToSpeechClient()
    os.makedirs(output_dir, exist_ok=True)

    def synthesize_single(text_data):
        text, filename = text_data

        request = texttospeech.SynthesizeSpeechRequest(
            input=texttospeech.SynthesisInput(text=text),
            voice=texttospeech.VoiceSelectionParams(
                language_code="en-US",
                ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
            ),
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3
            )
        )

        response = client.synthesize_speech(request=request)

        output_path = f"{output_dir}/{filename}.mp3"
        with open(output_path, "wb") as f:
            f.write(response.audio_content)

        return output_path

    # Prepare text data
    text_data = [(text, f"output_{i}") for i, text in enumerate(texts)]

    # Process in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(synthesize_single, text_data))

    return results

# Usage
texts = [
    "First text to synthesize",
    "Second text to synthesize",
    "Third text to synthesize"
]
output_files = synthesize_text_batch(texts)
```
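
Each `synthesize_speech` call is also subject to a per-request input size limit (on the order of 5,000 bytes of text at the time of writing), so long documents should be split before batching. A rough sketch, with `split_into_chunks` as an illustrative helper and a deliberately conservative chunk budget:

```python
def split_into_chunks(text, max_bytes=4500):
    """Split text on sentence boundaries so each chunk stays under the
    per-request input limit. A single oversized sentence still becomes
    its own (possibly too large) chunk."""
    chunks = []
    current = ""
    for sentence in text.split(". "):
        sentence = sentence.strip().rstrip(".")
        if not sentence:
            continue
        candidate = f"{current} {sentence}." if current else f"{sentence}."
        if current and len(candidate.encode("utf-8")) > max_bytes:
            chunks.append(current)
            current = f"{sentence}."
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks

# Feed the chunks through the batch helper defined above
long_text = "This sentence stands in for a much longer document. " * 200
output_files = synthesize_text_batch(split_into_chunks(long_text))
```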

### SSML Template Processing

```api { .api }
from google.cloud import texttospeech

def synthesize_with_ssml_template(content_parts):
    """Use an SSML template for consistent speech formatting."""

    # SSML template with placeholders
    ssml_template = """
    <speak>
      <prosody rate="medium" pitch="medium">
        <emphasis level="moderate">{title}</emphasis>
      </prosody>
      <break time="1s"/>
      <prosody rate="slow">
        {content}
      </prosody>
      <break time="2s"/>
      <prosody rate="fast" pitch="+1st">
        {conclusion}
      </prosody>
    </speak>
    """

    # Fill template
    ssml_content = ssml_template.format(**content_parts)

    client = texttospeech.TextToSpeechClient()

    request = texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(ssml=ssml_content),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Neural2-A"
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16,
            speaking_rate=0.9,
            pitch=1.0
        )
    )

    return client.synthesize_speech(request=request)

# Usage
content = {
    "title": "Welcome to our presentation",
    "content": "This is the main content of our speech synthesis example.",
    "conclusion": "Thank you for listening!"
}
response = synthesize_with_ssml_template(content)
```

## Error Handling

### Synthesis-Specific Errors

```api { .api }
from google.api_core import exceptions
from google.cloud import texttospeech

def safe_synthesize_speech(text, language_code="en-US"):
    """Synthesize speech with comprehensive error handling."""
    try:
        client = texttospeech.TextToSpeechClient()

        request = texttospeech.SynthesizeSpeechRequest(
            input=texttospeech.SynthesisInput(text=text),
            voice=texttospeech.VoiceSelectionParams(language_code=language_code),
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3
            )
        )

        response = client.synthesize_speech(request=request)
        return response.audio_content

    except exceptions.InvalidArgument as e:
        print(f"Invalid request parameters: {e}")
        return None
    except exceptions.OutOfRange as e:
        print(f"Parameter out of valid range: {e}")
        return None
    except exceptions.FailedPrecondition as e:
        print(f"Failed precondition: {e}")
        return None
    except exceptions.ResourceExhausted as e:
        print(f"Quota exceeded or rate limited: {e}")
        return None
    except exceptions.Unauthenticated as e:
        print(f"Authentication failed: {e}")
        return None
    except exceptions.PermissionDenied as e:
        print(f"Permission denied: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error: {e}")
        return None

# Usage with error handling
audio_data = safe_synthesize_speech("Hello world", "en-US")
if audio_data:
    with open("safe_output.mp3", "wb") as f:
        f.write(audio_data)
```
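
Transient failures such as `ResourceExhausted` or `ServiceUnavailable` can also be handled declaratively by passing a retry policy to the call. A sketch using `google.api_core.retry`; the backoff numbers are arbitrary examples:

```python
from google.api_core import exceptions, retry
from google.cloud import texttospeech

client = texttospeech.TextToSpeechClient()

request = texttospeech.SynthesizeSpeechRequest(
    input=texttospeech.SynthesisInput(text="Hello world"),
    voice=texttospeech.VoiceSelectionParams(language_code="en-US"),
    audio_config=texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3),
)

# Retry only errors that are plausibly transient
transient_retry = retry.Retry(
    predicate=retry.if_exception_type(
        exceptions.ResourceExhausted,
        exceptions.ServiceUnavailable,
        exceptions.DeadlineExceeded,
    ),
    initial=1.0,     # first backoff in seconds
    maximum=30.0,    # cap on backoff
    multiplier=2.0,  # exponential growth factor
    timeout=120.0,   # give up after two minutes overall
)

response = client.synthesize_speech(request=request, retry=transient_retry, timeout=60.0)
```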

## Performance Optimization

### Request Optimization

```api { .api }
from google.cloud import texttospeech

# Optimize for latency
def create_low_latency_request(text):
    return texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Standard-A",  # Standard voices are faster
            advanced_voice_options=texttospeech.AdvancedVoiceOptions(
                low_latency_journey_synthesis=True
            )
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,  # MP3 is compressed
            sample_rate_hertz=16000  # Lower sample rate for faster processing
        )
    )

# Optimize for quality
def create_high_quality_request(text):
    return texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Wavenet-A"  # WaveNet for higher quality
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.LINEAR16,  # Uncompressed
            sample_rate_hertz=48000  # High sample rate
        )
    )
```
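
Beyond per-request settings, construct the client once and reuse it: each `TextToSpeechClient` sets up credentials and a gRPC channel, which is comparatively expensive. A brief sketch; the cached accessor is illustrative, not part of the library:

```python
from functools import lru_cache

from google.cloud import texttospeech

@lru_cache(maxsize=1)
def get_tts_client() -> texttospeech.TextToSpeechClient:
    # Build the client once per process and share it across requests
    return texttospeech.TextToSpeechClient()

def synthesize(text: str) -> bytes:
    client = get_tts_client()
    request = texttospeech.SynthesizeSpeechRequest(
        input=texttospeech.SynthesisInput(text=text),
        voice=texttospeech.VoiceSelectionParams(language_code="en-US"),
        audio_config=texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3),
    )
    return client.synthesize_speech(request=request).audio_content
```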