or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

audio-utilities.mdconversational-ai.mdindex.mdproject-management.mdspeech-to-text.mdtext-analysis.mdtext-to-speech.md

text-to-speech.mddocs/

0

# Text-to-Speech

1

2

High-quality neural text-to-speech synthesis with multiple voice models and real-time streaming capabilities. The Speak module supports both REST API for generating complete audio files and WebSocket streaming for real-time audio generation with various voice models, audio formats, and synthesis options.

3

4

## Capabilities

5

6

### REST Client (Complete Audio Generation)

7

8

Synchronous client for generating complete audio files from text input with comprehensive voice and format options.

9

10

```python { .api }

11

class SpeakRESTClient:

12

def stream_memory(

13

self,

14

source: FileSource,

15

options: SpeakRESTOptions = None,

16

addons: dict = None,

17

headers: dict = None,

18

timeout = None,

19

endpoint: str = "v1/speak",

20

**kwargs

21

) -> SpeakRESTResponse:

22

"""

23

Generate speech from text input and return in-memory response.

24

25

Args:

26

source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)

27

options: Synthesis configuration options

28

addons: Additional request parameters

29

headers: Additional HTTP headers

30

timeout: Request timeout

31

endpoint: API endpoint override

32

33

Returns:

34

SpeakRESTResponse: Generated audio data with metadata

35

"""

36

37

def stream_raw(

38

self,

39

source: FileSource,

40

options: SpeakRESTOptions = None,

41

addons: dict = None,

42

headers: dict = None,

43

timeout = None,

44

endpoint: str = "v1/speak",

45

**kwargs

46

) -> httpx.Response:

47

"""

48

Generate speech and return raw HTTP response.

49

50

Args:

51

source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)

52

options: Synthesis configuration options

53

addons: Additional request parameters

54

headers: Additional HTTP headers

55

timeout: Request timeout

56

endpoint: API endpoint override

57

58

Returns:

59

httpx.Response: Raw HTTP response with audio data

60

"""

61

62

def save(

63

self,

64

filename: str,

65

source: FileSource,

66

options: SpeakRESTOptions = None,

67

addons: dict = None,

68

headers: dict = None,

69

timeout = None,

70

endpoint: str = "v1/speak",

71

**kwargs

72

) -> SpeakRESTResponse:

73

"""

74

Generate speech and save directly to file.

75

76

Args:

77

filename: Output file path

78

source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)

79

options: Synthesis configuration options

80

addons: Additional request parameters

81

headers: Additional HTTP headers

82

timeout: Request timeout

83

endpoint: API endpoint override

84

85

Returns:

86

SpeakRESTResponse: Response metadata and status

87

"""

88

89

def file(

90

self,

91

filename: str,

92

source: FileSource,

93

options: SpeakRESTOptions = None,

94

addons: dict = None,

95

timeout = None,

96

endpoint: str = "v1/speak",

97

**kwargs

98

) -> SpeakRESTResponse:

99

"""

100

Generate speech and save to file (alias for save method).

101

102

Args:

103

filename: Output file path

104

source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)

105

options: Synthesis configuration options

106

addons: Additional request parameters

107

timeout: Request timeout

108

endpoint: API endpoint override

109

110

Returns:

111

SpeakRESTResponse: Response metadata and status

112

"""

113

114

class AsyncSpeakRESTClient:

115

async def stream_memory(

116

self,

117

source: FileSource,

118

options: SpeakRESTOptions = None,

119

addons: dict = None,

120

headers: dict = None,

121

timeout = None,

122

endpoint: str = "v1/speak",

123

**kwargs

124

) -> SpeakRESTResponse:

125

"""Async version of stream_memory method"""

126

127

async def stream_raw(

128

self,

129

source: FileSource,

130

options: SpeakRESTOptions = None,

131

addons: dict = None,

132

headers: dict = None,

133

timeout = None,

134

endpoint: str = "v1/speak",

135

**kwargs

136

) -> httpx.Response:

137

"""Async version of stream_raw method"""

138

139

async def save(

140

self,

141

filename: str,

142

source: FileSource,

143

options: SpeakRESTOptions = None,

144

addons: dict = None,

145

headers: dict = None,

146

timeout = None,

147

endpoint: str = "v1/speak",

148

**kwargs

149

) -> SpeakRESTResponse:

150

"""Async version of save method"""

151

152

async def file(

153

self,

154

filename: str,

155

source: FileSource,

156

options: SpeakRESTOptions = None,

157

addons: dict = None,

158

timeout = None,

159

endpoint: str = "v1/speak",

160

**kwargs

161

) -> SpeakRESTResponse:

162

"""Async version of file method"""

163

```

164

165

### WebSocket Client (Streaming Audio Generation)

166

167

Real-time streaming text-to-speech client supporting incremental text input and real-time audio output.

168

169

```python { .api }

170

class SpeakWebSocketClient:

171

def __init__(self, config: DeepgramClientOptions, microphone: Microphone = None): ...

172

173

def start(

174

self,

175

options: SpeakWSOptions = None,

176

addons: dict = None,

177

headers: dict = None,

178

members: dict = None,

179

**kwargs

180

) -> bool:

181

"""

182

Start WebSocket connection for streaming TTS.

183

184

Args:

185

options: WebSocket configuration options

186

addons: Additional request parameters

187

headers: Additional HTTP headers

188

members: Member configuration

189

190

Returns:

191

bool: True if connection started successfully

192

"""

193

194

def on(self, event: SpeakWebSocketEvents, handler: Callable) -> None:

195

"""

196

Register event handler for WebSocket events.

197

198

Args:

199

event: WebSocket event type

200

handler: Callable to handle the event

201

"""

202

203

def send_text(self, text_input: str) -> bool:

204

"""

205

Send text for speech synthesis.

206

207

Args:

208

text_input: Text to convert to speech

209

210

Returns:

211

bool: True if text sent successfully

212

"""

213

214

def send(self, data: Union[str, bytes]) -> bool:

215

"""

216

Send text data (alias for send_text).

217

218

Args:

219

data: Text or bytes to send

220

221

Returns:

222

bool: True if data sent successfully

223

"""

224

225

def send_raw(self, msg: str) -> bool:

226

"""

227

Send raw WebSocket message.

228

229

Args:

230

msg: Raw message to send

231

232

Returns:

233

bool: True if message sent successfully

234

"""

235

236

def send_control(

237

self,

238

msg_type: Union[SpeakWebSocketMessage, str],

239

data: str = ""

240

) -> bool:

241

"""

242

Send control message.

243

244

Args:

245

msg_type: Message type constant

246

data: Optional data payload

247

248

Returns:

249

bool: True if control message sent successfully

250

"""

251

252

def flush(self) -> bool:

253

"""

254

Flush current synthesis buffer.

255

256

Returns:

257

bool: True if flush successful

258

"""

259

260

def clear(self) -> bool:

261

"""

262

Clear synthesis buffer.

263

264

Returns:

265

bool: True if clear successful

266

"""

267

268

def finish(self) -> bool:

269

"""

270

Finish WebSocket connection.

271

272

Returns:

273

bool: True if finish successful

274

"""

275

276

def wait_for_complete(self) -> None:

277

"""

278

Wait for synthesis completion.

279

"""

280

281

class AsyncSpeakWebSocketClient:

282

def __init__(self, config: DeepgramClientOptions, microphone: Microphone = None): ...

283

284

async def start(...) -> bool: ...

285

def on(self, event: SpeakWebSocketEvents, handler: Callable) -> None: ... # Not async

286

async def send_text(self, text_input: str) -> bool: ...

287

async def send(self, data: Union[str, bytes]) -> bool: ...

288

async def send_raw(self, msg: str) -> bool: ...

289

async def send_control(...) -> bool: ...

290

async def flush(self) -> bool: ...

291

async def clear(self) -> bool: ...

292

async def finish(self) -> bool: ...

293

async def wait_for_complete(self) -> None: ...

294

295

# Alternative client names

296

class SpeakWSClient(SpeakWebSocketClient): ...

297

class AsyncSpeakWSClient(AsyncSpeakWebSocketClient): ...

298

```

299

300

### Router Access

301

302

Access text-to-speech clients through the main client's speak router.

303

304

```python { .api }

305

class SpeakRouter:

306

@property

307

def rest(self) -> SpeakRESTClient: ...

308

@property

309

def asyncrest(self) -> AsyncSpeakRESTClient: ...

310

@property

311

def websocket(self) -> SpeakWebSocketClient: ...

312

@property

313

def asyncwebsocket(self) -> AsyncSpeakWebSocketClient: ...

314

```

315

316

### Options Classes

317

318

#### REST Options

319

320

```python { .api }

321

class SpeakRESTOptions:

322

def __init__(self, **kwargs): ...

323

324

# Voice model selection

325

model: str = "aura-asteria-en" # Voice model name

326

327

# Audio format settings

328

encoding: str = "linear16" # Audio encoding format

329

container: str = "wav" # Audio container format

330

sample_rate: int = 24000 # Sample rate in Hz

331

bit_rate: int = None # Bit rate for compressed formats

332

333

# Additional options

334

extra: dict = None # Additional synthesis options

335

336

# Legacy alias

337

class SpeakOptions(SpeakRESTOptions): ...

338

```

339

340

#### WebSocket Options

341

342

```python { .api }

343

class SpeakWSOptions:

344

def __init__(self, **kwargs): ...

345

346

# Voice model selection

347

model: str = "aura-asteria-en" # Voice model name

348

349

# Audio format settings (required for WebSocket)

350

encoding: str = "linear16" # Audio encoding format

351

sample_rate: int = 24000 # Sample rate in Hz

352

container: str = None # Audio container (optional for streaming)

353

354

# Additional options

355

extra: dict = None # Additional synthesis options

356

```

357

358

### WebSocket Events and Messages

359

360

Event constants and message types for WebSocket text-to-speech operations.

361

362

```python { .api }

363

class SpeakWebSocketEvents:

364

"""WebSocket event constants for TTS operations"""

365

OPEN: str = "Open"

366

METADATA: str = "Metadata"

367

AUDIO: str = "Audio"

368

FLUSHED: str = "Flushed"

369

CLEARED: str = "Cleared"

370

CLOSE: str = "Close"

371

ERROR: str = "Error"

372

WARNING: str = "Warning"

373

UNHANDLED: str = "Unhandled"

374

375

class SpeakWebSocketMessage:

376

"""WebSocket message type constants"""

377

SPEAK: str = "Speak"

378

FLUSH: str = "Flush"

379

CLEAR: str = "Clear"

380

CLOSE: str = "Close"

381

```

382

383

### Source Types

384

385

Input sources for text data in various formats.

386

387

```python { .api }

388

class SpeakSource:

389

"""Base class for text-to-speech sources"""

390

391

class TextSource(SpeakSource):

392

def __init__(self, text: str):

393

"""

394

Text from string.

395

396

Args:

397

text: Text content to synthesize

398

"""

399

400

class BufferSource(SpeakSource):

401

def __init__(self, buffer: bytes):

402

"""

403

Text from byte buffer.

404

405

Args:

406

buffer: Text content as bytes

407

"""

408

409

class StreamSource(SpeakSource):

410

def __init__(self, stream):

411

"""

412

Text from stream object.

413

414

Args:

415

stream: File-like stream object

416

"""

417

418

class FileSource(SpeakSource):

419

def __init__(self, file: str):

420

"""

421

Text from local file.

422

423

Args:

424

file: Path to local text file

425

"""

426

427

# Alternative source names

428

class SpeakRestSource(SpeakSource): ...

429

class SpeakRESTSource(SpeakSource): ...

430

```

431

432

### Response Types

433

434

#### REST Response Types

435

436

```python { .api }

437

class SpeakRESTResponse:

438

"""REST text-to-speech response containing generated audio"""

439

content: bytes # Generated audio data

440

headers: dict # Response headers with metadata

441

442

def stream_to_file(self, filename: str) -> None:

443

"""

444

Save audio content to file.

445

446

Args:

447

filename: Output file path

448

"""

449

450

# Legacy alias

451

class SpeakResponse(SpeakRESTResponse): ...

452

```

453

454

#### WebSocket Response Types

455

456

```python { .api }

457

class SpeakWSMetadataResponse:

458

"""WebSocket metadata response"""

459

type: str = "Metadata"

460

request_id: str

461

model_name: str

462

model_uuid: str

463

464

class FlushedResponse:

465

"""Buffer flush confirmation"""

466

type: str = "Flushed"

467

468

class ClearedResponse:

469

"""Buffer clear confirmation"""

470

type: str = "Cleared"

471

472

class WarningResponse:

473

"""Synthesis warning"""

474

type: str = "Warning"

475

message: str

476

477

# Common WebSocket responses are inherited from common module:

478

# OpenResponse, CloseResponse, ErrorResponse, UnhandledResponse

479

```

480

481

## Usage Examples

482

483

### Basic Text-to-Speech

484

485

```python

486

from deepgram import DeepgramClient, TextSource, SpeakRESTOptions

487

488

client = DeepgramClient(api_key="your-api-key")

489

490

# Generate speech from text

491

source = TextSource("Hello, world! This is a test of the Deepgram text-to-speech API.")

492

options = SpeakRESTOptions(

493

model="aura-asteria-en",

494

encoding="linear16",

495

container="wav",

496

sample_rate=24000

497

)

498

499

response = client.speak.rest.stream_memory(source, options)

500

501

# Save to file

502

with open("output.wav", "wb") as f:

503

f.write(response.content)

504

505

# Or use convenience method

506

response.stream_to_file("output.wav")

507

```

508

509

### Voice Model Selection

510

511

```python

512

from deepgram import DeepgramClient, TextSource, SpeakRESTOptions

513

514

client = DeepgramClient(api_key="your-api-key")

515

516

# Different voice models

517

models = [

518

"aura-asteria-en", # English, female

519

"aura-luna-en", # English, female

520

"aura-stella-en", # English, female

521

"aura-athena-en", # English, female

522

"aura-hera-en", # English, female

523

"aura-orion-en", # English, male

524

"aura-arcas-en", # English, male

525

"aura-perseus-en", # English, male

526

"aura-angus-en", # English, male

527

"aura-orpheus-en", # English, male

528

]

529

530

source = TextSource("This is a test with different voice models.")

531

532

for model in models:

533

options = SpeakRESTOptions(model=model)

534

response = client.speak.rest.stream_memory(source, options)

535

response.stream_to_file(f"output_{model}.wav")

536

```

537

538

### Audio Format Options

539

540

```python

541

from deepgram import DeepgramClient, TextSource, SpeakRESTOptions

542

543

client = DeepgramClient(api_key="your-api-key")

544

source = TextSource("Testing different audio formats.")

545

546

# WAV format (uncompressed)

547

wav_options = SpeakRESTOptions(

548

model="aura-asteria-en",

549

encoding="linear16",

550

container="wav",

551

sample_rate=24000

552

)

553

554

# MP3 format (compressed)

555

mp3_options = SpeakRESTOptions(

556

model="aura-asteria-en",

557

encoding="mp3",

558

container="mp3",

559

sample_rate=22050,

560

bit_rate=128000

561

)

562

563

# FLAC format (lossless compression)

564

flac_options = SpeakRESTOptions(

565

model="aura-asteria-en",

566

encoding="flac",

567

container="flac",

568

sample_rate=24000

569

)

570

571

# Generate in different formats

572

wav_response = client.speak.rest.stream_memory(source, wav_options)

573

mp3_response = client.speak.rest.stream_memory(source, mp3_options)

574

flac_response = client.speak.rest.stream_memory(source, flac_options)

575

576

wav_response.stream_to_file("output.wav")

577

mp3_response.stream_to_file("output.mp3")

578

flac_response.stream_to_file("output.flac")

579

```

580

581

### Streaming Text-to-Speech

582

583

```python

584

from deepgram import DeepgramClient, SpeakWSOptions, SpeakWebSocketEvents

585

import threading

586

import queue

587

588

client = DeepgramClient(api_key="your-api-key")

589

audio_queue = queue.Queue()

590

591

def on_open(self, open, **kwargs):

592

print("TTS connection opened")

593

594

def on_audio_data(self, data, **kwargs):

595

# Received audio chunk

596

audio_queue.put(data)

597

598

def on_close(self, close, **kwargs):

599

print("TTS connection closed")

600

601

def on_error(self, error, **kwargs):

602

print(f"TTS error: {error}")

603

604

# Configure WebSocket options

605

options = SpeakWSOptions(

606

model="aura-asteria-en",

607

encoding="linear16",

608

sample_rate=24000

609

)

610

611

# Start connection

612

dg_connection = client.speak.websocket

613

dg_connection.on(SpeakWebSocketEvents.OPEN, on_open)

614

dg_connection.on(SpeakWebSocketEvents.AUDIO, on_audio_data)

615

dg_connection.on(SpeakWebSocketEvents.CLOSE, on_close)

616

dg_connection.on(SpeakWebSocketEvents.ERROR, on_error)

617

618

if dg_connection.start(options):

619

# Send text incrementally

620

dg_connection.send("Hello, this is streaming text-to-speech. ")

621

dg_connection.send("I can send text in chunks and receive audio in real-time. ")

622

dg_connection.send("This is very useful for interactive applications.")

623

624

# Flush to ensure all text is processed

625

dg_connection.flush()

626

627

# Close connection

628

dg_connection.finish()

629

630

# Process received audio

631

audio_data = b""

632

while not audio_queue.empty():

633

audio_data += audio_queue.get()

634

635

# Save streamed audio (NOTE: linear16 frames are raw PCM — a WAV header must be added for standard players)

636

with open("streamed_output.wav", "wb") as f:

637

f.write(audio_data)

638

```

639

640

### Async Text-to-Speech

641

642

```python

643

import asyncio

644

from deepgram import DeepgramClient, TextSource, SpeakRESTOptions

645

646

async def async_tts_example():

647

client = DeepgramClient(api_key="your-api-key")

648

649

source = TextSource("This is an async text-to-speech example.")

650

options = SpeakRESTOptions(

651

model="aura-asteria-en",

652

encoding="linear16",

653

container="wav"

654

)

655

656

response = await client.speak.asyncrest.stream_memory(source, options)

657

658

with open("async_output.wav", "wb") as f:

659

f.write(response.content)

660

661

print("Async TTS completed")

662

663

# Run async example

664

asyncio.run(async_tts_example())

665

```

666

667

### Error Handling

668

669

```python

670

from deepgram import DeepgramClient, DeepgramApiError, TextSource, SpeakRESTOptions

671

672

client = DeepgramClient(api_key="your-api-key")

673

674

try:

675

source = TextSource("Text to synthesize")

676

options = SpeakRESTOptions(

677

model="invalid-model", # This will cause an error

678

encoding="linear16"

679

)

680

681

response = client.speak.rest.stream_memory(source, options)

682

683

except DeepgramApiError as e:

684

print(f"API Error: {e}")

685

except Exception as e:

686

print(f"Unexpected error: {e}")

687

```