0
# Text-to-Speech
1
2
High-quality neural text-to-speech synthesis with multiple voice models and real-time streaming capabilities. The Speak module supports both REST API for generating complete audio files and WebSocket streaming for real-time audio generation with various voice models, audio formats, and synthesis options.
3
4
## Capabilities
5
6
### REST Client (Complete Audio Generation)
7
8
Synchronous client for generating complete audio files from text input with comprehensive voice and format options.
9
10
```python { .api }
11
class SpeakRESTClient:
12
def stream_memory(
13
self,
14
source: FileSource,
15
options: SpeakRESTOptions = None,
16
addons: dict = None,
17
headers: dict = None,
18
timeout = None,
19
endpoint: str = "v1/speak",
20
**kwargs
21
) -> SpeakRESTResponse:
22
"""
23
Generate speech from text input and return in-memory response.
24
25
Args:
26
source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)
27
options: Synthesis configuration options
28
addons: Additional request parameters
29
headers: Additional HTTP headers
30
timeout: Request timeout
31
endpoint: API endpoint override
32
33
Returns:
34
SpeakRESTResponse: Generated audio data with metadata
35
"""
36
37
def stream_raw(
38
self,
39
source: FileSource,
40
options: SpeakRESTOptions = None,
41
addons: dict = None,
42
headers: dict = None,
43
timeout = None,
44
endpoint: str = "v1/speak",
45
**kwargs
46
) -> httpx.Response:
47
"""
48
Generate speech and return raw HTTP response.
49
50
Args:
51
source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)
52
options: Synthesis configuration options
53
addons: Additional request parameters
54
headers: Additional HTTP headers
55
timeout: Request timeout
56
endpoint: API endpoint override
57
58
Returns:
59
httpx.Response: Raw HTTP response with audio data
60
"""
61
62
def save(
63
self,
64
filename: str,
65
source: FileSource,
66
options: SpeakRESTOptions = None,
67
addons: dict = None,
68
headers: dict = None,
69
timeout = None,
70
endpoint: str = "v1/speak",
71
**kwargs
72
) -> SpeakRESTResponse:
73
"""
74
Generate speech and save directly to file.
75
76
Args:
77
filename: Output file path
78
source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)
79
options: Synthesis configuration options
80
addons: Additional request parameters
81
headers: Additional HTTP headers
82
timeout: Request timeout
83
endpoint: API endpoint override
84
85
Returns:
86
SpeakRESTResponse: Response metadata and status
87
"""
88
89
def file(
90
self,
91
filename: str,
92
source: FileSource,
93
options: SpeakRESTOptions = None,
94
addons: dict = None,
95
timeout = None,
96
endpoint: str = "v1/speak",
97
**kwargs
98
) -> SpeakRESTResponse:
99
"""
100
Generate speech and save to file (alias for save method).
101
102
Args:
103
filename: Output file path
104
source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)
105
options: Synthesis configuration options
106
addons: Additional request parameters
107
timeout: Request timeout
108
endpoint: API endpoint override
109
110
Returns:
111
SpeakRESTResponse: Response metadata and status
112
"""
113
114
class AsyncSpeakRESTClient:
115
async def stream_memory(
116
self,
117
source: FileSource,
118
options: SpeakRESTOptions = None,
119
addons: dict = None,
120
headers: dict = None,
121
timeout = None,
122
endpoint: str = "v1/speak",
123
**kwargs
124
) -> SpeakRESTResponse:
125
"""Async version of stream_memory method"""
126
127
async def stream_raw(
128
self,
129
source: FileSource,
130
options: SpeakRESTOptions = None,
131
addons: dict = None,
132
headers: dict = None,
133
timeout = None,
134
endpoint: str = "v1/speak",
135
**kwargs
136
) -> httpx.Response:
137
"""Async version of stream_raw method"""
138
139
async def save(
140
self,
141
filename: str,
142
source: FileSource,
143
options: SpeakRESTOptions = None,
144
addons: dict = None,
145
headers: dict = None,
146
timeout = None,
147
endpoint: str = "v1/speak",
148
**kwargs
149
) -> SpeakRESTResponse:
150
"""Async version of save method"""
151
152
async def file(
153
self,
154
filename: str,
155
source: FileSource,
156
options: SpeakRESTOptions = None,
157
addons: dict = None,
158
timeout = None,
159
endpoint: str = "v1/speak",
160
**kwargs
161
) -> SpeakRESTResponse:
162
"""Async version of file method"""
163
```
164
165
### WebSocket Client (Streaming Audio Generation)
166
167
Streaming text-to-speech client that accepts incremental text input and delivers audio output in real time.
168
169
```python { .api }
170
class SpeakWebSocketClient:
171
def __init__(self, config: DeepgramClientOptions, microphone: Microphone = None): ...
172
173
def start(
174
self,
175
options: SpeakWSOptions = None,
176
addons: dict = None,
177
headers: dict = None,
178
members: dict = None,
179
**kwargs
180
) -> bool:
181
"""
182
Start WebSocket connection for streaming TTS.
183
184
Args:
185
options: WebSocket configuration options
186
addons: Additional request parameters
187
headers: Additional HTTP headers
188
members: Member configuration
189
190
Returns:
191
bool: True if connection started successfully
192
"""
193
194
def on(self, event: SpeakWebSocketEvents, handler: Callable) -> None:
195
"""
196
Register event handler for WebSocket events.
197
198
Args:
199
event: WebSocket event type
200
handler: Callable to handle the event
201
"""
202
203
def send_text(self, text_input: str) -> bool:
204
"""
205
Send text for speech synthesis.
206
207
Args:
208
text_input: Text to convert to speech
209
210
Returns:
211
bool: True if text sent successfully
212
"""
213
214
def send(self, data: Union[str, bytes]) -> bool:
215
"""
216
Send text data (alias for send_text).
217
218
Args:
219
data: Text or bytes to send
220
221
Returns:
222
bool: True if data sent successfully
223
"""
224
225
def send_raw(self, msg: str) -> bool:
226
"""
227
Send raw WebSocket message.
228
229
Args:
230
msg: Raw message to send
231
232
Returns:
233
bool: True if message sent successfully
234
"""
235
236
def send_control(
237
self,
238
msg_type: Union[SpeakWebSocketMessage, str],
239
data: str = ""
240
) -> bool:
241
"""
242
Send control message.
243
244
Args:
245
msg_type: Message type constant
246
data: Optional data payload
247
248
Returns:
249
bool: True if control message sent successfully
250
"""
251
252
def flush(self) -> bool:
253
"""
254
Flush current synthesis buffer.
255
256
Returns:
257
bool: True if flush successful
258
"""
259
260
def clear(self) -> bool:
261
"""
262
Clear synthesis buffer.
263
264
Returns:
265
bool: True if clear successful
266
"""
267
268
def finish(self) -> bool:
269
"""
270
Finish WebSocket connection.
271
272
Returns:
273
bool: True if finish successful
274
"""
275
276
def wait_for_complete(self) -> None:
277
"""
278
Wait for synthesis completion.
279
"""
280
281
class AsyncSpeakWebSocketClient:
282
def __init__(self, config: DeepgramClientOptions, microphone: Microphone = None): ...
283
284
async def start(...) -> bool: ...
285
def on(self, event: SpeakWebSocketEvents, handler: Callable) -> None: ... # Not async
286
async def send_text(self, text_input: str) -> bool: ...
287
async def send(self, data: Union[str, bytes]) -> bool: ...
288
async def send_raw(self, msg: str) -> bool: ...
289
async def send_control(...) -> bool: ...
290
async def flush(self) -> bool: ...
291
async def clear(self) -> bool: ...
292
async def finish(self) -> bool: ...
293
async def wait_for_complete(self) -> None: ...
294
295
# Alternative client names
296
class SpeakWSClient(SpeakWebSocketClient): ...
297
class AsyncSpeakWSClient(AsyncSpeakWebSocketClient): ...
298
```
299
300
### Router Access
301
302
Access text-to-speech clients through the main client's speak router.
303
304
```python { .api }
305
class SpeakRouter:
306
@property
307
def rest(self) -> SpeakRESTClient: ...
308
@property
309
def asyncrest(self) -> AsyncSpeakRESTClient: ...
310
@property
311
def websocket(self) -> SpeakWebSocketClient: ...
312
@property
313
def asyncwebsocket(self) -> AsyncSpeakWebSocketClient: ...
314
```
315
316
### Options Classes
317
318
#### REST Options
319
320
```python { .api }
321
class SpeakRESTOptions:
322
def __init__(self, **kwargs): ...
323
324
# Voice model selection
325
model: str = "aura-asteria-en" # Voice model name
326
327
# Audio format settings
328
encoding: str = "linear16" # Audio encoding format
329
container: str = "wav" # Audio container format
330
sample_rate: int = 24000 # Sample rate in Hz
331
bit_rate: int = None # Bit rate for compressed formats
332
333
# Additional options
334
extra: dict = None # Additional synthesis options
335
336
# Legacy alias
337
class SpeakOptions(SpeakRESTOptions): ...
338
```
339
340
#### WebSocket Options
341
342
```python { .api }
343
class SpeakWSOptions:
344
def __init__(self, **kwargs): ...
345
346
# Voice model selection
347
model: str = "aura-asteria-en" # Voice model name
348
349
# Audio format settings (required for WebSocket)
350
encoding: str = "linear16" # Audio encoding format
351
sample_rate: int = 24000 # Sample rate in Hz
352
container: str = None # Audio container (optional for streaming)
353
354
# Additional options
355
extra: dict = None # Additional synthesis options
356
```
357
358
### WebSocket Events and Messages
359
360
Event constants and message types for WebSocket text-to-speech operations.
361
362
```python { .api }
363
class SpeakWebSocketEvents:
364
"""WebSocket event constants for TTS operations"""
365
OPEN: str = "Open"
366
METADATA: str = "Metadata"
367
AUDIO: str = "Audio"
368
FLUSHED: str = "Flushed"
369
CLEARED: str = "Cleared"
370
CLOSE: str = "Close"
371
ERROR: str = "Error"
372
WARNING: str = "Warning"
373
UNHANDLED: str = "Unhandled"
374
375
class SpeakWebSocketMessage:
376
"""WebSocket message type constants"""
377
SPEAK: str = "Speak"
378
FLUSH: str = "Flush"
379
CLEAR: str = "Clear"
380
CLOSE: str = "Close"
381
```
382
383
### Source Types
384
385
Input sources for text data in various formats.
386
387
```python { .api }
388
class SpeakSource:
389
"""Base class for text-to-speech sources"""
390
391
class TextSource(SpeakSource):
392
def __init__(self, text: str):
393
"""
394
Text from string.
395
396
Args:
397
text: Text content to synthesize
398
"""
399
400
class BufferSource(SpeakSource):
401
def __init__(self, buffer: bytes):
402
"""
403
Text from byte buffer.
404
405
Args:
406
buffer: Text content as bytes
407
"""
408
409
class StreamSource(SpeakSource):
410
def __init__(self, stream):
411
"""
412
Text from stream object.
413
414
Args:
415
stream: File-like stream object
416
"""
417
418
class FileSource(SpeakSource):
419
def __init__(self, file: str):
420
"""
421
Text from local file.
422
423
Args:
424
file: Path to local text file
425
"""
426
427
# Alternative source names
428
class SpeakRestSource(SpeakSource): ...
429
class SpeakRESTSource(SpeakSource): ...
430
```
431
432
### Response Types
433
434
#### REST Response Types
435
436
```python { .api }
437
class SpeakRESTResponse:
438
"""REST text-to-speech response containing generated audio"""
439
content: bytes # Generated audio data
440
headers: dict # Response headers with metadata
441
442
def stream_to_file(self, filename: str) -> None:
443
"""
444
Save audio content to file.
445
446
Args:
447
filename: Output file path
448
"""
449
450
# Legacy alias
451
class SpeakResponse(SpeakRESTResponse): ...
452
```
453
454
#### WebSocket Response Types
455
456
```python { .api }
457
class SpeakWSMetadataResponse:
458
"""WebSocket metadata response"""
459
type: str = "Metadata"
460
request_id: str
461
model_name: str
462
model_uuid: str
463
464
class FlushedResponse:
465
"""Buffer flush confirmation"""
466
type: str = "Flushed"
467
468
class ClearedResponse:
469
"""Buffer clear confirmation"""
470
type: str = "Cleared"
471
472
class WarningResponse:
473
"""Synthesis warning"""
474
type: str = "Warning"
475
message: str
476
477
# Common WebSocket responses are inherited from common module:
478
# OpenResponse, CloseResponse, ErrorResponse, UnhandledResponse
479
```
480
481
## Usage Examples
482
483
### Basic Text-to-Speech
484
485
```python
486
from deepgram import DeepgramClient, TextSource, SpeakRESTOptions
487
488
client = DeepgramClient(api_key="your-api-key")
489
490
# Generate speech from text
491
source = TextSource("Hello, world! This is a test of the Deepgram text-to-speech API.")
492
options = SpeakRESTOptions(
493
model="aura-asteria-en",
494
encoding="linear16",
495
container="wav",
496
sample_rate=24000
497
)
498
499
response = client.speak.rest.stream_memory(source, options)
500
501
# Save to file
502
with open("output.wav", "wb") as f:
503
f.write(response.content)
504
505
# Or use convenience method
506
response.stream_to_file("output.wav")
507
```
508
509
### Voice Model Selection
510
511
```python
512
from deepgram import DeepgramClient, TextSource, SpeakRESTOptions
513
514
client = DeepgramClient(api_key="your-api-key")
515
516
# Different voice models
517
models = [
518
"aura-asteria-en", # English, female
519
"aura-luna-en", # English, female
520
"aura-stella-en", # English, female
521
"aura-athena-en", # English, female
522
"aura-hera-en", # English, female
523
"aura-orion-en", # English, male
524
"aura-arcas-en", # English, male
525
"aura-perseus-en", # English, male
526
"aura-angus-en", # English, male
527
"aura-orpheus-en", # English, male
528
]
529
530
source = TextSource("This is a test with different voice models.")
531
532
for model in models:
533
options = SpeakRESTOptions(model=model)
534
    response = client.speak.rest.stream_memory(source, options)
535
response.stream_to_file(f"output_{model}.wav")
536
```
537
538
### Audio Format Options
539
540
```python
541
from deepgram import DeepgramClient, TextSource, SpeakRESTOptions
542
543
client = DeepgramClient(api_key="your-api-key")
544
source = TextSource("Testing different audio formats.")
545
546
# WAV format (uncompressed)
547
wav_options = SpeakRESTOptions(
548
model="aura-asteria-en",
549
encoding="linear16",
550
container="wav",
551
sample_rate=24000
552
)
553
554
# MP3 format (compressed)
555
mp3_options = SpeakRESTOptions(
556
model="aura-asteria-en",
557
encoding="mp3",
558
container="mp3",
559
sample_rate=22050,
560
bit_rate=128000
561
)
562
563
# FLAC format (lossless compression)
564
flac_options = SpeakRESTOptions(
565
model="aura-asteria-en",
566
encoding="flac",
567
container="flac",
568
sample_rate=24000
569
)
570
571
# Generate in different formats
572
wav_response = client.speak.rest.stream_memory(source, wav_options)
573
mp3_response = client.speak.rest.stream_memory(source, mp3_options)
574
flac_response = client.speak.rest.stream_memory(source, flac_options)
575
576
wav_response.stream_to_file("output.wav")
577
mp3_response.stream_to_file("output.mp3")
578
flac_response.stream_to_file("output.flac")
579
```
580
581
### Streaming Text-to-Speech
582
583
```python
584
from deepgram import DeepgramClient, SpeakWSOptions, SpeakWebSocketEvents
585
import threading
586
import queue
587
588
client = DeepgramClient(api_key="your-api-key")
589
audio_queue = queue.Queue()
590
591
def on_open(self, open, **kwargs):
592
print("TTS connection opened")
593
594
def on_audio_data(self, data, **kwargs):
595
# Received audio chunk
596
audio_queue.put(data)
597
598
def on_close(self, close, **kwargs):
599
print("TTS connection closed")
600
601
def on_error(self, error, **kwargs):
602
print(f"TTS error: {error}")
603
604
# Configure WebSocket options
605
options = SpeakWSOptions(
606
model="aura-asteria-en",
607
encoding="linear16",
608
sample_rate=24000
609
)
610
611
# Start connection
612
dg_connection = client.speak.websocket
613
dg_connection.on(SpeakWebSocketEvents.OPEN, on_open)
dg_connection.on(SpeakWebSocketEvents.AUDIO, on_audio_data)
dg_connection.on(SpeakWebSocketEvents.CLOSE, on_close)
dg_connection.on(SpeakWebSocketEvents.ERROR, on_error)
617
618
if dg_connection.start(options):
619
# Send text incrementally
620
dg_connection.send("Hello, this is streaming text-to-speech. ")
621
dg_connection.send("I can send text in chunks and receive audio in real-time. ")
622
dg_connection.send("This is very useful for interactive applications.")
623
624
# Flush to ensure all text is processed
625
dg_connection.flush()
626
627
    # Finish the connection (the documented teardown method is finish(), not close())
    dg_connection.finish()
629
630
# Process received audio
631
audio_data = b""
632
while not audio_queue.empty():
633
audio_data += audio_queue.get()
634
635
# Save streamed audio
636
with open("streamed_output.wav", "wb") as f:
637
f.write(audio_data)
638
```
639
640
### Async Text-to-Speech
641
642
```python
643
import asyncio
644
from deepgram import DeepgramClient, TextSource, SpeakRESTOptions
645
646
async def async_tts_example():
647
client = DeepgramClient(api_key="your-api-key")
648
649
source = TextSource("This is an async text-to-speech example.")
650
options = SpeakRESTOptions(
651
model="aura-asteria-en",
652
encoding="linear16",
653
container="wav"
654
)
655
656
    response = await client.speak.asyncrest.stream_memory(source, options)
657
658
with open("async_output.wav", "wb") as f:
659
f.write(response.content)
660
661
print("Async TTS completed")
662
663
# Run async example
664
asyncio.run(async_tts_example())
665
```
666
667
### Error Handling
668
669
```python
670
from deepgram import DeepgramClient, DeepgramApiError, TextSource, SpeakRESTOptions
671
672
client = DeepgramClient(api_key="your-api-key")
673
674
try:
675
source = TextSource("Text to synthesize")
676
options = SpeakRESTOptions(
677
model="invalid-model", # This will cause an error
678
encoding="linear16"
679
)
680
681
    response = client.speak.rest.stream_memory(source, options)
682
683
except DeepgramApiError as e:
684
print(f"API Error: {e}")
685
except Exception as e:
686
print(f"Unexpected error: {e}")
687
```