0
# Text-to-Speech
1
2
High-quality neural text-to-speech synthesis with multiple voice models and real-time streaming capabilities. The Speak module supports both REST API for generating complete audio files and WebSocket streaming for real-time audio generation with various voice models, audio formats, and synthesis options.
3
4
## Capabilities
5
6
### REST Client (Complete Audio Generation)
7
8
Synchronous client for generating complete audio files from text input with comprehensive voice and format options.
9
10
```python { .api }
11
class SpeakRESTClient:
12
def stream_memory(
13
self,
14
source: FileSource,
15
options: SpeakRESTOptions = None,
16
addons: dict = None,
17
headers: dict = None,
18
timeout = None,
19
endpoint: str = "v1/speak",
20
**kwargs
21
) -> SpeakRESTResponse:
22
"""
23
Generate speech from text input and return in-memory response.
24
25
Args:
26
source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)
27
options: Synthesis configuration options
28
addons: Additional request parameters
29
headers: Additional HTTP headers
30
timeout: Request timeout
31
endpoint: API endpoint override
32
33
Returns:
34
SpeakRESTResponse: Generated audio data with metadata
35
"""
36
37
def stream_raw(
38
self,
39
source: FileSource,
40
options: SpeakRESTOptions = None,
41
addons: dict = None,
42
headers: dict = None,
43
timeout = None,
44
endpoint: str = "v1/speak",
45
**kwargs
46
) -> httpx.Response:
47
"""
48
Generate speech and return raw HTTP response.
49
50
Args:
51
source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)
52
options: Synthesis configuration options
53
addons: Additional request parameters
54
headers: Additional HTTP headers
55
timeout: Request timeout
56
endpoint: API endpoint override
57
58
Returns:
59
httpx.Response: Raw HTTP response with audio data
60
"""
61
62
def save(
63
self,
64
filename: str,
65
source: FileSource,
66
options: SpeakRESTOptions = None,
67
addons: dict = None,
68
headers: dict = None,
69
timeout = None,
70
endpoint: str = "v1/speak",
71
**kwargs
72
) -> SpeakRESTResponse:
73
"""
74
Generate speech and save directly to file.
75
76
Args:
77
filename: Output file path
78
source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)
79
options: Synthesis configuration options
80
addons: Additional request parameters
81
headers: Additional HTTP headers
82
timeout: Request timeout
83
endpoint: API endpoint override
84
85
Returns:
86
SpeakRESTResponse: Response metadata and status
87
"""
88
89
def file(
90
self,
91
filename: str,
92
source: FileSource,
93
options: SpeakRESTOptions = None,
94
addons: dict = None,
95
timeout = None,
96
endpoint: str = "v1/speak",
97
**kwargs
98
) -> SpeakRESTResponse:
99
"""
100
Generate speech and save to file (alias for save method).
101
102
Args:
103
filename: Output file path
104
source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)
105
options: Synthesis configuration options
106
addons: Additional request parameters
107
timeout: Request timeout
108
endpoint: API endpoint override
109
110
Returns:
111
SpeakRESTResponse: Response metadata and status
112
"""
113
114
class AsyncSpeakRESTClient:
115
async def stream_memory(
116
self,
117
source: FileSource,
118
options: SpeakRESTOptions = None,
119
addons: dict = None,
120
headers: dict = None,
121
timeout = None,
122
endpoint: str = "v1/speak",
123
**kwargs
124
) -> SpeakRESTResponse:
125
"""Async version of stream_memory method"""
126
127
async def stream_raw(
128
self,
129
source: FileSource,
130
options: SpeakRESTOptions = None,
131
addons: dict = None,
132
headers: dict = None,
133
timeout = None,
134
endpoint: str = "v1/speak",
135
**kwargs
136
) -> httpx.Response:
137
"""Async version of stream_raw method"""
138
139
async def save(
140
self,
141
filename: str,
142
source: FileSource,
143
options: SpeakRESTOptions = None,
144
addons: dict = None,
145
headers: dict = None,
146
timeout = None,
147
endpoint: str = "v1/speak",
148
**kwargs
149
) -> SpeakRESTResponse:
150
"""Async version of save method"""
151
152
async def file(
153
self,
154
filename: str,
155
source: FileSource,
156
options: SpeakRESTOptions = None,
157
addons: dict = None,
158
timeout = None,
159
endpoint: str = "v1/speak",
160
**kwargs
161
) -> SpeakRESTResponse:
162
"""Async version of file method"""
163
```
164
165
### WebSocket Client (Streaming Audio Generation)
166
167
Streaming text-to-speech client that accepts incremental text input and delivers audio output in real time.
168
169
```python { .api }
170
class SpeakWebSocketClient:
171
def __init__(self, config: DeepgramClientOptions, microphone: Microphone = None): ...
172
173
def start(
174
self,
175
options: SpeakWSOptions = None,
176
addons: dict = None,
177
headers: dict = None,
178
members: dict = None,
179
**kwargs
180
) -> bool:
181
"""
182
Start WebSocket connection for streaming TTS.
183
184
Args:
185
options: WebSocket configuration options
186
addons: Additional request parameters
187
headers: Additional HTTP headers
188
members: Member configuration
189
190
Returns:
191
bool: True if connection started successfully
192
"""
193
194
def on(self, event: SpeakWebSocketEvents, handler: Callable) -> None:
195
"""
196
Register event handler for WebSocket events.
197
198
Args:
199
event: WebSocket event type
200
handler: Callable to handle the event
201
"""
202
203
def send_text(self, text_input: str) -> bool:
204
"""
205
Send text for speech synthesis.
206
207
Args:
208
text_input: Text to convert to speech
209
210
Returns:
211
bool: True if text sent successfully
212
"""
213
214
def send(self, data: Union[str, bytes]) -> bool:
215
"""
216
Send text data (alias for send_text).
217
218
Args:
219
data: Text or bytes to send
220
221
Returns:
222
bool: True if data sent successfully
223
"""
224
225
def send_raw(self, msg: str) -> bool:
226
"""
227
Send raw WebSocket message.
228
229
Args:
230
msg: Raw message to send
231
232
Returns:
233
bool: True if message sent successfully
234
"""
235
236
def send_control(
237
self,
238
msg_type: Union[SpeakWebSocketMessage, str],
239
data: str = ""
240
) -> bool:
241
"""
242
Send control message.
243
244
Args:
245
msg_type: Message type constant
246
data: Optional data payload
247
248
Returns:
249
bool: True if control message sent successfully
250
"""
251
252
def flush(self) -> bool:
253
"""
254
Flush current synthesis buffer.
255
256
Returns:
257
bool: True if flush successful
258
"""
259
260
def clear(self) -> bool:
261
"""
262
Clear synthesis buffer.
263
264
Returns:
265
bool: True if clear successful
266
"""
267
268
def finish(self) -> bool:
269
"""
270
Finish WebSocket connection.
271
272
Returns:
273
bool: True if finish successful
274
"""
275
276
def wait_for_complete(self) -> None:
277
"""
278
Wait for synthesis completion.
279
"""
280
281
class AsyncSpeakWebSocketClient:
282
def __init__(self, config: DeepgramClientOptions, microphone: Microphone = None): ...
283
284
async def start(...) -> bool: ...
285
def on(self, event: SpeakWebSocketEvents, handler: Callable) -> None: ... # Not async
286
async def send_text(self, text_input: str) -> bool: ...
287
async def send(self, data: Union[str, bytes]) -> bool: ...
288
async def send_raw(self, msg: str) -> bool: ...
289
async def send_control(...) -> bool: ...
290
async def flush(self) -> bool: ...
291
async def clear(self) -> bool: ...
292
async def finish(self) -> bool: ...
293
async def wait_for_complete(self) -> None: ...
294
295
# Alternative client names
296
class SpeakWSClient(SpeakWebSocketClient): ...
297
class AsyncSpeakWSClient(AsyncSpeakWebSocketClient): ...
298
```
299
300
### Router Access
301
302
Access text-to-speech clients through the main client's speak router.
303
304
```python { .api }
305
class SpeakRouter:
306
@property
307
def rest(self) -> SpeakRESTClient: ...
308
@property
309
def asyncrest(self) -> AsyncSpeakRESTClient: ...
310
@property
311
def websocket(self) -> SpeakWebSocketClient: ...
312
@property
313
def asyncwebsocket(self) -> AsyncSpeakWebSocketClient: ...
314
```
315
316
### Options Classes
317
318
#### REST Options
319
320
```python { .api }
321
class SpeakRESTOptions:
322
def __init__(self, **kwargs): ...
323
324
# Voice model selection
325
model: str = "aura-asteria-en" # Voice model name
326
327
# Audio format settings
328
encoding: str = "linear16" # Audio encoding format
329
container: str = "wav" # Audio container format
330
sample_rate: int = 24000 # Sample rate in Hz
331
bit_rate: int = None # Bit rate for compressed formats
332
333
# Additional options
334
extra: dict = None # Additional synthesis options
335
336
# Legacy alias
337
class SpeakOptions(SpeakRESTOptions): ...
338
```
339
340
#### WebSocket Options
341
342
```python { .api }
343
class SpeakWSOptions:
344
def __init__(self, **kwargs): ...
345
346
# Voice model selection
347
model: str = "aura-asteria-en" # Voice model name
348
349
# Audio format settings (required for WebSocket)
350
encoding: str = "linear16" # Audio encoding format
351
sample_rate: int = 24000 # Sample rate in Hz
352
container: str = None # Audio container (optional for streaming)
353
354
# Additional options
355
extra: dict = None # Additional synthesis options
356
```
357
358
### WebSocket Events and Messages
359
360
Event constants and message types for WebSocket text-to-speech operations.
361
362
```python { .api }
363
class SpeakWebSocketEvents:
364
"""WebSocket event constants for TTS operations"""
365
OPEN: str = "Open"
366
METADATA: str = "Metadata"
367
AUDIO: str = "Audio"
368
FLUSHED: str = "Flushed"
369
CLEARED: str = "Cleared"
370
CLOSE: str = "Close"
371
ERROR: str = "Error"
372
WARNING: str = "Warning"
373
UNHANDLED: str = "Unhandled"
374
375
class SpeakWebSocketMessage:
376
"""WebSocket message type constants"""
377
SPEAK: str = "Speak"
378
FLUSH: str = "Flush"
379
CLEAR: str = "Clear"
380
CLOSE: str = "Close"
381
```
382
383
### Source Types
384
385
Input sources for text data in various formats.
386
387
```python { .api }
388
class SpeakSource:
389
"""Base class for text-to-speech sources"""
390
391
class TextSource(SpeakSource):
392
def __init__(self, text: str):
393
"""
394
Text from string.
395
396
Args:
397
text: Text content to synthesize
398
"""
399
400
class BufferSource(SpeakSource):
401
def __init__(self, buffer: bytes):
402
"""
403
Text from byte buffer.
404
405
Args:
406
buffer: Text content as bytes
407
"""
408
409
class StreamSource(SpeakSource):
410
def __init__(self, stream):
411
"""
412
Text from stream object.
413
414
Args:
415
stream: File-like stream object
416
"""
417
418
class FileSource(SpeakSource):
419
def __init__(self, file: str):
420
"""
421
Text from local file.
422
423
Args:
424
file: Path to local text file
425
"""
426
427
# Alternative source names
428
class SpeakRestSource(SpeakSource): ...
429
class SpeakRESTSource(SpeakSource): ...
430
```
431
432
### Response Types
433
434
#### REST Response Types
435
436
```python { .api }
437
class SpeakRESTResponse:
438
"""REST text-to-speech response containing generated audio"""
439
content: bytes # Generated audio data
440
headers: dict # Response headers with metadata
441
442
def stream_to_file(self, filename: str) -> None:
443
"""
444
Save audio content to file.
445
446
Args:
447
filename: Output file path
448
"""
449
450
# Legacy alias
451
class SpeakResponse(SpeakRESTResponse): ...
452
```
453
454
#### WebSocket Response Types
455
456
```python { .api }
457
class SpeakWSMetadataResponse:
458
"""WebSocket metadata response"""
459
type: str = "Metadata"
460
request_id: str
461
model_name: str
462
model_uuid: str
463
464
class FlushedResponse:
465
"""Buffer flush confirmation"""
466
type: str = "Flushed"
467
468
class ClearedResponse:
469
"""Buffer clear confirmation"""
470
type: str = "Cleared"
471
472
class WarningResponse:
473
"""Synthesis warning"""
474
type: str = "Warning"
475
message: str
476
477
# Common WebSocket responses are inherited from common module:
478
# OpenResponse, CloseResponse, ErrorResponse, UnhandledResponse
479
```
480
481
## Usage Examples
482
483
### Basic Text-to-Speech
484
485
```python
486
from deepgram import DeepgramClient, TextSource, SpeakRESTOptions
487
488
client = DeepgramClient(api_key="your-api-key")
489
490
# Generate speech from text
491
source = TextSource("Hello, world! This is a test of the Deepgram text-to-speech API.")
492
options = SpeakRESTOptions(
493
model="aura-asteria-en",
494
encoding="linear16",
495
container="wav",
496
sample_rate=24000
497
)
498
499
response = client.speak.rest.stream_memory(source, options)
500
501
# Save to file
502
with open("output.wav", "wb") as f:
503
f.write(response.content)
504
505
# Or use convenience method
506
response.stream_to_file("output.wav")
507
```
508
509
### Voice Model Selection
510
511
```python
512
from deepgram import DeepgramClient, TextSource, SpeakRESTOptions
513
514
client = DeepgramClient(api_key="your-api-key")
515
516
# Different voice models
517
models = [
518
"aura-asteria-en", # English, female
519
"aura-luna-en", # English, female
520
"aura-stella-en", # English, female
521
"aura-athena-en", # English, female
522
"aura-hera-en", # English, female
523
"aura-orion-en", # English, male
524
"aura-arcas-en", # English, male
525
"aura-perseus-en", # English, male
526
"aura-angus-en", # English, male
527
"aura-orpheus-en", # English, male
528
]
529
530
source = TextSource("This is a test with different voice models.")
531
532
for model in models:
533
options = SpeakRESTOptions(model=model)
534
    response = client.speak.rest.stream_memory(source, options)
535
response.stream_to_file(f"output_{model}.wav")
536
```
537
538
### Audio Format Options
539
540
```python
541
from deepgram import DeepgramClient, TextSource, SpeakRESTOptions
542
543
client = DeepgramClient(api_key="your-api-key")
544
source = TextSource("Testing different audio formats.")
545
546
# WAV format (uncompressed)
547
wav_options = SpeakRESTOptions(
548
model="aura-asteria-en",
549
encoding="linear16",
550
container="wav",
551
sample_rate=24000
552
)
553
554
# MP3 format (compressed)
555
mp3_options = SpeakRESTOptions(
556
model="aura-asteria-en",
557
encoding="mp3",
558
container="mp3",
559
sample_rate=22050,
560
bit_rate=128000
561
)
562
563
# FLAC format (lossless compression)
564
flac_options = SpeakRESTOptions(
565
model="aura-asteria-en",
566
encoding="flac",
567
container="flac",
568
sample_rate=24000
569
)
570
571
# Generate in different formats
572
wav_response = client.speak.rest.stream_memory(source, wav_options)
573
mp3_response = client.speak.rest.stream_memory(source, mp3_options)
574
flac_response = client.speak.rest.stream_memory(source, flac_options)
575
576
wav_response.stream_to_file("output.wav")
577
mp3_response.stream_to_file("output.mp3")
578
flac_response.stream_to_file("output.flac")
579
```
580
581
### Streaming Text-to-Speech
582
583
```python
584
from deepgram import DeepgramClient, SpeakWSOptions, SpeakWebSocketEvents
585
import threading
586
import queue
587
588
client = DeepgramClient(api_key="your-api-key")
589
audio_queue = queue.Queue()
590
591
def on_open(self, open, **kwargs):
592
print("TTS connection opened")
593
594
def on_audio_data(self, data, **kwargs):
595
# Received audio chunk
596
audio_queue.put(data)
597
598
def on_close(self, close, **kwargs):
599
print("TTS connection closed")
600
601
def on_error(self, error, **kwargs):
602
print(f"TTS error: {error}")
603
604
# Configure WebSocket options
605
options = SpeakWSOptions(
606
model="aura-asteria-en",
607
encoding="linear16",
608
sample_rate=24000
609
)
610
611
# Start connection
612
dg_connection = client.speak.websocket
613
dg_connection.on(SpeakWebSocketEvents.OPEN, on_open)
dg_connection.on(SpeakWebSocketEvents.AUDIO, on_audio_data)
dg_connection.on(SpeakWebSocketEvents.CLOSE, on_close)
dg_connection.on(SpeakWebSocketEvents.ERROR, on_error)
617
618
if dg_connection.start(options):
619
# Send text incrementally
620
dg_connection.send("Hello, this is streaming text-to-speech. ")
621
dg_connection.send("I can send text in chunks and receive audio in real-time. ")
622
dg_connection.send("This is very useful for interactive applications.")
623
624
# Flush to ensure all text is processed
625
dg_connection.flush()
626
627
    # Finish the connection (the documented teardown method is finish(), not close())
    dg_connection.finish()
629
630
# Process received audio
631
audio_data = b""
632
while not audio_queue.empty():
633
audio_data += audio_queue.get()
634
635
# Save streamed audio
636
with open("streamed_output.wav", "wb") as f:
637
f.write(audio_data)
638
```
639
640
### Async Text-to-Speech
641
642
```python
643
import asyncio
644
from deepgram import DeepgramClient, TextSource, SpeakRESTOptions
645
646
async def async_tts_example():
647
client = DeepgramClient(api_key="your-api-key")
648
649
source = TextSource("This is an async text-to-speech example.")
650
options = SpeakRESTOptions(
651
model="aura-asteria-en",
652
encoding="linear16",
653
container="wav"
654
)
655
656
    response = await client.speak.asyncrest.stream_memory(source, options)
657
658
with open("async_output.wav", "wb") as f:
659
f.write(response.content)
660
661
print("Async TTS completed")
662
663
# Run async example
664
asyncio.run(async_tts_example())
665
```
666
667
### Error Handling
668
669
```python
670
from deepgram import DeepgramClient, DeepgramApiError, TextSource, SpeakRESTOptions
671
672
client = DeepgramClient(api_key="your-api-key")
673
674
try:
675
source = TextSource("Text to synthesize")
676
options = SpeakRESTOptions(
677
model="invalid-model", # This will cause an error
678
encoding="linear16"
679
)
680
681
    response = client.speak.rest.stream_memory(source, options)
682
683
except DeepgramApiError as e:
684
print(f"API Error: {e}")
685
except Exception as e:
686
print(f"Unexpected error: {e}")
687
```