# Speech-to-Text

Comprehensive speech recognition supporting both batch transcription of prerecorded audio and real-time streaming transcription. The Listen module provides advanced features such as speaker diarization, punctuation, profanity filtering, keyword detection, and sentiment analysis, with support for multiple languages and audio formats.

## Capabilities

### REST Client (Prerecorded Audio)

Synchronous client for transcribing prerecorded audio files with comprehensive configuration options and detailed transcription results.
```python { .api }
class ListenRESTClient:
    def transcribe_url(
        self,
        source: UrlSource,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> PrerecordedResponse:
        """
        Transcribe audio from URL.

        Args:
            source: URL source containing audio to transcribe
            options: Transcription configuration options
            headers: Additional HTTP headers
            timeout: Request timeout

        Returns:
            PrerecordedResponse: Complete transcription results with metadata
        """

    def transcribe_file(
        self,
        source: FileSource,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> PrerecordedResponse:
        """
        Transcribe audio from file.

        Args:
            source: File source containing audio to transcribe
            options: Transcription configuration options
            headers: Additional HTTP headers
            timeout: Request timeout

        Returns:
            PrerecordedResponse: Complete transcription results with metadata
        """

    def transcribe_url_callback(
        self,
        source: UrlSource,
        callback: str,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """
        Transcribe audio from URL with callback URL for results.

        Args:
            source: URL source containing audio to transcribe
            callback: Callback URL to receive transcription results
            options: Transcription configuration options
            headers: Additional HTTP headers
            timeout: Request timeout

        Returns:
            AsyncPrerecordedResponse: Async response for callback processing
        """

    def transcribe_file_callback(
        self,
        source: FileSource,
        callback: str,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """
        Transcribe audio from file with callback URL for results.

        Args:
            source: File source containing audio to transcribe
            callback: Callback URL to receive transcription results
            options: Transcription configuration options
            headers: Additional HTTP headers
            timeout: Request timeout

        Returns:
            AsyncPrerecordedResponse: Async response for callback processing
        """

class AsyncListenRESTClient:
    async def transcribe_url(
        self,
        source: UrlSource,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """Async version of transcribe_url method"""

    async def transcribe_file(
        self,
        source: FileSource,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """Async version of transcribe_file method"""

    async def transcribe_url_callback(
        self,
        source: UrlSource,
        callback: str,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """Async version of transcribe_url_callback method"""

    async def transcribe_file_callback(
        self,
        source: FileSource,
        callback: str,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """Async version of transcribe_file_callback method"""
```
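For long recordings, the callback variants return immediately and deliver results to your webhook instead of blocking. A minimal sketch using `transcribe_url_callback`; the webhook endpoint is a placeholder you would replace with your own:

```python
from deepgram import DeepgramClient, UrlSource, ListenRESTOptions

client = DeepgramClient(api_key="your-api-key")

source = UrlSource("https://example.com/long-recording.wav")
options = ListenRESTOptions(model="nova-2", punctuate=True)

# Returns immediately; the transcript is POSTed to the callback URL
# (placeholder endpoint) once processing completes.
response = client.listen.rest.transcribe_url_callback(
    source,
    callback="https://your-server.example.com/transcripts",
    options=options,
)
```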

### WebSocket Client (Real-time Audio)

Real-time streaming transcription client supporting live audio processing with configurable buffering and result handling.
```python { .api }
class ListenWebSocketClient:
    def start(self, options: ListenWebSocketOptions) -> bool:
        """
        Start WebSocket connection for real-time transcription.

        Args:
            options: WebSocket configuration options

        Returns:
            bool: True if connection started successfully
        """

    def send(self, data: bytes) -> bool:
        """
        Send audio data for transcription.

        Args:
            data: Raw audio bytes

        Returns:
            bool: True if data sent successfully
        """

    def finish(self) -> bool:
        """
        Signal end of audio stream and receive final results.

        Returns:
            bool: True if stream finished successfully
        """

    def close(self) -> bool:
        """
        Close WebSocket connection.

        Returns:
            bool: True if connection closed successfully
        """

class AsyncListenWebSocketClient:
    async def start(self, options: ListenWebSocketOptions) -> bool: ...
    async def send(self, data: bytes) -> bool: ...
    async def finish(self) -> bool: ...
    async def close(self) -> bool: ...
```
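The async client mirrors this surface method for method. A minimal sketch for an asyncio application, assuming you supply the `audio_chunks` iterable of raw PCM bytes:

```python
from deepgram import DeepgramClient, ListenWebSocketOptions

async def stream_audio(audio_chunks):
    client = DeepgramClient(api_key="your-api-key")
    dg_connection = client.listen.asyncwebsocket  # AsyncListenWebSocketClient

    options = ListenWebSocketOptions(encoding="linear16", sample_rate=16000, channels=1)
    if not await dg_connection.start(options):
        raise RuntimeError("WebSocket connection failed to start")

    for chunk in audio_chunks:        # raw audio bytes supplied by the caller
        await dg_connection.send(chunk)

    await dg_connection.finish()      # flush any remaining final results
    await dg_connection.close()
```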

### Router Access

Access the speech-to-text clients through the main client's `listen` router.
```python { .api }
class ListenRouter:
    @property
    def rest(self) -> ListenRESTClient: ...
    @property
    def asyncrest(self) -> AsyncListenRESTClient: ...
    @property
    def websocket(self) -> ListenWebSocketClient: ...
    @property
    def asyncwebsocket(self) -> AsyncListenWebSocketClient: ...
```
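All four clients hang off the main client's `listen` attribute, for example:

```python
from deepgram import DeepgramClient

client = DeepgramClient(api_key="your-api-key")

rest_client = client.listen.rest                 # ListenRESTClient
async_rest_client = client.listen.asyncrest      # AsyncListenRESTClient
ws_client = client.listen.websocket              # ListenWebSocketClient
async_ws_client = client.listen.asyncwebsocket   # AsyncListenWebSocketClient
```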

### Options Classes

#### REST Options
```python { .api }
class ListenRESTOptions:
    def __init__(self, **kwargs): ...

    # Model and language settings
    model: str = "nova-2"            # AI model for transcription
    language: str = "en-US"          # Language code
    version: str = None              # Model version

    # Audio processing
    encoding: str = None             # Audio encoding format
    sample_rate: int = None          # Audio sample rate
    channels: int = None             # Number of audio channels

    # Transcription features
    punctuate: bool = True           # Add punctuation
    profanity_filter: bool = False   # Filter profanity
    redact: list = None              # Redact sensitive information
    diarize: bool = False            # Speaker diarization
    diarize_version: str = None      # Diarization model version
    ner: bool = False                # Named entity recognition
    multichannel: bool = False       # Process multiple channels separately
    alternatives: int = 1            # Number of transcript alternatives
    numerals: bool = False           # Convert written-out numbers to digits
    smart_format: bool = False       # Smart formatting

    # Analysis features
    summarize: bool = False          # Generate summary (also accepts "v2")
    detect_language: bool = False    # Auto-detect language
    paragraphs: bool = False         # Paragraph detection
    utterances: bool = False         # Utterance segmentation
    utt_split: float = None          # Utterance split threshold
    sentiment: bool = False          # Sentiment analysis
    topics: bool = False             # Topic detection
    intents: bool = False            # Intent recognition

    # Keywords and search
    keywords: list = None            # Keyword detection
    keyword_boost: str = None        # Keyword boosting
    search: list = None              # Search terms
    replace: list = None             # Text replacement

    # Output formatting
    filler_words: bool = False       # Include filler words
    dictation: bool = False          # Dictation mode
    measurements: bool = False       # Measurement formatting
    dates: bool = False              # Date formatting
    times: bool = False              # Time formatting

    # Callback and metadata
    callback: str = None             # Webhook callback URL
    callback_method: str = "POST"    # Callback HTTP method
    custom_intent: list = None       # Custom intent models
    custom_intent_mode: str = None   # Custom intent processing mode
    custom_topic: list = None        # Custom topic models
    custom_topic_mode: str = None    # Custom topic processing mode

    # Advanced options
    tag: list = None                 # Custom tags
    extra: dict = None               # Additional options
```
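All fields are optional keyword arguments. For instance, a configuration that redacts sensitive data and transcribes a stereo file channel by channel might look like this (field values are illustrative):

```python
from deepgram import ListenRESTOptions

options = ListenRESTOptions(
    model="nova-2",
    smart_format=True,
    redact=["pci", "ssn"],           # categories to redact (illustrative values)
    multichannel=True,               # transcribe each channel separately
    channels=2,
    keywords=["invoice", "refund"],  # boost domain vocabulary
)
```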

#### WebSocket Options
```python { .api }
class ListenWebSocketOptions:
    def __init__(self, **kwargs): ...

    # Model and language settings
    model: str = "nova-2"            # AI model for transcription
    language: str = "en-US"          # Language code
    version: str = None              # Model version

    # Audio settings (required for WebSocket)
    encoding: str = "linear16"       # Audio encoding
    sample_rate: int = 16000         # Sample rate in Hz
    channels: int = 1                # Number of channels

    # Real-time processing
    interim_results: bool = True     # Receive interim results
    endpointing: bool = True         # Automatic endpoint detection
    vad_events: bool = False         # Voice activity detection events
    utterance_end_ms: int = 1000     # Utterance end timeout

    # Transcription features (same as REST)
    punctuate: bool = True
    profanity_filter: bool = False
    redact: list = None
    diarize: bool = False
    diarize_version: str = None
    ner: bool = False
    alternatives: int = 1
    numerals: bool = False
    smart_format: bool = False

    # Analysis features
    sentiment: bool = False
    topics: bool = False
    intents: bool = False

    # Keywords and search
    keywords: list = None
    keyword_boost: str = None
    search: list = None
    replace: list = None

    # Output options
    filler_words: bool = False
    dictation: bool = False
    measurements: bool = False
    dates: bool = False
    times: bool = False

    # Custom models
    custom_intent: list = None
    custom_intent_mode: str = None
    custom_topic: list = None
    custom_topic_mode: str = None

    # Advanced options
    tag: list = None
    extra: dict = None
```
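With the default `linear16` encoding, the byte rate follows directly from these settings (`sample_rate × channels × 2 bytes per sample`), which is useful for sizing the chunks you pass to `send()`. A small helper, assuming 16-bit PCM:

```python
def pcm_chunk_size(sample_rate: int = 16000, channels: int = 1,
                   frame_ms: int = 20) -> int:
    """Bytes in one frame of 16-bit linear PCM audio."""
    bytes_per_sample = 2  # linear16 means 2 bytes per sample
    return sample_rate * channels * bytes_per_sample * frame_ms // 1000

# Default settings: a 20 ms frame of 16 kHz mono audio is 640 bytes.
assert pcm_chunk_size() == 640
```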

### Source Types

Input sources for audio data in various formats.
```python { .api }
class PrerecordedSource:
    """Base class for prerecorded audio sources"""

class UrlSource(PrerecordedSource):
    def __init__(self, url: str):
        """
        Audio from URL.

        Args:
            url: HTTP/HTTPS URL to audio file
        """

class FileSource(PrerecordedSource):
    def __init__(self, file: str):
        """
        Audio from local file.

        Args:
            file: Path to local audio file
        """

class BufferSource(PrerecordedSource):
    def __init__(self, buffer: bytes):
        """
        Audio from byte buffer.

        Args:
            buffer: Raw audio bytes
        """

class StreamSource(PrerecordedSource):
    def __init__(self, stream):
        """
        Audio from stream object.

        Args:
            stream: File-like stream object
        """

class PreRecordedStreamSource(PrerecordedSource):
    """Legacy stream source alias"""

class ListenRestSource(PrerecordedSource):
    """REST-specific source type"""
```
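Which source to use depends on where the audio lives; a quick sketch of the common cases (paths are illustrative):

```python
from deepgram import UrlSource, FileSource, BufferSource, StreamSource

# Remote audio: pass the URL through.
url_source = UrlSource("https://example.com/audio.wav")

# Local file on disk.
file_source = FileSource("recordings/call.wav")

# Audio already held in memory as bytes.
with open("recordings/call.wav", "rb") as f:
    buffer_source = BufferSource(f.read())

# Any file-like object can be wrapped as a stream.
stream_source = StreamSource(open("recordings/call.wav", "rb"))
```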

### Response Types

#### REST Response Types
```python { .api }
class PrerecordedResponse:
    """Main prerecorded transcription response"""
    metadata: ListenRESTMetadata
    results: ListenRESTResults

class AsyncPrerecordedResponse(PrerecordedResponse):
    """Async prerecorded response"""

class SyncPrerecordedResponse(PrerecordedResponse):
    """Sync prerecorded response"""

class ListenRESTMetadata:
    """REST transcription metadata"""
    request_id: str
    transaction_key: str
    sha256: str
    created: str
    duration: float
    channels: int
    models: list
    model_info: dict

class ListenRESTResults:
    """REST transcription results"""
    channels: list[ListenRESTChannel]
    utterances: list[Utterance] = None
    summary: dict = None

class ListenRESTChannel:
    """Channel-specific transcription results"""
    search: list[Search] = None
    alternatives: list[ListenRESTAlternative]

class ListenRESTAlternative:
    """Alternative transcription result"""
    transcript: str
    confidence: float
    words: list[ListenRESTWord]
    paragraphs: Paragraphs = None
    entities: list[Entity] = None
    translations: list[Translation] = None
    summaries: list[Summaries] = None

class ListenRESTWord:
    """Word-level transcription data"""
    word: str
    start: float
    end: float
    confidence: float
    punctuated_word: str = None
    speaker: int = None
    speaker_confidence: float = None
    language: str = None
```
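Word objects carry timing and, when `diarize=True`, speaker labels, so a response can be walked directly; a sketch:

```python
def print_words(response) -> None:
    """Print each word of the first channel's best alternative."""
    alternative = response.results.channels[0].alternatives[0]
    for word in alternative.words:
        speaker = f"speaker {word.speaker}" if word.speaker is not None else "unknown"
        print(f"[{word.start:6.2f}s-{word.end:6.2f}s] {speaker}: "
              f"{word.punctuated_word or word.word}")
```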

#### WebSocket Response Types
```python { .api }
class LiveResultResponse:
    """Live transcription result"""
    channel: ListenWSChannel
    metadata: ListenWSMetadata
    type: str

class ListenWSMetadataResponse:
    """WebSocket metadata response"""
    type: str
    transaction_key: str
    request_id: str
    sha256: str
    created: str
    duration: float
    channels: int

class SpeechStartedResponse:
    """Speech detection event"""
    type: str
    timestamp: str

class UtteranceEndResponse:
    """Utterance completion event"""
    type: str
    channel: list
    last_word_end: float

class ListenWSChannel:
    """WebSocket channel data"""
    alternatives: list[ListenWSAlternative]

class ListenWSAlternative:
    """WebSocket alternative transcript"""
    transcript: str
    confidence: float
    words: list[ListenWSWord]

class ListenWSWord:
    """WebSocket word-level data"""
    word: str
    start: float
    end: float
    confidence: float
    punctuated_word: str = None
    speaker: int = None
    speaker_confidence: float = None

class ListenWSMetadata:
    """WebSocket connection metadata"""
    request_id: str
    model_name: str
    model_uuid: str
```
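Each message carries a `type` discriminator, so a single handler can dispatch across the types above; a minimal sketch:

```python
def handle_ws_message(message) -> None:
    """Dispatch a WebSocket message on its `type` field."""
    if message.type == "Results":          # LiveResultResponse
        print(message.channel.alternatives[0].transcript)
    elif message.type == "UtteranceEnd":   # UtteranceEndResponse
        print(f"Utterance ended at {message.last_word_end:.2f}s")
    elif message.type == "SpeechStarted":  # SpeechStartedResponse
        print(f"Speech started at {message.timestamp}")
```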

#### Common Response Elements
```python { .api }
class Entity:
    """Named entity recognition result"""
    label: str
    value: str
    confidence: float
    start_word: int
    end_word: int

class Paragraph:
    """Paragraph structure"""
    sentences: list[Sentence]
    start: float
    end: float

class Paragraphs:
    """Collection of paragraphs"""
    transcript: str
    paragraphs: list[Paragraph]

class Sentence:
    """Sentence structure"""
    text: str
    start: float
    end: float

class Utterance:
    """Speaker utterance"""
    start: float
    end: float
    confidence: float
    channel: int
    transcript: str
    words: list[ListenRESTWord]
    speaker: int
    id: str

class Translation:
    """Translation result"""
    language: str
    translation: str

class Warning:
    """Processing warning"""
    parameter: str
    type: str
    message: str

class Summaries:
    """Summary collection"""
    summary: str
    start_word: int
    end_word: int

class SummaryV1:
    """Version 1 summary format"""
    summary: str

class SummaryV2:
    """Version 2 summary format"""
    result: str
    short: str
```
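With `paragraphs=True`, each alternative's `paragraphs` field nests sentences under paragraphs; a sketch that flattens the structure:

```python
def iter_sentences(paragraphs):
    """Yield (start, end, text) for every sentence, in reading order."""
    for paragraph in paragraphs.paragraphs:
        for sentence in paragraph.sentences:
            yield sentence.start, sentence.end, sentence.text
```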

### Events
```python { .api }
class LiveTranscriptionEvents:
    """WebSocket event types for real-time transcription"""
    Open: str = "Open"
    Close: str = "Close"
    Transcript: str = "Results"
    Metadata: str = "Metadata"
    UtteranceEnd: str = "UtteranceEnd"
    SpeechStarted: str = "SpeechStarted"
    Finalize: str = "Finalize"
    Error: str = "Error"
    Unhandled: str = "Unhandled"
    Warning: str = "Warning"
```
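Note that `Transcript` maps to the wire value `"Results"`. These names are what you register handlers against, as in the real-time example below; a minimal wiring sketch (handler bodies are illustrative):

```python
from deepgram import DeepgramClient, LiveTranscriptionEvents

client = DeepgramClient(api_key="your-api-key")
dg_connection = client.listen.websocket

def on_open(self, open, **kwargs):
    print("Connection opened")

def on_utterance_end(self, utterance_end, **kwargs):
    print("Utterance ended")

dg_connection.on(LiveTranscriptionEvents.Open, on_open)
dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
```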

## Usage Examples

### Basic Prerecorded Transcription
```python
from deepgram import DeepgramClient, UrlSource, ListenRESTOptions

client = DeepgramClient(api_key="your-api-key")

# Transcribe from URL
source = UrlSource("https://example.com/audio.wav")
options = ListenRESTOptions(
    model="nova-2",
    language="en-US",
    punctuate=True,
    diarize=True
)

response = client.listen.rest.transcribe_url(source, options)
transcript = response.results.channels[0].alternatives[0].transcript
print(transcript)
```

### Real-time Transcription
```python
from deepgram import DeepgramClient, ListenWebSocketOptions, LiveTranscriptionEvents

client = DeepgramClient(api_key="your-api-key")

def on_message(self, result, **kwargs):
    sentence = result.channel.alternatives[0].transcript
    if sentence:
        print(f"Transcript: {sentence}")

def on_error(self, error, **kwargs):
    print(f"Error: {error}")

# Configure WebSocket options
options = ListenWebSocketOptions(
    model="nova-2",
    language="en-US",
    encoding="linear16",
    sample_rate=16000,
    channels=1,
    interim_results=True
)

# Start connection
dg_connection = client.listen.websocket
dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
dg_connection.on(LiveTranscriptionEvents.Error, on_error)

if dg_connection.start(options):
    # Send audio data (typically from a microphone)
    # dg_connection.send(audio_data)

    # When done, flush final results and close
    dg_connection.finish()
    dg_connection.close()
```

### Advanced Features
```python
from deepgram import DeepgramClient, FileSource, ListenRESTOptions

client = DeepgramClient(api_key="your-api-key")

# Advanced transcription with multiple features
source = FileSource("meeting.wav")
options = ListenRESTOptions(
    model="nova-2",
    language="en-US",
    punctuate=True,
    diarize=True,
    diarize_version="2021-07-14.0",
    ner=True,
    summarize="v2",
    topics=True,
    intents=True,
    sentiment=True,
    utterances=True,
    paragraphs=True,
    keywords=["project", "deadline", "budget"],
    search=["important", "action item"]
)

response = client.listen.rest.transcribe_file(source, options)

# Access different types of results
transcript = response.results.channels[0].alternatives[0].transcript
utterances = response.results.utterances
summary = response.results.summary
```
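Continuing from the response above, `utterances=True` together with `diarize=True` yields ready-made speaker turns:

```python
# Print the meeting as speaker-labeled turns.
for utterance in response.results.utterances:
    print(f"[{utterance.start:7.2f}s] Speaker {utterance.speaker}: {utterance.transcript}")
```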