# Speech-to-Text

Comprehensive speech recognition supporting both batch transcription of prerecorded audio and real-time streaming transcription. The Listen module provides advanced features such as speaker diarization, punctuation, profanity filtering, keyword detection, and sentiment analysis, with support for multiple languages and audio formats.

## Capabilities

### REST Client (Prerecorded Audio)

Synchronous client for transcribing prerecorded audio files with comprehensive configuration options and detailed transcription results.
```python { .api }
class ListenRESTClient:
    def transcribe_url(
        self,
        source: UrlSource,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> PrerecordedResponse:
        """
        Transcribe audio from URL.

        Args:
            source: URL source containing audio to transcribe
            options: Transcription configuration options
            headers: Additional HTTP headers
            timeout: Request timeout

        Returns:
            PrerecordedResponse: Complete transcription results with metadata
        """

    def transcribe_file(
        self,
        source: FileSource,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> PrerecordedResponse:
        """
        Transcribe audio from file.

        Args:
            source: File source containing audio to transcribe
            options: Transcription configuration options
            headers: Additional HTTP headers
            timeout: Request timeout

        Returns:
            PrerecordedResponse: Complete transcription results with metadata
        """

    def transcribe_url_callback(
        self,
        source: UrlSource,
        callback: str,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """
        Transcribe audio from URL with callback URL for results.

        Args:
            source: URL source containing audio to transcribe
            callback: Callback URL to receive transcription results
            options: Transcription configuration options
            headers: Additional HTTP headers
            timeout: Request timeout

        Returns:
            AsyncPrerecordedResponse: Async response for callback processing
        """

    def transcribe_file_callback(
        self,
        source: FileSource,
        callback: str,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """
        Transcribe audio from file with callback URL for results.

        Args:
            source: File source containing audio to transcribe
            callback: Callback URL to receive transcription results
            options: Transcription configuration options
            headers: Additional HTTP headers
            timeout: Request timeout

        Returns:
            AsyncPrerecordedResponse: Async response for callback processing
        """

class AsyncListenRESTClient:
    async def transcribe_url(
        self,
        source: UrlSource,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """Async version of transcribe_url method"""

    async def transcribe_file(
        self,
        source: FileSource,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """Async version of transcribe_file method"""

    async def transcribe_url_callback(
        self,
        source: UrlSource,
        callback: str,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """Async version of transcribe_url_callback method"""

    async def transcribe_file_callback(
        self,
        source: FileSource,
        callback: str,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """Async version of transcribe_file_callback method"""
```
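For long recordings, the callback variants return immediately and deliver results to your webhook instead of blocking. A minimal sketch using `transcribe_url_callback`; the webhook endpoint is a placeholder you would replace with your own:

```python
from deepgram import DeepgramClient, UrlSource, ListenRESTOptions

client = DeepgramClient(api_key="your-api-key")

source = UrlSource("https://example.com/long-recording.wav")
options = ListenRESTOptions(model="nova-2", punctuate=True)

# Returns immediately; the transcript is POSTed to the callback URL
# (placeholder endpoint) once processing completes.
response = client.listen.rest.transcribe_url_callback(
    source,
    callback="https://your-server.example.com/transcripts",
    options=options,
)
```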

### WebSocket Client (Real-time Audio)

Real-time streaming transcription client supporting live audio processing with configurable buffering and result handling.
```python { .api }
class ListenWebSocketClient:
    def start(self, options: ListenWebSocketOptions) -> bool:
        """
        Start WebSocket connection for real-time transcription.

        Args:
            options: WebSocket configuration options

        Returns:
            bool: True if connection started successfully
        """

    def send(self, data: bytes) -> bool:
        """
        Send audio data for transcription.

        Args:
            data: Raw audio bytes

        Returns:
            bool: True if data sent successfully
        """

    def finish(self) -> bool:
        """
        Signal end of audio stream and receive final results.

        Returns:
            bool: True if stream finished successfully
        """

    def close(self) -> bool:
        """
        Close WebSocket connection.

        Returns:
            bool: True if connection closed successfully
        """

class AsyncListenWebSocketClient:
    async def start(self, options: ListenWebSocketOptions) -> bool: ...
    async def send(self, data: bytes) -> bool: ...
    async def finish(self) -> bool: ...
    async def close(self) -> bool: ...
```
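The async client mirrors this surface method for method. A minimal sketch for an asyncio application, assuming you supply the `audio_chunks` iterable of raw PCM bytes:

```python
from deepgram import DeepgramClient, ListenWebSocketOptions

async def stream_audio(audio_chunks):
    client = DeepgramClient(api_key="your-api-key")
    dg_connection = client.listen.asyncwebsocket  # AsyncListenWebSocketClient

    options = ListenWebSocketOptions(encoding="linear16", sample_rate=16000, channels=1)
    if not await dg_connection.start(options):
        raise RuntimeError("WebSocket connection failed to start")

    for chunk in audio_chunks:        # raw audio bytes supplied by the caller
        await dg_connection.send(chunk)

    await dg_connection.finish()      # flush any remaining final results
    await dg_connection.close()
```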

### Router Access

Access the speech-to-text clients through the main client's `listen` router.
```python { .api }
class ListenRouter:
    @property
    def rest(self) -> ListenRESTClient: ...
    @property
    def asyncrest(self) -> AsyncListenRESTClient: ...
    @property
    def websocket(self) -> ListenWebSocketClient: ...
    @property
    def asyncwebsocket(self) -> AsyncListenWebSocketClient: ...
```
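All four clients hang off the main client's `listen` attribute, for example:

```python
from deepgram import DeepgramClient

client = DeepgramClient(api_key="your-api-key")

rest_client = client.listen.rest                 # ListenRESTClient
async_rest_client = client.listen.asyncrest      # AsyncListenRESTClient
ws_client = client.listen.websocket              # ListenWebSocketClient
async_ws_client = client.listen.asyncwebsocket   # AsyncListenWebSocketClient
```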

### Options Classes

#### REST Options
```python { .api }
class ListenRESTOptions:
    def __init__(self, **kwargs): ...

    # Model and language settings
    model: str = "nova-2"            # AI model for transcription
    language: str = "en-US"          # Language code
    version: str = None              # Model version

    # Audio processing
    encoding: str = None             # Audio encoding format
    sample_rate: int = None          # Audio sample rate
    channels: int = None             # Number of audio channels

    # Transcription features
    punctuate: bool = True           # Add punctuation
    profanity_filter: bool = False   # Filter profanity
    redact: list = None              # Redact sensitive information
    diarize: bool = False            # Speaker diarization
    diarize_version: str = None      # Diarization model version
    ner: bool = False                # Named entity recognition
    multichannel: bool = False       # Process multiple channels separately
    alternatives: int = 1            # Number of transcript alternatives
    numerals: bool = False           # Convert written-out numbers to digits
    smart_format: bool = False       # Smart formatting

    # Analysis features
    summarize: bool = False          # Generate summary (also accepts "v2")
    detect_language: bool = False    # Auto-detect language
    paragraphs: bool = False         # Paragraph detection
    utterances: bool = False         # Utterance segmentation
    utt_split: float = None          # Utterance split threshold
    sentiment: bool = False          # Sentiment analysis
    topics: bool = False             # Topic detection
    intents: bool = False            # Intent recognition

    # Keywords and search
    keywords: list = None            # Keyword detection
    keyword_boost: str = None        # Keyword boosting
    search: list = None              # Search terms
    replace: list = None             # Text replacement

    # Output formatting
    filler_words: bool = False       # Include filler words
    dictation: bool = False          # Dictation mode
    measurements: bool = False       # Measurement formatting
    dates: bool = False              # Date formatting
    times: bool = False              # Time formatting

    # Callback and metadata
    callback: str = None             # Webhook callback URL
    callback_method: str = "POST"    # Callback HTTP method
    custom_intent: list = None       # Custom intent models
    custom_intent_mode: str = None   # Custom intent processing mode
    custom_topic: list = None        # Custom topic models
    custom_topic_mode: str = None    # Custom topic processing mode

    # Advanced options
    tag: list = None                 # Custom tags
    extra: dict = None               # Additional options
```
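All fields are optional keyword arguments. For instance, a configuration that redacts sensitive data and transcribes a stereo file channel by channel might look like this (field values are illustrative):

```python
from deepgram import ListenRESTOptions

options = ListenRESTOptions(
    model="nova-2",
    smart_format=True,
    redact=["pci", "ssn"],           # categories to redact (illustrative values)
    multichannel=True,               # transcribe each channel separately
    channels=2,
    keywords=["invoice", "refund"],  # boost domain vocabulary
)
```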

#### WebSocket Options
```python { .api }
class ListenWebSocketOptions:
    def __init__(self, **kwargs): ...

    # Model and language settings
    model: str = "nova-2"            # AI model for transcription
    language: str = "en-US"          # Language code
    version: str = None              # Model version

    # Audio settings (required for WebSocket)
    encoding: str = "linear16"       # Audio encoding
    sample_rate: int = 16000         # Sample rate in Hz
    channels: int = 1                # Number of channels

    # Real-time processing
    interim_results: bool = True     # Receive interim results
    endpointing: bool = True         # Automatic endpoint detection
    vad_events: bool = False         # Voice activity detection events
    utterance_end_ms: int = 1000     # Utterance end timeout

    # Transcription features (same as REST)
    punctuate: bool = True
    profanity_filter: bool = False
    redact: list = None
    diarize: bool = False
    diarize_version: str = None
    ner: bool = False
    alternatives: int = 1
    numerals: bool = False
    smart_format: bool = False

    # Analysis features
    sentiment: bool = False
    topics: bool = False
    intents: bool = False

    # Keywords and search
    keywords: list = None
    keyword_boost: str = None
    search: list = None
    replace: list = None

    # Output options
    filler_words: bool = False
    dictation: bool = False
    measurements: bool = False
    dates: bool = False
    times: bool = False

    # Custom models
    custom_intent: list = None
    custom_intent_mode: str = None
    custom_topic: list = None
    custom_topic_mode: str = None

    # Advanced options
    tag: list = None
    extra: dict = None
```
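With the default `linear16` encoding, the byte rate follows directly from these settings (`sample_rate × channels × 2 bytes per sample`), which is useful for sizing the chunks you pass to `send()`. A small helper, assuming 16-bit PCM:

```python
def pcm_chunk_size(sample_rate: int = 16000, channels: int = 1,
                   frame_ms: int = 20) -> int:
    """Bytes in one frame of 16-bit linear PCM audio."""
    bytes_per_sample = 2  # linear16 means 2 bytes per sample
    return sample_rate * channels * bytes_per_sample * frame_ms // 1000

# Default settings: a 20 ms frame of 16 kHz mono audio is 640 bytes.
assert pcm_chunk_size() == 640
```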

### Source Types

Input sources for audio data in various formats.
```python { .api }
class PrerecordedSource:
    """Base class for prerecorded audio sources"""

class UrlSource(PrerecordedSource):
    def __init__(self, url: str):
        """
        Audio from URL.

        Args:
            url: HTTP/HTTPS URL to audio file
        """

class FileSource(PrerecordedSource):
    def __init__(self, file: str):
        """
        Audio from local file.

        Args:
            file: Path to local audio file
        """

class BufferSource(PrerecordedSource):
    def __init__(self, buffer: bytes):
        """
        Audio from byte buffer.

        Args:
            buffer: Raw audio bytes
        """

class StreamSource(PrerecordedSource):
    def __init__(self, stream):
        """
        Audio from stream object.

        Args:
            stream: File-like stream object
        """

class PreRecordedStreamSource(PrerecordedSource):
    """Legacy stream source alias"""

class ListenRestSource(PrerecordedSource):
    """REST-specific source type"""
```
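Which source to use depends on where the audio lives; a quick sketch of the common cases (paths are illustrative):

```python
from deepgram import UrlSource, FileSource, BufferSource, StreamSource

# Remote audio: pass the URL through.
url_source = UrlSource("https://example.com/audio.wav")

# Local file on disk.
file_source = FileSource("recordings/call.wav")

# Audio already held in memory as bytes.
with open("recordings/call.wav", "rb") as f:
    buffer_source = BufferSource(f.read())

# Any file-like object can be wrapped as a stream.
stream_source = StreamSource(open("recordings/call.wav", "rb"))
```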

### Response Types

#### REST Response Types
```python { .api }
class PrerecordedResponse:
    """Main prerecorded transcription response"""
    metadata: ListenRESTMetadata
    results: ListenRESTResults

class AsyncPrerecordedResponse(PrerecordedResponse):
    """Async prerecorded response"""

class SyncPrerecordedResponse(PrerecordedResponse):
    """Sync prerecorded response"""

class ListenRESTMetadata:
    """REST transcription metadata"""
    request_id: str
    transaction_key: str
    sha256: str
    created: str
    duration: float
    channels: int
    models: list
    model_info: dict

class ListenRESTResults:
    """REST transcription results"""
    channels: list[ListenRESTChannel]
    utterances: list[Utterance] = None
    summary: dict = None

class ListenRESTChannel:
    """Channel-specific transcription results"""
    search: list[Search] = None
    alternatives: list[ListenRESTAlternative]

class ListenRESTAlternative:
    """Alternative transcription result"""
    transcript: str
    confidence: float
    words: list[ListenRESTWord]
    paragraphs: Paragraphs = None
    entities: list[Entity] = None
    translations: list[Translation] = None
    summaries: list[Summaries] = None

class ListenRESTWord:
    """Word-level transcription data"""
    word: str
    start: float
    end: float
    confidence: float
    punctuated_word: str = None
    speaker: int = None
    speaker_confidence: float = None
    language: str = None
```
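Word objects carry timing and, when `diarize=True`, speaker labels, so a response can be walked directly; a sketch:

```python
def print_words(response) -> None:
    """Print each word of the first channel's best alternative."""
    alternative = response.results.channels[0].alternatives[0]
    for word in alternative.words:
        speaker = f"speaker {word.speaker}" if word.speaker is not None else "unknown"
        print(f"[{word.start:6.2f}s-{word.end:6.2f}s] {speaker}: "
              f"{word.punctuated_word or word.word}")
```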

#### WebSocket Response Types
```python { .api }
class LiveResultResponse:
    """Live transcription result"""
    channel: ListenWSChannel
    metadata: ListenWSMetadata
    type: str

class ListenWSMetadataResponse:
    """WebSocket metadata response"""
    type: str
    transaction_key: str
    request_id: str
    sha256: str
    created: str
    duration: float
    channels: int

class SpeechStartedResponse:
    """Speech detection event"""
    type: str
    timestamp: str

class UtteranceEndResponse:
    """Utterance completion event"""
    type: str
    channel: list
    last_word_end: float

class ListenWSChannel:
    """WebSocket channel data"""
    alternatives: list[ListenWSAlternative]

class ListenWSAlternative:
    """WebSocket alternative transcript"""
    transcript: str
    confidence: float
    words: list[ListenWSWord]

class ListenWSWord:
    """WebSocket word-level data"""
    word: str
    start: float
    end: float
    confidence: float
    punctuated_word: str = None
    speaker: int = None
    speaker_confidence: float = None

class ListenWSMetadata:
    """WebSocket connection metadata"""
    request_id: str
    model_name: str
    model_uuid: str
```
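Each message carries a `type` discriminator, so a single handler can dispatch across the types above; a minimal sketch:

```python
def handle_ws_message(message) -> None:
    """Dispatch a WebSocket message on its `type` field."""
    if message.type == "Results":          # LiveResultResponse
        print(message.channel.alternatives[0].transcript)
    elif message.type == "UtteranceEnd":   # UtteranceEndResponse
        print(f"Utterance ended at {message.last_word_end:.2f}s")
    elif message.type == "SpeechStarted":  # SpeechStartedResponse
        print(f"Speech started at {message.timestamp}")
```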

#### Common Response Elements
```python { .api }
class Entity:
    """Named entity recognition result"""
    label: str
    value: str
    confidence: float
    start_word: int
    end_word: int

class Paragraph:
    """Paragraph structure"""
    sentences: list[Sentence]
    start: float
    end: float

class Paragraphs:
    """Collection of paragraphs"""
    transcript: str
    paragraphs: list[Paragraph]

class Sentence:
    """Sentence structure"""
    text: str
    start: float
    end: float

class Utterance:
    """Speaker utterance"""
    start: float
    end: float
    confidence: float
    channel: int
    transcript: str
    words: list[ListenRESTWord]
    speaker: int
    id: str

class Translation:
    """Translation result"""
    language: str
    translation: str

class Warning:
    """Processing warning"""
    parameter: str
    type: str
    message: str

class Summaries:
    """Summary collection"""
    summary: str
    start_word: int
    end_word: int

class SummaryV1:
    """Version 1 summary format"""
    summary: str

class SummaryV2:
    """Version 2 summary format"""
    result: str
    short: str
```
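With `paragraphs=True`, each alternative's `paragraphs` field nests sentences under paragraphs; a sketch that flattens the structure:

```python
def iter_sentences(paragraphs):
    """Yield (start, end, text) for every sentence, in reading order."""
    for paragraph in paragraphs.paragraphs:
        for sentence in paragraph.sentences:
            yield sentence.start, sentence.end, sentence.text
```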

### Events
```python { .api }
class LiveTranscriptionEvents:
    """WebSocket event types for real-time transcription"""
    Open: str = "Open"
    Close: str = "Close"
    Transcript: str = "Results"
    Metadata: str = "Metadata"
    UtteranceEnd: str = "UtteranceEnd"
    SpeechStarted: str = "SpeechStarted"
    Finalize: str = "Finalize"
    Error: str = "Error"
    Unhandled: str = "Unhandled"
    Warning: str = "Warning"
```
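Note that `Transcript` maps to the wire value `"Results"`. These names are what you register handlers against, as in the real-time example below; a minimal wiring sketch (handler bodies are illustrative):

```python
from deepgram import DeepgramClient, LiveTranscriptionEvents

client = DeepgramClient(api_key="your-api-key")
dg_connection = client.listen.websocket

def on_open(self, open, **kwargs):
    print("Connection opened")

def on_utterance_end(self, utterance_end, **kwargs):
    print("Utterance ended")

dg_connection.on(LiveTranscriptionEvents.Open, on_open)
dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
```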

## Usage Examples

### Basic Prerecorded Transcription
```python
from deepgram import DeepgramClient, UrlSource, ListenRESTOptions

client = DeepgramClient(api_key="your-api-key")

# Transcribe from URL
source = UrlSource("https://example.com/audio.wav")
options = ListenRESTOptions(
    model="nova-2",
    language="en-US",
    punctuate=True,
    diarize=True
)

response = client.listen.rest.transcribe_url(source, options)
transcript = response.results.channels[0].alternatives[0].transcript
print(transcript)
```

### Real-time Transcription
```python
from deepgram import DeepgramClient, ListenWebSocketOptions, LiveTranscriptionEvents

client = DeepgramClient(api_key="your-api-key")

def on_message(self, result, **kwargs):
    sentence = result.channel.alternatives[0].transcript
    if sentence:
        print(f"Transcript: {sentence}")

def on_error(self, error, **kwargs):
    print(f"Error: {error}")

# Configure WebSocket options
options = ListenWebSocketOptions(
    model="nova-2",
    language="en-US",
    encoding="linear16",
    sample_rate=16000,
    channels=1,
    interim_results=True
)

# Start connection
dg_connection = client.listen.websocket
dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
dg_connection.on(LiveTranscriptionEvents.Error, on_error)

if dg_connection.start(options):
    # Send audio data (typically from a microphone)
    # dg_connection.send(audio_data)

    # When done, flush final results and close
    dg_connection.finish()
    dg_connection.close()
```

### Advanced Features
```python
from deepgram import DeepgramClient, FileSource, ListenRESTOptions

client = DeepgramClient(api_key="your-api-key")

# Advanced transcription with multiple features
source = FileSource("meeting.wav")
options = ListenRESTOptions(
    model="nova-2",
    language="en-US",
    punctuate=True,
    diarize=True,
    diarize_version="2021-07-14.0",
    ner=True,
    summarize="v2",
    topics=True,
    intents=True,
    sentiment=True,
    utterances=True,
    paragraphs=True,
    keywords=["project", "deadline", "budget"],
    search=["important", "action item"]
)

response = client.listen.rest.transcribe_file(source, options)

# Access different types of results
transcript = response.results.channels[0].alternatives[0].transcript
utterances = response.results.utterances
summary = response.results.summary
```
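Continuing from the response above, `utterances=True` together with `diarize=True` yields ready-made speaker turns:

```python
# Print the meeting as speaker-labeled turns.
for utterance in response.results.utterances:
    print(f"[{utterance.start:7.2f}s] Speaker {utterance.speaker}: {utterance.transcript}")
```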