The official Python SDK for the Deepgram automated speech recognition platform.

Comprehensive speech recognition capabilities supporting both batch transcription of prerecorded audio and real-time streaming transcription. The Listen module provides advanced features such as speaker diarization, punctuation, profanity filtering, keyword detection, sentiment analysis, and support for multiple languages and audio formats.

Synchronous client for transcribing prerecorded audio files with comprehensive configuration options and detailed transcription results.
class ListenRESTClient:
    """Synchronous client for Deepgram's prerecorded (REST) transcription API.

    API-reference stub: method bodies here are documentation only.
    """

    def transcribe_url(
        self,
        source: UrlSource,
        options: ListenRESTOptions | None = None,
        headers: dict | None = None,
        timeout=None,  # request timeout; exact type not shown here — confirm in SDK docs
    ) -> PrerecordedResponse:
        """
        Transcribe audio from URL.

        Args:
            source: URL source containing audio to transcribe
            options: Transcription configuration options
            headers: Additional HTTP headers
            timeout: Request timeout
        Returns:
            PrerecordedResponse: Complete transcription results with metadata
        """

    def transcribe_file(
        self,
        source: FileSource,
        options: ListenRESTOptions | None = None,
        headers: dict | None = None,
        timeout=None,
    ) -> PrerecordedResponse:
        """
        Transcribe audio from file.

        Args:
            source: File source containing audio to transcribe
            options: Transcription configuration options
            headers: Additional HTTP headers
            timeout: Request timeout
        Returns:
            PrerecordedResponse: Complete transcription results with metadata
        """

    def transcribe_url_callback(
        self,
        source: UrlSource,
        callback: str,
        options: ListenRESTOptions | None = None,
        headers: dict | None = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """
        Transcribe audio from URL with callback URL for results.

        Args:
            source: URL source containing audio to transcribe
            callback: Callback URL to receive transcription results
            options: Transcription configuration options
            headers: Additional HTTP headers
            timeout: Request timeout
        Returns:
            AsyncPrerecordedResponse: Async response for callback processing
        """

    def transcribe_file_callback(
        self,
        source: FileSource,
        callback: str,
        options: ListenRESTOptions | None = None,
        headers: dict | None = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """
        Transcribe audio from file with callback URL for results.

        Args:
            source: File source containing audio to transcribe
            callback: Callback URL to receive transcription results
            options: Transcription configuration options
            headers: Additional HTTP headers
            timeout: Request timeout
        Returns:
            AsyncPrerecordedResponse: Async response for callback processing
        """
class AsyncListenRESTClient:
    """Asynchronous counterpart of ListenRESTClient: same methods, awaitable.

    NOTE(review): the non-callback methods are shown returning
    AsyncPrerecordedResponse while the sync client returns
    PrerecordedResponse — verify against the SDK's actual signatures.
    """

    async def transcribe_url(
        self,
        source: UrlSource,
        options: ListenRESTOptions | None = None,
        headers: dict | None = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """Async version of transcribe_url method"""

    async def transcribe_file(
        self,
        source: FileSource,
        options: ListenRESTOptions | None = None,
        headers: dict | None = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """Async version of transcribe_file method"""

    async def transcribe_url_callback(
        self,
        source: UrlSource,
        callback: str,
        options: ListenRESTOptions | None = None,
        headers: dict | None = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """Async version of transcribe_url_callback method"""

    async def transcribe_file_callback(
        self,
        source: FileSource,
        callback: str,
        options: ListenRESTOptions | None = None,
        headers: dict | None = None,
        timeout=None,
    ) -> AsyncPrerecordedResponse:
        """Async version of transcribe_file_callback method"""

Real-time streaming transcription client supporting live audio processing with configurable buffering and result handling.
class ListenWebSocketClient:
    """Synchronous WebSocket client for real-time streaming transcription.

    API-reference stub: method bodies here are documentation only.
    """

    def start(self, options: ListenWebSocketOptions) -> bool:
        """
        Start WebSocket connection for real-time transcription.

        Args:
            options: WebSocket configuration options
        Returns:
            bool: True if connection started successfully
        """

    def send(self, data: bytes) -> bool:
        """
        Send audio data for transcription.

        Args:
            data: Raw audio bytes
        Returns:
            bool: True if data sent successfully
        """

    def finish(self) -> bool:
        """
        Signal end of audio stream and receive final results.

        Returns:
            bool: True if stream finished successfully
        """

    def close(self) -> bool:
        """
        Close WebSocket connection.

        Returns:
            bool: True if connection closed successfully
        """
class AsyncListenWebSocketClient:
    """Asynchronous counterpart of ListenWebSocketClient: same methods, awaitable."""

    async def start(self, options: ListenWebSocketOptions) -> bool: ...
    async def send(self, data: bytes) -> bool: ...
    async def finish(self) -> bool: ...
    async def close(self) -> bool: ...

Access speech-to-text clients through the main client's listen router.
class ListenRouter:
    """Router exposing the four speech-to-text clients (reached via the main client's `listen` attribute)."""

    @property
    def rest(self) -> ListenRESTClient: ...

    @property
    def asyncrest(self) -> AsyncListenRESTClient: ...

    @property
    def websocket(self) -> ListenWebSocketClient: ...

    @property
    def asyncwebsocket(self) -> AsyncListenWebSocketClient: ...

class ListenRESTOptions:
def __init__(self, **kwargs): ...
# Model and language settings
model: str = "nova-2" # AI model for transcription
language: str = "en-US" # Language code
version: str = None # Model version
# Audio processing
encoding: str = None # Audio encoding format
sample_rate: int = None # Audio sample rate
channels: int = None # Number of audio channels
# Transcription features
punctuate: bool = True # Add punctuation
profanity_filter: bool = False # Filter profanity
redact: list = None # Redact sensitive information
diarize: bool = False # Speaker diarization
diarize_version: str = None # Diarization model version
ner: bool = False # Named entity recognition
multichannel: bool = False # Process multiple channels separately
alternatives: int = 1 # Number of transcript alternatives
numerals: bool = False # Convert numbers to numerals
smart_format: bool = False # Smart formatting
# Analysis features
summarize: bool = False # Generate summary
detect_language: bool = False # Auto-detect language
paragraphs: bool = False # Paragraph detection
utterances: bool = False # Utterance segmentation
utt_split: float = None # Utterance split threshold
sentiment: bool = False # Sentiment analysis
topics: bool = False # Topic detection
intents: bool = False # Intent recognition
# Keywords and search
keywords: list = None # Keyword detection
keyword_boost: str = None # Keyword boosting
search: list = None # Search terms
replace: list = None # Text replacement
# Output formatting
filler_words: bool = False # Include filler words
dictation: bool = False # Dictation mode
measurements: bool = False # Measurement formatting
dates: bool = False # Date formatting
times: bool = False # Time formatting
# Callback and metadata
callback: str = None # Webhook callback URL
callback_method: str = "POST" # Callback HTTP method
custom_intent: list = None # Custom intent models
custom_intent_mode: str = None # Custom intent processing mode
custom_topic: list = None # Custom topic models
custom_topic_mode: str = None # Custom topic processing mode
# Advanced options
tag: list = None # Custom tags
extra: dict = None # Additional optionsclass ListenWebSocketOptions:
    def __init__(self, **kwargs): ...

    # Model and language settings
    model: str = "nova-2"  # AI model for transcription
    language: str = "en-US"  # Language code
    version: str | None = None  # Model version

    # Audio settings (required for WebSocket)
    encoding: str = "linear16"  # Audio encoding
    sample_rate: int = 16000  # Sample rate in Hz
    channels: int = 1  # Number of channels

    # Real-time processing
    interim_results: bool = True  # Receive interim results
    endpointing: bool = True  # Automatic endpoint detection
    vad_events: bool = False  # Voice activity detection events
    utterance_end_ms: int = 1000  # Utterance end timeout

    # Transcription features (same as REST)
    punctuate: bool = True
    profanity_filter: bool = False
    redact: list | None = None
    diarize: bool = False
    diarize_version: str | None = None
    ner: bool = False
    alternatives: int = 1
    numerals: bool = False
    smart_format: bool = False

    # Analysis features
    sentiment: bool = False
    topics: bool = False
    intents: bool = False

    # Keywords and search
    keywords: list | None = None
    keyword_boost: str | None = None
    search: list | None = None
    replace: list | None = None

    # Output options
    filler_words: bool = False
    dictation: bool = False
    measurements: bool = False
    dates: bool = False
    times: bool = False

    # Custom models
    custom_intent: list | None = None
    custom_intent_mode: str | None = None
    custom_topic: list | None = None
    custom_topic_mode: str | None = None

    # Advanced options
    tag: list | None = None
    extra: dict | None = None

Input sources for audio data in various formats.
class PrerecordedSource:
    """Base class for prerecorded audio sources"""

class UrlSource(PrerecordedSource):
    def __init__(self, url: str):
        """
        Audio from URL.

        Args:
            url: HTTP/HTTPS URL to audio file
        """

class FileSource(PrerecordedSource):
    def __init__(self, file: str):
        """
        Audio from local file.

        Args:
            file: Path to local audio file
        """

class BufferSource(PrerecordedSource):
    def __init__(self, buffer: bytes):
        """
        Audio from byte buffer.

        Args:
            buffer: Raw audio bytes
        """

class StreamSource(PrerecordedSource):
    def __init__(self, stream):
        """
        Audio from stream object.

        Args:
            stream: File-like stream object
        """

class PreRecordedStreamSource(PrerecordedSource):
    """Legacy stream source alias"""

class ListenRestSource(PrerecordedSource):
    """REST-specific source type"""

class PrerecordedResponse:
    """Main prerecorded transcription response"""
    # Top-level sections of a REST transcription result.
    metadata: ListenRESTMetadata
    results: ListenRESTResults

class AsyncPrerecordedResponse(PrerecordedResponse):
    """Async prerecorded response"""

class SyncPrerecordedResponse(PrerecordedResponse):
    """Sync prerecorded response"""

class ListenRESTMetadata:
    """REST transcription metadata"""
    request_id: str
    transaction_key: str
    sha256: str
    created: str
    duration: float
    channels: int
    models: list
    model_info: dict

class ListenRESTResults:
    """REST transcription results"""
    channels: list[ListenRESTChannel]
    utterances: list[Utterance] | None = None  # present only when utterances=True was requested
    summary: dict | None = None  # present only when summarize was requested

class ListenRESTChannel:
    """Channel-specific transcription results"""
    search: list[Search] | None = None
    alternatives: list[ListenRESTAlternative]

class ListenRESTAlternative:
    """Alternative transcription result"""
    transcript: str
    confidence: float
    words: list[ListenRESTWord]
    paragraphs: Paragraphs | None = None
    entities: list[Entity] | None = None
    translations: list[Translation] | None = None
    summaries: list[Summaries] | None = None

class ListenRESTWord:
    """Word-level transcription data"""
    word: str
    start: float
    end: float
    confidence: float
    punctuated_word: str | None = None
    speaker: int | None = None  # set when diarization is enabled
    speaker_confidence: float | None = None
    language: str | None = None

class LiveResultResponse:
    """Live transcription result"""
    channel: ListenWSChannel
    metadata: ListenWSMetadata
    type: str  # event discriminator (e.g. "Results")

class ListenWSMetadataResponse:
    """WebSocket metadata response"""
    type: str
    transaction_key: str
    request_id: str
    sha256: str
    created: str
    duration: float
    channels: int

class SpeechStartedResponse:
    """Speech detection event"""
    type: str
    timestamp: str

class UtteranceEndResponse:
    """Utterance completion event"""
    type: str
    channel: list
    last_word_end: float

class ListenWSChannel:
    """WebSocket channel data"""
    alternatives: list[ListenWSAlternative]

class ListenWSAlternative:
    """WebSocket alternative transcript"""
    transcript: str
    confidence: float
    words: list[ListenWSWord]

class ListenWSWord:
    """WebSocket word-level data"""
    word: str
    start: float
    end: float
    confidence: float
    punctuated_word: str | None = None
    speaker: int | None = None  # set when diarization is enabled
    speaker_confidence: float | None = None

class ListenWSMetadata:
    """WebSocket connection metadata"""
    request_id: str
    model_name: str
    model_uuid: str

class Entity:
    """Named entity recognition result"""
    label: str
    value: str
    confidence: float
    start_word: int  # index into the alternative's word list
    end_word: int

class Paragraph:
    """Paragraph structure"""
    sentences: list[Sentence]
    start: float
    end: float

class Paragraphs:
    """Collection of paragraphs"""
    transcript: str
    paragraphs: list[Paragraph]

class Sentence:
    """Sentence structure"""
    text: str
    start: float  # start/end are offsets in seconds
    end: float

class Utterance:
    """Speaker utterance"""
    start: float
    end: float
    confidence: float
    channel: int
    transcript: str
    words: list[ListenRESTWord]
    speaker: int
    id: str

class Translation:
    """Translation result"""
    language: str
    translation: str

class Warning:
    # NOTE(review): this class name shadows the builtin Warning — intentional API name, keep as documented.
    """Processing warning"""
    parameter: str
    type: str
    message: str

class Summaries:
    """Summary collection"""
    summary: str
    start_word: int
    end_word: int

class SummaryV1:
    """Version 1 summary format"""
    summary: str

class SummaryV2:
    """Version 2 summary format"""
    result: str
    short: str

class LiveTranscriptionEvents:
    """WebSocket event types for real-time transcription"""
    Open: str = "Open"
    Close: str = "Close"
    Transcript: str = "Results"  # note: wire value ("Results") differs from the attribute name
    Metadata: str = "Metadata"
    UtteranceEnd: str = "UtteranceEnd"
    SpeechStarted: str = "SpeechStarted"
    Finalize: str = "Finalize"
    Error: str = "Error"
    Unhandled: str = "Unhandled"
    Warning: str = "Warning"

from deepgram import DeepgramClient, UrlSource, ListenRESTOptions
client = DeepgramClient(api_key="your-api-key")

# Transcribe from URL
source = UrlSource("https://example.com/audio.wav")
options = ListenRESTOptions(
    model="nova-2",
    language="en-US",
    punctuate=True,
    diarize=True,
)
response = client.listen.rest.transcribe_url(source, options)
# First channel, best alternative.
transcript = response.results.channels[0].alternatives[0].transcript
print(transcript)

# Fix: the streaming example below registers LiveTranscriptionEvents handlers,
# so it must be imported here as well (otherwise the example raises NameError).
from deepgram import DeepgramClient, ListenWebSocketOptions, LiveTranscriptionEvents
import threading  # NOTE(review): unused in this snippet — presumably intended for a microphone capture thread

client = DeepgramClient(api_key="your-api-key")

# Event handlers: the connection is passed as `self`, the payload as `result`/`error`.
def on_message(self, result, **kwargs):
    sentence = result.channel.alternatives[0].transcript
    if sentence:  # interim results may be empty
        print(f"Transcript: {sentence}")

def on_error(self, error, **kwargs):
    print(f"Error: {error}")

# Configure WebSocket options
options = ListenWebSocketOptions(
    model="nova-2",
    language="en-US",
    encoding="linear16",
    sample_rate=16000,
    channels=1,
    interim_results=True
)

# Start connection
dg_connection = client.listen.websocket.v("1")  # presumably selects API version "1" — confirm against SDK docs
dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
dg_connection.on(LiveTranscriptionEvents.Error, on_error)
if dg_connection.start(options):
    # Send audio data (typically from microphone)
    # dg_connection.send(audio_data)
    # When done
    dg_connection.finish()
    dg_connection.close()

from deepgram import DeepgramClient, FileSource, ListenRESTOptions
client = DeepgramClient(api_key="your-api-key")
# Advanced transcription with multiple features
source = FileSource("meeting.wav")
options = ListenRESTOptions(
model="nova-2",
language="en-US",
punctuate=True,
diarize=True,
diarize_version="2021-07-14.0",
ner=True,
summarize="v2",
topics=True,
intents=True,
sentiment=True,
utterances=True,
paragraphs=True,
keywords=["project", "deadline", "budget"],
search=["important", "action item"]
)
response = client.listen.rest.transcribe_url(source, options)
# Access different types of results
transcript = response.results.channels[0].alternatives[0].transcript
utterances = response.results.utterances
summary = response.results.summaryInstall with Tessl CLI
npx tessl i tessl/pypi-deepgram-sdk