CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-deepgram-sdk

The official Python SDK for the Deepgram automated speech recognition platform.

Pending
Overview
Eval results
Files

docs/speech-to-text.md

Speech-to-Text

Comprehensive speech recognition capabilities supporting both batch transcription of prerecorded audio and real-time streaming transcription. The Listen module provides advanced features like speaker diarization, punctuation, profanity filtering, keyword detection, sentiment analysis, and support for multiple languages and audio formats.

Capabilities

REST Client (Prerecorded Audio)

Synchronous client for transcribing prerecorded audio files with comprehensive configuration options and detailed transcription results.

class ListenRESTClient:
    """Blocking REST client for transcribing prerecorded audio.

    Offers direct transcription of remote URLs and local files, plus
    callback variants that return immediately and deliver results to a
    webhook instead of blocking for them.
    """

    def transcribe_url(
        self,
        source: UrlSource,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout = None
    ) -> PrerecordedResponse:
        """Transcribe audio hosted at a URL.

        Args:
            source: URL source identifying the audio to transcribe.
            options: Transcription configuration; server defaults apply
                when omitted.
            headers: Extra HTTP headers to send with the request.
            timeout: Per-request timeout.

        Returns:
            PrerecordedResponse: Full transcription results plus metadata.
        """

    def transcribe_file(
        self,
        source: FileSource,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout = None
    ) -> PrerecordedResponse:
        """Transcribe audio read from a local file.

        Args:
            source: File source wrapping the audio to transcribe.
            options: Transcription configuration; server defaults apply
                when omitted.
            headers: Extra HTTP headers to send with the request.
            timeout: Per-request timeout.

        Returns:
            PrerecordedResponse: Full transcription results plus metadata.
        """

    def transcribe_url_callback(
        self,
        source: UrlSource,
        callback: str,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout = None
    ) -> AsyncPrerecordedResponse:
        """Transcribe a URL, delivering results to a callback webhook.

        Args:
            source: URL source identifying the audio to transcribe.
            callback: Webhook URL that will receive the results.
            options: Transcription configuration.
            headers: Extra HTTP headers to send with the request.
            timeout: Per-request timeout.

        Returns:
            AsyncPrerecordedResponse: Response for callback-based processing.
        """

    def transcribe_file_callback(
        self,
        source: FileSource,
        callback: str,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout = None
    ) -> AsyncPrerecordedResponse:
        """Transcribe a local file, delivering results to a callback webhook.

        Args:
            source: File source wrapping the audio to transcribe.
            callback: Webhook URL that will receive the results.
            options: Transcription configuration.
            headers: Extra HTTP headers to send with the request.
            timeout: Per-request timeout.

        Returns:
            AsyncPrerecordedResponse: Response for callback-based processing.
        """

class AsyncListenRESTClient:
    """Awaitable counterpart of ListenRESTClient.

    Exposes the same four operations with identical parameters; every
    method is a coroutine and must be awaited.
    """

    async def transcribe_url(
        self,
        source: UrlSource,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout = None
    ) -> AsyncPrerecordedResponse:
        """Awaitable equivalent of ListenRESTClient.transcribe_url."""

    async def transcribe_file(
        self,
        source: FileSource,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout = None
    ) -> AsyncPrerecordedResponse:
        """Awaitable equivalent of ListenRESTClient.transcribe_file."""

    async def transcribe_url_callback(
        self,
        source: UrlSource,
        callback: str,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout = None
    ) -> AsyncPrerecordedResponse:
        """Awaitable equivalent of ListenRESTClient.transcribe_url_callback."""

    async def transcribe_file_callback(
        self,
        source: FileSource,
        callback: str,
        options: ListenRESTOptions = None,
        headers: dict = None,
        timeout = None
    ) -> AsyncPrerecordedResponse:
        """Awaitable equivalent of ListenRESTClient.transcribe_file_callback."""

WebSocket Client (Real-time Audio)

Real-time streaming transcription client supporting live audio processing with configurable buffering and result handling.

class ListenWebSocketClient:
    """Client for live (streaming) transcription over a WebSocket.

    Typical lifecycle: start() the connection, send() audio chunks as
    they arrive, finish() to flush final results, then close().
    """

    def start(self, options: ListenWebSocketOptions) -> bool:
        """Open the WebSocket connection for a streaming session.

        Args:
            options: Streaming configuration (model, encoding, rate, ...).

        Returns:
            bool: True when the connection was established.
        """

    def send(self, data: bytes) -> bool:
        """Push a chunk of raw audio bytes into the stream.

        Args:
            data: Raw audio payload.

        Returns:
            bool: True when the chunk was sent.
        """

    def finish(self) -> bool:
        """Mark the end of the audio stream so final results are emitted.

        Returns:
            bool: True when the stream was finalized.
        """

    def close(self) -> bool:
        """Shut down the WebSocket connection.

        Returns:
            bool: True when the connection was closed.
        """

class AsyncListenWebSocketClient:
    """Awaitable counterpart of ListenWebSocketClient; same lifecycle, async."""

    async def start(self, options: ListenWebSocketOptions) -> bool:
        """Open the streaming connection; True on success."""

    async def send(self, data: bytes) -> bool:
        """Stream a chunk of raw audio bytes; True when sent."""

    async def finish(self) -> bool:
        """Flush the stream and request final results; True on success."""

    async def close(self) -> bool:
        """Tear down the connection; True when closed."""

Router Access

Access speech-to-text clients through the main client's listen router.

class ListenRouter:
    """Router exposing the four speech-to-text client flavours."""

    @property
    def rest(self) -> ListenRESTClient:
        """Blocking REST client for prerecorded audio."""

    @property
    def asyncrest(self) -> AsyncListenRESTClient:
        """Awaitable REST client for prerecorded audio."""

    @property
    def websocket(self) -> ListenWebSocketClient:
        """Blocking WebSocket client for live audio."""

    @property
    def asyncwebsocket(self) -> AsyncListenWebSocketClient:
        """Awaitable WebSocket client for live audio."""

Options Classes

REST Options

class ListenRESTOptions:
    """Configuration for prerecorded (REST) transcription requests.

    Every attribute is optional; values left unset fall back to the
    API's server-side defaults.
    """

    def __init__(self, **kwargs): ...

    # --- Model selection ---
    model: str = "nova-2"  # transcription model identifier
    language: str = "en-US"  # BCP-47 language code
    version: str = None  # pin a specific model version

    # --- Input audio description ---
    encoding: str = None  # codec/encoding of the submitted audio
    sample_rate: int = None  # sample rate in Hz
    channels: int = None  # channel count of the submitted audio

    # --- Core transcription features ---
    punctuate: bool = True  # insert punctuation and capitalization
    profanity_filter: bool = False  # mask profanity in the transcript
    redact: list = None  # categories of sensitive data to redact
    diarize: bool = False  # label words with speaker identities
    diarize_version: str = None  # pin a diarization model version
    ner: bool = False  # named entity recognition
    multichannel: bool = False  # transcribe each channel independently
    alternatives: int = 1  # number of candidate transcripts to return
    numerals: bool = False  # render spoken numbers as digits
    smart_format: bool = False  # apply smart formatting to output

    # --- Audio-intelligence analyses ---
    summarize: bool = False  # request a summary (the API also accepts "v2")
    detect_language: bool = False  # infer the spoken language
    paragraphs: bool = False  # group output into paragraphs
    utterances: bool = False  # segment output into utterances
    utt_split: float = None  # threshold controlling utterance splits
    sentiment: bool = False  # sentiment analysis
    topics: bool = False  # topic detection
    intents: bool = False  # intent recognition

    # --- Keywords, search, replacement ---
    keywords: list = None  # terms to boost during recognition
    keyword_boost: str = None  # keyword boosting strategy
    search: list = None  # terms to locate in the audio
    replace: list = None  # find/replace rules for the transcript

    # --- Output formatting toggles ---
    filler_words: bool = False  # keep "um"/"uh" style fillers
    dictation: bool = False  # dictation mode
    measurements: bool = False  # format spoken measurements
    dates: bool = False  # format spoken dates
    times: bool = False  # format spoken times

    # --- Delivery and custom models ---
    callback: str = None  # webhook URL for asynchronous delivery
    callback_method: str = "POST"  # HTTP method used for the callback
    custom_intent: list = None  # custom intent model identifiers
    custom_intent_mode: str = None  # custom intent processing mode
    custom_topic: list = None  # custom topic model identifiers
    custom_topic_mode: str = None  # custom topic processing mode

    # --- Miscellaneous ---
    tag: list = None  # free-form tags attached to the request
    extra: dict = None  # passthrough for additional parameters

WebSocket Options

class ListenWebSocketOptions:
    """Configuration for live (WebSocket) transcription sessions.

    The raw-audio parameters (encoding, sample_rate, channels) must
    describe the stream you actually send, so they carry concrete
    defaults here rather than deferring to the server.
    """

    def __init__(self, **kwargs): ...

    # --- Model selection ---
    model: str = "nova-2"  # transcription model identifier
    language: str = "en-US"  # BCP-47 language code
    version: str = None  # pin a specific model version

    # --- Raw audio description (required for streaming) ---
    encoding: str = "linear16"  # codec of the streamed audio
    sample_rate: int = 16000  # sample rate in Hz
    channels: int = 1  # channel count of the stream

    # --- Streaming behaviour ---
    interim_results: bool = True  # emit provisional transcripts
    endpointing: bool = True  # detect end-of-speech automatically
    vad_events: bool = False  # emit voice-activity-detection events
    utterance_end_ms: int = 1000  # timeout (ms) that ends an utterance

    # --- Core transcription features (mirror ListenRESTOptions) ---
    punctuate: bool = True
    profanity_filter: bool = False
    redact: list = None
    diarize: bool = False
    diarize_version: str = None
    ner: bool = False
    alternatives: int = 1
    numerals: bool = False
    smart_format: bool = False

    # --- Audio-intelligence analyses ---
    sentiment: bool = False
    topics: bool = False
    intents: bool = False

    # --- Keywords, search, replacement ---
    keywords: list = None
    keyword_boost: str = None
    search: list = None
    replace: list = None

    # --- Output formatting toggles ---
    filler_words: bool = False
    dictation: bool = False
    measurements: bool = False
    dates: bool = False
    times: bool = False

    # --- Custom models ---
    custom_intent: list = None
    custom_intent_mode: str = None
    custom_topic: list = None
    custom_topic_mode: str = None

    # --- Miscellaneous ---
    tag: list = None  # free-form tags attached to the request
    extra: dict = None  # passthrough for additional parameters

Source Types

Input sources for audio data in various formats.

class PrerecordedSource:
    """Common base for every prerecorded audio input type."""

class UrlSource(PrerecordedSource):
    """Prerecorded audio referenced by a remote URL."""

    def __init__(self, url: str):
        """
        Args:
            url: HTTP/HTTPS URL of the audio file.
        """

class FileSource(PrerecordedSource):
    """Prerecorded audio read from a local file path."""

    def __init__(self, file: str):
        """
        Args:
            file: Path to the local audio file.
        """

class BufferSource(PrerecordedSource):
    """Prerecorded audio held in an in-memory byte buffer."""

    def __init__(self, buffer: bytes):
        """
        Args:
            buffer: Raw audio bytes.
        """

class StreamSource(PrerecordedSource):
    """Prerecorded audio read from a file-like stream."""

    def __init__(self, stream):
        """
        Args:
            stream: File-like object yielding audio data.
        """

class PreRecordedStreamSource(PrerecordedSource):
    """Legacy alias for the stream-based source, kept for compatibility."""

class ListenRestSource(PrerecordedSource):
    """Source type specific to the REST transcription path."""

Response Types

REST Response Types

class PrerecordedResponse:
    """Top-level payload returned for a prerecorded transcription."""
    metadata: ListenRESTMetadata
    results: ListenRESTResults

class AsyncPrerecordedResponse(PrerecordedResponse):
    """Response variant returned by callback-based (async) requests."""

class SyncPrerecordedResponse(PrerecordedResponse):
    """Response variant returned by blocking (sync) requests."""

class ListenRESTMetadata:
    """Request-level metadata attached to a REST transcription."""
    request_id: str
    transaction_key: str
    sha256: str
    created: str
    duration: float
    channels: int
    models: list
    model_info: dict

class ListenRESTResults:
    """Container for per-channel results and optional analyses."""
    channels: list[ListenRESTChannel]
    utterances: list[Utterance] = None
    summary: dict = None

class ListenRESTChannel:
    """Results for a single audio channel."""
    # NOTE(review): the Search type is not defined on this page — confirm
    # where it is imported from.
    search: list[Search] = None
    alternatives: list[ListenRESTAlternative]

class ListenRESTAlternative:
    """One candidate transcript with word-level detail."""
    transcript: str
    confidence: float
    words: list[ListenRESTWord]
    paragraphs: Paragraphs = None
    entities: list[Entity] = None
    translations: list[Translation] = None
    summaries: list[Summaries] = None

class ListenRESTWord:
    """Timing, confidence, and speaker attribution for one word."""
    word: str
    start: float
    end: float
    confidence: float
    punctuated_word: str = None
    speaker: int = None
    speaker_confidence: float = None
    language: str = None
    language: str = None

WebSocket Response Types

class LiveResultResponse:
    """Streaming transcription result delivered over the WebSocket."""
    channel: ListenWSChannel
    metadata: ListenWSMetadata
    type: str

class ListenWSMetadataResponse:
    """Metadata message emitted on the WebSocket."""
    type: str
    transaction_key: str
    request_id: str
    sha256: str
    created: str
    duration: float
    channels: int

class SpeechStartedResponse:
    """Event signalling that speech was detected in the stream."""
    type: str
    timestamp: str

class UtteranceEndResponse:
    """Event signalling that the current utterance has ended."""
    type: str
    channel: list
    last_word_end: float

class ListenWSChannel:
    """Per-channel container for streaming alternatives."""
    alternatives: list[ListenWSAlternative]

class ListenWSAlternative:
    """One candidate transcript for a streaming result."""
    transcript: str
    confidence: float
    words: list[ListenWSWord]

class ListenWSWord:
    """Word-level timing and confidence for streaming results."""
    word: str
    start: float
    end: float
    confidence: float
    punctuated_word: str = None
    speaker: int = None
    speaker_confidence: float = None

class ListenWSMetadata:
    """Connection-level metadata for a streaming session."""
    request_id: str
    model_name: str
    model_uuid: str

Common Response Elements

class Entity:
    """A named entity detected in the transcript."""
    label: str
    value: str
    confidence: float
    start_word: int
    end_word: int

class Paragraph:
    """A paragraph made of sentences, with start/end times."""
    sentences: list[Sentence]
    start: float
    end: float

class Paragraphs:
    """Paragraph-formatted view of a transcript."""
    transcript: str
    paragraphs: list[Paragraph]

class Sentence:
    """A single sentence with start/end times."""
    text: str
    start: float
    end: float

class Utterance:
    """A contiguous stretch of speech attributed to one speaker."""
    start: float
    end: float
    confidence: float
    channel: int
    transcript: str
    words: list[ListenRESTWord]
    speaker: int
    id: str

class Translation:
    """Transcript text translated into another language."""
    language: str
    translation: str

# NOTE(review): this class shadows the builtin `Warning`; import it
# explicitly rather than relying on the builtin name being this type.
class Warning:
    """A non-fatal processing warning returned by the API."""
    parameter: str
    type: str
    message: str

class Summaries:
    """A summary covering a span of words."""
    summary: str
    start_word: int
    end_word: int

class SummaryV1:
    """Summary payload, version 1 format."""
    summary: str

class SummaryV2:
    """Summary payload, version 2 format."""
    result: str
    short: str

Events

class LiveTranscriptionEvents:
    """Event names dispatched by the live-transcription WebSocket clients.

    Note the Transcript event travels on the wire as "Results".
    """
    Open: str = "Open"
    Close: str = "Close"
    Transcript: str = "Results"
    Metadata: str = "Metadata"
    UtteranceEnd: str = "UtteranceEnd"
    SpeechStarted: str = "SpeechStarted"
    Finalize: str = "Finalize"
    Error: str = "Error"
    Unhandled: str = "Unhandled"
    Warning: str = "Warning"

Usage Examples

Basic Prerecorded Transcription

from deepgram import DeepgramClient, UrlSource, ListenRESTOptions

client = DeepgramClient(api_key="your-api-key")

# Point the SDK at a remotely hosted audio file.
audio = UrlSource("https://example.com/audio.wav")

# Request punctuation and speaker labels from the nova-2 model.
config = ListenRESTOptions(
    model="nova-2",
    language="en-US",
    punctuate=True,
    diarize=True
)

response = client.listen.rest.transcribe_url(audio, config)

# The first alternative on the first channel carries the best transcript.
transcript = response.results.channels[0].alternatives[0].transcript
print(transcript)

Real-time Transcription

from deepgram import (
    DeepgramClient,
    ListenWebSocketOptions,
    LiveTranscriptionEvents,  # required by the .on(...) registrations below
)

client = DeepgramClient(api_key="your-api-key")

def on_message(self, result, **kwargs):
    # Interim and final results both arrive here; skip empty transcripts.
    sentence = result.channel.alternatives[0].transcript
    if sentence:
        print(f"Transcript: {sentence}")

def on_error(self, error, **kwargs):
    print(f"Error: {error}")

# Configure WebSocket options; the audio parameters must match the raw
# stream that will be sent with dg_connection.send(...).
options = ListenWebSocketOptions(
    model="nova-2",
    language="en-US",
    encoding="linear16",
    sample_rate=16000,
    channels=1,
    interim_results=True
)

# Start connection and register event handlers.
dg_connection = client.listen.websocket.v("1")
dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
dg_connection.on(LiveTranscriptionEvents.Error, on_error)

if dg_connection.start(options):
    # Send audio data (typically from microphone)
    # dg_connection.send(audio_data)

    # When done
    dg_connection.finish()
    dg_connection.close()

Advanced Features

from deepgram import DeepgramClient, FileSource, ListenRESTOptions

client = DeepgramClient(api_key="your-api-key")

# Advanced transcription with multiple features
source = FileSource("meeting.wav")
options = ListenRESTOptions(
    model="nova-2",
    language="en-US",
    punctuate=True,
    diarize=True,
    diarize_version="2021-07-14.0",
    ner=True,
    summarize="v2",
    topics=True,
    intents=True,
    sentiment=True,
    utterances=True,
    paragraphs=True,
    keywords=["project", "deadline", "budget"],
    search=["important", "action item"]
)

# A FileSource must go through transcribe_file;
# transcribe_url expects a UrlSource.
response = client.listen.rest.transcribe_file(source, options)

# Access different types of results
transcript = response.results.channels[0].alternatives[0].transcript
utterances = response.results.utterances
summary = response.results.summary

Install with Tessl CLI

npx tessl i tessl/pypi-deepgram-sdk

docs

audio-utilities.md

conversational-ai.md

index.md

project-management.md

speech-to-text.md

text-analysis.md

text-to-speech.md

tile.json