CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-deepgram-sdk

The official Python SDK for the Deepgram automated speech recognition platform.

Pending
Overview
Eval results
Files

docs/text-to-speech.md

Text-to-Speech

High-quality neural text-to-speech synthesis with multiple voice models and real-time streaming capabilities. The Speak module supports both REST API for generating complete audio files and WebSocket streaming for real-time audio generation with various voice models, audio formats, and synthesis options.

Capabilities

REST Client (Complete Audio Generation)

Synchronous client for generating complete audio files from text input with comprehensive voice and format options.

class SpeakRESTClient:
    """Synchronous REST client for generating complete audio files from text.

    Obtained via the main client's speak router (``client.speak.rest``).
    All methods accept a text source plus optional synthesis options and
    per-request ``addons``/``headers``/``timeout`` overrides.
    """

    def stream_memory(
        self,
        source: FileSource,
        options: SpeakRESTOptions = None,
        addons: dict = None,
        headers: dict = None,
        timeout = None,
        endpoint: str = "v1/speak",
        **kwargs
    ) -> SpeakRESTResponse:
        """
        Generate speech from text input and return in-memory response.

        Args:
            source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)
            options: Synthesis configuration options
            addons: Additional request parameters
            headers: Additional HTTP headers
            timeout: Request timeout
            endpoint: API endpoint override

        Returns:
            SpeakRESTResponse: Generated audio data with metadata
        """

    def stream_raw(
        self,
        source: FileSource,
        options: SpeakRESTOptions = None,
        addons: dict = None,
        headers: dict = None,
        timeout = None,
        endpoint: str = "v1/speak",
        **kwargs
    ) -> httpx.Response:
        """
        Generate speech and return raw HTTP response.

        Use this instead of stream_memory when you want to handle the HTTP
        body/streaming yourself rather than receive a parsed response object.

        Args:
            source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)
            options: Synthesis configuration options
            addons: Additional request parameters
            headers: Additional HTTP headers
            timeout: Request timeout
            endpoint: API endpoint override

        Returns:
            httpx.Response: Raw HTTP response with audio data
        """

    def save(
        self,
        filename: str,
        source: FileSource,
        options: SpeakRESTOptions = None,
        addons: dict = None,
        headers: dict = None,
        timeout = None,
        endpoint: str = "v1/speak",
        **kwargs
    ) -> SpeakRESTResponse:
        """
        Generate speech and save directly to file.

        Args:
            filename: Output file path
            source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)
            options: Synthesis configuration options
            addons: Additional request parameters
            headers: Additional HTTP headers
            timeout: Request timeout
            endpoint: API endpoint override

        Returns:
            SpeakRESTResponse: Response metadata and status
        """

    # NOTE(review): unlike save(), file() exposes no `headers` parameter —
    # confirm whether that omission is intentional in the SDK.
    def file(
        self,
        filename: str,
        source: FileSource,
        options: SpeakRESTOptions = None,
        addons: dict = None,
        timeout = None,
        endpoint: str = "v1/speak",
        **kwargs
    ) -> SpeakRESTResponse:
        """
        Generate speech and save to file (alias for save method).

        Args:
            filename: Output file path
            source: Text/audio source for synthesis (TextSource, BufferSource, StreamSource)
            options: Synthesis configuration options
            addons: Additional request parameters
            timeout: Request timeout
            endpoint: API endpoint override

        Returns:
            SpeakRESTResponse: Response metadata and status
        """

class AsyncSpeakRESTClient:
    """Asynchronous counterpart of SpeakRESTClient.

    Obtained via ``client.speak.asyncrest``. Each method mirrors the
    corresponding SpeakRESTClient method and must be awaited.
    """

    async def stream_memory(
        self,
        source: FileSource,
        options: SpeakRESTOptions = None,
        addons: dict = None,
        headers: dict = None,
        timeout = None,
        endpoint: str = "v1/speak",
        **kwargs
    ) -> SpeakRESTResponse:
        """Async version of stream_memory: generate speech and return the
        in-memory SpeakRESTResponse. Parameters mirror
        SpeakRESTClient.stream_memory."""

    async def stream_raw(
        self,
        source: FileSource,
        options: SpeakRESTOptions = None,
        addons: dict = None,
        headers: dict = None,
        timeout = None,
        endpoint: str = "v1/speak",
        **kwargs
    ) -> httpx.Response:
        """Async version of stream_raw: generate speech and return the raw
        httpx.Response. Parameters mirror SpeakRESTClient.stream_raw."""

    async def save(
        self,
        filename: str,
        source: FileSource,
        options: SpeakRESTOptions = None,
        addons: dict = None,
        headers: dict = None,
        timeout = None,
        endpoint: str = "v1/speak",
        **kwargs
    ) -> SpeakRESTResponse:
        """Async version of save: generate speech and write it to `filename`.
        Parameters mirror SpeakRESTClient.save."""

    async def file(
        self,
        filename: str,
        source: FileSource,
        options: SpeakRESTOptions = None,
        addons: dict = None,
        timeout = None,
        endpoint: str = "v1/speak",
        **kwargs
    ) -> SpeakRESTResponse:
        """Async version of file (alias for save). Parameters mirror
        SpeakRESTClient.file."""

WebSocket Client (Streaming Audio Generation)

Real-time streaming text-to-speech client supporting incremental text input and real-time audio output.

class SpeakWebSocketClient:
    """Real-time streaming text-to-speech client over WebSocket.

    Text is sent incrementally with send_text()/send(); audio arrives via
    handlers registered with on() for SpeakWebSocketEvents.
    """

    def __init__(self, config: DeepgramClientOptions, microphone: Microphone = None): ...

    def start(
        self,
        options: SpeakWSOptions = None,
        addons: dict = None,
        headers: dict = None,
        members: dict = None,
        **kwargs
    ) -> bool:
        """
        Start WebSocket connection for streaming TTS.

        Args:
            options: WebSocket configuration options
            addons: Additional request parameters
            headers: Additional HTTP headers
            members: Member configuration

        Returns:
            bool: True if connection started successfully
        """

    def on(self, event: SpeakWebSocketEvents, handler: Callable) -> None:
        """
        Register event handler for WebSocket events.

        Args:
            event: WebSocket event type
            handler: Callable to handle the event
        """

    def send_text(self, text_input: str) -> bool:
        """
        Send text for speech synthesis.

        Args:
            text_input: Text to convert to speech

        Returns:
            bool: True if text sent successfully
        """

    # NOTE(review): described as an alias for send_text yet typed to accept
    # bytes as well — confirm how binary payloads are handled.
    def send(self, data: Union[str, bytes]) -> bool:
        """
        Send text data (alias for send_text).

        Args:
            data: Text or bytes to send

        Returns:
            bool: True if data sent successfully
        """

    def send_raw(self, msg: str) -> bool:
        """
        Send raw WebSocket message.

        Args:
            msg: Raw message to send

        Returns:
            bool: True if message sent successfully
        """

    def send_control(
        self, 
        msg_type: Union[SpeakWebSocketMessage, str], 
        data: str = ""
    ) -> bool:
        """
        Send control message.

        Args:
            msg_type: Message type constant (see SpeakWebSocketMessage)
            data: Optional data payload

        Returns:
            bool: True if control message sent successfully
        """

    def flush(self) -> bool:
        """
        Flush current synthesis buffer.

        Returns:
            bool: True if flush successful
        """

    def clear(self) -> bool:
        """
        Clear synthesis buffer.

        Returns:
            bool: True if clear successful
        """

    def finish(self) -> bool:
        """
        Finish WebSocket connection.

        Returns:
            bool: True if finish successful
        """

    def wait_for_complete(self) -> None:
        """
        Block until pending synthesis has completed.
        """

class AsyncSpeakWebSocketClient:
    """Async counterpart of SpeakWebSocketClient.

    All methods are coroutines except on(), which registers handlers
    synchronously.
    """

    def __init__(self, config: DeepgramClientOptions, microphone: Microphone = None): ...

    async def start(...) -> bool: ...
    def on(self, event: SpeakWebSocketEvents, handler: Callable) -> None: ...  # Not async
    async def send_text(self, text_input: str) -> bool: ...
    async def send(self, data: Union[str, bytes]) -> bool: ...
    async def send_raw(self, msg: str) -> bool: ...
    async def send_control(...) -> bool: ...
    async def flush(self) -> bool: ...
    async def clear(self) -> bool: ...
    async def finish(self) -> bool: ...
    async def wait_for_complete(self) -> None: ...

# Alternative client names (backward-compatible short aliases; identical behavior)
class SpeakWSClient(SpeakWebSocketClient): ...
class AsyncSpeakWSClient(AsyncSpeakWebSocketClient): ...

Router Access

Access text-to-speech clients through the main client's speak router.

class SpeakRouter:
    """Entry point for text-to-speech clients, exposed as ``client.speak``.

    Each property returns the corresponding sync/async REST or WebSocket client.
    """

    @property
    def rest(self) -> SpeakRESTClient: ...
    @property
    def asyncrest(self) -> AsyncSpeakRESTClient: ...
    @property
    def websocket(self) -> SpeakWebSocketClient: ...
    @property
    def asyncwebsocket(self) -> AsyncSpeakWebSocketClient: ...

Options Classes

REST Options

class SpeakRESTOptions:
    """Synthesis options for the REST text-to-speech API.

    All options are passed as keyword arguments; unspecified values fall
    back to the defaults shown below.
    """

    def __init__(self, **kwargs): ...

    # Voice model selection
    model: str = "aura-asteria-en"  # Voice model name

    # Audio format settings
    encoding: str = "linear16"  # Audio encoding format
    container: str = "wav"  # Audio container format
    sample_rate: int = 24000  # Sample rate in Hz
    bit_rate: int = None  # Bit rate for compressed formats

    # Additional options
    extra: dict = None  # Additional synthesis options

# Legacy alias (kept for backward compatibility with older SDK code)
class SpeakOptions(SpeakRESTOptions): ...

WebSocket Options

class SpeakWSOptions:
    """Synthesis options for the WebSocket (streaming) text-to-speech API.

    Unlike SpeakRESTOptions, `container` is optional here because streaming
    output is raw encoded audio rather than a containerized file.
    """

    def __init__(self, **kwargs): ...

    # Voice model selection
    model: str = "aura-asteria-en"  # Voice model name

    # Audio format settings (required for WebSocket)
    encoding: str = "linear16"  # Audio encoding format
    sample_rate: int = 24000  # Sample rate in Hz
    container: str = None  # Audio container (optional for streaming)

    # Additional options
    extra: dict = None  # Additional synthesis options

WebSocket Events and Messages

Event constants and message types for WebSocket text-to-speech operations.

class SpeakWebSocketEvents:
    """WebSocket event constants for TTS operations.

    Register handlers with SpeakWebSocketClient.on() using the UPPER_CASE
    attribute names; the string values are the CamelCase wire-format types.
    """
    OPEN: str = "Open"
    METADATA: str = "Metadata"
    AUDIO: str = "Audio"
    FLUSHED: str = "Flushed"
    CLEARED: str = "Cleared"
    CLOSE: str = "Close"
    ERROR: str = "Error"
    WARNING: str = "Warning"
    UNHANDLED: str = "Unhandled"

class SpeakWebSocketMessage:
    """WebSocket message type constants, used with send_control()."""
    SPEAK: str = "Speak"
    FLUSH: str = "Flush"
    CLEAR: str = "Clear"
    CLOSE: str = "Close"

Source Types

Input sources for text data in various formats.

class SpeakSource:
    """Base class for text-to-speech sources."""

# NOTE(review): the REST methods above annotate `source: FileSource`, yet their
# docstrings say TextSource/BufferSource/StreamSource are accepted — confirm
# which types the SDK actually takes.
class TextSource(SpeakSource):
    def __init__(self, text: str):
        """
        Text from string.

        Args:
            text: Text content to synthesize
        """

class BufferSource(SpeakSource):
    def __init__(self, buffer: bytes):
        """
        Text from byte buffer.

        Args:
            buffer: Text content as bytes
        """

class StreamSource(SpeakSource):
    def __init__(self, stream):
        """
        Text from stream object.

        Args:
            stream: File-like stream object
        """

class FileSource(SpeakSource):
    def __init__(self, file: str):
        """
        Text from local file.

        Args:
            file: Path to local text file
        """

# Alternative source names (aliases of SpeakSource)
class SpeakRestSource(SpeakSource): ...
class SpeakRESTSource(SpeakSource): ...

Response Types

REST Response Types

class SpeakRESTResponse:
    """REST text-to-speech response containing generated audio."""
    content: bytes  # Generated audio data
    headers: dict  # Response headers with metadata

    def stream_to_file(self, filename: str) -> None:
        """
        Save audio content to file.

        Args:
            filename: Output file path
        """

# Legacy alias (kept for backward compatibility with older SDK code)
class SpeakResponse(SpeakRESTResponse): ...

WebSocket Response Types

# Each response's `type` string matches the corresponding
# SpeakWebSocketEvents value used when dispatching to handlers.
class SpeakWSMetadataResponse:
    """WebSocket metadata response."""
    type: str = "Metadata"
    request_id: str
    model_name: str
    model_uuid: str

class FlushedResponse:
    """Buffer flush confirmation."""
    type: str = "Flushed"

class ClearedResponse:
    """Buffer clear confirmation."""
    type: str = "Cleared"

class WarningResponse:
    """Synthesis warning."""
    type: str = "Warning"
    message: str

# Common WebSocket responses are inherited from common module:
# OpenResponse, CloseResponse, ErrorResponse, UnhandledResponse

Usage Examples

Basic Text-to-Speech

from deepgram import DeepgramClient, TextSource, SpeakRESTOptions

client = DeepgramClient(api_key="your-api-key")

# Generate speech from text
source = TextSource("Hello, world! This is a test of the Deepgram text-to-speech API.")
options = SpeakRESTOptions(
    model="aura-asteria-en",
    encoding="linear16",
    container="wav",
    sample_rate=24000
)

# stream_memory returns the complete synthesized audio in memory
response = client.speak.rest.stream_memory(source, options)

# Save to file
with open("output.wav", "wb") as f:
    f.write(response.content)

# Or use convenience method
response.stream_to_file("output.wav")

Voice Model Selection

from deepgram import DeepgramClient, TextSource, SpeakRESTOptions

client = DeepgramClient(api_key="your-api-key")

# Different voice models
models = [
    "aura-asteria-en",    # English, female
    "aura-luna-en",       # English, female  
    "aura-stella-en",     # English, female
    "aura-athena-en",     # English, female
    "aura-hera-en",       # English, female
    "aura-orion-en",      # English, male
    "aura-arcas-en",      # English, male
    "aura-perseus-en",    # English, male
    "aura-angus-en",      # English, male
    "aura-orpheus-en",    # English, male
]

source = TextSource("This is a test with different voice models.")

# Synthesize the same text once per model and save each result
for model in models:
    options = SpeakRESTOptions(model=model)
    response = client.speak.rest.stream_memory(source, options)
    response.stream_to_file(f"output_{model}.wav")

Audio Format Options

from deepgram import DeepgramClient, TextSource, SpeakRESTOptions

client = DeepgramClient(api_key="your-api-key")
source = TextSource("Testing different audio formats.")

# WAV format (uncompressed)
wav_options = SpeakRESTOptions(
    model="aura-asteria-en",
    encoding="linear16",
    container="wav",
    sample_rate=24000
)

# MP3 format (compressed; bit_rate applies to compressed encodings)
mp3_options = SpeakRESTOptions(
    model="aura-asteria-en", 
    encoding="mp3",
    container="mp3",
    sample_rate=22050,
    bit_rate=128000
)

# FLAC format (lossless compression)  
flac_options = SpeakRESTOptions(
    model="aura-asteria-en",
    encoding="flac",
    container="flac", 
    sample_rate=24000
)

# Generate in different formats
wav_response = client.speak.rest.stream_memory(source, wav_options)
mp3_response = client.speak.rest.stream_memory(source, mp3_options)
flac_response = client.speak.rest.stream_memory(source, flac_options)

wav_response.stream_to_file("output.wav")
mp3_response.stream_to_file("output.mp3")
flac_response.stream_to_file("output.flac")

Streaming Text-to-Speech

from deepgram import DeepgramClient, SpeakWSOptions, SpeakWebSocketEvents
import queue

client = DeepgramClient(api_key="your-api-key")
audio_queue = queue.Queue()

def on_open(self, open, **kwargs):
    print("TTS connection opened")

def on_audio_data(self, data, **kwargs):
    # Received audio chunk
    audio_queue.put(data)

def on_close(self, close, **kwargs):
    print("TTS connection closed")

def on_error(self, error, **kwargs):
    print(f"TTS error: {error}")

# Configure WebSocket options
options = SpeakWSOptions(
    model="aura-asteria-en",
    encoding="linear16",
    sample_rate=24000
)

# The speak router's websocket property returns the streaming client directly
dg_connection = client.speak.websocket
# Event constants are the UPPER_CASE attributes of SpeakWebSocketEvents
dg_connection.on(SpeakWebSocketEvents.OPEN, on_open)
dg_connection.on(SpeakWebSocketEvents.AUDIO, on_audio_data)
dg_connection.on(SpeakWebSocketEvents.CLOSE, on_close)
dg_connection.on(SpeakWebSocketEvents.ERROR, on_error)

if dg_connection.start(options):
    # Send text incrementally
    dg_connection.send("Hello, this is streaming text-to-speech. ")
    dg_connection.send("I can send text in chunks and receive audio in real-time. ")
    dg_connection.send("This is very useful for interactive applications.")
    
    # Flush to ensure all text is processed
    dg_connection.flush()
    
    # Wait for synthesis to finish so no audio is lost, then close
    dg_connection.wait_for_complete()
    dg_connection.finish()

# Process received audio
audio_data = b""
while not audio_queue.empty():
    audio_data += audio_queue.get()

# Save streamed audio
with open("streamed_output.wav", "wb") as f:
    f.write(audio_data)

Async Text-to-Speech

import asyncio
from deepgram import DeepgramClient, TextSource, SpeakRESTOptions

async def async_tts_example():
    client = DeepgramClient(api_key="your-api-key")
    
    source = TextSource("This is an async text-to-speech example.")
    options = SpeakRESTOptions(
        model="aura-asteria-en",
        encoding="linear16",
        container="wav"
    )
    
    # asyncrest mirrors the sync REST client; stream_memory is the
    # in-memory synthesis method
    response = await client.speak.asyncrest.stream_memory(source, options)
    
    with open("async_output.wav", "wb") as f:
        f.write(response.content)
    
    print("Async TTS completed")

# Run async example
asyncio.run(async_tts_example())

Error Handling

from deepgram import DeepgramClient, DeepgramApiError, TextSource, SpeakRESTOptions

client = DeepgramClient(api_key="your-api-key")

try:
    source = TextSource("Text to synthesize")
    options = SpeakRESTOptions(
        model="invalid-model",  # This will cause an error
        encoding="linear16"
    )
    
    response = client.speak.rest.stream_memory(source, options)
    
except DeepgramApiError as e:
    # API rejected the request (e.g. unknown voice model)
    print(f"API Error: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

Install with Tessl CLI

npx tessl i tessl/pypi-deepgram-sdk

docs

audio-utilities.md

conversational-ai.md

index.md

project-management.md

speech-to-text.md

text-analysis.md

text-to-speech.md

tile.json