CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-deepgram-sdk

The official Python SDK for the Deepgram automated speech recognition platform.

Pending
Overview
Eval results
Files

docs/conversational-ai.md

Conversational AI

Real-time conversational AI capabilities enabling voice-based interactions with intelligent agents. The Agent module supports function calling, dynamic prompt updates, bidirectional audio streaming, and sophisticated conversation management for building interactive voice applications.

Capabilities

Agent WebSocket Client

Real-time WebSocket clients for conversational AI interactions with full duplex audio streaming and message handling.

class AgentWebSocketClient:
    """Synchronous WebSocket client for real-time agent conversations.

    Supports full-duplex audio streaming plus control messages (settings,
    prompt updates, message injection, function-call responses). Every
    method returns a bool indicating whether the operation succeeded.
    Obtain an instance via ``client.agent.websocket``.
    """

    def start(self, options: SettingsOptions) -> bool:
        """
        Start WebSocket connection for agent interaction.

        Args:
            options: Agent configuration settings

        Returns:
            bool: True if connection started successfully
        """

    def send_settings(self, settings: SettingsOptions) -> bool:
        """
        Update agent settings during conversation.

        Args:
            settings: New agent configuration

        Returns:
            bool: True if settings sent successfully
        """

    def update_prompt(self, options: UpdatePromptOptions) -> bool:
        """
        Update the agent's system prompt.

        Args:
            options: New prompt configuration

        Returns:
            bool: True if prompt updated successfully
        """

    def update_speak_options(self, options: UpdateSpeakOptions) -> bool:
        """
        Update the agent's speech synthesis settings.

        Args:
            options: New speak configuration

        Returns:
            bool: True if speak options updated successfully
        """

    def inject_agent_message(self, options: InjectAgentMessageOptions) -> bool:
        """
        Inject a message as if spoken by the agent.

        Args:
            options: Message injection configuration

        Returns:
            bool: True if message injected successfully
        """

    def inject_user_message(self, options: InjectUserMessageOptions) -> bool:
        """
        Inject a message as if spoken by the user.

        Args:
            options: Message injection configuration

        Returns:
            bool: True if message injected successfully
        """

    def send_function_call_response(self, response: FunctionCallResponse) -> bool:
        """
        Send response to agent function call request.

        Args:
            response: Function call result

        Returns:
            bool: True if response sent successfully
        """

    def keep_alive(self) -> bool:
        """
        Send keep-alive message to maintain connection.

        Returns:
            bool: True if keep-alive sent successfully
        """

    def send_audio(self, audio_data: bytes) -> bool:
        """
        Send audio data to the agent.

        Args:
            audio_data: Raw audio bytes

        Returns:
            bool: True if audio sent successfully
        """

    def close(self) -> bool:
        """
        Close WebSocket connection.

        Returns:
            bool: True if connection closed successfully
        """

class AsyncAgentWebSocketClient:
    """Asyncio variant of AgentWebSocketClient.

    Signatures mirror the sync client, but every method is a coroutine
    and must be awaited. Obtain via ``client.agent.asyncwebsocket``.
    """
    # All methods are async versions of AgentWebSocketClient methods
    async def start(self, options: SettingsOptions) -> bool: ...
    async def send_settings(self, settings: SettingsOptions) -> bool: ...
    async def update_prompt(self, options: UpdatePromptOptions) -> bool: ...
    # ... (all other methods with async keyword)

Router Access

Access conversational AI clients through the main client's agent router.

class AgentRouter:
    """Router exposed as ``client.agent``; yields sync or async WebSocket clients."""
    @property
    def websocket(self) -> AgentWebSocketClient: ...
    @property
    def asyncwebsocket(self) -> AsyncAgentWebSocketClient: ...

Options Classes

Top-level Configuration

class SettingsOptions:
    """Top-level agent configuration passed to start()/send_settings()."""
    def __init__(self, **kwargs): ...
    agent: Agent  # Agent configuration (required)
    listen: Listen = None  # Speech-to-text settings (optional)
    speak: Speak = None  # Text-to-speech settings (optional)
    think: Think = None  # Thinking/processing settings (optional)

class UpdatePromptOptions:
    """Payload for update_prompt(): replaces the agent's system prompt mid-conversation."""
    def __init__(self, **kwargs): ...
    prompt: str  # New system prompt text

class UpdateSpeakOptions:
    """Payload for update_speak_options(): swaps the agent's TTS settings."""
    def __init__(self, **kwargs): ...
    speak: Speak  # New speech synthesis settings

class InjectAgentMessageOptions:
    """Payload for inject_agent_message(): text spoken as if by the agent."""
    def __init__(self, **kwargs): ...
    text: str  # Message text to inject

class InjectUserMessageOptions:
    """Payload for inject_user_message(): text treated as if spoken by the user."""
    def __init__(self, **kwargs): ...
    text: str  # User message text to inject

class FunctionCallResponse:
    """Result payload for send_function_call_response() after a FunctionCallRequest."""
    def __init__(self, **kwargs): ...
    name: str  # Function name (should match the requested function)
    result: str  # Function execution result (e.g. a JSON-encoded string)

class AgentKeepAlive:
    """Keep-alive control message; sent by keep_alive() to hold the connection open."""
    def __init__(self, **kwargs): ...
    type: str = "KeepAlive"  # Wire message type

Sub-level Configuration

class Agent:
    """Agent sub-configuration grouping listen/think/speak settings."""
    def __init__(self, **kwargs): ...
    listen: Listen  # Listening (speech-to-text) configuration
    think: Think  # Thinking (LLM) configuration
    speak: Speak  # Speaking (text-to-speech) configuration

class Listen:
    """Speech-to-text (input) settings for the agent."""
    def __init__(self, **kwargs): ...
    model: str = "nova-2"  # STT model
    language: str = "en-US"  # Language code
    smart_format: bool = True  # Smart formatting
    encoding: str = "linear16"  # Audio encoding
    sample_rate: int = 16000  # Sample rate in Hz
    channels: int = 1  # Audio channels
    interim_results: bool = True  # Emit interim (non-final) results
    vad_events: bool = True  # Voice activity detection events
    endpointing: bool = True  # Endpoint (end-of-speech) detection

class Speak:
    """Text-to-speech (output) settings for the agent."""
    def __init__(self, **kwargs): ...
    model: str = "aura-asteria-en"  # TTS voice model
    encoding: str = "linear16"  # Audio encoding
    sample_rate: int = 24000  # Sample rate in Hz
    container: str = "none"  # Audio container format

class Think:
    """LLM ("thinking") settings: provider, model, instructions, callable functions."""
    def __init__(self, **kwargs): ...
    provider: Provider  # AI provider configuration
    model: str = "gpt-4"  # Language model identifier
    instructions: str = ""  # System instructions for the agent
    functions: list[Function] = None  # Functions the agent may call (optional)

class Provider:
    """LLM provider selection used inside Think."""
    def __init__(self, **kwargs): ...
    type: str = "open_ai"  # Provider type identifier
Function Configuration
class Function:
    """Declaration of a callable function exposed to the agent (see Think.functions)."""
    def __init__(self, **kwargs): ...
    name: str  # Function name
    description: str  # Human-readable description shown to the model
    parameters: Parameters  # JSON-schema-style parameters description

class Parameters:
    """JSON-schema-style parameter block for a Function."""
    def __init__(self, **kwargs): ...
    type: str = "object"  # Schema type of the parameter container
    properties: Properties  # Per-parameter property schemas
    required: list[str] = None  # Names of required parameters (optional)

class Properties:
    """Container for per-parameter schemas; attributes are dynamic per function."""
    def __init__(self, **kwargs): ...
    # Dynamic properties based on function parameters

class Header:
    """A single name/value HTTP-style header pair."""
    def __init__(self, **kwargs): ...
    name: str  # Header name
    value: str  # Header value

class Item:
    """Generic item configuration."""
    def __init__(self, **kwargs): ...
    # Fields are not documented here — consult the SDK source for the schema

class Input:
    """Input (audio-in) configuration."""
    def __init__(self, **kwargs): ...
    # Fields are not documented here — consult the SDK source for the schema

class Output:
    """Output (audio-out) configuration."""
    def __init__(self, **kwargs): ...
    # Fields are not documented here — consult the SDK source for the schema

class Audio:
    """Audio configuration."""
    def __init__(self, **kwargs): ...
    # Fields are not documented here — consult the SDK source for the schema

class Endpoint:
    """Endpoint configuration."""
    def __init__(self, **kwargs): ...
    # Fields are not documented here — consult the SDK source for the schema

Response Types

Agent-Specific Responses

class WelcomeResponse:
    """Initial connection welcome message"""
    type: str = "Welcome"  # Wire message type
    message: str  # Welcome message text

class SettingsAppliedResponse:
    """Settings update confirmation"""
    type: str = "SettingsApplied"  # Wire message type
    settings: dict  # The settings the server applied

class ConversationTextResponse:
    """Conversation text event"""
    type: str = "ConversationText"  # Wire message type
    text: str  # Transcribed or generated utterance text
    role: str  # "user" or "assistant"

class UserStartedSpeakingResponse:
    """User speech detection event"""
    type: str = "UserStartedSpeaking"  # Wire message type
    timestamp: str  # When speech was detected

class AgentThinkingResponse:
    """Agent processing indication (no payload beyond the type)"""
    type: str = "AgentThinking"  # Wire message type

class FunctionCall:
    """Function call data"""
    name: str  # Name of the function the agent wants to call
    arguments: dict  # Arguments keyed by parameter name

class FunctionCallRequest:
    """Function call request from agent"""
    type: str = "FunctionCallRequest"  # Wire message type
    function_call: FunctionCall  # Requested function and its arguments
    call_id: str  # Request identifier — presumably used to correlate the response; TODO confirm

class AgentStartedSpeakingResponse:
    """Agent speech start event"""
    type: str = "AgentStartedSpeaking"  # Wire message type
    timestamp: str  # When the agent began speaking

class AgentAudioDoneResponse:
    """Agent finished speaking event (no payload beyond the type)"""
    type: str = "AgentAudioDone"  # Wire message type

class InjectionRefusedResponse:
    """Message injection refusal, sent by the server when an inject request is declined"""
    type: str = "InjectionRefused"  # Wire message type
    message: str  # Reason or description of the refusal
    
# Common WebSocket responses are inherited:
# OpenResponse, CloseResponse, ErrorResponse, UnhandledResponse

Events

class AgentWebSocketEvents:
    """WebSocket event types for conversational AI.

    Each attribute value is the literal message-type string used on the
    wire. Register handlers for server events via ``connection.on(...)``;
    client event names correspond to messages the client sends.
    """

    # Server Events (received from agent)
    Open: str = "Open"
    Close: str = "Close"
    AudioData: str = "AudioData"
    Welcome: str = "Welcome"
    SettingsApplied: str = "SettingsApplied"
    ConversationText: str = "ConversationText"
    UserStartedSpeaking: str = "UserStartedSpeaking"
    AgentThinking: str = "AgentThinking"
    FunctionCallRequest: str = "FunctionCallRequest"
    AgentStartedSpeaking: str = "AgentStartedSpeaking"
    AgentAudioDone: str = "AgentAudioDone"
    # InjectionRefused is emitted BY the server when a message-injection
    # request is declined (see InjectionRefusedResponse), so it belongs
    # with the server events, not the client events.
    InjectionRefused: str = "InjectionRefused"
    Error: str = "Error"
    Unhandled: str = "Unhandled"

    # Client Events (sent to agent)
    Settings: str = "Settings"
    UpdatePrompt: str = "UpdatePrompt"
    UpdateSpeak: str = "UpdateSpeak"
    InjectAgentMessage: str = "InjectAgentMessage"
    InjectUserMessage: str = "InjectUserMessage"
    # NOTE: attribute name differs from the wire value here.
    AgentKeepAlive: str = "KeepAlive"

Usage Examples

Basic Conversational Agent

# Example: minimal synchronous agent session wiring up every event handler.
from deepgram import DeepgramClient, SettingsOptions, Agent, Listen, Speak, Think, Provider, AgentWebSocketEvents
import threading  # NOTE: unused in this snippet; typically used to pump microphone audio

client = DeepgramClient(api_key="your-api-key")

# Handlers are plain module-level functions registered via dg_connection.on().
# The first parameter is named `self` because the SDK passes the client
# instance as the first positional argument — presumably; TODO confirm.
def on_open(self, open_event, **kwargs):
    print("Agent connection opened")

def on_welcome(self, welcome, **kwargs):
    print(f"Agent welcome: {welcome.message}")

def on_conversation_text(self, text_event, **kwargs):
    # role is "user" or "assistant" (see ConversationTextResponse)
    print(f"{text_event.role}: {text_event.text}")

def on_user_started_speaking(self, event, **kwargs):
    print("User started speaking")

def on_agent_thinking(self, event, **kwargs):
    print("Agent is thinking...")

def on_agent_started_speaking(self, event, **kwargs):
    print("Agent started speaking")

def on_agent_audio_done(self, event, **kwargs):
    print("Agent finished speaking")

def on_audio_data(self, audio_data, **kwargs):
    # Handle agent's speech audio
    # In a real application, you'd play this audio
    print(f"Received {len(audio_data)} bytes of audio")

def on_error(self, error, **kwargs):
    print(f"Agent error: {error}")

# Configure agent settings
agent_settings = SettingsOptions(
    agent=Agent(
        listen=Listen(
            model="nova-2",
            language="en-US",
            smart_format=True,
            encoding="linear16",
            sample_rate=16000,
            interim_results=True,
            vad_events=True
        ),
        think=Think(
            provider=Provider(type="open_ai"),
            model="gpt-4",
            instructions="You are a helpful AI assistant. Be conversational and friendly."
        ),
        speak=Speak(
            model="aura-asteria-en",
            encoding="linear16",
            sample_rate=24000
        )
    )
)

# Create connection (v("1") selects API version 1)
dg_connection = client.agent.websocket.v("1")

# Set up event handlers
dg_connection.on(AgentWebSocketEvents.Open, on_open)
dg_connection.on(AgentWebSocketEvents.Welcome, on_welcome)
dg_connection.on(AgentWebSocketEvents.ConversationText, on_conversation_text)
dg_connection.on(AgentWebSocketEvents.UserStartedSpeaking, on_user_started_speaking)
dg_connection.on(AgentWebSocketEvents.AgentThinking, on_agent_thinking)
dg_connection.on(AgentWebSocketEvents.AgentStartedSpeaking, on_agent_started_speaking)
dg_connection.on(AgentWebSocketEvents.AgentAudioDone, on_agent_audio_done)
dg_connection.on(AgentWebSocketEvents.AudioData, on_audio_data)
dg_connection.on(AgentWebSocketEvents.Error, on_error)

# Start connection
if dg_connection.start(agent_settings):
    print("Agent connection started")
    
    # Send audio data (typically from microphone)
    # audio_data = get_microphone_data()
    # dg_connection.send_audio(audio_data)
    
    # Keep connection alive
    # dg_connection.keep_alive()
    
    # Close when done (here the session is closed immediately for brevity)
    dg_connection.close()

Agent with Function Calling

# Example: agent with function calling — the agent requests a function,
# the handler executes it locally and returns the result to the agent.
from deepgram import (
    DeepgramClient, SettingsOptions, Agent, Think, Provider, Function, 
    Parameters, Properties, FunctionCallResponse, AgentWebSocketEvents
)
import json

client = DeepgramClient(api_key="your-api-key")

def on_function_call_request(self, request, **kwargs):
    """Handle function call requests from the agent"""
    # NOTE: `dg_connection` is defined later at module level; Python
    # resolves it at call time, so this forward reference is fine.
    print(f"Function call: {request.function_call.name}")
    print(f"Arguments: {request.function_call.arguments}")
    
    # Execute the function based on name
    if request.function_call.name == "get_weather":
        location = request.function_call.arguments.get("location")
        weather_data = get_weather(location)  # Your weather function
        
        # Send response back to agent
        response = FunctionCallResponse(
            name=request.function_call.name,
            result=json.dumps(weather_data)
        )
        dg_connection.send_function_call_response(response)
    
    elif request.function_call.name == "set_reminder":
        reminder = request.function_call.arguments.get("reminder")
        time = request.function_call.arguments.get("time")
        result = set_reminder(reminder, time)  # Your reminder function
        
        response = FunctionCallResponse(
            name=request.function_call.name,
            result=json.dumps({"success": result})
        )
        dg_connection.send_function_call_response(response)

def get_weather(location):
    """Mock weather function"""
    return {
        "location": location,
        "temperature": 72,
        "condition": "sunny",
        "humidity": 45
    }

def set_reminder(reminder, time):
    """Mock reminder function"""
    print(f"Setting reminder: {reminder} at {time}")
    return True

# Define available functions (JSON-schema-style parameter descriptions)
weather_function = Function(
    name="get_weather",
    description="Get current weather information for a location",
    parameters=Parameters(
        type="object",
        properties={
            "location": {"type": "string", "description": "City name or location"}
        },
        required=["location"]
    )
)

reminder_function = Function(
    name="set_reminder",
    description="Set a reminder for the user",
    parameters=Parameters(
        type="object",
        properties={
            "reminder": {"type": "string", "description": "Reminder text"},
            "time": {"type": "string", "description": "Time for the reminder"}
        },
        required=["reminder", "time"]
    )
)

# Configure agent with functions
agent_settings = SettingsOptions(
    agent=Agent(
        think=Think(
            provider=Provider(type="open_ai"),
            model="gpt-4",
            instructions="You are a helpful assistant with access to weather and reminder functions. Use them when appropriate.",
            functions=[weather_function, reminder_function]
        )
        # ... other agent configuration
    )
)

dg_connection = client.agent.websocket.v("1")
dg_connection.on(AgentWebSocketEvents.FunctionCallRequest, on_function_call_request)

if dg_connection.start(agent_settings):
    # Agent can now call functions during conversation
    pass

Dynamic Agent Updates

# Example: changing an agent's prompt, voice, and conversation context
# on the fly during a live session.
from deepgram import (
    DeepgramClient, UpdatePromptOptions, UpdateSpeakOptions, 
    InjectAgentMessageOptions, InjectUserMessageOptions, Speak
)

client = DeepgramClient(api_key="your-api-key")
dg_connection = client.agent.websocket.v("1")

# Start with initial settings
# NOTE: `initial_settings` is assumed to be a SettingsOptions built as in
# the basic example; it is not defined in this snippet.
if dg_connection.start(initial_settings):
    
    # Update the agent's personality/instructions
    new_prompt = UpdatePromptOptions(
        prompt="You are now a cheerful children's storyteller. Use simple language and be very enthusiastic."
    )
    dg_connection.update_prompt(new_prompt)
    
    # Change the voice model
    new_speak_options = UpdateSpeakOptions(
        speak=Speak(
            model="aura-luna-en",  # Different voice
            encoding="linear16",
            sample_rate=24000
        )
    )
    dg_connection.update_speak_options(new_speak_options)
    
    # Inject context into the conversation
    agent_message = InjectAgentMessageOptions(
        text="I just switched to storytelling mode! What kind of story would you like to hear?"
    )
    dg_connection.inject_agent_message(agent_message)
    
    # Inject user context
    user_message = InjectUserMessageOptions(
        text="The user mentioned they like adventure stories about pirates."
    )
    dg_connection.inject_user_message(user_message)

Multi-Agent Conversation

# Example: running several async agent connections concurrently, each with
# a different role, from a single client.
from deepgram import DeepgramClient, SettingsOptions, Agent, Think, Provider
import asyncio

async def create_agent(client, agent_id, instructions):
    """Create and configure an agent"""
    settings = SettingsOptions(
        agent=Agent(
            think=Think(
                provider=Provider(type="open_ai"),
                model="gpt-4",
                instructions=f"Agent {agent_id}: {instructions}"
            )
            # ... other configuration
        )
    )
    
    # asyncwebsocket yields the awaitable client variant
    connection = client.agent.asyncwebsocket.v("1")
    await connection.start(settings)
    return connection

async def multi_agent_example():
    client = DeepgramClient(api_key="your-api-key")
    
    # Create multiple agents with different roles
    moderator = await create_agent(
        client, "Moderator", 
        "You are a meeting moderator. Keep discussions on track and summarize key points."
    )
    
    expert1 = await create_agent(
        client, "Expert1",
        "You are a technical expert. Provide detailed technical insights."
    )
    
    expert2 = await create_agent(
        client, "Expert2", 
        "You are a business expert. Focus on practical business implications."
    )
    
    # Coordinate conversation between agents
    # This would involve managing turn-taking and message passing
    # between the different agent connections
    
    # Close connections when done
    await moderator.close()
    await expert1.close() 
    await expert2.close()

# Run multi-agent example
asyncio.run(multi_agent_example())

Error Handling and Recovery

# Example: defensive session setup — error-event handlers plus exception
# handling around start(), with a guaranteed close() in `finally`.
from deepgram import DeepgramClient, DeepgramApiError, SettingsOptions, AgentWebSocketEvents

client = DeepgramClient(api_key="your-api-key")

def on_error(self, error, **kwargs):
    """Handle various error types"""
    print(f"Agent error: {error}")
    
    # Implement error-specific recovery logic
    # (classification here is by substring of the error text)
    if "connection" in str(error).lower():
        print("Connection error - attempting to reconnect...")
        # Implement reconnection logic
    elif "authentication" in str(error).lower():
        print("Authentication error - check API key")
    else:
        print("Unknown error - logging for investigation")

def on_injection_refused(self, refusal, **kwargs):
    """Handle message injection refusals"""
    print(f"Message injection refused: {refusal.message}")
    # Implement fallback logic

try:
    settings = SettingsOptions(
        # ... agent configuration
    )
    
    dg_connection = client.agent.websocket.v("1")
    dg_connection.on(AgentWebSocketEvents.Error, on_error)
    dg_connection.on(AgentWebSocketEvents.InjectionRefused, on_injection_refused)
    
    if dg_connection.start(settings):
        # Connection successful
        print("Agent started successfully")
        
        # Implement connection health monitoring
        # dg_connection.keep_alive()  # Send periodically
        
    else:
        print("Failed to start agent connection")
        
except DeepgramApiError as e:
    print(f"API Error: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")
finally:
    # Guard with locals() because dg_connection is unbound if
    # SettingsOptions construction raised before it was assigned.
    if 'dg_connection' in locals():
        dg_connection.close()

Install with Tessl CLI

npx tessl i tessl/pypi-deepgram-sdk

docs

audio-utilities.md

conversational-ai.md

index.md

project-management.md

speech-to-text.md

text-analysis.md

text-to-speech.md

tile.json