0
# Conversational AI
1
2
The Agent module provides real-time conversational AI for voice-based interactions with intelligent agents. It supports function calling, dynamic prompt updates, bidirectional audio streaming, and conversation management for building interactive voice applications.
3
4
## Capabilities
5
6
### Agent WebSocket Client
7
8
Synchronous (`AgentWebSocketClient`) and asynchronous (`AsyncAgentWebSocketClient`) WebSocket clients for real-time agent conversations, with full-duplex audio streaming and message handling.
9
10
```python { .api }
11
class AgentWebSocketClient:
12
def start(self, options: SettingsOptions) -> bool:
13
"""
14
Start WebSocket connection for agent interaction.
15
16
Args:
17
options: Agent configuration settings
18
19
Returns:
20
bool: True if connection started successfully
21
"""
22
23
def send_settings(self, settings: SettingsOptions) -> bool:
24
"""
25
Update agent settings during conversation.
26
27
Args:
28
settings: New agent configuration
29
30
Returns:
31
bool: True if settings sent successfully
32
"""
33
34
def update_prompt(self, options: UpdatePromptOptions) -> bool:
35
"""
36
Update the agent's system prompt.
37
38
Args:
39
options: New prompt configuration
40
41
Returns:
42
bool: True if prompt updated successfully
43
"""
44
45
def update_speak_options(self, options: UpdateSpeakOptions) -> bool:
46
"""
47
Update the agent's speech synthesis settings.
48
49
Args:
50
options: New speak configuration
51
52
Returns:
53
bool: True if speak options updated successfully
54
"""
55
56
def inject_agent_message(self, options: InjectAgentMessageOptions) -> bool:
57
"""
58
Inject a message as if spoken by the agent.
59
60
Args:
61
options: Message injection configuration
62
63
Returns:
64
bool: True if message injected successfully
65
"""
66
67
def inject_user_message(self, options: InjectUserMessageOptions) -> bool:
68
"""
69
Inject a message as if spoken by the user.
70
71
Args:
72
options: Message injection configuration
73
74
Returns:
75
bool: True if message injected successfully
76
"""
77
78
def send_function_call_response(self, response: FunctionCallResponse) -> bool:
79
"""
80
Send response to agent function call request.
81
82
Args:
83
response: Function call result
84
85
Returns:
86
bool: True if response sent successfully
87
"""
88
89
def keep_alive(self) -> bool:
90
"""
91
Send keep-alive message to maintain connection.
92
93
Returns:
94
bool: True if keep-alive sent successfully
95
"""
96
97
def send_audio(self, audio_data: bytes) -> bool:
98
"""
99
Send audio data to the agent.
100
101
Args:
102
audio_data: Raw audio bytes
103
104
Returns:
105
bool: True if audio sent successfully
106
"""
107
108
def close(self) -> bool:
109
"""
110
Close WebSocket connection.
111
112
Returns:
113
bool: True if connection closed successfully
114
"""
115
116
class AsyncAgentWebSocketClient:
117
# All methods are async versions of AgentWebSocketClient methods
118
async def start(self, options: SettingsOptions) -> bool: ...
119
async def send_settings(self, settings: SettingsOptions) -> bool: ...
120
async def update_prompt(self, options: UpdatePromptOptions) -> bool: ...
121
# ... (all other methods with async keyword)
122
```
123
124
### Router Access
125
126
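Access the conversational AI clients through the main client's agent router, for example `client.agent.websocket.v("1")` for the synchronous client or `client.agent.asyncwebsocket.v("1")` for the asynchronous one.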
127
128
```python { .api }
129
class AgentRouter:
130
@property
131
def websocket(self) -> AgentWebSocketClient: ...
132
@property
133
def asyncwebsocket(self) -> AsyncAgentWebSocketClient: ...
134
```
135
136
### Options Classes
137
138
#### Top-level Configuration
139
140
```python { .api }
141
class SettingsOptions:
142
def __init__(self, **kwargs): ...
143
agent: Agent # Agent configuration
144
listen: Listen = None # Speech-to-text settings
145
speak: Speak = None # Text-to-speech settings
146
think: Think = None # Thinking/processing settings
147
148
class UpdatePromptOptions:
149
def __init__(self, **kwargs): ...
150
prompt: str # New system prompt text
151
152
class UpdateSpeakOptions:
153
def __init__(self, **kwargs): ...
154
speak: Speak # New speech synthesis settings
155
156
class InjectAgentMessageOptions:
157
def __init__(self, **kwargs): ...
158
text: str # Message text to inject
159
160
class InjectUserMessageOptions:
161
def __init__(self, **kwargs): ...
162
text: str # User message text to inject
163
164
class FunctionCallResponse:
165
def __init__(self, **kwargs): ...
166
name: str # Function name
167
result: str # Function execution result
168
169
class AgentKeepAlive:
170
def __init__(self, **kwargs): ...
171
type: str = "KeepAlive" # Message type
172
```
173
174
#### Sub-level Configuration
175
176
```python { .api }
177
class Agent:
178
def __init__(self, **kwargs): ...
179
listen: Listen # Listening configuration
180
think: Think # Thinking configuration
181
speak: Speak # Speaking configuration
182
183
class Listen:
184
def __init__(self, **kwargs): ...
185
model: str = "nova-2" # STT model
186
language: str = "en-US" # Language code
187
smart_format: bool = True # Smart formatting
188
encoding: str = "linear16" # Audio encoding
189
sample_rate: int = 16000 # Sample rate
190
channels: int = 1 # Audio channels
191
interim_results: bool = True # Interim results
192
vad_events: bool = True # Voice activity detection
193
endpointing: bool = True # Endpoint detection
194
195
class Speak:
196
def __init__(self, **kwargs): ...
197
model: str = "aura-asteria-en" # TTS model
198
encoding: str = "linear16" # Audio encoding
199
sample_rate: int = 24000 # Sample rate
200
container: str = "none" # Audio container
201
202
class Think:
203
def __init__(self, **kwargs): ...
204
provider: Provider # AI provider configuration
205
model: str = "gpt-4" # Language model
206
instructions: str = "" # System instructions
207
functions: list[Function] = None # Available functions
208
209
class Provider:
210
def __init__(self, **kwargs): ...
211
type: str = "open_ai" # Provider type
212
```
213
214
#### Function Configuration
215
216
```python { .api }
217
class Function:
218
def __init__(self, **kwargs): ...
219
name: str # Function name
220
description: str # Function description
221
parameters: Parameters # Function parameters schema
222
223
class Parameters:
224
def __init__(self, **kwargs): ...
225
type: str = "object" # Parameters type
226
properties: Properties # Parameter properties
227
required: list[str] = None # Required parameters
228
229
class Properties:
230
def __init__(self, **kwargs): ...
231
# Dynamic properties based on function parameters
232
233
class Header:
234
def __init__(self, **kwargs): ...
235
name: str # Header name
236
value: str # Header value
237
238
class Item:
239
def __init__(self, **kwargs): ...
240
# Generic item configuration
241
242
class Input:
243
def __init__(self, **kwargs): ...
244
# Input configuration
245
246
class Output:
247
def __init__(self, **kwargs): ...
248
# Output configuration
249
250
class Audio:
251
def __init__(self, **kwargs): ...
252
# Audio configuration
253
254
class Endpoint:
255
def __init__(self, **kwargs): ...
256
# Endpoint configuration
257
```
258
259
### Response Types
260
261
#### Agent-Specific Responses
262
263
```python { .api }
264
class WelcomeResponse:
265
"""Initial connection welcome message"""
266
type: str = "Welcome"
267
message: str
268
269
class SettingsAppliedResponse:
270
"""Settings update confirmation"""
271
type: str = "SettingsApplied"
272
settings: dict
273
274
class ConversationTextResponse:
275
"""Conversation text event"""
276
type: str = "ConversationText"
277
text: str
278
role: str # "user" or "assistant"
279
280
class UserStartedSpeakingResponse:
281
"""User speech detection event"""
282
type: str = "UserStartedSpeaking"
283
timestamp: str
284
285
class AgentThinkingResponse:
286
"""Agent processing indication"""
287
type: str = "AgentThinking"
288
289
class FunctionCall:
290
"""Function call data"""
291
name: str
292
arguments: dict
293
294
class FunctionCallRequest:
295
"""Function call request from agent"""
296
type: str = "FunctionCallRequest"
297
function_call: FunctionCall
298
call_id: str
299
300
class AgentStartedSpeakingResponse:
301
"""Agent speech start event"""
302
type: str = "AgentStartedSpeaking"
303
timestamp: str
304
305
class AgentAudioDoneResponse:
306
"""Agent finished speaking event"""
307
type: str = "AgentAudioDone"
308
309
class InjectionRefusedResponse:
310
"""Message injection refusal"""
311
type: str = "InjectionRefused"
312
message: str
313
314
# Common WebSocket responses are inherited:
315
# OpenResponse, CloseResponse, ErrorResponse, UnhandledResponse
316
```
317
318
### Events
319
320
```python { .api }
321
class AgentWebSocketEvents:
    """WebSocket event types for conversational AI.

    Each attribute holds the event name string that is passed to
    ``connection.on(...)`` when registering a handler.
    """

    # Server Events (received from agent)
    Open: str = "Open"
    Close: str = "Close"
    AudioData: str = "AudioData"
    Welcome: str = "Welcome"
    SettingsApplied: str = "SettingsApplied"
    ConversationText: str = "ConversationText"
    UserStartedSpeaking: str = "UserStartedSpeaking"
    AgentThinking: str = "AgentThinking"
    FunctionCallRequest: str = "FunctionCallRequest"
    AgentStartedSpeaking: str = "AgentStartedSpeaking"
    AgentAudioDone: str = "AgentAudioDone"
    # Listed with the server events because it has a response type
    # (InjectionRefusedResponse) and handlers subscribe to it.
    InjectionRefused: str = "InjectionRefused"
    Error: str = "Error"
    Unhandled: str = "Unhandled"

    # Client Events (sent to agent)
    Settings: str = "Settings"
    UpdatePrompt: str = "UpdatePrompt"
    UpdateSpeak: str = "UpdateSpeak"
    InjectAgentMessage: str = "InjectAgentMessage"
    InjectUserMessage: str = "InjectUserMessage"
    AgentKeepAlive: str = "KeepAlive"
347
```
348
349
## Usage Examples
350
351
### Basic Conversational Agent
352
353
```python
354
from deepgram import DeepgramClient, SettingsOptions, Agent, Listen, Speak, Think, Provider, AgentWebSocketEvents
355
import threading
356
357
client = DeepgramClient(api_key="your-api-key")
358
359
def on_open(self, open_event, **kwargs):
360
print("Agent connection opened")
361
362
def on_welcome(self, welcome, **kwargs):
363
print(f"Agent welcome: {welcome.message}")
364
365
def on_conversation_text(self, text_event, **kwargs):
366
print(f"{text_event.role}: {text_event.text}")
367
368
def on_user_started_speaking(self, event, **kwargs):
369
print("User started speaking")
370
371
def on_agent_thinking(self, event, **kwargs):
372
print("Agent is thinking...")
373
374
def on_agent_started_speaking(self, event, **kwargs):
375
print("Agent started speaking")
376
377
def on_agent_audio_done(self, event, **kwargs):
378
print("Agent finished speaking")
379
380
def on_audio_data(self, audio_data, **kwargs):
381
# Handle agent's speech audio
382
# In a real application, you'd play this audio
383
print(f"Received {len(audio_data)} bytes of audio")
384
385
def on_error(self, error, **kwargs):
386
print(f"Agent error: {error}")
387
388
# Configure agent settings
389
agent_settings = SettingsOptions(
390
agent=Agent(
391
listen=Listen(
392
model="nova-2",
393
language="en-US",
394
smart_format=True,
395
encoding="linear16",
396
sample_rate=16000,
397
interim_results=True,
398
vad_events=True
399
),
400
think=Think(
401
provider=Provider(type="open_ai"),
402
model="gpt-4",
403
instructions="You are a helpful AI assistant. Be conversational and friendly."
404
),
405
speak=Speak(
406
model="aura-asteria-en",
407
encoding="linear16",
408
sample_rate=24000
409
)
410
)
411
)
412
413
# Create connection
414
dg_connection = client.agent.websocket.v("1")
415
416
# Set up event handlers
417
dg_connection.on(AgentWebSocketEvents.Open, on_open)
418
dg_connection.on(AgentWebSocketEvents.Welcome, on_welcome)
419
dg_connection.on(AgentWebSocketEvents.ConversationText, on_conversation_text)
420
dg_connection.on(AgentWebSocketEvents.UserStartedSpeaking, on_user_started_speaking)
421
dg_connection.on(AgentWebSocketEvents.AgentThinking, on_agent_thinking)
422
dg_connection.on(AgentWebSocketEvents.AgentStartedSpeaking, on_agent_started_speaking)
423
dg_connection.on(AgentWebSocketEvents.AgentAudioDone, on_agent_audio_done)
424
dg_connection.on(AgentWebSocketEvents.AudioData, on_audio_data)
425
dg_connection.on(AgentWebSocketEvents.Error, on_error)
426
427
# Start connection
428
if dg_connection.start(agent_settings):
429
print("Agent connection started")
430
431
# Send audio data (typically from microphone)
432
# audio_data = get_microphone_data()
433
# dg_connection.send_audio(audio_data)
434
435
# Keep connection alive
436
# dg_connection.keep_alive()
437
438
# Close when done
439
dg_connection.close()
440
```
441
442
### Agent with Function Calling
443
444
```python
445
from deepgram import (
446
DeepgramClient, SettingsOptions, Agent, Think, Provider, Function,
447
Parameters, Properties, FunctionCallResponse, AgentWebSocketEvents
448
)
449
import json
450
451
client = DeepgramClient(api_key="your-api-key")
452
453
def on_function_call_request(self, request, **kwargs):
    """Handle function call requests from the agent.

    Runs the local function named in ``request.function_call`` and sends its
    JSON-encoded result back over the module-level ``dg_connection``. A
    request for a function this client does not implement is answered with
    an error payload rather than being silently ignored.

    Args:
        request: Function call request carrying ``function_call.name`` and
            ``function_call.arguments``.
        **kwargs: Extra handler arguments; unused.
    """
    call = request.function_call
    name = call.name
    arguments = call.arguments
    print(f"Function call: {name}")
    print(f"Arguments: {arguments}")

    # Execute the function based on name
    if name == "get_weather":
        payload = get_weather(arguments.get("location"))  # Your weather function
    elif name == "set_reminder":
        succeeded = set_reminder(  # Your reminder function
            arguments.get("reminder"),
            arguments.get("time"),
        )
        payload = {"success": succeeded}
    else:
        payload = {"error": f"Unknown function: {name}"}

    # Send response back to agent
    response = FunctionCallResponse(name=name, result=json.dumps(payload))
    dg_connection.send_function_call_response(response)
480
481
def get_weather(location):
    """Mock weather function.

    Args:
        location: Place name to report on; echoed back unchanged.

    Returns:
        dict: Fixed sample reading with ``location``, ``temperature``,
        ``condition`` and ``humidity`` keys.
    """
    # Canned values: this stands in for a real weather lookup.
    return {
        "location": location,
        "temperature": 72,
        "condition": "sunny",
        "humidity": 45,
    }
489
490
def set_reminder(reminder, time):
    """Mock reminder function.

    Args:
        reminder: Reminder text.
        time: When the reminder is due; printed as given.

    Returns:
        bool: Always True, since this mock only prints the reminder.
    """
    print(f"Setting reminder: {reminder} at {time}")
    return True
494
495
# Define available functions
496
weather_function = Function(
497
name="get_weather",
498
description="Get current weather information for a location",
499
parameters=Parameters(
500
type="object",
501
properties={
502
"location": {"type": "string", "description": "City name or location"}
503
},
504
required=["location"]
505
)
506
)
507
508
reminder_function = Function(
509
name="set_reminder",
510
description="Set a reminder for the user",
511
parameters=Parameters(
512
type="object",
513
properties={
514
"reminder": {"type": "string", "description": "Reminder text"},
515
"time": {"type": "string", "description": "Time for the reminder"}
516
},
517
required=["reminder", "time"]
518
)
519
)
520
521
# Configure agent with functions
522
agent_settings = SettingsOptions(
523
agent=Agent(
524
think=Think(
525
provider=Provider(type="open_ai"),
526
model="gpt-4",
527
instructions="You are a helpful assistant with access to weather and reminder functions. Use them when appropriate.",
528
functions=[weather_function, reminder_function]
529
)
530
# ... other agent configuration
531
)
532
)
533
534
dg_connection = client.agent.websocket.v("1")
535
dg_connection.on(AgentWebSocketEvents.FunctionCallRequest, on_function_call_request)
536
537
if dg_connection.start(agent_settings):
538
# Agent can now call functions during conversation
539
pass
540
```
541
542
### Dynamic Agent Updates
543
544
```python
545
from deepgram import (
    DeepgramClient, SettingsOptions, UpdatePromptOptions, UpdateSpeakOptions,
    InjectAgentMessageOptions, InjectUserMessageOptions, Speak
)

client = DeepgramClient(api_key="your-api-key")
dg_connection = client.agent.websocket.v("1")

# Configuration the session starts with, before any of the updates below
initial_settings = SettingsOptions(
    # ... agent configuration
)

# Start with initial settings
if dg_connection.start(initial_settings):

    # Update the agent's personality/instructions
    new_prompt = UpdatePromptOptions(
        prompt="You are now a cheerful children's storyteller. Use simple language and be very enthusiastic."
    )
    dg_connection.update_prompt(new_prompt)

    # Change the voice model
    new_speak_options = UpdateSpeakOptions(
        speak=Speak(
            model="aura-luna-en",  # Different voice
            encoding="linear16",
            sample_rate=24000
        )
    )
    dg_connection.update_speak_options(new_speak_options)

    # Inject context into the conversation
    agent_message = InjectAgentMessageOptions(
        text="I just switched to storytelling mode! What kind of story would you like to hear?"
    )
    dg_connection.inject_agent_message(agent_message)

    # Inject user context
    user_message = InjectUserMessageOptions(
        text="The user mentioned they like adventure stories about pirates."
    )
    dg_connection.inject_user_message(user_message)
583
```
584
585
### Multi-Agent Conversation
586
587
```python
588
from deepgram import DeepgramClient, SettingsOptions, Agent, Think, Provider
589
import asyncio
590
591
async def create_agent(client, agent_id, instructions):
    """Create, configure and start an agent connection.

    Args:
        client: Deepgram client used to open the connection.
        agent_id: Label prefixed to the agent's instructions.
        instructions: Role description for this agent.

    Returns:
        The started async agent WebSocket connection.

    Raises:
        RuntimeError: If ``start()`` reports that the connection did not start.
    """
    settings = SettingsOptions(
        agent=Agent(
            think=Think(
                provider=Provider(type="open_ai"),
                model="gpt-4",
                instructions=f"Agent {agent_id}: {instructions}",
            ),
            # ... other configuration
        )
    )

    connection = client.agent.asyncwebsocket.v("1")
    # start() returns a bool; do not hand back a connection that never started.
    if not await connection.start(settings):
        raise RuntimeError(f"Agent {agent_id} failed to start")
    return connection
607
608
async def multi_agent_example():
    """Start three role-specific agents and close every one that was opened.

    Agents are created one after another in the order moderator, expert1,
    expert2. Cleanup runs in a ``finally`` block so connections that were
    already opened are closed even if a later step raises.
    """
    client = DeepgramClient(api_key="your-api-key")

    # Create multiple agents with different roles
    roles = [
        (
            "Moderator",
            "You are a meeting moderator. Keep discussions on track and summarize key points.",
        ),
        (
            "Expert1",
            "You are a technical expert. Provide detailed technical insights.",
        ),
        (
            "Expert2",
            "You are a business expert. Focus on practical business implications.",
        ),
    ]

    connections = []
    try:
        for agent_id, instructions in roles:
            connections.append(await create_agent(client, agent_id, instructions))
        moderator, expert1, expert2 = connections

        # Coordinate conversation between agents
        # This would involve managing turn-taking and message passing
        # between the different agent connections
    finally:
        # Close connections when done
        for connection in connections:
            await connection.close()
635
636
# Run multi-agent example
637
asyncio.run(multi_agent_example())
638
```
639
640
### Error Handling and Recovery
641
642
```python
643
from deepgram import DeepgramClient, DeepgramApiError, SettingsOptions, AgentWebSocketEvents
644
645
client = DeepgramClient(api_key="your-api-key")
646
647
def on_error(self, error, **kwargs):
    """Handle various error types.

    Prints the error, then picks a recovery message by looking for a keyword
    in the error's text.

    Args:
        error: Error object or message; inspected via ``str(error)``.
        **kwargs: Extra handler arguments; unused.
    """
    print(f"Agent error: {error}")

    # Implement error-specific recovery logic
    # Lower-case once so both keyword checks are case-insensitive.
    message = str(error).lower()
    if "connection" in message:
        print("Connection error - attempting to reconnect...")
        # Implement reconnection logic
    elif "authentication" in message:
        print("Authentication error - check API key")
    else:
        print("Unknown error - logging for investigation")
659
660
def on_injection_refused(self, refusal, **kwargs):
    """Handle message injection refusals.

    Args:
        refusal: Refusal event; its ``message`` attribute is printed.
        **kwargs: Extra handler arguments; unused.
    """
    print(f"Message injection refused: {refusal.message}")
    # Implement fallback logic
664
665
# Bind the name up front so the cleanup below can tell whether a connection
# was ever created, without probing locals().
dg_connection = None

try:
    settings = SettingsOptions(
        # ... agent configuration
    )

    dg_connection = client.agent.websocket.v("1")
    dg_connection.on(AgentWebSocketEvents.Error, on_error)
    dg_connection.on(AgentWebSocketEvents.InjectionRefused, on_injection_refused)

    if dg_connection.start(settings):
        # Connection successful
        print("Agent started successfully")

        # Implement connection health monitoring
        # dg_connection.keep_alive()  # Send periodically

    else:
        print("Failed to start agent connection")

except DeepgramApiError as e:
    print(f"API Error: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")
finally:
    # Close whatever was opened, whether or not start() succeeded.
    if dg_connection is not None:
        dg_connection.close()
691
```