docs
0
# Beta Realtime API
1
2
Real-time audio and WebSocket-based AI interactions for building low-latency conversational voice applications. The API supports real-time session management and WebSocket connections for streaming audio.
3
4
## Capabilities
5
6
### Real-Time Connection Management
7
8
Establishes WebSocket connections for real-time communication with AI models, enabling low-latency voice and audio interactions.
9
10
```python { .api }
11
class BetaRealtime:
12
def connect(
13
self,
14
*,
15
model: str,
16
websocket_connection_options: WebsocketConnectionOptions = {},
17
**kwargs
18
) -> RealtimeConnectionManager:
19
"""
20
Create a real-time WebSocket connection to an AI model.
21
22
Args:
23
model: Model identifier for real-time communication
24
websocket_connection_options: WebSocket configuration options
25
**kwargs: Additional connection parameters
26
27
Returns:
28
RealtimeConnectionManager: Connection manager for real-time communication
29
"""
30
31
sessions: BetaSessions
32
33
class AsyncBetaRealtime:
34
def connect(
35
self,
36
*,
37
model: str,
38
websocket_connection_options: WebsocketConnectionOptions = {},
39
**kwargs
40
) -> AsyncRealtimeConnectionManager:
41
"""Async version of connect method."""
42
43
sessions: AsyncBetaSessions
44
```
45
46
### Real-Time Session Management
47
48
Create and manage real-time sessions with configurable audio formats, voice settings, and interaction parameters.
49
50
```python { .api }
51
class BetaSessions:
52
def create(
53
self,
54
*,
55
model: Any = "portkey-default",
56
input_audio_format: Union[Any, NotGiven] = NOT_GIVEN,
57
input_audio_transcription: Union[Any, NotGiven] = NOT_GIVEN,
58
instructions: Union[str, NotGiven] = NOT_GIVEN,
59
max_response_output_tokens: Union[int, Any, NotGiven] = NOT_GIVEN,
60
modalities: Union[List[Any], NotGiven] = NOT_GIVEN,
61
output_audio_format: Union[Any, NotGiven] = NOT_GIVEN,
62
temperature: Union[float, NotGiven] = NOT_GIVEN,
63
tool_choice: Union[str, NotGiven] = NOT_GIVEN,
64
tools: Union[Iterable[Any], NotGiven] = NOT_GIVEN,
65
turn_detection: Union[Any, NotGiven] = NOT_GIVEN,
66
voice: Union[Any, NotGiven] = NOT_GIVEN
67
) -> SessionCreateResponse:
68
"""
69
Create a real-time session for voice communication.
70
71
Args:
72
model: Model to use for the session
73
input_audio_format: Format for input audio (e.g., "pcm16", "g711_ulaw")
74
input_audio_transcription: Configuration for input audio transcription
75
instructions: System instructions for the AI assistant
76
max_response_output_tokens: Maximum tokens in response
77
modalities: Supported modalities (audio, text)
78
output_audio_format: Format for output audio
79
temperature: Response randomness (sampling temperature; OpenAI realtime models accept 0.6 to 1.2)
80
tool_choice: Tool selection strategy
81
tools: Available tools for the assistant
82
turn_detection: Turn detection configuration
83
voice: Voice model for audio output
84
85
Returns:
86
SessionCreateResponse: Session configuration and connection details
87
"""
88
89
class AsyncBetaSessions:
90
async def create(
91
self,
92
*,
93
model: Any = "portkey-default",
94
input_audio_format: Union[Any, NotGiven] = NOT_GIVEN,
95
input_audio_transcription: Union[Any, NotGiven] = NOT_GIVEN,
96
instructions: Union[str, NotGiven] = NOT_GIVEN,
97
max_response_output_tokens: Union[int, Any, NotGiven] = NOT_GIVEN,
98
modalities: Union[List[Any], NotGiven] = NOT_GIVEN,
99
output_audio_format: Union[Any, NotGiven] = NOT_GIVEN,
100
temperature: Union[float, NotGiven] = NOT_GIVEN,
101
tool_choice: Union[str, NotGiven] = NOT_GIVEN,
102
tools: Union[Iterable[Any], NotGiven] = NOT_GIVEN,
103
turn_detection: Union[Any, NotGiven] = NOT_GIVEN,
104
voice: Union[Any, NotGiven] = NOT_GIVEN
105
) -> SessionCreateResponse:
106
"""Async version of session creation."""
107
```
108
109
### Usage Examples
110
111
```python
112
from portkey_ai import Portkey
113
114
# Initialize client
115
portkey = Portkey(
116
api_key="PORTKEY_API_KEY",
117
virtual_key="VIRTUAL_KEY"
118
)
119
120
# Create a real-time session
121
session = portkey.beta.realtime.sessions.create(
122
model="gpt-4-realtime-preview",
123
modalities=["text", "audio"],
124
instructions="You are a helpful voice assistant.",
125
voice="alloy",
126
input_audio_format="pcm16",
127
output_audio_format="pcm16",
128
turn_detection={
129
"type": "server_vad",
130
"threshold": 0.5,
131
"prefix_padding_ms": 300,
132
"silence_duration_ms": 200
133
}
134
)
135
136
print(f"Session ID: {session.id}")
137
print(f"Model: {session.model}")
138
139
# Establish WebSocket connection
140
connection = portkey.beta.realtime.connect(
141
model="gpt-4-realtime-preview",
142
websocket_connection_options={
143
"timeout": 30,
144
"additional_headers": {
145
"Authorization": f"Bearer {portkey.api_key}"
146
}
147
}
148
)
149
150
# Use connection for real-time communication
151
# Note: Actual usage would involve WebSocket event handling
152
with connection as conn:
153
# Send audio data
154
conn.send_audio_data(audio_bytes)
155
156
# Handle responses
157
for event in conn.listen():
158
if event.type == "response.audio.delta":
159
# Process audio response
160
process_audio_chunk(event.delta)
161
elif event.type == "response.text.delta":
162
# Process text response
163
print(event.delta, end="")
164
```
165
166
### Async Usage
167
168
```python
169
import asyncio
170
from portkey_ai import AsyncPortkey
171
172
async def create_realtime_session():
173
portkey = AsyncPortkey(
174
api_key="PORTKEY_API_KEY",
175
virtual_key="VIRTUAL_KEY"
176
)
177
178
# Create session asynchronously
179
session = await portkey.beta.realtime.sessions.create(
180
model="gpt-4-realtime-preview",
181
modalities=["text", "audio"],
182
instructions="You are a voice assistant for customer support.",
183
voice="nova",
184
temperature=0.7,
185
max_response_output_tokens=150
186
)
187
188
# Create the async connection manager (the WebSocket opens when it is entered with `async with`)
189
connection = portkey.beta.realtime.connect(
190
model="gpt-4-realtime-preview"
191
)
192
193
return session, connection
194
195
# Run async function
196
session, connection = asyncio.run(create_realtime_session())
197
```
198
199
### Advanced Configuration
200
201
```python
202
# Configure detailed session parameters
203
session = portkey.beta.realtime.sessions.create(
204
model="gpt-4-realtime-preview",
205
modalities=["text", "audio"],
206
instructions="""
207
You are an AI assistant for a language learning app.
208
Help users practice pronunciation and provide feedback.
209
Speak clearly and at a moderate pace.
210
""",
211
voice="shimmer",
212
input_audio_format="pcm16",
213
output_audio_format="pcm16",
214
input_audio_transcription={
215
"model": "whisper-1"
216
},
217
turn_detection={
218
"type": "server_vad",
219
"threshold": 0.6,
220
"prefix_padding_ms": 300,
221
"silence_duration_ms": 500
222
},
223
tools=[
224
{
225
"type": "function",
226
"name": "pronunciation_feedback",
227
"description": "Provide pronunciation feedback",
228
"parameters": {
229
"type": "object",
230
"properties": {
231
"word": {"type": "string"},
232
"accuracy": {"type": "number"},
233
"feedback": {"type": "string"}
234
}
235
}
236
}
237
],
238
tool_choice="auto",
239
temperature=0.3,
240
max_response_output_tokens=100
241
)
242
```
243
244
## Types
245
246
```python { .api }
247
class SessionCreateResponse:
248
"""Response from real-time session creation"""
249
id: str # Session identifier
250
object: str # "realtime.session"
251
model: str # Model used for the session
252
modalities: List[str] # Supported modalities
253
instructions: str # System instructions
254
voice: str # Voice model
255
input_audio_format: str # Input audio format
256
output_audio_format: str # Output audio format
257
input_audio_transcription: dict # Transcription settings
258
turn_detection: dict # Turn detection configuration
259
tools: List[dict] # Available tools
260
tool_choice: str # Tool selection strategy
261
temperature: float # Response temperature
262
max_response_output_tokens: int # Token limit
263
_headers: Optional[dict] # Response headers
264
265
class RealtimeConnectionManager:
266
"""Synchronous WebSocket connection manager"""
267
def send_audio_data(self, audio_bytes: bytes) -> None: ...
268
def listen(self) -> Iterator[RealtimeEvent]: ...
269
def close(self) -> None: ...
270
271
class AsyncRealtimeConnectionManager:
272
"""Asynchronous WebSocket connection manager"""
273
async def send_audio_data(self, audio_bytes: bytes) -> None: ...
274
async def listen(self) -> AsyncIterator[RealtimeEvent]: ...
275
async def close(self) -> None: ...
276
277
class WebsocketConnectionOptions:
278
"""WebSocket connection configuration"""
279
timeout: Optional[int] # Connection timeout in seconds
280
additional_headers: Optional[dict] # Additional headers
281
# Additional WebSocket-specific options
282
283
class RealtimeEvent:
284
"""Real-time event from WebSocket connection"""
285
type: str # Event type
286
delta: Optional[str] # Content delta for streaming
287
audio: Optional[bytes] # Audio data
288
# Additional event-specific fields
289
```