docs
0
# Realtime API
1
2
WebSocket-based realtime communication for low-latency conversational AI experiences with audio streaming, function calling, and interruption handling.
3
4
## Capabilities
5
6
### Create Realtime Session
7
8
Establish a WebSocket connection for realtime interaction.
9
10
```python { .api }
11
def create(
12
self,
13
*,
14
model: str,
15
modalities: list[str] | Omit = omit,
16
instructions: str | Omit = omit,
17
voice: str | Omit = omit,
18
input_audio_format: str | Omit = omit,
19
output_audio_format: str | Omit = omit,
20
input_audio_transcription: dict | Omit = omit,
21
turn_detection: dict | Omit = omit,
22
tools: list[dict] | Omit = omit,
23
tool_choice: str | Omit = omit,
24
temperature: float | Omit = omit,
25
max_response_output_tokens: int | str | Omit = omit,
26
extra_headers: dict[str, str] | None = None,
27
extra_query: dict[str, object] | None = None,
28
extra_body: dict[str, object] | None = None,
29
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
30
) -> RealtimeSession:
31
"""
32
Create a realtime session for WebSocket communication.
33
34
Args:
35
model: Model to use (e.g., "gpt-4o-realtime-preview").
36
37
modalities: Response modalities. Options: ["text", "audio"].
38
39
instructions: System instructions for the session.
40
41
voice: Voice for audio output. Options: "alloy", "echo", "fable",
42
"onyx", "nova", "shimmer".
43
44
input_audio_format: Input audio format. Options: "pcm16", "g711_ulaw",
45
"g711_alaw".
46
47
        output_audio_format: Output audio format. Options: "pcm16", "g711_ulaw",
            "g711_alaw".
49
50
input_audio_transcription: Enable input transcription.
51
{"model": "whisper-1"}
52
53
turn_detection: Turn detection configuration.
54
{"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300,
55
"silence_duration_ms": 200}
56
57
tools: Function tools available to the model.
58
59
tool_choice: Tool choice configuration. "auto", "none", "required".
60
61
temperature: Sampling temperature.
62
63
max_response_output_tokens: Maximum output tokens per response.
64
65
Returns:
66
RealtimeSession: WebSocket session configuration.
67
"""
68
```
69
70
Usage example:
71
72
```python
73
from openai import OpenAI
74
75
client = OpenAI()
76
77
# Create realtime session
78
session = client.beta.realtime.sessions.create(
79
model="gpt-4o-realtime-preview",
80
modalities=["text", "audio"],
81
voice="alloy",
82
instructions="You are a helpful assistant.",
83
input_audio_format="pcm16",
84
output_audio_format="pcm16",
85
turn_detection={
86
"type": "server_vad",
87
"threshold": 0.5,
88
"silence_duration_ms": 500
89
}
90
)
91
92
# The client secret is an ephemeral token used to authenticate the
# WebSocket connection; it is not itself a URL.
ephemeral_token = session.client_secret.value
ws_url = "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview"
94
95
# Use with WebSocket library (e.g., websockets)
96
import asyncio
97
import websockets
98
import json
99
100
async def realtime_conversation():
101
async with websockets.connect(ws_url) as websocket:
102
# Send audio input
103
await websocket.send(json.dumps({
104
"type": "input_audio_buffer.append",
105
"audio": base64_audio_data
106
}))
107
108
# Commit audio
109
await websocket.send(json.dumps({
110
"type": "input_audio_buffer.commit"
111
}))
112
113
# Receive responses
114
async for message in websocket:
115
event = json.loads(message)
116
117
if event["type"] == "response.audio.delta":
118
# Handle audio chunk
119
audio_chunk = event["delta"]
120
# Play or process audio
121
122
elif event["type"] == "response.text.delta":
123
# Handle text chunk
124
text = event["delta"]
125
print(text, end="", flush=True)
126
127
elif event["type"] == "response.done":
128
break
129
130
asyncio.run(realtime_conversation())
131
```
132
133
### Realtime Calls
134
135
Manage incoming and outgoing realtime voice calls with call control methods.
136
137
```python { .api }
138
# Accessed via: client.realtime.calls or client.beta.realtime.calls
139
def create(
140
self,
141
*,
142
model: str,
143
modalities: list[str] | Omit = omit,
144
instructions: str | Omit = omit,
145
voice: str | Omit = omit,
146
extra_headers: dict[str, str] | None = None,
147
extra_query: dict[str, object] | None = None,
148
extra_body: dict[str, object] | None = None,
149
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
150
) -> RealtimeCall:
151
"""
152
Initiate an outgoing realtime call.
153
154
Args:
155
model: Model to use for the call.
156
modalities: Response modalities (["audio"] recommended).
157
instructions: System instructions for the call.
158
voice: Voice for audio output.
159
160
Returns:
161
RealtimeCall: Call object with connection details.
162
"""
163
164
def accept(
165
self,
166
call_id: str,
167
*,
168
extra_headers: dict[str, str] | None = None,
169
extra_query: dict[str, object] | None = None,
170
extra_body: dict[str, object] | None = None,
171
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
172
) -> RealtimeCall:
173
"""
174
Accept an incoming realtime call.
175
176
Args:
177
call_id: The ID of the incoming call to accept.
178
179
Returns:
180
RealtimeCall: Call object with status "active".
181
"""
182
183
def hangup(
184
self,
185
call_id: str,
186
*,
187
extra_headers: dict[str, str] | None = None,
188
extra_query: dict[str, object] | None = None,
189
extra_body: dict[str, object] | None = None,
190
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
191
) -> RealtimeCall:
192
"""
193
End an active realtime call.
194
195
Args:
196
call_id: The ID of the call to hang up.
197
198
Returns:
199
RealtimeCall: Call object with status "completed".
200
"""
201
202
def refer(
203
self,
204
call_id: str,
205
*,
206
refer_to: str,
207
extra_headers: dict[str, str] | None = None,
208
extra_query: dict[str, object] | None = None,
209
extra_body: dict[str, object] | None = None,
210
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
211
) -> RealtimeCall:
212
"""
213
Transfer a realtime call to another destination.
214
215
Args:
216
call_id: The ID of the call to transfer.
217
refer_to: Destination identifier to transfer the call to.
218
219
Returns:
220
RealtimeCall: Call object with status "referred".
221
"""
222
223
def reject(
224
self,
225
call_id: str,
226
*,
227
reason: str | Omit = omit,
228
extra_headers: dict[str, str] | None = None,
229
extra_query: dict[str, object] | None = None,
230
extra_body: dict[str, object] | None = None,
231
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
232
) -> RealtimeCall:
233
"""
234
Reject an incoming realtime call.
235
236
Args:
237
call_id: The ID of the incoming call to reject.
238
reason: Optional reason for rejection.
239
240
Returns:
241
RealtimeCall: Call object with status "rejected".
242
"""
243
```
244
245
Usage example:
246
247
```python
248
from openai import OpenAI
249
250
client = OpenAI()
251
252
# Create an outgoing call (available at both paths)
253
call = client.realtime.calls.create(
254
model="gpt-4o-realtime-preview",
255
modalities=["audio"],
256
voice="alloy",
257
instructions="You are a helpful phone assistant."
258
)
259
260
print(f"Call ID: {call.id}, Status: {call.status}")
261
262
# Accept an incoming call
263
accepted_call = client.realtime.calls.accept("call_abc123")
264
265
# Transfer a call
266
referred_call = client.realtime.calls.refer(
267
call.id,
268
refer_to="destination_id"
269
)
270
271
# End a call
272
ended_call = client.realtime.calls.hangup(call.id)
273
print(f"Call ended: {ended_call.status}")
274
275
# Reject an incoming call
276
rejected_call = client.realtime.calls.reject(
277
"call_xyz789",
278
reason="User unavailable"
279
)
280
```
281
282
### Client Secrets
283
284
Create ephemeral client secrets for secure realtime session establishment.
285
286
```python { .api }
287
# Access via client.realtime.client_secrets
288
289
def create(
290
self,
291
*,
292
expires_after: dict | Omit = omit,
293
session: dict | Omit = omit,
294
extra_headers: dict[str, str] | None = None,
295
extra_query: dict[str, object] | None = None,
296
extra_body: dict[str, object] | None = None,
297
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
298
) -> ClientSecretCreateResponse:
299
"""
300
Create a Realtime client secret with an associated session configuration.
301
302
Args:
303
expires_after: Configuration for the client secret expiration.
304
Expiration refers to the time after which a client secret will
305
no longer be valid for creating sessions. The session itself may
306
continue after that time once started. A secret can be used to
307
create multiple sessions until it expires.
308
Example: {"anchor": "created_at", "seconds": 3600}
309
310
session: Session configuration to use for the client secret.
311
Choose either a realtime session or a transcription session.
312
Example for realtime: {
313
"type": "realtime",
314
"model": "gpt-4o-realtime-preview",
315
"voice": "alloy",
316
"modalities": ["text", "audio"]
317
}
318
Example for transcription: {
319
"type": "transcription",
320
"model": "whisper-1"
321
}
322
323
extra_headers: Additional HTTP headers.
324
extra_query: Additional query parameters.
325
extra_body: Additional JSON fields.
326
timeout: Request timeout in seconds.
327
328
Returns:
329
ClientSecretCreateResponse: Created client secret with value and
330
expiration time. Use the secret value to establish WebSocket
331
connections from client-side applications.
332
333
Notes:
334
- Client secrets enable secure browser-based realtime connections
335
- Secrets expire after specified duration
336
- One secret can establish multiple sessions until expiration
337
- Use for temporary, client-side authentication
338
"""
339
```
340
341
Usage example:
342
343
```python
344
from openai import OpenAI
345
346
client = OpenAI()
347
348
# Create client secret for realtime session
349
secret = client.realtime.client_secrets.create(
350
expires_after={
351
"anchor": "created_at",
352
"seconds": 3600 # Expires in 1 hour
353
},
354
session={
355
"type": "realtime",
356
"model": "gpt-4o-realtime-preview",
357
"voice": "alloy",
358
"modalities": ["text", "audio"],
359
"instructions": "You are a helpful voice assistant."
360
}
361
)
362
363
print(f"Client Secret: {secret.value}")
364
print(f"Expires At: {secret.expires_at}")
365
366
# Create client secret for transcription session
367
transcription_secret = client.realtime.client_secrets.create(
368
expires_after={
369
"anchor": "created_at",
370
"seconds": 1800 # Expires in 30 minutes
371
},
372
session={
373
"type": "transcription",
374
"model": "whisper-1",
375
"input_audio_format": "pcm16",
376
"input_audio_transcription": {
377
"model": "whisper-1"
378
}
379
}
380
)
381
382
# Use the secret client-side for WebSocket connection
383
# The secret value is passed as authentication to establish the connection
384
```
385
386
### Transcription Sessions
387
388
Create ephemeral API tokens for client-side realtime transcription applications.
389
390
```python { .api }
391
def create(
392
self,
393
*,
394
client_secret: dict | Omit = omit,
395
include: list[str] | Omit = omit,
396
input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | Omit = omit,
397
input_audio_noise_reduction: dict | Omit = omit,
398
input_audio_transcription: dict | Omit = omit,
399
modalities: list[Literal["text", "audio"]] | Omit = omit,
400
turn_detection: dict | Omit = omit,
401
extra_headers: dict[str, str] | None = None,
402
extra_query: dict[str, object] | None = None,
403
extra_body: dict[str, object] | None = None,
404
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
405
) -> TranscriptionSession:
406
"""
407
Create an ephemeral API token for client-side realtime transcriptions.
408
409
Returns a session object with a client_secret containing an ephemeral
410
API token for authenticating browser clients.
411
412
Args:
413
client_secret: Configuration options for the generated client secret.
414
415
include: Items to include in the transcription. Options:
416
- "item.input_audio_transcription.logprobs"
417
418
input_audio_format: Input audio format. Options: "pcm16", "g711_ulaw",
419
"g711_alaw". For pcm16, audio must be 16-bit PCM at 24kHz sample rate,
420
single channel (mono), little-endian byte order.
421
422
input_audio_noise_reduction: Configuration for input audio noise reduction.
423
Filters audio added to the input buffer before VAD and model processing.
424
Can improve VAD accuracy and model performance.
425
426
input_audio_transcription: Configuration for input audio transcription.
427
Can optionally set language and prompt for additional guidance.
428
429
modalities: Response modalities. Options: ["text"], ["audio"], or both.
430
To disable audio, set to ["text"].
431
432
turn_detection: Configuration for turn detection (Server VAD or Semantic VAD).
433
Set to null to turn off, requiring manual trigger of model response.
434
Server VAD detects speech based on audio volume. Semantic VAD uses
435
a turn detection model to estimate turn completion and dynamically
436
sets timeout based on probability.
437
438
Returns:
439
TranscriptionSession: Session with client_secret for browser authentication.
440
"""
441
```
442
443
**Usage Example:**
444
445
```python
446
from openai import OpenAI
447
448
client = OpenAI()
449
450
# Create transcription session for client-side use
451
session = client.beta.realtime.transcription_sessions.create(
452
input_audio_format="pcm16",
453
input_audio_transcription={
454
"model": "whisper-1",
455
"language": "en",
456
"prompt": "Technical discussion"
457
},
458
    input_audio_noise_reduction={
        "type": "near_field"  # or "far_field"
    },
461
    turn_detection={
        "type": "semantic_vad",
        "eagerness": "medium"  # semantic VAD uses "eagerness"; "threshold" applies to server_vad
    },
465
modalities=["text", "audio"],
466
include=["item.input_audio_transcription.logprobs"]
467
)
468
469
# Use the client_secret in browser/client application
470
ephemeral_token = session.client_secret.value
471
print(f"Session ID: {session.id}")
472
print(f"Token expires: {session.client_secret.expires_at}")
473
```
474
475
## Types
476
477
```python { .api }
478
from typing import Literal
479
from pydantic import BaseModel
480
481
class RealtimeSession(BaseModel):
482
"""Realtime session configuration."""
483
id: str
484
model: str
485
modalities: list[str]
486
instructions: str | None
487
voice: str | None
488
input_audio_format: str
489
output_audio_format: str
490
input_audio_transcription: dict | None
491
turn_detection: dict | None
492
tools: list[dict] | None
493
tool_choice: str
494
temperature: float | None
495
max_response_output_tokens: int | str | None
496
client_secret: ClientSecret
497
498
class ClientSecret(BaseModel):
499
"""WebSocket client secret."""
500
value: str
501
expires_at: int
502
503
class ClientSecretCreateResponse(BaseModel):
504
"""Response from creating a client secret."""
505
id: str
506
created_at: int
507
expires_at: int
508
value: str # The ephemeral client secret value
509
session: dict # Session configuration associated with this secret
510
```
511
512
## Event Types
513
514
WebSocket events for realtime communication:
515
516
- `session.created` - Session established
517
- `input_audio_buffer.append` - Add audio data
518
- `input_audio_buffer.commit` - Process buffered audio
519
- `input_audio_buffer.clear` - Clear buffer
520
- `conversation.item.create` - Add conversation item
521
- `response.create` - Request response
522
- `response.cancel` - Cancel current response
523
- `response.audio.delta` - Audio chunk received
524
- `response.text.delta` - Text chunk received
525
- `response.done` - Response completed
526
- `conversation.item.input_audio_transcription.completed` - Transcription ready
527
- `error` - Error occurred
528
529
## Best Practices
530
531
```python
532
import asyncio
533
import websockets
534
import json
535
import base64
536
537
async def realtime_session(session_url: str):
538
async with websockets.connect(session_url) as ws:
539
# Handle incoming events
540
async def receive_events():
541
async for message in ws:
542
event = json.loads(message)
543
544
if event["type"] == "response.audio.delta":
545
# Stream audio to speaker
546
audio_data = base64.b64decode(event["delta"])
547
play_audio(audio_data)
548
549
elif event["type"] == "response.text.delta":
550
# Display text
551
print(event["delta"], end="", flush=True)
552
553
# Send audio input
554
async def send_audio():
555
while True:
556
audio_chunk = record_audio_chunk()
557
await ws.send(json.dumps({
558
"type": "input_audio_buffer.append",
559
"audio": base64.b64encode(audio_chunk).decode()
560
}))
561
await asyncio.sleep(0.1)
562
563
# Run both tasks
564
await asyncio.gather(
565
receive_events(),
566
send_audio()
567
)
568
```
569
570
## Async Usage
571
572
```python
573
import asyncio
574
from openai import AsyncOpenAI
575
576
async def create_session():
577
client = AsyncOpenAI()
578
579
session = await client.beta.realtime.sessions.create(
580
model="gpt-4o-realtime-preview",
581
modalities=["audio"]
582
)
583
584
    return session.client_secret.value

# Note: the returned value is an ephemeral client secret used to
# authenticate the WebSocket connection, not a URL.
ephemeral_token = asyncio.run(create_session())
587
```
588