docs
0
# Realtime API
1
2
WebSocket-based realtime communication for low-latency conversational AI experiences with audio streaming, function calling, and interruption handling.
3
4
## Capabilities
5
6
### Create Realtime Session
7
8
Establish a WebSocket connection for realtime interaction.
9
10
```python { .api }
11
def create(
12
self,
13
*,
14
model: str,
15
modalities: list[str] | Omit = omit,
16
instructions: str | Omit = omit,
17
voice: str | Omit = omit,
18
input_audio_format: str | Omit = omit,
19
output_audio_format: str | Omit = omit,
20
input_audio_transcription: dict | Omit = omit,
21
turn_detection: dict | Omit = omit,
22
tools: list[dict] | Omit = omit,
23
tool_choice: str | Omit = omit,
24
temperature: float | Omit = omit,
25
max_response_output_tokens: int | str | Omit = omit,
26
extra_headers: dict[str, str] | None = None,
27
extra_query: dict[str, object] | None = None,
28
extra_body: dict[str, object] | None = None,
29
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
30
) -> RealtimeSession:
31
"""
32
Create a realtime session for WebSocket communication.
33
34
Args:
35
model: Model to use (e.g., "gpt-4o-realtime-preview").
36
37
modalities: Response modalities. Options: ["text", "audio"].
38
39
instructions: System instructions for the session.
40
41
voice: Voice for audio output. Options: "alloy", "echo", "fable",
42
"onyx", "nova", "shimmer".
43
44
input_audio_format: Input audio format. Options: "pcm16", "g711_ulaw",
45
"g711_alaw".
46
47
        output_audio_format: Output audio format. Options: "pcm16", "g711_ulaw",
            "g711_alaw".
49
50
input_audio_transcription: Enable input transcription.
51
{"model": "whisper-1"}
52
53
turn_detection: Turn detection configuration.
54
{"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300,
55
"silence_duration_ms": 200}
56
57
tools: Function tools available to the model.
58
59
tool_choice: Tool choice configuration. "auto", "none", "required".
60
61
temperature: Sampling temperature.
62
63
max_response_output_tokens: Maximum output tokens per response.
64
65
Returns:
66
RealtimeSession: WebSocket session configuration.
67
"""
68
```
69
70
Usage example:
71
72
```python
73
from openai import OpenAI
74
75
client = OpenAI()
76
77
# Create realtime session
78
session = client.beta.realtime.sessions.create(
79
model="gpt-4o-realtime-preview",
80
modalities=["text", "audio"],
81
voice="alloy",
82
instructions="You are a helpful assistant.",
83
input_audio_format="pcm16",
84
output_audio_format="pcm16",
85
turn_detection={
86
"type": "server_vad",
87
"threshold": 0.5,
88
"silence_duration_ms": 500
89
}
90
)
91
92
# The client secret is an ephemeral token used to authenticate the
# WebSocket connection; it is not itself a URL.
ephemeral_token = session.client_secret.value
ws_url = "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview"
94
95
# Use with WebSocket library (e.g., websockets)
96
import asyncio
97
import websockets
98
import json
99
100
async def realtime_conversation():
101
async with websockets.connect(ws_url) as websocket:
102
# Send audio input
103
await websocket.send(json.dumps({
104
"type": "input_audio_buffer.append",
105
"audio": base64_audio_data
106
}))
107
108
# Commit audio
109
await websocket.send(json.dumps({
110
"type": "input_audio_buffer.commit"
111
}))
112
113
# Receive responses
114
async for message in websocket:
115
event = json.loads(message)
116
117
if event["type"] == "response.audio.delta":
118
# Handle audio chunk
119
audio_chunk = event["delta"]
120
# Play or process audio
121
122
elif event["type"] == "response.text.delta":
123
# Handle text chunk
124
text = event["delta"]
125
print(text, end="", flush=True)
126
127
elif event["type"] == "response.done":
128
break
129
130
asyncio.run(realtime_conversation())
131
```
132
133
### Realtime Calls
134
135
Manage incoming and outgoing realtime voice calls with call control methods.
136
137
```python { .api }
138
# Accessed via: client.realtime.calls or client.beta.realtime.calls
139
def create(
140
self,
141
*,
142
model: str,
143
modalities: list[str] | Omit = omit,
144
instructions: str | Omit = omit,
145
voice: str | Omit = omit,
146
extra_headers: dict[str, str] | None = None,
147
extra_query: dict[str, object] | None = None,
148
extra_body: dict[str, object] | None = None,
149
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
150
) -> RealtimeCall:
151
"""
152
Initiate an outgoing realtime call.
153
154
Args:
155
model: Model to use for the call.
156
modalities: Response modalities (["audio"] recommended).
157
instructions: System instructions for the call.
158
voice: Voice for audio output.
159
160
Returns:
161
RealtimeCall: Call object with connection details.
162
"""
163
164
def accept(
165
self,
166
call_id: str,
167
*,
168
extra_headers: dict[str, str] | None = None,
169
extra_query: dict[str, object] | None = None,
170
extra_body: dict[str, object] | None = None,
171
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
172
) -> RealtimeCall:
173
"""
174
Accept an incoming realtime call.
175
176
Args:
177
call_id: The ID of the incoming call to accept.
178
179
Returns:
180
RealtimeCall: Call object with status "active".
181
"""
182
183
def hangup(
184
self,
185
call_id: str,
186
*,
187
extra_headers: dict[str, str] | None = None,
188
extra_query: dict[str, object] | None = None,
189
extra_body: dict[str, object] | None = None,
190
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
191
) -> RealtimeCall:
192
"""
193
End an active realtime call.
194
195
Args:
196
call_id: The ID of the call to hang up.
197
198
Returns:
199
RealtimeCall: Call object with status "completed".
200
"""
201
202
def refer(
203
self,
204
call_id: str,
205
*,
206
refer_to: str,
207
extra_headers: dict[str, str] | None = None,
208
extra_query: dict[str, object] | None = None,
209
extra_body: dict[str, object] | None = None,
210
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
211
) -> RealtimeCall:
212
"""
213
Transfer a realtime call to another destination.
214
215
Args:
216
call_id: The ID of the call to transfer.
217
refer_to: Destination identifier to transfer the call to.
218
219
Returns:
220
RealtimeCall: Call object with status "referred".
221
"""
222
223
def reject(
224
self,
225
call_id: str,
226
*,
227
reason: str | Omit = omit,
228
extra_headers: dict[str, str] | None = None,
229
extra_query: dict[str, object] | None = None,
230
extra_body: dict[str, object] | None = None,
231
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
232
) -> RealtimeCall:
233
"""
234
Reject an incoming realtime call.
235
236
Args:
237
call_id: The ID of the incoming call to reject.
238
reason: Optional reason for rejection.
239
240
Returns:
241
RealtimeCall: Call object with status "rejected".
242
"""
243
```
244
245
Usage example:
246
247
```python
248
from openai import OpenAI
249
250
client = OpenAI()
251
252
# Create an outgoing call (available at both paths)
253
call = client.realtime.calls.create(
254
model="gpt-4o-realtime-preview",
255
modalities=["audio"],
256
voice="alloy",
257
instructions="You are a helpful phone assistant."
258
)
259
260
print(f"Call ID: {call.id}, Status: {call.status}")
261
262
# Accept an incoming call
263
accepted_call = client.realtime.calls.accept("call_abc123")
264
265
# Transfer a call
266
referred_call = client.realtime.calls.refer(
267
call.id,
268
refer_to="destination_id"
269
)
270
271
# End a call
272
ended_call = client.realtime.calls.hangup(call.id)
273
print(f"Call ended: {ended_call.status}")
274
275
# Reject an incoming call
276
rejected_call = client.realtime.calls.reject(
277
"call_xyz789",
278
reason="User unavailable"
279
)
280
```
281
282
### Client Secrets
283
284
Create ephemeral client secrets for secure realtime session establishment.
285
286
```python { .api }
287
# Access via client.realtime.client_secrets
288
289
def create(
290
self,
291
*,
292
expires_after: dict | Omit = omit,
293
session: dict | Omit = omit,
294
extra_headers: dict[str, str] | None = None,
295
extra_query: dict[str, object] | None = None,
296
extra_body: dict[str, object] | None = None,
297
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
298
) -> ClientSecretCreateResponse:
299
"""
300
Create a Realtime client secret with an associated session configuration.
301
302
Args:
303
expires_after: Configuration for the client secret expiration.
304
Expiration refers to the time after which a client secret will
305
no longer be valid for creating sessions. The session itself may
306
continue after that time once started. A secret can be used to
307
create multiple sessions until it expires.
308
Example: {"anchor": "created_at", "seconds": 3600}
309
310
session: Session configuration to use for the client secret.
311
Choose either a realtime session or a transcription session.
312
Example for realtime: {
313
"type": "realtime",
314
"model": "gpt-4o-realtime-preview",
315
"voice": "alloy",
316
"modalities": ["text", "audio"]
317
}
318
Example for transcription: {
319
"type": "transcription",
320
"model": "whisper-1"
321
}
322
323
extra_headers: Additional HTTP headers.
324
extra_query: Additional query parameters.
325
extra_body: Additional JSON fields.
326
timeout: Request timeout in seconds.
327
328
Returns:
329
ClientSecretCreateResponse: Created client secret with value and
330
expiration time. Use the secret value to establish WebSocket
331
connections from client-side applications.
332
333
Notes:
334
- Client secrets enable secure browser-based realtime connections
335
- Secrets expire after specified duration
336
- One secret can establish multiple sessions until expiration
337
- Use for temporary, client-side authentication
338
"""
339
```
340
341
Usage example:
342
343
```python
344
from openai import OpenAI
345
346
client = OpenAI()
347
348
# Create client secret for realtime session
349
secret = client.realtime.client_secrets.create(
350
expires_after={
351
"anchor": "created_at",
352
"seconds": 3600 # Expires in 1 hour
353
},
354
session={
355
"type": "realtime",
356
"model": "gpt-4o-realtime-preview",
357
"voice": "alloy",
358
"modalities": ["text", "audio"],
359
"instructions": "You are a helpful voice assistant."
360
}
361
)
362
363
print(f"Client Secret: {secret.value}")
364
print(f"Expires At: {secret.expires_at}")
365
366
# Create client secret for transcription session
367
transcription_secret = client.realtime.client_secrets.create(
368
expires_after={
369
"anchor": "created_at",
370
"seconds": 1800 # Expires in 30 minutes
371
},
372
session={
373
"type": "transcription",
374
"model": "whisper-1",
375
"input_audio_format": "pcm16",
376
"input_audio_transcription": {
377
"model": "whisper-1"
378
}
379
}
380
)
381
382
# Use the secret client-side for WebSocket connection
383
# The secret value is passed as authentication to establish the connection
384
```
385
386
### Transcription Sessions
387
388
Create ephemeral API tokens for client-side realtime transcription applications.
389
390
```python { .api }
391
def create(
392
self,
393
*,
394
client_secret: dict | Omit = omit,
395
include: list[str] | Omit = omit,
396
input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | Omit = omit,
397
input_audio_noise_reduction: dict | Omit = omit,
398
input_audio_transcription: dict | Omit = omit,
399
modalities: list[Literal["text", "audio"]] | Omit = omit,
400
turn_detection: dict | Omit = omit,
401
extra_headers: dict[str, str] | None = None,
402
extra_query: dict[str, object] | None = None,
403
extra_body: dict[str, object] | None = None,
404
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
405
) -> TranscriptionSession:
406
"""
407
Create an ephemeral API token for client-side realtime transcriptions.
408
409
Returns a session object with a client_secret containing an ephemeral
410
API token for authenticating browser clients.
411
412
Args:
413
client_secret: Configuration options for the generated client secret.
414
415
include: Items to include in the transcription. Options:
416
- "item.input_audio_transcription.logprobs"
417
418
input_audio_format: Input audio format. Options: "pcm16", "g711_ulaw",
419
"g711_alaw". For pcm16, audio must be 16-bit PCM at 24kHz sample rate,
420
single channel (mono), little-endian byte order.
421
422
input_audio_noise_reduction: Configuration for input audio noise reduction.
423
Filters audio added to the input buffer before VAD and model processing.
424
Can improve VAD accuracy and model performance.
425
426
input_audio_transcription: Configuration for input audio transcription.
427
Can optionally set language and prompt for additional guidance.
428
429
modalities: Response modalities. Options: ["text"], ["audio"], or both.
430
To disable audio, set to ["text"].
431
432
turn_detection: Configuration for turn detection (Server VAD or Semantic VAD).
433
Set to null to turn off, requiring manual trigger of model response.
434
Server VAD detects speech based on audio volume. Semantic VAD uses
435
a turn detection model to estimate turn completion and dynamically
436
sets timeout based on probability.
437
438
Returns:
439
TranscriptionSession: Session with client_secret for browser authentication.
440
"""
441
```
442
443
**Usage Example:**
444
445
```python
446
from openai import OpenAI
447
448
client = OpenAI()
449
450
# Create transcription session for client-side use
451
session = client.beta.realtime.transcription_sessions.create(
452
input_audio_format="pcm16",
453
input_audio_transcription={
454
"model": "whisper-1",
455
"language": "en",
456
"prompt": "Technical discussion"
457
},
458
    input_audio_noise_reduction={
        "type": "near_field"  # or "far_field"
    },
461
    turn_detection={
        "type": "semantic_vad",
        "eagerness": "medium"  # semantic VAD uses "eagerness"; "threshold" applies to server_vad
    },
465
modalities=["text", "audio"],
466
include=["item.input_audio_transcription.logprobs"]
467
)
468
469
# Use the client_secret in browser/client application
470
ephemeral_token = session.client_secret.value
471
print(f"Session ID: {session.id}")
472
print(f"Token expires: {session.client_secret.expires_at}")
473
```
474
475
## Types
476
477
```python { .api }
478
from typing import Literal
479
from pydantic import BaseModel
480
481
class RealtimeSession(BaseModel):
482
"""Realtime session configuration."""
483
id: str
484
model: str
485
modalities: list[str]
486
instructions: str | None
487
voice: str | None
488
input_audio_format: str
489
output_audio_format: str
490
input_audio_transcription: dict | None
491
turn_detection: dict | None
492
tools: list[dict] | None
493
tool_choice: str
494
temperature: float | None
495
max_response_output_tokens: int | str | None
496
client_secret: ClientSecret
497
498
class ClientSecret(BaseModel):
499
"""WebSocket client secret."""
500
value: str
501
expires_at: int
502
503
class ClientSecretCreateResponse(BaseModel):
504
"""Response from creating a client secret."""
505
id: str
506
created_at: int
507
expires_at: int
508
value: str # The ephemeral client secret value
509
session: dict # Session configuration associated with this secret
510
```
511
512
## Event Types
513
514
WebSocket events for realtime communication:
515
516
- `session.created` - Session established
517
- `input_audio_buffer.append` - Add audio data
518
- `input_audio_buffer.commit` - Process buffered audio
519
- `input_audio_buffer.clear` - Clear buffer
520
- `conversation.item.create` - Add conversation item
521
- `response.create` - Request response
522
- `response.cancel` - Cancel current response
523
- `response.audio.delta` - Audio chunk received
524
- `response.text.delta` - Text chunk received
525
- `response.done` - Response completed
526
- `conversation.item.input_audio_transcription.completed` - Transcription ready
527
- `error` - Error occurred
528
529
## Best Practices
530
531
```python
532
import asyncio
533
import websockets
534
import json
535
import base64
536
537
async def realtime_session(session_url: str):
538
async with websockets.connect(session_url) as ws:
539
# Handle incoming events
540
async def receive_events():
541
async for message in ws:
542
event = json.loads(message)
543
544
if event["type"] == "response.audio.delta":
545
# Stream audio to speaker
546
audio_data = base64.b64decode(event["delta"])
547
play_audio(audio_data)
548
549
elif event["type"] == "response.text.delta":
550
# Display text
551
print(event["delta"], end="", flush=True)
552
553
# Send audio input
554
async def send_audio():
555
while True:
556
audio_chunk = record_audio_chunk()
557
await ws.send(json.dumps({
558
"type": "input_audio_buffer.append",
559
"audio": base64.b64encode(audio_chunk).decode()
560
}))
561
await asyncio.sleep(0.1)
562
563
# Run both tasks
564
await asyncio.gather(
565
receive_events(),
566
send_audio()
567
)
568
```
569
570
## Async Usage
571
572
```python
573
import asyncio
574
from openai import AsyncOpenAI
575
576
async def create_session():
577
client = AsyncOpenAI()
578
579
session = await client.beta.realtime.sessions.create(
580
model="gpt-4o-realtime-preview",
581
modalities=["audio"]
582
)
583
584
    return session.client_secret.value

# Note: the returned value is an ephemeral client secret used to
# authenticate the WebSocket connection, not a URL.
ephemeral_token = asyncio.run(create_session())
587
```
588