# Realtime API

WebSocket-based realtime communication for low-latency conversational AI experiences with audio streaming, function calling, and interruption handling.

## Capabilities

### Create Realtime Session

Establish a WebSocket connection for realtime interaction.

```python { .api }
def create(
    self,
    *,
    model: str,
    modalities: list[str] | Omit = omit,
    instructions: str | Omit = omit,
    voice: str | Omit = omit,
    input_audio_format: str | Omit = omit,
    output_audio_format: str | Omit = omit,
    input_audio_transcription: dict | Omit = omit,
    turn_detection: dict | Omit = omit,
    tools: list[dict] | Omit = omit,
    tool_choice: str | Omit = omit,
    temperature: float | Omit = omit,
    max_response_output_tokens: int | str | Omit = omit,
    extra_headers: dict[str, str] | None = None,
    extra_query: dict[str, object] | None = None,
    extra_body: dict[str, object] | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> RealtimeSession:
    """
    Create a realtime session for WebSocket communication.

    Args:
        model: Model to use (e.g., "gpt-4o-realtime-preview").

        modalities: Response modalities. Options: ["text", "audio"].

        instructions: System instructions for the session.

        voice: Voice for audio output. Options: "alloy", "echo", "fable",
            "onyx", "nova", "shimmer".

        input_audio_format: Input audio format. Options: "pcm16", "g711_ulaw",
            "g711_alaw".

        output_audio_format: Output audio format. Options: "pcm16", "g711_ulaw",
            "g711_alaw", "mp3", "opus".

        input_audio_transcription: Enable input transcription.
            {"model": "whisper-1"}

        turn_detection: Turn detection configuration.
            {"type": "server_vad", "threshold": 0.5, "prefix_padding_ms": 300,
            "silence_duration_ms": 200}

        tools: Function tools available to the model.

        tool_choice: Tool choice configuration. "auto", "none", "required".

        temperature: Sampling temperature.

        max_response_output_tokens: Maximum output tokens per response.

    Returns:
        RealtimeSession: WebSocket session configuration.
    """
```

Usage example:

```python
from openai import OpenAI

client = OpenAI()

# Create realtime session
session = client.beta.realtime.sessions.create(
    model="gpt-4o-realtime-preview",
    modalities=["text", "audio"],
    voice="alloy",
    instructions="You are a helpful assistant.",
    input_audio_format="pcm16",
    output_audio_format="pcm16",
    turn_detection={
        "type": "server_vad",
        "threshold": 0.5,
        "silence_duration_ms": 500
    }
)

# Access WebSocket URL
ws_url = session.client_secret.value

# Use with WebSocket library (e.g., websockets)
import asyncio
import websockets
import json

async def realtime_conversation():
    async with websockets.connect(ws_url) as websocket:
        # Send audio input
        await websocket.send(json.dumps({
            "type": "input_audio_buffer.append",
            "audio": base64_audio_data
        }))

        # Commit audio
        await websocket.send(json.dumps({
            "type": "input_audio_buffer.commit"
        }))

        # Receive responses
        async for message in websocket:
            event = json.loads(message)

            if event["type"] == "response.audio.delta":
                # Handle audio chunk
                audio_chunk = event["delta"]
                # Play or process audio

            elif event["type"] == "response.text.delta":
                # Handle text chunk
                text = event["delta"]
                print(text, end="", flush=True)

            elif event["type"] == "response.done":
                break

asyncio.run(realtime_conversation())
```

### Realtime Calls

Manage incoming and outgoing realtime voice calls with call control methods.

```python { .api }
# Accessed via: client.realtime.calls or client.beta.realtime.calls
def create(
    self,
    *,
    model: str,
    modalities: list[str] | Omit = omit,
    instructions: str | Omit = omit,
    voice: str | Omit = omit,
    extra_headers: dict[str, str] | None = None,
    extra_query: dict[str, object] | None = None,
    extra_body: dict[str, object] | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> RealtimeCall:
    """
    Initiate an outgoing realtime call.

    Args:
        model: Model to use for the call.
        modalities: Response modalities (["audio"] recommended).
        instructions: System instructions for the call.
        voice: Voice for audio output.

    Returns:
        RealtimeCall: Call object with connection details.
    """

def accept(
    self,
    call_id: str,
    *,
    extra_headers: dict[str, str] | None = None,
    extra_query: dict[str, object] | None = None,
    extra_body: dict[str, object] | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> RealtimeCall:
    """
    Accept an incoming realtime call.

    Args:
        call_id: The ID of the incoming call to accept.

    Returns:
        RealtimeCall: Call object with status "active".
    """

def hangup(
    self,
    call_id: str,
    *,
    extra_headers: dict[str, str] | None = None,
    extra_query: dict[str, object] | None = None,
    extra_body: dict[str, object] | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> RealtimeCall:
    """
    End an active realtime call.

    Args:
        call_id: The ID of the call to hang up.

    Returns:
        RealtimeCall: Call object with status "completed".
    """

def refer(
    self,
    call_id: str,
    *,
    refer_to: str,
    extra_headers: dict[str, str] | None = None,
    extra_query: dict[str, object] | None = None,
    extra_body: dict[str, object] | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> RealtimeCall:
    """
    Transfer a realtime call to another destination.

    Args:
        call_id: The ID of the call to transfer.
        refer_to: Destination identifier to transfer the call to.

    Returns:
        RealtimeCall: Call object with status "referred".
    """

def reject(
    self,
    call_id: str,
    *,
    reason: str | Omit = omit,
    extra_headers: dict[str, str] | None = None,
    extra_query: dict[str, object] | None = None,
    extra_body: dict[str, object] | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> RealtimeCall:
    """
    Reject an incoming realtime call.

    Args:
        call_id: The ID of the incoming call to reject.
        reason: Optional reason for rejection.

    Returns:
        RealtimeCall: Call object with status "rejected".
    """
```

Usage example:

```python
from openai import OpenAI

client = OpenAI()

# Create an outgoing call (available at both paths)
call = client.realtime.calls.create(
    model="gpt-4o-realtime-preview",
    modalities=["audio"],
    voice="alloy",
    instructions="You are a helpful phone assistant."
)

print(f"Call ID: {call.id}, Status: {call.status}")

# Accept an incoming call
accepted_call = client.realtime.calls.accept("call_abc123")

# Transfer a call
referred_call = client.realtime.calls.refer(
    call.id,
    refer_to="destination_id"
)

# End a call
ended_call = client.realtime.calls.hangup(call.id)
print(f"Call ended: {ended_call.status}")

# Reject an incoming call
rejected_call = client.realtime.calls.reject(
    "call_xyz789",
    reason="User unavailable"
)
```

### Client Secrets

Create ephemeral client secrets for secure realtime session establishment.

```python { .api }
# Access via client.realtime.client_secrets

def create(
    self,
    *,
    expires_after: dict | Omit = omit,
    session: dict | Omit = omit,
    extra_headers: dict[str, str] | None = None,
    extra_query: dict[str, object] | None = None,
    extra_body: dict[str, object] | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> ClientSecretCreateResponse:
    """
    Create a Realtime client secret with an associated session configuration.

    Args:
        expires_after: Configuration for the client secret expiration.
            Expiration refers to the time after which a client secret will
            no longer be valid for creating sessions. The session itself may
            continue after that time once started. A secret can be used to
            create multiple sessions until it expires.
            Example: {"anchor": "created_at", "seconds": 3600}

        session: Session configuration to use for the client secret.
            Choose either a realtime session or a transcription session.
            Example for realtime: {
                "type": "realtime",
                "model": "gpt-4o-realtime-preview",
                "voice": "alloy",
                "modalities": ["text", "audio"]
            }
            Example for transcription: {
                "type": "transcription",
                "model": "whisper-1"
            }

        extra_headers: Additional HTTP headers.
        extra_query: Additional query parameters.
        extra_body: Additional JSON fields.
        timeout: Request timeout in seconds.

    Returns:
        ClientSecretCreateResponse: Created client secret with value and
        expiration time. Use the secret value to establish WebSocket
        connections from client-side applications.

    Notes:
        - Client secrets enable secure browser-based realtime connections
        - Secrets expire after specified duration
        - One secret can establish multiple sessions until expiration
        - Use for temporary, client-side authentication
    """
```

Usage example:

```python
from openai import OpenAI

client = OpenAI()

# Create client secret for realtime session
secret = client.realtime.client_secrets.create(
    expires_after={
        "anchor": "created_at",
        "seconds": 3600  # Expires in 1 hour
    },
    session={
        "type": "realtime",
        "model": "gpt-4o-realtime-preview",
        "voice": "alloy",
        "modalities": ["text", "audio"],
        "instructions": "You are a helpful voice assistant."
    }
)

print(f"Client Secret: {secret.value}")
print(f"Expires At: {secret.expires_at}")

# Create client secret for transcription session
transcription_secret = client.realtime.client_secrets.create(
    expires_after={
        "anchor": "created_at",
        "seconds": 1800  # Expires in 30 minutes
    },
    session={
        "type": "transcription",
        "model": "whisper-1",
        "input_audio_format": "pcm16",
        "input_audio_transcription": {
            "model": "whisper-1"
        }
    }
)

# Use the secret client-side for WebSocket connection
# The secret value is passed as authentication to establish the connection
```

### Transcription Sessions

Create ephemeral API tokens for client-side realtime transcription applications.

```python { .api }
def create(
    self,
    *,
    client_secret: dict | Omit = omit,
    include: list[str] | Omit = omit,
    input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | Omit = omit,
    input_audio_noise_reduction: dict | Omit = omit,
    input_audio_transcription: dict | Omit = omit,
    modalities: list[Literal["text", "audio"]] | Omit = omit,
    turn_detection: dict | Omit = omit,
    extra_headers: dict[str, str] | None = None,
    extra_query: dict[str, object] | None = None,
    extra_body: dict[str, object] | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> TranscriptionSession:
    """
    Create an ephemeral API token for client-side realtime transcriptions.

    Returns a session object with a client_secret containing an ephemeral
    API token for authenticating browser clients.

    Args:
        client_secret: Configuration options for the generated client secret.

        include: Items to include in the transcription. Options:
            - "item.input_audio_transcription.logprobs"

        input_audio_format: Input audio format. Options: "pcm16", "g711_ulaw",
            "g711_alaw". For pcm16, audio must be 16-bit PCM at 24kHz sample rate,
            single channel (mono), little-endian byte order.

        input_audio_noise_reduction: Configuration for input audio noise reduction.
            Filters audio added to the input buffer before VAD and model processing.
            Can improve VAD accuracy and model performance.

        input_audio_transcription: Configuration for input audio transcription.
            Can optionally set language and prompt for additional guidance.

        modalities: Response modalities. Options: ["text"], ["audio"], or both.
            To disable audio, set to ["text"].

        turn_detection: Configuration for turn detection (Server VAD or Semantic VAD).
            Set to null to turn off, requiring manual trigger of model response.
            Server VAD detects speech based on audio volume. Semantic VAD uses
            a turn detection model to estimate turn completion and dynamically
            sets timeout based on probability.

    Returns:
        TranscriptionSession: Session with client_secret for browser authentication.
    """
```

**Usage Example:**

```python
from openai import OpenAI

client = OpenAI()

# Create transcription session for client-side use
session = client.beta.realtime.transcription_sessions.create(
    input_audio_format="pcm16",
    input_audio_transcription={
        "model": "whisper-1",
        "language": "en",
        "prompt": "Technical discussion"
    },
    input_audio_noise_reduction={
        "type": "default"
    },
    turn_detection={
        "type": "semantic_vad",
        "threshold": 0.6
    },
    modalities=["text", "audio"],
    include=["item.input_audio_transcription.logprobs"]
)

# Use the client_secret in browser/client application
ephemeral_token = session.client_secret.value
print(f"Session ID: {session.id}")
print(f"Token expires: {session.client_secret.expires_at}")
```

## Types

```python { .api }
from typing import Literal
from pydantic import BaseModel

class RealtimeSession(BaseModel):
    """Realtime session configuration."""
    id: str
    model: str
    modalities: list[str]
    instructions: str | None
    voice: str | None
    input_audio_format: str
    output_audio_format: str
    input_audio_transcription: dict | None
    turn_detection: dict | None
    tools: list[dict] | None
    tool_choice: str
    temperature: float | None
    max_response_output_tokens: int | str | None
    client_secret: ClientSecret

class ClientSecret(BaseModel):
    """WebSocket client secret."""
    value: str
    expires_at: int

class ClientSecretCreateResponse(BaseModel):
    """Response from creating a client secret."""
    id: str
    created_at: int
    expires_at: int
    value: str  # The ephemeral client secret value
    session: dict  # Session configuration associated with this secret
```

## Event Types

WebSocket events for realtime communication:

- `session.created` - Session established
- `input_audio_buffer.append` - Add audio data
- `input_audio_buffer.commit` - Process buffered audio
- `input_audio_buffer.clear` - Clear buffer
- `conversation.item.create` - Add conversation item
- `response.create` - Request response
- `response.cancel` - Cancel current response
- `response.audio.delta` - Audio chunk received
- `response.text.delta` - Text chunk received
- `response.done` - Response completed
- `conversation.item.input_audio_transcription.completed` - Transcription ready
- `error` - Error occurred

## Best Practices

530

531

```python

532

import asyncio

533

import websockets

534

import json

535

import base64

536

537

async def realtime_session(session_url: str):

538

async with websockets.connect(session_url) as ws:

539

# Handle incoming events

540

async def receive_events():

541

async for message in ws:

542

event = json.loads(message)

543

544

if event["type"] == "response.audio.delta":

545

# Stream audio to speaker

546

audio_data = base64.b64decode(event["delta"])

547

play_audio(audio_data)

548

549

elif event["type"] == "response.text.delta":

550

# Display text

551

print(event["delta"], end="", flush=True)

552

553

# Send audio input

554

async def send_audio():

555

while True:

556

audio_chunk = record_audio_chunk()

557

await ws.send(json.dumps({

558

"type": "input_audio_buffer.append",

559

"audio": base64.b64encode(audio_chunk).decode()

560

}))

561

await asyncio.sleep(0.1)

562

563

# Run both tasks

564

await asyncio.gather(

565

receive_events(),

566

send_audio()

567

)

568

```

## Async Usage

```python
import asyncio
from openai import AsyncOpenAI

async def create_session():
    client = AsyncOpenAI()

    session = await client.beta.realtime.sessions.create(
        model="gpt-4o-realtime-preview",
        modalities=["audio"]
    )

    return session.client_secret.value

ws_url = asyncio.run(create_session())
```