Build realtime voice agents using WebRTC or WebSockets.
Specialized agent class for voice and audio interactions.
/**
 * Agent for realtime voice/audio interactions.
 * Note: per the limitations listed below, model and model settings are
 * chosen at the session level, not on the agent itself.
 * Import from '@openai/agents/realtime'
 * @typeParam TContext - Shape of the user-supplied context object passed
 *                       to dynamic instructions/prompt functions
 */
class RealtimeAgent<TContext = any> {
constructor(config: RealtimeAgentConfig<TContext>);
/** Agent identifier */
name: string;
}
/**
 * Configuration object accepted by the RealtimeAgent constructor.
 * @typeParam TContext - Shape of the user-supplied context object
 */
interface RealtimeAgentConfig<TContext = any> {
/** Agent identifier (required) */
name: string;
/** System prompt or function returning prompt (sync or async) */
instructions?: string | ((runContext: RunContext<TContext>, agent: RealtimeAgent) => string | Promise<string>);
/** Prompt template for OpenAI Responses API */
prompt?: Prompt | ((runContext: RunContext<TContext>, agent: RealtimeAgent) => Prompt | Promise<Prompt>);
/** Description shown when agent is used as handoff target */
handoffDescription?: string;
/** Sub-agents this agent can handoff to */
handoffs?: (RealtimeAgent | Handoff)[];
/** Available tools */
tools?: Tool[];
/** MCP protocol servers providing tools */
mcpServers?: MCPServer[];
/** Voice ID for speech synthesis (examples below use 'alloy', 'shimmer', 'nova', 'echo') */
voice?: string;
}
Limitations vs Regular Agent:
- model choice (set at session level)
- modelSettings (set at session level)
- outputType (text only)
- toolUseBehavior
- inputGuardrails / outputGuardrails (use RealtimeOutputGuardrail)
Usage Examples:
// Examples: constructing RealtimeAgent instances in several configurations.
import { RealtimeAgent } from '@openai/agents/realtime';
// Basic realtime agent
const agent = new RealtimeAgent({
name: 'VoiceAssistant',
instructions: 'You are a helpful voice assistant',
voice: 'alloy',
});
// With tools
// Tools are defined with the regular tool() helper from '@openai/agents'.
import { tool } from '@openai/agents';
import { z } from 'zod';
const weatherTool = tool({
name: 'get_weather',
description: 'Get weather for a city',
parameters: z.object({
city: z.string(),
}),
execute: async (input) => {
return `Weather in ${input.city}: Sunny, 72°F`;
},
});
const toolAgent = new RealtimeAgent({
name: 'WeatherVoiceAgent',
instructions: 'You provide weather information via voice',
tools: [weatherTool],
voice: 'shimmer',
});
// With handoffs
// handoffDescription tells a routing agent when to hand off to this agent.
const specialist = new RealtimeAgent({
name: 'Specialist',
handoffDescription: 'Expert in specialized topics',
instructions: 'You are a specialist',
voice: 'nova',
});
const mainAgent = new RealtimeAgent({
name: 'MainAgent',
instructions: 'You route to specialists',
handoffs: [specialist],
voice: 'alloy',
});
// With dynamic instructions
// instructions may be a function deriving the prompt from the run context.
const personalizedAgent = new RealtimeAgent({
name: 'PersonalAgent',
instructions: (runContext) => {
return `You are helping ${runContext.context.userName}`;
},
voice: 'echo',
});
Manages realtime voice agent sessions with continuous connection.
/**
 * Realtime voice agent session.
 * Manages a continuous connection to the realtime API, the currently
 * active agent (including handoffs), conversation history, and tool
 * approvals.
 * Import from '@openai/agents/realtime'
 * @typeParam TBaseContext - Shape of the user-supplied context object
 */
class RealtimeSession<TBaseContext = any> {
constructor(
initialAgent: RealtimeAgent<TBaseContext>,
options: RealtimeSessionOptions<TBaseContext>
);
/** Transport layer (WebRTC/WebSocket/SIP) */
transport: RealtimeTransportLayer;
/** Currently active agent (may change after handoffs or updateAgent) */
currentAgent: RealtimeAgent;
/** Initial agent at session start */
initialAgent: RealtimeAgent;
/** Token usage statistics */
usage: Usage;
/** Run context with user context and utilities */
context: RunContext<RealtimeContextData>;
/** Current mute state (null when the transport reports none) */
muted: boolean | null;
/** Conversation history */
history: RealtimeItem[];
/** Available MCP tools */
availableMcpTools: Array<{
name: string;
description?: string;
input_schema?: Record<string, any>;
[key: string]: any;
}>;
/**
 * Connect to the realtime API
 * @param options - Optional connection configuration; values here
 *                  override the corresponding constructor options
 */
connect(options?: RealtimeConnectOptions): Promise<void>;
/**
 * Close the connection
 */
close(): void;
/**
 * Switch to a different agent
 * @param newAgent - Agent to switch to
 * @returns The new current agent
 */
updateAgent(newAgent: RealtimeAgent): Promise<RealtimeAgent>;
/**
 * Send a text message
 * @param message - Message text
 * @param otherEventData - Additional event data
 */
sendMessage(message: string, otherEventData?: any): void;
/**
 * Send audio data
 * @param audio - Audio buffer
 * @param options - Send options
 */
sendAudio(audio: ArrayBuffer | Uint8Array, options?: { commit?: boolean }): void;
/**
 * Add an image to the conversation
 * @param image - Base64 encoded image
 * @param options - Image options
 */
addImage(image: string, options?: { triggerResponse?: boolean }): void;
/**
 * Interrupt current agent speech
 */
interrupt(): void;
/**
 * Mute or unmute audio input
 * @param muted - Mute state
 */
mute(muted: boolean): void;
/**
 * Approve a tool call
 * @param approvalItem - Tool approval item
 * @param options - Approval options
 */
approve(
approvalItem: RunToolApprovalItem,
options?: { alwaysApprove?: boolean }
): Promise<void>;
/**
 * Reject a tool call
 * @param approvalItem - Tool approval item
 * @param options - Rejection options
 */
reject(
approvalItem: RunToolApprovalItem,
options?: { alwaysReject?: boolean }
): Promise<void>;
/**
 * Update conversation history
 * @param newHistory - New history array or update function
 */
updateHistory(
newHistory: RealtimeItem[] | ((current: RealtimeItem[]) => RealtimeItem[])
): void;
/**
 * Get initial session configuration
 * @param overrides - Configuration overrides
 * @returns Session configuration
 */
getInitialSessionConfig(
overrides?: Partial<RealtimeSessionConfig>
): Promise<RealtimeSessionConfig>;
/**
 * Compute initial session configuration (static)
 * @param agent - Agent to configure for
 * @param options - Session options
 * @param overrides - Configuration overrides
 * @returns Session configuration
 */
static computeInitialSessionConfig(
agent: RealtimeAgent,
options?: Partial<RealtimeSessionOptions>,
overrides?: Partial<RealtimeSessionConfig>
): Promise<RealtimeSessionConfig>;
}
/**
 * Options accepted by the RealtimeSession constructor.
 * @typeParam TBaseContext - Shape of the user-supplied context object
 */
interface RealtimeSessionOptions<TBaseContext = any> {
/** API key or function returning API key (required) */
apiKey: string | (() => string | Promise<string>);
/** Transport layer: string shortcut or a transport instance (required) */
transport: 'webrtc' | 'websocket' | RealtimeTransportLayer;
/** Model to use */
model?: OpenAIRealtimeModels | string;
/** User context object */
context?: TBaseContext;
/** Output validation guardrails */
outputGuardrails?: RealtimeOutputGuardrail[];
/** Guardrail execution settings */
outputGuardrailSettings?: {
// Minimum accumulated output text length before guardrails run again
debounceTextLength?: number;
// Static message, or function mapping the guardrail result to a message
feedbackMessage?: string | ((result: GuardrailFunctionOutput) => string);
};
/** Session configuration overrides */
config?: Partial<RealtimeSessionConfig>;
/** Store audio in history (default: false) */
historyStoreAudio?: boolean;
/** Disable tracing */
tracingDisabled?: boolean;
/** Group ID for trace grouping */
groupId?: string;
/** Additional trace metadata */
traceMetadata?: Record<string, any>;
/** Workflow name for tracing */
workflowName?: string;
/** Auto-trigger response after MCP tool calls */
automaticallyTriggerResponseForMcpToolCalls?: boolean;
}
/**
 * Per-connection overrides passed to RealtimeSession.connect().
 * Each field, when present, takes precedence over the value given to the
 * session constructor.
 */
interface RealtimeConnectOptions {
/** API key (overrides constructor) */
apiKey?: string;
/** Model (overrides constructor) */
model?: string;
/** Connection URL (for WebSocket) */
url?: string;
/** Call ID (for SIP) */
callId?: string;
}
type OpenAIRealtimeModels = 'gpt-4o-realtime-preview' | 'gpt-4o-realtime-preview-2024-12-17';
Usage Examples:
// Example: full session lifecycle — connect, handle events, send audio/text,
// interrupt, and close. (OpenAIRealtimeWebRTC was dropped from the import:
// this snippet uses the 'webrtc' string shortcut instead of an instance.)
import { RealtimeAgent, RealtimeSession } from '@openai/agents/realtime';
const agent = new RealtimeAgent({
name: 'VoiceAssistant',
instructions: 'You are a helpful voice assistant',
voice: 'alloy',
});
// WebRTC session (browser)
const session = new RealtimeSession(agent, {
apiKey: process.env.OPENAI_API_KEY!,
transport: 'webrtc',
model: 'gpt-4o-realtime-preview',
});
// Connect
await session.connect();
// Handle events
session.transport.on('audio', (audio: ArrayBuffer) => {
// Play audio to user
playAudio(audio);
});
session.transport.on('agent_start', ({ agent }) => {
console.log('Agent started speaking');
});
session.transport.on('agent_end', ({ agent }) => {
console.log('Agent finished speaking');
});
// Send user audio
const userAudio = await captureAudio();
session.sendAudio(userAudio);
// Send text message
session.sendMessage('What is the weather?');
// Interrupt agent
session.interrupt();
// Close when done
session.close();
// Helper functions (implementation-specific)
function playAudio(audio: ArrayBuffer) {
// Platform-specific audio playback
}
async function captureAudio(): Promise<ArrayBuffer> {
// Platform-specific audio capture
return new ArrayBuffer(0);
}
Different transport options for realtime connections.
/**
 * WebRTC transport (browser-optimized, lowest latency).
 * Selected automatically when the session option transport is 'webrtc'.
 */
class OpenAIRealtimeWebRTC implements RealtimeTransportLayer {
constructor(options?: RealtimeWebRTCOptions);
}
/**
 * WebSocket transport (Node.js and browser).
 * Selected automatically when the session option transport is 'websocket'.
 */
class OpenAIRealtimeWebSocket implements RealtimeTransportLayer {
constructor(options?: RealtimeWebSocketOptions);
}
/**
 * SIP transport (phone integration).
 */
class OpenAIRealtimeSIP implements RealtimeTransportLayer {
constructor(options?: RealtimeSIPOptions);
}
/**
 * Contract implemented by all transports (WebRTC, WebSocket, SIP).
 * A RealtimeSession delegates connection, audio/message sending, and
 * event emission to its transport.
 */
interface RealtimeTransportLayer {
/** Current mute state */
muted: boolean | null;
/** Connect to service */
connect(options: any): Promise<void>;
/** Close connection */
close(): void;
/** Mute/unmute audio */
mute(muted: boolean): void;
/** Send audio data */
sendAudio(audio: ArrayBuffer | Uint8Array, options?: any): void;
/** Send text message */
sendMessage(message: string, otherEventData?: any): void;
/** Add image to conversation */
addImage(image: string, options?: any): void;
/** Interrupt agent speech */
interrupt(): void;
/** Update session configuration */
updateSessionConfig(config: RealtimeSessionConfig): Promise<void>;
/** Send function call output */
sendFunctionCallOutput(
toolCall: any,
output: string,
triggerResponse: boolean
): void;
/** Send MCP response */
sendMcpResponse(approvalRequest: any, approved: boolean): void;
/** Reset conversation history */
resetHistory(oldHistory: RealtimeItem[], newHistory: RealtimeItem[]): void;
/** Send raw event */
sendEvent(event: any): void;
/** Register event handler (see Event Types list below) */
on(event: string, handler: (...args: any[]) => void): void;
/** Unregister event handler */
off(event: string, handler: (...args: any[]) => void): void;
/** Emit event */
emit(event: string, ...args: any[]): boolean;
}
Usage Examples:
// Examples: choosing a transport — explicit instances vs string shortcuts.
import {
RealtimeAgent,
RealtimeSession,
OpenAIRealtimeWebRTC,
OpenAIRealtimeWebSocket,
} from '@openai/agents/realtime';
const agent = new RealtimeAgent({
name: 'VoiceAgent',
instructions: 'You are helpful',
});
// WebRTC (browser, lowest latency)
const webrtcSession = new RealtimeSession(agent, {
apiKey: process.env.OPENAI_API_KEY!,
transport: new OpenAIRealtimeWebRTC(),
});
// WebSocket (Node.js or browser)
const websocketSession = new RealtimeSession(agent, {
apiKey: process.env.OPENAI_API_KEY!,
transport: new OpenAIRealtimeWebSocket(),
});
// Or use string shortcuts
const quickWebRTC = new RealtimeSession(agent, {
apiKey: process.env.OPENAI_API_KEY!,
transport: 'webrtc',
});
const quickWebSocket = new RealtimeSession(agent, {
apiKey: process.env.OPENAI_API_KEY!,
transport: 'websocket',
});
Events emitted during realtime session execution.
Event Types:
- transport_event - Raw transport events
- agent_start - Agent begins speaking
- agent_end - Agent finishes speaking
- agent_handoff - Handoff to another agent
- agent_tool_start - Tool execution begins
- agent_tool_end - Tool execution completes
- tool_approval_requested - Tool needs approval
- audio - Audio chunk received
- audio_start - Audio stream started
- audio_stopped - Audio stream stopped
- audio_interrupted - Audio stream interrupted
- history_updated - History was modified
- history_added - Items added to history
- guardrail_tripped - Output guardrail triggered
- mcp_tools_changed - MCP tools list changed
- mcp_tool_call_completed - MCP tool call finished
- error - Error occurred
Usage Examples:
// Example: registering handlers for every event category listed above.
import { RealtimeAgent, RealtimeSession } from '@openai/agents/realtime';
const agent = new RealtimeAgent({
name: 'EventAgent',
instructions: 'You are helpful',
});
const session = new RealtimeSession(agent, {
apiKey: process.env.OPENAI_API_KEY!,
transport: 'webrtc',
});
await session.connect();
// Audio events
session.transport.on('audio', (audio: ArrayBuffer) => {
console.log('Received audio chunk:', audio.byteLength, 'bytes');
playAudio(audio);
});
session.transport.on('audio_start', () => {
console.log('Agent started speaking');
});
session.transport.on('audio_stopped', () => {
console.log('Agent stopped speaking');
});
session.transport.on('audio_interrupted', () => {
console.log('Agent speech interrupted');
});
// Agent events
session.transport.on('agent_start', ({ agent }) => {
console.log(`Agent ${agent.name} activated`);
});
session.transport.on('agent_end', ({ agent }) => {
console.log(`Agent ${agent.name} finished`);
});
session.transport.on('agent_handoff', ({ fromAgent, toAgent }) => {
console.log(`Handoff from ${fromAgent.name} to ${toAgent.name}`);
});
// Tool events
session.transport.on('agent_tool_start', ({ tool }) => {
console.log(`Tool ${tool.name} starting`);
});
session.transport.on('agent_tool_end', ({ tool, result }) => {
console.log(`Tool ${tool.name} completed:`, result);
});
session.transport.on('tool_approval_requested', (approval) => {
console.log(`Tool ${approval.toolName} needs approval`);
// Handle approval UI
});
// History events
session.transport.on('history_updated', ({ history }) => {
console.log('History updated, items:', history.length);
});
session.transport.on('history_added', ({ items }) => {
console.log('Added items:', items.length);
});
// Guardrail events
session.transport.on('guardrail_tripped', ({ guardrailName, outputInfo }) => {
console.log(`Guardrail ${guardrailName} triggered:`, outputInfo);
});
// Error events
session.transport.on('error', (error) => {
console.error('Session error:', error);
});
// MCP events
session.transport.on('mcp_tools_changed', ({ tools }) => {
console.log('MCP tools changed:', tools.length);
});
// Helper function
function playAudio(audio: ArrayBuffer) {
// Implementation-specific
}
Classes representing items in realtime conversation history.
/**
 * Base realtime item.
 * All entries in RealtimeSession.history are subclasses of this.
 */
class RealtimeItem {
id: string;
// Discriminator; concrete subclasses carry the payload fields below
type: string;
}
/**
 * Tool call item
 */
class RealtimeToolCallItem extends RealtimeItem {
toolCall: FunctionCallItem;
}
/**
 * Message item (either side of the conversation)
 */
class RealtimeMessageItem extends RealtimeItem {
message: AssistantMessageItem | UserMessageItem;
}
/**
 * MCP tool call item
 */
class RealtimeMcpCallItem extends RealtimeItem {
mcpCall: any;
}
/**
* Base item type
*/
type RealtimeBaseItem = RealtimeItem;
Usage Examples:
// Example: reading and rewriting session history.
// Fixed import: the snippet uses instanceof against the item classes, so
// they must be imported alongside RealtimeAgent/RealtimeSession.
import {
RealtimeAgent,
RealtimeSession,
RealtimeMessageItem,
RealtimeToolCallItem,
RealtimeMcpCallItem,
} from '@openai/agents/realtime';
const agent = new RealtimeAgent({
name: 'HistoryAgent',
instructions: 'You are helpful',
});
const session = new RealtimeSession(agent, {
apiKey: process.env.OPENAI_API_KEY!,
transport: 'webrtc',
});
await session.connect();
// Send some messages
session.sendMessage('Hello');
await new Promise(resolve => setTimeout(resolve, 2000));
session.sendMessage('Tell me a joke');
await new Promise(resolve => setTimeout(resolve, 3000));
// Access history
const history = session.history;
console.log('History items:', history.length);
for (const item of history) {
if (item instanceof RealtimeMessageItem) {
console.log('Message:', item.message.content);
} else if (item instanceof RealtimeToolCallItem) {
console.log('Tool call:', item.toolCall.name);
} else if (item instanceof RealtimeMcpCallItem) {
console.log('MCP call:', item.mcpCall);
}
}
// Update history
session.updateHistory((current) => {
// Keep only last 10 items
return current.slice(-10);
});
// Replace history
session.updateHistory([]);
Output validation for realtime agents.
/**
 * Output guardrail for realtime agents.
 * Passed via RealtimeSessionOptions.outputGuardrails; execution cadence
 * is controlled by outputGuardrailSettings.debounceTextLength.
 * @typeParam TContext - Shape of the user-supplied context object
 */
interface RealtimeOutputGuardrail<TContext = any> {
/** Guardrail identifier */
name: string;
/**
 * Execute the guardrail check
 * @param args - Guardrail execution arguments
 * @returns Guardrail result
 */
execute: (args: {
agent: RealtimeAgent;
agentOutput: string;
context: RunContext<TContext>;
}) => Promise<GuardrailFunctionOutput>;
}
Usage Examples:
// Example: defining two output guardrails and attaching them to a session.
import {
RealtimeAgent,
RealtimeSession,
RealtimeOutputGuardrail,
} from '@openai/agents/realtime';
// Define guardrails using the interface
const profanityGuardrail: RealtimeOutputGuardrail = {
name: 'profanity_check',
execute: async ({ agentOutput }) => {
const hasProfanity = /\b(bad|words)\b/i.test(agentOutput);
return {
tripwireTriggered: hasProfanity,
outputInfo: { filtered: hasProfanity },
};
},
};
const toxicityGuardrail: RealtimeOutputGuardrail = {
name: 'toxicity_check',
execute: async ({ agentOutput }) => {
const toxicityScore = await checkToxicity(agentOutput);
return {
tripwireTriggered: toxicityScore > 0.8,
outputInfo: { score: toxicityScore },
};
},
};
// Use with session
const agent = new RealtimeAgent({
name: 'SafeAgent',
instructions: 'You are helpful and safe',
});
const session = new RealtimeSession(agent, {
apiKey: process.env.OPENAI_API_KEY!,
transport: 'webrtc',
outputGuardrails: [profanityGuardrail, toxicityGuardrail],
outputGuardrailSettings: {
debounceTextLength: 50, // Check every 50 chars
feedbackMessage: 'Please use appropriate language',
// Or dynamic:
// feedbackMessage: (result) => `Guardrail ${result.name} triggered`,
},
});
// Listen for guardrail events
session.transport.on('guardrail_tripped', ({ guardrailName, outputInfo }) => {
console.log(`Guardrail ${guardrailName} triggered:`, outputInfo);
});
await session.connect();
// Helper function
async function checkToxicity(text: string): Promise<number> {
return 0.2; // Mock implementation
}
Special functions for realtime tool execution.
/**
 * Return tool result without triggering agent response.
 * Wrap a tool's return value in this when the agent should not speak
 * about the result (see the log_event example below).
 * @param result - Tool result
 * @returns Background result wrapper
 */
function backgroundResult(result: any): BackgroundResult;
/**
 * Check if result is a background result
 * @param result - Result to check
 * @returns True if background result
 */
function isBackgroundResult(result: any): result is BackgroundResult;
Usage Examples:
// Example: silent vs speaking tools — backgroundResult suppresses the
// agent's spoken response for a tool call.
import { RealtimeAgent, RealtimeSession, backgroundResult } from '@openai/agents/realtime';
import { tool } from '@openai/agents';
import { z } from 'zod';
// Tool that doesn't trigger response
const logTool = tool({
name: 'log_event',
description: 'Log an event silently',
parameters: z.object({
event: z.string(),
}),
execute: async (input) => {
console.log('Event logged:', input.event);
// Return background result to prevent agent response
return backgroundResult({ logged: true });
},
});
// Tool that triggers response
const searchTool = tool({
name: 'search',
description: 'Search for information',
parameters: z.object({
query: z.string(),
}),
execute: async (input) => {
const results = await performSearch(input.query);
// Normal return triggers agent to speak results
return `Found: ${results}`;
},
});
const agent = new RealtimeAgent({
name: 'ToolAgent',
instructions: 'You can log events and search',
tools: [logTool, searchTool],
});
const session = new RealtimeSession(agent, {
apiKey: process.env.OPENAI_API_KEY!,
transport: 'webrtc',
});
await session.connect();
// "log_event" won't cause agent to speak
// "search" will cause agent to speak the results
// Helper function
async function performSearch(query: string): Promise<string> {
return 'Search results'; // Mock implementation
}
Utility functions for realtime agents.
/**
 * Utility functions for realtime agents.
 * Mainly base64 <-> ArrayBuffer conversion for audio payloads.
 */
namespace utils {
/**
 * Convert base64 to ArrayBuffer
 * @param base64 - Base64 encoded string
 * @returns ArrayBuffer
 */
function base64ToArrayBuffer(base64: string): ArrayBuffer;
/**
 * Convert ArrayBuffer to base64
 * @param buffer - Array buffer
 * @returns Base64 encoded string
 */
function arrayBufferToBase64(buffer: ArrayBuffer): string;
/**
 * Extract last text from audio output message
 * @param message - Audio output message
 * @returns Last text or null
 */
function getLastTextFromAudioOutputMessage(message: any): string | null;
}
Usage Examples:
// Example: using the utils namespace for audio/base64 conversions.
import { RealtimeAgent, RealtimeSession, utils } from '@openai/agents/realtime';
const agent = new RealtimeAgent({
name: 'UtilAgent',
instructions: 'You are helpful',
});
const session = new RealtimeSession(agent, {
apiKey: process.env.OPENAI_API_KEY!,
transport: 'webrtc',
});
await session.connect();
// Convert audio data
const base64Audio = 'SGVsbG8gV29ybGQ=';
const audioBuffer = utils.base64ToArrayBuffer(base64Audio);
session.sendAudio(audioBuffer);
// Convert buffer to base64
const buffer = new ArrayBuffer(8);
const base64 = utils.arrayBufferToBase64(buffer);
console.log('Base64:', base64);
// Extract text from message
session.transport.on('audio', (audio, message) => {
const text = utils.getLastTextFromAudioOutputMessage(message);
if (text) {
console.log('Agent said:', text);
}
});
Full example with all features.
Usage Examples:
// Full example combining tools, background results, guardrails, handoffs,
// event handlers, and manual agent switching in one session.
import {
RealtimeAgent,
RealtimeSession,
RealtimeOutputGuardrail,
backgroundResult,
} from '@openai/agents/realtime';
import { tool } from '@openai/agents';
import { z } from 'zod';
// Define tools
const weatherTool = tool({
name: 'get_weather',
description: 'Get weather for a city',
parameters: z.object({
city: z.string(),
}),
execute: async (input) => {
return `Weather in ${input.city}: Sunny, 72°F`;
},
});
// This tool returns a backgroundResult, so it never makes the agent speak.
const notifyTool = tool({
name: 'send_notification',
description: 'Send a silent notification',
parameters: z.object({
message: z.string(),
}),
execute: async (input) => {
console.log('Notification:', input.message);
return backgroundResult({ sent: true });
},
});
// Define guardrail using the interface
const safetyGuardrail: RealtimeOutputGuardrail = {
name: 'safety_check',
execute: async ({ agentOutput }) => {
const isUnsafe = false; // Check logic
return {
tripwireTriggered: isUnsafe,
outputInfo: { safe: !isUnsafe },
};
},
};
// Create specialist agent
const specialist = new RealtimeAgent({
name: 'WeatherSpecialist',
handoffDescription: 'Expert in weather information',
instructions: 'You are a weather expert',
tools: [weatherTool],
voice: 'shimmer',
});
// Create main agent
const mainAgent = new RealtimeAgent({
name: 'MainAgent',
instructions: 'You are a helpful voice assistant. Route weather questions to the specialist.',
handoffs: [specialist],
tools: [notifyTool],
voice: 'alloy',
});
// Create session
const session = new RealtimeSession(mainAgent, {
apiKey: process.env.OPENAI_API_KEY!,
transport: 'webrtc',
model: 'gpt-4o-realtime-preview',
context: { userId: 'user123' },
outputGuardrails: [safetyGuardrail],
outputGuardrailSettings: {
debounceTextLength: 50,
feedbackMessage: 'Please rephrase that',
},
workflowName: 'VoiceAssistant',
});
// Set up event handlers
session.transport.on('audio', (audio: ArrayBuffer) => {
playAudio(audio);
});
session.transport.on('agent_handoff', ({ fromAgent, toAgent }) => {
console.log(`Handoff: ${fromAgent.name} -> ${toAgent.name}`);
});
session.transport.on('agent_tool_start', ({ tool }) => {
console.log(`Tool starting: ${tool.name}`);
});
session.transport.on('guardrail_tripped', ({ guardrailName }) => {
console.log(`Guardrail triggered: ${guardrailName}`);
});
session.transport.on('error', (error) => {
console.error('Error:', error);
});
// Connect and use
await session.connect();
// Send user audio
const userAudio = await captureUserAudio();
session.sendAudio(userAudio);
// Or send text
session.sendMessage('What is the weather in Tokyo?');
// Interrupt if needed
session.interrupt();
// Switch agents
await session.updateAgent(specialist);
// Close when done
session.close();
// Helper functions (implementation-specific)
function playAudio(audio: ArrayBuffer) {
// Platform-specific audio playback
}
async function captureUserAudio(): Promise<ArrayBuffer> {
// Platform-specific audio capture
return new ArrayBuffer(0);
}