The ElevenLabs SDK provides real-time streaming capabilities for low-latency text-to-speech applications. This includes WebSocket-based streaming for interactive applications, multi-context streaming for multi-speaker and conversational scenarios, and streaming configurations tuned to different latency/quality trade-offs.
import {
ElevenLabsClient,
type StreamTextToSpeechRequest,
type StreamTextToSpeechWithTimestampsRequest,
type StreamingAudioChunkWithTimestampsResponse,
type WebsocketTtsClientMessageMulti,
type WebsocketTtsServerMessageMulti,
type GenerationConfig,
type RealtimeVoiceSettings,
type InitializeConnection,
type SendText,
type CloseConnection
} from 'elevenlabs';

const client = new ElevenLabsClient();
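The examples below call a handful of playback helpers that are not part of the SDK. Here are hypothetical no-op stubs so the snippets stay self-contained; wire these into your actual audio pipeline:

// Hypothetical playback helpers assumed by the examples (not part of the SDK)
function playAudioChunk(chunk: Buffer): void { /* feed the chunk to your audio output */ }
async function playAudioBuffer(chunk: Buffer): Promise<void> { /* play and resolve when finished */ }
function playAudioForContext(contextId: string, chunk: Buffer): void { /* route audio per context */ }
function highlightText(characters: string[], startTimes: number[]): void { /* sync UI text highlighting */ }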
// Basic streaming text-to-speech
const audioStream = await client.textToSpeech.convertAsStream(
"21m00Tcm4TlvDq8ikWAM", // Voice ID
{
text: "This text will be converted to speech in real-time chunks.",
model_id: "eleven_turbo_v2_5", // Optimized for streaming
optimize_streaming_latency: 3,
output_format: "mp3_22050_32" // Lower quality for faster streaming
}
);
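If chunk-by-chunk handling isn't needed, the returned Node Readable can instead be piped straight to a file or HTTP response. A minimal sketch (the output filename is illustrative):

import { createWriteStream } from 'fs';
import { pipeline } from 'stream/promises';

// Alternative to the event-based handling below: drain the stream to disk
await pipeline(audioStream, createWriteStream('speech.mp3'));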
// Process audio chunks as they arrive
audioStream.on('data', (chunk: Buffer) => {
console.log(`Received audio chunk: ${chunk.length} bytes`);
// Play chunk immediately, send to client, etc.
});
audioStream.on('end', () => {
console.log('Streaming completed');
});
audioStream.on('error', (error) => {
console.error('Streaming error:', error);
});

// Stream with precise character timing information
const timestampStream = await client.textToSpeech.streamWithTimestamps(
"pNInz6obpgDQGcFmaJgB",
{
text: "Real-time streaming with character-level timing synchronization.",
model_id: "eleven_multilingual_v2",
optimize_streaming_latency: 2,
enable_logging: false // Zero retention for privacy
}
);
// Process timestamped audio chunks (fields follow the with-timestamps response shape)
for await (const chunk of timestampStream) {
if (chunk.audio_base64) {
// Decode and play the base64-encoded audio payload
playAudioChunk(Buffer.from(chunk.audio_base64, 'base64'));
}
if (chunk.alignment) {
const { characters, character_start_times_seconds, character_end_times_seconds } = chunk.alignment;
console.log(`${characters.length} characters, ${character_start_times_seconds[0]}s - ${character_end_times_seconds[character_end_times_seconds.length - 1]}s`);
// Synchronize text highlighting with audio playback
highlightText(characters, character_start_times_seconds);
}
}

import WebSocket from 'ws';
// Establish WebSocket connection for real-time streaming
const voiceId = "21m00Tcm4TlvDq8ikWAM";
const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream-input?model_id=eleven_turbo_v2_5`;
const ws = new WebSocket(wsUrl, {
headers: {
'xi-api-key': process.env.ELEVENLABS_API_KEY
}
});
ws.on('open', () => {
console.log('WebSocket connected');
// Initialize connection
ws.send(JSON.stringify({
text: " ", // Initial space to establish connection
voice_settings: {
stability: 0.5,
similarity_boost: 0.8,
style: 0.2
},
generation_config: {
chunk_length_schedule: [120, 160, 250, 290] // Default chunking
}
}));
});
ws.on('message', (data) => {
const message = JSON.parse(data.toString());
if (message.audio) {
// Received audio chunk (base64 encoded)
const audioBuffer = Buffer.from(message.audio, 'base64');
console.log(`Audio chunk: ${audioBuffer.length} bytes`);
// Play audio immediately for real-time experience
playAudioChunk(audioBuffer);
}
if (message.isFinal) {
console.log('Generation complete');
}
});
// Send text for synthesis
function sendText(text: string) {
ws.send(JSON.stringify({
text: text
}));
}
// Usage
sendText("Hello, this is real-time text-to-speech!");
sendText(" And this continues the conversation seamlessly.");
// Close connection
function closeConnection() {
ws.send(JSON.stringify({
text: ""
}));
ws.close();
}

// Advanced multi-context WebSocket for concurrent streams
const multiWsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${voiceId}/multi-stream-input`;
const multiWs = new WebSocket(multiWsUrl, {
headers: {
'xi-api-key': process.env.ELEVENLABS_API_KEY
}
});
multiWs.on('open', () => {
console.log('Multi-context WebSocket connected');
// Initialize multiple contexts
const contexts = ['narrator', 'character1', 'character2'];
contexts.forEach(contextId => {
multiWs.send(JSON.stringify({
context_id: contextId,
text: " ", // Initial space for each context
voice_settings: {
stability: contextId === 'narrator' ? 0.7 : 0.4,
similarity_boost: 0.8
},
generation_config: {
chunk_length_schedule: [100, 140, 200, 250] // Faster for real-time
}
}));
});
});
multiWs.on('message', (data) => {
const message: WebsocketTtsServerMessageMulti = JSON.parse(data.toString());
if (message.audio && message.context_id) {
const audioBuffer = Buffer.from(message.audio, 'base64');
console.log(`Audio from ${message.context_id}: ${audioBuffer.length} bytes`);
// Route audio to appropriate playback channel
playAudioForContext(message.context_id, audioBuffer);
}
if (message.is_final) {
console.log(`Context ${message.context_id} completed`);
}
});
// Send text to specific context
function sendTextToContext(contextId: string, text: string) {
multiWs.send(JSON.stringify({
context_id: contextId,
text: text
}));
}
// Usage: Multiple speakers in parallel
sendTextToContext('narrator', 'Once upon a time, in a distant land...');
sendTextToContext('character1', 'Hello there, stranger!');
sendTextToContext('character2', 'Welcome to our village.');
// Flush and close specific context
function closeContext(contextId: string) {
multiWs.send(JSON.stringify({
context_id: contextId,
flush: true,
close_context: true
}));
}
// Close all contexts and connection
function closeAllContexts() {
multiWs.send(JSON.stringify({
close_socket: true
}));
}

interface StreamTextToSpeechRequest {
/** Text to convert to speech */
text: string;
/** Model optimized for streaming (e.g., "eleven_turbo_v2_5") */
model_id?: string;
/** Voice settings */
voice_settings?: VoiceSettings;
/** Output format (lower quality formats stream faster) */
output_format?: TextToSpeechConvertRequestOutputFormat;
/**
* Streaming latency optimization (0-4)
* Higher values = lower latency but potentially lower quality
*/
optimize_streaming_latency?: number;
/** When false, disables request logging (zero-retention mode) */
enable_logging?: boolean;
/** Language code */
language_code?: string;
/** Deterministic seed */
seed?: number;
/** Context for continuity */
previous_text?: string;
next_text?: string;
previous_request_ids?: string[];
next_request_ids?: string[];
}
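The previous_text/next_text and request-id fields give the model surrounding context, which smooths prosody when a long passage is synthesized as separate requests. A sketch stitching two consecutive sentences (voice ID reused from the examples above):

// Each request tells the model what comes before/after its own text
const firstStream = await client.textToSpeech.convertAsStream("21m00Tcm4TlvDq8ikWAM", {
  text: "The storm rolled in before dusk.",
  next_text: "By midnight the harbor was empty."
});
const secondStream = await client.textToSpeech.convertAsStream("21m00Tcm4TlvDq8ikWAM", {
  text: "By midnight the harbor was empty.",
  previous_text: "The storm rolled in before dusk."
});

// Client messages for multi-context WebSocket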
interface WebsocketTtsClientMessageMulti {
/**
* Text to synthesize
* Use " " (space) for initial connection
* Empty string for control messages
*/
text?: string;
/** Voice settings (first message only) */
voice_settings?: RealtimeVoiceSettings;
/** Generation configuration (first message only) */
generation_config?: GenerationConfig;
/** API key (if not in header) */
"xi-api-key"?: string;
/** Authorization token (if not in header) */
authorization?: string;
/** Flush audio buffer for context */
flush?: boolean;
/** Pronunciation dictionaries (first message only) */
pronunciation_dictionary_locators?: PronunciationDictionaryLocator[];
/** Context identifier for multi-context streaming */
context_id?: string;
/** Close specific context */
close_context?: boolean;
/** Close entire WebSocket connection */
close_socket?: boolean;
}
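Taken together, a context's lifecycle is: an initial space carrying settings, incremental text messages, an optional flush, then close. A sketch of the message sequence (context ID illustrative, reusing the multiWs connection from above):

// 1. Initialize a context (first message carries voice settings)
multiWs.send(JSON.stringify({ context_id: 'narrator', text: ' ', voice_settings: { stability: 0.5 } }));
// 2. Stream text incrementally
multiWs.send(JSON.stringify({ context_id: 'narrator', text: 'First sentence. ' }));
// 3. Force synthesis of any buffered text
multiWs.send(JSON.stringify({ context_id: 'narrator', flush: true }));
// 4. Close the context when done
multiWs.send(JSON.stringify({ context_id: 'narrator', close_context: true }));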
// Server messages from multi-context WebSocket
interface WebsocketTtsServerMessageMulti {
/** Base64-encoded audio chunk */
audio?: string;
/** Whether this is the final message for the context */
is_final?: boolean;
/** Character alignment information */
normalizedAlignment?: NormalizedAlignment;
alignment?: Alignment;
/** Context identifier */
context_id?: string;
}

interface GenerationConfig {
/**
* Chunk length schedule for streaming optimization
*
* Controls when audio generation starts, based on how much text has been
* buffered; the last value applies to all subsequent chunks.
* Default: [120, 160, 250, 290]
*
* - First chunk: generated once 120 characters are buffered
* - Second chunk: once 160 further characters are buffered
* - Third chunk: once 250 further characters are buffered
* - Subsequent chunks: every 290 further characters
*
* Lower values = faster first audio, potentially lower quality
* Range: 50-500 per value
*/
chunk_length_schedule?: number[];
}

interface RealtimeVoiceSettings {
/** Voice stability (0.0-1.0) */
stability?: number;
/** Similarity boost (0.0-1.0) */
similarity_boost?: number;
/** Style exaggeration (0.0-1.0) */
style?: number;
/** Speaker boost for enhanced similarity */
use_speaker_boost?: boolean;
/** Speech speed multiplier */
speed?: number;
}

// Real-time conversational AI with streaming
class StreamingConversation {
private ws: WebSocket;
private audioQueue: Buffer[] = [];
constructor(voiceId: string) {
const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream-input`;
this.ws = new WebSocket(wsUrl, {
headers: { 'xi-api-key': process.env.ELEVENLABS_API_KEY }
});
this.ws.on('open', () => this.initialize());
this.ws.on('message', (data) => this.handleAudioChunk(data));
}
private initialize() {
this.ws.send(JSON.stringify({
text: " ",
voice_settings: { stability: 0.4, similarity_boost: 0.7 },
generation_config: { chunk_length_schedule: [80, 120, 160, 200] }
}));
}
private handleAudioChunk(data: any) {
const message = JSON.parse(data.toString());
if (message.audio) {
const audioBuffer = Buffer.from(message.audio, 'base64');
this.audioQueue.push(audioBuffer);
this.playNextChunk();
}
}
private playNextChunk() {
if (this.audioQueue.length > 0) {
const chunk = this.audioQueue.shift()!;
// Play audio chunk immediately
playAudioBuffer(chunk);
}
}
speak(text: string) {
this.ws.send(JSON.stringify({ text: text }));
}
interrupt() {
// An empty string is the end-of-input signal: it finalizes the current
// generation and closes the stream, so reconnect before speaking again
this.ws.send(JSON.stringify({ text: "" }));
this.audioQueue = []; // Clear buffered audio
}
}
// Usage
const conversation = new StreamingConversation("pNInz6obpgDQGcFmaJgB");
conversation.speak("Hello! How can I help you today?");

// Stream multiple voices simultaneously
class MultiVoiceStreaming {
private connections = new Map<string, WebSocket>();
addVoice(name: string, voiceId: string) {
const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream-input`;
const ws = new WebSocket(wsUrl, {
headers: { 'xi-api-key': process.env.ELEVENLABS_API_KEY }
});
ws.on('open', () => {
ws.send(JSON.stringify({
text: " ",
voice_settings: { stability: 0.5, similarity_boost: 0.8 },
generation_config: { chunk_length_schedule: [100, 150, 200, 250] }
}));
});
ws.on('message', (data) => {
const message = JSON.parse(data.toString());
if (message.audio) {
const audioBuffer = Buffer.from(message.audio, 'base64');
this.playAudioForVoice(name, audioBuffer);
}
});
this.connections.set(name, ws);
}
speak(voiceName: string, text: string) {
const ws = this.connections.get(voiceName);
if (ws && ws.readyState === WebSocket.OPEN) {
ws.send(JSON.stringify({ text: text }));
}
}
private playAudioForVoice(voiceName: string, audioBuffer: Buffer) {
// Route to appropriate audio channel/speaker
console.log(`Playing audio for ${voiceName}: ${audioBuffer.length} bytes`);
}
}
// Usage
const multiVoice = new MultiVoiceStreaming();
multiVoice.addVoice('narrator', 'pNInz6obpgDQGcFmaJgB');
multiVoice.addVoice('character', '21m00Tcm4TlvDq8ikWAM');
multiVoice.speak('narrator', 'The story begins...');
multiVoice.speak('character', 'Hello, world!');

// Intelligent buffering for smooth playback
class BufferedStreamingPlayer {
private audioBuffer: Buffer[] = [];
private isPlaying = false;
private targetBufferSize = 5; // Number of chunks to buffer
async startStreaming(voiceId: string, text: string) {
const audioStream = await client.textToSpeech.convertAsStream(voiceId, {
text: text,
model_id: "eleven_turbo_v2_5",
optimize_streaming_latency: 2,
output_format: "mp3_22050_32"
});
audioStream.on('data', (chunk: Buffer) => {
this.audioBuffer.push(chunk);
// Start playing when buffer reaches target size
if (!this.isPlaying && this.audioBuffer.length >= this.targetBufferSize) {
this.startPlayback();
}
});
audioStream.on('end', () => {
// Short texts may never reach the target buffer size; start playback now
if (!this.isPlaying && this.audioBuffer.length > 0) {
this.startPlayback();
}
// Give the last chunk time to finish, then verify completion
setTimeout(() => this.checkPlaybackComplete(), 100);
});
}
private startPlayback() {
this.isPlaying = true;
this.playNextChunk();
}
private playNextChunk() {
if (this.audioBuffer.length === 0) {
// Buffer drained; stop so playback can resume when new chunks arrive
this.isPlaying = false;
return;
}
const chunk = this.audioBuffer.shift()!;
// Play the chunk, then schedule the next one when it finishes
playAudioBuffer(chunk).then(() => this.playNextChunk());
}
private checkPlaybackComplete() {
if (this.audioBuffer.length === 0 && !this.isPlaying) {
console.log('Playback completed');
}
}
}
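The buffered player is defined without a usage example; a short sketch consistent with the surrounding examples (voice ID reused from above):

// Usage
const player = new BufferedStreamingPlayer();
await player.startStreaming("21m00Tcm4TlvDq8ikWAM", "Longer passages benefit most from buffered playback.");

// Configuration for minimum possible latency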
const ultraLowLatencyConfig = {
model_id: "eleven_turbo_v2_5", // Fastest model
optimize_streaming_latency: 4, // Maximum optimization
output_format: "mp3_22050_32", // Lowest quality/fastest
voice_settings: {
stability: 0.5,
similarity_boost: 0.7,
style: 0, // Disable style for speed
use_speaker_boost: false // Disable for speed
},
generation_config: {
chunk_length_schedule: [50, 80, 120, 150] // Very aggressive chunking
}
};
// Apply to streaming
const ultraFastStream = await client.textToSpeech.convertAsStream(voiceId, {
text: "Ultra-fast streaming response",
...ultraLowLatencyConfig
});

// Predefined profiles for different use cases
const streamingProfiles = {
// Interactive chatbot - prioritize speed
interactive: {
optimize_streaming_latency: 4,
output_format: "mp3_22050_32",
generation_config: { chunk_length_schedule: [60, 100, 140, 180] }
},
// Conversational - balanced
conversational: {
optimize_streaming_latency: 2,
output_format: "mp3_44100_64",
generation_config: { chunk_length_schedule: [100, 150, 200, 250] }
},
// Presentation - prioritize quality
presentation: {
optimize_streaming_latency: 1,
output_format: "mp3_44100_128",
generation_config: { chunk_length_schedule: [150, 200, 300, 400] }
}
};
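A profile can be spread into any streaming request, mirroring the ultra-low-latency usage above (text illustrative):

const chatStream = await client.textToSpeech.convertAsStream(voiceId, {
  text: "Quick reply for an interactive session",
  model_id: "eleven_turbo_v2_5",
  ...streamingProfiles.interactive
});

// Robust streaming with automatic reconnection and error handling
class RobustStreamingClient {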
private ws?: WebSocket;
private voiceId?: string; // remembered for reconnection
private reconnectAttempts = 0;
private maxReconnectAttempts = 3;
connect(voiceId: string) {
this.voiceId = voiceId;
const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream-input`;
this.ws = new WebSocket(wsUrl, {
headers: { 'xi-api-key': process.env.ELEVENLABS_API_KEY }
});
this.ws.on('open', () => {
console.log('WebSocket connected');
this.reconnectAttempts = 0;
this.initialize();
});
this.ws.on('error', (error) => {
console.error('WebSocket error:', error);
this.handleConnectionError();
});
this.ws.on('close', (code, reason) => {
console.log(`WebSocket closed: ${code} ${reason}`);
if (code !== 1000) { // Not a normal close
this.handleConnectionError();
}
});
this.ws.on('message', (data) => {
try {
const message = JSON.parse(data.toString());
this.handleMessage(message);
} catch (error) {
console.error('Message parsing error:', error);
}
});
}
private handleConnectionError() {
if (this.reconnectAttempts < this.maxReconnectAttempts) {
this.reconnectAttempts++;
const delay = Math.pow(2, this.reconnectAttempts) * 1000; // Exponential backoff
console.log(`Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts})`);
setTimeout(() => this.connect(this.voiceId!), delay);
} else {
console.error('Max reconnection attempts reached');
}
}
private initialize() {
this.ws?.send(JSON.stringify({
text: " ",
voice_settings: { stability: 0.5, similarity_boost: 0.8 }
}));
}
private handleMessage(message: any) {
if (message.audio) {
const audioBuffer = Buffer.from(message.audio, 'base64');
playAudioChunk(audioBuffer); // external playback helper (see stubs above)
}
}
speak(text: string) {
if (this.ws && this.ws.readyState === WebSocket.OPEN) {
this.ws.send(JSON.stringify({ text: text }));
} else {
console.error('WebSocket not connected');
}
}
}
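Like the buffered player, the robust client is defined without usage; a short sketch:

// Usage
const robustClient = new RobustStreamingClient();
robustClient.connect("21m00Tcm4TlvDq8ikWAM");
// (in practice, wait for the connection to open before speaking)
robustClient.speak("This connection recovers from transient failures.");

// Monitor streaming performance metrics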
class StreamingMetrics {
private startTime?: number;
private firstChunkTime?: number;
private totalChunks = 0;
private totalBytes = 0;
startRequest() {
this.startTime = Date.now();
this.firstChunkTime = undefined;
this.totalChunks = 0;
this.totalBytes = 0;
}
recordChunk(chunkSize: number) {
if (!this.firstChunkTime) {
this.firstChunkTime = Date.now();
}
this.totalChunks++;
this.totalBytes += chunkSize;
}
getMetrics() {
const now = Date.now();
return {
timeToFirstByte: this.firstChunkTime ? this.firstChunkTime - this.startTime! : 0,
totalDuration: now - this.startTime!,
avgChunkSize: this.totalBytes / this.totalChunks,
chunksPerSecond: this.totalChunks / ((now - this.startTime!) / 1000),
bytesPerSecond: this.totalBytes / ((now - this.startTime!) / 1000)
};
}
}
// Usage
const metrics = new StreamingMetrics();
metrics.startRequest();
audioStream.on('data', (chunk: Buffer) => {
metrics.recordChunk(chunk.length);
});
audioStream.on('end', () => {
console.log('Streaming metrics:', metrics.getMetrics());
});

Best practices:
- Use eleven_turbo_v2_5 for streaming, or eleven_flash_v2_5 for ultra-low latency
- Set optimize_streaming_latency to 2-3 for balanced performance
- Choose lower-quality output formats (e.g., mp3_22050_32) for faster streaming
- Tune chunk_length_schedule based on latency requirements