Real-time Streaming

Overview

The ElevenLabs SDK provides real-time streaming for low-latency text-to-speech applications. This includes streamed HTTP responses for simple use cases, WebSocket-based streaming for interactive applications, multi-context streaming for multi-speaker scenarios, and configuration settings that trade audio quality against latency.

Core Imports

import { 
  ElevenLabsClient,
  type StreamTextToSpeechRequest,
  type StreamTextToSpeechWithTimestampsRequest,
  type StreamingAudioChunkWithTimestampsResponse,
  type WebsocketTtsClientMessageMulti,
  type WebsocketTtsServerMessageMulti,
  type GenerationConfig,
  type RealtimeVoiceSettings,
  type InitializeConnection,
  type SendText,
  type CloseConnection
} from 'elevenlabs';

Basic Streaming

Simple Streaming TTS

const client = new ElevenLabsClient();

// Basic streaming text-to-speech
const audioStream = await client.textToSpeech.convertAsStream(
  "21m00Tcm4TlvDq8ikWAM", // Voice ID
  {
    text: "This text will be converted to speech in real-time chunks.",
    model_id: "eleven_turbo_v2_5", // Optimized for streaming
    optimize_streaming_latency: 3,
    output_format: "mp3_22050_32" // Lower quality for faster streaming
  }
);

// Process audio chunks as they arrive
audioStream.on('data', (chunk: Buffer) => {
  console.log(`Received audio chunk: ${chunk.length} bytes`);
  // Play chunk immediately, send to client, etc.
});

audioStream.on('end', () => {
  console.log('Streaming completed');
});

audioStream.on('error', (error) => {
  console.error('Streaming error:', error);
});

Streaming with Character Timestamps

// Stream with precise character timing information
const timestampStream = await client.textToSpeech.streamWithTimestamps(
  "pNInz6obpgDQGcFmaJgB",
  {
    text: "Real-time streaming with character-level timing synchronization.",
    model_id: "eleven_multilingual_v2",
    optimize_streaming_latency: 2,
    enable_logging: false // Zero retention for privacy
  }
);

// Process timestamped audio chunks
for await (const chunk of timestampStream) {
  console.log('Audio chunk:', chunk.audio);
  console.log(`Characters ${chunk.start_char_idx}-${chunk.end_char_idx}`);
  console.log(`Time: ${chunk.start_time_seconds}s - ${chunk.end_time_seconds}s`);
  
  if (chunk.is_final) {
    console.log('Final chunk received');
  }
  
  // Synchronize text highlighting with audio playback
  highlightText(chunk.start_char_idx, chunk.end_char_idx);
  playAudioChunk(chunk.audio);
}

WebSocket Streaming

Single-Context WebSocket

import WebSocket from 'ws';

// Establish WebSocket connection for real-time streaming
const voiceId = "21m00Tcm4TlvDq8ikWAM";
const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream-input?model_id=eleven_turbo_v2_5`;

const ws = new WebSocket(wsUrl, {
  headers: {
    'xi-api-key': process.env.ELEVENLABS_API_KEY
  }
});

ws.on('open', () => {
  console.log('WebSocket connected');
  
  // Initialize connection
  ws.send(JSON.stringify({
    text: " ", // Initial space to establish connection
    voice_settings: {
      stability: 0.5,
      similarity_boost: 0.8,
      style: 0.2
    },
    generation_config: {
      chunk_length_schedule: [120, 160, 250, 290] // Default chunking
    }
  }));
});

ws.on('message', (data) => {
  const message = JSON.parse(data.toString());
  
  if (message.audio) {
    // Received audio chunk (base64 encoded)
    const audioBuffer = Buffer.from(message.audio, 'base64');
    console.log(`Audio chunk: ${audioBuffer.length} bytes`);
    
    // Play audio immediately for real-time experience
    playAudioChunk(audioBuffer);
  }
  
  if (message.is_final) {
    console.log('Generation complete');
  }
});

// Send text for synthesis
function sendText(text: string) {
  ws.send(JSON.stringify({
    text: text
  }));
}

// Usage
sendText("Hello, this is real-time text-to-speech!");
sendText(" And this continues the conversation seamlessly.");

// Signal end of input and close the connection
function closeConnection() {
  ws.send(JSON.stringify({
    text: "" // An empty string signals end-of-input
  }));
  ws.close();
}

Multi-Context WebSocket

// Advanced multi-context WebSocket for concurrent streams
const multiWsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/stream-input-multi`;

const multiWs = new WebSocket(multiWsUrl, {
  headers: {
    'xi-api-key': process.env.ELEVENLABS_API_KEY
  }
});

multiWs.on('open', () => {
  console.log('Multi-context WebSocket connected');
  
  // Initialize multiple contexts
  const contexts = ['narrator', 'character1', 'character2'];
  
  contexts.forEach(contextId => {
    multiWs.send(JSON.stringify({
      context_id: contextId,
      text: " ", // Initial space for each context
      voice_settings: {
        stability: contextId === 'narrator' ? 0.7 : 0.4,
        similarity_boost: 0.8
      },
      generation_config: {
        chunk_length_schedule: [100, 140, 200, 250] // Faster for real-time
      }
    }));
  });
});

multiWs.on('message', (data) => {
  const message: WebsocketTtsServerMessageMulti = JSON.parse(data.toString());
  
  if (message.audio && message.context_id) {
    const audioBuffer = Buffer.from(message.audio, 'base64');
    console.log(`Audio from ${message.context_id}: ${audioBuffer.length} bytes`);
    
    // Route audio to appropriate playback channel
    playAudioForContext(message.context_id, audioBuffer);
  }
  
  if (message.is_final) {
    console.log(`Context ${message.context_id} completed`);
  }
});

// Send text to specific context
function sendTextToContext(contextId: string, text: string) {
  multiWs.send(JSON.stringify({
    context_id: contextId,
    text: text
  }));
}

// Usage: Multiple speakers in parallel
sendTextToContext('narrator', 'Once upon a time, in a distant land...');
sendTextToContext('character1', 'Hello there, stranger!');
sendTextToContext('character2', 'Welcome to our village.');

// Flush and close specific context
function closeContext(contextId: string) {
  multiWs.send(JSON.stringify({
    context_id: contextId,
    flush: true,
    close_context: true
  }));
}

// Close all contexts and connection
function closeAllContexts() {
  multiWs.send(JSON.stringify({
    close_socket: true
  }));
}

Streaming Interfaces

StreamTextToSpeechRequest

interface StreamTextToSpeechRequest {
  /** Text to convert to speech */
  text: string;
  
  /** Model optimized for streaming (e.g., "eleven_turbo_v2_5") */
  model_id?: string;
  
  /** Voice settings */
  voice_settings?: VoiceSettings;
  
  /** Output format (lower quality formats stream faster) */
  output_format?: TextToSpeechConvertRequestOutputFormat;
  
  /** 
   * Streaming latency optimization (0-4)
   * Higher values = lower latency but potentially lower quality
   */
  optimize_streaming_latency?: number;
  
  /** Disable logging for zero retention */
  enable_logging?: boolean;
  
  /** Language code */
  language_code?: string;
  
  /** Deterministic seed */
  seed?: number;
  
  /** Context for continuity */
  previous_text?: string;
  next_text?: string;
  previous_request_ids?: string[];
  next_request_ids?: string[];
}
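
For long passages synthesized across separate requests, the continuity fields help keep prosody consistent at the boundaries. A minimal sketch (the voice ID and sentences are placeholders):

// Stream one sentence while providing the surrounding text as context,
// so intonation flows naturally across separate requests
const continuedStream = await client.textToSpeech.convertAsStream(
  "21m00Tcm4TlvDq8ikWAM", // placeholder voice ID
  {
    text: "This sentence is synthesized on its own.",
    model_id: "eleven_turbo_v2_5",
    previous_text: "The sentence generated just before this one.",
    next_text: "The sentence that will be generated next.",
    optimize_streaming_latency: 2
  }
);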

WebSocket Message Types

// Client messages for multi-context WebSocket
interface WebsocketTtsClientMessageMulti {
  /** 
   * Text to synthesize
   * Use " " (space) for initial connection
   * Empty string for control messages
   */
  text?: string;
  
  /** Voice settings (first message only) */
  voice_settings?: RealtimeVoiceSettings;
  
  /** Generation configuration (first message only) */
  generation_config?: GenerationConfig;
  
  /** API key (if not in header) */
  "xi-api-key"?: string;
  
  /** Authorization token (if not in header) */
  authorization?: string;
  
  /** Flush audio buffer for context */
  flush?: boolean;
  
  /** Pronunciation dictionaries (first message only) */
  pronunciation_dictionary_locators?: PronunciationDictionaryLocator[];
  
  /** Context identifier for multi-context streaming */
  context_id?: string;
  
  /** Close specific context */
  close_context?: boolean;
  
  /** Close entire WebSocket connection */
  close_socket?: boolean;
}

// Server messages from multi-context WebSocket
interface WebsocketTtsServerMessageMulti {
  /** Base64-encoded audio chunk */
  audio?: string;
  
  /** Whether this is the final message for the context */
  is_final?: boolean;
  
  /** Character alignment information */
  normalizedAlignment?: NormalizedAlignment;
  alignment?: Alignment;
  
  /** Context identifier */
  context_id?: string;
}
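
The client-message type can be used to type-check control messages before they are sent. A small sketch, reusing the multiWs connection and the narrator context from the multi-context example above:

// Typed flush-and-close control message for a single context
const closeNarrator: WebsocketTtsClientMessageMulti = {
  context_id: "narrator",
  flush: true,
  close_context: true
};

multiWs.send(JSON.stringify(closeNarrator));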

Generation Configuration

interface GenerationConfig {
  /**
   * Chunk length schedule for streaming optimization
   * 
   * Controls when audio generation starts based on text buffer size
   * Default: [120, 160, 250, 290]
   * 
   * - First chunk: 120+ characters
   * - Second chunk: +160 characters  
   * - Third chunk: +250 characters
   * - Subsequent: +290 characters each
   * 
   * Lower values = faster response, potentially lower quality
   * Range: 50-500 per value
   */
  chunk_length_schedule?: number[];
}
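
A custom schedule is supplied in the first message for a connection (or context), as in the WebSocket examples above. A sketch of an aggressive schedule for a voice assistant, reusing the single-context ws connection (the values are illustrative, not recommendations):

// Illustrative low-latency schedule: generation starts after ~50 buffered characters
const assistantGenerationConfig: GenerationConfig = {
  chunk_length_schedule: [50, 90, 130, 170]
};

ws.send(JSON.stringify({
  text: " ", // initial handshake message
  voice_settings: { stability: 0.5, similarity_boost: 0.8 },
  generation_config: assistantGenerationConfig
}));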

Realtime Voice Settings

interface RealtimeVoiceSettings {
  /** Voice stability (0.0-1.0) */
  stability?: number;
  
  /** Similarity boost (0.0-1.0) */
  similarity_boost?: number;
  
  /** Style exaggeration (0.0-1.0) */
  style?: number;
  
  /** Speaker boost for enhanced similarity */
  use_speaker_boost?: boolean;
  
  /** Speech speed multiplier */
  speed?: number;
}

Advanced Streaming Patterns

Conversational Streaming

// Real-time conversational AI with streaming
class StreamingConversation {
  private ws: WebSocket;
  private audioQueue: Buffer[] = [];
  
  constructor(voiceId: string) {
    const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream-input`;
    this.ws = new WebSocket(wsUrl, {
      headers: { 'xi-api-key': process.env.ELEVENLABS_API_KEY }
    });
    
    this.ws.on('open', () => this.initialize());
    this.ws.on('message', (data) => this.handleAudioChunk(data));
  }
  
  private initialize() {
    this.ws.send(JSON.stringify({
      text: " ",
      voice_settings: { stability: 0.4, similarity_boost: 0.7 },
      generation_config: { chunk_length_schedule: [80, 120, 160, 200] }
    }));
  }
  
  private handleAudioChunk(data: any) {
    const message = JSON.parse(data.toString());
    if (message.audio) {
      const audioBuffer = Buffer.from(message.audio, 'base64');
      this.audioQueue.push(audioBuffer);
      this.playNextChunk();
    }
  }
  
  private playNextChunk() {
    if (this.audioQueue.length > 0) {
      const chunk = this.audioQueue.shift()!;
      // Play audio chunk immediately
      playAudioBuffer(chunk);
    }
  }
  
  speak(text: string) {
    this.ws.send(JSON.stringify({ text: text }));
  }
  
  interrupt() {
    // Signal end-of-input for the current generation and discard buffered audio
    this.ws.send(JSON.stringify({ text: "" }));
    this.audioQueue = []; // Clear buffered audio
  }
}

// Usage
const conversation = new StreamingConversation("pNInz6obpgDQGcFmaJgB");
conversation.speak("Hello! How can I help you today?");

Multi-Voice Streaming

// Stream multiple voices simultaneously
class MultiVoiceStreaming {
  private connections = new Map<string, WebSocket>();
  
  addVoice(name: string, voiceId: string) {
    const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream-input`;
    const ws = new WebSocket(wsUrl, {
      headers: { 'xi-api-key': process.env.ELEVENLABS_API_KEY }
    });
    
    ws.on('open', () => {
      ws.send(JSON.stringify({
        text: " ",
        voice_settings: { stability: 0.5, similarity_boost: 0.8 },
        generation_config: { chunk_length_schedule: [100, 150, 200, 250] }
      }));
    });
    
    ws.on('message', (data) => {
      const message = JSON.parse(data.toString());
      if (message.audio) {
        const audioBuffer = Buffer.from(message.audio, 'base64');
        this.playAudioForVoice(name, audioBuffer);
      }
    });
    
    this.connections.set(name, ws);
  }
  
  speak(voiceName: string, text: string) {
    const ws = this.connections.get(voiceName);
    if (ws && ws.readyState === WebSocket.OPEN) {
      ws.send(JSON.stringify({ text: text }));
    }
  }
  
  private playAudioForVoice(voiceName: string, audioBuffer: Buffer) {
    // Route to appropriate audio channel/speaker
    console.log(`Playing audio for ${voiceName}: ${audioBuffer.length} bytes`);
  }
}

// Usage
const multiVoice = new MultiVoiceStreaming();
multiVoice.addVoice('narrator', 'pNInz6obpgDQGcFmaJgB');
multiVoice.addVoice('character', '21m00Tcm4TlvDq8ikWAM');

multiVoice.speak('narrator', 'The story begins...');
multiVoice.speak('character', 'Hello, world!');

Buffered Streaming

// Intelligent buffering for smooth playback
class BufferedStreamingPlayer {
  private audioBuffer: Buffer[] = [];
  private isPlaying = false;
  private targetBufferSize = 5; // Number of chunks to buffer
  
  async startStreaming(voiceId: string, text: string) {
    const audioStream = await client.textToSpeech.convertAsStream(voiceId, {
      text: text,
      model_id: "eleven_turbo_v2_5",
      optimize_streaming_latency: 2,
      output_format: "mp3_22050_32"
    });
    
    audioStream.on('data', (chunk: Buffer) => {
      this.audioBuffer.push(chunk);
      
      // Start playing when buffer reaches target size
      if (!this.isPlaying && this.audioBuffer.length >= this.targetBufferSize) {
        this.startPlayback();
      }
    });
    
    audioStream.on('end', () => {
      // Play whatever is buffered, even if the target size was never reached
      if (!this.isPlaying && this.audioBuffer.length > 0) {
        this.startPlayback();
      }
      setTimeout(() => this.checkPlaybackComplete(), 100);
    });
  }
  
  private startPlayback() {
    this.isPlaying = true;
    this.playNextChunk();
  }
  
  private playNextChunk() {
    if (this.audioBuffer.length > 0) {
      const chunk = this.audioBuffer.shift()!;
      
      // Play chunk and schedule the next one when it finishes
      playAudioBuffer(chunk).then(() => {
        if (this.audioBuffer.length > 0) {
          this.playNextChunk();
        } else {
          this.isPlaying = false;
        }
      });
    } else {
      this.isPlaying = false;
    }
  }
  
  private checkPlaybackComplete() {
    if (this.audioBuffer.length === 0 && !this.isPlaying) {
      console.log('Playback completed');
    }
  }
}

Latency Optimization

Ultra-Low Latency Configuration

// Configuration for minimum possible latency
const ultraLowLatencyConfig = {
  model_id: "eleven_turbo_v2_5", // Fastest model
  optimize_streaming_latency: 4, // Maximum optimization
  output_format: "mp3_22050_32", // Lowest quality/fastest
  voice_settings: {
    stability: 0.5,
    similarity_boost: 0.7,
    style: 0, // Disable style for speed
    use_speaker_boost: false // Disable for speed
  },
  generation_config: {
    chunk_length_schedule: [50, 80, 120, 150] // Very aggressive chunking
  }
};

// Apply to streaming
const ultraFastStream = await client.textToSpeech.convertAsStream(voiceId, {
  text: "Ultra-fast streaming response",
  ...ultraLowLatencyConfig
});

Quality vs Latency Profiles

// Predefined profiles for different use cases
const streamingProfiles = {
  // Interactive chatbot - prioritize speed
  interactive: {
    optimize_streaming_latency: 4,
    output_format: "mp3_22050_32",
    generation_config: { chunk_length_schedule: [60, 100, 140, 180] }
  },
  
  // Conversational - balanced
  conversational: {
    optimize_streaming_latency: 2,
    output_format: "mp3_44100_64",
    generation_config: { chunk_length_schedule: [100, 150, 200, 250] }
  },
  
  // Presentation - prioritize quality
  presentation: {
    optimize_streaming_latency: 1,
    output_format: "mp3_44100_128",
    generation_config: { chunk_length_schedule: [150, 200, 300, 400] }
  }
};
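
Applying a profile is a spread into the streaming request, mirroring the ultra-low-latency example above (the voice ID and text are placeholders):

// Sketch: apply the conversational profile to a streaming request
const profiledStream = await client.textToSpeech.convertAsStream(
  "21m00Tcm4TlvDq8ikWAM",
  {
    text: "Profile-driven streaming request.",
    model_id: "eleven_turbo_v2_5",
    ...streamingProfiles.conversational
  }
);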

Error Handling and Recovery

class RobustStreamingClient {
  private ws?: WebSocket;
  private voiceId?: string;
  private reconnectAttempts = 0;
  private maxReconnectAttempts = 3;
  
  connect(voiceId: string) {
    this.voiceId = voiceId; // Remember the voice for reconnection
    const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream-input`;
    
    this.ws = new WebSocket(wsUrl, {
      headers: { 'xi-api-key': process.env.ELEVENLABS_API_KEY }
    });
    
    this.ws.on('open', () => {
      console.log('WebSocket connected');
      this.reconnectAttempts = 0;
      this.initialize();
    });
    
    this.ws.on('error', (error) => {
      console.error('WebSocket error:', error);
      this.handleConnectionError();
    });
    
    this.ws.on('close', (code, reason) => {
      console.log(`WebSocket closed: ${code} ${reason}`);
      if (code !== 1000) { // Not a normal close
        this.handleConnectionError();
      }
    });
    
    this.ws.on('message', (data) => {
      try {
        const message = JSON.parse(data.toString());
        this.handleMessage(message);
      } catch (error) {
        console.error('Message parsing error:', error);
      }
    });
  }
  
  private handleConnectionError() {
    if (this.reconnectAttempts < this.maxReconnectAttempts) {
      this.reconnectAttempts++;
      const delay = Math.pow(2, this.reconnectAttempts) * 1000; // Exponential backoff
      
      console.log(`Reconnecting in ${delay}ms (attempt ${this.reconnectAttempts})`);
      setTimeout(() => this.connect(this.voiceId!), delay);
    } else {
      console.error('Max reconnection attempts reached');
    }
  }
  
  private initialize() {
    this.ws?.send(JSON.stringify({
      text: " ",
      voice_settings: { stability: 0.5, similarity_boost: 0.8 }
    }));
  }
  
  private handleMessage(message: any) {
    if (message.audio) {
      const audioBuffer = Buffer.from(message.audio, 'base64');
      playAudioBuffer(audioBuffer); // Placeholder playback helper, as in the earlier examples
    }
  }
  
  speak(text: string) {
    if (this.ws && this.ws.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify({ text: text }));
    } else {
      console.error('WebSocket not connected');
    }
  }
}

Performance Monitoring

// Monitor streaming performance metrics
class StreamingMetrics {
  private startTime?: number;
  private firstChunkTime?: number;
  private totalChunks = 0;
  private totalBytes = 0;
  
  startRequest() {
    this.startTime = Date.now();
    this.firstChunkTime = undefined;
    this.totalChunks = 0;
    this.totalBytes = 0;
  }
  
  recordChunk(chunkSize: number) {
    if (!this.firstChunkTime) {
      this.firstChunkTime = Date.now();
    }
    
    this.totalChunks++;
    this.totalBytes += chunkSize;
  }
  
  getMetrics() {
    const now = Date.now();
    return {
      timeToFirstByte: this.firstChunkTime ? this.firstChunkTime - this.startTime! : 0,
      totalDuration: now - this.startTime!,
      avgChunkSize: this.totalBytes / this.totalChunks,
      chunksPerSecond: this.totalChunks / ((now - this.startTime!) / 1000),
      bytesPerSecond: this.totalBytes / ((now - this.startTime!) / 1000)
    };
  }
}

// Usage
const metrics = new StreamingMetrics();
metrics.startRequest();

audioStream.on('data', (chunk: Buffer) => {
  metrics.recordChunk(chunk.length);
});

audioStream.on('end', () => {
  console.log('Streaming metrics:', metrics.getMetrics());
});

Best Practices

  1. Model Selection: Use eleven_turbo_v2_5 for streaming, eleven_flash_v2_5 for ultra-low latency
  2. Latency Optimization: Set optimize_streaming_latency: 2-3 for balanced performance
  3. Output Format: Use lower quality formats (mp3_22050_32) for faster streaming
  4. Chunking: Adjust chunk_length_schedule based on latency requirements
  5. Buffering: Implement intelligent buffering to prevent audio gaps
  6. Error Handling: Always implement reconnection logic for WebSocket connections
  7. Memory Management: Process audio chunks immediately rather than accumulating them in memory (see the sketch after this list)
  8. Context Management: Use multi-context streaming for complex scenarios
  9. Performance Monitoring: Track metrics to optimize streaming performance
  10. Graceful Degradation: Have fallback strategies for connection failures
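
For item 7, one way to avoid accumulating chunks is to pipe the stream directly into a writable destination. A minimal sketch, assuming the SDK returns a Node.js Readable (as the event-based examples above assume) and using a file as a stand-in for a playback sink:

import { createWriteStream } from 'fs';
import { pipeline } from 'stream/promises';

// Each chunk flows straight from the API to the destination; nothing accumulates in memory
const stream = await client.textToSpeech.convertAsStream("21m00Tcm4TlvDq8ikWAM", {
  text: "Streamed straight to disk without buffering in memory.",
  model_id: "eleven_turbo_v2_5",
  output_format: "mp3_22050_32"
});

await pipeline(stream, createWriteStream("output.mp3"));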