or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

audio-processing.md conversational-ai.md history.md index.md projects-studio.md streaming.md text-to-speech.md utilities.md voice-management.md
tile.json

docs/text-to-speech.md

Text-to-Speech

Overview

The Text-to-Speech (TTS) API converts written text into natural-sounding speech using AI-generated voices. The ElevenLabs SDK provides comprehensive TTS functionality including basic conversion, streaming audio, and precise character-level timing synchronization.

Core Imports

import { 
  ElevenLabsClient,
  type TextToSpeechRequest,
  type StreamTextToSpeechRequest,
  type TextToSpeechWithTimestampsRequest,
  type StreamTextToSpeechWithTimestampsRequest,
  type VoiceSettings,
  type AudioWithTimestampsResponse,
  type StreamingAudioChunkWithTimestampsResponse,
  type TextToSpeechConvertRequestOutputFormat
} from 'elevenlabs';

Basic Text-to-Speech

Simple Conversion

const client = new ElevenLabsClient();

// Basic text-to-speech conversion
const audioStream = await client.textToSpeech.convert(
  "21m00Tcm4TlvDq8ikWAM", // Voice ID
  {
    text: "Hello world! This is generated speech.",
    model_id: "eleven_multilingual_v2"
  }
);

// The audioStream is a Node.js Readable stream containing the audio data

Complete Request Example

const audioStream = await client.textToSpeech.convert(
  "pNInz6obpgDQGcFmaJgB", // Voice ID (Sarah)
  {
    text: "Welcome to ElevenLabs! This is a comprehensive text-to-speech example.",
    model_id: "eleven_multilingual_v2",
    voice_settings: {
      stability: 0.5,
      similarity_boost: 0.8,
      style: 0.2,
      use_speaker_boost: true,
      speed: 1.0
    },
    output_format: "mp3_44100_128",
    optimize_streaming_latency: 1,
    enable_logging: true,
    language_code: "en",
    seed: 12345
  }
);

Request Types and Interfaces

TextToSpeechRequest

interface TextToSpeechRequest {
  /** The text that will get converted into speech */
  text: string;
  
  /** Model identifier (e.g., "eleven_multilingual_v2") */
  model_id?: string;
  
  /** Language code (ISO 639-1) for model enforcement */
  language_code?: string;
  
  /** Voice settings overriding stored settings */
  voice_settings?: VoiceSettings;
  
  /** Output format (default: "mp3_44100_128") */
  output_format?: TextToSpeechConvertRequestOutputFormat;
  
  /** 
   * Latency optimization level (0-4):
   * 0: default mode (no optimizations)
   * 1: normal optimizations (~50% improvement)
   * 2: strong optimizations (~75% improvement) 
   * 3: max optimizations
   * 4: max + text normalizer off (best latency)
   */
  optimize_streaming_latency?: number;
  
  /** 
   * Enable zero retention mode for enterprise customers
   * When false, history features are unavailable
   */
  enable_logging?: boolean;
  
  /** Pronunciation dictionary locators (max 3) */
  pronunciation_dictionary_locators?: PronunciationDictionaryVersionLocatorRequestModel[];
  
  /** Deterministic seed for reproducible results (0-4294967295) */
  seed?: number;
  
  /** Previous text for continuity */
  previous_text?: string;
  
  /** Next text for continuity */
  next_text?: string;
  
  /** Previous request IDs for continuity (max 3) */
  previous_request_ids?: string[];
  
  /** Next request IDs for continuity (max 3) */
  next_request_ids?: string[];
  
  /** Use IVC instead of PVC for lower latency */
  use_pvc_as_ivc?: boolean;
  
  /** Text normalization: 'auto' | 'on' | 'off' */
  apply_text_normalization?: BodyTextToSpeechV1TextToSpeechVoiceIdPostApplyTextNormalization;
  
  /** Language-specific text normalization (increases latency) */
  apply_language_text_normalization?: boolean;
}

VoiceSettings

interface VoiceSettings {
  /** 
   * Voice stability (0.0-1.0)
   * Lower values: broader emotional range
   * Higher values: more monotonous but consistent
   */
  stability?: number;
  
  /** 
   * Similarity boost (0.0-1.0)
   * How closely AI adheres to original voice
   */
  similarity_boost?: number;
  
  /** 
   * Style exaggeration (0.0-1.0)
   * Amplifies original speaker's style
   * Increases computational load and latency
   */
  style?: number;
  
  /** 
   * Speaker boost for enhanced similarity
   * Increases computational load
   */
  use_speaker_boost?: boolean;
  
  /** 
   * Speech speed multiplier
   * 1.0 = normal, <1.0 = slower, >1.0 = faster
   */
  speed?: number;
}

Output Formats

type TextToSpeechConvertRequestOutputFormat =
  // MP3 formats (various bitrates)
  | "mp3_22050_32"   // 22.05kHz, 32kbps
  | "mp3_44100_32"   // 44.1kHz, 32kbps
  | "mp3_44100_64"   // 44.1kHz, 64kbps
  | "mp3_44100_96"   // 44.1kHz, 96kbps
  | "mp3_44100_128"  // 44.1kHz, 128kbps (most common)
  | "mp3_44100_192"  // 44.1kHz, 192kbps (requires Creator tier+)
  
  // PCM formats (uncompressed)
  | "pcm_8000"       // 8kHz (telephony)
  | "pcm_16000"      // 16kHz
  | "pcm_22050"      // 22.05kHz
  | "pcm_24000"      // 24kHz
  | "pcm_44100"      // 44.1kHz (requires Pro tier+)
  | "pcm_48000"      // 48kHz
  
  // Telephony formats
  | "ulaw_8000"      // μ-law 8kHz (Twilio compatible)
  | "alaw_8000"      // A-law 8kHz
  
  // Opus formats (high quality, efficient)
  | "opus_48000_32"  // 48kHz, 32kbps
  | "opus_48000_64"  // 48kHz, 64kbps
  | "opus_48000_96"  // 48kHz, 96kbps
  | "opus_48000_128" // 48kHz, 128kbps
  | "opus_48000_192"; // 48kHz, 192kbps

Streaming Text-to-Speech

Real-time Streaming

For low-latency applications, use streaming to receive audio as it's generated:

const audioStream = await client.textToSpeech.convertAsStream(
  "pNInz6obpgDQGcFmaJgB", // Voice ID
  {
    text: "This text will be converted to speech in real-time chunks.",
    model_id: "eleven_turbo_v2_5", // Optimized for streaming
    optimize_streaming_latency: 3,
    output_format: "mp3_44100_128"
  }
);

// Process stream chunks as they arrive
audioStream.on('data', (chunk) => {
  console.log(`Received audio chunk: ${chunk.length} bytes`);
  // Handle audio chunk (e.g., play immediately, buffer, etc.)
});

audioStream.on('end', () => {
  console.log('Stream completed');
});

StreamTextToSpeechRequest

interface StreamTextToSpeechRequest {
  /** Text to convert to speech */
  text: string;
  
  /** Model ID optimized for streaming */
  model_id?: string;
  
  /** Voice settings */
  voice_settings?: VoiceSettings;
  
  /** Output format */
  output_format?: TextToSpeechConvertRequestOutputFormat;
  
  /** Streaming latency optimization (recommended: 2-3) */
  optimize_streaming_latency?: number;
  
  /** Disable logging for zero retention */
  enable_logging?: boolean;
  
  /** Language code */
  language_code?: string;
  
  /** Deterministic seed */
  seed?: number;
  
  /** Continuity parameters */
  previous_text?: string;
  next_text?: string;
  previous_request_ids?: string[];
  next_request_ids?: string[];
}

Text-to-Speech with Timestamps

Character-level Timing

Get precise character-level timing information for audio-text synchronization:

const result = await client.textToSpeech.convertWithTimestamps(
  "21m00Tcm4TlvDq8ikWAM",
  {
    text: "This is a test for precise character timing.",
    model_id: "eleven_multilingual_v2"
  }
);

console.log('Audio data:', result.audio_base64);
console.log('Alignment info:', result.alignment);

// Process character alignments
result.alignment.characters.forEach((char, index) => {
  console.log(`Character "${char.character}" at ${char.start_time_seconds}s - ${char.end_time_seconds}s`);
});

AudioWithTimestampsResponse

interface AudioWithTimestampsResponse {
  /** Base64-encoded audio data */
  audio_base64: string;
  
  /** Character-level timing alignment */
  alignment: AlignmentInfo;
}

interface AlignmentInfo {
  /** Array of character timing data */
  characters: CharacterAlignment[];
  
  /** Overall audio duration */
  duration_seconds: number;
}

interface CharacterAlignment {
  /** The character */
  character: string;
  
  /** Start time in seconds */
  start_time_seconds: number;
  
  /** End time in seconds */
  end_time_seconds: number;
  
  /** Character index in original text */
  character_index: number;
}

Streaming with Timestamps

Real-time Character Timing

Stream audio with character-level timing for real-time synchronization:

const timestampStream = await client.textToSpeech.streamWithTimestamps(
  "pNInz6obpgDQGcFmaJgB",
  {
    text: "Real-time streaming with character-level timing information.",
    model_id: "eleven_multilingual_v2",
    optimize_streaming_latency: 2
  }
);

// Process streaming chunks with timing data
for await (const chunk of timestampStream) {
  console.log('Audio chunk:', chunk.audio);
  console.log('Characters:', chunk.start_char_idx, '-', chunk.end_char_idx);
  console.log('Timing:', chunk.start_time_seconds, '-', chunk.end_time_seconds);
  
  // Synchronize audio playback with text highlighting
  // highlightText(chunk.start_char_idx, chunk.end_char_idx);
  // playAudioChunk(chunk.audio);
}

StreamingAudioChunkWithTimestampsResponse

interface StreamingAudioChunkWithTimestampsResponse {
  /** Base64-encoded audio chunk */
  audio: string;
  
  /** Start character index in original text */
  start_char_idx: number;
  
  /** End character index in original text */
  end_char_idx: number;
  
  /** Start time of this chunk in seconds */
  start_time_seconds: number;
  
  /** End time of this chunk in seconds */
  end_time_seconds: number;
  
  /** Whether this is the final chunk */
  is_final?: boolean;
}

Model Selection

Available Models

// Get available models
const models = await client.models.getAll();

// Filter TTS-capable models
const ttsModels = models.filter(model => model.can_do_text_to_speech);

console.log('Available TTS models:');
ttsModels.forEach(model => {
  console.log(`- ${model.model_id}: ${model.name}`);
  console.log(`  Languages: ${model.languages?.join(', ')}`);
  console.log(`  Max characters: ${model.max_characters_request_free_user}`);
});

Recommended Models

// High-quality multilingual (default)
const audio1 = await client.textToSpeech.convert(voiceId, {
  text: "Hello world",
  model_id: "eleven_multilingual_v2"
});

// Optimized for streaming/low latency
const audio2 = await client.textToSpeech.convertAsStream(voiceId, {
  text: "Real-time speech",
  model_id: "eleven_turbo_v2_5",
  optimize_streaming_latency: 3
});

// Flash model for fastest generation
const audio3 = await client.textToSpeech.convert(voiceId, {
  text: "Ultra-fast generation",
  model_id: "eleven_flash_v2_5"
});

Advanced Features

Pronunciation Dictionaries

// Apply custom pronunciation rules
const audioWithPronunciation = await client.textToSpeech.convert(voiceId, {
  text: "The CEO of ACME Corp will speak about AI/ML technologies.",
  pronunciation_dictionary_locators: [
    {
      pronunciation_dictionary_id: "dictionary_id_1",
      version_id: "version_1"
    }
  ]
});

Deterministic Generation

// Generate reproducible audio with seed
const deterministicAudio = await client.textToSpeech.convert(voiceId, {
  text: "This will sound identical each time",
  seed: 42,
  model_id: "eleven_multilingual_v2"
});

// Same seed = same audio output
const identicalAudio = await client.textToSpeech.convert(voiceId, {
  text: "This will sound identical each time",
  seed: 42, // Same seed
  model_id: "eleven_multilingual_v2"
});

Context Continuity

// Improve speech flow with context
const audioWithContext = await client.textToSpeech.convert(voiceId, {
  text: "This sentence will flow naturally.",
  previous_text: "Welcome to our presentation.",
  next_text: "Let's explore the key features.",
  model_id: "eleven_multilingual_v2"
});

// Chain multiple generations for long content
const firstAudio = await client.textToSpeech.convert(voiceId, {
  text: "Chapter one begins here.",
  model_id: "eleven_multilingual_v2"
});

// Store the request ID for continuity
const firstRequestId = firstAudio.headers?.get('request-id');

const secondAudio = await client.textToSpeech.convert(voiceId, {
  text: "Chapter two continues the story.",
  previous_request_ids: [firstRequestId],
  model_id: "eleven_multilingual_v2"
});

Text Normalization Control

// Control how numbers, dates, and abbreviations are spoken
const normalizedAudio = await client.textToSpeech.convert(voiceId, {
  text: "On 12/25/2023, we sold $1,500 worth of items (approx. 150 units).",
  apply_text_normalization: "on", // "on", "off", or "auto"
  model_id: "eleven_multilingual_v2"
});

// Language-specific normalization (Japanese example)
const japaneseAudio = await client.textToSpeech.convert(voiceId, {
  text: "こんにちは、世界!今日は2023年12月25日です。",
  language_code: "ja",
  apply_language_text_normalization: true,
  model_id: "eleven_multilingual_v2"
});

Error Handling

import { ElevenLabsError, ElevenLabsTimeoutError } from 'elevenlabs';

try {
  const audio = await client.textToSpeech.convert(voiceId, {
    text: "Test audio generation"
  });
  
  // Process audio stream
  await processAudio(audio);
  
} catch (error) {
  if (error instanceof ElevenLabsTimeoutError) {
    console.error('TTS request timed out:', error.message);
    // Implement retry logic or fallback
    
  } else if (error instanceof ElevenLabsError) {
    console.error('TTS API error:', error.statusCode);
    
    if (error.statusCode === 422) {
      console.error('Validation error:', error.body);
      // Handle invalid parameters
      
    } else if (error.statusCode === 401) {
      console.error('Authentication error - check API key');
      
    } else if (error.statusCode === 429) {
      console.error('Rate limit exceeded');
      // Implement backoff strategy
    }
    
  } else {
    console.error('Unexpected error:', error);
  }
}

Performance Optimization

Streaming vs Batch

// For real-time applications - use streaming
const realtimeAudio = await client.textToSpeech.convertAsStream(voiceId, {
  text: "Live announcement or chatbot response",
  optimize_streaming_latency: 3,
  model_id: "eleven_turbo_v2_5"
});

// For pre-generated content - use batch
const batchAudio = await client.textToSpeech.convert(voiceId, {
  text: "Pre-recorded announcement or content",
  model_id: "eleven_multilingual_v2"
});

Latency Optimization

// Maximum speed configuration
const fastestAudio = await client.textToSpeech.convertAsStream(voiceId, {
  text: "Ultra-low latency speech generation",
  model_id: "eleven_turbo_v2_5",
  optimize_streaming_latency: 4, // Max optimization + no text normalizer
  apply_text_normalization: "off",
  use_pvc_as_ivc: true, // Use faster IVC version
  output_format: "mp3_22050_32" // Lower quality for speed
});

Memory-Efficient Processing

import { pipeline } from 'stream';
import { promisify } from 'util';

const pipelineAsync = promisify(pipeline);

// Stream processing without loading entire audio into memory
const audioStream = await client.textToSpeech.convertAsStream(voiceId, {
  text: "Large audio content that should be streamed"
});

// Pipe directly to file or another stream
await pipelineAsync(
  audioStream,
  fs.createWriteStream('output.mp3')
);

Best Practices

  1. Model Selection: Use eleven_turbo_v2_5 for streaming, eleven_multilingual_v2 for quality
  2. Latency: Set optimize_streaming_latency: 2-3 for balanced quality/speed
  3. Output Format: Use mp3_44100_128 for general use, lower bitrates for real-time
  4. Voice Settings: Start with stability: 0.5, similarity_boost: 0.8
  5. Context: Use previous_text/next_text for better flow in long content
  6. Error Handling: Always handle rate limits and validation errors
  7. Memory: Use streaming for large content or real-time applications
  8. Caching: Cache frequently used audio to reduce API calls