The Text-to-Speech (TTS) API converts written text into natural-sounding speech using AI-generated voices. The ElevenLabs SDK provides comprehensive TTS functionality including basic conversion, streaming audio, and precise character-level timing synchronization.

```typescript
import {
  ElevenLabsClient,
  type TextToSpeechRequest,
  type StreamTextToSpeechRequest,
  type TextToSpeechWithTimestampsRequest,
  type StreamTextToSpeechWithTimestampsRequest,
  type VoiceSettings,
  type AudioWithTimestampsResponse,
  type StreamingAudioChunkWithTimestampsResponse,
  type TextToSpeechConvertRequestOutputFormat
} from 'elevenlabs';

const client = new ElevenLabsClient();
```
```typescript
// Basic text-to-speech conversion
const audioStream = await client.textToSpeech.convert(
  "21m00Tcm4TlvDq8ikWAM", // Voice ID
  {
    text: "Hello world! This is generated speech.",
    model_id: "eleven_multilingual_v2"
  }
);
// The audioStream is a Node.js Readable stream containing the audio data
```
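One common way to consume the result is to collect the chunks into a single `Buffer`. This is a minimal sketch, assuming the returned value is a standard Node.js `Readable`; the `streamToBuffer` helper is not part of the SDK:

```typescript
import { Readable } from 'stream';

// Collect an audio stream into one Buffer (e.g., to write to disk or an HTTP response)
async function streamToBuffer(stream: Readable): Promise<Buffer> {
  const chunks: Buffer[] = [];
  for await (const chunk of stream) {
    chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
  }
  return Buffer.concat(chunks);
}

const audioBuffer = await streamToBuffer(audioStream);
```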
"pNInz6obpgDQGcFmaJgB", // Voice ID (Sarah)
{
text: "Welcome to ElevenLabs! This is a comprehensive text-to-speech example.",
model_id: "eleven_multilingual_v2",
voice_settings: {
stability: 0.5,
similarity_boost: 0.8,
style: 0.2,
use_speaker_boost: true,
speed: 1.0
},
output_format: "mp3_44100_128",
optimize_streaming_latency: 1,
enable_logging: true,
language_code: "en",
seed: 12345
}
);interface TextToSpeechRequest {
The full request interface:

```typescript
interface TextToSpeechRequest {
  /** The text that will get converted into speech */
  text: string;
  /** Model identifier (e.g., "eleven_multilingual_v2") */
  model_id?: string;
  /** Language code (ISO 639-1) for model enforcement */
  language_code?: string;
  /** Voice settings overriding stored settings */
  voice_settings?: VoiceSettings;
  /** Output format (default: "mp3_44100_128") */
  output_format?: TextToSpeechConvertRequestOutputFormat;
  /**
   * Latency optimization level (0-4):
   * 0: default mode (no optimizations)
   * 1: normal optimizations (~50% improvement)
   * 2: strong optimizations (~75% improvement)
   * 3: max optimizations
   * 4: max + text normalizer off (best latency)
   */
  optimize_streaming_latency?: number;
  /**
   * When false, enables zero retention mode (enterprise customers only);
   * history features become unavailable
   */
  enable_logging?: boolean;
  /** Pronunciation dictionary locators (max 3) */
  pronunciation_dictionary_locators?: PronunciationDictionaryVersionLocatorRequestModel[];
  /** Deterministic seed for reproducible results (0-4294967295) */
  seed?: number;
  /** Previous text for continuity */
  previous_text?: string;
  /** Next text for continuity */
  next_text?: string;
  /** Previous request IDs for continuity (max 3) */
  previous_request_ids?: string[];
  /** Next request IDs for continuity (max 3) */
  next_request_ids?: string[];
  /** Use IVC instead of PVC for lower latency */
  use_pvc_as_ivc?: boolean;
  /** Text normalization: 'auto' | 'on' | 'off' */
  apply_text_normalization?: BodyTextToSpeechV1TextToSpeechVoiceIdPostApplyTextNormalization;
  /** Language-specific text normalization (increases latency) */
  apply_language_text_normalization?: boolean;
}
```
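Since `TextToSpeechRequest` is exported (see the imports above), request objects can also be typed explicitly. A small illustrative sketch:

```typescript
const request: TextToSpeechRequest = {
  text: "A typed request object.",
  model_id: "eleven_multilingual_v2",
  output_format: "mp3_44100_128",
  seed: 7 // any value in the documented 0-4294967295 range
};

const audio = await client.textToSpeech.convert("21m00Tcm4TlvDq8ikWAM", request);
```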
Voice settings control the trade-off between expressiveness and consistency:

```typescript
interface VoiceSettings {
  /**
   * Voice stability (0.0-1.0)
   * Lower values: broader emotional range
   * Higher values: more monotonous but consistent
   */
  stability?: number;
  /**
   * Similarity boost (0.0-1.0)
   * How closely the AI adheres to the original voice
   */
  similarity_boost?: number;
  /**
   * Style exaggeration (0.0-1.0)
   * Amplifies the original speaker's style
   * Increases computational load and latency
   */
  style?: number;
  /**
   * Speaker boost for enhanced similarity
   * Increases computational load
   */
  use_speaker_boost?: boolean;
  /**
   * Speech speed multiplier
   * 1.0 = normal, <1.0 = slower, >1.0 = faster
   */
  speed?: number;
}
```
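As a rough illustration of how these settings combine (the values here are illustrative starting points, not API requirements):

```typescript
// Expressive delivery: low stability widens the emotional range
const expressive: VoiceSettings = {
  stability: 0.3,
  similarity_boost: 0.8,
  style: 0.6 // stronger style exaggeration costs latency
};

// Steady narration: high stability keeps long reads consistent
const narration: VoiceSettings = {
  stability: 0.75,
  similarity_boost: 0.75,
  style: 0.0,
  use_speaker_boost: true
};
```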
Supported output formats:

```typescript
type TextToSpeechConvertRequestOutputFormat =
  // MP3 formats (various bitrates)
  | "mp3_22050_32"    // 22.05kHz, 32kbps
  | "mp3_44100_32"    // 44.1kHz, 32kbps
  | "mp3_44100_64"    // 44.1kHz, 64kbps
  | "mp3_44100_96"    // 44.1kHz, 96kbps
  | "mp3_44100_128"   // 44.1kHz, 128kbps (most common)
  | "mp3_44100_192"   // 44.1kHz, 192kbps (requires Creator tier+)
  // PCM formats (uncompressed)
  | "pcm_8000"        // 8kHz (telephony)
  | "pcm_16000"       // 16kHz
  | "pcm_22050"       // 22.05kHz
  | "pcm_24000"       // 24kHz
  | "pcm_44100"       // 44.1kHz (requires Pro tier+)
  | "pcm_48000"       // 48kHz
  // Telephony formats
  | "ulaw_8000"       // μ-law 8kHz (Twilio compatible)
  | "alaw_8000"       // A-law 8kHz
  // Opus formats (high quality, efficient)
  | "opus_48000_32"   // 48kHz, 32kbps
  | "opus_48000_64"   // 48kHz, 64kbps
  | "opus_48000_96"   // 48kHz, 96kbps
  | "opus_48000_128"  // 48kHz, 128kbps
  | "opus_48000_192"; // 48kHz, 192kbps
```

For low-latency applications, use streaming to receive audio as it's generated:
```typescript
const audioStream = await client.textToSpeech.convertAsStream(
  "pNInz6obpgDQGcFmaJgB", // Voice ID
  {
    text: "This text will be converted to speech in real-time chunks.",
    model_id: "eleven_turbo_v2_5", // Optimized for streaming
    optimize_streaming_latency: 3,
    output_format: "mp3_44100_128"
  }
);

// Process stream chunks as they arrive
audioStream.on('data', (chunk) => {
  console.log(`Received audio chunk: ${chunk.length} bytes`);
  // Handle audio chunk (e.g., play immediately, buffer, etc.)
});

audioStream.on('end', () => {
  console.log('Stream completed');
});
```
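In a server context, the stream can be forwarded to the caller as it arrives. A minimal sketch, assuming an Express app (the route, voice ID, and query handling are placeholders, not part of the SDK):

```typescript
import express from 'express';

const app = express();

app.get('/tts', async (req, res) => {
  const stream = await client.textToSpeech.convertAsStream(
    "pNInz6obpgDQGcFmaJgB",
    {
      text: String(req.query.text ?? "Hello from the TTS endpoint."),
      model_id: "eleven_turbo_v2_5",
      optimize_streaming_latency: 3
    }
  );
  res.setHeader('Content-Type', 'audio/mpeg');
  stream.pipe(res); // chunks reach the client as they are generated
});
```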
The streaming request accepts the following parameters:

```typescript
interface StreamTextToSpeechRequest {
  /** Text to convert to speech */
  text: string;
  /** Model ID optimized for streaming */
  model_id?: string;
  /** Voice settings */
  voice_settings?: VoiceSettings;
  /** Output format */
  output_format?: TextToSpeechConvertRequestOutputFormat;
  /** Streaming latency optimization (recommended: 2-3) */
  optimize_streaming_latency?: number;
  /** Disable logging for zero retention */
  enable_logging?: boolean;
  /** Language code */
  language_code?: string;
  /** Deterministic seed */
  seed?: number;
  /** Continuity parameters */
  previous_text?: string;
  next_text?: string;
  previous_request_ids?: string[];
  next_request_ids?: string[];
}
```

Get precise character-level timing information for audio-text synchronization:
```typescript
const result = await client.textToSpeech.convertWithTimestamps(
  "21m00Tcm4TlvDq8ikWAM",
  {
    text: "This is a test for precise character timing.",
    model_id: "eleven_multilingual_v2"
  }
);

console.log('Audio data:', result.audio_base64);
console.log('Alignment info:', result.alignment);

// Process character alignments
result.alignment.characters.forEach((char) => {
  console.log(`Character "${char.character}" at ${char.start_time_seconds}s - ${char.end_time_seconds}s`);
});
```
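Because the audio arrives base64-encoded rather than as a stream, persisting it is a simple decode. A sketch using Node's `fs/promises` (the file names are arbitrary):

```typescript
import { writeFile } from 'fs/promises';

// Decode the base64 payload and keep the alignment data alongside it
await writeFile('speech.mp3', Buffer.from(result.audio_base64, 'base64'));
await writeFile('alignment.json', JSON.stringify(result.alignment, null, 2));
```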
The response and alignment types:

```typescript
interface AudioWithTimestampsResponse {
  /** Base64-encoded audio data */
  audio_base64: string;
  /** Character-level timing alignment */
  alignment: AlignmentInfo;
}

interface AlignmentInfo {
  /** Array of character timing data */
  characters: CharacterAlignment[];
  /** Overall audio duration */
  duration_seconds: number;
}

interface CharacterAlignment {
  /** The character */
  character: string;
  /** Start time in seconds */
  start_time_seconds: number;
  /** End time in seconds */
  end_time_seconds: number;
  /** Character index in original text */
  character_index: number;
}
```
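Character timings can be rolled up into word-level timings for subtitle-style highlighting. A small sketch against the interfaces above (`WordTiming` and `toWordTimings` are illustrative helpers, not SDK exports):

```typescript
interface WordTiming {
  word: string;
  start: number;
  end: number;
}

// Group consecutive non-whitespace characters into words with start/end times
function toWordTimings(chars: CharacterAlignment[]): WordTiming[] {
  const words: WordTiming[] = [];
  let current: WordTiming | null = null;
  for (const c of chars) {
    if (/\s/.test(c.character)) {
      current = null; // whitespace closes the current word
      continue;
    }
    if (!current) {
      current = { word: '', start: c.start_time_seconds, end: c.end_time_seconds };
      words.push(current);
    }
    current.word += c.character;
    current.end = c.end_time_seconds;
  }
  return words;
}

const wordTimings = toWordTimings(result.alignment.characters);
```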
Stream audio with character-level timing for real-time synchronization:
```typescript
const timestampStream = await client.textToSpeech.streamWithTimestamps(
  "pNInz6obpgDQGcFmaJgB",
  {
    text: "Real-time streaming with character-level timing information.",
    model_id: "eleven_multilingual_v2",
    optimize_streaming_latency: 2
  }
);

// Process streaming chunks with timing data
for await (const chunk of timestampStream) {
  console.log('Audio chunk:', chunk.audio);
  console.log('Characters:', chunk.start_char_idx, '-', chunk.end_char_idx);
  console.log('Timing:', chunk.start_time_seconds, '-', chunk.end_time_seconds);

  // Synchronize audio playback with text highlighting
  // highlightText(chunk.start_char_idx, chunk.end_char_idx);
  // playAudioChunk(chunk.audio);
}
```
Each streamed chunk has this shape:

```typescript
interface StreamingAudioChunkWithTimestampsResponse {
  /** Base64-encoded audio chunk */
  audio: string;
  /** Start character index in original text */
  start_char_idx: number;
  /** End character index in original text */
  end_char_idx: number;
  /** Start time of this chunk in seconds */
  start_time_seconds: number;
  /** End time of this chunk in seconds */
  end_time_seconds: number;
  /** Whether this is the final chunk */
  is_final?: boolean;
}
```
Query the available models to choose one:

```typescript
// Get available models
const models = await client.models.getAll();

// Filter TTS-capable models
const ttsModels = models.filter(model => model.can_do_text_to_speech);

console.log('Available TTS models:');
ttsModels.forEach(model => {
  console.log(`- ${model.model_id}: ${model.name}`);
  console.log(`  Languages: ${model.languages?.join(', ')}`);
  console.log(`  Max characters: ${model.max_characters_request_free_user}`);
});
```
Typical model choices:

```typescript
// High-quality multilingual (default)
const audio1 = await client.textToSpeech.convert(voiceId, {
  text: "Hello world",
  model_id: "eleven_multilingual_v2"
});

// Optimized for streaming/low latency
const audio2 = await client.textToSpeech.convertAsStream(voiceId, {
  text: "Real-time speech",
  model_id: "eleven_turbo_v2_5",
  optimize_streaming_latency: 3
});

// Flash model for fastest generation
const audio3 = await client.textToSpeech.convert(voiceId, {
  text: "Ultra-fast generation",
  model_id: "eleven_flash_v2_5"
});
```
Pronunciation dictionaries apply custom rules to specific terms:

```typescript
// Apply custom pronunciation rules
const audioWithPronunciation = await client.textToSpeech.convert(voiceId, {
  text: "The CEO of ACME Corp will speak about AI/ML technologies.",
  pronunciation_dictionary_locators: [
    {
      pronunciation_dictionary_id: "dictionary_id_1",
      version_id: "version_1"
    }
  ]
});
```
A fixed seed makes output reproducible:

```typescript
// Generate reproducible audio with a fixed seed
const deterministicAudio = await client.textToSpeech.convert(voiceId, {
  text: "This will sound identical each time",
  seed: 42,
  model_id: "eleven_multilingual_v2"
});

// Same seed + same parameters should yield the same audio (determinism is best-effort)
const identicalAudio = await client.textToSpeech.convert(voiceId, {
  text: "This will sound identical each time",
  seed: 42, // Same seed
  model_id: "eleven_multilingual_v2"
});
```
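To sanity-check reproducibility, collect both streams and compare the bytes (this reuses the illustrative `streamToBuffer` helper from the basic example above):

```typescript
const bufferA = await streamToBuffer(deterministicAudio);
const bufferB = await streamToBuffer(identicalAudio);
console.log('Byte-identical:', bufferA.equals(bufferB));
```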
Context parameters improve prosody across segments:

```typescript
// Improve speech flow with context
const audioWithContext = await client.textToSpeech.convert(voiceId, {
  text: "This sentence will flow naturally.",
  previous_text: "Welcome to our presentation.",
  next_text: "Let's explore the key features.",
  model_id: "eleven_multilingual_v2"
});

// Chain multiple generations for long content
const firstAudio = await client.textToSpeech.convert(voiceId, {
  text: "Chapter one begins here.",
  model_id: "eleven_multilingual_v2"
});

// Continuity is driven by the `request-id` response header from the previous call.
// How you read response headers depends on your SDK version; this accessor is illustrative.
const firstRequestId = firstAudio.headers?.get('request-id');

const secondAudio = await client.textToSpeech.convert(voiceId, {
  text: "Chapter two continues the story.",
  previous_request_ids: [firstRequestId],
  model_id: "eleven_multilingual_v2"
});
```
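The same pattern extends to longer content by splitting it into segments and giving each one its neighbors as context. A sketch (`longText` is a placeholder):

```typescript
const longText = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
const segments = longText.split('\n\n');

const audioParts = [];
for (let i = 0; i < segments.length; i++) {
  audioParts.push(await client.textToSpeech.convert(voiceId, {
    text: segments[i],
    previous_text: segments[i - 1], // undefined at the edges, which is fine for optional fields
    next_text: segments[i + 1],
    model_id: "eleven_multilingual_v2"
  }));
}
```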
Two normalization controls are available:

```typescript
// Control how numbers, dates, and abbreviations are spoken
const normalizedAudio = await client.textToSpeech.convert(voiceId, {
  text: "On 12/25/2023, we sold $1,500 worth of items (approx. 150 units).",
  apply_text_normalization: "on", // "on", "off", or "auto"
  model_id: "eleven_multilingual_v2"
});

// Language-specific normalization (Japanese example)
const japaneseAudio = await client.textToSpeech.convert(voiceId, {
  text: "こんにちは、世界!今日は2023年12月25日です。",
  language_code: "ja",
  apply_language_text_normalization: true,
  model_id: "eleven_multilingual_v2"
});
```

Handle errors explicitly so that timeouts, validation failures, and rate limits can each be treated appropriately:
```typescript
import { ElevenLabsError, ElevenLabsTimeoutError } from 'elevenlabs';

try {
  const audio = await client.textToSpeech.convert(voiceId, {
    text: "Test audio generation"
  });
  // Process the audio stream
  await processAudio(audio);
} catch (error) {
  if (error instanceof ElevenLabsTimeoutError) {
    console.error('TTS request timed out:', error.message);
    // Implement retry logic or a fallback
  } else if (error instanceof ElevenLabsError) {
    console.error('TTS API error:', error.statusCode);
    if (error.statusCode === 422) {
      console.error('Validation error:', error.body);
      // Handle invalid parameters
    } else if (error.statusCode === 401) {
      console.error('Authentication error - check API key');
    } else if (error.statusCode === 429) {
      console.error('Rate limit exceeded');
      // Implement a backoff strategy (see the sketch below)
    }
  } else {
    console.error('Unexpected error:', error);
  }
}
```
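One way to act on the 429 branch is an exponential-backoff wrapper. A sketch (the retry counts and delays are arbitrary choices, and `withBackoff` is not an SDK function):

```typescript
async function withBackoff<T>(fn: () => Promise<T>, maxRetries = 3): Promise<T> {
  for (let attempt = 0; ; attempt++) {
    try {
      return await fn();
    } catch (error) {
      const retryable = error instanceof ElevenLabsError && error.statusCode === 429;
      if (!retryable || attempt >= maxRetries) throw error;
      const delayMs = 1000 * 2 ** attempt; // 1s, 2s, 4s, ...
      await new Promise(resolve => setTimeout(resolve, delayMs));
    }
  }
}

const audio = await withBackoff(() =>
  client.textToSpeech.convert(voiceId, { text: "Test audio generation" })
);
```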
Choose streaming or batch conversion by use case:

```typescript
// For real-time applications, use streaming
const realtimeAudio = await client.textToSpeech.convertAsStream(voiceId, {
  text: "Live announcement or chatbot response",
  optimize_streaming_latency: 3,
  model_id: "eleven_turbo_v2_5"
});

// For pre-generated content, use batch conversion
const batchAudio = await client.textToSpeech.convert(voiceId, {
  text: "Pre-recorded announcement or content",
  model_id: "eleven_multilingual_v2"
});
```
For the lowest possible latency, combine every speed option:

```typescript
// Maximum-speed configuration
const fastestAudio = await client.textToSpeech.convertAsStream(voiceId, {
  text: "Ultra-low latency speech generation",
  model_id: "eleven_turbo_v2_5",
  optimize_streaming_latency: 4, // Max optimization + no text normalizer
  apply_text_normalization: "off",
  use_pvc_as_ivc: true, // Use the faster IVC version of the voice
  output_format: "mp3_22050_32" // Lower quality in exchange for speed
});
```
To avoid holding large audio in memory, pipe the stream directly to its destination:

```typescript
import fs from 'fs';
import { pipeline } from 'stream';
import { promisify } from 'util';

const pipelineAsync = promisify(pipeline);

// Stream processing without loading the entire audio into memory
const audioStream = await client.textToSpeech.convertAsStream(voiceId, {
  text: "Large audio content that should be streamed"
});

// Pipe directly to a file or another stream
await pipelineAsync(
  audioStream,
  fs.createWriteStream('output.mp3')
);
```

Best practices (combined in the sketch below):

- Models: `eleven_turbo_v2_5` for streaming, `eleven_multilingual_v2` for quality
- Latency: `optimize_streaming_latency: 2-3` for balanced quality/speed
- Output format: `mp3_44100_128` for general use, lower bitrates for real-time
- Voice settings: `stability: 0.5`, `similarity_boost: 0.8` as starting points
- Continuity: `previous_text`/`next_text` for better flow in long content
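Pulled together, those recommendations look like this (the values are suggested starting points, not requirements):

```typescript
const recommendedDefaults = {
  model_id: "eleven_multilingual_v2", // swap in "eleven_turbo_v2_5" for streaming
  output_format: "mp3_44100_128" as const,
  voice_settings: { stability: 0.5, similarity_boost: 0.8 }
};

const audio = await client.textToSpeech.convert(voiceId, {
  text: "Production-ready starting point.",
  ...recommendedDefaults
});
```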