Text-to-Speech

Convert text into lifelike speech using AI voices with support for multiple models, voice settings, and streaming responses. Includes options for character-level timing information.

Quick Reference

import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";

const client = new ElevenLabsClient({ apiKey: "your-api-key" });
// Access this API via: client.textToSpeech

Capabilities

Convert Text to Speech

Converts text to speech and returns an audio stream.

/**
 * @param voice_id - Voice ID to be used
 * @param request - Text and configuration options
 * @param requestOptions - Optional request configuration
 * @returns ReadableStream of audio chunks (Uint8Array)
 * @throws UnprocessableEntityError if request fails
 */
client.textToSpeech.convert(
  voice_id: string,
  request: BodyTextToSpeechFull,
  requestOptions?: RequestOptions
): HttpResponsePromise<ReadableStream<Uint8Array>>;

interface BodyTextToSpeechFull {
  /** The text to convert to speech */
  text: string;
  /** Model ID to use (e.g., "eleven_multilingual_v2", "eleven_turbo_v2_5") */
  modelId?: string;
  /** Language code (ISO 639-1) used to enforce a language for the model and text normalization */
  languageCode?: string;
  /** Voice settings to override defaults */
  voiceSettings?: VoiceSettings;
  /** Pronunciation dictionary version IDs to apply (max 3) */
  pronunciationDictionaryLocators?: PronunciationDictionaryVersionLocator[];
  /** Seed for reproducible generation (integer between 0 and 4294967295) */
  seed?: number;
  /** Previous text for context continuity */
  previousText?: string;
  /** Next text for context continuity */
  nextText?: string;
  /** Previous request IDs for context continuity (max 3) */
  previousRequestIds?: string[];
  /** Next request IDs for context continuity (max 3) */
  nextRequestIds?: string[];
  /** If true, use IVC version of voice instead of PVC (temporary workaround for latency) */
  usePvcAsIvc?: boolean;
  /** Text normalization mode: 'auto' (default), 'on', or 'off' */
  applyTextNormalization?: ApplyTextNormalizationEnum;
  /** Language text normalization for pronunciation. WARNING: Increases latency. Currently only supports Japanese */
  applyLanguageTextNormalization?: boolean;
  /** Enable logging (when false, zero retention mode is used - enterprise only) */
  enableLogging?: boolean;
  /** Latency optimization level (0-4). 0: default, 1-3: progressive optimization, 4: max with text normalizer off */
  optimizeStreamingLatency?: number;
  /** Output format (e.g., "mp3_44100_128", "pcm_16000"). MP3 192kbps requires Creator+, PCM 44.1kHz requires Pro+ */
  outputFormat?: string;
}

interface VoiceSettings {
  /** Stability (0.0 to 1.0) */
  stability?: number;
  /** Similarity boost (0.0 to 1.0) */
  similarityBoost?: number;
  /** Style exaggeration (0.0 to 1.0) */
  style?: number;
  /** Use speaker boost */
  useSpeakerBoost?: boolean;
  /** Adjusts the speed of the voice. 1.0 is default speed */
  speed?: number;
}

interface PronunciationDictionaryVersionLocator {
  pronunciationDictionaryId: string;
  /** Version ID. If not provided, the latest version will be used */
  versionId?: string;
}

enum ApplyTextNormalizationEnum {
  Auto = "auto",
  On = "on",
  Off = "off",
}
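For reference, a minimal sketch of saving the converted audio to disk (assumes Node.js 18+, where the returned ReadableStream is async-iterable; the voice ID, API key, and output filename are placeholders):

import { writeFile } from "node:fs/promises";
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";

const client = new ElevenLabsClient({ apiKey: "your-api-key" });

// Convert text and collect the audio stream into a single buffer
const audioStream = await client.textToSpeech.convert("voice-id", {
  text: "Saved to disk.",
  modelId: "eleven_multilingual_v2",
  outputFormat: "mp3_44100_128",
});

const chunks: Uint8Array[] = [];
for await (const chunk of audioStream) {
  chunks.push(chunk);
}

// Write the concatenated MP3 bytes to a local file
await writeFile("output.mp3", Buffer.concat(chunks));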

Stream Text to Speech

Converts text to speech with streaming audio output.

/**
 * @param voice_id - Voice ID to be used
 * @param request - Text and configuration for streaming
 * @param requestOptions - Optional request configuration
 * @returns ReadableStream of audio chunks
 * @throws UnprocessableEntityError if request fails
 */
client.textToSpeech.stream(
  voice_id: string,
  request: StreamTextToSpeechRequest,
  requestOptions?: RequestOptions
): HttpResponsePromise<ReadableStream<Uint8Array>>;

interface StreamTextToSpeechRequest {
  /** The text to convert to speech */
  text: string;
  /** Model ID to use */
  modelId?: string;
  /** Language code (ISO 639-1) for enforcing model language */
  languageCode?: string;
  /** Voice settings */
  voiceSettings?: VoiceSettings;
  /** Pronunciation dictionary locators */
  pronunciationDictionaryLocators?: PronunciationDictionaryVersionLocator[];
  /** Seed for reproducibility */
  seed?: number;
  /** Previous text for context */
  previousText?: string;
  /** Next text for context */
  nextText?: string;
  /** Previous request IDs for context */
  previousRequestIds?: string[];
  /** Next request IDs for context */
  nextRequestIds?: string[];
  /** Use IVC version instead of PVC */
  usePvcAsIvc?: boolean;
  /** Apply text normalization */
  applyTextNormalization?: ApplyTextNormalizationEnum;
  /** Language text normalization. WARNING: Increases latency */
  applyLanguageTextNormalization?: boolean;
  /** Enable logging */
  enableLogging?: boolean;
  /** Optimize streaming latency (0-4, default: 0) */
  optimizeStreamingLatency?: number;
  /** Output format (e.g., "mp3_44100_128", "pcm_16000") */
  outputFormat?: string;
}
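As a sketch of one common setup, the stream can be forwarded chunk-by-chunk to an HTTP response so playback can start before generation finishes (assumes Node.js 18+; the port, voice ID, and API key are placeholders):

import { createServer } from "node:http";
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";

const client = new ElevenLabsClient({ apiKey: "your-api-key" });

createServer(async (req, res) => {
  // Start generation as soon as the request arrives
  const audioStream = await client.textToSpeech.stream("voice-id", {
    text: "Streaming straight to the listener.",
    optimizeStreamingLatency: 3,
    outputFormat: "mp3_44100_128",
  });

  res.writeHead(200, { "Content-Type": "audio/mpeg" });

  // Forward each audio chunk to the client as it is produced
  for await (const chunk of audioStream) {
    res.write(chunk);
  }
  res.end();
}).listen(3000);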

Convert with Timestamps

Generate speech with precise character-level timing information for audio-text synchronization.

/**
 * @param voice_id - Voice ID to be used
 * @param request - Text and configuration with timestamp request
 * @param requestOptions - Optional request configuration
 * @returns Audio with character-level timestamps
 * @throws UnprocessableEntityError if request fails
 */
client.textToSpeech.convertWithTimestamps(
  voice_id: string,
  request: BodyTextToSpeechFullWithTimestamps,
  requestOptions?: RequestOptions
): HttpResponsePromise<AudioWithTimestampsResponse>;

interface BodyTextToSpeechFullWithTimestamps {
  /** The text to convert to speech */
  text: string;
  /** Model ID to use */
  modelId?: string;
  /** Language code (ISO 639-1) */
  languageCode?: string;
  /** Voice settings */
  voiceSettings?: VoiceSettings;
  /** Pronunciation dictionary locators */
  pronunciationDictionaryLocators?: PronunciationDictionaryVersionLocator[];
  /** Seed for reproducibility */
  seed?: number;
  /** Previous text for context */
  previousText?: string;
  /** Next text for context */
  nextText?: string;
  /** Previous request IDs */
  previousRequestIds?: string[];
  /** Next request IDs */
  nextRequestIds?: string[];
  /** Use IVC version instead of PVC */
  usePvcAsIvc?: boolean;
  /** Apply text normalization */
  applyTextNormalization?: ApplyTextNormalizationEnum;
  /** Language text normalization */
  applyLanguageTextNormalization?: boolean;
  /** Enable logging */
  enableLogging?: boolean;
  /** Output format */
  outputFormat?: string;
  /** Latency optimization level (0-4). 0: default, 1: ~50% improvement, 2: ~75% improvement, 3: max, 4: max with text normalizer off */
  optimizeStreamingLatency?: number;
}

interface AudioWithTimestampsResponse {
  /** Base64 encoded audio data */
  audioBase64: string;
  /** Character-level timestamps and alignment data */
  alignment: Alignment;
  /** Character-level timestamps for the normalized text used during generation */
  normalizedAlignment?: NormalizedAlignment;
}

interface Alignment {
  /** Array of characters */
  characters: string[];
  /** Start times for each character (in seconds) */
  characterStartTimesSeconds: number[];
  /** End times for each character (in seconds) */
  characterEndTimesSeconds: number[];
}

interface NormalizedAlignment {
  /** Normalized characters */
  characters: string[];
  /** Start times for normalized characters */
  characterStartTimesSeconds: number[];
  /** End times for normalized characters */
  characterEndTimesSeconds: number[];
}
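The per-character arrays can be post-processed as needed; the helper below is an illustrative sketch (not part of the SDK) that groups character timings into word-level timings, assuming whitespace separates words:

// Illustrative helper (not part of the SDK): derive word timings from character timings
function toWordTimings(alignment: {
  characters: string[];
  characterStartTimesSeconds: number[];
  characterEndTimesSeconds: number[];
}): { word: string; startSeconds: number; endSeconds: number }[] {
  const words: { word: string; startSeconds: number; endSeconds: number }[] = [];
  let current = "";
  let start = 0;

  alignment.characters.forEach((char, i) => {
    if (char.trim() === "") {
      // Whitespace closes the current word
      if (current) {
        words.push({ word: current, startSeconds: start, endSeconds: alignment.characterEndTimesSeconds[i - 1] });
        current = "";
      }
    } else {
      if (!current) start = alignment.characterStartTimesSeconds[i];
      current += char;
    }
  });

  // Flush the final word, if any
  if (current) {
    words.push({
      word: current,
      startSeconds: start,
      endSeconds: alignment.characterEndTimesSeconds[alignment.characterEndTimesSeconds.length - 1],
    });
  }

  return words;
}

// e.g. toWordTimings(result.alignment) on a convertWithTimestamps response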

Stream with Timestamps

Streams generated speech as JSON chunks that pair base64-encoded audio with character-level timestamps.

/**
 * @param voice_id - Voice ID to be used
 * @param request - Text and configuration for streaming with timestamps
 * @param requestOptions - Optional request configuration
 * @returns Stream of JSON chunks with audio and timestamps
 * @throws UnprocessableEntityError if request fails
 */
client.textToSpeech.streamWithTimestamps(
  voice_id: string,
  request: StreamTextToSpeechWithTimestampsRequest,
  requestOptions?: RequestOptions
): HttpResponsePromise<Stream<StreamingAudioChunkWithTimestampsResponse>>;

interface StreamTextToSpeechWithTimestampsRequest {
  /** The text to convert to speech */
  text: string;
  /** Model ID to use */
  modelId?: string;
  /** Language code (ISO 639-1) */
  languageCode?: string;
  /** Voice settings */
  voiceSettings?: VoiceSettings;
  /** Pronunciation dictionary locators */
  pronunciationDictionaryLocators?: PronunciationDictionaryVersionLocator[];
  /** Seed for reproducibility */
  seed?: number;
  /** Previous text for context */
  previousText?: string;
  /** Next text for context */
  nextText?: string;
  /** Previous request IDs */
  previousRequestIds?: string[];
  /** Next request IDs */
  nextRequestIds?: string[];
  /** Use IVC version instead of PVC */
  usePvcAsIvc?: boolean;
  /** Apply text normalization */
  applyTextNormalization?: ApplyTextNormalizationEnum;
  /** Language text normalization */
  applyLanguageTextNormalization?: boolean;
  /** Enable logging */
  enableLogging?: boolean;
}

interface StreamingAudioChunkWithTimestampsResponse {
  /** Base64 encoded audio chunk */
  audioBase64: string;
  /** Character alignment for this chunk */
  alignment?: Alignment;
  /** Normalized alignment for this chunk */
  normalizedAlignment?: NormalizedAlignment;
}

Usage Examples

Basic Text-to-Speech

import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";

const client = new ElevenLabsClient({ apiKey: "your-api-key" });

// Convert text to speech
const audio = await client.textToSpeech.convert("voice-id", {
  text: "Hello, world!",
  modelId: "eleven_multilingual_v2",
  voiceSettings: {
    stability: 0.5,
    similarityBoost: 0.75,
  },
});

// Process audio stream
for await (const chunk of audio) {
  // Handle audio chunk
}

Streaming TTS

// Stream with low latency
const audioStream = await client.textToSpeech.stream("voice-id", {
  text: "This is a streaming example.",
  optimizeStreamingLatency: 3,
  outputFormat: "mp3_44100_128",
});

for await (const chunk of audioStream) {
  // Process streaming audio chunk
}

TTS with Timestamps

// Get character-level timing information
const result = await client.textToSpeech.convertWithTimestamps("voice-id", {
  text: "Synchronized speech!",
  modelId: "eleven_turbo_v2_5",
});

console.log("Audio:", result.audioBase64);
console.log("Characters:", result.alignment.characters);
console.log("Start times:", result.alignment.characterStartTimesSeconds);
console.log("End times:", result.alignment.characterEndTimesSeconds);

Using Pronunciation Dictionaries

// Apply custom pronunciation rules
const audio = await client.textToSpeech.convert("voice-id", {
  text: "The CEO will discuss API best practices.",
  pronunciationDictionaryLocators: [
    {
      pronunciationDictionaryId: "dict-id-1",
      versionId: "version-1",
    },
  ],
});

Context-Aware Generation

// Provide context for better continuity
const audio = await client.textToSpeech.convert("voice-id", {
  text: "This is the current sentence.",
  previousText: "This was said before.",
  nextText: "This will be said after.",
});
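
Streaming with Timestamps

A sketch of consuming the timestamped stream (assumes the returned Stream is async-iterable, like the other streaming methods; the voice ID is a placeholder):

// Collect audio while logging per-chunk character timing
const stream = await client.textToSpeech.streamWithTimestamps("voice-id", {
  text: "Streaming with character timing.",
  modelId: "eleven_turbo_v2_5",
});

const audioParts: Buffer[] = [];

for await (const chunk of stream) {
  // Each chunk pairs a base64 audio segment with its alignment data
  audioParts.push(Buffer.from(chunk.audioBase64, "base64"));
  if (chunk.alignment) {
    console.log("Chunk text:", chunk.alignment.characters.join(""));
  }
}

const fullAudio = Buffer.concat(audioParts);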