Convert text into lifelike speech using AI voices with support for multiple models, voice settings, and streaming responses. Includes options for character-level timing information.
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
const client = new ElevenLabsClient({ apiKey: "your-api-key" });
// Access this API via: client.textToSpeech

Converts text to speech and returns an audio stream.
/**
* @param voiceId - Voice ID to be used
* @param request - Text and configuration options
* @param requestOptions - Optional request configuration
* @returns ReadableStream of audio chunks (Uint8Array)
* @throws UnprocessableEntityError if request fails
*/
client.textToSpeech.convert(
voiceId: string,
request: BodyTextToSpeechFull,
requestOptions?: RequestOptions
): HttpResponsePromise<ReadableStream<Uint8Array>>;
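
For instance, a minimal sketch that synthesizes a phrase and saves the assembled audio to disk, using the client created above (Node 18+ treats ReadableStream as async-iterable; the voice ID is a placeholder):

import { writeFile } from "node:fs/promises";

const stream = await client.textToSpeech.convert("voice-id", {
  text: "Hello from the convert endpoint.",
  modelId: "eleven_multilingual_v2",
});

// Collect the audio chunks into one Buffer and write the MP3
// (mp3_44100_128 is the default output format).
const chunks: Uint8Array[] = [];
for await (const chunk of stream) {
  chunks.push(chunk);
}
await writeFile("output.mp3", Buffer.concat(chunks));
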
interface BodyTextToSpeechFull {
/** The text to convert to speech */
text: string;
/** Model ID to use (e.g., "eleven_multilingual_v2", "eleven_turbo_v2_5") */
modelId?: string;
/** Language code (ISO 639-1) used to enforce a language for the model and text normalization */
languageCode?: string;
/** Voice settings to override defaults */
voiceSettings?: VoiceSettings;
/** Pronunciation dictionary version IDs to apply (max 3) */
pronunciationDictionaryLocators?: PronunciationDictionaryVersionLocator[];
/** Seed for reproducible generation (integer between 0 and 4294967295) */
seed?: number;
/** Previous text for context continuity */
previousText?: string;
/** Next text for context continuity */
nextText?: string;
/** Previous request IDs for context continuity (max 3) */
previousRequestIds?: string[];
/** Next request IDs for context continuity (max 3) */
nextRequestIds?: string[];
/** If true, use the IVC version of the voice instead of the PVC version (temporary workaround for PVC latency) */
usePvcAsIvc?: boolean;
/** Text normalization mode: 'auto' (default), 'on', or 'off' */
applyTextNormalization?: ApplyTextNormalizationEnum;
/** Language text normalization for pronunciation. WARNING: Increases latency. Currently only supports Japanese */
applyLanguageTextNormalization?: boolean;
/** Enable logging (when false, zero-retention mode is used; enterprise only) */
enableLogging?: boolean;
/** Latency optimization level (0-4). 0: default, 1-3: progressive optimization, 4: max with text normalizer off */
optimizeStreamingLatency?: number;
/** Output format (e.g., "mp3_44100_128", "pcm_16000"). MP3 192kbps requires Creator+, PCM 44.1kHz requires Pro+ */
outputFormat?: string;
}
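
For illustration, a request body exercising several of the optional fields above; all values here are placeholders, and omitted fields fall back to server-side defaults:

const request: BodyTextToSpeechFull = {
  text: "Order #4521 ships on 3/5.",
  modelId: "eleven_multilingual_v2",
  languageCode: "en",
  // Force normalization so numbers and dates are spelled out in speech.
  applyTextNormalization: ApplyTextNormalizationEnum.On,
  // Fixing the seed makes regeneration reproducible (best effort).
  seed: 12345,
  outputFormat: "mp3_44100_128",
};

const audio = await client.textToSpeech.convert("voice-id", request);
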
interface VoiceSettings {
/** Stability (0.0 to 1.0) */
stability?: number;
/** Similarity boost (0.0 to 1.0) */
similarityBoost?: number;
/** Style exaggeration (0.0 to 1.0) */
style?: number;
/** Use speaker boost */
useSpeakerBoost?: boolean;
/** Adjusts the speed of the voice. 1.0 is default speed */
speed?: number;
}
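
A settings object overriding every field might look like the following; the values are illustrative, and perceptual effects vary by voice:

const settings: VoiceSettings = {
  stability: 0.5,        // lower = more variation, higher = more consistent
  similarityBoost: 0.75, // how closely output should match the source voice
  style: 0.2,            // style exaggeration; 0.0 disables it
  useSpeakerBoost: true,
  speed: 1.1,            // 1.0 is the default speaking speed
};

const tunedAudio = await client.textToSpeech.convert("voice-id", {
  text: "Custom-tuned delivery.",
  voiceSettings: settings,
});
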
interface PronunciationDictionaryVersionLocator {
pronunciationDictionaryId: string;
/** Version ID. If not provided, the latest version will be used */
versionId?: string;
}
enum ApplyTextNormalizationEnum {
Auto = "auto",
On = "on",
Off = "off",
}

Converts text to speech with streaming audio output.
/**
* @param voiceId - Voice ID to be used
* @param request - Text and configuration for streaming
* @param requestOptions - Optional request configuration
* @returns ReadableStream of audio chunks
* @throws UnprocessableEntityError if request fails
*/
client.textToSpeech.stream(
voiceId: string,
request: StreamTextToSpeechRequest,
requestOptions?: RequestOptions
): HttpResponsePromise<ReadableStream<Uint8Array>>;
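
Because chunks arrive incrementally, audio can be forwarded to a listener before synthesis finishes. Here is a sketch of an HTTP handler that proxies the stream (route, port, and voice ID are illustrative; client is the instance created above):

import { createServer } from "node:http";

const server = createServer(async (req, res) => {
  const audio = await client.textToSpeech.stream("voice-id", {
    text: "Proxied speech audio.",
    optimizeStreamingLatency: 2,
    outputFormat: "mp3_44100_128",
  });

  res.writeHead(200, { "Content-Type": "audio/mpeg" });
  for await (const chunk of audio) {
    res.write(chunk); // forward each chunk as soon as it arrives
  }
  res.end();
});

server.listen(3000);
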
interface StreamTextToSpeechRequest {
/** The text to convert to speech */
text: string;
/** Model ID to use */
modelId?: string;
/** Language code (ISO 639-1) for enforcing model language */
languageCode?: string;
/** Voice settings */
voiceSettings?: VoiceSettings;
/** Pronunciation dictionary locators */
pronunciationDictionaryLocators?: PronunciationDictionaryVersionLocator[];
/** Seed for reproducibility */
seed?: number;
/** Previous text for context */
previousText?: string;
/** Next text for context */
nextText?: string;
/** Previous request IDs for context */
previousRequestIds?: string[];
/** Next request IDs for context */
nextRequestIds?: string[];
/** Use IVC version instead of PVC */
usePvcAsIvc?: boolean;
/** Apply text normalization */
applyTextNormalization?: ApplyTextNormalizationEnum;
/** Language text normalization. WARNING: Increases latency */
applyLanguageTextNormalization?: boolean;
/** Enable logging */
enableLogging?: boolean;
/** Optimize streaming latency (0-4, default: 0) */
optimizeStreamingLatency?: number;
/** Output format (e.g., "mp3_44100_128", "pcm_16000") */
outputFormat?: string;
}

Generates speech with precise character-level timing information for audio-text synchronization.
/**
* @param voiceId - Voice ID to be used
* @param request - Text and configuration with timestamp request
* @param requestOptions - Optional request configuration
* @returns Audio with character-level timestamps
* @throws UnprocessableEntityError if request fails
*/
client.textToSpeech.convertWithTimestamps(
voiceId: string,
request: BodyTextToSpeechFullWithTimestamps,
requestOptions?: RequestOptions
): HttpResponsePromise<AudioWithTimestampsResponse>;
interface BodyTextToSpeechFullWithTimestamps {
/** The text to convert to speech */
text: string;
/** Model ID to use */
modelId?: string;
/** Language code (ISO 639-1) */
languageCode?: string;
/** Voice settings */
voiceSettings?: VoiceSettings;
/** Pronunciation dictionary locators */
pronunciationDictionaryLocators?: PronunciationDictionaryVersionLocator[];
/** Seed for reproducibility */
seed?: number;
/** Previous text for context */
previousText?: string;
/** Next text for context */
nextText?: string;
/** Previous request IDs */
previousRequestIds?: string[];
/** Next request IDs */
nextRequestIds?: string[];
/** Use IVC version instead of PVC */
usePvcAsIvc?: boolean;
/** Apply text normalization */
applyTextNormalization?: ApplyTextNormalizationEnum;
/** Language text normalization */
applyLanguageTextNormalization?: boolean;
/** Enable logging */
enableLogging?: boolean;
/** Output format */
outputFormat?: string;
/** Latency optimization level (0-4). 0: default, 1: ~50% improvement, 2: ~75% improvement, 3: max, 4: max with text normalizer off */
optimizeStreamingLatency?: number;
}
interface AudioWithTimestampsResponse {
/** Base64 encoded audio data */
audioBase64: string;
/** Character-level timestamps and alignment data */
alignment: Alignment;
/** Character-level timestamps for the normalized text used in generation */
normalizedAlignment?: NormalizedAlignment;
}
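
Unlike convert, the audio here arrives base64-encoded rather than as a stream. In Node it decodes in one line (result stands for an awaited AudioWithTimestampsResponse):

// Decode the base64 payload into raw audio bytes for saving or playback.
const audioBytes = Buffer.from(result.audioBase64, "base64");
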
interface Alignment {
/** Array of characters */
characters: string[];
/** Start times for each character (in seconds) */
characterStartTimesSeconds: number[];
/** End times for each character (in seconds) */
characterEndTimesSeconds: number[];
}
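
The three arrays run in parallel, one entry per character, so they can be folded into coarser units. Below is a sketch (not part of the SDK) that collapses character-level alignment into word-level timings, e.g. for captioning:

interface WordTiming {
  word: string;
  startSeconds: number;
  endSeconds: number;
}

function toWordTimings(alignment: Alignment): WordTiming[] {
  const words: WordTiming[] = [];
  let current: WordTiming | null = null;

  for (let i = 0; i < alignment.characters.length; i++) {
    const ch = alignment.characters[i];
    if (/\s/.test(ch)) {
      current = null; // whitespace ends the current word
      continue;
    }
    if (current === null) {
      // Start a new word at this character's start time.
      current = {
        word: ch,
        startSeconds: alignment.characterStartTimesSeconds[i],
        endSeconds: alignment.characterEndTimesSeconds[i],
      };
      words.push(current);
    } else {
      // Extend the current word and push its end time forward.
      current.word += ch;
      current.endSeconds = alignment.characterEndTimesSeconds[i];
    }
  }
  return words;
}

The same pattern extends to sentence-level grouping or caption (SRT) generation.
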
interface NormalizedAlignment {
/** Normalized characters */
characters: string[];
/** Start times for normalized characters */
characterStartTimesSeconds: number[];
/** End times for normalized characters */
characterEndTimesSeconds: number[];
}

Streams speech generation with character timestamps as JSON chunks.
/**
* @param voiceId - Voice ID to be used
* @param request - Text and configuration for streaming with timestamps
* @param requestOptions - Optional request configuration
* @returns Stream of JSON chunks with audio and timestamps
* @throws UnprocessableEntityError if request fails
*/
client.textToSpeech.streamWithTimestamps(
voiceId: string,
request: StreamTextToSpeechWithTimestampsRequest,
requestOptions?: RequestOptions
): HttpResponsePromise<Stream<StreamingAudioChunkWithTimestampsResponse>>;
interface StreamTextToSpeechWithTimestampsRequest {
/** The text to convert to speech */
text: string;
/** Model ID to use */
modelId?: string;
/** Language code (ISO 639-1) */
languageCode?: string;
/** Voice settings */
voiceSettings?: VoiceSettings;
/** Pronunciation dictionary locators */
pronunciationDictionaryLocators?: PronunciationDictionaryVersionLocator[];
/** Seed for reproducibility */
seed?: number;
/** Previous text for context */
previousText?: string;
/** Next text for context */
nextText?: string;
/** Previous request IDs */
previousRequestIds?: string[];
/** Next request IDs */
nextRequestIds?: string[];
/** Use IVC version instead of PVC */
usePvcAsIvc?: boolean;
/** Apply text normalization */
applyTextNormalization?: ApplyTextNormalizationEnum;
/** Language text normalization */
applyLanguageTextNormalization?: boolean;
/** Enable logging */
enableLogging?: boolean;
}
interface StreamingAudioChunkWithTimestampsResponse {
/** Base64 encoded audio chunk */
audioBase64: string;
/** Character alignment for this chunk */
alignment?: Alignment;
/** Normalized alignment for this chunk */
normalizedAlignment?: NormalizedAlignment;
}

import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
const client = new ElevenLabsClient({ apiKey: "your-api-key" });
// Convert text to speech
const audio = await client.textToSpeech.convert("voice-id", {
text: "Hello, world!",
modelId: "eleven_multilingual_v2",
voiceSettings: {
stability: 0.5,
similarityBoost: 0.75,
},
});
// Process audio stream
for await (const chunk of audio) {
// Handle audio chunk
}

// Stream with low latency
const audioStream = await client.textToSpeech.stream("voice-id", {
text: "This is a streaming example.",
optimizeStreamingLatency: 3,
outputFormat: "mp3_44100_128",
});
for await (const chunk of audioStream) {
// Process streaming audio chunk
}

// Get character-level timing information
const result = await client.textToSpeech.convertWithTimestamps("voice-id", {
text: "Synchronized speech!",
modelId: "eleven_turbo_v2_5",
});
console.log("Audio:", result.audioBase64);
console.log("Characters:", result.alignment.characters);
console.log("Start times:", result.alignment.characterStartTimesSeconds);
console.log("End times:", result.alignment.characterEndTimesSeconds);// Apply custom pronunciation rules
const audio = await client.textToSpeech.convert("voice-id", {
text: "The CEO will discuss API best practices.",
pronunciationDictionaryLocators: [
{
pronunciationDictionaryId: "dict-id-1",
versionId: "version-1",
},
],
});

// Provide context for better continuity
const audio = await client.textToSpeech.convert("voice-id", {
text: "This is the current sentence.",
previousText: "This was said before.",
nextText: "This will be said after.",
});
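
streamWithTimestamps has no example above; here is a sketch of consuming its JSON chunks, assuming the returned Stream is async-iterable like the audio streams (voice ID and text are placeholders):

const timedStream = await client.textToSpeech.streamWithTimestamps("voice-id", {
  text: "Streaming with synchronized timing data.",
  modelId: "eleven_multilingual_v2",
});

const audioChunks: Buffer[] = [];
const characters: string[] = [];
const startTimes: number[] = [];

for await (const chunk of timedStream) {
  // Each chunk carries base64 audio plus alignment for that chunk.
  audioChunks.push(Buffer.from(chunk.audioBase64, "base64"));
  if (chunk.alignment) {
    characters.push(...chunk.alignment.characters);
    startTimes.push(...chunk.alignment.characterStartTimesSeconds);
  }
}

const fullAudio = Buffer.concat(audioChunks);
console.log(`Received ${fullAudio.length} bytes covering ${characters.length} characters`);
console.log("First character starts at", startTimes[0], "seconds");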