# Text-to-Speech

## API Methods

### textToSpeech.convert()

```typescript { .api }
convert(
  voiceId: string,
  request: TtsRequest,
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

### textToSpeech.stream()

```typescript { .api }
stream(
  voiceId: string,
  request: TtsRequest,
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

### textToSpeech.convertWithTimestamps()

```typescript { .api }
convertWithTimestamps(
  voiceId: string,
  request: TtsRequest,
  options?: RequestOptions
): Promise<AudioWithTimestampsResponse>
```

### textToSpeech.streamWithTimestamps()

```typescript { .api }
streamWithTimestamps(
  voiceId: string,
  request: TtsRequest,
  options?: RequestOptions
): Promise<ReadableStream<StreamingAudioChunkWithTimestampsResponse>>
```

## Text-to-Dialogue Methods

### textToDialogue.convert()

```typescript { .api }
convert(
  request: DialogueRequest,
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

### textToDialogue.stream()

```typescript { .api }
stream(
  request: DialogueRequest,
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

### textToDialogue.convertWithTimestamps()

```typescript { .api }
convertWithTimestamps(
  request: DialogueRequest,
  options?: RequestOptions
): Promise<AudioWithTimestampsAndVoiceSegmentsResponseModel>
```

### textToDialogue.streamWithTimestamps()

```typescript { .api }
streamWithTimestamps(
  request: DialogueRequest,
  options?: RequestOptions
): Promise<ReadableStream<StreamingAudioChunkWithTimestampsAndVoiceSegmentsResponseModel>>
```

## Sound Effects

### textToSoundEffects.convert()

```typescript { .api }
convert(
  request: {
    text: string;
    outputFormat?: OutputFormat;
    loop?: boolean; // Only for eleven_text_to_sound_v2
    durationSeconds?: number; // 0.5-30, defaults to auto
    promptInfluence?: number; // 0-1, defaults to 0.3
    modelId?: string;
  },
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

## Request Types

### TtsRequest

```typescript { .api }
interface TtsRequest {
  text: string;
  modelId?: string;
  voiceSettings?: VoiceSettings;
  pronunciationDictionaryLocators?: PronunciationDictionaryLocator[];
  languageCode?: string; // ISO 639-1
  outputFormat?: OutputFormat; // Default: mp3_44100_128

  // Latency optimization: 0 (default), 1 (50%), 2 (75%), 3 (max), 4 (max+no normalization)
  optimizeStreamingLatency?: number;

  // Continuity
  previousText?: string;
  nextText?: string;
  previousRequestIds?: string[]; // Max 3
  nextRequestIds?: string[]; // Max 3

  // Control
  seed?: number; // 0-4294967295, for deterministic generation
  enableLogging?: boolean; // false = zero retention (enterprise only)
  usePvcAsIvc?: boolean; // Workaround for PVC latency
  applyTextNormalization?: "auto" | "on" | "off";
  applyLanguageTextNormalization?: boolean; // Increases latency, Japanese only
}
```

### DialogueRequest

```typescript { .api }
interface DialogueRequest {
  inputs: DialogueInput[];
  modelId?: string;
  outputFormat?: OutputFormat;
  languageCode?: string;
  settings?: ModelSettingsResponseModel;
  pronunciationDictionaryLocators?: PronunciationDictionaryLocator[];
  seed?: number;
  applyTextNormalization?: "auto" | "on" | "off";
}

interface DialogueInput {
  text: string;
  voiceId: string;
}

interface ModelSettingsResponseModel {
  stability?: number; // 0-1, lower = more emotional range
}
```

### Response Types

```typescript { .api }
interface AudioWithTimestampsResponse {
  audioBase64: string;
  alignment?: CharacterAlignmentResponseModel;
  normalizedAlignment?: CharacterAlignmentResponseModel;
}

interface CharacterAlignmentResponseModel {
  characters: string[];
  characterStartTimesSeconds: number[];
  characterEndTimesSeconds: number[];
}

interface StreamingAudioChunkWithTimestampsResponse {
  audioBase64: string;
  alignment?: CharacterAlignmentResponseModel;
  normalizedAlignment?: CharacterAlignmentResponseModel;
}

interface AudioWithTimestampsAndVoiceSegmentsResponseModel {
  audioBase64: string;
  alignment?: CharacterAlignmentResponseModel;
  normalizedAlignment?: CharacterAlignmentResponseModel;
  voiceSegments: VoiceSegment[];
}

interface StreamingAudioChunkWithTimestampsAndVoiceSegmentsResponseModel {
  audioBase64: string;
  alignment?: CharacterAlignmentResponseModel;
  normalizedAlignment?: CharacterAlignmentResponseModel;
  voiceSegments: VoiceSegment[];
}

interface VoiceSegment {
  voiceId: string;
  startTimeSeconds: number;
  endTimeSeconds: number;
  characterStartIndex: number;
  characterEndIndex: number;
  dialogueInputIndex: number;
}
```

### VoiceSettings

```typescript { .api }
interface VoiceSettings {
  stability?: number; // 0-1
  similarityBoost?: number; // 0-1
  style?: number; // 0-1, consumes extra resources
  useSpeakerBoost?: boolean; // Increases latency
  speed?: number; // 1.0 = default, <1 = slower, >1 = faster
}
```

### PronunciationDictionaryLocator

```typescript { .api }
interface PronunciationDictionaryLocator {
  pronunciationDictionaryId: string;
  versionId?: string; // Latest if omitted
}
```

### OutputFormat

```typescript { .api }
type OutputFormat =
  | "mp3_22050_32" | "mp3_24000_48" | "mp3_44100_32" | "mp3_44100_64"
  | "mp3_44100_96" | "mp3_44100_128" | "mp3_44100_192" // 192 requires Creator+
  | "pcm_8000" | "pcm_16000" | "pcm_22050" | "pcm_24000"
  | "pcm_32000" | "pcm_44100" | "pcm_48000" // 44.1kHz+ requires Pro+
  | "ulaw_8000" // For Twilio
  | "alaw_8000"
  | "opus_48000_32" | "opus_48000_64" | "opus_48000_96"
  | "opus_48000_128" | "opus_48000_192";
```

## Usage Examples

### Basic TTS

```typescript
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

const audio = await client.textToSpeech.convert("21m00Tcm4TlvDq8ikWAM", {
  text: "Hello! This is a test.",
  modelId: "eleven_multilingual_v2",
  voiceSettings: { stability: 0.5, similarityBoost: 0.75 }
});
```

### Streaming

```typescript
const stream = await client.textToSpeech.stream("21m00Tcm4TlvDq8ikWAM", {
  text: "Streaming audio for low latency.",
  modelId: "eleven_flash_v2_5",
  optimizeStreamingLatency: 3
});

for await (const chunk of stream) {
  processAudioChunk(chunk);
}
```

### With Timestamps

```typescript
const result = await client.textToSpeech.convertWithTimestamps(
  "21m00Tcm4TlvDq8ikWAM",
  { text: "Audio with timing data.", modelId: "eleven_multilingual_v2" }
);

result.alignment?.characters.forEach((char, i) => {
  console.log(`"${char}": ${result.alignment.characterStartTimesSeconds[i]}s`);
});
```

### Dialogue

```typescript
const dialogue = await client.textToDialogue.convert({
  inputs: [
    { text: "Knock knock", voiceId: "JBFqnCBsd6RMkjVDRZzb" },
    { text: "Who's there?", voiceId: "Aw4FAjKCGjjNkVhN1Xmq" }
  ],
  modelId: "eleven_multilingual_v2",
  outputFormat: "mp3_44100_128"
});
```

### Dialogue with Voice Segments

```typescript
const result = await client.textToDialogue.convertWithTimestamps({
  inputs: [
    { text: "Hello, how are you?", voiceId: "bYTqZQo3Jz7LQtmGTgwi" },
    { text: "I'm well, thank you!", voiceId: "6lCwbsX1yVjD49QmpkTR" }
  ]
});

result.voiceSegments.forEach(seg => {
  console.log(`Voice ${seg.voiceId}: ${seg.startTimeSeconds}s-${seg.endTimeSeconds}s`);
});
```

### Sound Effects

```typescript
const sfx = await client.textToSoundEffects.convert({
  text: "Spacious braam for movie trailer",
  durationSeconds: 5.0,
  promptInfluence: 0.5,
  outputFormat: "mp3_44100_128"
});

// Looping sound
const loop = await client.textToSoundEffects.convert({
  text: "Ambient forest sounds",
  loop: true,
  durationSeconds: 10.0,
  modelId: "eleven_text_to_sound_v2"
});
```

### Request Continuity

```typescript
// Chain segments with context
const seg1 = await client.textToSpeech.convert("voiceId", {
  text: "First part.",
  nextText: "Second part."
});

const seg2 = await client.textToSpeech.convert("voiceId", {
  text: "Second part.",
  previousText: "First part.",
  nextText: "Third part."
});

// Or use request IDs
const req1 = await client.textToSpeech.convert("voiceId", {
  text: "Part 1"
});

const req2 = await client.textToSpeech.convert("voiceId", {
  text: "Part 2",
  // NOTE(review): convert() is documented above as returning a
  // ReadableStream<Uint8Array>, which has no `requestId` property —
  // request IDs come from the response metadata; verify against the SDK.
  previousRequestIds: [req1.requestId]
});
```

### Deterministic Generation

```typescript
const audio1 = await client.textToSpeech.convert("voiceId", {
  text: "Same seed produces same audio",
  seed: 12345,
  modelId: "eleven_multilingual_v2"
});

// Identical parameters produce identical audio
const audio2 = await client.textToSpeech.convert("voiceId", {
  text: "Same seed produces same audio",
  seed: 12345,
  modelId: "eleven_multilingual_v2"
});
```

### Pronunciation Dictionaries

```typescript
const audio = await client.textToSpeech.convert("voiceId", {
  text: "The API uses REST architecture",
  pronunciationDictionaryLocators: [
    { pronunciationDictionaryId: "dict_1", versionId: "v1" },
    { pronunciationDictionaryId: "dict_2" } // Uses latest
  ]
});
```

### Multi-Language

```typescript
const spanish = await client.textToSpeech.convert("voiceId", {
  text: "Hola, ¿cómo estás?",
  languageCode: "es",
  modelId: "eleven_multilingual_v2"
});

const japanese = await client.textToSpeech.convert("voiceId", {
  text: "こんにちは",
  languageCode: "ja",
  modelId: "eleven_multilingual_v2",
  applyLanguageTextNormalization: true // Warning: increases latency
});
```

## Stream Consumption Pattern

```typescript
// ReadableStream<Uint8Array> - use async iteration
const stream = await client.textToSpeech.stream("voiceId", { text: "..." });
for await (const chunk of stream) {
  // Process Uint8Array chunks
}

// Alternative: manual reader
const reader = stream.getReader();
while (true) {
  const { done, value } = await reader.read();
  if (done) break;
  processChunk(value);
}
```

## Error Handling

```typescript
import { ElevenLabsClient, ElevenLabsError } from "@elevenlabs/elevenlabs-js";

try {
  const audio = await client.textToSpeech.convert("voiceId", {
    text: "Hello",
    modelId: "eleven_multilingual_v2"
  });
} catch (error) {
  if (error instanceof ElevenLabsError) {
    console.error(`API error ${error.statusCode}: ${error.message}`);
  }
  throw error;
}
```

## Latency Optimization Levels

- **0**: Default, max quality, no optimization
- **1**: Normal optimization (~50% latency reduction)
- **2**: Strong optimization (~75% latency reduction)
- **3**: Max optimization
- **4**: Max optimization + no text normalization (may mispronounce numbers/dates)

## Edge Cases and Important Notes

### Request Continuity Constraints
- `previousRequestIds`/`nextRequestIds` max: 3 items each
- If both `previousText` and `previousRequestIds` provided, `previousText` is ignored
- Same behavior for `nextText` and `nextRequestIds`
- Request IDs must be from the same model for best continuity
- Maximum 3 pronunciation dictionaries per request

### Enterprise-Only Features
- Zero retention mode (`enableLogging: false`): disables history features, including request stitching
- Text normalization for `eleven_turbo_v2_5`/`eleven_flash_v2_5`: requires Enterprise plan
- `storeForInpainting` in music generation: Enterprise feature only

### Language and Normalization
- `applyLanguageTextNormalization`: currently Japanese only, significantly increases latency
- Language codes must be ISO 639-1 format (e.g., "en", "es", "ja")
- If the model doesn't support the provided language code, an error is returned

### Text Length Limits
- Model-specific limits vary (check `Model.maximumTextLengthPerRequest`)
- Free tier: typically 2500 characters
- Subscribed tier: typically 5000 characters
- For longer text, split into chunks and use `previousText`/`nextText` for continuity

### Output Format Constraints
- MP3 192kbps: requires Creator tier or above
- PCM 44.1kHz+: requires Pro tier or above
- μ-law format (`ulaw_8000`): commonly used for Twilio audio inputs

### Latency Optimization Trade-offs
- Level 0: Maximum quality, no optimization
- Level 1: ~50% latency reduction, slight quality impact
- Level 2: ~75% latency reduction, moderate quality impact
- Level 3: Maximum optimization, noticeable quality impact
- Level 4: Maximum optimization + no text normalization (may mispronounce numbers/dates)

### Seed and Determinism
- Seed range: 0-4294967295
- Determinism is not guaranteed, but the same seed + the same parameters should produce similar results
- Best results when all parameters (model, voice, settings) are identical

### Error Scenarios

```typescript
import { ElevenLabsClient, ElevenLabsError } from "@elevenlabs/elevenlabs-js";

// Handle text length errors
try {
  const audio = await client.textToSpeech.convert("voiceId", {
    text: veryLongText, // Exceeds model limit
    modelId: "eleven_multilingual_v2"
  });
} catch (error) {
  if (error instanceof ElevenLabsError && error.statusCode === 422) {
    // Split text and retry
    const chunks = splitText(veryLongText, 5000);
    // Process chunks with continuity...
  }
}

// Handle invalid voice ID
try {
  const audio = await client.textToSpeech.convert("invalid_voice_id", {
    text: "Hello"
  });
} catch (error) {
  if (error instanceof ElevenLabsError && error.statusCode === 404) {
    console.error("Voice not found - check voice ID");
    // List available voices
    const voices = await client.voices.getAll();
  }
}

// Handle unsupported language
try {
  const audio = await client.textToSpeech.convert("voiceId", {
    text: "Hello",
    languageCode: "xx" // Invalid code
  });
} catch (error) {
  if (error instanceof ElevenLabsError && error.statusCode === 422) {
    console.error("Language not supported by model");
    // Check model languages
    const models = await client.models.list();
    const model = models.find(m => m.modelId === "eleven_multilingual_v2");
    console.log("Supported languages:", model?.languages);
  }
}
```

## Comprehensive Examples

### Long-Form Content Generation with Chunking

```typescript
import * as fs from "fs";
import { ElevenLabsClient, ElevenLabsError } from "@elevenlabs/elevenlabs-js";

async function generateLongFormAudio(
  voiceId: string,
  longText: string,
  outputPath: string
) {
  const maxChunkLength = 5000;
  const chunks = splitTextIntoChunks(longText, maxChunkLength);
  const audioChunks: Buffer[] = [];

  for (let i = 0; i < chunks.length; i++) {
    const chunk = chunks[i];

    // Use continuity features for natural flow
    const request: TtsRequest = {
      text: chunk,
      modelId: "eleven_multilingual_v2",
      previousText: i > 0 ? chunks[i - 1].slice(-200) : undefined,
      nextText: i < chunks.length - 1 ? chunks[i + 1].slice(0, 200) : undefined,
      voiceSettings: {
        stability: 0.7, // Higher for consistent long-form narration
        similarityBoost: 0.75
      }
    };

    try {
      const audioStream = await client.textToSpeech.convert(voiceId, request, {
        timeoutInSeconds: 120, // Longer timeout for processing
        maxRetries: 3
      });

      const audio = await streamToBuffer(audioStream);
      audioChunks.push(audio);

      // Rate limiting: small delay between chunks
      if (i < chunks.length - 1) {
        await new Promise(resolve => setTimeout(resolve, 200));
      }
    } catch (error) {
      if (error instanceof ElevenLabsError && error.statusCode === 429) {
        // Exponential backoff on rate limit
        const delay = Math.min(1000 * Math.pow(2, i), 10000);
        await new Promise(resolve => setTimeout(resolve, delay));
        i--; // Retry this chunk
        continue;
      }
      throw error;
    }
  }

  // Combine all chunks
  const finalAudio = Buffer.concat(audioChunks);
  fs.writeFileSync(outputPath, finalAudio);
}

function splitTextIntoChunks(text: string, maxLength: number): string[] {
  const chunks: string[] = [];
  let currentChunk = "";

  const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];

  for (const sentence of sentences) {
    if ((currentChunk + sentence).length <= maxLength) {
      currentChunk += sentence;
    } else {
      if (currentChunk) chunks.push(currentChunk.trim());
      currentChunk = sentence;
    }
  }

  if (currentChunk) chunks.push(currentChunk.trim());
  return chunks;
}

async function streamToBuffer(stream: ReadableStream<Uint8Array>): Promise<Buffer> {
  const reader = stream.getReader();
  const chunks: Uint8Array[] = [];

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
  }

  return Buffer.concat(chunks);
}
```

### Multi-Language TTS with Fallback

```typescript
async function generateMultilingualAudio(
  voiceId: string,
  texts: { lang: string; text: string }[]
) {
  const results: { lang: string; audio: Buffer }[] = [];

  for (const { lang, text } of texts) {
    try {
      // Try with language code first
      const audioStream = await client.textToSpeech.convert(voiceId, {
        text,
        languageCode: lang,
        modelId: "eleven_multilingual_v2"
      });

      const audio = await streamToBuffer(audioStream);
      results.push({ lang, audio });

    } catch (error) {
      if (error instanceof ElevenLabsError && error.statusCode === 422) {
        // Language not supported, try without language code
        console.warn(`Language ${lang} not supported, using default`);
        const audioStream = await client.textToSpeech.convert(voiceId, {
          text,
          modelId: "eleven_multilingual_v2"
        });
        const audio = await streamToBuffer(audioStream);
        results.push({ lang: "default", audio });
      } else {
        throw error;
      }
    }
  }

  return results;
}
```

### Dialogue Generation with Error Recovery

```typescript
async function generateDialogueWithRetry(
  inputs: DialogueInput[],
  maxRetries = 3
) {
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const audioStream = await client.textToDialogue.convert({
        inputs,
        modelId: "eleven_multilingual_v2",
        outputFormat: "mp3_44100_128"
      });

      return await streamToBuffer(audioStream);

    } catch (error) {
      if (error instanceof ElevenLabsError) {
        // Don't retry on validation errors
        if (error.statusCode === 422) {
          throw error;
        }

        // Retry on rate limits and server errors
        if (attempt < maxRetries - 1) {
          const delay = Math.pow(2, attempt) * 1000; // Exponential backoff
          console.log(`Retry attempt ${attempt + 1} after ${delay}ms`);
          await new Promise(resolve => setTimeout(resolve, delay));
          continue;
        }
      }
      throw error;
    }
  }
}
```