# Text-to-Speech

## API Methods

### textToSpeech.convert()

```typescript { .api }
convert(
  voiceId: string,
  request: TtsRequest,
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

### textToSpeech.stream()

```typescript { .api }
stream(
  voiceId: string,
  request: TtsRequest,
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

### textToSpeech.convertWithTimestamps()

```typescript { .api }
convertWithTimestamps(
  voiceId: string,
  request: TtsRequest,
  options?: RequestOptions
): Promise<AudioWithTimestampsResponse>
```

### textToSpeech.streamWithTimestamps()

```typescript { .api }
streamWithTimestamps(
  voiceId: string,
  request: TtsRequest,
  options?: RequestOptions
): Promise<ReadableStream<StreamingAudioChunkWithTimestampsResponse>>
```

## Text-to-Dialogue Methods

### textToDialogue.convert()

```typescript { .api }
convert(
  request: DialogueRequest,
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

### textToDialogue.stream()

```typescript { .api }
stream(
  request: DialogueRequest,
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

### textToDialogue.convertWithTimestamps()

```typescript { .api }
convertWithTimestamps(
  request: DialogueRequest,
  options?: RequestOptions
): Promise<AudioWithTimestampsAndVoiceSegmentsResponseModel>
```

### textToDialogue.streamWithTimestamps()

```typescript { .api }
streamWithTimestamps(
  request: DialogueRequest,
  options?: RequestOptions
): Promise<ReadableStream<StreamingAudioChunkWithTimestampsAndVoiceSegmentsResponseModel>>
```

## Sound Effects

### textToSoundEffects.convert()

```typescript { .api }
convert(
  request: {
    text: string;
    outputFormat?: OutputFormat;
    loop?: boolean; // Only for eleven_text_to_sound_v2
    durationSeconds?: number; // 0.5-30, defaults to auto
    promptInfluence?: number; // 0-1, defaults to 0.3
    modelId?: string;
  },
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

## Request Types

### TtsRequest

```typescript { .api }
interface TtsRequest {
  text: string;
  modelId?: string;
  voiceSettings?: VoiceSettings;
  pronunciationDictionaryLocators?: PronunciationDictionaryLocator[];
  languageCode?: string; // ISO 639-1
  outputFormat?: OutputFormat; // Default: mp3_44100_128

  // Latency optimization: 0 (default), 1 (50%), 2 (75%), 3 (max), 4 (max+no normalization)
  optimizeStreamingLatency?: number;

  // Continuity
  previousText?: string;
  nextText?: string;
  previousRequestIds?: string[]; // Max 3
  nextRequestIds?: string[]; // Max 3

  // Control
  seed?: number; // 0-4294967295, for deterministic generation
  enableLogging?: boolean; // false = zero retention (enterprise only)
  usePvcAsIvc?: boolean; // Workaround for PVC latency
  applyTextNormalization?: "auto" | "on" | "off";
  applyLanguageTextNormalization?: boolean; // Increases latency, Japanese only
}
```

### DialogueRequest

```typescript { .api }
interface DialogueRequest {
  inputs: DialogueInput[];
  modelId?: string;
  outputFormat?: OutputFormat;
  languageCode?: string;
  settings?: ModelSettingsResponseModel;
  pronunciationDictionaryLocators?: PronunciationDictionaryLocator[];
  seed?: number;
  applyTextNormalization?: "auto" | "on" | "off";
}

interface DialogueInput {
  text: string;
  voiceId: string;
}

interface ModelSettingsResponseModel {
  stability?: number; // 0-1, lower = more emotional range
}
```

### Response Types

```typescript { .api }
interface AudioWithTimestampsResponse {
  audioBase64: string;
  alignment?: CharacterAlignmentResponseModel;
  normalizedAlignment?: CharacterAlignmentResponseModel;
}

interface CharacterAlignmentResponseModel {
  characters: string[];
  characterStartTimesSeconds: number[];
  characterEndTimesSeconds: number[];
}

interface StreamingAudioChunkWithTimestampsResponse {
  audioBase64: string;
  alignment?: CharacterAlignmentResponseModel;
  normalizedAlignment?: CharacterAlignmentResponseModel;
}

interface AudioWithTimestampsAndVoiceSegmentsResponseModel {
  audioBase64: string;
  alignment?: CharacterAlignmentResponseModel;
  normalizedAlignment?: CharacterAlignmentResponseModel;
  voiceSegments: VoiceSegment[];
}

interface StreamingAudioChunkWithTimestampsAndVoiceSegmentsResponseModel {
  audioBase64: string;
  alignment?: CharacterAlignmentResponseModel;
  normalizedAlignment?: CharacterAlignmentResponseModel;
  voiceSegments: VoiceSegment[];
}

interface VoiceSegment {
  voiceId: string;
  startTimeSeconds: number;
  endTimeSeconds: number;
  characterStartIndex: number;
  characterEndIndex: number;
  dialogueInputIndex: number;
}
```

### VoiceSettings

```typescript { .api }
interface VoiceSettings {
  stability?: number; // 0-1
  similarityBoost?: number; // 0-1
  style?: number; // 0-1, consumes extra resources
  useSpeakerBoost?: boolean; // Increases latency
  speed?: number; // 1.0 = default, <1 = slower, >1 = faster
}
```

### PronunciationDictionaryLocator

```typescript { .api }
interface PronunciationDictionaryLocator {
  pronunciationDictionaryId: string;
  versionId?: string; // Latest if omitted
}
```

### OutputFormat

```typescript { .api }
type OutputFormat =
  | "mp3_22050_32" | "mp3_24000_48" | "mp3_44100_32" | "mp3_44100_64"
  | "mp3_44100_96" | "mp3_44100_128" | "mp3_44100_192" // 192 requires Creator+
  | "pcm_8000" | "pcm_16000" | "pcm_22050" | "pcm_24000"
  | "pcm_32000" | "pcm_44100" | "pcm_48000" // 44.1kHz+ requires Pro+
  | "ulaw_8000" // For Twilio
  | "alaw_8000"
  | "opus_48000_32" | "opus_48000_64" | "opus_48000_96"
  | "opus_48000_128" | "opus_48000_192";
```

## Usage Examples

### Basic TTS

```typescript
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

const audio = await client.textToSpeech.convert("21m00Tcm4TlvDq8ikWAM", {
  text: "Hello! This is a test.",
  modelId: "eleven_multilingual_v2",
  voiceSettings: { stability: 0.5, similarityBoost: 0.75 }
});
```

### Streaming

```typescript
const stream = await client.textToSpeech.stream("21m00Tcm4TlvDq8ikWAM", {
  text: "Streaming audio for low latency.",
  modelId: "eleven_flash_v2_5",
  optimizeStreamingLatency: 3
});

for await (const chunk of stream) {
  processAudioChunk(chunk);
}
```

### With Timestamps

```typescript
const result = await client.textToSpeech.convertWithTimestamps(
  "21m00Tcm4TlvDq8ikWAM",
  { text: "Audio with timing data.", modelId: "eleven_multilingual_v2" }
);

result.alignment?.characters.forEach((char, i) => {
  console.log(`"${char}": ${result.alignment.characterStartTimesSeconds[i]}s`);
});
```

### Dialogue

```typescript
const dialogue = await client.textToDialogue.convert({
  inputs: [
    { text: "Knock knock", voiceId: "JBFqnCBsd6RMkjVDRZzb" },
    { text: "Who's there?", voiceId: "Aw4FAjKCGjjNkVhN1Xmq" }
  ],
  modelId: "eleven_multilingual_v2",
  outputFormat: "mp3_44100_128"
});
```

### Dialogue with Voice Segments

```typescript
const result = await client.textToDialogue.convertWithTimestamps({
  inputs: [
    { text: "Hello, how are you?", voiceId: "bYTqZQo3Jz7LQtmGTgwi" },
    { text: "I'm well, thank you!", voiceId: "6lCwbsX1yVjD49QmpkTR" }
  ]
});

result.voiceSegments.forEach(seg => {
  console.log(`Voice ${seg.voiceId}: ${seg.startTimeSeconds}s-${seg.endTimeSeconds}s`);
});
```

### Sound Effects

```typescript
const sfx = await client.textToSoundEffects.convert({
  text: "Spacious braam for movie trailer",
  durationSeconds: 5.0,
  promptInfluence: 0.5,
  outputFormat: "mp3_44100_128"
});

// Looping sound
const loop = await client.textToSoundEffects.convert({
  text: "Ambient forest sounds",
  loop: true,
  durationSeconds: 10.0,
  modelId: "eleven_text_to_sound_v2"
});
```

### Request Continuity

```typescript
// Chain segments with context
const seg1 = await client.textToSpeech.convert("voiceId", {
  text: "First part.",
  nextText: "Second part."
});

const seg2 = await client.textToSpeech.convert("voiceId", {
  text: "Second part.",
  previousText: "First part.",
  nextText: "Third part."
});

// Or use request IDs
const req1 = await client.textToSpeech.convert("voiceId", {
  text: "Part 1"
});

const req2 = await client.textToSpeech.convert("voiceId", {
  text: "Part 2",
  // NOTE(review): convert() is documented above as returning a
  // ReadableStream<Uint8Array>, which has no `requestId` property —
  // request IDs come from the response metadata; verify against the SDK.
  previousRequestIds: [req1.requestId]
});
```

### Deterministic Generation

```typescript
const audio1 = await client.textToSpeech.convert("voiceId", {
  text: "Same seed produces same audio",
  seed: 12345,
  modelId: "eleven_multilingual_v2"
});

// Identical parameters produce identical audio
const audio2 = await client.textToSpeech.convert("voiceId", {
  text: "Same seed produces same audio",
  seed: 12345,
  modelId: "eleven_multilingual_v2"
});
```

### Pronunciation Dictionaries

```typescript
const audio = await client.textToSpeech.convert("voiceId", {
  text: "The API uses REST architecture",
  pronunciationDictionaryLocators: [
    { pronunciationDictionaryId: "dict_1", versionId: "v1" },
    { pronunciationDictionaryId: "dict_2" } // Uses latest
  ]
});
```

### Multi-Language

```typescript
const spanish = await client.textToSpeech.convert("voiceId", {
  text: "Hola, ¿cómo estás?",
  languageCode: "es",
  modelId: "eleven_multilingual_v2"
});

const japanese = await client.textToSpeech.convert("voiceId", {
  text: "こんにちは",
  languageCode: "ja",
  modelId: "eleven_multilingual_v2",
  applyLanguageTextNormalization: true // Warning: increases latency
});
```

## Stream Consumption Pattern

```typescript
// ReadableStream<Uint8Array> - use async iteration
const stream = await client.textToSpeech.stream("voiceId", { text: "..." });
for await (const chunk of stream) {
  // Process Uint8Array chunks
}

// Alternative: manual reader
const reader = stream.getReader();
while (true) {
  const { done, value } = await reader.read();
  if (done) break;
  processChunk(value);
}
```

## Error Handling

```typescript
import { ElevenLabsClient, ElevenLabsError } from "@elevenlabs/elevenlabs-js";

try {
  const audio = await client.textToSpeech.convert("voiceId", {
    text: "Hello",
    modelId: "eleven_multilingual_v2"
  });
} catch (error) {
  if (error instanceof ElevenLabsError) {
    console.error(`API error ${error.statusCode}: ${error.message}`);
  }
  throw error;
}
```

## Latency Optimization Levels

- **0**: Default, max quality, no optimization
- **1**: Normal optimization (~50% latency reduction)
- **2**: Strong optimization (~75% latency reduction)
- **3**: Max optimization
- **4**: Max optimization + no text normalization (may mispronounce numbers/dates)

## Edge Cases and Important Notes

### Request Continuity Constraints
- `previousRequestIds`/`nextRequestIds` max: 3 items each
- If both `previousText` and `previousRequestIds` provided, `previousText` is ignored
- Same behavior for `nextText` and `nextRequestIds`
- Request IDs must be from the same model for best continuity
- Maximum 3 pronunciation dictionaries per request

### Enterprise-Only Features
- Zero retention mode (`enableLogging: false`): disables history features, including request stitching
- Text normalization for `eleven_turbo_v2_5`/`eleven_flash_v2_5`: requires Enterprise plan
- `storeForInpainting` in music generation: Enterprise feature only

### Language and Normalization
- `applyLanguageTextNormalization`: currently Japanese only, significantly increases latency
- Language codes must be ISO 639-1 format (e.g., "en", "es", "ja")
- If the model doesn't support the provided language code, an error is returned

### Text Length Limits
- Model-specific limits vary (check `Model.maximumTextLengthPerRequest`)
- Free tier: typically 2500 characters
- Subscribed tier: typically 5000 characters
- For longer text, split into chunks and use `previousText`/`nextText` for continuity

### Output Format Constraints
- MP3 192kbps: requires Creator tier or above
- PCM 44.1kHz+: requires Pro tier or above
- μ-law format (`ulaw_8000`): commonly used for Twilio audio inputs

### Latency Optimization Trade-offs
- Level 0: Maximum quality, no optimization
- Level 1: ~50% latency reduction, slight quality impact
- Level 2: ~75% latency reduction, moderate quality impact
- Level 3: Maximum optimization, noticeable quality impact
- Level 4: Maximum optimization + no text normalization (may mispronounce numbers/dates)

### Seed and Determinism
- Seed range: 0-4294967295
- Determinism is not guaranteed, but the same seed + the same parameters should produce similar results
- Best results when all parameters (model, voice, settings) are identical

### Error Scenarios

```typescript
import { ElevenLabsClient, ElevenLabsError } from "@elevenlabs/elevenlabs-js";

// Handle text length errors
try {
  const audio = await client.textToSpeech.convert("voiceId", {
    text: veryLongText, // Exceeds model limit
    modelId: "eleven_multilingual_v2"
  });
} catch (error) {
  if (error instanceof ElevenLabsError && error.statusCode === 422) {
    // Split text and retry
    const chunks = splitText(veryLongText, 5000);
    // Process chunks with continuity...
  }
}

// Handle invalid voice ID
try {
  const audio = await client.textToSpeech.convert("invalid_voice_id", {
    text: "Hello"
  });
} catch (error) {
  if (error instanceof ElevenLabsError && error.statusCode === 404) {
    console.error("Voice not found - check voice ID");
    // List available voices
    const voices = await client.voices.getAll();
  }
}

// Handle unsupported language
try {
  const audio = await client.textToSpeech.convert("voiceId", {
    text: "Hello",
    languageCode: "xx" // Invalid code
  });
} catch (error) {
  if (error instanceof ElevenLabsError && error.statusCode === 422) {
    console.error("Language not supported by model");
    // Check model languages
    const models = await client.models.list();
    const model = models.find(m => m.modelId === "eleven_multilingual_v2");
    console.log("Supported languages:", model?.languages);
  }
}
```

## Comprehensive Examples

### Long-Form Content Generation with Chunking

```typescript
import * as fs from "fs";
import { ElevenLabsClient, ElevenLabsError } from "@elevenlabs/elevenlabs-js";

async function generateLongFormAudio(
  voiceId: string,
  longText: string,
  outputPath: string
) {
  const maxChunkLength = 5000;
  const chunks = splitTextIntoChunks(longText, maxChunkLength);
  const audioChunks: Buffer[] = [];

  for (let i = 0; i < chunks.length; i++) {
    const chunk = chunks[i];

    // Use continuity features for natural flow
    const request: TtsRequest = {
      text: chunk,
      modelId: "eleven_multilingual_v2",
      previousText: i > 0 ? chunks[i - 1].slice(-200) : undefined,
      nextText: i < chunks.length - 1 ? chunks[i + 1].slice(0, 200) : undefined,
      voiceSettings: {
        stability: 0.7, // Higher for consistent long-form narration
        similarityBoost: 0.75
      }
    };

    try {
      const audioStream = await client.textToSpeech.convert(voiceId, request, {
        timeoutInSeconds: 120, // Longer timeout for processing
        maxRetries: 3
      });

      const audio = await streamToBuffer(audioStream);
      audioChunks.push(audio);

      // Rate limiting: small delay between chunks
      if (i < chunks.length - 1) {
        await new Promise(resolve => setTimeout(resolve, 200));
      }
    } catch (error) {
      if (error instanceof ElevenLabsError && error.statusCode === 429) {
        // Exponential backoff on rate limit
        const delay = Math.min(1000 * Math.pow(2, i), 10000);
        await new Promise(resolve => setTimeout(resolve, delay));
        i--; // Retry this chunk
        continue;
      }
      throw error;
    }
  }

  // Combine all chunks
  const finalAudio = Buffer.concat(audioChunks);
  fs.writeFileSync(outputPath, finalAudio);
}

function splitTextIntoChunks(text: string, maxLength: number): string[] {
  const chunks: string[] = [];
  let currentChunk = "";

  const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];

  for (const sentence of sentences) {
    if ((currentChunk + sentence).length <= maxLength) {
      currentChunk += sentence;
    } else {
      if (currentChunk) chunks.push(currentChunk.trim());
      currentChunk = sentence;
    }
  }

  if (currentChunk) chunks.push(currentChunk.trim());
  return chunks;
}

async function streamToBuffer(stream: ReadableStream<Uint8Array>): Promise<Buffer> {
  const reader = stream.getReader();
  const chunks: Uint8Array[] = [];

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
  }

  return Buffer.concat(chunks);
}
```

### Multi-Language TTS with Fallback

```typescript
async function generateMultilingualAudio(
  voiceId: string,
  texts: { lang: string; text: string }[]
) {
  const results: { lang: string; audio: Buffer }[] = [];

  for (const { lang, text } of texts) {
    try {
      // Try with language code first
      const audioStream = await client.textToSpeech.convert(voiceId, {
        text,
        languageCode: lang,
        modelId: "eleven_multilingual_v2"
      });

      const audio = await streamToBuffer(audioStream);
      results.push({ lang, audio });

    } catch (error) {
      if (error instanceof ElevenLabsError && error.statusCode === 422) {
        // Language not supported, try without language code
        console.warn(`Language ${lang} not supported, using default`);
        const audioStream = await client.textToSpeech.convert(voiceId, {
          text,
          modelId: "eleven_multilingual_v2"
        });
        const audio = await streamToBuffer(audioStream);
        results.push({ lang: "default", audio });
      } else {
        throw error;
      }
    }
  }

  return results;
}
```

### Dialogue Generation with Error Recovery

```typescript
async function generateDialogueWithRetry(
  inputs: DialogueInput[],
  maxRetries = 3
) {
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const audioStream = await client.textToDialogue.convert({
        inputs,
        modelId: "eleven_multilingual_v2",
        outputFormat: "mp3_44100_128"
      });

      return await streamToBuffer(audioStream);

    } catch (error) {
      if (error instanceof ElevenLabsError) {
        // Don't retry on validation errors
        if (error.statusCode === 422) {
          throw error;
        }

        // Retry on rate limits and server errors
        if (attempt < maxRetries - 1) {
          const delay = Math.pow(2, attempt) * 1000; // Exponential backoff
          console.log(`Retry attempt ${attempt + 1} after ${delay}ms`);
          await new Promise(resolve => setTimeout(resolve, delay));
          continue;
        }
      }
      throw error;
    }
  }
}
```