# Text-to-Speech

## API Methods

### textToSpeech.convert()

```typescript { .api }
convert(
  voiceId: string,
  request: TtsRequest,
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

### textToSpeech.stream()

```typescript { .api }
stream(
  voiceId: string,
  request: TtsRequest,
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

### textToSpeech.convertWithTimestamps()

```typescript { .api }
convertWithTimestamps(
  voiceId: string,
  request: TtsRequest,
  options?: RequestOptions
): Promise<AudioWithTimestampsResponse>
```

### textToSpeech.streamWithTimestamps()

```typescript { .api }
streamWithTimestamps(
  voiceId: string,
  request: TtsRequest,
  options?: RequestOptions
): Promise<ReadableStream<StreamingAudioChunkWithTimestampsResponse>>
```

## Text-to-Dialogue Methods

### textToDialogue.convert()

```typescript { .api }
convert(
  request: DialogueRequest,
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

### textToDialogue.stream()

```typescript { .api }
stream(
  request: DialogueRequest,
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

### textToDialogue.convertWithTimestamps()

```typescript { .api }
convertWithTimestamps(
  request: DialogueRequest,
  options?: RequestOptions
): Promise<AudioWithTimestampsAndVoiceSegmentsResponseModel>
```

### textToDialogue.streamWithTimestamps()

```typescript { .api }
streamWithTimestamps(
  request: DialogueRequest,
  options?: RequestOptions
): Promise<ReadableStream<StreamingAudioChunkWithTimestampsAndVoiceSegmentsResponseModel>>
```

## Sound Effects

### textToSoundEffects.convert()

```typescript { .api }
convert(
  request: {
    text: string;
    outputFormat?: OutputFormat;
    loop?: boolean; // Only for eleven_text_to_sound_v2
    durationSeconds?: number; // 0.5-30, defaults to auto
    promptInfluence?: number; // 0-1, defaults to 0.3
    modelId?: string;
  },
  options?: RequestOptions
): Promise<ReadableStream<Uint8Array>>
```

## Request Types

### TtsRequest

```typescript { .api }
interface TtsRequest {
  text: string;
  modelId?: string;
  voiceSettings?: VoiceSettings;
  pronunciationDictionaryLocators?: PronunciationDictionaryLocator[];
  languageCode?: string; // ISO 639-1
  outputFormat?: OutputFormat; // Default: mp3_44100_128

  // Latency optimization: 0 (default), 1 (50%), 2 (75%), 3 (max), 4 (max+no normalization)
  optimizeStreamingLatency?: number;

  // Continuity
  previousText?: string;
  nextText?: string;
  previousRequestIds?: string[]; // Max 3
  nextRequestIds?: string[]; // Max 3

  // Control
  seed?: number; // 0-4294967295, for deterministic generation
  enableLogging?: boolean; // false = zero retention (enterprise only)
  usePvcAsIvc?: boolean; // Workaround for PVC latency
  applyTextNormalization?: "auto" | "on" | "off";
  applyLanguageTextNormalization?: boolean; // Increases latency, Japanese only
}
```

### DialogueRequest

```typescript { .api }
interface DialogueRequest {
  inputs: DialogueInput[];
  modelId?: string;
  outputFormat?: OutputFormat;
  languageCode?: string;
  settings?: ModelSettingsResponseModel;
  pronunciationDictionaryLocators?: PronunciationDictionaryLocator[];
  seed?: number;
  applyTextNormalization?: "auto" | "on" | "off";
}

interface DialogueInput {
  text: string;
  voiceId: string;
}

interface ModelSettingsResponseModel {
  stability?: number; // 0-1, lower = more emotional range
}
```

### Response Types

```typescript { .api }
interface AudioWithTimestampsResponse {
  audioBase64: string;
  alignment?: CharacterAlignmentResponseModel;
  normalizedAlignment?: CharacterAlignmentResponseModel;
}

interface CharacterAlignmentResponseModel {
  characters: string[];
  characterStartTimesSeconds: number[];
  characterEndTimesSeconds: number[];
}

interface StreamingAudioChunkWithTimestampsResponse {
  audioBase64: string;
  alignment?: CharacterAlignmentResponseModel;
  normalizedAlignment?: CharacterAlignmentResponseModel;
}

interface AudioWithTimestampsAndVoiceSegmentsResponseModel {
  audioBase64: string;
  alignment?: CharacterAlignmentResponseModel;
  normalizedAlignment?: CharacterAlignmentResponseModel;
  voiceSegments: VoiceSegment[];
}

interface StreamingAudioChunkWithTimestampsAndVoiceSegmentsResponseModel {
  audioBase64: string;
  alignment?: CharacterAlignmentResponseModel;
  normalizedAlignment?: CharacterAlignmentResponseModel;
  voiceSegments: VoiceSegment[];
}

interface VoiceSegment {
  voiceId: string;
  startTimeSeconds: number;
  endTimeSeconds: number;
  characterStartIndex: number;
  characterEndIndex: number;
  dialogueInputIndex: number;
}
```

### VoiceSettings

```typescript { .api }
interface VoiceSettings {
  stability?: number; // 0-1
  similarityBoost?: number; // 0-1
  style?: number; // 0-1, consumes extra resources
  useSpeakerBoost?: boolean; // Increases latency
  speed?: number; // 1.0 = default, <1 = slower, >1 = faster
}
```

### PronunciationDictionaryLocator

```typescript { .api }
interface PronunciationDictionaryLocator {
  pronunciationDictionaryId: string;
  versionId?: string; // Latest if omitted
}
```

### OutputFormat

```typescript { .api }
type OutputFormat =
  | "mp3_22050_32" | "mp3_24000_48" | "mp3_44100_32" | "mp3_44100_64"
  | "mp3_44100_96" | "mp3_44100_128" | "mp3_44100_192" // 192 requires Creator+
  | "pcm_8000" | "pcm_16000" | "pcm_22050" | "pcm_24000"
  | "pcm_32000" | "pcm_44100" | "pcm_48000" // 44.1kHz+ requires Pro+
  | "ulaw_8000" // For Twilio
  | "alaw_8000"
  | "opus_48000_32" | "opus_48000_64" | "opus_48000_96"
  | "opus_48000_128" | "opus_48000_192";
```

## Usage Examples

### Basic TTS

```typescript
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

const audio = await client.textToSpeech.convert("21m00Tcm4TlvDq8ikWAM", {
  text: "Hello! This is a test.",
  modelId: "eleven_multilingual_v2",
  voiceSettings: { stability: 0.5, similarityBoost: 0.75 }
});
```

### Streaming

```typescript
const stream = await client.textToSpeech.stream("21m00Tcm4TlvDq8ikWAM", {
  text: "Streaming audio for low latency.",
  modelId: "eleven_flash_v2_5",
  optimizeStreamingLatency: 3
});

for await (const chunk of stream) {
  processAudioChunk(chunk);
}
```

### With Timestamps

```typescript
const result = await client.textToSpeech.convertWithTimestamps(
  "21m00Tcm4TlvDq8ikWAM",
  { text: "Audio with timing data.", modelId: "eleven_multilingual_v2" }
);

result.alignment?.characters.forEach((char, i) => {
  console.log(`"${char}": ${result.alignment.characterStartTimesSeconds[i]}s`);
});
```

### Dialogue

```typescript
const dialogue = await client.textToDialogue.convert({
  inputs: [
    { text: "Knock knock", voiceId: "JBFqnCBsd6RMkjVDRZzb" },
    { text: "Who's there?", voiceId: "Aw4FAjKCGjjNkVhN1Xmq" }
  ],
  modelId: "eleven_multilingual_v2",
  outputFormat: "mp3_44100_128"
});
```

### Dialogue with Voice Segments

```typescript
const result = await client.textToDialogue.convertWithTimestamps({
  inputs: [
    { text: "Hello, how are you?", voiceId: "bYTqZQo3Jz7LQtmGTgwi" },
    { text: "I'm well, thank you!", voiceId: "6lCwbsX1yVjD49QmpkTR" }
  ]
});

result.voiceSegments.forEach(seg => {
  console.log(`Voice ${seg.voiceId}: ${seg.startTimeSeconds}s-${seg.endTimeSeconds}s`);
});
```

### Sound Effects

```typescript
const sfx = await client.textToSoundEffects.convert({
  text: "Spacious braam for movie trailer",
  durationSeconds: 5.0,
  promptInfluence: 0.5,
  outputFormat: "mp3_44100_128"
});

// Looping sound
const loop = await client.textToSoundEffects.convert({
  text: "Ambient forest sounds",
  loop: true,
  durationSeconds: 10.0,
  modelId: "eleven_text_to_sound_v2"
});
```

### Request Continuity

```typescript
// Chain segments with context
const seg1 = await client.textToSpeech.convert("voiceId", {
  text: "First part.",
  nextText: "Second part."
});

const seg2 = await client.textToSpeech.convert("voiceId", {
  text: "Second part.",
  previousText: "First part.",
  nextText: "Third part."
});

// Or use request IDs
const req1 = await client.textToSpeech.convert("voiceId", {
  text: "Part 1"
});

const req2 = await client.textToSpeech.convert("voiceId", {
  text: "Part 2",
  previousRequestIds: [req1.requestId]
});
```

### Deterministic Generation

```typescript
const audio1 = await client.textToSpeech.convert("voiceId", {
  text: "Same seed produces same audio",
  seed: 12345,
  modelId: "eleven_multilingual_v2"
});

// Identical parameters produce identical audio
const audio2 = await client.textToSpeech.convert("voiceId", {
  text: "Same seed produces same audio",
  seed: 12345,
  modelId: "eleven_multilingual_v2"
});
```

### Pronunciation Dictionaries

```typescript
const audio = await client.textToSpeech.convert("voiceId", {
  text: "The API uses REST architecture",
  pronunciationDictionaryLocators: [
    { pronunciationDictionaryId: "dict_1", versionId: "v1" },
    { pronunciationDictionaryId: "dict_2" } // Uses latest
  ]
});
```

### Multi-Language

```typescript
const spanish = await client.textToSpeech.convert("voiceId", {
  text: "Hola, ¿cómo estás?",
  languageCode: "es",
  modelId: "eleven_multilingual_v2"
});

const japanese = await client.textToSpeech.convert("voiceId", {
  text: "こんにちは",
  languageCode: "ja",
  modelId: "eleven_multilingual_v2",
  applyLanguageTextNormalization: true // Warning: increases latency
});
```

## Stream Consumption Pattern

```typescript
// ReadableStream<Uint8Array> - use async iteration
const stream = await client.textToSpeech.stream("voiceId", { text: "..." });
for await (const chunk of stream) {
  // Process Uint8Array chunks
}

// Alternative: manual reader
const reader = stream.getReader();
while (true) {
  const { done, value } = await reader.read();
  if (done) break;
  processChunk(value);
}
```

## Error Handling

```typescript
import { ElevenLabsClient, ElevenLabsError } from "@elevenlabs/elevenlabs-js";

try {
  const audio = await client.textToSpeech.convert("voiceId", {
    text: "Hello",
    modelId: "eleven_multilingual_v2"
  });
} catch (error) {
  if (error instanceof ElevenLabsError) {
    console.error(`API error ${error.statusCode}: ${error.message}`);
  }
  throw error;
}
```

## Latency Optimization Levels

- **0**: Default, max quality, no optimization
- **1**: Normal optimization (~50% latency reduction)
- **2**: Strong optimization (~75% latency reduction)
- **3**: Max optimization
- **4**: Max optimization + no text normalization (may mispronounce numbers/dates)

## Edge Cases and Important Notes

### Request Continuity Constraints

- `previousRequestIds`/`nextRequestIds` max: 3 items each
- If both `previousText` and `previousRequestIds` are provided, `previousText` is ignored
- The same behavior applies to `nextText` and `nextRequestIds`
- Request IDs must be from the same model for best continuity
- Maximum 3 pronunciation dictionaries per request

### Enterprise-Only Features

- Zero retention mode (`enableLogging: false`): disables history features, including request stitching
- Text normalization for `eleven_turbo_v2_5`/`eleven_flash_v2_5`: requires Enterprise plan
- `storeForInpainting` in music generation: Enterprise feature only

### Language and Normalization

- `applyLanguageTextNormalization`: currently Japanese only, significantly increases latency
- Language codes must be in ISO 639-1 format (e.g., "en", "es", "ja")
- If the model does not support the provided language code, the API returns an error

### Text Length Limits

- Model-specific limits vary (check `Model.maximumTextLengthPerRequest`)
- Free tier: typically 2500 characters
- Subscribed tier: typically 5000 characters
- For longer text, split it into chunks and use `previousText`/`nextText` for continuity

### Output Format Constraints

- MP3 192kbps: requires Creator tier or above
- PCM 44.1kHz+: requires Pro tier or above
- μ-law format (`ulaw_8000`): commonly used for Twilio audio inputs

### Latency Optimization Trade-offs

- Level 0: Maximum quality, no optimization
- Level 1: ~50% latency reduction, slight quality impact
- Level 2: ~75% latency reduction, moderate quality impact
- Level 3: Maximum optimization, noticeable quality impact
- Level 4: Maximum optimization + no text normalization (may mispronounce numbers/dates)

### Seed and Determinism

- Seed range: 0-4294967295
- Determinism is not guaranteed, but the same seed with identical parameters should produce similar results
- Best results when all parameters (model, voice, settings) are identical

### Error Scenarios

```typescript
import { ElevenLabsClient, ElevenLabsError } from "@elevenlabs/elevenlabs-js";

// Handle text length errors
try {
  const audio = await client.textToSpeech.convert("voiceId", {
    text: veryLongText, // Exceeds model limit
    modelId: "eleven_multilingual_v2"
  });
} catch (error) {
  if (error instanceof ElevenLabsError && error.statusCode === 422) {
    // Split text and retry
    const chunks = splitText(veryLongText, 5000);
    // Process chunks with continuity...
  }
}

// Handle invalid voice ID
try {
  const audio = await client.textToSpeech.convert("invalid_voice_id", {
    text: "Hello"
  });
} catch (error) {
  if (error instanceof ElevenLabsError && error.statusCode === 404) {
    console.error("Voice not found - check voice ID");
    // List available voices
    const voices = await client.voices.getAll();
  }
}

// Handle unsupported language
try {
  const audio = await client.textToSpeech.convert("voiceId", {
    text: "Hello",
    languageCode: "xx" // Invalid code
  });
} catch (error) {
  if (error instanceof ElevenLabsError && error.statusCode === 422) {
    console.error("Language not supported by model");
    // Check model languages
    const models = await client.models.list();
    const model = models.find(m => m.modelId === "eleven_multilingual_v2");
    console.log("Supported languages:", model?.languages);
  }
}
```

## Comprehensive Examples

### Long-Form Content Generation with Chunking

```typescript
import * as fs from "fs";
import { ElevenLabsClient, ElevenLabsError } from "@elevenlabs/elevenlabs-js";

async function generateLongFormAudio(
  voiceId: string,
  longText: string,
  outputPath: string
) {
  const maxChunkLength = 5000;
  const chunks = splitTextIntoChunks(longText, maxChunkLength);
  const audioChunks: Buffer[] = [];

  for (let i = 0; i < chunks.length; i++) {
    const chunk = chunks[i];

    // Use continuity features for natural flow
    const request: TtsRequest = {
      text: chunk,
      modelId: "eleven_multilingual_v2",
      previousText: i > 0 ? chunks[i - 1].slice(-200) : undefined,
      nextText: i < chunks.length - 1 ? chunks[i + 1].slice(0, 200) : undefined,
      voiceSettings: {
        stability: 0.7, // Higher for consistent long-form narration
        similarityBoost: 0.75
      }
    };

    try {
      const audioStream = await client.textToSpeech.convert(voiceId, request, {
        timeoutInSeconds: 120, // Longer timeout for processing
        maxRetries: 3
      });

      const audio = await streamToBuffer(audioStream);
      audioChunks.push(audio);

      // Rate limiting: small delay between chunks
      if (i < chunks.length - 1) {
        await new Promise(resolve => setTimeout(resolve, 200));
      }
    } catch (error) {
      if (error instanceof ElevenLabsError && error.statusCode === 429) {
        // Exponential backoff on rate limit
        const delay = Math.min(1000 * Math.pow(2, i), 10000);
        await new Promise(resolve => setTimeout(resolve, delay));
        i--; // Retry this chunk
        continue;
      }
      throw error;
    }
  }

  // Combine all chunks
  const finalAudio = Buffer.concat(audioChunks);
  fs.writeFileSync(outputPath, finalAudio);
}

function splitTextIntoChunks(text: string, maxLength: number): string[] {
  const chunks: string[] = [];
  let currentChunk = "";

  const sentences = text.match(/[^.!?]+[.!?]+/g) || [text];

  for (const sentence of sentences) {
    if ((currentChunk + sentence).length <= maxLength) {
      currentChunk += sentence;
    } else {
      if (currentChunk) chunks.push(currentChunk.trim());
      currentChunk = sentence;
    }
  }

  if (currentChunk) chunks.push(currentChunk.trim());
  return chunks;
}

async function streamToBuffer(stream: ReadableStream<Uint8Array>): Promise<Buffer> {
  const reader = stream.getReader();
  const chunks: Uint8Array[] = [];

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
  }

  return Buffer.concat(chunks);
}
```

### Multi-Language TTS with Fallback

```typescript
async function generateMultilingualAudio(
  voiceId: string,
  texts: { lang: string; text: string }[]
) {
  const results: { lang: string; audio: Buffer }[] = [];

  for (const { lang, text } of texts) {
    try {
      // Try with language code first
      const audioStream = await client.textToSpeech.convert(voiceId, {
        text,
        languageCode: lang,
        modelId: "eleven_multilingual_v2"
      });

      const audio = await streamToBuffer(audioStream);
      results.push({ lang, audio });

    } catch (error) {
      if (error instanceof ElevenLabsError && error.statusCode === 422) {
        // Language not supported, try without language code
        console.warn(`Language ${lang} not supported, using default`);
        const audioStream = await client.textToSpeech.convert(voiceId, {
          text,
          modelId: "eleven_multilingual_v2"
        });
        const audio = await streamToBuffer(audioStream);
        results.push({ lang: "default", audio });
      } else {
        throw error;
      }
    }
  }

  return results;
}
```

### Dialogue Generation with Error Recovery

```typescript
async function generateDialogueWithRetry(
  inputs: DialogueInput[],
  maxRetries = 3
) {
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const audioStream = await client.textToDialogue.convert({
        inputs,
        modelId: "eleven_multilingual_v2",
        outputFormat: "mp3_44100_128"
      });

      return await streamToBuffer(audioStream);

    } catch (error) {
      if (error instanceof ElevenLabsError) {
        // Don't retry on validation errors
        if (error.statusCode === 422) {
          throw error;
        }

        // Retry on rate limits and server errors
        if (attempt < maxRetries - 1) {
          const delay = Math.pow(2, attempt) * 1000; // Exponential backoff
          console.log(`Retry attempt ${attempt + 1} after ${delay}ms`);
          await new Promise(resolve => setTimeout(resolve, delay));
          continue;
        }
      }
      throw error;
    }
  }
}
```