or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

audio-processing.mdconversational-ai.mddubbing.mdindex.mdmusic.mdrealtime.mdstudio.mdtext-to-speech.mdtranscription.mdvoices.mdworkspace.md

realtime.mddocs/

0

# Realtime Speech-to-Text

1

2

**Node.js Only**: Uses WebSocket (`ws`) and `child_process`. Not compatible with browsers, Deno, or Cloudflare Workers.

3

4

## Access

5

6

```typescript

7

import { ElevenLabsClient, AudioFormat, CommitStrategy, RealtimeEvents } from "@elevenlabs/elevenlabs-js";

8

9

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

10

const connection = await client.speechToText.realtime.connect(options);

11

```

12

13

## Connection Methods

14

15

### speechToText.realtime.connect()

16

17

```typescript { .api }

18

connect(

19

options: AudioOptions | UrlOptions

20

): Promise<RealtimeConnection>

21

22

interface AudioOptions {

23

audioFormat: AudioFormat;

24

sampleRate: number;

25

modelId: string;

26

commitStrategy?: CommitStrategy; // Default: MANUAL

27

vadSilenceThresholdSecs?: number; // >0.3 and ≤3.0

28

vadThreshold?: number; // 0.1-0.9

29

minSpeechDurationMs?: number; // >50 and ≤2000

30

minSilenceDurationMs?: number; // >50 and ≤2000

31

languageCode?: string; // ISO-639-1 or ISO-639-3

32

includeTimestamps?: boolean; // Default: false

33

}

34

35

interface UrlOptions {

36

url: string; // Requires ffmpeg in PATH

37

modelId: string;

38

commitStrategy?: CommitStrategy;

39

vadSilenceThresholdSecs?: number;

40

vadThreshold?: number;

41

minSpeechDurationMs?: number;

42

minSilenceDurationMs?: number;

43

languageCode?: string;

44

includeTimestamps?: boolean;

45

}

46

47

enum AudioFormat {

48

PCM_8000 = "pcm_8000",

49

PCM_16000 = "pcm_16000",

50

PCM_22050 = "pcm_22050",

51

PCM_24000 = "pcm_24000",

52

PCM_44100 = "pcm_44100",

53

PCM_48000 = "pcm_48000",

54

ULAW_8000 = "ulaw_8000"

55

}

56

57

enum CommitStrategy {

58

MANUAL = "manual", // Call connection.commit() to finalize

59

VAD = "vad" // Automatic commits on speech detection

60

}

61

```

62

63

## RealtimeConnection API

64

65

```typescript { .api }

66

class RealtimeConnection {

67

on(event: RealtimeEvents, listener: (...args: unknown[]) => void): void;

68

off(event: RealtimeEvents, listener: (...args: unknown[]) => void): void;

69

70

send(data: {

71

audioBase64: string;

72

commit?: boolean;

73

sampleRate?: number;

74

}): void;

75

76

commit(): void; // Manual commit only

77

close(): void;

78

}

79

80

enum RealtimeEvents {

81

SESSION_STARTED = "session_started",

82

PARTIAL_TRANSCRIPT = "partial_transcript",

83

COMMITTED_TRANSCRIPT = "committed_transcript",

84

COMMITTED_TRANSCRIPT_WITH_TIMESTAMPS = "committed_transcript_with_timestamps",

85

ERROR = "error",

86

AUTH_ERROR = "auth_error",

87

QUOTA_EXCEEDED = "quota_exceeded",

88

OPEN = "open",

89

CLOSE = "close"

90

}

91

```

92

93

## Usage Examples

94

95

### Manual Audio Streaming

96

97

```typescript

98

import { ElevenLabsClient, AudioFormat, CommitStrategy, RealtimeEvents } from "@elevenlabs/elevenlabs-js";

99

100

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

101

102

const conn = await client.speechToText.realtime.connect({

103

modelId: "scribe_v2_realtime",

104

audioFormat: AudioFormat.PCM_16000,

105

sampleRate: 16000,

106

commitStrategy: CommitStrategy.MANUAL

107

});

108

109

conn.on(RealtimeEvents.SESSION_STARTED, (data) => {

110

console.log("Session:", data.session_id);

111

});

112

113

conn.on(RealtimeEvents.PARTIAL_TRANSCRIPT, (data) => {

114

console.log("Partial:", data.text);

115

});

116

117

conn.on(RealtimeEvents.COMMITTED_TRANSCRIPT, (data) => {

118

console.log("Final:", data.text);

119

});

120

121

// Send audio chunks

122

conn.send({ audioBase64: base64Audio });

123

124

// Finalize

125

conn.commit();

126

conn.close();

127

```

128

129

### URL Streaming (requires ffmpeg)

130

131

```typescript

132

const conn = await client.speechToText.realtime.connect({

133

modelId: "scribe_v2_realtime",

134

url: "https://example.com/audio.mp3",

135

commitStrategy: CommitStrategy.VAD

136

});

137

138

conn.on(RealtimeEvents.COMMITTED_TRANSCRIPT, (data) => {

139

console.log("Transcript:", data.text);

140

});

141

```

142

143

### With VAD Configuration

144

145

```typescript

146

const conn = await client.speechToText.realtime.connect({

147

modelId: "scribe_v2_realtime",

148

audioFormat: AudioFormat.PCM_16000,

149

sampleRate: 16000,

150

commitStrategy: CommitStrategy.VAD,

151

vadSilenceThresholdSecs: 0.5,

152

vadThreshold: 0.5,

153

minSpeechDurationMs: 100,

154

minSilenceDurationMs: 200

155

});

156

```

157

158

### With Timestamps

159

160

```typescript

161

const conn = await client.speechToText.realtime.connect({

162

modelId: "scribe_v2_realtime",

163

audioFormat: AudioFormat.PCM_16000,

164

sampleRate: 16000,

165

includeTimestamps: true,

166

languageCode: "en"

167

});

168

169

conn.on(RealtimeEvents.COMMITTED_TRANSCRIPT_WITH_TIMESTAMPS, (data) => {

170

console.log("Text:", data.text);

171

data.words?.forEach(word => {

172

console.log(`"${word.word}": ${word.start_time}s - ${word.end_time}s`);

173

});

174

});

175

```

176

177

## Error Handling

178

179

```typescript

180

conn.on(RealtimeEvents.ERROR, (error) => {

181

console.error("Error:", error.message);

182

});

183

184

conn.on(RealtimeEvents.AUTH_ERROR, (error) => {

185

console.error("Auth error:", error);

186

});

187

188

conn.on(RealtimeEvents.QUOTA_EXCEEDED, (error) => {

189

console.error("Quota exceeded");

190

});

191

```

192

193

## Edge Cases and Important Notes

194

195

### Platform Requirements

196

- **Node.js only**: Uses `ws` package and `child_process` - not compatible with browsers, Deno, or Cloudflare Workers

197

- **URL streaming**: Requires `ffmpeg` installed and available in PATH

198

- **Connection reuse**: Cannot reuse connection after `close()` - create new connection for each session

199

200

### VAD Configuration Constraints

201

- `vadSilenceThresholdSecs`: Must be >0.3 and ≤3.0 (e.g., 0.31 to 3.0)

202

- `vadThreshold`: Range 0.1-0.9, lower = more sensitive

203

- `minSpeechDurationMs`: Must be >50 and ≤2000 (e.g., 51 to 2000)

204

- `minSilenceDurationMs`: Must be >50 and ≤2000 (e.g., 51 to 2000)

205

- VAD parameters are interdependent - adjust together for best results

206

207

### Commit Strategy Behavior

208

- **Manual commit**: Must call `commit()` explicitly with `CommitStrategy.MANUAL`

209

- **VAD commit**: Automatic commits based on voice activity detection

210

- Manual commit gives precise control but requires managing commit timing

211

- VAD commit is easier but may commit at unexpected times

212

213

### Audio Format Requirements

214

- Sample rate must match `audioFormat` (e.g., PCM_16000 requires 16000 Hz)

215

- Audio data must be base64 encoded when sending

216

- For URL streaming, ffmpeg automatically converts to required format

217

218

### Error Handling

219

- Always listen for `ERROR`, `AUTH_ERROR`, and `QUOTA_EXCEEDED` events

220

- Connection may close unexpectedly - handle `CLOSE` event

221

- WebSocket errors are separate from API errors

222

223

### Message Types

224

- Internal message types (`Config`, `InputAudioChunk`, `WordsItem`, etc.) not exported from package root

225

- Access message data through event callbacks only

226

227

### Error Scenarios

228

229

```typescript

230

import { ElevenLabsClient, ElevenLabsError, RealtimeEvents, AudioOptions, UrlOptions } from "@elevenlabs/elevenlabs-js";

231

232

// Handle connection errors

233

async function connectWithRetry(options: AudioOptions | UrlOptions, maxRetries = 3) {

234

for (let attempt = 0; attempt < maxRetries; attempt++) {

235

try {

236

const connection = await client.speechToText.realtime.connect(options);

237

238

// Set up error handlers immediately

239

connection.on(RealtimeEvents.ERROR, (error) => {

240

console.error("Transcription error:", error);

241

});

242

243

connection.on(RealtimeEvents.AUTH_ERROR, (error) => {

244

console.error("Authentication error:", error);

245

connection.close();

246

});

247

248

connection.on(RealtimeEvents.QUOTA_EXCEEDED, (error) => {

249

console.error("Quota exceeded:", error);

250

connection.close();

251

});

252

253

return connection;

254

255

} catch (error) {

256

if (attempt < maxRetries - 1) {

257

console.log(`Connection attempt ${attempt + 1} failed, retrying...`);

258

await new Promise(resolve => setTimeout(resolve, 1000 * (attempt + 1)));

259

continue;

260

}

261

throw error;

262

}

263

}

264

}

265

266

// Handle invalid audio format

267

try {

268

const connection = await client.speechToText.realtime.connect({

269

modelId: "scribe_v2_realtime",

270

audioFormat: AudioFormat.PCM_16000,

271

sampleRate: 22050, // Mismatch!

272

});

273

} catch (error) {

274

console.error("Invalid audio configuration:", error);

275

// Fix: match sampleRate to audioFormat

276

}

277

278

// Handle URL streaming without ffmpeg

279

try {

280

const connection = await client.speechToText.realtime.connect({

281

modelId: "scribe_v2_realtime",

282

url: "https://example.com/audio.mp3"

283

});

284

} catch (error) {

285

if (error instanceof Error && error.message.includes("ffmpeg")) {

286

console.error("ffmpeg not found - install ffmpeg or use AudioOptions instead");

287

}

288

}

289

```

290

291

## Comprehensive Examples

292

293

### Production-Ready Realtime Transcription

294

295

```typescript

296

import {

297

ElevenLabsClient,

298

AudioFormat,

299

CommitStrategy,

300

RealtimeEvents,

301

RealtimeConnection,
  AudioOptions,
  UrlOptions

302

} from "@elevenlabs/elevenlabs-js";

303

import { EventEmitter } from "events";

304

305

class TranscriptionSession extends EventEmitter {

306

private connection: RealtimeConnection | null = null;

307

private isActive = false;

308

private transcripts: string[] = [];

309

310

constructor(private client: ElevenLabsClient) {

311

super();

312

}

313

314

async start(options: AudioOptions | UrlOptions) {

315

if (this.isActive) {

316

throw new Error("Session already active");

317

}

318

319

try {

320

this.connection = await this.client.speechToText.realtime.connect(options);

321

this.isActive = true;

322

323

// Set up all event handlers

324

this.setupEventHandlers();

325

326

this.emit("started");

327

328

} catch (error) {

329

this.isActive = false;

330

this.emit("error", error);

331

throw error;

332

}

333

}

334

335

private setupEventHandlers() {

336

if (!this.connection) return;

337

338

this.connection.on(RealtimeEvents.SESSION_STARTED, (data) => {

339

console.log("Session started:", data.session_id);

340

this.emit("sessionStarted", data);

341

});

342

343

this.connection.on(RealtimeEvents.PARTIAL_TRANSCRIPT, (data) => {

344

this.emit("partialTranscript", data.text);

345

});

346

347

this.connection.on(RealtimeEvents.COMMITTED_TRANSCRIPT, (data) => {

348

this.transcripts.push(data.text);

349

this.emit("committedTranscript", data.text);

350

});

351

352

this.connection.on(RealtimeEvents.COMMITTED_TRANSCRIPT_WITH_TIMESTAMPS, (data) => {

353

this.transcripts.push(data.text);

354

this.emit("committedTranscriptWithTimestamps", {

355

text: data.text,

356

words: data.words

357

});

358

});

359

360

this.connection.on(RealtimeEvents.ERROR, (error) => {

361

console.error("Transcription error:", error);

362

this.emit("error", error);

363

});

364

365

this.connection.on(RealtimeEvents.AUTH_ERROR, (error) => {

366

console.error("Auth error:", error);

367

this.emit("authError", error);

368

this.stop();

369

});

370

371

this.connection.on(RealtimeEvents.QUOTA_EXCEEDED, (error) => {

372

console.error("Quota exceeded:", error);

373

this.emit("quotaExceeded", error);

374

this.stop();

375

});

376

377

this.connection.on(RealtimeEvents.CLOSE, () => {

378

this.isActive = false;

379

this.emit("closed");

380

});

381

}

382

383

sendAudio(audioBase64: string, commit = false) {

384

if (!this.connection || !this.isActive) {

385

throw new Error("Session not active");

386

}

387

388

try {

389

this.connection.send({ audioBase64, commit });

390

} catch (error) {

391

this.emit("error", error);

392

throw error;

393

}

394

}

395

396

commit() {

397

if (!this.connection || !this.isActive) {

398

throw new Error("Session not active");

399

}

400

401

this.connection.commit();

402

}

403

404

stop() {

405

if (this.connection && this.isActive) {

406

this.connection.close();

407

this.connection = null;

408

this.isActive = false;

409

}

410

}

411

412

getTranscripts(): string[] {

413

return [...this.transcripts];

414

}

415

416

getFullTranscript(): string {

417

return this.transcripts.join(" ");

418

}

419

}

420

421

// Usage

422

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

423

const session = new TranscriptionSession(client);

424

425

session.on("committedTranscript", (text) => {

426

console.log("Final:", text);

427

});

428

429

session.on("error", (error) => {

430

console.error("Session error:", error);

431

});

432

433

await session.start({

434

modelId: "scribe_v2_realtime",

435

audioFormat: AudioFormat.PCM_16000,

436

sampleRate: 16000,

437

commitStrategy: CommitStrategy.VAD,

438

includeTimestamps: true

439

});

440

441

// Send audio chunks

442

session.sendAudio(base64AudioChunk1);

443

session.sendAudio(base64AudioChunk2);

444

session.commit(); // If using MANUAL strategy

445

446

// Later

447

session.stop();

448

console.log("Full transcript:", session.getFullTranscript());

449

```

450

451

### Audio Stream Processing with Buffering

452

453

```typescript

454

import { Readable } from "stream";

455

456

async function transcribeAudioStream(

457

audioStream: Readable,

458

sampleRate: number

459

) {

460

const connection = await client.speechToText.realtime.connect({

461

modelId: "scribe_v2_realtime",

462

audioFormat: AudioFormat.PCM_16000,

463

sampleRate: 16000,

464

commitStrategy: CommitStrategy.VAD,

465

vadSilenceThresholdSecs: 0.5,

466

vadThreshold: 0.5

467

});

468

469

const transcripts: string[] = [];

470

471

connection.on(RealtimeEvents.COMMITTED_TRANSCRIPT, (data) => {

472

transcripts.push(data.text);

473

console.log("Transcript:", data.text);

474

});

475

476

// Buffer audio data

477

const buffer: Buffer[] = [];

478

const chunkSize = 1600; // bytes — 50 ms of 16-bit PCM at 16 kHz (2 bytes/sample)

479

480

audioStream.on("data", (chunk: Buffer) => {

481

buffer.push(chunk);

482

483

// Send when buffer is full enough

484

if (buffer.reduce((total, b) => total + b.length, 0) >= chunkSize) {

485

const audioData = Buffer.concat(buffer.splice(0, buffer.length));

486

const base64 = audioData.toString("base64");

487

connection.send({ audioBase64: base64 });

488

}

489

});

490

491

audioStream.on("end", () => {

492

// Send remaining buffer

493

if (buffer.length > 0) {

494

const audioData = Buffer.concat(buffer);

495

const base64 = audioData.toString("base64");

496

connection.send({ audioBase64: base64, commit: true });

497

}

498

499

// Final commit

500

setTimeout(() => {

501

connection.commit();

502

connection.close();

503

}, 1000);

504

});

505

506

audioStream.on("error", (error) => {

507

console.error("Stream error:", error);

508

connection.close();

509

});

510

511

return new Promise<string[]>((resolve) => {

512

connection.on(RealtimeEvents.CLOSE, () => {

513

resolve(transcripts);

514

});

515

});

516

}

517

```

518

519

### URL Streaming with Progress Tracking

520

521

```typescript

522

async function transcribeUrlWithProgress(url: string) {

523

const connection = await client.speechToText.realtime.connect({

524

modelId: "scribe_v2_realtime",

525

url,

526

commitStrategy: CommitStrategy.VAD,

527

includeTimestamps: true

528

});

529

530

const transcripts: Array<{ text: string; timestamp: number }> = [];

531

let startTime = Date.now();

532

533

connection.on(RealtimeEvents.SESSION_STARTED, () => {

534

startTime = Date.now();

535

console.log("Transcription started");

536

});

537

538

connection.on(RealtimeEvents.COMMITTED_TRANSCRIPT_WITH_TIMESTAMPS, (data) => {

539

const elapsed = (Date.now() - startTime) / 1000;

540

transcripts.push({

541

text: data.text,

542

timestamp: elapsed

543

});

544

545

console.log(`[${elapsed.toFixed(2)}s] ${data.text}`);

546

547

if (data.words) {

548

data.words.forEach(word => {

549

console.log(` "${word.text}": ${word.start_time}s - ${word.end_time}s`);

550

});

551

}

552

});

553

554

connection.on(RealtimeEvents.CLOSE, () => {

555

console.log(`Transcription complete. Total time: ${(Date.now() - startTime) / 1000}s`);

556

console.log(`Total transcripts: ${transcripts.length}`);

557

});

558

559

connection.on(RealtimeEvents.ERROR, (error) => {

560

console.error("Error during transcription:", error);

561

});

562

563

// Connection will automatically close when URL stream completes

564

return new Promise<typeof transcripts>((resolve, reject) => {

565

connection.on(RealtimeEvents.CLOSE, () => {

566

resolve(transcripts);

567

});

568

569

connection.on(RealtimeEvents.ERROR, (error) => {

570

reject(error);

571

});

572

});

573

}

574

```

575