or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

audio

audio-processing.md, realtime-transcription.md, speech-to-speech.md, speech-to-text.md, text-to-speech.md
index.md
tile.json

docs/audio/realtime-transcription.md

Real-time Transcription

WebSocket-based real-time speech transcription with word-level timestamps, voice activity detection, and low-latency streaming. Enables live audio transcription for applications requiring immediate results.

Wrapper Extension: Real-time transcription functionality is provided by the SpeechToText wrapper class which extends the base Speech-to-Text client. The client.speechToText.realtime property provides access to ScribeRealtime which manages WebSocket connections via the RealtimeConnection class.

Platform Requirements: Real-time transcription requires Node.js (WebSocket support via 'ws' module). URL streaming also requires ffmpeg.

Quick Reference

import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";

const client = new ElevenLabsClient({ apiKey: "your-api-key" });
// Access this API via: client.speechToText.realtime.connect()

Importing Real-time Types

import {
  // Core classes
  RealtimeConnection,

  // Enums
  RealtimeEvents,
  AudioFormat,
  CommitStrategy,

  // Configuration types
  AudioOptions,
  UrlOptions,

  // Message types
  SessionStartedMessage,
  PartialTranscriptMessage,
  CommittedTranscriptMessage,
  CommittedTranscriptWithTimestampsMessage,

  // Error types
  RealtimeErrorPayload,
  ServerErrorMessage,
  ErrorMessage,
  AuthErrorMessage,
  QuotaExceededErrorMessage,
  CommitThrottledErrorMessage,
  TranscriberErrorMessage,
  UnacceptedTermsErrorMessage,
  RateLimitedErrorMessage,
  InputErrorMessage,
  QueueOverflowErrorMessage,
  ResourceExhaustedErrorMessage,
  SessionTimeLimitExceededErrorMessage,
  ChunkSizeExceededErrorMessage,
  InsufficientAudioActivityErrorMessage,

  // Utility types
  RealtimeEventMap,
  SessionConfig,
  InputAudioChunk,
  WordsItem
} from "@elevenlabs/elevenlabs-js";

Capabilities

Connect to Real-time Transcription

Establish a WebSocket connection for streaming audio transcription.

/**
 * Establish a WebSocket connection for streaming audio transcription.
 *
 * @param options - Audio format and transcription settings. Pass AudioOptions
 *   (audioFormat + sampleRate) to stream raw audio chunks yourself via
 *   connection.send(), or UrlOptions (has a `url` field) to have the SDK
 *   stream from a remote URL (requires ffmpeg)
 * @returns Promise resolving to RealtimeConnection
 */
client.speechToText.realtime.connect(
  options: AudioOptions | UrlOptions
): Promise<RealtimeConnection>;

/**
 * Connection options for streaming raw audio chunks to the transcriber.
 * The caller supplies base64-encoded audio via RealtimeConnection.send().
 */
interface AudioOptions {
  /** Model ID for transcription */
  modelId: string;
  /** Audio format (PCM or ULAW) */
  audioFormat: AudioFormat;
  /** Sample rate in Hz */
  sampleRate: number;
  /** Commit strategy for finalizing transcripts */
  commitStrategy?: CommitStrategy;
  /** Silence duration before committing (0.3-3.0 seconds) */
  vadSilenceThresholdSecs?: number;
  /** Voice activity detection threshold (0.1-0.9) */
  vadThreshold?: number;
  /** Minimum speech duration in ms (50-2000) */
  minSpeechDurationMs?: number;
  /** Minimum silence duration in ms (50-2000) */
  minSilenceDurationMs?: number;
  /** Language code (ISO-639-1 or ISO-639-3) */
  languageCode?: string;
  /** Include word-level timestamps */
  includeTimestamps?: boolean;
}

/**
 * Connection options for streaming audio from a remote URL (requires ffmpeg
 * on the host; Node.js only). Shares all tuning fields with AudioOptions,
 * but no audioFormat/sampleRate are specified by the caller.
 */
interface UrlOptions {
  /** Model ID for transcription */
  modelId: string;
  /** URL to stream audio from (requires ffmpeg) */
  url: string;
  /** Commit strategy for finalizing transcripts */
  commitStrategy?: CommitStrategy;
  /** Silence duration before committing (0.3-3.0 seconds) */
  vadSilenceThresholdSecs?: number;
  /** Voice activity detection threshold (0.1-0.9) */
  vadThreshold?: number;
  /** Minimum speech duration in ms (50-2000) */
  minSpeechDurationMs?: number;
  /** Minimum silence duration in ms (50-2000) */
  minSilenceDurationMs?: number;
  /** Language code (ISO-639-1 or ISO-639-3) */
  languageCode?: string;
  /** Include word-level timestamps */
  includeTimestamps?: boolean;
}

/**
 * Supported input audio encodings. Each name encodes codec + sample rate in
 * Hz (e.g. PCM_16000 = linear PCM at 16 kHz, ULAW_8000 = mu-law at 8 kHz).
 * NOTE(review): the microphone example feeds ffmpeg "s16le" output, which
 * suggests PCM here is 16-bit little-endian — confirm against the API docs.
 */
enum AudioFormat {
  PCM_8000 = "pcm_8000",
  PCM_16000 = "pcm_16000",
  PCM_22050 = "pcm_22050",
  PCM_24000 = "pcm_24000",
  PCM_44100 = "pcm_44100",
  PCM_48000 = "pcm_48000",
  ULAW_8000 = "ulaw_8000",
}

/** Controls how transcript segments are finalized ("committed"). */
enum CommitStrategy {
  /** Automatic commit via voice activity detection (server commits after the configured silence threshold) */
  VAD = "vad",
  /** Manual commit - call commit() to finalize transcripts */
  MANUAL = "manual",
}

RealtimeConnection

Event-driven WebSocket connection for real-time transcription.

/**
 * Event-driven WebSocket connection for real-time transcription.
 * Obtained from client.speechToText.realtime.connect(); emits the events
 * listed in RealtimeEvents (payload types per RealtimeEventMap).
 */
class RealtimeConnection extends EventEmitter {
  /**
   * Send audio chunk for transcription
   * @param data - Audio data and options
   * @param data.audioBase64 - Base64 encoded audio data
   * @param data.commit - Whether to commit this chunk (finalize transcript)
   * @param data.sampleRate - Sample rate in Hz (overrides connection default)
   * @param data.previousText - Previous text for context
   */
  send(data: {
    audioBase64: string;
    commit?: boolean;
    sampleRate?: number;
    previousText?: string;
  }): void;

  /**
   * Commit the current segment, triggering a COMMITTED_TRANSCRIPT event.
   * Only needed when using CommitStrategy.MANUAL; with CommitStrategy.VAD
   * the server commits automatically after detected silence.
   */
  commit(): void;

  /**
   * Close the WebSocket connection
   */
  close(): void;

  /**
   * Register event listeners (payload types per RealtimeEventMap)
   */
  on(event: RealtimeEvents, listener: (data: any) => void): this;

  /**
   * Register one-time event listener
   */
  once(event: RealtimeEvents, listener: (data: any) => void): this;

  /**
   * Remove event listener
   */
  off(event: RealtimeEvents, listener: (data: any) => void): this;
}

Events

/**
 * Event names emitted by RealtimeConnection.
 * The payload type for each event is given by RealtimeEventMap.
 */
enum RealtimeEvents {
  /** Session successfully started */
  SESSION_STARTED = "session_started",
  /** Partial (interim) transcript available */
  PARTIAL_TRANSCRIPT = "partial_transcript",
  /** Final committed transcript available */
  COMMITTED_TRANSCRIPT = "committed_transcript",
  /** Final transcript with word-level timestamps */
  COMMITTED_TRANSCRIPT_WITH_TIMESTAMPS = "committed_transcript_with_timestamps",
  /** Error occurred (server message or native WebSocket error) */
  ERROR = "error",
  /** Authentication error */
  AUTH_ERROR = "auth_error",
  /** Quota exceeded */
  QUOTA_EXCEEDED = "quota_exceeded",
  /** Commit throttled */
  COMMIT_THROTTLED = "commit_throttled",
  /** Transcriber error */
  TRANSCRIBER_ERROR = "transcriber_error",
  /** Terms not accepted */
  UNACCEPTED_TERMS_ERROR = "unaccepted_terms_error",
  /** Rate limited */
  RATE_LIMITED = "rate_limited",
  /** Input error */
  INPUT_ERROR = "input_error",
  /** Queue overflow */
  QUEUE_OVERFLOW = "queue_overflow",
  /** Resource exhausted */
  RESOURCE_EXHAUSTED = "resource_exhausted",
  /** Session time limit exceeded */
  SESSION_TIME_LIMIT_EXCEEDED = "session_time_limit_exceeded",
  /** Chunk size exceeded */
  CHUNK_SIZE_EXCEEDED = "chunk_size_exceeded",
  /** Insufficient audio activity */
  INSUFFICIENT_AUDIO_ACTIVITY = "insufficient_audio_activity",
  /** WebSocket opened (no payload) */
  OPEN = "open",
  /** WebSocket closed (no payload) */
  CLOSE = "close",
}

Message Types

/** Server message sent once when the transcription session is established. */
interface SessionStartedMessage {
  message_type: "session_started";
  /** Unique session identifier */
  session_id: string;
  /** Session configuration */
  config: SessionConfig;
}

/** Interim transcript; text may change until the segment is committed. */
interface PartialTranscriptMessage {
  message_type: "partial_transcript";
  /** Interim transcript text */
  text: string;
}

/** Final transcript for a committed segment. */
interface CommittedTranscriptMessage {
  message_type: "committed_transcript";
  /** Final transcript text */
  text: string;
}

/** Final transcript with word-level timing (when includeTimestamps is enabled). */
interface CommittedTranscriptWithTimestampsMessage {
  message_type: "committed_transcript_with_timestamps";
  /** Final transcript text */
  text: string;
  /** Detected language code */
  language_code?: string;
  /** Array of word timing information */
  words?: WordsItem[];
}

/** Timing entry for a single word or spacing item within a transcript. */
interface WordsItem {
  /** Word text */
  text?: string;
  /** Start time in seconds */
  start?: number;
  /** End time in seconds */
  end?: number;
  /** Type of item */
  type?: "word" | "spacing";
  /** Speaker identifier */
  speaker_id?: string;
  /** Log probability (confidence) */
  logprob?: number;
  /** Individual characters */
  characters?: string[];
}

/**
 * Session configuration echoed back by the server in
 * SessionStartedMessage.config. Field names are the snake_case wire form of
 * the camelCase connect() options.
 *
 * Renamed from `Config` to `SessionConfig` for consistency with the exported
 * name used in the import list and by SessionStartedMessage.
 */
interface SessionConfig {
  /** Model ID used for transcription */
  model_id?: string;
  /** Audio format of the input stream */
  audio_format?: AudioFormat;
  /** Sample rate in Hz */
  sample_rate?: number;
  /** Commit strategy in effect */
  vad_commit_strategy?: CommitStrategy;
  /** Language code (ISO-639-1 or ISO-639-3) */
  language_code?: string;
  /** Whether word-level timestamps are included */
  include_timestamps?: boolean;
  /** Silence duration before committing (seconds) */
  vad_silence_threshold_secs?: number;
  /** Voice activity detection threshold */
  vad_threshold?: number;
  /** Minimum speech duration in ms */
  min_speech_duration_ms?: number;
  /** Minimum silence duration in ms */
  min_silence_duration_ms?: number;
  /** Whether server-side logging is disabled */
  disable_logging?: boolean;
}

Error Message Types

All error messages include a message_type field and an error string describing the issue.

/** Generic error (ERROR event). */
interface ErrorMessage {
  message_type: "error";
  error: string;
}

/** Authentication error (AUTH_ERROR event). */
interface AuthErrorMessage {
  message_type: "auth_error";
  error: string;
}

/** Quota exceeded (QUOTA_EXCEEDED event). */
interface QuotaExceededErrorMessage {
  message_type: "quota_exceeded";
  error: string;
}

/** Commit throttled (COMMIT_THROTTLED event). */
interface CommitThrottledErrorMessage {
  message_type: "commit_throttled";
  error: string;
}

/** Transcriber error (TRANSCRIBER_ERROR event). */
interface TranscriberErrorMessage {
  message_type: "transcriber_error";
  error: string;
}

/** Terms not accepted (UNACCEPTED_TERMS_ERROR event). */
interface UnacceptedTermsErrorMessage {
  message_type: "unaccepted_terms_error";
  error: string;
}

/** Rate limited (RATE_LIMITED event). */
interface RateLimitedErrorMessage {
  message_type: "rate_limited";
  error: string;
}

/** Input error (INPUT_ERROR event). */
interface InputErrorMessage {
  message_type: "input_error";
  error: string;
}

/** Queue overflow (QUEUE_OVERFLOW event). */
interface QueueOverflowErrorMessage {
  message_type: "queue_overflow";
  error: string;
}

/** Resource exhausted (RESOURCE_EXHAUSTED event). */
interface ResourceExhaustedErrorMessage {
  message_type: "resource_exhausted";
  error: string;
}

/** Session time limit exceeded (SESSION_TIME_LIMIT_EXCEEDED event). */
interface SessionTimeLimitExceededErrorMessage {
  message_type: "session_time_limit_exceeded";
  error: string;
}

/** Audio chunk too large (CHUNK_SIZE_EXCEEDED event). */
interface ChunkSizeExceededErrorMessage {
  message_type: "chunk_size_exceeded";
  error: string;
}

/** Insufficient audio activity (INSUFFICIENT_AUDIO_ACTIVITY event). */
interface InsufficientAudioActivityErrorMessage {
  message_type: "insufficient_audio_activity";
  error: string;
}

/**
 * Union type for all server error messages.
 * Discriminated by the literal `message_type` field on each member.
 */
type ServerErrorMessage =
  | ErrorMessage
  | AuthErrorMessage
  | QuotaExceededErrorMessage
  | CommitThrottledErrorMessage
  | TranscriberErrorMessage
  | UnacceptedTermsErrorMessage
  | RateLimitedErrorMessage
  | InputErrorMessage
  | QueueOverflowErrorMessage
  | ResourceExhaustedErrorMessage
  | SessionTimeLimitExceededErrorMessage
  | ChunkSizeExceededErrorMessage
  | InsufficientAudioActivityErrorMessage;

/**
 * Union type for all possible error payloads emitted by the ERROR event.
 * Can be a server error message or a native WebSocket error; narrow with
 * `"message_type" in payload` (or `instanceof Error`) before use.
 */
type RealtimeErrorPayload = ServerErrorMessage | Error;

Type-Safe Event Handling

The SDK provides a type-safe event map for RealtimeConnection events.

/**
 * Type-safe event map for RealtimeConnection events.
 * Maps each event to its corresponding payload type.
 * OPEN and CLOSE carry no payload (undefined).
 */
interface RealtimeEventMap {
  [RealtimeEvents.SESSION_STARTED]: SessionStartedMessage;
  [RealtimeEvents.PARTIAL_TRANSCRIPT]: PartialTranscriptMessage;
  [RealtimeEvents.COMMITTED_TRANSCRIPT]: CommittedTranscriptMessage;
  [RealtimeEvents.COMMITTED_TRANSCRIPT_WITH_TIMESTAMPS]: CommittedTranscriptWithTimestampsMessage;
  [RealtimeEvents.ERROR]: RealtimeErrorPayload;
  [RealtimeEvents.AUTH_ERROR]: AuthErrorMessage;
  [RealtimeEvents.QUOTA_EXCEEDED]: QuotaExceededErrorMessage;
  [RealtimeEvents.OPEN]: undefined;
  [RealtimeEvents.CLOSE]: undefined;
  [RealtimeEvents.COMMIT_THROTTLED]: CommitThrottledErrorMessage;
  [RealtimeEvents.TRANSCRIBER_ERROR]: TranscriberErrorMessage;
  [RealtimeEvents.UNACCEPTED_TERMS_ERROR]: UnacceptedTermsErrorMessage;
  [RealtimeEvents.RATE_LIMITED]: RateLimitedErrorMessage;
  [RealtimeEvents.INPUT_ERROR]: InputErrorMessage;
  [RealtimeEvents.QUEUE_OVERFLOW]: QueueOverflowErrorMessage;
  [RealtimeEvents.RESOURCE_EXHAUSTED]: ResourceExhaustedErrorMessage;
  [RealtimeEvents.SESSION_TIME_LIMIT_EXCEEDED]: SessionTimeLimitExceededErrorMessage;
  [RealtimeEvents.CHUNK_SIZE_EXCEEDED]: ChunkSizeExceededErrorMessage;
  [RealtimeEvents.INSUFFICIENT_AUDIO_ACTIVITY]: InsufficientAudioActivityErrorMessage;
}

Input Audio Format

/**
 * Wire format of an audio chunk sent over the WebSocket. Fields are the
 * snake_case counterparts of RealtimeConnection.send()'s camelCase
 * arguments (audioBase64, commit, sampleRate, previousText).
 */
interface InputAudioChunk {
  message_type: "input_audio_chunk";
  /** Base64 encoded audio data */
  audio_base_64: string;
  /** Whether to commit (finalize) transcript */
  commit: boolean;
  /** Sample rate in Hz */
  sample_rate: number;
  /** Previous text for context */
  previous_text?: string;
}

Usage Examples

Basic Real-time Transcription

// Basic end-to-end flow: connect, subscribe to transcript events, stream
// base64-encoded audio chunks, then close.
import { ElevenLabsClient, AudioFormat, CommitStrategy } from "@elevenlabs/elevenlabs-js";

const client = new ElevenLabsClient({ apiKey: "your-api-key" });

// Connect to real-time transcription
const connection = await client.speechToText.realtime.connect({
  modelId: "scribe-v1",
  audioFormat: AudioFormat.PCM_16000,
  sampleRate: 16000,
  commitStrategy: CommitStrategy.VAD,
});

// Listen for session start
connection.on("session_started", (data) => {
  console.log("Session started:", data.session_id);
});

// Listen for partial (interim) transcripts
connection.on("partial_transcript", (data) => {
  console.log("Partial:", data.text);
});

// Listen for final transcripts
connection.on("committed_transcript", (data) => {
  console.log("Final:", data.text);
});

// Listen for errors
connection.on("error", (error) => {
  console.error("Error:", error);
});

// Send audio chunks (must be base64 encoded; audioChunk is your raw audio buffer)
const audioBase64 = Buffer.from(audioChunk).toString('base64');
connection.send({ audioBase64, commit: false }); // Don't commit yet
connection.send({ audioBase64, commit: true });  // Commit this chunk

// Or use separate commit
// NOTE(review): this connection uses CommitStrategy.VAD, which commits
// automatically; explicit commit() is documented for CommitStrategy.MANUAL —
// confirm it is also supported alongside VAD.
connection.send({ audioBase64 });
connection.commit(); // Commit manually

// Close when done
connection.close();

Voice Activity Detection

// Use VAD for automatic transcript commits: the server finalizes a segment
// after the configured silence threshold, so no explicit commit() is needed.
const connection = await client.speechToText.realtime.connect({
  modelId: "scribe-v1",
  audioFormat: AudioFormat.PCM_16000,
  sampleRate: 16000,
  commitStrategy: CommitStrategy.VAD,
  vadSilenceThresholdSecs: 1.0, // Commit after 1 second of silence
  vadThreshold: 0.5, // Medium sensitivity
});

connection.on("committed_transcript", (data) => {
  console.log("Auto-committed:", data.text);
});

// Just send audio, VAD handles commits
for (const chunk of audioChunks) {
  const audioBase64 = Buffer.from(chunk).toString('base64');
  connection.send({ audioBase64 });
}

With Word-Level Timestamps

// Enable word-level timestamps; timed results arrive on the
// "committed_transcript_with_timestamps" event.
const connection = await client.speechToText.realtime.connect({
  modelId: "scribe-v1",
  audioFormat: AudioFormat.PCM_16000,
  sampleRate: 16000,
  commitStrategy: CommitStrategy.VAD,
  includeTimestamps: true,
});

connection.on("committed_transcript_with_timestamps", (data) => {
  console.log("Transcript:", data.text);
  console.log("Language:", data.language_code);

  // Each entry is a WordsItem with optional start/end times in seconds
  if (data.words) {
    for (const word of data.words) {
      console.log(`${word.text} [${word.start}s - ${word.end}s]`);
    }
  }
});

Stream from URL

// Stream audio from URL using UrlOptions (requires ffmpeg; see Platform
// Requirements) — no audioFormat/sampleRate fields are passed here.
const connection = await client.speechToText.realtime.connect({
  modelId: "scribe-v1",
  url: "https://example.com/audio-stream.mp3",
  commitStrategy: CommitStrategy.VAD,
});

connection.on("partial_transcript", (data) => {
  console.log("Streaming:", data.text);
});

connection.on("committed_transcript", (data) => {
  console.log("Final:", data.text);
});

Multi-Language Transcription

// Transcribe in a specific language (ISO-639-1 or ISO-639-3 code)
const connection = await client.speechToText.realtime.connect({
  modelId: "scribe-v1",
  audioFormat: AudioFormat.PCM_16000,
  sampleRate: 16000,
  commitStrategy: CommitStrategy.VAD,
  languageCode: "es", // Spanish
});

Microphone Recording

import { spawn } from "child_process";

// Connect with high-quality audio settings
const connection = await client.speechToText.realtime.connect({
  modelId: "scribe-v1",
  audioFormat: AudioFormat.PCM_44100,
  sampleRate: 44100,
  commitStrategy: CommitStrategy.VAD,
  vadSilenceThresholdSecs: 0.8,
  includeTimestamps: true,
});

// Capture microphone input with ffmpeg, emitting raw audio on stdout
const ffmpeg = spawn("ffmpeg", [
  "-f", "avfoundation", // macOS (use "alsa" for Linux, "dshow" for Windows)
  "-i", ":0", // default audio input device
  "-ar", "44100", // output sample rate — must match sampleRate above
  "-ac", "1", // mono
  "-f", "s16le", // raw 16-bit little-endian PCM
  "-", // write to stdout
]);

// Stream microphone audio to transcription
ffmpeg.stdout.on("data", (chunk) => {
  const audioBase64 = chunk.toString('base64');
  connection.send({ audioBase64 });
});

connection.on("committed_transcript", (data) => {
  console.log("You said:", data.text);
});

// Stop recording: kill the capture process, then close the WebSocket
setTimeout(() => {
  ffmpeg.kill();
  connection.close();
}, 30000); // Record for 30 seconds

Error Handling

const connection = await client.speechToText.realtime.connect({
  modelId: "scribe-v1",
  audioFormat: AudioFormat.PCM_16000,
  sampleRate: 16000,
});

// Handle specific error types
connection.on("auth_error", () => {
  console.error("Authentication failed");
});

connection.on("quota_exceeded", () => {
  console.error("Quota exceeded");
});

connection.on("rate_limited", () => {
  console.error("Rate limited, slow down");
});

connection.on("chunk_size_exceeded", () => {
  console.error("Audio chunk too large");
});

// Catch-all: payload is RealtimeErrorPayload (server message or native Error)
connection.on("error", (error) => {
  console.error("General error:", error);
});

// Handle connection close
connection.on("close", () => {
  console.log("Connection closed");
});

Advanced VAD Configuration

// Fine-tune voice activity detection (see AudioOptions for valid ranges)
const connection = await client.speechToText.realtime.connect({
  modelId: "scribe-v1",
  audioFormat: AudioFormat.PCM_16000,
  sampleRate: 16000,
  commitStrategy: CommitStrategy.VAD,
  vadSilenceThresholdSecs: 0.5, // Commit after 0.5s silence
  vadThreshold: 0.3, // Lower threshold = more sensitive
  minSpeechDurationMs: 100, // Minimum 100ms of speech
  minSilenceDurationMs: 200, // Minimum 200ms of silence
});