WebSocket-based real-time speech transcription with word-level timestamps, voice activity detection, and low-latency streaming. Enables live audio transcription for applications requiring immediate results.
Wrapper Extension: Real-time transcription is provided by the SpeechToText wrapper class, which extends the base Speech-to-Text client. The client.speechToText.realtime property exposes ScribeRealtime, which manages WebSocket connections through the RealtimeConnection class.
Platform Requirements: Real-time transcription requires Node.js (WebSocket support via 'ws' module). URL streaming also requires ffmpeg.
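Before connecting with UrlOptions, it can be worth verifying that ffmpeg is actually on the PATH. A minimal sketch using plain Node.js (this check is not part of the SDK):

import { spawnSync } from "child_process";

// Returns true if an ffmpeg binary is reachable on the PATH.
function ffmpegAvailable(): boolean {
  return spawnSync("ffmpeg", ["-version"], { stdio: "ignore" }).status === 0;
}

if (!ffmpegAvailable()) {
  console.warn("ffmpeg not found; URL streaming will not work");
}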
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
const client = new ElevenLabsClient({ apiKey: "your-api-key" });
// Access this API via: client.speechToText.realtime.connect()

import {
// Core classes
RealtimeConnection,
// Enums
RealtimeEvents,
AudioFormat,
CommitStrategy,
// Configuration types
AudioOptions,
UrlOptions,
// Message types
SessionStartedMessage,
PartialTranscriptMessage,
CommittedTranscriptMessage,
CommittedTranscriptWithTimestampsMessage,
// Error types
RealtimeErrorPayload,
ServerErrorMessage,
ErrorMessage,
AuthErrorMessage,
QuotaExceededErrorMessage,
CommitThrottledErrorMessage,
TranscriberErrorMessage,
UnacceptedTermsErrorMessage,
RateLimitedErrorMessage,
InputErrorMessage,
QueueOverflowErrorMessage,
ResourceExhaustedErrorMessage,
SessionTimeLimitExceededErrorMessage,
ChunkSizeExceededErrorMessage,
InsufficientAudioActivityErrorMessage,
// Utility types
RealtimeEventMap,
SessionConfig,
InputAudioChunk,
WordsItem
} from "@elevenlabs/elevenlabs-js";Establish a WebSocket connection for streaming audio transcription.
/**
* @param options - Audio format and transcription settings
* @returns Promise resolving to RealtimeConnection
*/
client.speechToText.realtime.connect(
options: AudioOptions | UrlOptions
): Promise<RealtimeConnection>;
interface AudioOptions {
/** Model ID for transcription */
modelId: string;
/** Audio format (PCM or ULAW) */
audioFormat: AudioFormat;
/** Sample rate in Hz */
sampleRate: number;
/** Commit strategy for finalizing transcripts */
commitStrategy?: CommitStrategy;
/** Silence duration before committing (0.3-3.0 seconds) */
vadSilenceThresholdSecs?: number;
/** Voice activity detection threshold (0.1-0.9) */
vadThreshold?: number;
/** Minimum speech duration in ms (50-2000) */
minSpeechDurationMs?: number;
/** Minimum silence duration in ms (50-2000) */
minSilenceDurationMs?: number;
/** Language code (ISO-639-1 or ISO-639-3) */
languageCode?: string;
/** Include word-level timestamps */
includeTimestamps?: boolean;
}
interface UrlOptions {
/** Model ID for transcription */
modelId: string;
/** URL to stream audio from (requires ffmpeg) */
url: string;
/** Commit strategy */
commitStrategy?: CommitStrategy;
/** VAD silence threshold */
vadSilenceThresholdSecs?: number;
/** VAD threshold */
vadThreshold?: number;
/** Minimum speech duration */
minSpeechDurationMs?: number;
/** Minimum silence duration */
minSilenceDurationMs?: number;
/** Language code */
languageCode?: string;
/** Include timestamps */
includeTimestamps?: boolean;
}
enum AudioFormat {
PCM_8000 = "pcm_8000",
PCM_16000 = "pcm_16000",
PCM_22050 = "pcm_22050",
PCM_24000 = "pcm_24000",
PCM_44100 = "pcm_44100",
PCM_48000 = "pcm_48000",
ULAW_8000 = "ulaw_8000",
}
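The pcm_* values name sample rates; the microphone example later in this document feeds 16-bit little-endian mono PCM (ffmpeg s16le), so the sketch below assumes that encoding when converting Float32 samples (e.g., from the Web Audio API) into a base64 chunk for send():

// Convert Float32 samples in [-1, 1] to 16-bit little-endian mono PCM,
// then base64-encode for RealtimeConnection.send().
// Assumes pcm_* formats are 16-bit LE, matching the ffmpeg s16le
// capture example below.
function float32ToPcm16Base64(samples: Float32Array): string {
  const buf = Buffer.alloc(samples.length * 2);
  for (let i = 0; i < samples.length; i++) {
    const s = Math.max(-1, Math.min(1, samples[i]));
    buf.writeInt16LE(Math.round(s * 32767), i * 2);
  }
  return buf.toString("base64");
}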
enum CommitStrategy {
/** Automatic commit via voice activity detection */
VAD = "vad",
/** Manual commit - call commit() to finalize transcripts */
MANUAL = "manual",
}
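With CommitStrategy.MANUAL, nothing is finalized until you call commit(); segment boundaries are entirely yours to choose. A minimal sketch, assuming client from the setup above and base64 chunks produced elsewhere:

const manualConnection = await client.speechToText.realtime.connect({
  modelId: "scribe-v1",
  audioFormat: AudioFormat.PCM_16000,
  sampleRate: 16000,
  commitStrategy: CommitStrategy.MANUAL,
});

// Stream audio freely; only partial transcripts arrive until commit().
manualConnection.send({ audioBase64: chunkA }); // chunkA/chunkB: placeholder base64 strings
manualConnection.send({ audioBase64: chunkB });

// Finalize the segment, e.g., when the user releases a push-to-talk key.
manualConnection.commit();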
Event-driven WebSocket connection for real-time transcription.

class RealtimeConnection extends EventEmitter {
/**
* Send audio chunk for transcription
* @param data - Audio data and options
* @param data.audioBase64 - Base64 encoded audio data
* @param data.commit - Whether to commit this chunk (finalize transcript)
* @param data.sampleRate - Sample rate in Hz (overrides connection default)
* @param data.previousText - Previous text for context
*/
send(data: {
audioBase64: string;
commit?: boolean;
sampleRate?: number;
previousText?: string;
}): void;
/**
* Commit the current segment, triggering a COMMITTED_TRANSCRIPT event
* Only needed when using CommitStrategy.MANUAL; with CommitStrategy.VAD, commits happen automatically
*/
commit(): void;
/**
* Close the WebSocket connection
*/
close(): void;
/**
* Register event listeners
*/
on(event: RealtimeEvents, listener: (data: any) => void): this;
/**
* Register one-time event listener
*/
once(event: RealtimeEvents, listener: (data: any) => void): this;
/**
* Remove event listener
*/
off(event: RealtimeEvents, listener: (data: any) => void): this;
}
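once() and off() behave like their EventEmitter counterparts. For example, assuming a connection obtained from connect(), you can wait for session start a single time and detach a partial-transcript logger later:

import { RealtimeEvents } from "@elevenlabs/elevenlabs-js";

// Runs once, then the listener is removed automatically.
connection.once(RealtimeEvents.SESSION_STARTED, (data) => {
  console.log("Session:", data.session_id);
});

// Keep a reference so the listener can be detached later.
const logPartial = (data: { text: string }) => console.log("Partial:", data.text);
connection.on(RealtimeEvents.PARTIAL_TRANSCRIPT, logPartial);
// ...later:
connection.off(RealtimeEvents.PARTIAL_TRANSCRIPT, logPartial);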
enum RealtimeEvents {
/** Session successfully started */
SESSION_STARTED = "session_started",
/** Partial (interim) transcript available */
PARTIAL_TRANSCRIPT = "partial_transcript",
/** Final committed transcript available */
COMMITTED_TRANSCRIPT = "committed_transcript",
/** Final transcript with word-level timestamps */
COMMITTED_TRANSCRIPT_WITH_TIMESTAMPS = "committed_transcript_with_timestamps",
/** Error occurred */
ERROR = "error",
/** Authentication error */
AUTH_ERROR = "auth_error",
/** Quota exceeded */
QUOTA_EXCEEDED = "quota_exceeded",
/** Commit throttled */
COMMIT_THROTTLED = "commit_throttled",
/** Transcriber error */
TRANSCRIBER_ERROR = "transcriber_error",
/** Terms not accepted */
UNACCEPTED_TERMS_ERROR = "unaccepted_terms_error",
/** Rate limited */
RATE_LIMITED = "rate_limited",
/** Input error */
INPUT_ERROR = "input_error",
/** Queue overflow */
QUEUE_OVERFLOW = "queue_overflow",
/** Resource exhausted */
RESOURCE_EXHAUSTED = "resource_exhausted",
/** Session time limit exceeded */
SESSION_TIME_LIMIT_EXCEEDED = "session_time_limit_exceeded",
/** Chunk size exceeded */
CHUNK_SIZE_EXCEEDED = "chunk_size_exceeded",
/** Insufficient audio activity */
INSUFFICIENT_AUDIO_ACTIVITY = "insufficient_audio_activity",
/** WebSocket opened */
OPEN = "open",
/** WebSocket closed */
CLOSE = "close",
}

interface SessionStartedMessage {
message_type: "session_started";
/** Unique session identifier */
session_id: string;
/** Session configuration */
config: SessionConfig;
}
interface PartialTranscriptMessage {
message_type: "partial_transcript";
/** Interim transcript text */
text: string;
}
interface CommittedTranscriptMessage {
message_type: "committed_transcript";
/** Final transcript text */
text: string;
}
interface CommittedTranscriptWithTimestampsMessage {
message_type: "committed_transcript_with_timestamps";
/** Final transcript text */
text: string;
/** Detected language code */
language_code?: string;
/** Array of word timing information */
words?: WordsItem[];
}
interface WordsItem {
/** Word text */
text?: string;
/** Start time in seconds */
start?: number;
/** End time in seconds */
end?: number;
/** Type of item */
type?: "word" | "spacing";
/** Speaker identifier */
speaker_id?: string;
/** Log probability (confidence) */
logprob?: number;
/** Individual characters */
characters?: string[];
}
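Because the words array interleaves "word" and "spacing" items, timing math usually filters on type first. A short sketch, assuming msg is a CommittedTranscriptWithTimestampsMessage:

// Keep only spoken words; "spacing" items represent inter-word gaps.
const spoken = (msg.words ?? []).filter((w) => w.type === "word");
for (const w of spoken) {
  if (w.start !== undefined && w.end !== undefined) {
    console.log(`${w.text}: ${(w.end - w.start).toFixed(2)}s`);
  }
}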
/** Session configuration echoed back in SessionStartedMessage (snake_case mirror of the connect options) */
interface SessionConfig {
model_id?: string;
audio_format?: AudioFormat;
sample_rate?: number;
vad_commit_strategy?: CommitStrategy;
language_code?: string;
include_timestamps?: boolean;
vad_silence_threshold_secs?: number;
vad_threshold?: number;
min_speech_duration_ms?: number;
min_silence_duration_ms?: number;
disable_logging?: boolean;
}

All error messages include a message_type field and an error string describing the issue.
interface ErrorMessage {
message_type: "error";
error: string;
}
interface AuthErrorMessage {
message_type: "auth_error";
error: string;
}
interface QuotaExceededErrorMessage {
message_type: "quota_exceeded";
error: string;
}
interface CommitThrottledErrorMessage {
message_type: "commit_throttled";
error: string;
}
interface TranscriberErrorMessage {
message_type: "transcriber_error";
error: string;
}
interface UnacceptedTermsErrorMessage {
message_type: "unaccepted_terms_error";
error: string;
}
interface RateLimitedErrorMessage {
message_type: "rate_limited";
error: string;
}
interface InputErrorMessage {
message_type: "input_error";
error: string;
}
interface QueueOverflowErrorMessage {
message_type: "queue_overflow";
error: string;
}
interface ResourceExhaustedErrorMessage {
message_type: "resource_exhausted";
error: string;
}
interface SessionTimeLimitExceededErrorMessage {
message_type: "session_time_limit_exceeded";
error: string;
}
interface ChunkSizeExceededErrorMessage {
message_type: "chunk_size_exceeded";
error: string;
}
interface InsufficientAudioActivityErrorMessage {
message_type: "insufficient_audio_activity";
error: string;
}
/**
* Union type for all server error messages
*/
type ServerErrorMessage =
| ErrorMessage
| AuthErrorMessage
| QuotaExceededErrorMessage
| CommitThrottledErrorMessage
| TranscriberErrorMessage
| UnacceptedTermsErrorMessage
| RateLimitedErrorMessage
| InputErrorMessage
| QueueOverflowErrorMessage
| ResourceExhaustedErrorMessage
| SessionTimeLimitExceededErrorMessage
| ChunkSizeExceededErrorMessage
| InsufficientAudioActivityErrorMessage;
/**
* Union type for all possible error payloads emitted by the ERROR event.
* Can be a server error message or a native WebSocket error.
*/
type RealtimeErrorPayload = ServerErrorMessage | Error;

The SDK provides a type-safe event map for RealtimeConnection events.
/**
* Type-safe event map for RealtimeConnection events.
* Maps each event to its corresponding payload type.
*/
interface RealtimeEventMap {
[RealtimeEvents.SESSION_STARTED]: SessionStartedMessage;
[RealtimeEvents.PARTIAL_TRANSCRIPT]: PartialTranscriptMessage;
[RealtimeEvents.COMMITTED_TRANSCRIPT]: CommittedTranscriptMessage;
[RealtimeEvents.COMMITTED_TRANSCRIPT_WITH_TIMESTAMPS]: CommittedTranscriptWithTimestampsMessage;
[RealtimeEvents.ERROR]: RealtimeErrorPayload;
[RealtimeEvents.AUTH_ERROR]: AuthErrorMessage;
[RealtimeEvents.QUOTA_EXCEEDED]: QuotaExceededErrorMessage;
[RealtimeEvents.OPEN]: undefined;
[RealtimeEvents.CLOSE]: undefined;
[RealtimeEvents.COMMIT_THROTTLED]: CommitThrottledErrorMessage;
[RealtimeEvents.TRANSCRIBER_ERROR]: TranscriberErrorMessage;
[RealtimeEvents.UNACCEPTED_TERMS_ERROR]: UnacceptedTermsErrorMessage;
[RealtimeEvents.RATE_LIMITED]: RateLimitedErrorMessage;
[RealtimeEvents.INPUT_ERROR]: InputErrorMessage;
[RealtimeEvents.QUEUE_OVERFLOW]: QueueOverflowErrorMessage;
[RealtimeEvents.RESOURCE_EXHAUSTED]: ResourceExhaustedErrorMessage;
[RealtimeEvents.SESSION_TIME_LIMIT_EXCEEDED]: SessionTimeLimitExceededErrorMessage;
[RealtimeEvents.CHUNK_SIZE_EXCEEDED]: ChunkSizeExceededErrorMessage;
[RealtimeEvents.INSUFFICIENT_AUDIO_ACTIVITY]: InsufficientAudioActivityErrorMessage;
}
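Since the documented on() signature types the payload as any, RealtimeEventMap can be used to recover precise payload types. The helper below is illustrative, not part of the SDK:

import { RealtimeConnection, RealtimeEventMap, RealtimeEvents } from "@elevenlabs/elevenlabs-js";

// Illustrative wrapper: narrows the listener payload to the event's mapped type.
function onTyped<E extends RealtimeEvents>(
  conn: RealtimeConnection,
  event: E,
  listener: (data: RealtimeEventMap[E]) => void,
): void {
  conn.on(event, listener);
}

// data is inferred as CommittedTranscriptWithTimestampsMessage here.
onTyped(connection, RealtimeEvents.COMMITTED_TRANSCRIPT_WITH_TIMESTAMPS, (data) => {
  console.log(data.text, data.words?.length ?? 0, "items");
});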
message_type: "input_audio_chunk";
/** Base64 encoded audio data */
audio_base_64: string;
/** Whether to commit (finalize) transcript */
commit: boolean;
/** Sample rate in Hz */
sample_rate: number;
/** Previous text for context */
previous_text?: string;
}
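For reference, each send() call is delivered to the server as a message of this shape; the mapping below is inferred from the InputAudioChunk interface rather than separately documented:

import { InputAudioChunk } from "@elevenlabs/elevenlabs-js";

// connection.send({ audioBase64: "<base64>", commit: true, sampleRate: 16000 })
// plausibly corresponds to this wire message (field names per InputAudioChunk):
const wireMessage: InputAudioChunk = {
  message_type: "input_audio_chunk",
  audio_base_64: "<base64>",
  commit: true,
  sample_rate: 16000,
};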
import { ElevenLabsClient, AudioFormat, CommitStrategy } from "@elevenlabs/elevenlabs-js";

const client = new ElevenLabsClient({ apiKey: "your-api-key" });
// Connect to real-time transcription
const connection = await client.speechToText.realtime.connect({
modelId: "scribe-v1",
audioFormat: AudioFormat.PCM_16000,
sampleRate: 16000,
commitStrategy: CommitStrategy.VAD,
});
// Listen for session start
connection.on("session_started", (data) => {
console.log("Session started:", data.session_id);
});
// Listen for partial transcripts
connection.on("partial_transcript", (data) => {
console.log("Partial:", data.text);
});
// Listen for final transcripts
connection.on("committed_transcript", (data) => {
console.log("Final:", data.text);
});
// Listen for errors
connection.on("error", (error) => {
console.error("Error:", error);
});
// Send audio chunks (must be base64 encoded)
const audioBase64 = Buffer.from(audioChunk).toString('base64');
connection.send({ audioBase64, commit: false }); // Don't commit yet
connection.send({ audioBase64, commit: true }); // Commit this chunk
// Or use separate commit
connection.send({ audioBase64 });
connection.commit(); // Commit manually
// Close when done
connection.close();

// Use VAD for automatic transcript commits
const connection = await client.speechToText.realtime.connect({
modelId: "scribe-v1",
audioFormat: AudioFormat.PCM_16000,
sampleRate: 16000,
commitStrategy: CommitStrategy.VAD,
vadSilenceThresholdSecs: 1.0, // Commit after 1 second of silence
vadThreshold: 0.5, // Medium sensitivity
});
connection.on("committed_transcript", (data) => {
console.log("Auto-committed:", data.text);
});
// Just send audio, VAD handles commits
for (const chunk of audioChunks) {
const audioBase64 = Buffer.from(chunk).toString('base64');
connection.send({ audioBase64 });
}

// Enable word-level timestamps
const connection = await client.speechToText.realtime.connect({
modelId: "scribe-v1",
audioFormat: AudioFormat.PCM_16000,
sampleRate: 16000,
commitStrategy: CommitStrategy.VAD,
includeTimestamps: true,
});
connection.on("committed_transcript_with_timestamps", (data) => {
console.log("Transcript:", data.text);
console.log("Language:", data.language_code);
if (data.words) {
for (const word of data.words) {
console.log(`${word.text} [${word.start}s - ${word.end}s]`);
}
}
});

// Stream audio from URL (requires ffmpeg)
const connection = await client.speechToText.realtime.connect({
modelId: "scribe-v1",
url: "https://example.com/audio-stream.mp3",
commitStrategy: CommitStrategy.VAD,
});
connection.on("partial_transcript", (data) => {
console.log("Streaming:", data.text);
});
connection.on("committed_transcript", (data) => {
console.log("Final:", data.text);
});

// Transcribe in a specific language
const connection = await client.speechToText.realtime.connect({
modelId: "scribe-v1",
audioFormat: AudioFormat.PCM_16000,
sampleRate: 16000,
commitStrategy: CommitStrategy.VAD,
languageCode: "es", // Spanish
});

import { spawn } from "child_process";
// Connect with high-quality audio settings
const connection = await client.speechToText.realtime.connect({
modelId: "scribe-v1",
audioFormat: AudioFormat.PCM_44100,
sampleRate: 44100,
commitStrategy: CommitStrategy.VAD,
vadSilenceThresholdSecs: 0.8,
includeTimestamps: true,
});
// Capture microphone input with ffmpeg
const ffmpeg = spawn("ffmpeg", [
"-f", "avfoundation", // macOS (use "alsa" for Linux, "dshow" for Windows)
"-i", ":0",
"-ar", "44100",
"-ac", "1",
"-f", "s16le",
"-",
]);
// Stream microphone audio to transcription
ffmpeg.stdout.on("data", (chunk) => {
const audioBase64 = chunk.toString('base64');
connection.send({ audioBase64 });
});
connection.on("committed_transcript", (data) => {
console.log("You said:", data.text);
});
// Stop recording
setTimeout(() => {
ffmpeg.kill();
connection.close();
}, 30000); // Record for 30 seconds

const connection = await client.speechToText.realtime.connect({
modelId: "scribe-v1",
audioFormat: AudioFormat.PCM_16000,
sampleRate: 16000,
});
// Handle specific error types
connection.on("auth_error", () => {
console.error("Authentication failed");
});
connection.on("quota_exceeded", () => {
console.error("Quota exceeded");
});
connection.on("rate_limited", () => {
console.error("Rate limited, slow down");
});
connection.on("chunk_size_exceeded", () => {
console.error("Audio chunk too large");
});
connection.on("error", (error) => {
console.error("General error:", error);
});
// Handle connection close
connection.on("close", () => {
console.log("Connection closed");
});

// Fine-tune voice activity detection
const connection = await client.speechToText.realtime.connect({
modelId: "scribe-v1",
audioFormat: AudioFormat.PCM_16000,
sampleRate: 16000,
commitStrategy: CommitStrategy.VAD,
vadSilenceThresholdSecs: 0.5, // Commit after 0.5s silence
vadThreshold: 0.3, // Lower threshold = more sensitive
minSpeechDurationMs: 100, // Minimum 100ms of speech
minSilenceDurationMs: 200, // Minimum 200ms of silence
});