Transcribe audio and video files with support for multi-channel audio, webhook notifications, and real-time WebSocket transcription. The Speech-to-Text API converts spoken audio into text with high accuracy.
Note: The client.speechToText property uses an enhanced wrapper class that extends the base API client with real-time transcription capabilities via WebSocket. For real-time transcription, see Real-time Transcription.
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
const client = new ElevenLabsClient({ apiKey: "your-api-key" });
// Access this API via: client.speechToText

Transcribe audio or video files to text with optional webhook notifications.
/**
 * @param request - Audio/video file and transcription options
 * @param requestOptions - Optional request configuration
 * @returns Transcription result
 * @throws UnprocessableEntityError if request fails
 */
client.speechToText.convert(
  request: BodySpeechToTextV1SpeechToTextPost,
  requestOptions?: RequestOptions
): HttpResponsePromise<SpeechToTextConvertResponse>;
interface BodySpeechToTextV1SpeechToTextPost {
  /** The ID of the model to use for transcription; currently only 'scribe_v1' and 'scribe_v1_experimental' are available */
  modelId: string;
  /** The file to transcribe. All major audio and video formats are supported. Exactly one of the file or cloudStorageUrl parameters must be provided. The file size must be less than 3.0GB */
  file?: File | Blob;
  /** The HTTPS URL of the file to transcribe. Exactly one of the file or cloudStorageUrl parameters must be provided. The file must be accessible via HTTPS and the file size must be less than 2GB. Any valid HTTPS URL is accepted, including URLs from cloud storage providers (AWS S3, Google Cloud Storage, Cloudflare R2, etc.), CDNs, or any other HTTPS source. URLs can be pre-signed or include authentication tokens in query parameters */
  cloudStorageUrl?: string;
  /** When enableLogging is set to false, zero retention mode will be used for the request. This means log and transcript storage features are unavailable for this request. Zero retention mode may only be used by enterprise customers */
  enableLogging?: boolean;
  /** An ISO-639-1 or ISO-639-3 language code corresponding to the language of the audio file. Can sometimes improve transcription performance if known beforehand. Defaults to null, in which case the language is predicted automatically */
  languageCode?: string;
  /** Whether to tag audio events like (laughter), (footsteps), etc. in the transcription */
  tagAudioEvents?: boolean;
  /** The maximum number of speakers talking in the uploaded file. Can help with predicting who speaks when. The maximum number of speakers that can be predicted is 32. Defaults to null, in which case the number of speakers is set to the maximum value the model supports */
  numSpeakers?: number;
  /** The granularity of the timestamps in the transcription. 'word' provides word-level timestamps and 'character' provides character-level timestamps per word */
  timestampsGranularity?: "word" | "character";
  /** Whether to annotate which speaker is currently talking in the uploaded file */
  diarize?: boolean;
  /** Diarization threshold to apply during speaker diarization. A higher value lowers the chance of one speaker being split into two different speakers, but raises the chance of two different speakers being merged into one (fewer total speakers predicted); a lower value does the opposite (more total speakers predicted). Can only be set when diarize=true and numSpeakers=null. Defaults to null, in which case a threshold is chosen based on the modelId (usually 0.22) */
  diarizationThreshold?: number;
  /** A list of additional formats to export the transcript to. Options include 'srt', 'vtt', 'json', 'txt' */
  additionalFormats?: string[];
  /** The format of input audio. Options are 'pcm_s16le_16' or 'other'. For 'pcm_s16le_16', the input audio must be 16-bit PCM at a 16kHz sample rate, single channel (mono), and little-endian byte order. Latency will be lower than with passing an encoded waveform */
  fileFormat?: "pcm_s16le_16" | "other";
  /** Whether to send the transcription result to configured speech-to-text webhooks. If set, the request will return early without the transcription, which will be delivered later via webhook */
  webhook?: boolean;
  /** Optional specific webhook ID to send the transcription result to. Only valid when webhook is set to true. If not provided, the transcription will be sent to all configured speech-to-text webhooks */
  webhookId?: string;
  /** Controls the randomness of the transcription output. Accepts values between 0.0 and 2.0, where higher values produce more varied and less deterministic output. If omitted, a temperature based on the selected model is used (usually 0) */
  temperature?: number;
  /** If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same seed and parameters should return the same result. Determinism is not guaranteed. Must be an integer between 0 and 2147483647 */
  seed?: number;
  /** Whether the audio file contains multiple channels where each channel contains a single speaker. When enabled, each channel will be transcribed independently and the results will be combined. Each word in the response will include a 'channel_index' field indicating which channel it was spoken on. A maximum of 5 channels is supported */
  useMultiChannel?: boolean;
  /** Optional metadata to be included in the webhook response. This should be a JSON string representing an object with a maximum depth of 2 levels and a maximum size of 16KB. Useful for tracking internal IDs, job references, or other contextual information */
  webhookMetadata?: string;
  /** Detect entities in the transcript. Can be 'all' to detect all entities, a single entity type or category string, or a list of entity types/categories. Categories include 'pii', 'phi', 'pci', 'other', 'offensive_language'. When enabled, detected entities will be returned in the 'entities' field with their text, type, and character positions */
  entityDetection?: string | string[];
  /** A list of keyterms to bias the transcription towards. The keyterms are words or phrases you want the model to recognise more accurately. The number of keyterms cannot exceed 100. Each keyterm must be less than 50 characters long and can contain at most 5 words (after normalisation). For example: ["hello", "world", "technical term"] */
  keyterms?: string[];
}
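For files that are already hosted, cloudStorageUrl can be passed in place of file. The following is a minimal sketch using the client created above; the URL and keyterms are placeholders for illustration, and speaker diarization is enabled via the documented diarize flag.

// Transcribe a remotely hosted file with speaker diarization
// (the URL and keyterms below are placeholders)
const remoteResult = await client.speechToText.convert({
  modelId: "scribe_v1",
  cloudStorageUrl: "https://example.com/path/to/meeting.mp3", // pre-signed or public HTTPS URL
  diarize: true,
  tagAudioEvents: false,
  keyterms: ["quarterly roadmap", "Scribe"],
});
console.log(remoteResult.text);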
interface SpeechToTextConvertResponse {
  /** Transcribed text */
  text: string;
  /** Array of channel transcripts (for multi-channel audio) */
  channels?: ChannelTranscript[];
}
interface ChannelTranscript {
  /** Channel number (0-indexed) */
  channel: number;
  /** Transcribed text for this channel */
  text: string;
  /** Word-level timing information */
  words?: WordTimestamp[];
}
interface WordTimestamp {
  /** Word text */
  word: string;
  /** Start time in seconds */
  start: number;
  /** End time in seconds */
  end: number;
  /** Confidence score (0.0 to 1.0) */
  confidence?: number;
}

Manage transcription jobs and retrieve results.
/**
 * Get status and result of a transcription job
 */
client.speechToText.transcripts.get(
  transcript_id: string,
  requestOptions?: RequestOptions
): HttpResponsePromise<TranscriptResponse>;
/**
 * Delete a previously generated transcript
 * @param transcription_id - The unique ID of the transcript to delete
 * @param requestOptions - Optional request configuration
 * @throws UnauthorizedError if not authenticated
 * @throws UnprocessableEntityError if request fails
 */
client.speechToText.transcripts.delete(
  transcription_id: string,
  requestOptions?: RequestOptions
): HttpResponsePromise<unknown>;
interface TranscriptResponse {
  /** Unique transcript identifier */
  transcript_id: string;
  /** Transcription status */
  status: "queued" | "processing" | "completed" | "failed";
  /** Transcribed text (when complete) */
  text?: string;
  /** Multi-channel transcripts */
  channels?: ChannelTranscript[];
  /** Error message (if failed) */
  error?: string;
}

import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
import { readFile } from "fs/promises";
const client = new ElevenLabsClient({ apiKey: "your-api-key" });
// Load audio file
const audioFile = await readFile("recording.mp3");
// Transcribe audio
const result = await client.speechToText.convert({
  modelId: "scribe_v1",
  file: new File([audioFile], "recording.mp3"),
  languageCode: "en",
});
console.log("Transcription:", result.text);

// Transcribe multi-channel audio (e.g., stereo interview)
const audioFile = await readFile("interview.wav");
const result = await client.speechToText.convert({
  modelId: "scribe_v1",
  file: new File([audioFile], "interview.wav"),
  languageCode: "en",
  useMultiChannel: true,
});
// Process each channel separately
if (result.channels) {
  for (const channel of result.channels) {
    console.log(`Channel ${channel.channel}:`, channel.text);
    // Access word-level timestamps
    if (channel.words) {
      for (const word of channel.words) {
        console.log(` ${word.word} [${word.start}s - ${word.end}s]`);
      }
    }
  }
}

// Submit transcription with webhook for async results
const audioFile = await readFile("long-recording.mp3");
await client.speechToText.convert({
  modelId: "scribe_v1",
  file: new File([audioFile], "long-recording.mp3"),
  webhook: true,
  webhookMetadata: JSON.stringify({
    user_id: "12345",
    session_id: "abc-def",
  }),
});
// Your webhook endpoint will receive:
// {
//   transcript_id: "...",
//   status: "completed",
//   text: "...",
//   metadata: "{\"user_id\":\"12345\",\"session_id\":\"abc-def\"}"
// }

// Transcribe video file (audio will be extracted)
const videoFile = await readFile("presentation.mp4");
const result = await client.speechToText.convert({
  modelId: "scribe_v1",
  file: new File([videoFile], "presentation.mp4"),
  languageCode: "en",
});
console.log("Video transcription:", result.text);

// Submit transcription job
const audioFile = await readFile("audio.mp3");
const initialResult = await client.speechToText.convert({
  modelId: "scribe_v1",
  file: new File([audioFile], "audio.mp3"),
  webhook: true,
});
// If using async processing, poll for results
async function waitForTranscription(transcriptId: string) {
  while (true) {
    const status = await client.speechToText.transcripts.get(transcriptId);
    if (status.status === "completed") {
      return status.text;
    } else if (status.status === "failed") {
      throw new Error(`Transcription failed: ${status.error}`);
    }
    // Wait before polling again
    await new Promise(resolve => setTimeout(resolve, 2000));
  }
}

// Transcribe in Spanish
const spanishAudio = await readFile("spanish.mp3");
const result = await client.speechToText.convert({
  modelId: "scribe_v1",
  file: new File([spanishAudio], "spanish.mp3"),
  languageCode: "es",
});
// Transcribe in French
const frenchAudio = await readFile("french.mp3");
const result2 = await client.speechToText.convert({
  modelId: "scribe_v1",
  file: new File([frenchAudio], "french.mp3"),
  languageCode: "fr",
});
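Stored transcripts can be fetched and removed with the transcripts API documented above. The following is a minimal sketch that matches the get and delete signatures; the transcript ID is a placeholder.

// Check a stored transcript, then delete it (the ID shown is a placeholder)
const transcriptId = "transcript-id";
const stored = await client.speechToText.transcripts.get(transcriptId);
console.log("Status:", stored.status);

// Remove the transcript once it is no longer needed
await client.speechToText.transcripts.delete(transcriptId);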