
# Speech-to-Text Transcription

## Transcription Method

### speechToText.convert()

```typescript { .api }
convert(
  request: {
    file: File | Blob;
    modelId: string; // e.g., "scribe_v2"
    languageCode?: string; // ISO 639-1
    diarize?: boolean; // Speaker diarization
    numSpeakers?: number; // Expected number of speakers
    timestampsGranularity?: "word" | "segment";
    tagAudioEvents?: boolean; // Tag events like laughter, music
    webhookUrl?: string; // Async webhook callback
    enableLogging?: boolean;
  },
  options?: RequestOptions
): Promise<SpeechToTextConvertResponse>

interface SpeechToTextConvertResponse {
  transcriptId: string;
  status: string; // "processing" | "completed" | "failed"
  text?: string;
  segments?: TranscriptSegment[];
  audioEvents?: AudioEvent[];
  language?: string;
  // Additional fields...
}

interface TranscriptSegment {
  text: string;
  startTime: number; // Seconds
  endTime: number;
  speakerId?: string; // If diarize enabled
  words?: Word[];
}

interface Word {
  word: string;
  startTime: number;
  endTime: number;
  confidence?: number;
}

interface AudioEvent {
  type: string; // e.g., "laughter", "music", "applause"
  startTime: number;
  endTime: number;
}
```

## Usage Examples

### Basic Transcription

```typescript
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
import fs from "fs";

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

const audioFile = fs.readFileSync("/path/audio.mp3");

const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  languageCode: "en"
});

console.log("Text:", transcript.text);
console.log("Status:", transcript.status);
```

### With Speaker Diarization

```typescript
const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  diarize: true,
  numSpeakers: 3 // Expected number of speakers
});

transcript.segments?.forEach(seg => {
  console.log(`Speaker ${seg.speakerId}: ${seg.text}`);
  console.log(` Time: ${seg.startTime}s - ${seg.endTime}s`);
});
```

### With Word-Level Timestamps

```typescript
const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  timestampsGranularity: "word"
});

transcript.segments?.forEach(seg => {
  seg.words?.forEach(word => {
    console.log(`"${word.word}": ${word.startTime}s - ${word.endTime}s`);
    console.log(` Confidence: ${word.confidence}`);
  });
});
```

### With Audio Event Tagging

```typescript
const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  tagAudioEvents: true
});

console.log("Transcript:", transcript.text);

transcript.audioEvents?.forEach(event => {
  console.log(`Event: ${event.type} at ${event.startTime}s - ${event.endTime}s`);
});
```

### Async with Webhook

```typescript
// Start transcription with webhook callback
const result = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  webhookUrl: "https://example.com/webhook",
  diarize: true,
  timestampsGranularity: "word"
});

console.log(`Transcript ID: ${result.transcriptId}`);
console.log(`Status: ${result.status}`);

// Webhook will receive result when complete
// {
//   "transcriptId": "...",
//   "status": "completed",
//   "text": "...",
//   "segments": [...]
// }
```

### Language Auto-Detection

```typescript
// Omit languageCode for auto-detection
const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2"
});

console.log("Detected language:", transcript.language);
console.log("Text:", transcript.text);
```

### Complete Example with All Features

```typescript
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
import fs from "fs";

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

const audioFile = fs.readFileSync("meeting.mp3");

const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  languageCode: "en",
  diarize: true,
  numSpeakers: 4,
  timestampsGranularity: "word",
  tagAudioEvents: true,
  enableLogging: true
});

console.log("Full transcript:", transcript.text);
console.log("Language:", transcript.language);
console.log("Status:", transcript.status);

// Process segments with speakers
transcript.segments?.forEach((seg, i) => {
  console.log(`\n[Segment ${i + 1}] Speaker ${seg.speakerId}`);
  console.log(`Time: ${seg.startTime}s - ${seg.endTime}s`);
  console.log(`Text: ${seg.text}`);

  // Word-level details
  seg.words?.forEach(word => {
    console.log(` "${word.word}": ${word.startTime}s`);
  });
});

// Audio events
console.log("\nAudio Events:");
transcript.audioEvents?.forEach(event => {
  console.log(`${event.type}: ${event.startTime}s - ${event.endTime}s`);
});

// Export as JSON
fs.writeFileSync("transcript.json", JSON.stringify(transcript, null, 2));
```

### Generate SRT Subtitles

```typescript

// Build an SRT subtitle document from transcript segments.
// Each cue is an index line, a "start --> end" timing line, the cue text,
// and a trailing blank line separating it from the next cue.
function generateSRT(segments: TranscriptSegment[]): string {
  return segments.map((seg, i) => {
    const start = formatSRTTime(seg.startTime);
    const end = formatSRTTime(seg.endTime);
    return `${i + 1}\n${start} --> ${end}\n${seg.text}\n`;
  }).join("\n");
}

// Convert seconds to the SRT timestamp format "HH:MM:SS,mmm".
// Round once at millisecond precision and derive every field from the
// total-millisecond count: flooring the fractional part directly (e.g.
// floor((1.006 % 1) * 1000)) loses a millisecond to binary float drift,
// because 1.006 is stored as 1.00599999..., yielding 5 instead of 6.
function formatSRTTime(seconds: number): string {
  const totalMs = Math.round(seconds * 1000);
  const ms = totalMs % 1000;
  const totalSecs = Math.floor(totalMs / 1000);
  const hours = Math.floor(totalSecs / 3600);
  const minutes = Math.floor((totalSecs % 3600) / 60);
  const secs = totalSecs % 60;
  return `${pad(hours)}:${pad(minutes)}:${pad(secs)},${pad(ms, 3)}`;
}

// Zero-pad a non-negative integer to the given width (default 2 digits).
function pad(num: number, size = 2): string {
  return String(num).padStart(size, "0");
}

const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  timestampsGranularity: "segment"
});

const srt = generateSRT(transcript.segments || []);
fs.writeFileSync("subtitles.srt", srt);
```

## Important Notes

- **Models**: Use "scribe_v2" or check available models
- **File formats**: MP3, WAV, M4A, FLAC, etc.
- **Language codes**: ISO 639-1 (e.g., "en", "es", "fr")
- **Diarization**: Requires `numSpeakers` for best results
- **Timestamps**: "word" or "segment" granularity
- **Audio events**: Tags laughter, music, applause, etc.
- **Webhook**: Async processing, callback on completion
- **Status**: "processing", "completed", "failed"
- **Logging**: `enableLogging: false` for zero retention (enterprise)
- **Language detection**: Auto-detect if `languageCode` omitted