# Speech-to-Text Transcription

## Transcription Method

### speechToText.convert()

```typescript { .api }
convert(
  request: {
    file: File | Blob;
    modelId: string; // e.g., "scribe_v2"
    languageCode?: string; // ISO 639-1
    diarize?: boolean; // Speaker diarization
    numSpeakers?: number; // Expected number of speakers
    timestampsGranularity?: "word" | "segment";
    tagAudioEvents?: boolean; // Tag events like laughter, music
    webhookUrl?: string; // Async webhook callback
    enableLogging?: boolean;
  },
  options?: RequestOptions
): Promise<SpeechToTextConvertResponse>

interface SpeechToTextConvertResponse {
  transcriptId: string;
  status: string; // "processing" | "completed" | "failed"
  text?: string;
  segments?: TranscriptSegment[];
  audioEvents?: AudioEvent[];
  language?: string;
  // Additional fields...
}

interface TranscriptSegment {
  text: string;
  startTime: number; // Seconds
  endTime: number;
  speakerId?: string; // If diarize enabled
  words?: Word[];
}

interface Word {
  word: string;
  startTime: number;
  endTime: number;
  confidence?: number;
}

interface AudioEvent {
  type: string; // e.g., "laughter", "music", "applause"
  startTime: number;
  endTime: number;
}
```

## Usage Examples

### Basic Transcription

```typescript
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
import fs from "fs";

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

const audioFile = fs.readFileSync("/path/audio.mp3");

const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  languageCode: "en"
});

console.log("Text:", transcript.text);
console.log("Status:", transcript.status);
```

### With Speaker Diarization

```typescript
const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  diarize: true,
  numSpeakers: 3 // Expected number of speakers
});

transcript.segments?.forEach(seg => {
  console.log(`Speaker ${seg.speakerId}: ${seg.text}`);
  console.log(`  Time: ${seg.startTime}s - ${seg.endTime}s`);
});
```

### With Word-Level Timestamps

```typescript
const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  timestampsGranularity: "word"
});

transcript.segments?.forEach(seg => {
  seg.words?.forEach(word => {
    console.log(`"${word.word}": ${word.startTime}s - ${word.endTime}s`);
    console.log(`  Confidence: ${word.confidence}`);
  });
});
```

### With Audio Event Tagging

```typescript
const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  tagAudioEvents: true
});

console.log("Transcript:", transcript.text);

transcript.audioEvents?.forEach(event => {
  console.log(`Event: ${event.type} at ${event.startTime}s - ${event.endTime}s`);
});
```

### Async with Webhook

```typescript
// Start transcription with webhook callback
const result = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  webhookUrl: "https://example.com/webhook",
  diarize: true,
  timestampsGranularity: "word"
});

console.log(`Transcript ID: ${result.transcriptId}`);
console.log(`Status: ${result.status}`);

// Webhook will receive result when complete
// {
//   "transcriptId": "...",
//   "status": "completed",
//   "text": "...",
//   "segments": [...]
// }
```

### Language Auto-Detection

```typescript
// Omit languageCode for auto-detection
const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2"
});

console.log("Detected language:", transcript.language);
console.log("Text:", transcript.text);
```

### Complete Example with All Features

```typescript
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
import fs from "fs";

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

const audioFile = fs.readFileSync("meeting.mp3");

const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  languageCode: "en",
  diarize: true,
  numSpeakers: 4,
  timestampsGranularity: "word",
  tagAudioEvents: true,
  enableLogging: true
});

console.log("Full transcript:", transcript.text);
console.log("Language:", transcript.language);
console.log("Status:", transcript.status);

// Process segments with speakers
transcript.segments?.forEach((seg, i) => {
  console.log(`\n[Segment ${i + 1}] Speaker ${seg.speakerId}`);
  console.log(`Time: ${seg.startTime}s - ${seg.endTime}s`);
  console.log(`Text: ${seg.text}`);

  // Word-level details
  seg.words?.forEach(word => {
    console.log(`  "${word.word}": ${word.startTime}s`);
  });
});

// Audio events
console.log("\nAudio Events:");
transcript.audioEvents?.forEach(event => {
  console.log(`${event.type}: ${event.startTime}s - ${event.endTime}s`);
});

// Export as JSON
fs.writeFileSync("transcript.json", JSON.stringify(transcript, null, 2));
```

### Generate SRT Subtitles

```typescript
function generateSRT(segments: TranscriptSegment[]): string {
  return segments.map((seg, i) => {
    const start = formatSRTTime(seg.startTime);
    const end = formatSRTTime(seg.endTime);
    return `${i + 1}\n${start} --> ${end}\n${seg.text}\n`;
  }).join("\n");
}

function formatSRTTime(seconds: number): string {
  const hours = Math.floor(seconds / 3600);
  const minutes = Math.floor((seconds % 3600) / 60);
  const secs = Math.floor(seconds % 60);
  const ms = Math.floor((seconds % 1) * 1000);
  return `${pad(hours)}:${pad(minutes)}:${pad(secs)},${pad(ms, 3)}`;
}

function pad(num: number, size = 2): string {
  return String(num).padStart(size, "0");
}

const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  timestampsGranularity: "segment"
});

const srt = generateSRT(transcript.segments || []);
fs.writeFileSync("subtitles.srt", srt);
```

## Important Notes

- **Models**: Use "scribe_v2" or check available models
- **File formats**: MP3, WAV, M4A, FLAC, etc.
- **Language codes**: ISO 639-1 (e.g., "en", "es", "fr")
- **Diarization**: Requires `numSpeakers` for best results
- **Timestamps**: "word" or "segment" granularity
- **Audio events**: Tags laughter, music, applause, etc.
- **Webhook**: Async processing, callback on completion
- **Status**: "processing", "completed", "failed"
- **Logging**: `enableLogging: false` for zero retention (enterprise)
- **Language detection**: Auto-detect if `languageCode` omitted