
# Speech-to-Text Transcription

## Transcription Method

### speechToText.convert()

```typescript { .api }
convert(
  request: {
    file: File | Blob;
    modelId: string; // e.g., "scribe_v2"
    languageCode?: string; // ISO 639-1
    diarize?: boolean; // Speaker diarization
    numSpeakers?: number; // Expected number of speakers
    timestampsGranularity?: "word" | "segment";
    tagAudioEvents?: boolean; // Tag events like laughter, music
    webhookUrl?: string; // Async webhook callback
    enableLogging?: boolean;
  },
  options?: RequestOptions
): Promise<SpeechToTextConvertResponse>

interface SpeechToTextConvertResponse {
  transcriptId: string;
  status: string; // "processing" | "completed" | "failed"
  text?: string;
  segments?: TranscriptSegment[];
  audioEvents?: AudioEvent[];
  language?: string;
  // Additional fields...
}

interface TranscriptSegment {
  text: string;
  startTime: number; // Seconds
  endTime: number;
  speakerId?: string; // If diarize enabled
  words?: Word[];
}

interface Word {
  word: string;
  startTime: number;
  endTime: number;
  confidence?: number;
}

interface AudioEvent {
  type: string; // e.g., "laughter", "music", "applause"
  startTime: number;
  endTime: number;
}
```

## Usage Examples

### Basic Transcription

```typescript
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
import fs from "fs";

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

const audioFile = fs.readFileSync("/path/audio.mp3");

const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  languageCode: "en"
});

console.log("Text:", transcript.text);
console.log("Status:", transcript.status);
```

### With Speaker Diarization

```typescript
const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  diarize: true,
  numSpeakers: 3 // Expected number of speakers
});

transcript.segments?.forEach(seg => {
  console.log(`Speaker ${seg.speakerId}: ${seg.text}`);
  console.log(` Time: ${seg.startTime}s - ${seg.endTime}s`);
});
```

### With Word-Level Timestamps

```typescript
const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  timestampsGranularity: "word"
});

transcript.segments?.forEach(seg => {
  seg.words?.forEach(word => {
    console.log(`"${word.word}": ${word.startTime}s - ${word.endTime}s`);
    console.log(` Confidence: ${word.confidence}`);
  });
});
```

### With Audio Event Tagging

```typescript
const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  tagAudioEvents: true
});

console.log("Transcript:", transcript.text);

transcript.audioEvents?.forEach(event => {
  console.log(`Event: ${event.type} at ${event.startTime}s - ${event.endTime}s`);
});
```

### Async with Webhook

```typescript
// Start transcription with webhook callback
const result = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  webhookUrl: "https://example.com/webhook",
  diarize: true,
  timestampsGranularity: "word"
});

console.log(`Transcript ID: ${result.transcriptId}`);
console.log(`Status: ${result.status}`);

// Webhook will receive result when complete
// {
//   "transcriptId": "...",
//   "status": "completed",
//   "text": "...",
//   "segments": [...]
// }
```

### Language Auto-Detection

```typescript
// Omit languageCode for auto-detection
const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2"
});

console.log("Detected language:", transcript.language);
console.log("Text:", transcript.text);
```

### Complete Example with All Features

```typescript
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
import fs from "fs";

const client = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });

const audioFile = fs.readFileSync("meeting.mp3");

const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  languageCode: "en",
  diarize: true,
  numSpeakers: 4,
  timestampsGranularity: "word",
  tagAudioEvents: true,
  enableLogging: true
});

console.log("Full transcript:", transcript.text);
console.log("Language:", transcript.language);
console.log("Status:", transcript.status);

// Process segments with speakers
transcript.segments?.forEach((seg, i) => {
  console.log(`\n[Segment ${i + 1}] Speaker ${seg.speakerId}`);
  console.log(`Time: ${seg.startTime}s - ${seg.endTime}s`);
  console.log(`Text: ${seg.text}`);

  // Word-level details
  seg.words?.forEach(word => {
    console.log(` "${word.word}": ${word.startTime}s`);
  });
});

// Audio events
console.log("\nAudio Events:");
transcript.audioEvents?.forEach(event => {
  console.log(`${event.type}: ${event.startTime}s - ${event.endTime}s`);
});

// Export as JSON
fs.writeFileSync("transcript.json", JSON.stringify(transcript, null, 2));
```

### Generate SRT Subtitles

```typescript

// Build an SRT subtitle document from transcript segments.
// Each cue is an index line, a "start --> end" timing line, the cue text,
// and a trailing blank line separating it from the next cue.
function generateSRT(segments: TranscriptSegment[]): string {
  return segments.map((seg, i) => {
    const start = formatSRTTime(seg.startTime);
    const end = formatSRTTime(seg.endTime);
    return `${i + 1}\n${start} --> ${end}\n${seg.text}\n`;
  }).join("\n");
}

// Convert seconds to the SRT timestamp format "HH:MM:SS,mmm".
// Round once at millisecond precision and derive every field from the
// total-millisecond count: flooring the fractional part directly (e.g.
// floor((1.006 % 1) * 1000)) loses a millisecond to binary float drift,
// because 1.006 is stored as 1.00599999..., yielding 5 instead of 6.
function formatSRTTime(seconds: number): string {
  const totalMs = Math.round(seconds * 1000);
  const ms = totalMs % 1000;
  const totalSecs = Math.floor(totalMs / 1000);
  const hours = Math.floor(totalSecs / 3600);
  const minutes = Math.floor((totalSecs % 3600) / 60);
  const secs = totalSecs % 60;
  return `${pad(hours)}:${pad(minutes)}:${pad(secs)},${pad(ms, 3)}`;
}

// Zero-pad a non-negative integer to the given width (default 2 digits).
function pad(num: number, size = 2): string {
  return String(num).padStart(size, "0");
}

const transcript = await client.speechToText.convert({
  file: audioFile,
  modelId: "scribe_v2",
  timestampsGranularity: "segment"
});

const srt = generateSRT(transcript.segments || []);
fs.writeFileSync("subtitles.srt", srt);
```

## Important Notes

- **Models**: Use "scribe_v2" or check available models
- **File formats**: MP3, WAV, M4A, FLAC, etc.
- **Language codes**: ISO 639-1 (e.g., "en", "es", "fr")
- **Diarization**: Requires `numSpeakers` for best results
- **Timestamps**: "word" or "segment" granularity
- **Audio events**: Tags laughter, music, applause, etc.
- **Webhook**: Async processing, callback on completion
- **Status**: "processing", "completed", "failed"
- **Logging**: `enableLogging: false` for zero retention (enterprise)
- **Language detection**: Auto-detect if `languageCode` omitted