
# Audio Processing

Comprehensive audio capabilities including speech-to-text transcription, translation, and text-to-speech synthesis. The audio API provides high-quality processing for various audio formats and use cases.

## Capabilities

### Speech-to-Text Transcription

Convert audio files to text with high accuracy and support for multiple languages and formats.

```python { .api }
def transcribe(
    file: FileTypes,
    model: str,
    language: Optional[str] = NOT_GIVEN,
    prompt: Optional[str] = NOT_GIVEN,
    response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]] = NOT_GIVEN,
    temperature: Optional[float] = NOT_GIVEN,
    timestamp_granularities: Optional[List[Literal["word", "segment"]]] = NOT_GIVEN,
    extra_headers: Headers | None = None,
    extra_query: Query | None = None,
    extra_body: Body | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
) -> TranscriptionResponse:
    """
    Transcribe audio to text.

    Parameters:
    - file: Audio file to transcribe (various formats supported)
    - model: Model to use for transcription
    - language: Language of the input audio (ISO-639-1 format)
    - prompt: Optional text prompt to guide the model's style
    - response_format: Format of the transcript output
    - temperature: Sampling temperature between 0 and 1
    - timestamp_granularities: Timestamp granularities to populate

    Returns:
        TranscriptionResponse with transcribed text and optional metadata
    """
```

### Speech Translation

Translate audio from various languages to English text.

```python { .api }
def translate(
    file: FileTypes,
    model: str,
    prompt: Optional[str] = NOT_GIVEN,
    response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]] = NOT_GIVEN,
    temperature: Optional[float] = NOT_GIVEN,
    extra_headers: Headers | None = None,
    extra_query: Query | None = None,
    extra_body: Body | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
) -> TranslationResponse:
    """
    Translate audio to English text.

    Parameters:
    - file: Audio file to translate (various formats supported)
    - model: Model to use for translation
    - prompt: Optional text prompt to guide the model's style
    - response_format: Format of the transcript output
    - temperature: Sampling temperature between 0 and 1

    Returns:
        TranslationResponse with translated English text and optional metadata
    """
```

### Text-to-Speech Synthesis

Generate spoken audio from text input with various voice options.

```python { .api }
def speech(
    input: str,
    model: str,
    voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
    response_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]] = NOT_GIVEN,
    speed: Optional[float] = NOT_GIVEN,
    extra_headers: Headers | None = None,
    extra_query: Query | None = None,
    extra_body: Body | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
) -> bytes:
    """
    Generate audio from text.

    Parameters:
    - input: Text to convert to audio
    - model: Model to use for speech synthesis
    - voice: Voice to use for the generated audio
    - response_format: Audio format for the output
    - speed: Speed of the generated audio (0.25 to 4.0)

    Returns:
        Raw audio bytes in the specified format
    """
```

### Async Audio Operations

All audio operations have asynchronous counterparts with identical parameters.

```python { .api }
async def transcribe(file: FileTypes, model: str, **kwargs) -> TranscriptionResponse: ...
async def translate(file: FileTypes, model: str, **kwargs) -> TranslationResponse: ...
async def speech(input: str, model: str, voice: str, **kwargs) -> bytes: ...
```

## Usage Examples

### Audio Transcription

```python
from groq import Groq

client = Groq()

# Transcribe an audio file
with open("audio.mp3", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-large-v3",
        language="en",
        response_format="text"
    )

print("Transcript:", transcript)

# With detailed response format
with open("audio.wav", "rb") as audio_file:
    response = client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-large-v3",
        response_format="verbose_json",
        timestamp_granularities=["word", "segment"]
    )

print("Text:", response.text)
print("Language:", response.language)
for segment in response.segments:
    print(f"[{segment.start:.2f}s - {segment.end:.2f}s]: {segment.text}")
```

### Audio Translation

```python
from groq import Groq

client = Groq()

# Translate non-English audio to English
with open("spanish_audio.mp3", "rb") as audio_file:
    translation = client.audio.translations.create(
        file=audio_file,
        model="whisper-large-v3",
        response_format="text"
    )

print("English translation:", translation)

# With JSON response format
with open("french_audio.wav", "rb") as audio_file:
    response = client.audio.translations.create(
        file=audio_file,
        model="whisper-large-v3",
        response_format="json"
    )

print("Translated text:", response.text)
```

### Text-to-Speech

```python
from groq import Groq

client = Groq()

# Generate speech from text
response = client.audio.speech.create(
    input="Hello, this is a test of the text-to-speech functionality.",
    model="tts-1",
    voice="nova",
    response_format="mp3"
)

# Save the audio to a file
with open("output.mp3", "wb") as audio_file:
    audio_file.write(response)

# Different voice and format
response = client.audio.speech.create(
    input="This is a different voice and format example.",
    model="tts-1-hd",
    voice="alloy",
    response_format="wav",
    speed=1.2
)

with open("output.wav", "wb") as audio_file:
    audio_file.write(response)
```

### Using file_from_path Utility

```python
from groq import Groq, file_from_path

client = Groq()

# Use the utility function for file handling
audio_file = file_from_path("path/to/audio.mp3")
transcript = client.audio.transcriptions.create(
    file=audio_file,
    model="whisper-large-v3"
)

print(transcript)
```

### Async Usage

```python
import asyncio
from groq import AsyncGroq

async def main():
    client = AsyncGroq()

    # Async transcription
    with open("audio.mp3", "rb") as audio_file:
        transcript = await client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-large-v3",
            response_format="text"
        )

    print("Transcript:", transcript)

    # Async text-to-speech
    speech_response = await client.audio.speech.create(
        input="Async text-to-speech example",
        model="tts-1",
        voice="echo"
    )

    with open("async_output.mp3", "wb") as f:
        f.write(speech_response)

asyncio.run(main())
```

## Types

### File Types

```python { .api }
FileTypes = Union[IO[bytes], bytes, PathLike, str]
```

### Response Types

```python { .api }
class TranscriptionResponse:
    text: str

class TranslationResponse:
    text: str

# Verbose response format (when response_format="verbose_json")
class TranscriptionVerboseResponse:
    text: str
    language: str
    duration: float
    segments: List[TranscriptionSegment]
    words: Optional[List[TranscriptionWord]]

class TranscriptionSegment:
    id: int
    seek: int
    start: float
    end: float
    text: str
    tokens: List[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float

class TranscriptionWord:
    word: str
    start: float
    end: float
```

### Request Parameter Types

```python { .api }
class TranscriptionCreateParams:
    file: FileTypes
    model: str
    language: Optional[str]
    prompt: Optional[str]
    response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]]
    temperature: Optional[float]
    timestamp_granularities: Optional[List[Literal["word", "segment"]]]

class TranslationCreateParams:
    file: FileTypes
    model: str
    prompt: Optional[str]
    response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]]
    temperature: Optional[float]

class SpeechCreateParams:
    input: str
    model: str
    voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
    response_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]]
    speed: Optional[float]
```