# Audio Processing

Comprehensive audio capabilities including speech-to-text transcription, translation, and text-to-speech synthesis. The audio API provides high-quality processing for various audio formats and use cases.

## Capabilities

### Speech-to-Text Transcription

Convert audio files to text with high accuracy and support for multiple languages and formats.

```python { .api }
def transcribe(
    file: FileTypes,
    model: str,
    language: Optional[str] = NOT_GIVEN,
    prompt: Optional[str] = NOT_GIVEN,
    response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]] = NOT_GIVEN,
    temperature: Optional[float] = NOT_GIVEN,
    timestamp_granularities: Optional[List[Literal["word", "segment"]]] = NOT_GIVEN,
    extra_headers: Headers | None = None,
    extra_query: Query | None = None,
    extra_body: Body | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
) -> TranscriptionResponse:
    """
    Transcribe audio to text.

    Parameters:
    - file: Audio file to transcribe (various formats supported)
    - model: Model to use for transcription
    - language: Language of the input audio (ISO-639-1 format)
    - prompt: Optional text prompt to guide the model's style
    - response_format: Format of the transcript output
    - temperature: Sampling temperature between 0 and 1
    - timestamp_granularities: Timestamp granularities to populate

    Returns:
    TranscriptionResponse with transcribed text and optional metadata
    """
```

### Speech Translation

Translate audio from various languages to English text.

```python { .api }
def translate(
    file: FileTypes,
    model: str,
    prompt: Optional[str] = NOT_GIVEN,
    response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]] = NOT_GIVEN,
    temperature: Optional[float] = NOT_GIVEN,
    extra_headers: Headers | None = None,
    extra_query: Query | None = None,
    extra_body: Body | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
) -> TranslationResponse:
    """
    Translate audio to English text.

    Parameters:
    - file: Audio file to translate (various formats supported)
    - model: Model to use for translation
    - prompt: Optional text prompt to guide the model's style
    - response_format: Format of the transcript output
    - temperature: Sampling temperature between 0 and 1

    Returns:
    TranslationResponse with translated English text and optional metadata
    """
```

### Text-to-Speech Synthesis

Generate spoken audio from text input with various voice options.

```python { .api }
def speech(
    input: str,
    model: str,
    voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
    response_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]] = NOT_GIVEN,
    speed: Optional[float] = NOT_GIVEN,
    extra_headers: Headers | None = None,
    extra_query: Query | None = None,
    extra_body: Body | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
) -> bytes:
    """
    Generate audio from text.

    Parameters:
    - input: Text to convert to audio
    - model: Model to use for speech synthesis
    - voice: Voice to use for the generated audio
    - response_format: Audio format for the output
    - speed: Speed of the generated audio (0.25 to 4.0)

    Returns:
    Raw audio bytes in the specified format
    """
```

### Async Audio Operations

All audio operations have asynchronous counterparts with identical parameters.

```python { .api }
async def transcribe(file: FileTypes, model: str, **kwargs) -> TranscriptionResponse: ...
async def translate(file: FileTypes, model: str, **kwargs) -> TranslationResponse: ...
async def speech(input: str, model: str, voice: str, **kwargs) -> bytes: ...
```

## Usage Examples

### Audio Transcription

```python
from groq import Groq

client = Groq()

# Transcribe an audio file
with open("audio.mp3", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-large-v3",
        language="en",
        response_format="text"
    )

print("Transcript:", transcript)

# With detailed response format
with open("audio.wav", "rb") as audio_file:
    response = client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-large-v3",
        response_format="verbose_json",
        timestamp_granularities=["word", "segment"]
    )

print("Text:", response.text)
print("Language:", response.language)
for segment in response.segments:
    print(f"[{segment.start:.2f}s - {segment.end:.2f}s]: {segment.text}")
```

### Audio Translation

```python
from groq import Groq

client = Groq()

# Translate non-English audio to English
with open("spanish_audio.mp3", "rb") as audio_file:
    translation = client.audio.translations.create(
        file=audio_file,
        model="whisper-large-v3",
        response_format="text"
    )

print("English translation:", translation)

# With JSON response format
with open("french_audio.wav", "rb") as audio_file:
    response = client.audio.translations.create(
        file=audio_file,
        model="whisper-large-v3",
        response_format="json"
    )

print("Translated text:", response.text)
```

### Text-to-Speech

```python
from groq import Groq

client = Groq()

# Generate speech from text
response = client.audio.speech.create(
    input="Hello, this is a test of the text-to-speech functionality.",
    model="tts-1",
    voice="nova",
    response_format="mp3"
)

# Save the audio to a file
with open("output.mp3", "wb") as audio_file:
    audio_file.write(response)

# Different voice and format
response = client.audio.speech.create(
    input="This is a different voice and format example.",
    model="tts-1-hd",
    voice="alloy",
    response_format="wav",
    speed=1.2
)

with open("output.wav", "wb") as audio_file:
    audio_file.write(response)
```

### Using file_from_path Utility

```python
from groq import Groq, file_from_path

client = Groq()

# Use the utility function for file handling
audio_file = file_from_path("path/to/audio.mp3")
transcript = client.audio.transcriptions.create(
    file=audio_file,
    model="whisper-large-v3"
)

print(transcript)
```

### Async Usage

```python
import asyncio
from groq import AsyncGroq

async def main():
    client = AsyncGroq()

    # Async transcription
    with open("audio.mp3", "rb") as audio_file:
        transcript = await client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-large-v3",
            response_format="text"
        )

    print("Transcript:", transcript)

    # Async text-to-speech
    speech_response = await client.audio.speech.create(
        input="Async text-to-speech example",
        model="tts-1",
        voice="echo"
    )

    with open("async_output.mp3", "wb") as f:
        f.write(speech_response)

asyncio.run(main())
```

## Types

### File Types

```python { .api }
FileTypes = Union[IO[bytes], bytes, PathLike, str]
```

### Response Types

```python { .api }
class TranscriptionResponse:
    text: str

class TranslationResponse:
    text: str

# Verbose response format (when response_format="verbose_json")
class TranscriptionVerboseResponse:
    text: str
    language: str
    duration: float
    segments: List[TranscriptionSegment]
    words: Optional[List[TranscriptionWord]]

class TranscriptionSegment:
    id: int
    seek: int
    start: float
    end: float
    text: str
    tokens: List[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float

class TranscriptionWord:
    word: str
    start: float
    end: float
```

### Request Parameter Types

```python { .api }
class TranscriptionCreateParams:
    file: FileTypes
    model: str
    language: Optional[str]
    prompt: Optional[str]
    response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]]
    temperature: Optional[float]
    timestamp_granularities: Optional[List[Literal["word", "segment"]]]

class TranslationCreateParams:
    file: FileTypes
    model: str
    prompt: Optional[str]
    response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]]
    temperature: Optional[float]

class SpeechCreateParams:
    input: str
    model: str
    voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
    response_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]]
    speed: Optional[float]
```