The official Python library for the Groq API.
—
Comprehensive audio capabilities including speech-to-text transcription, translation, and text-to-speech synthesis. The audio API provides high-quality processing for various audio formats and use cases.
Convert audio files to text with high accuracy and support for multiple languages and formats.
def transcribe(
file: FileTypes,
model: str,
language: Optional[str] = NOT_GIVEN,
prompt: Optional[str] = NOT_GIVEN,
response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]] = NOT_GIVEN,
temperature: Optional[float] = NOT_GIVEN,
timestamp_granularities: Optional[List[Literal["word", "segment"]]] = NOT_GIVEN,
extra_headers: Headers | None = None,
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
) -> TranscriptionResponse:
"""
Transcribe audio to text.
Parameters:
- file: Audio file to transcribe (various formats supported)
- model: Model to use for transcription
- language: Language of the input audio (ISO-639-1 format)
- prompt: Optional text prompt to guide the model's style
- response_format: Format of the transcript output
- temperature: Sampling temperature between 0 and 1
- timestamp_granularities: Timestamp granularities to populate
Returns:
TranscriptionResponse with transcribed text and optional metadata
"""Translate audio from various languages to English text.
def translate(
file: FileTypes,
model: str,
prompt: Optional[str] = NOT_GIVEN,
response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]] = NOT_GIVEN,
temperature: Optional[float] = NOT_GIVEN,
extra_headers: Headers | None = None,
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
) -> TranslationResponse:
"""
Translate audio to English text.
Parameters:
- file: Audio file to translate (various formats supported)
- model: Model to use for translation
- prompt: Optional text prompt to guide the model's style
- response_format: Format of the transcript output
- temperature: Sampling temperature between 0 and 1
Returns:
TranslationResponse with translated English text and optional metadata
"""Generate spoken audio from text input with various voice options.
def speech(
input: str,
model: str,
voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
response_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]] = NOT_GIVEN,
speed: Optional[float] = NOT_GIVEN,
extra_headers: Headers | None = None,
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
) -> bytes:
"""
Generate audio from text.
Parameters:
- input: Text to convert to audio
- model: Model to use for speech synthesis
- voice: Voice to use for the generated audio
- response_format: Audio format for the output
- speed: Speed of the generated audio (0.25 to 4.0)
Returns:
Raw audio bytes in the specified format
"""All audio operations have asynchronous counterparts with identical parameters.
# Async variants: same signatures as the sync methods above; awaitable results.
async def transcribe(file: FileTypes, model: str, **kwargs) -> TranscriptionResponse: ...
async def translate(file: FileTypes, model: str, **kwargs) -> TranslationResponse: ...
async def speech(input: str, model: str, voice: str, **kwargs) -> bytes: ...

from groq import Groq
client = Groq()
# Transcribe an audio file
with open("audio.mp3", "rb") as audio_file:
transcript = client.audio.transcriptions.create(
file=audio_file,
model="whisper-large-v3",
language="en",
response_format="text"
)
print("Transcript:", transcript)
# With detailed response format
with open("audio.wav", "rb") as audio_file:
response = client.audio.transcriptions.create(
file=audio_file,
model="whisper-large-v3",
response_format="verbose_json",
timestamp_granularities=["word", "segment"]
)
print("Text:", response.text)
print("Language:", response.language)
for segment in response.segments:
print(f"[{segment.start:.2f}s - {segment.end:.2f}s]: {segment.text}")from groq import Groq
client = Groq()

# Translate non-English audio to English
with open("spanish_audio.mp3", "rb") as audio_file:
    translation = client.audio.translations.create(
        file=audio_file,
        model="whisper-large-v3",
        response_format="text",
    )
print("English translation:", translation)

# With JSON response format (translated text is on response.text)
with open("french_audio.wav", "rb") as audio_file:
    response = client.audio.translations.create(
        file=audio_file,
        model="whisper-large-v3",
        response_format="json",
    )
print("Translated text:", response.text)from groq import Groq
client = Groq()
# Generate speech from text
response = client.audio.speech.create(
input="Hello, this is a test of the text-to-speech functionality.",
model="tts-1",
voice="nova",
response_format="mp3"
)
# Save the audio to a file
with open("output.mp3", "wb") as audio_file:
audio_file.write(response)
# Different voice and format
response = client.audio.speech.create(
input="This is a different voice and format example.",
model="tts-1-hd",
voice="alloy",
response_format="wav",
speed=1.2
)
with open("output.wav", "wb") as audio_file:
    audio_file.write(response)

from groq import Groq, file_from_path
client = Groq()

# Use the utility function for file handling instead of open()/close()
audio_file = file_from_path("path/to/audio.mp3")
transcript = client.audio.transcriptions.create(
    file=audio_file,
    model="whisper-large-v3",
)
print(transcript)

import asyncio
from groq import AsyncGroq


async def main():
    client = AsyncGroq()

    # Async transcription — awaits the API call while the file is open
    with open("audio.mp3", "rb") as audio_file:
        transcript = await client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-large-v3",
            response_format="text",
        )
    print("Transcript:", transcript)

    # Async text-to-speech — result is raw audio bytes
    speech_response = await client.audio.speech.create(
        input="Async text-to-speech example",
        model="tts-1",
        voice="echo",
    )
    with open("async_output.mp3", "wb") as f:
        f.write(speech_response)
asyncio.run(main())

FileTypes = Union[IO[bytes], bytes, PathLike, str]

class TranscriptionResponse:
text: str
class TranslationResponse:
text: str
# Verbose response format (when response_format="verbose_json")
class TranscriptionVerboseResponse:
text: str
language: str
duration: float
segments: List[TranscriptionSegment]
words: Optional[List[TranscriptionWord]]
class TranscriptionSegment:
id: int
seek: int
start: float
end: float
text: str
tokens: List[int]
temperature: float
avg_logprob: float
compression_ratio: float
no_speech_prob: float
class TranscriptionWord:
word: str
start: float
    end: float


class TranscriptionCreateParams:
file: FileTypes
model: str
language: Optional[str]
prompt: Optional[str]
response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]]
temperature: Optional[float]
timestamp_granularities: Optional[List[Literal["word", "segment"]]]
class TranslationCreateParams:
file: FileTypes
model: str
prompt: Optional[str]
response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]]
temperature: Optional[float]
class SpeechCreateParams:
input: str
model: str
voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
response_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]]
    speed: Optional[float]

Install with the Tessl CLI:

    npx tessl i tessl/pypi-groq