The official Python library for the Groq API.
—
Comprehensive audio capabilities including speech-to-text transcription, translation, and text-to-speech synthesis. The audio API provides high-quality processing for various audio formats and use cases.
Convert audio files to text with high accuracy and support for multiple languages and formats.
def transcribe(
file: FileTypes,
model: str,
language: Optional[str] = NOT_GIVEN,
prompt: Optional[str] = NOT_GIVEN,
response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]] = NOT_GIVEN,
temperature: Optional[float] = NOT_GIVEN,
timestamp_granularities: Optional[List[Literal["word", "segment"]]] = NOT_GIVEN,
extra_headers: Headers | None = None,
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
) -> TranscriptionResponse:
"""
Transcribe audio to text.
Parameters:
- file: Audio file to transcribe (various formats supported)
- model: Model to use for transcription
- language: Language of the input audio (ISO-639-1 format)
- prompt: Optional text prompt to guide the model's style
- response_format: Format of the transcript output
- temperature: Sampling temperature between 0 and 1
- timestamp_granularities: Timestamp granularities to populate
Returns:
TranscriptionResponse with transcribed text and optional metadata
"""Translate audio from various languages to English text.
def translate(
file: FileTypes,
model: str,
prompt: Optional[str] = NOT_GIVEN,
response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]] = NOT_GIVEN,
temperature: Optional[float] = NOT_GIVEN,
extra_headers: Headers | None = None,
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
) -> TranslationResponse:
"""
Translate audio to English text.
Parameters:
- file: Audio file to translate (various formats supported)
- model: Model to use for translation
- prompt: Optional text prompt to guide the model's style
- response_format: Format of the transcript output
- temperature: Sampling temperature between 0 and 1
Returns:
TranslationResponse with translated English text and optional metadata
"""Generate spoken audio from text input with various voice options.
def speech(
input: str,
model: str,
voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
response_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]] = NOT_GIVEN,
speed: Optional[float] = NOT_GIVEN,
extra_headers: Headers | None = None,
extra_query: Query | None = None,
extra_body: Body | None = None,
timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
) -> bytes:
"""
Generate audio from text.
Parameters:
- input: Text to convert to audio
- model: Model to use for speech synthesis
- voice: Voice to use for the generated audio
- response_format: Audio format for the output
- speed: Speed of the generated audio (0.25 to 4.0)
Returns:
Raw audio bytes in the specified format
"""All audio operations have asynchronous counterparts with identical parameters.
# Async variants: same signatures as the sync methods above; awaitable results.
async def transcribe(file: FileTypes, model: str, **kwargs) -> TranscriptionResponse: ...
async def translate(file: FileTypes, model: str, **kwargs) -> TranslationResponse: ...
async def speech(input: str, model: str, voice: str, **kwargs) -> bytes: ...

from groq import Groq
client = Groq()
# Transcribe an audio file
with open("audio.mp3", "rb") as audio_file:
transcript = client.audio.transcriptions.create(
file=audio_file,
model="whisper-large-v3",
language="en",
response_format="text"
)
print("Transcript:", transcript)
# With detailed response format
with open("audio.wav", "rb") as audio_file:
response = client.audio.transcriptions.create(
file=audio_file,
model="whisper-large-v3",
response_format="verbose_json",
timestamp_granularities=["word", "segment"]
)
print("Text:", response.text)
print("Language:", response.language)
for segment in response.segments:
print(f"[{segment.start:.2f}s - {segment.end:.2f}s]: {segment.text}")from groq import Groq
client = Groq()

# Translate non-English audio to English
with open("spanish_audio.mp3", "rb") as audio_file:
    translation = client.audio.translations.create(
        file=audio_file,
        model="whisper-large-v3",
        response_format="text",
    )
print("English translation:", translation)

# With JSON response format (translated text is on response.text)
with open("french_audio.wav", "rb") as audio_file:
    response = client.audio.translations.create(
        file=audio_file,
        model="whisper-large-v3",
        response_format="json",
    )
print("Translated text:", response.text)from groq import Groq
client = Groq()
# Generate speech from text
response = client.audio.speech.create(
input="Hello, this is a test of the text-to-speech functionality.",
model="tts-1",
voice="nova",
response_format="mp3"
)
# Save the audio to a file
with open("output.mp3", "wb") as audio_file:
audio_file.write(response)
# Different voice and format
response = client.audio.speech.create(
input="This is a different voice and format example.",
model="tts-1-hd",
voice="alloy",
response_format="wav",
speed=1.2
)
with open("output.wav", "wb") as audio_file:
    audio_file.write(response)

from groq import Groq, file_from_path
client = Groq()

# Use the utility function for file handling instead of open()/close()
audio_file = file_from_path("path/to/audio.mp3")
transcript = client.audio.transcriptions.create(
    file=audio_file,
    model="whisper-large-v3",
)
print(transcript)

import asyncio
from groq import AsyncGroq


async def main():
    client = AsyncGroq()

    # Async transcription — awaits the API call while the file is open
    with open("audio.mp3", "rb") as audio_file:
        transcript = await client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-large-v3",
            response_format="text",
        )
    print("Transcript:", transcript)

    # Async text-to-speech — result is raw audio bytes
    speech_response = await client.audio.speech.create(
        input="Async text-to-speech example",
        model="tts-1",
        voice="echo",
    )
    with open("async_output.mp3", "wb") as f:
        f.write(speech_response)
asyncio.run(main())

FileTypes = Union[IO[bytes], bytes, PathLike, str]

class TranscriptionResponse:
text: str
class TranslationResponse:
text: str
# Verbose response format (when response_format="verbose_json")
class TranscriptionVerboseResponse:
text: str
language: str
duration: float
segments: List[TranscriptionSegment]
words: Optional[List[TranscriptionWord]]
class TranscriptionSegment:
id: int
seek: int
start: float
end: float
text: str
tokens: List[int]
temperature: float
avg_logprob: float
compression_ratio: float
no_speech_prob: float
class TranscriptionWord:
word: str
start: float
    end: float


class TranscriptionCreateParams:
file: FileTypes
model: str
language: Optional[str]
prompt: Optional[str]
response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]]
temperature: Optional[float]
timestamp_granularities: Optional[List[Literal["word", "segment"]]]
class TranslationCreateParams:
file: FileTypes
model: str
prompt: Optional[str]
response_format: Optional[Literal["json", "text", "srt", "verbose_json", "vtt"]]
temperature: Optional[float]
class SpeechCreateParams:
input: str
model: str
voice: Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
response_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]]
    speed: Optional[float]

Install with the Tessl CLI:

    npx tessl i tessl/pypi-groq