Python client for Together's Cloud Platform providing comprehensive AI model APIs
Speech synthesis, transcription, and translation capabilities supporting multiple languages and audio formats. Use state-of-the-art models to convert between speech and text across a wide range of languages.
Generate natural-sounding speech from text input with various voice options.
def create(
    model: str,
    input: str,
    voice: str,
    response_format: Optional[str] = None,
    speed: Optional[float] = None,
    **kwargs
) -> bytes:
    """
    Generate speech from text.

    Args:
        model: Speech synthesis model identifier
        input: Text to convert to speech
        voice: Voice identifier for synthesis
        response_format: Audio format (mp3, wav, flac, etc.)
        speed: Speech speed (0.25 to 4.0)

    Returns:
        Audio data as bytes (raw encoded audio in the requested format)
"""Convert spoken audio to text with language detection and formatting options.
def create(
    file: str,
    model: str,
    language: Optional[str] = None,
    prompt: Optional[str] = None,
    response_format: Optional[str] = None,
    temperature: Optional[float] = None,
    timestamp_granularities: Optional[List[str]] = None,
    **kwargs
) -> AudioTranscriptionResponse:
    """
    Transcribe audio to text.

    Args:
        file: Path to audio file to transcribe
        model: Transcription model identifier
        language: Source language code (ISO-639-1)
        prompt: Optional prompt to guide transcription
        response_format: Response format (json, text, srt, verbose_json, vtt)
        temperature: Sampling temperature
        timestamp_granularities: Timestamp precision levels ("word", "segment")

    Returns:
        AudioTranscriptionResponse with transcribed text
"""Translate audio from various languages to English text.
def create(
    file: str,
    model: str,
    prompt: Optional[str] = None,
    response_format: Optional[str] = None,
    temperature: Optional[float] = None,
    **kwargs
) -> AudioTranslationResponse:
    """
    Translate audio to English text.

    Args:
        file: Path to audio file to translate
        model: Translation model identifier
        prompt: Optional prompt to guide translation
        response_format: Response format (json, text, verbose_json)
        temperature: Sampling temperature

    Returns:
        AudioTranslationResponse with translated text
"""All audio operations support asynchronous execution.
# Async variants mirror the synchronous APIs above:
# speech synthesis, transcription, and translation respectively.
async def create(model: str, input: str, voice: str, **kwargs) -> bytes: ...
async def create(file: str, model: str, **kwargs) -> AudioTranscriptionResponse: ...
async def create(file: str, model: str, **kwargs) -> AudioTranslationResponse: ...

# Usage examples
from together import Together
client = Together()

# --- Text-to-speech: synthesize audio and persist it to disk ---
speech_bytes = client.audio.speech.create(
    model="together-ai/speech-v1",
    input="Hello, this is a test of the speech synthesis system.",
    voice="alloy",
    response_format="mp3",
    speed=1.0,
)

# The API returns raw MP3 bytes; write them out as a playable file.
with open("generated_speech.mp3", "wb") as out_file:
    out_file.write(speech_bytes)
print("Speech generated and saved to generated_speech.mp3")

# Transcribe audio file to text
# --- Speech-to-text: verbose transcription with word/segment timestamps ---
transcript = client.audio.transcriptions.create(
    file="recorded_speech.mp3",
    model="whisper-large-v3",
    language="en",
    response_format="verbose_json",
    timestamp_granularities=["word", "segment"],
)

print(f"Transcribed text: {transcript.text}")
print(f"Language detected: {transcript.language}")
print(f"Duration: {transcript.duration} seconds")

# Word-level timing is only present on verbose responses, so guard for it.
if hasattr(transcript, "words"):
    print("Word-level timestamps:")
    for word in transcript.words[:10]:  # show the first 10 words
        print(f" {word.word}: {word.start:.2f}s - {word.end:.2f}s")

# Translate Spanish audio to English text
# --- Audio translation: non-English speech in, English text out ---
translated = client.audio.translations.create(
    file="spanish_audio.mp3",
    model="whisper-large-v3",
    response_format="verbose_json",
)

print(f"Original language detected: {translated.language}")
print(f"English translation: {translated.text}")
print(f"Translation duration: {translated.duration} seconds")

import os
def process_audio_files(client: Together, audio_dir: str, model: str):
    """Transcribe every supported audio file in a directory.

    Args:
        client: Together API client used for the transcription calls.
        audio_dir: Directory scanned (non-recursively) for audio files.
        model: Transcription model identifier (e.g. "whisper-large-v3").

    Returns:
        A list of per-file result dicts. Successful entries carry
        'file', 'text', 'language' and 'status' == 'success'; failed
        entries carry 'file', 'error' and 'status' == 'failed'.
    """
    supported_exts = ('.mp3', '.wav', '.m4a')
    results = []
    # Compare the lower-cased extension so files like "TRACK.MP3" are not
    # silently skipped; sorted() makes the processing order (and therefore
    # the result list) deterministic instead of filesystem-dependent.
    audio_files = sorted(
        f for f in os.listdir(audio_dir)
        if os.path.splitext(f)[1].lower() in supported_exts
    )
    for audio_file in audio_files:
        file_path = os.path.join(audio_dir, audio_file)
        try:
            response = client.audio.transcriptions.create(
                file=file_path,
                model=model,
                response_format="json"
            )
            results.append({
                'file': audio_file,
                'text': response.text,
                'language': getattr(response, 'language', 'unknown'),
                'status': 'success'
            })
            print(f"✅ Processed: {audio_file}")
        except Exception as e:
            # Best-effort batch processing: record the failure and continue
            # with the remaining files rather than aborting the whole run.
            results.append({
                'file': audio_file,
                'error': str(e),
                'status': 'failed'
            })
            print(f"❌ Failed: {audio_file} - {e}")
    return results
# Batch-transcribe a directory and persist the outcome as JSON.
results = process_audio_files(client, "./audio_files", "whisper-large-v3")

import json
with open("transcription_results.json", "w") as results_file:
    json.dump(results, results_file, indent=2)

def stream_speech(client: Together, text: str, voice: str = "alloy"):
    """Synthesize speech chunk-by-chunk and save the combined audio.

    Splits the text into fixed-size pieces, synthesizes each piece
    separately, then concatenates the audio and writes it to
    "streamed_speech.mp3". Returns the combined audio bytes.
    """
    step = 200
    # Fixed-size slices keep each synthesis request small.
    pieces = [text[start:start + step] for start in range(0, len(text), step)]
    generated = []
    for index, piece in enumerate(pieces):
        audio_data = client.audio.speech.create(
            model="together-ai/speech-v1",
            input=piece,
            voice=voice,
            response_format="mp3",
            speed=1.0,
        )
        generated.append(audio_data)
        print(f"Generated chunk {index + 1}/{len(pieces)}")
    combined_audio = b''.join(generated)
    with open("streamed_speech.mp3", "wb") as f:
        f.write(combined_audio)
    return combined_audio
# Generate speech in chunks
long_text = """
This is a long text that will be converted to speech in multiple chunks.
The streaming approach allows for better memory management and faster
perceived response times when processing large amounts of text.
"""
stream_speech(client, long_text, voice="nova")

def detect_and_process_audio(client: Together, audio_file: str):
    """Transcribe audio, then translate to English if it isn't English.

    Returns a dict with 'original_text', 'translated_text' and
    'language' (the detected source language).
    """
    # Transcribe first: the verbose response carries the detected language.
    transcription = client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-large-v3",
        response_format="verbose_json"
    )
    detected_language = transcription.language
    print(f"Detected language: {detected_language}")

    # NOTE(review): this comparison assumes the API reports an ISO code
    # like "en" rather than a full language name — confirm with the API.
    if detected_language != "en":
        # Non-English source: run a second pass through the translation API.
        translation = client.audio.translations.create(
            file=audio_file,
            model="whisper-large-v3",
            response_format="json"
        )
        english_text = translation.text
    else:
        # Already English — the transcription doubles as the translation.
        english_text = transcription.text
    return {
        'original_text': transcription.text,
        'translated_text': english_text,
        'language': detected_language
    }
# Process multilingual audio
result = detect_and_process_audio(client, "multilingual_audio.mp3")
print(f"Original ({result['language']}): {result['original_text'][:100]}...")
print(f"English: {result['translated_text'][:100]}...")

# Request payload for speech synthesis (text-to-speech).
class AudioSpeechRequest:
    model: str                             # speech synthesis model identifier
    input: str                             # text to convert to speech
    voice: str                             # voice identifier
    response_format: Optional[str] = None  # audio format; see AudioResponseFormat
    speed: Optional[float] = None          # speech speed (0.25 to 4.0)

# Accepted audio output format names.
class AudioResponseFormat:
    MP3 = "mp3"
    OPUS = "opus"
    AAC = "aac"
    FLAC = "flac"
    WAV = "wav"
    PCM = "pcm"

# Accepted audio response encodings.
class AudioResponseEncoding:
    MP3 = "mp3"
    OPUS = "opus"
    AAC = "aac"
    FLAC = "flac"

# Request payload for audio transcription (speech-to-text).
class AudioTranscriptionRequest:
    file: str                              # path to the audio file to transcribe
    model: str                             # transcription model identifier
    language: Optional[str] = None         # source language code (ISO-639-1)
    prompt: Optional[str] = None           # optional prompt to guide transcription
    response_format: Optional[str] = None  # json / text / srt / verbose_json / vtt
    temperature: Optional[float] = None    # sampling temperature
    timestamp_granularities: Optional[List[str]] = None  # "word" and/or "segment"

# Minimal transcription result: the transcribed text only.
class AudioTranscriptionResponse:
    text: str

# Verbose transcription result with language, duration and timing detail.
class AudioTranscriptionVerboseResponse:
    language: str
    duration: float
    text: str
    words: Optional[List[AudioWord]] = None        # word-level timestamps
    segments: Optional[List[AudioSegment]] = None  # segment-level detail

# A single transcribed word with start/end times (seconds).
class AudioWord:
    word: str
    start: float
    end: float

# A transcribed segment with its decoding statistics.
class AudioSegment:
    id: int
    seek: int
    start: float
    end: float
    text: str
    tokens: List[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float

# Request payload for audio-to-English translation.
class AudioTranslationRequest:
    file: str                              # path to the audio file to translate
    model: str                             # translation model identifier
    prompt: Optional[str] = None           # optional prompt to guide translation
    response_format: Optional[str] = None  # json / text / verbose_json
    temperature: Optional[float] = None    # sampling temperature

# Minimal translation result: the English text only.
class AudioTranslationResponse:
    text: str

# Verbose translation result with source language and duration.
class AudioTranslationVerboseResponse:
    language: str
    duration: float
    text: str
    segments: Optional[List[AudioSegment]] = None  # segment-level detail

class AudioLanguage:
"""ISO-639-1 language codes for audio processing"""
ENGLISH = "en"
SPANISH = "es"
FRENCH = "fr"
GERMAN = "de"
ITALIAN = "it"
PORTUGUESE = "pt"
RUSSIAN = "ru"
JAPANESE = "ja"
KOREAN = "ko"
CHINESE = "zh"
class AudioTranscriptionResponseFormat:
JSON = "json"
TEXT = "text"
SRT = "srt"
VERBOSE_JSON = "verbose_json"
VTT = "vtt"
class AudioTimestampGranularities:
WORD = "word"
SEGMENT = "segment"whisper-large-v3 - High-accuracy transcription and translationwhisper-large-v2 - Previous generation Whisper modeltogether-ai/speech-v1 - Text-to-speech synthesisInstall with Tessl CLI
npx tessl i tessl/pypi-together