Python client for Together's Cloud Platform providing comprehensive AI model APIs
Speech synthesis, transcription, and translation capabilities supporting multiple languages and audio formats. Use state-of-the-art models to convert between speech and text across a wide range of languages.
Generate natural-sounding speech from text input with various voice options.
def create(
    model: str,
    input: str,
    voice: str,
    response_format: Optional[str] = None,
    speed: Optional[float] = None,
    **kwargs
) -> bytes:
    """
    Generate speech from text.

    Args:
        model: Speech synthesis model identifier
        input: Text to convert to speech
        voice: Voice identifier for synthesis
        response_format: Audio format (mp3, wav, flac, etc.)
        speed: Speech speed (0.25 to 4.0)

    Returns:
        Audio data as bytes (raw encoded audio in the requested format)
"""Convert spoken audio to text with language detection and formatting options.
def create(
    file: str,
    model: str,
    language: Optional[str] = None,
    prompt: Optional[str] = None,
    response_format: Optional[str] = None,
    temperature: Optional[float] = None,
    timestamp_granularities: Optional[List[str]] = None,
    **kwargs
) -> AudioTranscriptionResponse:
    """
    Transcribe audio to text.

    Args:
        file: Path to audio file to transcribe
        model: Transcription model identifier
        language: Source language code (ISO-639-1)
        prompt: Optional prompt to guide transcription
        response_format: Response format (json, text, srt, verbose_json, vtt)
        temperature: Sampling temperature
        timestamp_granularities: Timestamp precision levels ("word", "segment")

    Returns:
        AudioTranscriptionResponse with transcribed text
"""Translate audio from various languages to English text.
def create(
    file: str,
    model: str,
    prompt: Optional[str] = None,
    response_format: Optional[str] = None,
    temperature: Optional[float] = None,
    **kwargs
) -> AudioTranslationResponse:
    """
    Translate audio to English text.

    Args:
        file: Path to audio file to translate
        model: Translation model identifier
        prompt: Optional prompt to guide translation
        response_format: Response format (json, text, verbose_json)
        temperature: Sampling temperature

    Returns:
        AudioTranslationResponse with translated text
"""All audio operations support asynchronous execution.
# Async variants mirror the synchronous APIs above:
# speech synthesis, transcription, and translation respectively.
async def create(model: str, input: str, voice: str, **kwargs) -> bytes: ...
async def create(file: str, model: str, **kwargs) -> AudioTranscriptionResponse: ...
async def create(file: str, model: str, **kwargs) -> AudioTranslationResponse: ...

# Usage examples
from together import Together
client = Together()

# --- Text-to-speech: synthesize audio and persist it to disk ---
speech_bytes = client.audio.speech.create(
    model="together-ai/speech-v1",
    input="Hello, this is a test of the speech synthesis system.",
    voice="alloy",
    response_format="mp3",
    speed=1.0,
)

# The API returns raw MP3 bytes; write them out as a playable file.
with open("generated_speech.mp3", "wb") as out_file:
    out_file.write(speech_bytes)
print("Speech generated and saved to generated_speech.mp3")

# Transcribe audio file to text
# --- Speech-to-text: verbose transcription with word/segment timestamps ---
transcript = client.audio.transcriptions.create(
    file="recorded_speech.mp3",
    model="whisper-large-v3",
    language="en",
    response_format="verbose_json",
    timestamp_granularities=["word", "segment"],
)

print(f"Transcribed text: {transcript.text}")
print(f"Language detected: {transcript.language}")
print(f"Duration: {transcript.duration} seconds")

# Word-level timing is only present on verbose responses, so guard for it.
if hasattr(transcript, "words"):
    print("Word-level timestamps:")
    for word in transcript.words[:10]:  # show the first 10 words
        print(f" {word.word}: {word.start:.2f}s - {word.end:.2f}s")

# Translate Spanish audio to English text
# --- Audio translation: non-English speech in, English text out ---
translated = client.audio.translations.create(
    file="spanish_audio.mp3",
    model="whisper-large-v3",
    response_format="verbose_json",
)

print(f"Original language detected: {translated.language}")
print(f"English translation: {translated.text}")
print(f"Translation duration: {translated.duration} seconds")

import os
def process_audio_files(client: Together, audio_dir: str, model: str):
    """Transcribe every supported audio file in a directory.

    Args:
        client: Together API client used for the transcription calls.
        audio_dir: Directory scanned (non-recursively) for audio files.
        model: Transcription model identifier (e.g. "whisper-large-v3").

    Returns:
        A list of per-file result dicts. Successful entries carry
        'file', 'text', 'language' and 'status' == 'success'; failed
        entries carry 'file', 'error' and 'status' == 'failed'.
    """
    supported_exts = ('.mp3', '.wav', '.m4a')
    results = []
    # Compare the lower-cased extension so files like "TRACK.MP3" are not
    # silently skipped; sorted() makes the processing order (and therefore
    # the result list) deterministic instead of filesystem-dependent.
    audio_files = sorted(
        f for f in os.listdir(audio_dir)
        if os.path.splitext(f)[1].lower() in supported_exts
    )
    for audio_file in audio_files:
        file_path = os.path.join(audio_dir, audio_file)
        try:
            response = client.audio.transcriptions.create(
                file=file_path,
                model=model,
                response_format="json"
            )
            results.append({
                'file': audio_file,
                'text': response.text,
                'language': getattr(response, 'language', 'unknown'),
                'status': 'success'
            })
            print(f"✅ Processed: {audio_file}")
        except Exception as e:
            # Best-effort batch processing: record the failure and continue
            # with the remaining files rather than aborting the whole run.
            results.append({
                'file': audio_file,
                'error': str(e),
                'status': 'failed'
            })
            print(f"❌ Failed: {audio_file} - {e}")
    return results
# Batch-transcribe a directory and persist the outcome as JSON.
results = process_audio_files(client, "./audio_files", "whisper-large-v3")

import json
with open("transcription_results.json", "w") as results_file:
    json.dump(results, results_file, indent=2)

def stream_speech(client: Together, text: str, voice: str = "alloy"):
    """Synthesize speech chunk-by-chunk and save the combined audio.

    Splits the text into fixed-size pieces, synthesizes each piece
    separately, then concatenates the audio and writes it to
    "streamed_speech.mp3". Returns the combined audio bytes.
    """
    step = 200
    # Fixed-size slices keep each synthesis request small.
    pieces = [text[start:start + step] for start in range(0, len(text), step)]
    generated = []
    for index, piece in enumerate(pieces):
        audio_data = client.audio.speech.create(
            model="together-ai/speech-v1",
            input=piece,
            voice=voice,
            response_format="mp3",
            speed=1.0,
        )
        generated.append(audio_data)
        print(f"Generated chunk {index + 1}/{len(pieces)}")
    combined_audio = b''.join(generated)
    with open("streamed_speech.mp3", "wb") as f:
        f.write(combined_audio)
    return combined_audio
# Generate speech in chunks
long_text = """
This is a long text that will be converted to speech in multiple chunks.
The streaming approach allows for better memory management and faster
perceived response times when processing large amounts of text.
"""
stream_speech(client, long_text, voice="nova")

def detect_and_process_audio(client: Together, audio_file: str):
    """Transcribe audio, then translate to English if it isn't English.

    Returns a dict with 'original_text', 'translated_text' and
    'language' (the detected source language).
    """
    # Transcribe first: the verbose response carries the detected language.
    transcription = client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-large-v3",
        response_format="verbose_json"
    )
    detected_language = transcription.language
    print(f"Detected language: {detected_language}")

    # NOTE(review): this comparison assumes the API reports an ISO code
    # like "en" rather than a full language name — confirm with the API.
    if detected_language != "en":
        # Non-English source: run a second pass through the translation API.
        translation = client.audio.translations.create(
            file=audio_file,
            model="whisper-large-v3",
            response_format="json"
        )
        english_text = translation.text
    else:
        # Already English — the transcription doubles as the translation.
        english_text = transcription.text
    return {
        'original_text': transcription.text,
        'translated_text': english_text,
        'language': detected_language
    }
# Process multilingual audio
result = detect_and_process_audio(client, "multilingual_audio.mp3")
print(f"Original ({result['language']}): {result['original_text'][:100]}...")
print(f"English: {result['translated_text'][:100]}...")

# Request payload for speech synthesis (text-to-speech).
class AudioSpeechRequest:
    model: str                             # speech synthesis model identifier
    input: str                             # text to convert to speech
    voice: str                             # voice identifier
    response_format: Optional[str] = None  # audio format; see AudioResponseFormat
    speed: Optional[float] = None          # speech speed (0.25 to 4.0)

# Accepted audio output format names.
class AudioResponseFormat:
    MP3 = "mp3"
    OPUS = "opus"
    AAC = "aac"
    FLAC = "flac"
    WAV = "wav"
    PCM = "pcm"

# Accepted audio response encodings.
class AudioResponseEncoding:
    MP3 = "mp3"
    OPUS = "opus"
    AAC = "aac"
    FLAC = "flac"

# Request payload for audio transcription (speech-to-text).
class AudioTranscriptionRequest:
    file: str                              # path to the audio file to transcribe
    model: str                             # transcription model identifier
    language: Optional[str] = None         # source language code (ISO-639-1)
    prompt: Optional[str] = None           # optional prompt to guide transcription
    response_format: Optional[str] = None  # json / text / srt / verbose_json / vtt
    temperature: Optional[float] = None    # sampling temperature
    timestamp_granularities: Optional[List[str]] = None  # "word" and/or "segment"

# Minimal transcription result: the transcribed text only.
class AudioTranscriptionResponse:
    text: str

# Verbose transcription result with language, duration and timing detail.
class AudioTranscriptionVerboseResponse:
    language: str
    duration: float
    text: str
    words: Optional[List[AudioWord]] = None        # word-level timestamps
    segments: Optional[List[AudioSegment]] = None  # segment-level detail

# A single transcribed word with start/end times (seconds).
class AudioWord:
    word: str
    start: float
    end: float

# A transcribed segment with its decoding statistics.
class AudioSegment:
    id: int
    seek: int
    start: float
    end: float
    text: str
    tokens: List[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float

# Request payload for audio-to-English translation.
class AudioTranslationRequest:
    file: str                              # path to the audio file to translate
    model: str                             # translation model identifier
    prompt: Optional[str] = None           # optional prompt to guide translation
    response_format: Optional[str] = None  # json / text / verbose_json
    temperature: Optional[float] = None    # sampling temperature

# Minimal translation result: the English text only.
class AudioTranslationResponse:
    text: str

# Verbose translation result with source language and duration.
class AudioTranslationVerboseResponse:
    language: str
    duration: float
    text: str
    segments: Optional[List[AudioSegment]] = None  # segment-level detail

class AudioLanguage:
"""ISO-639-1 language codes for audio processing"""
ENGLISH = "en"
SPANISH = "es"
FRENCH = "fr"
GERMAN = "de"
ITALIAN = "it"
PORTUGUESE = "pt"
RUSSIAN = "ru"
JAPANESE = "ja"
KOREAN = "ko"
CHINESE = "zh"
class AudioTranscriptionResponseFormat:
JSON = "json"
TEXT = "text"
SRT = "srt"
VERBOSE_JSON = "verbose_json"
VTT = "vtt"
class AudioTimestampGranularities:
WORD = "word"
SEGMENT = "segment"whisper-large-v3 - High-accuracy transcription and translationwhisper-large-v2 - Previous generation Whisper modeltogether-ai/speech-v1 - Text-to-speech synthesisInstall with Tessl CLI
npx tessl i tessl/pypi-together