# Audio Processing

Speech synthesis, transcription, and translation capabilities supporting multiple languages and audio formats. Process audio content with state-of-the-art models that convert between speech and text in various languages.
## Capabilities

### Speech Synthesis

Generate natural-sounding speech from text input with various voice options.

```python { .api }
def create(
    model: str,
    input: str,
    voice: str,
    response_format: Optional[str] = None,
    speed: Optional[float] = None,
    **kwargs
) -> bytes:
    """
    Generate speech from text.

    Args:
        model: Speech synthesis model identifier
        input: Text to convert to speech
        voice: Voice identifier for synthesis
        response_format: Audio format (mp3, wav, flac, etc.)
        speed: Speech speed (0.25 to 4.0)

    Returns:
        Audio data as bytes
    """
```
### Audio Transcription

Convert spoken audio to text with language detection and formatting options.

```python { .api }
def create(
    file: str,
    model: str,
    language: Optional[str] = None,
    prompt: Optional[str] = None,
    response_format: Optional[str] = None,
    temperature: Optional[float] = None,
    timestamp_granularities: Optional[List[str]] = None,
    **kwargs
) -> AudioTranscriptionResponse:
    """
    Transcribe audio to text.

    Args:
        file: Path to audio file to transcribe
        model: Transcription model identifier
        language: Source language code (ISO-639-1)
        prompt: Optional prompt to guide transcription
        response_format: Response format (json, text, srt, verbose_json, vtt)
        temperature: Sampling temperature
        timestamp_granularities: Timestamp precision levels

    Returns:
        AudioTranscriptionResponse with transcribed text
    """
```
### Audio Translation

Translate audio from various languages to English text.

```python { .api }
def create(
    file: str,
    model: str,
    prompt: Optional[str] = None,
    response_format: Optional[str] = None,
    temperature: Optional[float] = None,
    **kwargs
) -> AudioTranslationResponse:
    """
    Translate audio to English text.

    Args:
        file: Path to audio file to translate
        model: Translation model identifier
        prompt: Optional prompt to guide translation
        response_format: Response format (json, text, verbose_json)
        temperature: Sampling temperature

    Returns:
        AudioTranslationResponse with translated text
    """
```
### Async Audio Operations

All audio operations support asynchronous execution.

```python { .api }
async def create(model: str, input: str, voice: str, **kwargs) -> bytes: ...
async def create(file: str, model: str, **kwargs) -> AudioTranscriptionResponse: ...
async def create(file: str, model: str, **kwargs) -> AudioTranslationResponse: ...
```
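One benefit of the async surface is concurrent batch work. A minimal sketch, assuming an async client (e.g. `AsyncTogether`, if the SDK provides one) whose `audio.transcriptions.create` mirrors the synchronous signature as a coroutine; the helper below only depends on that interface, not on a specific client class:

```python
import asyncio
from typing import Any, List

async def transcribe_many(client: Any, paths: List[str], model: str) -> List[Any]:
    """Transcribe several audio files concurrently.

    Works with any client whose audio.transcriptions.create is a
    coroutine function (an assumed async Together client, for example).
    """
    coros = [
        client.audio.transcriptions.create(file=path, model=model)
        for path in paths
    ]
    # gather returns results in input order, regardless of completion order
    return await asyncio.gather(*coros)

# Usage (hypothetical async client):
# from together import AsyncTogether
# client = AsyncTogether()
# responses = asyncio.run(
#     transcribe_many(client, ["a.mp3", "b.mp3"], "whisper-large-v3")
# )
```

Because the requests are I/O-bound, `asyncio.gather` lets transcriptions overlap instead of running back to back.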
## Usage Examples

### Text-to-Speech Generation

```python
from together import Together

client = Together()

# Generate speech from text
audio_data = client.audio.speech.create(
    model="together-ai/speech-v1",
    input="Hello, this is a test of the speech synthesis system.",
    voice="alloy",
    response_format="mp3",
    speed=1.0
)

# Save audio to file
with open("generated_speech.mp3", "wb") as f:
    f.write(audio_data)

print("Speech generated and saved to generated_speech.mp3")
```
### Audio Transcription

```python
# Transcribe audio file to text
response = client.audio.transcriptions.create(
    file="recorded_speech.mp3",
    model="whisper-large-v3",
    language="en",
    response_format="verbose_json",
    timestamp_granularities=["word", "segment"]
)

print(f"Transcribed text: {response.text}")
print(f"Language detected: {response.language}")
print(f"Duration: {response.duration} seconds")

# Access word-level timestamps (the words field may be absent or None)
if getattr(response, "words", None):
    print("Word-level timestamps:")
    for word in response.words[:10]:  # First 10 words
        print(f"  {word.word}: {word.start:.2f}s - {word.end:.2f}s")
```
### Audio Translation to English

```python
# Translate Spanish audio to English text
response = client.audio.translations.create(
    file="spanish_audio.mp3",
    model="whisper-large-v3",
    response_format="verbose_json"
)

print(f"Original language detected: {response.language}")
print(f"English translation: {response.text}")
print(f"Translation duration: {response.duration} seconds")
```
166
167
### Batch Audio Processing
168
169
```python
170
import os
171
172
def process_audio_files(client: Together, audio_dir: str, model: str):
173
"""Process all audio files in a directory."""
174
175
results = []
176
audio_files = [f for f in os.listdir(audio_dir) if f.endswith(('.mp3', '.wav', '.m4a'))]
177
178
for audio_file in audio_files:
179
file_path = os.path.join(audio_dir, audio_file)
180
181
try:
182
response = client.audio.transcriptions.create(
183
file=file_path,
184
model=model,
185
response_format="json"
186
)
187
188
results.append({
189
'file': audio_file,
190
'text': response.text,
191
'language': getattr(response, 'language', 'unknown'),
192
'status': 'success'
193
})
194
195
print(f"✅ Processed: {audio_file}")
196
197
except Exception as e:
198
results.append({
199
'file': audio_file,
200
'error': str(e),
201
'status': 'failed'
202
})
203
print(f"❌ Failed: {audio_file} - {e}")
204
205
return results
206
207
# Process all audio files
208
results = process_audio_files(client, "./audio_files", "whisper-large-v3")
209
210
# Save results
211
import json
212
with open("transcription_results.json", "w") as f:
213
json.dump(results, f, indent=2)
214
```
215
216
### Streaming Speech Synthesis
217
218
```python
219
def stream_speech(client: Together, text: str, voice: str = "alloy"):
220
"""Stream speech synthesis for real-time playback."""
221
222
# Break text into chunks for streaming
223
chunks = [text[i:i+200] for i in range(0, len(text), 200)]
224
225
audio_chunks = []
226
227
for i, chunk in enumerate(chunks):
228
audio_data = client.audio.speech.create(
229
model="together-ai/speech-v1",
230
input=chunk,
231
voice=voice,
232
response_format="mp3",
233
speed=1.0
234
)
235
236
audio_chunks.append(audio_data)
237
print(f"Generated chunk {i+1}/{len(chunks)}")
238
239
# Combine audio chunks
240
combined_audio = b''.join(audio_chunks)
241
242
with open("streamed_speech.mp3", "wb") as f:
243
f.write(combined_audio)
244
245
return combined_audio
246
247
# Generate speech in chunks
248
long_text = """
249
This is a long text that will be converted to speech in multiple chunks.
250
The streaming approach allows for better memory management and faster
251
perceived response times when processing large amounts of text.
252
"""
253
254
stream_speech(client, long_text, voice="nova")
255
```
256
257
### Multi-language Audio Processing
258
259
```python
260
def detect_and_process_audio(client: Together, audio_file: str):
261
"""Detect language and process accordingly."""
262
263
# First, transcribe to detect language
264
transcription = client.audio.transcriptions.create(
265
file=audio_file,
266
model="whisper-large-v3",
267
response_format="verbose_json"
268
)
269
270
detected_language = transcription.language
271
print(f"Detected language: {detected_language}")
272
273
if detected_language == "en":
274
# Already English, just return transcription
275
return {
276
'original_text': transcription.text,
277
'translated_text': transcription.text,
278
'language': detected_language
279
}
280
else:
281
# Translate to English
282
translation = client.audio.translations.create(
283
file=audio_file,
284
model="whisper-large-v3",
285
response_format="json"
286
)
287
288
return {
289
'original_text': transcription.text,
290
'translated_text': translation.text,
291
'language': detected_language
292
}
293
294
# Process multilingual audio
295
result = detect_and_process_audio(client, "multilingual_audio.mp3")
296
print(f"Original ({result['language']}): {result['original_text'][:100]}...")
297
print(f"English: {result['translated_text'][:100]}...")
298
```
## Types

### Speech Synthesis Types

```python { .api }
class AudioSpeechRequest:
    model: str
    input: str
    voice: str
    response_format: Optional[str] = None
    speed: Optional[float] = None

class AudioResponseFormat:
    MP3 = "mp3"
    OPUS = "opus"
    AAC = "aac"
    FLAC = "flac"
    WAV = "wav"
    PCM = "pcm"

class AudioResponseEncoding:
    MP3 = "mp3"
    OPUS = "opus"
    AAC = "aac"
    FLAC = "flac"
```
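The constants above, together with the documented `speed` range of 0.25 to 4.0, can back a small pre-flight check before calling `speech.create`. A sketch (`validate_speech_request` and `VALID_SPEECH_FORMATS` are illustrative names, not part of the SDK):

```python
# Illustrative helper, not part of the SDK: validate speech-request
# parameters against the constraints documented above.
VALID_SPEECH_FORMATS = {"mp3", "opus", "aac", "flac", "wav", "pcm"}

def validate_speech_request(response_format: str = "mp3", speed: float = 1.0) -> dict:
    """Return keyword arguments for speech.create, or raise ValueError."""
    if response_format not in VALID_SPEECH_FORMATS:
        raise ValueError(f"unsupported format: {response_format!r}")
    if not 0.25 <= speed <= 4.0:
        raise ValueError(f"speed must be in [0.25, 4.0], got {speed}")
    return {"response_format": response_format, "speed": speed}
```

Failing fast on the client side avoids a round trip for requests the API would reject anyway.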
### Transcription Types

```python { .api }
class AudioTranscriptionRequest:
    file: str
    model: str
    language: Optional[str] = None
    prompt: Optional[str] = None
    response_format: Optional[str] = None
    temperature: Optional[float] = None
    timestamp_granularities: Optional[List[str]] = None

class AudioTranscriptionResponse:
    text: str

class AudioTranscriptionVerboseResponse:
    language: str
    duration: float
    text: str
    words: Optional[List[AudioWord]] = None
    segments: Optional[List[AudioSegment]] = None

class AudioWord:
    word: str
    start: float
    end: float

class AudioSegment:
    id: int
    seek: int
    start: float
    end: float
    text: str
    tokens: List[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float
```
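The `start`, `end`, and `text` fields on `AudioSegment` map directly onto SRT subtitle blocks, which is handy when the API is asked for `verbose_json` rather than `srt`. A sketch of that conversion (the local dataclass mirrors only the fields used here; `segments_to_srt` is an illustrative helper, not an SDK function):

```python
from dataclasses import dataclass
from typing import List

@dataclass
class AudioSegment:
    """Mirrors the AudioSegment type above (only the fields used here)."""
    id: int
    start: float
    end: float
    text: str

def _srt_timestamp(seconds: float) -> str:
    """Format seconds as the SRT timestamp HH:MM:SS,mmm."""
    ms = round(seconds * 1000)
    hours, rem = divmod(ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, ms = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"

def segments_to_srt(segments: List[AudioSegment]) -> str:
    """Render verbose-transcription segments as an SRT subtitle file."""
    blocks = [
        f"{i}\n{_srt_timestamp(seg.start)} --> {_srt_timestamp(seg.end)}\n{seg.text.strip()}"
        for i, seg in enumerate(segments, start=1)  # SRT indices start at 1
    ]
    return "\n\n".join(blocks) + "\n"
```

With a real `verbose_json` response, `segments_to_srt(response.segments)` would produce subtitle text ready to write to a `.srt` file.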
### Translation Types

```python { .api }
class AudioTranslationRequest:
    file: str
    model: str
    prompt: Optional[str] = None
    response_format: Optional[str] = None
    temperature: Optional[float] = None

class AudioTranslationResponse:
    text: str

class AudioTranslationVerboseResponse:
    language: str
    duration: float
    text: str
    segments: Optional[List[AudioSegment]] = None
```
386
387
### Language and Format Options
388
389
```python { .api }
390
class AudioLanguage:
391
"""ISO-639-1 language codes for audio processing"""
392
ENGLISH = "en"
393
SPANISH = "es"
394
FRENCH = "fr"
395
GERMAN = "de"
396
ITALIAN = "it"
397
PORTUGUESE = "pt"
398
RUSSIAN = "ru"
399
JAPANESE = "ja"
400
KOREAN = "ko"
401
CHINESE = "zh"
402
403
class AudioTranscriptionResponseFormat:
404
JSON = "json"
405
TEXT = "text"
406
SRT = "srt"
407
VERBOSE_JSON = "verbose_json"
408
VTT = "vtt"
409
410
class AudioTimestampGranularities:
411
WORD = "word"
412
SEGMENT = "segment"
413
```
414
415
## Supported Models
416
417
- `whisper-large-v3` - High-accuracy transcription and translation
418
- `whisper-large-v2` - Previous generation Whisper model
419
- `together-ai/speech-v1` - Text-to-speech synthesis