# Audio Processing

Speech synthesis, transcription, and translation capabilities supporting multiple languages and audio formats. Process audio content with state-of-the-art models for converting between speech and text in various languages.

## Capabilities

### Speech Synthesis

Generate natural-sounding speech from text input with various voice options.

```python { .api }
def create(
    model: str,
    input: str,
    voice: str,
    response_format: Optional[str] = None,
    speed: Optional[float] = None,
    **kwargs
) -> bytes:
    """
    Generate speech from text.

    Args:
        model: Speech synthesis model identifier
        input: Text to convert to speech
        voice: Voice identifier for synthesis
        response_format: Audio format (mp3, wav, flac, etc.)
        speed: Speech speed (0.25 to 4.0)

    Returns:
        Audio data as bytes
    """
```

### Audio Transcription

Convert spoken audio to text with language detection and formatting options.

```python { .api }
def create(
    file: str,
    model: str,
    language: Optional[str] = None,
    prompt: Optional[str] = None,
    response_format: Optional[str] = None,
    temperature: Optional[float] = None,
    timestamp_granularities: Optional[List[str]] = None,
    **kwargs
) -> AudioTranscriptionResponse:
    """
    Transcribe audio to text.

    Args:
        file: Path to audio file to transcribe
        model: Transcription model identifier
        language: Source language code (ISO-639-1)
        prompt: Optional prompt to guide transcription
        response_format: Response format (json, text, srt, verbose_json, vtt)
        temperature: Sampling temperature
        timestamp_granularities: Timestamp precision levels

    Returns:
        AudioTranscriptionResponse with transcribed text
    """
```

### Audio Translation

Translate audio from various languages to English text.

```python { .api }
def create(
    file: str,
    model: str,
    prompt: Optional[str] = None,
    response_format: Optional[str] = None,
    temperature: Optional[float] = None,
    **kwargs
) -> AudioTranslationResponse:
    """
    Translate audio to English text.

    Args:
        file: Path to audio file to translate
        model: Translation model identifier
        prompt: Optional prompt to guide translation
        response_format: Response format (json, text, verbose_json)
        temperature: Sampling temperature

    Returns:
        AudioTranslationResponse with translated text
    """
```

### Async Audio Operations

All audio operations support asynchronous execution.

```python { .api }
async def create(model: str, input: str, voice: str, **kwargs) -> bytes: ...
async def create(file: str, model: str, **kwargs) -> AudioTranscriptionResponse: ...
async def create(file: str, model: str, **kwargs) -> AudioTranslationResponse: ...
```

## Usage Examples

### Text-to-Speech Generation

```python
from together import Together

client = Together()

# Generate speech from text
audio_data = client.audio.speech.create(
    model="together-ai/speech-v1",
    input="Hello, this is a test of the speech synthesis system.",
    voice="alloy",
    response_format="mp3",
    speed=1.0
)

# Save audio to file
with open("generated_speech.mp3", "wb") as f:
    f.write(audio_data)

print("Speech generated and saved to generated_speech.mp3")
```

### Audio Transcription

```python
# Transcribe audio file to text
response = client.audio.transcriptions.create(
    file="recorded_speech.mp3",
    model="whisper-large-v3",
    language="en",
    response_format="verbose_json",
    timestamp_granularities=["word", "segment"]
)

print(f"Transcribed text: {response.text}")
print(f"Language detected: {response.language}")
print(f"Duration: {response.duration} seconds")

# Access word-level timestamps
if hasattr(response, 'words'):
    print("Word-level timestamps:")
    for word in response.words[:10]:  # First 10 words
        print(f"  {word.word}: {word.start:.2f}s - {word.end:.2f}s")
```

### Audio Translation to English

```python
# Translate Spanish audio to English text
response = client.audio.translations.create(
    file="spanish_audio.mp3",
    model="whisper-large-v3",
    response_format="verbose_json"
)

print(f"Original language detected: {response.language}")
print(f"English translation: {response.text}")
print(f"Translation duration: {response.duration} seconds")
```

### Batch Audio Processing

```python
import os

def process_audio_files(client: Together, audio_dir: str, model: str):
    """Process all audio files in a directory."""

    results = []
    audio_files = [f for f in os.listdir(audio_dir) if f.endswith(('.mp3', '.wav', '.m4a'))]

    for audio_file in audio_files:
        file_path = os.path.join(audio_dir, audio_file)

        try:
            response = client.audio.transcriptions.create(
                file=file_path,
                model=model,
                response_format="json"
            )

            results.append({
                'file': audio_file,
                'text': response.text,
                'language': getattr(response, 'language', 'unknown'),
                'status': 'success'
            })

            print(f"✅ Processed: {audio_file}")

        except Exception as e:
            results.append({
                'file': audio_file,
                'error': str(e),
                'status': 'failed'
            })
            print(f"❌ Failed: {audio_file} - {e}")

    return results

# Process all audio files
results = process_audio_files(client, "./audio_files", "whisper-large-v3")

# Save results
import json
with open("transcription_results.json", "w") as f:
    json.dump(results, f, indent=2)
```

### Streaming Speech Synthesis

```python
def stream_speech(client: Together, text: str, voice: str = "alloy"):
    """Stream speech synthesis for real-time playback."""

    # Break text into chunks for streaming
    chunks = [text[i:i+200] for i in range(0, len(text), 200)]

    audio_chunks = []

    for i, chunk in enumerate(chunks):
        audio_data = client.audio.speech.create(
            model="together-ai/speech-v1",
            input=chunk,
            voice=voice,
            response_format="mp3",
            speed=1.0
        )

        audio_chunks.append(audio_data)
        print(f"Generated chunk {i+1}/{len(chunks)}")

    # Combine audio chunks
    combined_audio = b''.join(audio_chunks)

    with open("streamed_speech.mp3", "wb") as f:
        f.write(combined_audio)

    return combined_audio

# Generate speech in chunks
long_text = """
This is a long text that will be converted to speech in multiple chunks.
The streaming approach allows for better memory management and faster
perceived response times when processing large amounts of text.
"""

stream_speech(client, long_text, voice="nova")
```

### Multi-language Audio Processing

```python
def detect_and_process_audio(client: Together, audio_file: str):
    """Detect language and process accordingly."""

    # First, transcribe to detect language
    transcription = client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-large-v3",
        response_format="verbose_json"
    )

    detected_language = transcription.language
    print(f"Detected language: {detected_language}")

    if detected_language == "en":
        # Already English, just return transcription
        return {
            'original_text': transcription.text,
            'translated_text': transcription.text,
            'language': detected_language
        }
    else:
        # Translate to English
        translation = client.audio.translations.create(
            file=audio_file,
            model="whisper-large-v3",
            response_format="json"
        )

        return {
            'original_text': transcription.text,
            'translated_text': translation.text,
            'language': detected_language
        }

# Process multilingual audio
result = detect_and_process_audio(client, "multilingual_audio.mp3")
print(f"Original ({result['language']}): {result['original_text'][:100]}...")
print(f"English: {result['translated_text'][:100]}...")
```

## Types

### Speech Synthesis Types

```python { .api }
class AudioSpeechRequest:
    model: str
    input: str
    voice: str
    response_format: Optional[str] = None
    speed: Optional[float] = None

class AudioResponseFormat:
    MP3 = "mp3"
    OPUS = "opus"
    AAC = "aac"
    FLAC = "flac"
    WAV = "wav"
    PCM = "pcm"

class AudioResponseEncoding:
    MP3 = "mp3"
    OPUS = "opus"
    AAC = "aac"
    FLAC = "flac"
```

### Transcription Types

```python { .api }
class AudioTranscriptionRequest:
    file: str
    model: str
    language: Optional[str] = None
    prompt: Optional[str] = None
    response_format: Optional[str] = None
    temperature: Optional[float] = None
    timestamp_granularities: Optional[List[str]] = None

class AudioTranscriptionResponse:
    text: str

class AudioTranscriptionVerboseResponse:
    language: str
    duration: float
    text: str
    words: Optional[List[AudioWord]] = None
    segments: Optional[List[AudioSegment]] = None

class AudioWord:
    word: str
    start: float
    end: float

class AudioSegment:
    id: int
    seek: int
    start: float
    end: float
    text: str
    tokens: List[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float
```

### Translation Types

```python { .api }
class AudioTranslationRequest:
    file: str
    model: str
    prompt: Optional[str] = None
    response_format: Optional[str] = None
    temperature: Optional[float] = None

class AudioTranslationResponse:
    text: str

class AudioTranslationVerboseResponse:
    language: str
    duration: float
    text: str
    segments: Optional[List[AudioSegment]] = None
```

### Language and Format Options

```python { .api }
class AudioLanguage:
    """ISO-639-1 language codes for audio processing"""
    ENGLISH = "en"
    SPANISH = "es"
    FRENCH = "fr"
    GERMAN = "de"
    ITALIAN = "it"
    PORTUGUESE = "pt"
    RUSSIAN = "ru"
    JAPANESE = "ja"
    KOREAN = "ko"
    CHINESE = "zh"

class AudioTranscriptionResponseFormat:
    JSON = "json"
    TEXT = "text"
    SRT = "srt"
    VERBOSE_JSON = "verbose_json"
    VTT = "vtt"

class AudioTimestampGranularities:
    WORD = "word"
    SEGMENT = "segment"
```

## Supported Models

- `whisper-large-v3` - High-accuracy transcription and translation
- `whisper-large-v2` - Previous generation Whisper model
- `together-ai/speech-v1` - Text-to-speech synthesis