or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

audio-io.md datasets.md effects.md functional.md index.md models.md pipelines.md streaming.md transforms.md utils.md

utils.md docs/

0

# Utility Functions

1

2

Helper functions for audio file management, format conversion, backend configuration, and integration with other audio processing libraries. These utilities provide essential support functionality for TorchAudio applications.

3

4

## Capabilities

5

6

### Backend Management

7

8

Control and query audio processing backends.

9

10

```python { .api }

11

def list_audio_backends() -> List[str]:

12

"""

13

List available audio backends.

14

15

Returns:

16

List[str]: Available backends (e.g., ["ffmpeg", "sox", "soundfile"])

17

"""

18

19

def get_audio_backend() -> Optional[str]:

20

"""

21

Get currently active audio backend.

22

23

Returns:

24

Optional[str]: Current backend name or None if using dispatcher mode

25

"""

26

27

def set_audio_backend(backend: Optional[str]) -> None:

28

"""

29

Set global audio backend.

30

31

Args:

32

backend: Backend name ("sox_io", "soundfile") or None to unset

33

34

Note:

35

This function is deprecated. Modern TorchAudio uses dispatcher mode

36

and automatically selects the best available backend.

37

"""

38

```

39

40

### Asset Management

41

42

Download and manage TorchAudio assets and example files.

43

44

```python { .api }

45

def download_asset(filename: str, subfolder: str = "") -> str:

46

"""

47

Download asset file from TorchAudio repository.

48

49

Args:

50

filename: Name of file to download

51

subfolder: Subfolder within assets directory

52

53

Returns:

54

str: Path to downloaded file

55

56

Examples:

57

>>> # Download sample audio file

58

>>> path = download_asset("steam-train-whistle-daniel_simon.wav")

59

>>> waveform, sr = torchaudio.load(path)

60

61

>>> # Download tutorial data

62

>>> path = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")

63

"""

64

```

65

66

### SoX Utilities

67

68

Integration with SoX (Sound eXchange) audio processing library.

69

70

```python { .api }

71

# SoX Effects Management

72

def init_sox_effects() -> None:

73

"""Initialize SoX effects library."""

74

75

def shutdown_sox_effects() -> None:

76

"""Shutdown SoX effects library and clean up resources."""

77

78

def effect_names() -> List[str]:

79

"""

80

Get list of available SoX effects.

81

82

Returns:

83

List[str]: Names of available SoX effects

84

"""

85

86

def apply_effects_tensor(tensor: torch.Tensor, sample_rate: int, effects: List[List[str]],

87

channels_first: bool = True) -> Tuple[torch.Tensor, int]:

88

"""

89

Apply SoX effects to tensor.

90

91

Args:

92

tensor: Input audio tensor

93

sample_rate: Sample rate of input

94

effects: Effect chain, given as a list of effects (each effect is [name, *args])

95

channels_first: Whether tensor is (channels, time) or (time, channels)

96

97

Returns:

98

Tuple[torch.Tensor, int]: (processed_tensor, output_sample_rate)

99

100

Examples:

101

>>> # Apply reverb and normalize

102

>>> effects = [

103

... ["reverb", "50"],

104

... ["norm", "-1"]

105

... ]

106

>>> processed, sr = apply_effects_tensor(waveform, 44100, effects)

107

"""

108

109

def apply_effects_file(path: str, effects: List[List[str]], normalize: bool = True,

110

channels_first: bool = True, format: Optional[str] = None) -> Tuple[torch.Tensor, int]:

111

"""

112

Apply SoX effects to audio file.

113

114

Args:

115

path: Path to input audio file

116

effects: Effect chain, given as a list of effects (each effect is [name, *args])

117

normalize: If True, convert the native sample type to float32 (this is not loudness normalization; it only affects integer WAV input)

118

channels_first: Whether to return (channels, time) format

119

format: Input format override

120

121

Returns:

122

Tuple[torch.Tensor, int]: (processed_tensor, sample_rate)

123

"""

124

```

125

126

### SoX Utilities Module

127

128

Detailed SoX integration utilities.

129

130

```python { .api }

131

# In torchaudio.utils.sox_utils module

132

def list_effects() -> List[str]:

133

"""List all available SoX effects."""

134

135

def list_read_formats() -> List[str]:

136

"""List audio formats that SoX can read."""

137

138

def list_write_formats() -> List[str]:

139

"""List audio formats that SoX can write."""

140

141

def get_buffer_size() -> int:

142

"""Get SoX internal buffer size."""

143

144

def set_buffer_size(buffer_size: int) -> None:

145

"""Set SoX internal buffer size."""

146

147

def get_verbosity() -> int:

148

"""Get SoX verbosity level."""

149

150

def set_verbosity(verbosity: int) -> None:

151

"""Set SoX verbosity level."""

152

```

153

154

### FFmpeg Utilities

155

156

Integration with FFmpeg media processing framework.

157

158

```python { .api }

159

# In torchaudio.utils.ffmpeg_utils module (from torio)

160

def get_ffmpeg_version() -> str:

161

"""Get FFmpeg version string."""

162

163

def get_supported_decoders() -> List[str]:

164

"""Get list of supported audio decoders."""

165

166

def get_supported_encoders() -> List[str]:

167

"""Get list of supported audio encoders."""

168

169

def get_supported_demuxers() -> List[str]:

170

"""Get list of supported demuxers (input formats)."""

171

172

def get_supported_muxers() -> List[str]:

173

"""Get list of supported muxers (output formats)."""

174

175

def get_audio_decoders() -> List[str]:

176

"""Get audio-specific decoders."""

177

178

def get_audio_encoders() -> List[str]:

179

"""Get audio-specific encoders."""

180

```

181

182

### Kaldi I/O Integration

183

184

Functions for working with Kaldi ASR toolkit file formats.

185

186

```python { .api }

187

def read_vec_int_ark(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:

188

"""

189

Read integer vector ark files.

190

191

Args:

192

file_or_fd: File path or file descriptor

193

194

Yields:

195

Tuple[str, torch.Tensor]: (utterance_id, vector)

196

"""

197

198

def read_vec_flt_ark(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:

199

"""

200

Read float vector ark files.

201

202

Args:

203

file_or_fd: File path or file descriptor

204

205

Yields:

206

Tuple[str, torch.Tensor]: (utterance_id, vector)

207

"""

208

209

def read_vec_flt_scp(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:

210

"""

211

Read float vector scp files.

212

213

Args:

214

file_or_fd: File path or file descriptor

215

216

Yields:

217

Tuple[str, torch.Tensor]: (utterance_id, vector)

218

"""

219

220

def read_mat_ark(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:

221

"""

222

Read matrix ark files.

223

224

Args:

225

file_or_fd: File path or file descriptor

226

227

Yields:

228

Tuple[str, torch.Tensor]: (utterance_id, matrix)

229

"""

230

231

def read_mat_scp(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:

232

"""

233

Read matrix scp files.

234

235

Args:

236

file_or_fd: File path or file descriptor

237

238

Yields:

239

Tuple[str, torch.Tensor]: (utterance_id, matrix)

240

"""

241

```

242

243

### Compliance Utilities

244

245

Compatibility functions for other audio processing libraries.

246

247

```python { .api }

248

# In torchaudio.compliance.kaldi module

249

def fbank(waveform: torch.Tensor, blackman_coeff: float = 0.42,

250

channel: int = -1, dither: float = 0.0, energy_floor: float = 1.0,

251

frame_length: float = 25.0, frame_shift: float = 10.0,

252

high_freq: float = 0.0, htk_compat: bool = False,

253

low_freq: float = 20.0, min_duration: float = 0.0,

254

num_mel_bins: int = 23, preemphasis_coefficient: float = 0.97,

255

raw_energy: bool = True, remove_dc_offset: bool = True,

256

round_to_power_of_two: bool = True, sample_frequency: float = 16000.0,

257

snip_edges: bool = True, subtract_mean: bool = False,

258

use_energy: bool = False, use_log_fbank: bool = True,

259

use_power: bool = True, vtln_high: float = -500.0,

260

vtln_low: float = 100.0, vtln_warp: float = 1.0,

261

window_type: str = "povey") -> torch.Tensor:

262

"""

263

Kaldi-compatible filter bank feature extraction.

264

265

Args:

266

waveform: Input waveform

267

(the remaining keyword arguments mirror the options of Kaldi's compute-fbank-feats; defaults are listed in the signature)

268

269

Returns:

270

torch.Tensor: Filter bank features

271

"""

272

273

def mfcc(waveform: torch.Tensor, num_ceps: int = 13, **kwargs) -> torch.Tensor:

274

"""

275

Kaldi-compatible MFCC feature extraction.

276

277

Args:

278

waveform: Input waveform

279

num_ceps: Number of cepstral coefficients

280

**kwargs: Additional fbank parameters

281

282

Returns:

283

torch.Tensor: MFCC features

284

"""

285

286

def spectrogram(waveform: torch.Tensor, **kwargs) -> torch.Tensor:

287

"""Kaldi-compatible spectrogram computation."""

288

```

289

290

## Usage Examples

291

292

### Backend Configuration

293

294

```python

295

import torchaudio

296

297

# Check available backends

298

backends = torchaudio.list_audio_backends()

299

print(f"Available backends: {backends}")

300

301

# Check current backend (returns None in dispatcher mode)

302

current = torchaudio.get_audio_backend()

303

print(f"Current backend: {current}")

304

305

# In older versions, you could set backend manually:

306

# torchaudio.set_audio_backend("sox_io") # Now deprecated

307

```

308

309

### Asset Management

310

311

```python

312

import torchaudio

313

from torchaudio.utils import download_asset

314

315

# Download sample audio file

316

audio_path = download_asset("steam-train-whistle-daniel_simon.wav")

317

waveform, sample_rate = torchaudio.load(audio_path)

318

319

print(f"Downloaded sample: {audio_path}")

320

print(f"Audio shape: {waveform.shape}")

321

print(f"Sample rate: {sample_rate}")

322

323

# Download tutorial data

324

tutorial_path = download_asset(

325

"tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"

326

)

327

```

328

329

### SoX Effects Processing

330

331

```python

332

import torchaudio

333

from torchaudio.sox_effects import apply_effects_tensor, effect_names

334

335

# Check available effects

336

effects = effect_names()

337

print(f"Available SoX effects: {len(effects)}")

338

print(f"First 10 effects: {effects[:10]}")

339

340

# Apply effects chain

341

waveform, sample_rate = torchaudio.load("input.wav")

342

343

effects_chain = [

344

["reverb", "50"], # Add reverb

345

["bass", "+5"], # Boost bass by 5dB

346

["treble", "+2"], # Boost treble by 2dB

347

["norm", "-1"], # Normalize to -1dB

348

["rate", "44100"] # Resample to 44.1kHz

349

]

350

351

processed_waveform, new_sr = apply_effects_tensor(

352

waveform, sample_rate, effects_chain

353

)

354

355

torchaudio.save("processed.wav", processed_waveform, new_sr)

356

```

357

358

### Format Conversion Utility

359

360

```python

361

import torchaudio

362

from torchaudio.sox_effects import apply_effects_file

363

364

def convert_audio_file(input_path: str, output_path: str,

365

target_sr: int = 44100, target_channels: int = 2):

366

"""Convert audio file format and properties."""

367

368

effects = [

369

["channels", str(target_channels)], # Convert to stereo/mono

370

["rate", str(target_sr)], # Resample

371

["norm", "-1"] # Normalize

372

]

373

374

# Apply effects and load

375

waveform, sr = apply_effects_file(input_path, effects)

376

377

# Save in new format

378

torchaudio.save(output_path, waveform, sr)

379

print(f"Converted {input_path} -> {output_path}")

380

print(f"New format: {sr} Hz, {waveform.shape[0]} channels")

381

382

# Convert various formats

383

convert_audio_file("input.mp3", "output.wav", target_sr=48000, target_channels=1)

384

```

385

386

### Kaldi Integration

387

388

```python

389

import torchaudio

390

from torchaudio.kaldi_io import read_mat_ark

391

392

# Read Kaldi archive files

393

def process_kaldi_features(ark_file: str):

394

"""Process features from Kaldi ark file."""

395

396

for utterance_id, feature_matrix in read_mat_ark(ark_file):

397

print(f"Processing {utterance_id}: {feature_matrix.shape}")

398

399

# read_mat_ark yields torch tensors, so no conversion is needed before processing

400

features = feature_matrix # Already a tensor

401

402

# Apply processing (e.g., normalization, augmentation)

403

processed = torchaudio.functional.sliding_window_cmn(

404

features.T.unsqueeze(0) # Add batch dim and transpose

405

).squeeze(0).T

406

407

# Further processing...

408

yield utterance_id, processed

409

410

# Process Kaldi ark file

411

# for utt_id, features in process_kaldi_features("features.ark"):

412

# # Process each utterance

413

# pass

414

```

415

416

### FFmpeg Capabilities Query

417

418

```python

419

from torchaudio.utils import ffmpeg_utils

420

421

# Check FFmpeg capabilities

422

print(f"FFmpeg version: {ffmpeg_utils.get_ffmpeg_version()}")

423

print(f"Audio decoders: {len(ffmpeg_utils.get_audio_decoders())}")

424

print(f"Audio encoders: {len(ffmpeg_utils.get_audio_encoders())}")

425

426

# Check specific codec support

427

decoders = ffmpeg_utils.get_audio_decoders()

428

encoders = ffmpeg_utils.get_audio_encoders()

429

430

print("Supported formats:")

431

print(f"MP3 decode: {'mp3' in decoders}")

432

print(f"AAC encode: {'aac' in encoders}")

433

print(f"FLAC support: {'flac' in decoders and 'flac' in encoders}")

434

```

435

436

These utilities provide essential infrastructure for audio processing applications, enabling integration with external libraries, format handling, and system configuration.