0
# Utility Functions
1
2
Helper functions for audio file management, format conversion, backend configuration, and integration with other audio processing libraries. These utilities provide essential support functionality for TorchAudio applications.
3
4
## Capabilities
5
6
### Backend Management
7
8
Control and query audio processing backends.
9
10
```python { .api }
11
def list_audio_backends() -> List[str]:
12
"""
13
List available audio backends.
14
15
Returns:
16
List[str]: Available backends (e.g., ["ffmpeg", "sox", "soundfile"])
17
"""
18
19
def get_audio_backend() -> Optional[str]:
20
"""
21
Get currently active audio backend.
22
23
Returns:
24
Optional[str]: Current backend name or None if using dispatcher mode
25
"""
26
27
def set_audio_backend(backend: Optional[str]) -> None:
28
"""
29
Set global audio backend.
30
31
Args:
32
backend: Backend name ("sox_io", "soundfile") or None to unset
33
34
Note:
35
This function is deprecated. Modern TorchAudio uses dispatcher mode
36
and automatically selects the best available backend.
37
"""
38
```
39
40
### Asset Management
41
42
Download and manage TorchAudio assets and example files.
43
44
```python { .api }
45
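def download_asset(key: str, hash: str = "", path: str = "", *, progress: bool = True) -> str:
    """
    Download a TorchAudio asset and store it on the local file system.

    If the file already exists at the download path, that path is returned
    without downloading again.

    Args:
        key: Asset identifier, including its folder prefix (e.g. "tutorial-assets/<name>.wav")
        hash: SHA256 hash of the asset; when provided, it is used to verify the downloaded or cached file
        path: Destination path; by default the asset is saved under the torch.hub directory
        progress: Whether to show a progress bar while downloading

    Returns:
        str: Path to downloaded file

    Examples:
        >>> # Download sample audio file
        >>> path = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.wav")
        >>> waveform, sr = torchaudio.load(path)

        >>> # Download tutorial data
        >>> path = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
    """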
64
```
65
66
### SoX Utilities
67
68
Integration with SoX (Sound eXchange) audio processing library.
69
70
```python { .api }
71
# SoX Effects Management
72
def init_sox_effects() -> None:
73
"""Initialize SoX effects library."""
74
75
def shutdown_sox_effects() -> None:
76
"""Shutdown SoX effects library and clean up resources."""
77
78
def effect_names() -> List[str]:
79
"""
80
Get list of available SoX effects.
81
82
Returns:
83
List[str]: Names of available SoX effects
84
"""
85
86
def apply_effects_tensor(tensor: torch.Tensor, sample_rate: int, effects: List[List[str]],
87
channels_first: bool = True) -> Tuple[torch.Tensor, int]:
88
"""
89
Apply SoX effects to tensor.
90
91
Args:
92
tensor: Input audio tensor
93
sample_rate: Sample rate of input
94
effects: List of effect chains (each effect is [name, *args])
95
channels_first: Whether tensor is (channels, time) or (time, channels)
96
97
Returns:
98
Tuple[torch.Tensor, int]: (processed_tensor, output_sample_rate)
99
100
Examples:
101
>>> # Apply reverb and normalize
102
>>> effects = [
103
... ["reverb", "50"],
104
... ["norm", "-1"]
105
... ]
106
>>> processed, sr = apply_effects_tensor(waveform, 44100, effects)
107
"""
108
109
def apply_effects_file(path: str, effects: List[List[str]], normalize: bool = True,
110
channels_first: bool = True, format: Optional[str] = None) -> Tuple[torch.Tensor, int]:
111
"""
112
Apply SoX effects to audio file.
113
114
Args:
115
path: Path to input audio file
116
effects: List of effect chains
117
normalize: Whether to normalize output
118
channels_first: Whether to return (channels, time) format
119
format: Input format override
120
121
Returns:
122
Tuple[torch.Tensor, int]: (processed_tensor, sample_rate)
123
"""
124
```
125
126
### SoX Utilities Module
127
128
Detailed SoX integration utilities.
129
130
```python { .api }
131
# In torchaudio.utils.sox_utils module
132
def list_effects() -> Dict[str, str]:
    """List all available SoX effects, mapping each effect name to its usage string."""
134
135
def list_read_formats() -> List[str]:
136
"""List audio formats that SoX can read."""
137
138
def list_write_formats() -> List[str]:
139
"""List audio formats that SoX can write."""
140
141
def get_buffer_size() -> int:
142
"""Get SoX internal buffer size."""
143
144
def set_buffer_size(buffer_size: int) -> None:
145
"""Set SoX internal buffer size."""
146
147
def get_verbosity() -> int:
148
"""Get SoX verbosity level."""
149
150
def set_verbosity(verbosity: int) -> None:
151
"""Set SoX verbosity level."""
152
```
153
154
### FFmpeg Utilities
155
156
Integration with FFmpeg media processing framework.
157
158
```python { .api }
159
# In torchaudio.utils.ffmpeg_utils module (from torio)
160
def get_versions() -> Dict[str, Tuple[int]]:
    """Get the versions of the linked FFmpeg libraries (library name -> version tuple)."""

def get_demuxers() -> Dict[str, str]:
    """Get the available demuxers (input formats), mapping short name to long name."""

def get_muxers() -> Dict[str, str]:
    """Get the available muxers (output formats), mapping short name to long name."""

def get_audio_decoders() -> Dict[str, str]:
    """Get the available audio decoders, mapping short name to long name."""

def get_audio_encoders() -> Dict[str, str]:
    """Get the available audio encoders, mapping short name to long name."""

def get_video_decoders() -> Dict[str, str]:
    """Get the available video decoders, mapping short name to long name."""

def get_video_encoders() -> Dict[str, str]:
    """Get the available video encoders, mapping short name to long name."""
180
```
181
182
### Kaldi I/O Integration
183
184
Functions for working with Kaldi ASR toolkit file formats.
185
186
```python { .api }
187
def read_vec_int_ark(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
188
"""
189
Read integer vector ark files.
190
191
Args:
192
file_or_fd: File path or file descriptor
193
194
Yields:
195
Tuple[str, torch.Tensor]: (utterance_id, vector)
196
"""
197
198
def read_vec_flt_ark(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
199
"""
200
Read float vector ark files.
201
202
Args:
203
file_or_fd: File path or file descriptor
204
205
Yields:
206
Tuple[str, torch.Tensor]: (utterance_id, vector)
207
"""
208
209
def read_vec_flt_scp(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
210
"""
211
Read float vector scp files.
212
213
Args:
214
file_or_fd: File path or file descriptor
215
216
Yields:
217
Tuple[str, torch.Tensor]: (utterance_id, vector)
218
"""
219
220
def read_mat_ark(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
221
"""
222
Read matrix ark files.
223
224
Args:
225
file_or_fd: File path or file descriptor
226
227
Yields:
228
Tuple[str, torch.Tensor]: (utterance_id, matrix)
229
"""
230
231
def read_mat_scp(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
232
"""
233
Read matrix scp files.
234
235
Args:
236
file_or_fd: File path or file descriptor
237
238
Yields:
239
Tuple[str, torch.Tensor]: (utterance_id, matrix)
240
"""
241
```
242
243
### Compliance Utilities
244
245
Compatibility functions for other audio processing libraries.
246
247
```python { .api }
248
# In torchaudio.compliance.kaldi module
249
def fbank(waveform: torch.Tensor, blackman_coeff: float = 0.42,
250
channel: int = -1, dither: float = 0.0, energy_floor: float = 1.0,
251
frame_length: float = 25.0, frame_shift: float = 10.0,
252
high_freq: float = 0.0, htk_compat: bool = False,
253
low_freq: float = 20.0, min_duration: float = 0.0,
254
num_mel_bins: int = 23, preemphasis_coefficient: float = 0.97,
255
raw_energy: bool = True, remove_dc_offset: bool = True,
256
round_to_power_of_two: bool = True, sample_frequency: float = 16000.0,
257
snip_edges: bool = True, subtract_mean: bool = False,
258
use_energy: bool = False, use_log_fbank: bool = True,
259
use_power: bool = True, vtln_high: float = -500.0,
260
vtln_low: float = 100.0, vtln_warp: float = 1.0,
261
window_type: str = "povey") -> torch.Tensor:
262
"""
263
Kaldi-compatible filter bank feature extraction.
264
265
Args:
266
waveform: Input waveform
267
(many Kaldi-specific parameters...)
268
269
Returns:
270
torch.Tensor: Filter bank features
271
"""
272
273
def mfcc(waveform: torch.Tensor, num_ceps: int = 13, **kwargs) -> torch.Tensor:
274
"""
275
Kaldi-compatible MFCC feature extraction.
276
277
Args:
278
waveform: Input waveform
279
num_ceps: Number of cepstral coefficients
280
**kwargs: Additional fbank parameters
281
282
Returns:
283
torch.Tensor: MFCC features
284
"""
285
286
def spectrogram(waveform: torch.Tensor, **kwargs) -> torch.Tensor:
287
"""Kaldi-compatible spectrogram computation."""
288
```
289
290
## Usage Examples
291
292
### Backend Configuration
293
294
```python
295
import torchaudio
296
297
# Ask torchaudio which I/O backends it can use on this machine
backends = torchaudio.list_audio_backends()
print(f"Available backends: {backends}")

# Query the globally selected backend; dispatcher mode reports None here
current = torchaudio.get_audio_backend()
print(f"Current backend: {current}")

# Historical note: older releases let you pin a backend by hand (now deprecated)
# torchaudio.set_audio_backend("sox_io")
307
```
308
309
### Asset Management
310
311
```python
312
import torchaudio
313
from torchaudio.utils import download_asset
314
315
# Download a sample clip. Asset keys are paths relative to the asset store
# root, so the file has to be addressed through its "tutorial-assets/" prefix.
audio_path = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.wav")
waveform, sample_rate = torchaudio.load(audio_path)

print(f"Downloaded sample: {audio_path}")
print(f"Audio shape: {waveform.shape}")
print(f"Sample rate: {sample_rate}")

# Download tutorial data
tutorial_path = download_asset(
    "tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
)
327
```
328
329
### SoX Effects Processing
330
331
```python
332
import torchaudio
333
from torchaudio.sox_effects import apply_effects_tensor, effect_names
334
335
# Discover which effect names the linked SoX library exposes
effects = effect_names()
effect_count = len(effects)
print(f"Available SoX effects: {effect_count}")
leading_effects = effects[:10]
print(f"First 10 effects: {leading_effects}")

# Load the clip the chain will be applied to
waveform, sample_rate = torchaudio.load("input.wav")

# Every entry is [effect_name, *arguments]; entries run in list order
effects_chain = [
    ["reverb", "50"],    # reverberation
    ["bass", "+5"],      # +5 dB bass
    ["treble", "+2"],    # +2 dB treble
    ["norm", "-1"],      # normalize to -1 dB
    ["rate", "44100"],   # resample to 44.1 kHz
]

# Run the chain on the in-memory tensor; the sample rate may change on the way
processed_waveform, new_sr = apply_effects_tensor(
    waveform,
    sample_rate,
    effects_chain,
)

# Persist the result at whatever rate the chain produced
torchaudio.save("processed.wav", processed_waveform, new_sr)
356
```
357
358
### Format Conversion Utility
359
360
```python
361
import torchaudio
362
from torchaudio.sox_effects import apply_effects_file
363
364
def convert_audio_file(
    input_path: str,
    output_path: str,
    target_sr: int = 44100,
    target_channels: int = 2,
):
    """Load ``input_path`` through a SoX effect chain and save it to ``output_path``.

    The chain sets the channel count, resamples, and normalizes to -1 dB, in
    that order. A short summary of the conversion is printed afterwards.

    Args:
        input_path: Audio file to read.
        output_path: Destination file handed to ``torchaudio.save``.
        target_sr: Sample rate requested from the ``rate`` effect.
        target_channels: Channel count requested from the ``channels`` effect.
    """
    # SoX takes every effect argument as a string
    channel_step = ["channels", str(target_channels)]
    resample_step = ["rate", str(target_sr)]
    normalize_step = ["norm", "-1"]

    # Decode the file and run the chain in one pass
    audio, out_rate = apply_effects_file(
        input_path, [channel_step, resample_step, normalize_step]
    )

    # Write the converted audio to the destination path
    torchaudio.save(output_path, audio, out_rate)

    print(f"Converted {input_path} -> {output_path}")
    # With the default channels-first layout, dim 0 is the channel count
    print(f"New format: {out_rate} Hz, {audio.shape[0]} channels")
381
382
# Convert various formats
383
convert_audio_file("input.mp3", "output.wav", target_sr=48000, target_channels=1)
384
```
385
386
### Kaldi Integration
387
388
```python
389
import torchaudio
390
from torchaudio.kaldi_io import read_mat_ark
391
392
# Read Kaldi archive files
393
def process_kaldi_features(ark_file: str):
    """Yield mean-normalized feature matrices from a Kaldi ark file.

    Args:
        ark_file: Path to a Kaldi matrix archive readable by ``read_mat_ark``.

    Yields:
        Tuple[str, torch.Tensor]: ``(utterance_id, normalized_features)``. The
        features keep the layout of the matrix read from the archive.
    """
    for utterance_id, feature_matrix in read_mat_ark(ark_file):
        print(f"Processing {utterance_id}: {feature_matrix.shape}")

        # sliding_window_cmn expects (..., time, freq). A Kaldi feature matrix
        # is already one row per frame, so it is passed through unchanged:
        # transposing it first would normalize along the wrong axis, and 2-D
        # input needs no extra batch dimension.
        processed = torchaudio.functional.sliding_window_cmn(feature_matrix)

        # Further processing...
        yield utterance_id, processed
409
410
# Process Kaldi ark file
411
# for utt_id, features in process_kaldi_features("features.ark"):
412
# # Process each utterance
413
# pass
414
```
415
416
### FFmpeg Capabilities Query
417
418
```python
419
from torchaudio.utils import ffmpeg_utils
420
421
# Check FFmpeg capabilities
# get_versions() reports the version of each linked FFmpeg library
print(f"FFmpeg version: {ffmpeg_utils.get_versions()}")

# The codec queries return a mapping keyed by codec short name; fetch each
# table once and reuse it for both the counts and the membership checks.
decoders = ffmpeg_utils.get_audio_decoders()
encoders = ffmpeg_utils.get_audio_encoders()
print(f"Audio decoders: {len(decoders)}")
print(f"Audio encoders: {len(encoders)}")

# Check specific codec support
print("Supported formats:")
print(f"MP3 decode: {'mp3' in decoders}")
print(f"AAC encode: {'aac' in encoders}")
print(f"FLAC support: {'flac' in decoders and 'flac' in encoders}")
434
```
435
436
These utilities provide essential infrastructure for audio processing applications, enabling integration with external libraries, format handling, and system configuration.