Tessl Tile for pypi/torchaudio@2.8.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

audio-io.md datasets.md effects.md functional.md index.md models.md pipelines.md streaming.md transforms.md utils.md

docs/utils.md

0
# Utility Functions
1

2
Helper functions for audio file management, format conversion, backend configuration, and integration with other audio processing libraries. These utilities provide essential support functionality for TorchAudio applications.
3

4
## Capabilities
5

6
### Backend Management
7

8
Control and query audio processing backends.
9

10
```python { .api }
11
def list_audio_backends() -> List[str]:
12
    """
13
    List available audio backends.
14
    
15
    Returns:
16
        List[str]: Available backends (e.g., ["ffmpeg", "sox", "soundfile"])
17
    """
18

19
def get_audio_backend() -> Optional[str]:
20
    """
21
    Get currently active audio backend.
22
    
23
    Returns:
24
        Optional[str]: Current backend name or None if using dispatcher mode
25
    """
26

27
def set_audio_backend(backend: Optional[str]) -> None:
28
    """
29
    Set global audio backend.
30
    
31
    Args:
32
        backend: Backend name ("sox_io", "soundfile") or None to unset
33
        
34
    Note:
35
        This function is deprecated. Modern TorchAudio uses dispatcher mode
36
        and automatically selects the best available backend.
37
    """
38
```
39

40
### Asset Management
41

42
Download and manage TorchAudio assets and example files.
43

44
```python { .api }
45
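def download_asset(key: str, hash: str = "", path: str = "", *, progress: bool = True) -> str:
46
    """
47
    Download asset file from TorchAudio repository.
48
    
49
    Args:
50
        key: Asset key relative to the assets root (e.g. "tutorial-assets/<name>.wav")
51
        hash, path, progress: Optional SHA256 to verify the file, destination path override, and progress-bar toggle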
52
        
53
    Returns:
54
        str: Path to downloaded file
55
        
56
    Examples:
57
        >>> # Download sample audio file
58
        >>> path = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.wav")
59
        >>> waveform, sr = torchaudio.load(path)
60
        
61
        >>> # Download tutorial data
62
        >>> path = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
63
    """
64
```
65

66
### SoX Utilities
67

68
Integration with SoX (Sound eXchange) audio processing library.
69

70
```python { .api }
71
# SoX Effects Management
72
def init_sox_effects() -> None:
73
    """Initialize SoX effects library."""
74

75
def shutdown_sox_effects() -> None:
76
    """Shutdown SoX effects library and clean up resources."""
77

78
def effect_names() -> List[str]:
79
    """
80
    Get list of available SoX effects.
81
    
82
    Returns:
83
        List[str]: Names of available SoX effects
84
    """
85

86
def apply_effects_tensor(tensor: torch.Tensor, sample_rate: int, effects: List[List[str]],
87
                        channels_first: bool = True) -> Tuple[torch.Tensor, int]:
88
    """
89
    Apply SoX effects to tensor.
90
    
91
    Args:
92
        tensor: Input audio tensor
93
        sample_rate: Sample rate of input
94
        effects: List of effect chains (each effect is [name, *args])
95
        channels_first: Whether tensor is (channels, time) or (time, channels)
96
        
97
    Returns:
98
        Tuple[torch.Tensor, int]: (processed_tensor, output_sample_rate)
99
        
100
    Examples:
101
        >>> # Apply reverb and normalize
102
        >>> effects = [
103
        ...     ["reverb", "50"],
104
        ...     ["norm", "-1"]  
105
        ... ]
106
        >>> processed, sr = apply_effects_tensor(waveform, 44100, effects)
107
    """
108

109
def apply_effects_file(path: str, effects: List[List[str]], normalize: bool = True,
110
                      channels_first: bool = True, format: Optional[str] = None) -> Tuple[torch.Tensor, int]:
111
    """
112
    Apply SoX effects to audio file.
113
    
114
    Args:
115
        path: Path to input audio file
116
        effects: List of effect chains
117
        normalize: If True, return float32 samples scaled to [-1.0, 1.0] (sample-type conversion, not loudness normalization)
118
        channels_first: Whether to return (channels, time) format
119
        format: Input format override
120
        
121
    Returns:
122
        Tuple[torch.Tensor, int]: (processed_tensor, sample_rate)
123
    """
124
```
125

126
### SoX Utilities Module
127

128
Detailed SoX integration utilities.
129

130
```python { .api }
131
# In torchaudio.utils.sox_utils module
132
def list_effects() -> Dict[str, str]:
133
    """List all available SoX effects as a mapping from effect name to usage string."""
134

135
def list_read_formats() -> List[str]:
136
    """List audio formats that SoX can read."""
137

138
def list_write_formats() -> List[str]:
139
    """List audio formats that SoX can write."""
140

141
def get_buffer_size() -> int:
142
    """Get SoX internal buffer size."""
143

144
def set_buffer_size(buffer_size: int) -> None:
145
    """Set SoX internal buffer size."""
146

147
def get_verbosity() -> int:
148
    """Get SoX verbosity level."""
149

150
def set_verbosity(verbosity: int) -> None:
151
    """Set SoX verbosity level."""
152
```
153

154
### FFmpeg Utilities
155

156
Integration with FFmpeg media processing framework.
157

158
```python { .api }
159
# In torchaudio.utils.ffmpeg_utils module (from torio)
160
def get_versions() -> Dict[str, Tuple[int]]:
161
    """Get versions of the linked FFmpeg libraries (e.g. {"libavutil": (56, 22, 100)})."""
162

163
def get_supported_decoders() -> List[str]:
164
    """Get list of supported audio decoders."""
165

166
def get_supported_encoders() -> List[str]:
167
    """Get list of supported audio encoders."""
168

169
def get_supported_demuxers() -> List[str]:
170
    """Get list of supported demuxers (input formats)."""
171

172
def get_supported_muxers() -> List[str]:
173
    """Get list of supported muxers (output formats)."""
174

175
def get_audio_decoders() -> List[str]:
176
    """Get audio-specific decoders."""
177

178
def get_audio_encoders() -> List[str]:
179
    """Get audio-specific encoders."""
180
```
181

182
### Kaldi I/O Integration
183

184
Functions for working with Kaldi ASR toolkit file formats.
185

186
```python { .api }
187
def read_vec_int_ark(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
188
    """
189
    Read integer vector ark files.
190
    
191
    Args:
192
        file_or_fd: File path or file descriptor
193
        
194
    Yields:
195
        Tuple[str, torch.Tensor]: (utterance_id, vector)
196
    """
197

198
def read_vec_flt_ark(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
199
    """
200
    Read float vector ark files.
201
    
202
    Args:
203
        file_or_fd: File path or file descriptor
204
        
205
    Yields:
206
        Tuple[str, torch.Tensor]: (utterance_id, vector)
207
    """
208

209
def read_vec_flt_scp(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
210
    """
211
    Read float vector scp files.
212
    
213
    Args:
214
        file_or_fd: File path or file descriptor
215
        
216
    Yields:
217
        Tuple[str, torch.Tensor]: (utterance_id, vector)
218
    """
219

220
def read_mat_ark(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
221
    """
222
    Read matrix ark files.
223
    
224
    Args:
225
        file_or_fd: File path or file descriptor
226
        
227
    Yields:
228
        Tuple[str, torch.Tensor]: (utterance_id, matrix)
229
    """
230

231
def read_mat_scp(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
232
    """
233
    Read matrix scp files.
234
    
235
    Args:
236
        file_or_fd: File path or file descriptor
237
        
238
    Yields:
239
        Tuple[str, torch.Tensor]: (utterance_id, matrix)
240
    """
241
```
242

243
### Compliance Utilities
244

245
Compatibility functions for other audio processing libraries.
246

247
```python { .api }
248
# In torchaudio.compliance.kaldi module
249
def fbank(waveform: torch.Tensor, blackman_coeff: float = 0.42, 
250
          channel: int = -1, dither: float = 0.0, energy_floor: float = 1.0,
251
          frame_length: float = 25.0, frame_shift: float = 10.0,
252
          high_freq: float = 0.0, htk_compat: bool = False,
253
          low_freq: float = 20.0, min_duration: float = 0.0,
254
          num_mel_bins: int = 23, preemphasis_coefficient: float = 0.97,
255
          raw_energy: bool = True, remove_dc_offset: bool = True,
256
          round_to_power_of_two: bool = True, sample_frequency: float = 16000.0,
257
          snip_edges: bool = True, subtract_mean: bool = False,
258
          use_energy: bool = False, use_log_fbank: bool = True,
259
          use_power: bool = True, vtln_high: float = -500.0,
260
          vtln_low: float = 100.0, vtln_warp: float = 1.0,
261
          window_type: str = "povey") -> torch.Tensor:
262
    """
263
    Kaldi-compatible filter bank feature extraction.
264
    
265
    Args:
266
        waveform: Input waveform
267
        (many Kaldi-specific parameters...)
268
        
269
    Returns:
270
        torch.Tensor: Filter bank features
271
    """
272

273
def mfcc(waveform: torch.Tensor, num_ceps: int = 13, **kwargs) -> torch.Tensor:
274
    """
275
    Kaldi-compatible MFCC feature extraction.
276
    
277
    Args:
278
        waveform: Input waveform
279
        num_ceps: Number of cepstral coefficients
280
        **kwargs: Additional fbank parameters
281
        
282
    Returns:
283
        torch.Tensor: MFCC features
284
    """
285

286
def spectrogram(waveform: torch.Tensor, **kwargs) -> torch.Tensor:
287
    """Kaldi-compatible spectrogram computation."""
288
```
289

290
## Usage Examples
291

292
### Backend Configuration
293

294
```python
295
import torchaudio
296

297
# Check available backends
298
backends = torchaudio.list_audio_backends()
299
print(f"Available backends: {backends}")
300

301
# Check current backend (returns None in dispatcher mode)
302
current = torchaudio.get_audio_backend()
303
print(f"Current backend: {current}")
304

305
# In older versions, you could set backend manually:
306
# torchaudio.set_audio_backend("sox_io")  # Now deprecated
307
```
308

309
### Asset Management
310

311
```python
312
import torchaudio
313
from torchaudio.utils import download_asset
314

315
# Download sample audio file
316
audio_path = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.wav")
317
waveform, sample_rate = torchaudio.load(audio_path)
318

319
print(f"Downloaded sample: {audio_path}")
320
print(f"Audio shape: {waveform.shape}")
321
print(f"Sample rate: {sample_rate}")
322

323
# Download tutorial data
324
tutorial_path = download_asset(
325
    "tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
326
)
327
```
328

329
### SoX Effects Processing
330

331
```python
332
import torchaudio
333
from torchaudio.sox_effects import apply_effects_tensor, effect_names
334

335
# Check available effects
336
effects = effect_names()
337
print(f"Available SoX effects: {len(effects)}")
338
print(f"First 10 effects: {effects[:10]}")
339

340
# Apply effects chain
341
waveform, sample_rate = torchaudio.load("input.wav")
342

343
effects_chain = [
344
    ["reverb", "50"],           # Add reverb
345
    ["bass", "+5"],             # Boost bass by 5dB
346
    ["treble", "+2"],           # Boost treble by 2dB
347
    ["norm", "-1"],             # Normalize to -1dB
348
    ["rate", "44100"]           # Resample to 44.1kHz
349
]
350

351
processed_waveform, new_sr = apply_effects_tensor(
352
    waveform, sample_rate, effects_chain
353
)
354

355
torchaudio.save("processed.wav", processed_waveform, new_sr)
356
```
357

358
### Format Conversion Utility
359

360
```python
361
import torchaudio
362
from torchaudio.sox_effects import apply_effects_file
363

364
def convert_audio_file(input_path: str, output_path: str, 
365
                      target_sr: int = 44100, target_channels: int = 2):
366
    """Convert audio file format and properties."""
367
    
368
    effects = [
369
        ["channels", str(target_channels)],  # Convert to stereo/mono
370
        ["rate", str(target_sr)],           # Resample
371
        ["norm", "-1"]                      # Normalize
372
    ]
373
    
374
    # Apply effects and load
375
    waveform, sr = apply_effects_file(input_path, effects)
376
    
377
    # Save in new format
378
    torchaudio.save(output_path, waveform, sr)
379
    print(f"Converted {input_path} -> {output_path}")
380
    print(f"New format: {sr} Hz, {waveform.shape[0]} channels")
381

382
# Convert various formats
383
convert_audio_file("input.mp3", "output.wav", target_sr=48000, target_channels=1)
384
```
385

386
### Kaldi Integration
387

388
```python
389
import torchaudio
390
from torchaudio.kaldi_io import read_mat_ark
391

392
# Read Kaldi archive files
393
def process_kaldi_features(ark_file: str):
394
    """Process features from Kaldi ark file."""
395
    
396
    for utterance_id, feature_matrix in read_mat_ark(ark_file):
397
        print(f"Processing {utterance_id}: {feature_matrix.shape}")
398
        
399
        # Convert to PyTorch tensor and process
400
        features = feature_matrix  # Already a tensor
401
        
402
        # Apply processing (e.g., normalization, augmentation)
403
        processed = torchaudio.functional.sliding_window_cmn(
404
            features.unsqueeze(0)  # Add batch dim; Kaldi matrices are already (time, feature)
405
        ).squeeze(0)
406
        
407
        # Further processing...
408
        yield utterance_id, processed
409

410
# Process Kaldi ark file
411
# for utt_id, features in process_kaldi_features("features.ark"):
412
#     # Process each utterance
413
#     pass
414
```
415

416
### FFmpeg Capabilities Query
417

418
```python
419
from torchaudio.utils import ffmpeg_utils
420

421
# Check FFmpeg capabilities
422
print(f"FFmpeg version: {ffmpeg_utils.get_versions()}")
423
print(f"Audio decoders: {len(ffmpeg_utils.get_audio_decoders())}")
424
print(f"Audio encoders: {len(ffmpeg_utils.get_audio_encoders())}")
425

426
# Check specific codec support
427
decoders = ffmpeg_utils.get_audio_decoders()
428
encoders = ffmpeg_utils.get_audio_encoders()
429

430
print("Supported formats:")
431
print(f"MP3 decode: {'mp3' in decoders}")
432
print(f"AAC encode: {'aac' in encoders}")
433
print(f"FLAC support: {'flac' in decoders and 'flac' in encoders}")
434
```
435

436
These utilities provide essential infrastructure for audio processing applications, enabling integration with external libraries, format handling, and system configuration.

Version

Tile

Files

docs/utils.md

docs/utils.md