CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-torchaudio

An audio package for PyTorch providing GPU-accelerated audio I/O operations, signal processing transforms, and machine learning utilities for audio data.

Overview
Eval results
Files

docs/utils.md

Utility Functions

Helper functions for audio file management, format conversion, backend configuration, and integration with other audio processing libraries. These utilities provide essential support functionality for TorchAudio applications.

Capabilities

Backend Management

Control and query audio processing backends.

def list_audio_backends() -> List[str]:
    """
    List available audio backends.
    
    Returns:
        List[str]: Available backends (e.g., ["ffmpeg", "sox", "soundfile"])
    """

def get_audio_backend() -> Optional[str]:
    """
    Get currently active audio backend.
    
    Returns:
        Optional[str]: Current backend name or None if using dispatcher mode
    """

def set_audio_backend(backend: Optional[str]) -> None:
    """
    Set global audio backend.
    
    Args:
        backend: Backend name ("sox_io", "soundfile") or None to unset
        
    Note:
        This function is deprecated. Modern TorchAudio uses dispatcher mode
        and automatically selects the best available backend.
    """

Asset Management

Download and manage TorchAudio assets and example files.

def download_asset(filename: str, subfolder: str = "") -> str:
    """
    Download asset file from TorchAudio repository.

    NOTE(review): upstream torchaudio appears to document this function as
    ``download_asset(key, hash="", path="", *, progress=True)``, where ``key``
    is the asset path including its folder prefix (e.g. "tutorial-assets/...").
    The parameter names listed here may not match the installed release;
    confirm before relying on ``subfolder``.

    Args:
        filename: Name of file to download, including any folder prefix
            (both examples below use the "tutorial-assets/" prefix)
        subfolder: Subfolder within assets directory

    Returns:
        str: Path to downloaded file

    Examples:
        >>> # Download sample audio file
        >>> path = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.wav")
        >>> waveform, sr = torchaudio.load(path)

        >>> # Download tutorial data
        >>> path = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
    """

SoX Utilities

Integration with SoX (Sound eXchange) audio processing library.

# SoX Effects Management
def init_sox_effects() -> None:
    """Initialize SoX effects library."""

def shutdown_sox_effects() -> None:
    """Shutdown SoX effects library and clean up resources."""

def effect_names() -> List[str]:
    """
    Get list of available SoX effects.
    
    Returns:
        List[str]: Names of available SoX effects
    """

def apply_effects_tensor(tensor: torch.Tensor, sample_rate: int, effects: List[List[str]],
                        channels_first: bool = True) -> Tuple[torch.Tensor, int]:
    """
    Apply SoX effects to tensor.
    
    Args:
        tensor: Input audio tensor
        sample_rate: Sample rate of input
        effects: List of effect chains (each effect is [name, *args])
        channels_first: Whether tensor is (channels, time) or (time, channels)
        
    Returns:
        Tuple[torch.Tensor, int]: (processed_tensor, output_sample_rate)
        
    Examples:
        >>> # Apply reverb and normalize
        >>> effects = [
        ...     ["reverb", "50"],
        ...     ["norm", "-1"]  
        ... ]
        >>> processed, sr = apply_effects_tensor(waveform, 44100, effects)
    """

def apply_effects_file(path: str, effects: List[List[str]], normalize: bool = True,
                      channels_first: bool = True, format: Optional[str] = None) -> Tuple[torch.Tensor, int]:
    """
    Apply SoX effects to audio file.

    Args:
        path: Path to input audio file
        effects: List of effect chains (each effect is [name, *args])
        normalize: Whether to normalize output.
            NOTE(review): upstream docs describe this flag as sample-type
            conversion (native integer samples -> float32), not loudness
            normalization; use a ["norm", ...] effect for the latter.
            Confirm against the installed release.
        channels_first: Whether to return (channels, time) format
        format: Input format override

    Returns:
        Tuple[torch.Tensor, int]: (processed_tensor, sample_rate)
    """

SoX Utilities Module

Detailed SoX integration utilities.

# In torchaudio.utils.sox_utils module
def list_effects() -> List[str]:
    """List all available SoX effects."""

def list_read_formats() -> List[str]:
    """List audio formats that SoX can read."""

def list_write_formats() -> List[str]:
    """List audio formats that SoX can write."""

def get_buffer_size() -> int:
    """Get SoX internal buffer size."""

def set_buffer_size(buffer_size: int) -> None:
    """Set SoX internal buffer size."""

def get_verbosity() -> int:
    """Get SoX verbosity level."""

def set_verbosity(verbosity: int) -> None:
    """Set SoX verbosity level."""

FFmpeg Utilities

Integration with FFmpeg media processing framework.

# In torchaudio.utils.ffmpeg_utils module (from torio)
def get_ffmpeg_version() -> str:
    """Get FFmpeg version string.

    NOTE(review): upstream ``torchaudio.utils.ffmpeg_utils`` appears to expose
    ``get_versions()`` (per-library version information) rather than a
    function with this name; confirm against the installed release.
    """

def get_supported_decoders() -> List[str]:
    """Get list of supported audio decoders."""

def get_supported_encoders() -> List[str]:
    """Get list of supported audio encoders."""

def get_supported_demuxers() -> List[str]:
    """Get list of supported demuxers (input formats)."""

def get_supported_muxers() -> List[str]:
    """Get list of supported muxers (output formats)."""

def get_audio_decoders() -> List[str]:
    """Get audio-specific decoders."""

def get_audio_encoders() -> List[str]:
    """Get audio-specific encoders."""

Kaldi I/O Integration

Functions for working with Kaldi ASR toolkit file formats.

def read_vec_int_ark(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
    """
    Read integer vector ark files.
    
    Args:
        file_or_fd: File path or file descriptor
        
    Yields:
        Tuple[str, torch.Tensor]: (utterance_id, vector)
    """

def read_vec_flt_ark(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
    """
    Read float vector ark files.
    
    Args:
        file_or_fd: File path or file descriptor
        
    Yields:
        Tuple[str, torch.Tensor]: (utterance_id, vector)
    """

def read_vec_flt_scp(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
    """
    Read float vector scp files.
    
    Args:
        file_or_fd: File path or file descriptor
        
    Yields:
        Tuple[str, torch.Tensor]: (utterance_id, vector)
    """

def read_mat_ark(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
    """
    Read matrix ark files.
    
    Args:
        file_or_fd: File path or file descriptor
        
    Yields:
        Tuple[str, torch.Tensor]: (utterance_id, matrix)
    """

def read_mat_scp(file_or_fd: Any) -> Iterable[Tuple[str, torch.Tensor]]:
    """
    Read matrix scp files.
    
    Args:
        file_or_fd: File path or file descriptor
        
    Yields:
        Tuple[str, torch.Tensor]: (utterance_id, matrix)
    """

Compliance Utilities

Compatibility functions for other audio processing libraries.

# In torchaudio.compliance.kaldi module
def fbank(waveform: torch.Tensor, blackman_coeff: float = 0.42, 
          channel: int = -1, dither: float = 0.0, energy_floor: float = 1.0,
          frame_length: float = 25.0, frame_shift: float = 10.0,
          high_freq: float = 0.0, htk_compat: bool = False,
          low_freq: float = 20.0, min_duration: float = 0.0,
          num_mel_bins: int = 23, preemphasis_coefficient: float = 0.97,
          raw_energy: bool = True, remove_dc_offset: bool = True,
          round_to_power_of_two: bool = True, sample_frequency: float = 16000.0,
          snip_edges: bool = True, subtract_mean: bool = False,
          use_energy: bool = False, use_log_fbank: bool = True,
          use_power: bool = True, vtln_high: float = -500.0,
          vtln_low: float = 100.0, vtln_warp: float = 1.0,
          window_type: str = "povey") -> torch.Tensor:
    """
    Kaldi-compatible filter bank feature extraction.
    
    Args:
        waveform: Input waveform
        (many Kaldi-specific parameters...)
        
    Returns:
        torch.Tensor: Filter bank features
    """

def mfcc(waveform: torch.Tensor, num_ceps: int = 13, **kwargs) -> torch.Tensor:
    """
    Kaldi-compatible MFCC feature extraction.
    
    Args:
        waveform: Input waveform
        num_ceps: Number of cepstral coefficients
        **kwargs: Additional fbank parameters
        
    Returns:
        torch.Tensor: MFCC features
    """

def spectrogram(waveform: torch.Tensor, **kwargs) -> torch.Tensor:
    """Kaldi-compatible spectrogram computation."""

Usage Examples

Backend Configuration

import torchaudio

# Check available backends
backends = torchaudio.list_audio_backends()
print(f"Available backends: {backends}")

# Check current backend (returns None in dispatcher mode)
current = torchaudio.get_audio_backend()
print(f"Current backend: {current}")

# In older versions, you could set backend manually:
# torchaudio.set_audio_backend("sox_io")  # Now deprecated

Asset Management

import torchaudio
from torchaudio.utils import download_asset

# Asset keys are paths relative to the TorchAudio download root, so they must
# carry their folder prefix ("tutorial-assets/" for the tutorial samples).
# A bare file name does not resolve to an existing asset.
audio_path = download_asset("tutorial-assets/steam-train-whistle-daniel_simon.wav")
waveform, sample_rate = torchaudio.load(audio_path)

print(f"Downloaded sample: {audio_path}")
print(f"Audio shape: {waveform.shape}")
print(f"Sample rate: {sample_rate}")

# Download tutorial data
tutorial_path = download_asset(
    "tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav"
)

SoX Effects Processing

import torchaudio
from torchaudio.sox_effects import apply_effects_tensor, effect_names

# Check available effects
effects = effect_names()
print(f"Available SoX effects: {len(effects)}")
print(f"First 10 effects: {effects[:10]}")

# Apply effects chain
waveform, sample_rate = torchaudio.load("input.wav")

effects_chain = [
    ["reverb", "50"],           # Add reverb
    ["bass", "+5"],             # Boost bass by 5dB
    ["treble", "+2"],           # Boost treble by 2dB
    ["norm", "-1"],             # Normalize to -1dB
    ["rate", "44100"]           # Resample to 44.1kHz
]

processed_waveform, new_sr = apply_effects_tensor(
    waveform, sample_rate, effects_chain
)

torchaudio.save("processed.wav", processed_waveform, new_sr)

Format Conversion Utility

import torchaudio
from torchaudio.sox_effects import apply_effects_file

def convert_audio_file(input_path: str, output_path: str, 
                      target_sr: int = 44100, target_channels: int = 2):
    """Re-encode an audio file with a new channel count and sample rate.

    The input is decoded through a SoX effect chain that remixes the channels,
    resamples, and normalizes the level; the result is written to
    ``output_path`` and a short summary is printed.

    Args:
        input_path: Audio file to read.
        output_path: Destination file; passed unchanged to ``torchaudio.save``.
        target_sr: Sample rate requested from the "rate" effect.
        target_channels: Channel count requested from the "channels" effect.
    """
    # SoX takes every effect argument as a string; the steps run in this order.
    remix_step = ["channels", str(target_channels)]
    resample_step = ["rate", str(target_sr)]
    level_step = ["norm", "-1"]

    # Decode the file and run the whole chain in one pass.
    waveform, sr = apply_effects_file(
        input_path, [remix_step, resample_step, level_step]
    )

    # Write the converted audio, then report what was produced.
    torchaudio.save(output_path, waveform, sr)
    print(f"Converted {input_path} -> {output_path}")
    print(f"New format: {sr} Hz, {waveform.shape[0]} channels")

# Convert various formats
convert_audio_file("input.mp3", "output.wav", target_sr=48000, target_channels=1)

Kaldi Integration

import torchaudio
from torchaudio.kaldi_io import read_mat_ark

# Read Kaldi archive files
def process_kaldi_features(ark_file: str):
    """Yield mean-normalized feature matrices read from a Kaldi ark file.

    Args:
        ark_file: Path of the Kaldi matrix archive to read.

    Yields:
        Tuple of (utterance_id, processed), where ``processed`` is the
        sliding-window CMN of the utterance's feature matrix and has the same
        shape as the matrix read from the archive.
    """
    for utterance_id, feature_matrix in read_mat_ark(ark_file):
        print(f"Processing {utterance_id}: {feature_matrix.shape}")

        # Kaldi stores features as (num_frames, feature_dim), which is already
        # the (..., time, freq) layout sliding_window_cmn expects. Transposing
        # first would slide the normalization window along the feature axis
        # instead of along time, so only a batch dimension is added here.
        batched = feature_matrix.unsqueeze(0)
        processed = torchaudio.functional.sliding_window_cmn(batched).squeeze(0)

        # Further processing...
        yield utterance_id, processed

# Process Kaldi ark file
# for utt_id, features in process_kaldi_features("features.ark"):
#     # Process each utterance
#     pass

FFmpeg Capabilities Query

from torchaudio.utils import ffmpeg_utils

# Query the codec tables once and reuse them for every check below.
decoders = ffmpeg_utils.get_audio_decoders()
encoders = ffmpeg_utils.get_audio_encoders()

# Check FFmpeg capabilities.
# ffmpeg_utils has no single "version string" accessor; get_versions()
# reports the version of each linked FFmpeg library instead.
print(f"FFmpeg version: {ffmpeg_utils.get_versions()}")
print(f"Audio decoders: {len(decoders)}")
print(f"Audio encoders: {len(encoders)}")

# Check specific codec support (membership tests match on codec name)
print("Supported formats:")
print(f"MP3 decode: {'mp3' in decoders}")
print(f"AAC encode: {'aac' in encoders}")
print(f"FLAC support: {'flac' in decoders and 'flac' in encoders}")

These utilities provide essential infrastructure for audio processing applications, enabling integration with external libraries, format handling, and system configuration.

Install with Tessl CLI

npx tessl i tessl/pypi-torchaudio

docs

audio-io.md

datasets.md

effects.md

functional.md

index.md

models.md

pipelines.md

streaming.md

transforms.md

utils.md

tile.json