An audio package for PyTorch providing GPU-accelerated audio I/O operations, signal processing transforms, and machine learning utilities for audio data.
Core functionality for loading, saving, and managing audio files with support for multiple backends and formats. TorchAudio provides a unified interface that works across different audio backends (FFmpeg, SoX, SoundFile) while maintaining consistent behavior and PyTorch tensor integration.
Load audio files into PyTorch tensors with control over format, channel layout, and data windowing.
def load(filepath: str, frame_offset: int = 0, num_frames: int = -1,
normalize: bool = True, channels_first: bool = True,
format: Optional[str] = None) -> Tuple[torch.Tensor, int]:
"""
Load audio file into tensor.
Args:
filepath: Path to audio file
frame_offset: Number of frames to skip at beginning
num_frames: Number of frames to load (-1 for all)
normalize: Whether to convert integer samples to float32 scaled to the [-1, 1] range (sample-type conversion, not volume normalization)
channels_first: Whether to return shape (channels, time) or (time, channels)
format: Audio format override (auto-detected if None)
Returns:
Tuple of (waveform tensor, sample_rate)
- waveform: Audio data as tensor with shape (channels, samples) if channels_first=True
- sample_rate: Sample rate in Hz
"""

Usage example:
import torchaudio
# Load entire audio file
waveform, sample_rate = torchaudio.load("speech.wav")
print(f"Shape: {waveform.shape}, Sample rate: {sample_rate}")
# Load specific segment (1 second starting at 2 seconds, assuming a 16 kHz file)
segment, sr = torchaudio.load("speech.wav", frame_offset=2*16000, num_frames=16000)
# Load with different channel ordering
waveform_tcf, sr = torchaudio.load("speech.wav", channels_first=False) # (time, channels)

Save PyTorch tensors as audio files with format control and compression options.
def save(filepath: str, src: torch.Tensor, sample_rate: int,
channels_first: bool = True, compression: Optional[float] = None) -> None:
"""
Save tensor as audio file.
Args:
filepath: Output path (format determined by extension)
src: Audio tensor to save
sample_rate: Sample rate in Hz
channels_first: Whether input tensor has shape (channels, time) or (time, channels)
compression: Compression level (format-dependent, None for default)
"""

Usage example:
import torch
import torchaudio
# Create synthetic audio
sample_rate = 16000
duration = 3 # 3 seconds
t = torch.linspace(0, duration, int(sample_rate * duration))
waveform = torch.sin(2 * torch.pi * 440 * t).unsqueeze(0) # 440 Hz sine wave
# Save in different formats
torchaudio.save("output.wav", waveform, sample_rate)
torchaudio.save("output.mp3", waveform, sample_rate, compression=128) # 128 kbps
torchaudio.save("output.flac", waveform, sample_rate)

Extract metadata from audio files without loading the full audio data.
def info(filepath: str, format: Optional[str] = None) -> AudioMetaData:
"""
Get audio file metadata.
Args:
filepath: Path to audio file
format: Audio format override (auto-detected if None)
Returns:
AudioMetaData object with file information
"""
class AudioMetaData:
"""Audio file metadata container."""
sample_rate: int # Sample rate in Hz
num_frames: int # Total number of audio frames
num_channels: int # Number of audio channels
bits_per_sample: int # Bits per sample (bit depth)
encoding: str # Audio encoding format

Usage example:
import torchaudio
# Get file info without loading audio
metadata = torchaudio.info("audio.wav")
print(f"Duration: {metadata.num_frames / metadata.sample_rate:.2f} seconds")
print(f"Channels: {metadata.num_channels}")
print(f"Sample rate: {metadata.sample_rate} Hz")
print(f"Encoding: {metadata.encoding}")
print(f"Bit depth: {metadata.bits_per_sample}")

Advanced loading and saving using TorchCodec backend for additional format support and streaming capabilities.
def load_with_torchcodec(filepath: str, **kwargs) -> Tuple[torch.Tensor, int]:
"""
Load audio using TorchCodec backend.
Args:
filepath: Path to audio file
**kwargs: Additional TorchCodec-specific options
Returns:
Tuple of (waveform tensor, sample_rate)
"""
def save_with_torchcodec(filepath: str, src: torch.Tensor, sample_rate: int, **kwargs) -> None:
"""
Save audio using TorchCodec backend.
Args:
filepath: Output path
src: Audio tensor to save
sample_rate: Sample rate in Hz
**kwargs: Additional TorchCodec-specific options
"""

Control which audio backend is used for I/O operations across TorchAudio.
def list_audio_backends() -> List[str]:
"""
List available audio backends.
Returns:
List of the backend names available in this installation; possible values are "ffmpeg", "sox", "soundfile"
"""
def get_audio_backend() -> Optional[str]:
"""
Get currently active audio backend.
Returns:
Backend name or None if using dispatcher mode
"""
def set_audio_backend(backend: Optional[str]) -> None:
"""
Set global audio backend.
Args:
backend: Backend name ("sox_io", "soundfile") or None to unset
Note:
This function is deprecated; when dispatcher mode is enabled it has no effect.
Modern TorchAudio automatically selects the best backend.
"""

Usage example:
import torchaudio
# Check available backends
backends = torchaudio.list_audio_backends()
print(f"Available backends: {backends}")
# Check current backend (returns None in dispatcher mode)
current = torchaudio.get_audio_backend()
print(f"Current backend: {current}")

TorchAudio supports a wide variety of audio formats through its multiple backends (FFmpeg, SoX, SoundFile).

Common exceptions when working with audio I/O:
import torchaudio
try:
waveform, sr = torchaudio.load("nonexistent.wav")
except FileNotFoundError:
print("Audio file not found")
try:
waveform, sr = torchaudio.load("corrupted.wav")
except RuntimeError as e:
print(f"Failed to load audio: {e}")
try:
torchaudio.save("readonly/output.wav", waveform, sr)
except PermissionError:
print("Cannot write to readonly directory")

Install with Tessl CLI
npx tessl i tessl/pypi-torchaudio