An audio package for PyTorch providing GPU-accelerated audio I/O operations, signal processing transforms, and machine learning utilities for audio data.
npx @tessl/cli install tessl/pypi-torchaudio@2.8.0

A comprehensive audio processing library for PyTorch that provides GPU-accelerated audio I/O operations, signal processing transforms, and machine learning utilities specifically designed for audio data. TorchAudio supports loading and saving various audio formats, offers dataloaders for common audio datasets, implements essential audio transforms, and provides pre-trained models for speech recognition, synthesis, and source separation.

pip install torchaudio

import torchaudio

Common import patterns:
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
from torchaudio.models import Wav2Vec2Model
from torchaudio.datasets import LIBRISPEECH

import torchaudio
import torch
# Load audio file
waveform, sample_rate = torchaudio.load("audio.wav")
print(f"Shape: {waveform.shape}, Sample rate: {sample_rate}")
# Apply spectrogram transform
spectrogram_transform = torchaudio.transforms.Spectrogram(
n_fft=1024,
hop_length=512
)
spectrogram = spectrogram_transform(waveform)
# Apply mel spectrogram
mel_transform = torchaudio.transforms.MelSpectrogram(
sample_rate=sample_rate,
n_mels=80
)
mel_spectrogram = mel_transform(waveform)
# Save processed audio
torchaudio.save("processed_audio.wav", waveform, sample_rate)

TorchAudio is built around several key architectural components:
This design ensures seamless integration with PyTorch's autograd system and enables end-to-end differentiable audio processing pipelines.
Core functionality for loading, saving, and managing audio files with support for multiple backends and formats. Includes metadata extraction and backend management.
def load(filepath: str, frame_offset: int = 0, num_frames: int = -1,
normalize: bool = True, channels_first: bool = True,
format: Optional[str] = None) -> Tuple[torch.Tensor, int]: ...
def save(filepath: str, src: torch.Tensor, sample_rate: int,
channels_first: bool = True, compression: Optional[float] = None) -> None: ...
def info(filepath: str, format: Optional[str] = None) -> AudioMetaData: ...
class AudioMetaData:
sample_rate: int
num_frames: int
num_channels: int
bits_per_sample: int
encoding: str

Extensive collection of stateless audio processing functions including spectral analysis, filtering, resampling, pitch manipulation, and advanced signal processing algorithms.
def spectrogram(waveform: torch.Tensor, pad: int = 0, window: torch.Tensor = None,
n_fft: int = 400, hop_length: Optional[int] = None,
win_length: Optional[int] = None, power: Optional[float] = 2.0,
normalized: bool = False) -> torch.Tensor: ...
def melscale_fbanks(n_freqs: int, f_min: float, f_max: float, n_mels: int,
sample_rate: int, norm: Optional[str] = None,
mel_scale: str = "htk") -> torch.Tensor: ...
def resample(waveform: torch.Tensor, orig_freq: int, new_freq: int,
resampling_method: str = "sinc_interp_hann") -> torch.Tensor: ...

PyTorch-compatible transform classes for building differentiable audio processing pipelines. Includes spectral transforms, data augmentation, and preprocessing transforms.
class Spectrogram(torch.nn.Module):
def __init__(self, n_fft: int = 400, win_length: Optional[int] = None,
hop_length: Optional[int] = None, pad: int = 0,
window_fn: Callable = torch.hann_window, power: Optional[float] = 2.0): ...
def forward(self, waveform: torch.Tensor) -> torch.Tensor: ...
class MelSpectrogram(torch.nn.Module):
def __init__(self, sample_rate: int = 16000, n_fft: int = 400,
win_length: Optional[int] = None, hop_length: Optional[int] = None,
f_min: float = 0., f_max: Optional[float] = None, n_mels: int = 128): ...
def forward(self, waveform: torch.Tensor) -> torch.Tensor: ...
class MFCC(torch.nn.Module):
def __init__(self, sample_rate: int = 16000, n_mfcc: int = 40,
dct_type: int = 2, norm: str = "ortho", log_mels: bool = False): ...
def forward(self, waveform: torch.Tensor) -> torch.Tensor: ...

Ready-to-use neural network models for speech recognition, synthesis, and source separation. Includes Wav2Vec2, HuBERT, Tacotron2, WaveRNN, and source separation models.
class Wav2Vec2Model(torch.nn.Module):
def __init__(self, feature_extractor: torch.nn.Module, encoder: torch.nn.Module): ...
def forward(self, waveforms: torch.Tensor) -> torch.Tensor: ...
def wav2vec2_base(num_out: Optional[int] = None) -> Wav2Vec2Model: ...
def wav2vec2_large(num_out: Optional[int] = None) -> Wav2Vec2Model: ...
class Tacotron2(torch.nn.Module):
def __init__(self, mask_padding: bool = False, n_mels: int = 80): ...
def forward(self, tokens: torch.Tensor, token_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: ...
class ConvTasNet(torch.nn.Module):
def __init__(self, num_sources: int = 2, enc_kernel_size: int = 16): ...
def forward(self, waveforms: torch.Tensor) -> torch.Tensor: ...

Pre-configured model bundles with preprocessing, inference, and post-processing for production-ready audio applications. Includes ASR, TTS, and source separation pipelines.
class Wav2Vec2Bundle:
def get_model(self) -> Wav2Vec2Model: ...
def get_labels(self) -> List[str]: ...
sample_rate: int
class Wav2Vec2ASRBundle(Wav2Vec2Bundle):
def get_model(self) -> Wav2Vec2Model: ...
def get_decoder(self) -> torch.nn.Module: ...
# Pre-trained bundle instances
WAV2VEC2_ASR_BASE_960H: Wav2Vec2ASRBundle
HUBERT_ASR_LARGE: Wav2Vec2ASRBundle
TACOTRON2_WAVERNN_CHAR_LJSPEECH: Tacotron2TTSBundle

Standard dataset loaders for common audio datasets with consistent interfaces and preprocessing. Supports speech recognition, synthesis, music analysis, and source separation datasets.
class LIBRISPEECH(torch.utils.data.Dataset):
def __init__(self, root: str, url: str = "train-clean-100",
folder_in_archive: str = "LibriSpeech", download: bool = False): ...
def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, int, int, int]: ...
class SPEECHCOMMANDS(torch.utils.data.Dataset):
def __init__(self, root: str, url: str = "speech_commands_v0.02",
folder_in_archive: str = "SpeechCommands", download: bool = False): ...
def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str, int]: ...
class LJSPEECH(torch.utils.data.Dataset):
def __init__(self, root: str, url: str = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2",
folder_in_archive: str = "LJSpeech-1.1", download: bool = False): ...
def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str]: ...

Advanced streaming capabilities for real-time audio processing, media encoding/decoding, and efficient handling of large audio files.
class StreamReader:
def __init__(self, src: str, format: Optional[str] = None, option: Optional[Dict[str, str]] = None): ...
def add_basic_audio_stream(self, frames_per_chunk: int, buffer_chunk_size: int = 3,
stream_index: Optional[int] = None, decoder: Optional[str] = None) -> int: ...
def process_packet(self, timeout: Optional[float] = None, backoff: float = 10.) -> int: ...
class StreamWriter:
def __init__(self, dst: str, format: Optional[str] = None, option: Optional[Dict[str, str]] = None): ...
def add_audio_stream(self, sample_rate: int, num_channels: int, format: str = "fltp",
encoder: Optional[str] = None, codec_config: Optional[CodecConfig] = None) -> int: ...
def write_audio_chunk(self, stream_index: int, chunk: torch.Tensor, pts: Optional[int] = None) -> None: ...

Comprehensive audio effects processing including filters, EQ, dynamic effects, and spatial audio processing capabilities.
def biquad(waveform: torch.Tensor, b0: float, b1: float, b2: float,
a0: float, a1: float, a2: float) -> torch.Tensor: ...
def lowpass_biquad(waveform: torch.Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> torch.Tensor: ...
def highpass_biquad(waveform: torch.Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> torch.Tensor: ...
def flanger(waveform: torch.Tensor, sample_rate: int, delay: float = 0.0,
depth: float = 2.0, regen: float = 0.0, width: float = 71.0) -> torch.Tensor: ...
def phaser(waveform: torch.Tensor, sample_rate: int, gain_in: float = 0.4,
gain_out: float = 0.74, delay_ms: float = 3.0, decay: float = 0.4) -> torch.Tensor: ...

Helper functions for audio file management, format conversion, backend configuration, and integration with other audio processing libraries.
def list_audio_backends() -> List[str]: ...
def get_audio_backend() -> Optional[str]: ...
def set_audio_backend(backend: Optional[str]) -> None: ...
def download_asset(filename: str, subfolder: str = "") -> str: ...
# SoX utilities
def init_sox_effects() -> None: ...
def shutdown_sox_effects() -> None: ...
def effect_names() -> List[str]: ...