An audio package for PyTorch providing GPU-accelerated audio I/O operations, signal processing transforms, and machine learning utilities for audio data.
Ready-to-use neural network models for speech recognition, synthesis, and source separation. TorchAudio provides implementations of state-of-the-art models along with factory functions for creating pre-trained instances.
Neural networks for automatic speech recognition and speech representation learning.
class Wav2Vec2Model(torch.nn.Module):
    """Wav2Vec2 model for speech representation learning.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    def __init__(self, feature_extractor: torch.nn.Module, encoder: torch.nn.Module,
                 aux: Optional[torch.nn.Module] = None) -> None:
        """
        Args:
            feature_extractor: CNN feature extractor
            encoder: Transformer encoder
            aux: Auxiliary output layer (for fine-tuned models)
        """

    def forward(self, waveforms: torch.Tensor,
                lengths: Optional[torch.Tensor] = None) -> Wav2Vec2ModelOutput:
        """
        Args:
            waveforms: Input audio (..., time)
            lengths: Length of each sequence in batch
        Returns:
            Wav2Vec2ModelOutput with last_hidden_state, extract_features, etc.
        """
# Factory functions for Wav2Vec2 model variants (API stubs — bodies omitted).
def wav2vec2_model(arch: str, num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create Wav2Vec2 model with specified architecture."""


def wav2vec2_base(num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create base Wav2Vec2 model (12 layers, 768 dim)."""


def wav2vec2_large(num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create large Wav2Vec2 model (24 layers, 1024 dim)."""


def wav2vec2_large_lv60k(num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create large Wav2Vec2 model pre-trained on Libri-Light."""


def wav2vec2_xlsr_300m(num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create XLSR-53 300M parameter multilingual model."""


def wav2vec2_xlsr_1b(num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create XLSR-53 1B parameter multilingual model."""


def wav2vec2_xlsr_2b(num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create XLSR-53 2B parameter multilingual model."""
class HuBERTPretrainModel(torch.nn.Module):
    """HuBERT model for self-supervised speech representation learning.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    def __init__(self, feature_extractor: torch.nn.Module, encoder: torch.nn.Module,
                 final_proj: torch.nn.Module, label_embs_concat: torch.nn.Module,
                 mask_generator: torch.nn.Module, logit_temp: float) -> None:
        """
        Args:
            feature_extractor: CNN feature extractor
            encoder: Transformer encoder
            final_proj: Final projection layer
            label_embs_concat: Label embedding concatenation
            mask_generator: Mask generator for pre-training
            logit_temp: Temperature for logits
        """

    def forward(self, waveforms: torch.Tensor, labels: Optional[torch.Tensor] = None,
                audio_lengths: Optional[torch.Tensor] = None) -> HuBERTPretrainModelOutput:
        """
        Args:
            waveforms: Input audio (..., time)
            labels: Target labels for pre-training
            audio_lengths: Length of each sequence
        Returns:
            HuBERTPretrainModelOutput with logits, features, etc.
        """
# Factory functions for HuBERT and WavLM variants (API stubs — bodies omitted).
def hubert_base(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create base HuBERT model."""


def hubert_large(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create large HuBERT model."""


def hubert_xlarge(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create extra-large HuBERT model."""


def hubert_pretrain_model(arch: str, aux_num_out: Optional[int] = None) -> HuBERTPretrainModel:
    """Create HuBERT pre-training model."""


def wavlm_model(arch: str, aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create WavLM model with specified architecture."""


def wavlm_base(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create base WavLM model."""


def wavlm_large(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create large WavLM model."""


# ---------------------------------------------------------------------------
# Traditional neural network architectures for speech recognition.
# ---------------------------------------------------------------------------
class DeepSpeech(torch.nn.Module):
    """DeepSpeech model for end-to-end speech recognition.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    def __init__(self, n_hidden: int, n_class: int) -> None:
        """
        Args:
            n_hidden: Number of hidden units in RNN layers
            n_class: Number of output classes (characters/phonemes)
        """

    def forward(self, x: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input features (..., freq, time)
            lengths: Length of each sequence
        Returns:
            Tensor: Logits over character classes (..., time, n_class)
        """
class Wav2Letter(torch.nn.Module):
    """Wav2Letter model for speech recognition.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    def __init__(self, num_classes: int, input_type: str = "waveform",
                 num_features: Optional[int] = None, num_hidden: int = 1000) -> None:
        """
        Args:
            num_classes: Number of output classes
            input_type: Type of input ("waveform" or "features")
            num_features: Number of input features (required if input_type="features")
            num_hidden: Number of hidden units
        """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input tensor (waveform or features)
        Returns:
            Tensor: Class probabilities
        """


# ---------------------------------------------------------------------------
# Neural transducer models for streaming speech recognition.
# ---------------------------------------------------------------------------
class RNNT(torch.nn.Module):
    """RNN-Transducer model for streaming speech recognition.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    def __init__(self, transcriber: torch.nn.Module, predictor: torch.nn.Module,
                 joiner: torch.nn.Module) -> None:
        """
        Args:
            transcriber: Encoder network (processes audio features)
            predictor: Decoder network (processes previous predictions)
            joiner: Joint network (combines encoder and decoder outputs)
        """

    def forward(self, sources: torch.Tensor, source_lengths: torch.Tensor,
                targets: torch.Tensor,
                target_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Args:
            sources: Input audio features (batch, time, feature_dim)
            source_lengths: Length of each audio sequence
            targets: Target token sequences (batch, target_time)
            target_lengths: Length of each target sequence
        Returns:
            Tuple of (transcriber_out, predictor_out, joiner_out)
        """
class Conformer(torch.nn.Module):
    """Conformer model combining CNN and self-attention.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    def __init__(self, input_dim: int, num_heads: int, ffn_dim: int, num_layers: int,
                 depthwise_conv_kernel_size: int = 31, dropout: float = 0.1,
                 use_group_norm: bool = False, convolution_first: bool = False) -> None:
        """
        Args:
            input_dim: Input feature dimension
            num_heads: Number of attention heads
            ffn_dim: Feed-forward network dimension
            num_layers: Number of conformer layers
            depthwise_conv_kernel_size: Kernel size for depthwise convolution
            dropout: Dropout probability
            use_group_norm: Whether to use group normalization
            convolution_first: Whether to apply convolution before self-attention
        """

    def forward(self, input: torch.Tensor,
                lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            input: Input features (batch, time, feature_dim)
            lengths: Length of each sequence
        Returns:
            Tuple of (output, output_lengths)
        """
class Emformer(torch.nn.Module):
    """Emformer model for streaming applications.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    def __init__(self, input_dim: int, num_heads: int, ffn_dim: int, num_layers: int,
                 segment_length: int, left_context_length: int = 0,
                 right_context_length: int = 0, max_memory_size: int = 0,
                 weight_init_scale_strategy: str = "depthwise", tanh_on_mem: bool = False,
                 negative_inf: float = -1e8) -> None:
        """
        Args:
            input_dim: Input feature dimension
            num_heads: Number of attention heads
            ffn_dim: Feed-forward dimension
            num_layers: Number of layers
            segment_length: Length of each segment
            left_context_length: Left context length
            right_context_length: Right context length
            max_memory_size: Maximum memory size
            weight_init_scale_strategy: Weight initialization strategy
            tanh_on_mem: Whether to apply tanh on memory
            negative_inf: Negative infinity value for masking
        """

    def forward(self, input: torch.Tensor, lengths: torch.Tensor,
                mems: Optional[List[List[torch.Tensor]]] = None
                ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        """
        Args:
            input: Input features (batch, time, feature_dim)
            lengths: Length of each sequence
            mems: Previous memory states
        Returns:
            Tuple of (output, output_lengths, new_mems)
        """
# Factory functions for Emformer RNN-T models (API stubs — bodies omitted).
def emformer_rnnt_base(num_symbols: int) -> RNNT:
    """Create base Emformer RNN-T model."""


def emformer_rnnt_model(arch: str, num_symbols: int) -> RNNT:
    """Create Emformer RNN-T model with specified architecture."""


# ---------------------------------------------------------------------------
# Neural networks for text-to-speech synthesis and vocoding.
# ---------------------------------------------------------------------------
class Tacotron2(torch.nn.Module):
    """Tacotron2 model for text-to-speech synthesis.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    def __init__(self, mask_padding: bool = False, n_mels: int = 80,
                 n_frames_per_step: int = 1, n_characters: int = 188,
                 n_hidden: int = 1024, p_attention_dropout: float = 0.1,
                 p_decoder_dropout: float = 0.1, prenet_dim: int = 256,
                 postnet_embedding_dim: int = 512, postnet_kernel_size: int = 5,
                 postnet_n_convolutions: int = 5, postnet_dropout: float = 0.5,
                 attention_rnn_dim: int = 1024, attention_dim: int = 128,
                 attention_location_n_filters: int = 32, attention_location_kernel_size: int = 31,
                 encoder_embedding_dim: int = 512, encoder_n_convolutions: int = 3,
                 encoder_kernel_size: int = 5, encoder_dropout: float = 0.5,
                 decoder_rnn_dim: int = 1024, decoder_max_step: int = 2000,
                 gate_threshold: float = 0.5, p_teacher_forcing: float = 1.0,
                 decoder_dropout: float = 0.1, memory_dropout: float = 0.1) -> None:
        """
        Args:
            mask_padding: Whether to mask padding in loss computation
            n_mels: Number of mel frequency bins
            n_frames_per_step: Number of frames generated per step
            (additional parameters for model architecture configuration)
        """

    def forward(self, tokens: torch.Tensor, token_lengths: torch.Tensor,
                mel_specgram: Optional[torch.Tensor] = None,
                mel_specgram_lengths: Optional[torch.Tensor] = None
                ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Args:
            tokens: Input token sequences (batch, max_token_length)
            token_lengths: Length of each token sequence
            mel_specgram: Target mel spectrograms (for training)
            mel_specgram_lengths: Length of each mel spectrogram
        Returns:
            Tuple of (mel_outputs, mel_outputs_postnet, gate_outputs)
        """
class WaveRNN(torch.nn.Module):
    """WaveRNN vocoder for high-quality audio generation.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    def __init__(self, upsample_scales: List[int], n_classes: int, hop_length: int,
                 n_res_block: int = 10, n_rnn: int = 512, n_fc: int = 512,
                 kernel_size: int = 5, n_freq: int = 128, padding: int = 2) -> None:
        """
        Args:
            upsample_scales: Upsampling scales for each layer
            n_classes: Number of output classes (for mu-law quantization)
            hop_length: Hop length for upsampling
            n_res_block: Number of residual blocks
            n_rnn: RNN hidden dimension
            n_fc: Fully connected layer dimension
            kernel_size: Convolution kernel size
            n_freq: Number of frequency bins
            padding: Convolution padding
        """

    def forward(self, x: torch.Tensor, mels: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input audio sequence (batch, time)
            mels: Mel spectrogram conditioning (batch, freq, time)
        Returns:
            Tensor: Output logits (batch, time, n_classes)
        """


# ---------------------------------------------------------------------------
# Neural networks for separating mixed audio into individual sources.
# ---------------------------------------------------------------------------
class ConvTasNet(torch.nn.Module):
    """Convolutional Time-domain Audio Source Separation Network.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    def __init__(self, num_sources: int = 2, enc_kernel_size: int = 16,
                 enc_num_feats: int = 512, msk_kernel_size: int = 3,
                 msk_num_feats: int = 128, msk_num_hidden_feats: int = 512,
                 msk_num_layers: int = 8, msk_num_stacks: int = 3,
                 msk_activate: str = "sigmoid") -> None:
        """
        Args:
            num_sources: Number of sources to separate
            enc_kernel_size: Encoder kernel size
            enc_num_feats: Number of encoder features
            msk_kernel_size: Mask generator kernel size
            msk_num_feats: Number of mask features
            msk_num_hidden_feats: Number of hidden features in mask generator
            msk_num_layers: Number of layers in each stack
            msk_num_stacks: Number of stacks
            msk_activate: Activation function for masks
        """

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """
        Args:
            input: Mixed audio waveform (batch, time)
        Returns:
            Tensor: Separated sources (batch, num_sources, time)
        """


def conv_tasnet_base(num_sources: int) -> ConvTasNet:
    """Create base ConvTasNet model."""
class HDemucs(torch.nn.Module):
    """Hybrid Demucs model for music source separation.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    # NOTE(review): `multi_freqs` and `clone_kw` default to None, so they are
    # annotated Optional here; the original stub omitted Optional.
    def __init__(self, sources: List[str], audio_channels: int = 2, channels: int = 48,
                 growth: float = 2.0, nfft: int = 4096, wiener_iters: int = 0,
                 end_iters: int = 0, wiener_residual: bool = False, cac: bool = True,
                 depth: int = 6, rewrite: bool = True, hybrid: bool = True,
                 hybrid_old: bool = False, multi_freqs: Optional[List[int]] = None,
                 multi_freqs_depth: int = 2, freq_emb: Optional[int] = None,
                 emb_scale: int = 10, emb_smooth: bool = False,
                 kernel_size: int = 8, time_stride: int = 2, stride: int = 4,
                 context: int = 1, context_enc: int = 0, norm_starts: int = 4,
                 norm_groups: int = 4, dconv_mode: int = 1, dconv_depth: int = 2,
                 dconv_comp: int = 4, dconv_attn: int = 4, dconv_lstm: int = 4,
                 dconv_init: float = 1e-4, bottom_channels: int = 0,
                 clone_kw: Optional[Dict[str, Any]] = None, num_subbands: int = 1,
                 spec_complex: bool = True, segment_length: int = 4 * 10 * 44100) -> None:
        """
        Args:
            sources: List of source names to separate
            audio_channels: Number of audio channels
            channels: Base number of channels
            growth: Channel growth factor per layer
            nfft: FFT size for spectral branch
            wiener_iters: Number of Wiener filtering iterations
            (additional parameters for model configuration)
        """

    def forward(self, wav: torch.Tensor) -> torch.Tensor:
        """
        Args:
            wav: Input audio (batch, channels, time)
        Returns:
            Tensor: Separated sources (batch, sources, channels, time)
        """


def hdemucs_low() -> HDemucs:
    """Create low-complexity HDemucs model."""


def hdemucs_medium() -> HDemucs:
    """Create medium HDemucs model."""


def hdemucs_high() -> HDemucs:
    """Create high-quality HDemucs model."""


# ---------------------------------------------------------------------------
# Models for objective and subjective speech quality assessment.
# ---------------------------------------------------------------------------
class SquimObjective(torch.nn.Module):
    """SQUIM model for objective speech quality assessment.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    def __init__(self, encoder: torch.nn.Module, classifier: torch.nn.Module) -> None:
        """
        Args:
            encoder: Feature encoder network
            classifier: Quality prediction classifier
        """

    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveforms: Input audio (batch, time)
        Returns:
            Tensor: Quality scores (STOI, PESQ, SI-SDR)
        """
class SquimSubjective(torch.nn.Module):
    """SQUIM model for subjective speech quality assessment.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    def __init__(self, encoder: torch.nn.Module, classifier: torch.nn.Module) -> None:
        """
        Args:
            encoder: Feature encoder network
            classifier: Quality prediction classifier
        """

    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveforms: Input audio (batch, time)
        Returns:
            Tensor: Subjective quality scores (MOS)
        """
# Factory functions for SQUIM models (API stubs — bodies omitted).
def squim_objective_base() -> SquimObjective:
    """Create base SQUIM objective model."""


def squim_objective_model() -> SquimObjective:
    """Create SQUIM objective model."""


def squim_subjective_base() -> SquimSubjective:
    """Create base SQUIM subjective model."""


def squim_subjective_model() -> SquimSubjective:
    """Create SQUIM subjective model."""


# ---------------------------------------------------------------------------
# Utilities for decoding model outputs, particularly for
# sequence-to-sequence models.
# ---------------------------------------------------------------------------
class RNNTBeamSearch(torch.nn.Module):
    """Beam search decoder for RNN-Transducer models.

    NOTE: API stub — method bodies are intentionally omitted
    (interface reference only).
    """

    def __init__(self, model: RNNT, blank: int, temperature: float = 1.0,
                 hyp_sort_score: Optional[Callable] = None,
                 token_sort_score: Optional[Callable] = None) -> None:
        """
        Args:
            model: RNN-T model to decode
            blank: Blank token index
            temperature: Temperature for softmax
            hyp_sort_score: Function to score hypotheses
            token_sort_score: Function to score tokens
        """

    def forward(self, input: torch.Tensor, length: torch.Tensor, beam_width: int,
                max_symbol_per_frame: Optional[int] = None) -> List[List[Hypothesis]]:
        """
        Args:
            input: Input features (batch, time, feature_dim)
            length: Length of each sequence
            beam_width: Beam search width
            max_symbol_per_frame: Maximum symbols per frame
        Returns:
            List of hypotheses for each batch item
        """
class Hypothesis:
    """Hypothesis object for beam search.

    NOTE: API stub — the __init__ body is intentionally omitted
    (interface reference only); attribute annotations below describe
    the fields a populated instance carries.
    """

    def __init__(self, score: float, y_sequence: List[int], dec_state: List[List[torch.Tensor]],
                 lm_state: Optional[Any] = None, lm_score: Optional[torch.Tensor] = None,
                 tokens: Optional[torch.Tensor] = None, timestep: Optional[torch.Tensor] = None,
                 last_token: Optional[int] = None) -> None:
        """
        Args:
            score: Hypothesis score
            y_sequence: Sequence of predicted tokens
            dec_state: Decoder state
            lm_state: Language model state
            lm_score: Language model score
            tokens: Token probabilities
            timestep: Current timestep
            last_token: Last predicted token
        """

    # Declared attributes of a hypothesis.
    score: float
    y_sequence: List[int]
    dec_state: List[List[torch.Tensor]]
    lm_state: Optional[Any]
    lm_score: Optional[torch.Tensor]
    tokens: Optional[torch.Tensor]
    timestep: Optional[torch.Tensor]
    last_token: Optional[int]


# Usage example:
import torch
import torchaudio
from torchaudio.models import wav2vec2_base, Tacotron2

# Load pre-trained Wav2Vec2 model
model = wav2vec2_base(num_out=32)  # 32 output classes for character recognition
model.eval()

# Process audio with Wav2Vec2
waveform, sample_rate = torchaudio.load("speech.wav")
with torch.no_grad():
    features, lengths = model(waveform)  # Extract features
    logits = model.aux(features)  # Get classification logits

# Create Tacotron2 for TTS
tts_model = Tacotron2()
tts_model.eval()

# Synthesize speech (tokens would come from text processing)
tokens = torch.randint(0, 188, (1, 50))  # Random tokens for example
token_lengths = torch.tensor([50])
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, gate_outputs = tts_model(tokens, token_lengths)

# These models provide state-of-the-art capabilities for various audio
# processing tasks and can be used as building blocks for more complex
# applications.
Install with the Tessl CLI:
npx tessl i tessl/pypi-torchaudio