An audio package for PyTorch providing GPU-accelerated audio I/O operations, signal processing transforms, and machine learning utilities for audio data.
Pre-configured model bundles with preprocessing, inference, and post-processing for production-ready audio applications. Pipelines provide complete workflows for ASR, TTS, source separation, and speech quality assessment with pre-trained weights and consistent interfaces.
Base classes that provide common functionality for all pipeline bundles.
class Wav2Vec2Bundle:
    """Base bundle for Wav2Vec2 models.

    Groups a pre-trained model with the metadata needed to use it
    (label vocabulary and expected input sample rate). Method bodies
    are stubs in this API summary; concrete bundles supply the actual
    weights and vocabularies.
    """

    # Expected sample rate (Hz) for input audio fed to the model.
    sample_rate: int

    def get_model(self) -> "Wav2Vec2Model":
        """
        Get the Wav2Vec2 model.

        Returns:
            Wav2Vec2Model: Pre-trained model instance
        """

    def get_labels(self) -> List[str]:
        """
        Get the class labels.

        Returns:
            List[str]: List of class labels (characters, phonemes, etc.)
        """
class Wav2Vec2ASRBundle(Wav2Vec2Bundle):
    """Bundle for Wav2Vec2 automatic speech recognition models.

    Extends the base bundle with a decoder that converts the model's
    per-frame logits into text.
    """

    def get_model(self) -> "Wav2Vec2Model":
        """Get the fine-tuned ASR model."""

    def get_decoder(self) -> torch.nn.Module:
        """
        Get the decoder for converting logits to text.

        Returns:
            torch.nn.Module: Decoder module (e.g., CTC decoder)
        """
class Wav2Vec2FABundle(Wav2Vec2Bundle):
    """Bundle for Wav2Vec2 forced alignment models.

    Extends the base bundle with the token dictionary used to align
    transcripts against audio frames.
    """

    def get_model(self) -> "Wav2Vec2Model":
        """Get the forced alignment model."""

    def get_dict(self) -> Dict[str, int]:
        """
        Get the token dictionary for alignment.

        Returns:
            Dict[str, int]: Mapping from tokens to indices
        """
class Tacotron2TTSBundle:
    """Bundle for Tacotron2 text-to-speech synthesis.

    End-to-end TTS pipeline: text processor (text -> tokens), Tacotron2
    (tokens -> mel spectrogram), vocoder (mel spectrogram -> waveform).
    """

    # Sample rate (Hz) of the audio produced by the vocoder.
    sample_rate: int

    def get_tacotron2(self) -> "Tacotron2":
        """
        Get the Tacotron2 model.

        Returns:
            Tacotron2: Pre-trained synthesis model
        """

    def get_vocoder(self) -> torch.nn.Module:
        """
        Get the vocoder for converting mel spectrograms to audio.

        Returns:
            torch.nn.Module: Vocoder model (WaveRNN or Griffin-Lim)
        """

    def get_text_processor(self) -> torch.nn.Module:
        """
        Get the text processor for converting text to tokens.

        Returns:
            torch.nn.Module: Text processing pipeline
        """
class RNNTBundle:
    """Bundle for RNN-Transducer streaming ASR models.

    Groups the transducer model with a beam-search decoder and the
    token vocabulary used to map decoder output back to text.
    """

    # Expected sample rate (Hz) for input audio.
    sample_rate: int

    def get_model(self) -> "RNNT":
        """
        Get the RNN-T model.

        Returns:
            RNNT: Pre-trained RNN-Transducer model
        """

    def get_decoder(self) -> "RNNTBeamSearch":
        """
        Get the beam search decoder.

        Returns:
            RNNTBeamSearch: Configured beam search decoder
        """

    def get_tokens(self) -> List[str]:
        """
        Get the token vocabulary.

        Returns:
            List[str]: List of tokens (characters, subwords, etc.)
        """
class SourceSeparationBundle:
    """Bundle for source separation models.

    Groups a separation model with the names of the sources it
    produces, in the order of the model's output channels.
    """

    # Expected sample rate (Hz) for input audio.
    sample_rate: int

    def get_model(self) -> torch.nn.Module:
        """
        Get the source separation model.

        Returns:
            torch.nn.Module: Pre-trained separation model
        """

    def get_source_labels(self) -> List[str]:
        """
        Get the source labels.

        Returns:
            List[str]: Names of separated sources (e.g., ["vocals", "drums", "bass", "other"])
        """
class SquimObjectiveBundle:
    """Bundle for objective speech quality assessment.

    Wraps a SQUIM model that predicts reference-free objective quality
    metrics for a speech waveform.
    """

    # Expected sample rate (Hz) for input audio.
    sample_rate: int

    def get_model(self) -> "SquimObjective":
        """
        Get the SQUIM objective model.

        Returns:
            SquimObjective: Pre-trained quality assessment model
        """
class SquimSubjectiveBundle:
    """Bundle for subjective speech quality assessment.

    Wraps a SQUIM model that predicts subjective (listener-rated)
    quality scores for a speech waveform.
    """

    # Expected sample rate (Hz) for input audio.
    sample_rate: int

    def get_model(self) -> "SquimSubjective":
        """
        Get the SQUIM subjective model.

        Returns:
            SquimSubjective: Pre-trained quality assessment model
        """


# wav2vec 2.0 bundles: Self-supervised speech representation models
# trained on large-scale unlabeled audio.
# Base models (self-supervised representations)
WAV2VEC2_BASE: Wav2Vec2Bundle  # Base model (12 layers, 768 dim) trained on LibriSpeech
WAV2VEC2_LARGE: Wav2Vec2Bundle  # Large model (24 layers, 1024 dim) trained on LibriSpeech
WAV2VEC2_LARGE_LV60K: Wav2Vec2Bundle  # Large model trained on 60k hours of Libri-Light
# Cross-lingual models
WAV2VEC2_XLSR53: Wav2Vec2Bundle  # Cross-lingual model trained on 53 languages
WAV2VEC2_XLSR_300M: Wav2Vec2Bundle  # 300M parameter multilingual model
WAV2VEC2_XLSR_1B: Wav2Vec2Bundle  # 1B parameter multilingual model
WAV2VEC2_XLSR_2B: Wav2Vec2Bundle  # 2B parameter multilingual model
# Fine-tuned ASR models (English)
WAV2VEC2_ASR_BASE_10M: Wav2Vec2ASRBundle  # Base model fine-tuned on 10min LibriSpeech
WAV2VEC2_ASR_BASE_100H: Wav2Vec2ASRBundle  # Base model fine-tuned on 100h LibriSpeech
WAV2VEC2_ASR_BASE_960H: Wav2Vec2ASRBundle  # Base model fine-tuned on 960h LibriSpeech
WAV2VEC2_ASR_LARGE_10M: Wav2Vec2ASRBundle  # Large model fine-tuned on 10min LibriSpeech
WAV2VEC2_ASR_LARGE_100H: Wav2Vec2ASRBundle  # Large model fine-tuned on 100h LibriSpeech
WAV2VEC2_ASR_LARGE_960H: Wav2Vec2ASRBundle  # Large model fine-tuned on 960h LibriSpeech
WAV2VEC2_ASR_LARGE_LV60K_10M: Wav2Vec2ASRBundle  # LV60K model fine-tuned on 10min
WAV2VEC2_ASR_LARGE_LV60K_100H: Wav2Vec2ASRBundle  # LV60K model fine-tuned on 100h
WAV2VEC2_ASR_LARGE_LV60K_960H: Wav2Vec2ASRBundle  # LV60K model fine-tuned on 960h
# Multilingual ASR models (VoxPopuli)
VOXPOPULI_ASR_BASE_10K_EN: Wav2Vec2ASRBundle  # English ASR on VoxPopuli
VOXPOPULI_ASR_BASE_10K_ES: Wav2Vec2ASRBundle  # Spanish ASR on VoxPopuli
VOXPOPULI_ASR_BASE_10K_DE: Wav2Vec2ASRBundle  # German ASR on VoxPopuli
VOXPOPULI_ASR_BASE_10K_FR: Wav2Vec2ASRBundle  # French ASR on VoxPopuli
VOXPOPULI_ASR_BASE_10K_IT: Wav2Vec2ASRBundle  # Italian ASR on VoxPopuli

# HuBERT bundles: Self-supervised speech models using hidden unit BERT approach.
# Base HuBERT models
HUBERT_BASE: Wav2Vec2Bundle  # Base HuBERT model (12 layers, 768 dim)
HUBERT_LARGE: Wav2Vec2Bundle  # Large HuBERT model (24 layers, 1024 dim)
HUBERT_XLARGE: Wav2Vec2Bundle  # Extra-large HuBERT model (24 layers, 1280 dim)
# Fine-tuned ASR models
HUBERT_ASR_LARGE: Wav2Vec2ASRBundle  # Large HuBERT fine-tuned for ASR
HUBERT_ASR_XLARGE: Wav2Vec2ASRBundle  # XLarge HuBERT fine-tuned for ASR
# Forced alignment model
MMS_FA: Wav2Vec2FABundle  # Multilingual forced alignment model (Massively Multilingual Speech)

# WavLM bundles: Models trained for various speech processing tasks including speaker verification.
WAVLM_BASE: Wav2Vec2Bundle  # Base WavLM model
WAVLM_BASE_PLUS: Wav2Vec2Bundle  # Base WavLM model with additional training
WAVLM_LARGE: Wav2Vec2Bundle  # Large WavLM model

# Tacotron2 TTS bundles: Complete text-to-speech synthesis pipelines.
# Tacotron2 + Griffin-Lim vocoder
TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH: Tacotron2TTSBundle  # Character-based, Griffin-Lim vocoder
TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH: Tacotron2TTSBundle  # Phoneme-based, Griffin-Lim vocoder
# Tacotron2 + WaveRNN vocoder
TACOTRON2_WAVERNN_CHAR_LJSPEECH: Tacotron2TTSBundle  # Character-based, WaveRNN vocoder
TACOTRON2_WAVERNN_PHONE_LJSPEECH: Tacotron2TTSBundle  # Phoneme-based, WaveRNN vocoder

# RNN-T bundles: Streaming speech recognition models.
EMFORMER_RNNT_BASE_LIBRISPEECH: RNNTBundle  # Emformer-based RNN-T trained on LibriSpeech

# Source separation bundles: Models for separating mixed audio into individual sources.
# Speech separation
CONVTASNET_BASE_LIBRI2MIX: SourceSeparationBundle  # ConvTasNet trained on Libri2Mix dataset
# Music separation
HDEMUCS_HIGH_MUSDB: SourceSeparationBundle  # High-quality HDemucs trained on MUSDB18
HDEMUCS_HIGH_MUSDB_PLUS: SourceSeparationBundle  # HDemucs trained on MUSDB18-HQ with extra data

# SQUIM bundles: Models for evaluating speech quality and intelligibility.
SQUIM_OBJECTIVE: SquimObjectiveBundle  # Objective quality metrics (STOI, PESQ, SI-SDR)
SQUIM_SUBJECTIVE: SquimSubjectiveBundle  # Subjective quality metrics (MOS prediction)

# Example: speech recognition with a Wav2Vec2 ASR bundle.
import torch
import torchaudio
from torchaudio.pipelines import WAV2VEC2_ASR_BASE_960H

# Load bundle and models
bundle = WAV2VEC2_ASR_BASE_960H
model = bundle.get_model()
decoder = bundle.get_decoder()
labels = bundle.get_labels()

# Load audio and resample it to the rate the bundle expects
waveform, sample_rate = torchaudio.load("speech.wav")
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

# Run inference (no_grad: inference only, no backprop needed)
model.eval()
with torch.no_grad():
    emission, lengths = model(waveform)

# Decode logits to text: take the best hypothesis of the first batch item
# and map its token indices back through the label vocabulary.
transcripts = decoder(emission, lengths)
transcript = "".join([labels[i] for i in transcripts[0][0].tokens])
print(f"Transcript: {transcript}")

# Example: text-to-speech synthesis with Tacotron2 + WaveRNN.
import torch
import torchaudio
from torchaudio.pipelines import TACOTRON2_WAVERNN_CHAR_LJSPEECH

# Load bundle and models
bundle = TACOTRON2_WAVERNN_CHAR_LJSPEECH
tacotron2 = bundle.get_tacotron2()
vocoder = bundle.get_vocoder()
text_processor = bundle.get_text_processor()

# Process text to tokens
text = "Hello, this is a test of text-to-speech synthesis."
tokens, token_lengths = text_processor(text)

# Generate mel spectrogram. Use infer() — calling the model directly invokes
# the training forward pass, which requires ground-truth mel targets.
tacotron2.eval()
with torch.no_grad():
    mel_specgram, mel_lengths, _alignments = tacotron2.infer(tokens, token_lengths)

# Generate audio with the vocoder; WaveRNN also takes the mel lengths and
# returns (waveforms, waveform_lengths).
vocoder.eval()
with torch.no_grad():
    waveforms, waveform_lengths = vocoder(mel_specgram, mel_lengths)

# Save the first batch item as a (channels, time) tensor
torchaudio.save("synthesized.wav", waveforms[0:1], bundle.sample_rate)

# Example: music source separation with HDemucs.
import torch
import torchaudio
from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB

# Load bundle and model
bundle = HDEMUCS_HIGH_MUSDB
model = bundle.get_model()
source_labels = bundle.get_source_labels()  # ["drums", "bass", "other", "vocals"]

# Load audio and resample to the bundle's expected rate
waveform, sample_rate = torchaudio.load("mixed_music.wav")
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

# The model expects batched stereo input: (batch, channels, time)
if waveform.shape[0] == 1:
    waveform = waveform.repeat(2, 1)  # Convert mono to stereo
waveform = waveform.unsqueeze(0)  # Add batch dimension

# Separate sources
model.eval()
with torch.no_grad():
    sources = model(waveform)  # (1, sources, channels, time)

# Save each separated source; the index order matches source_labels
for i, source_name in enumerate(source_labels):
    source_audio = sources[0, i]  # (channels, time)
    torchaudio.save(f"separated_{source_name}.wav", source_audio, bundle.sample_rate)

# Example: objective speech quality assessment with SQUIM.
import torch
import torchaudio
from torchaudio.pipelines import SQUIM_OBJECTIVE

# Load bundle and model
bundle = SQUIM_OBJECTIVE
model = bundle.get_model()

# Load audio and resample to the bundle's expected rate
waveform, sample_rate = torchaudio.load("speech_sample.wav")
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

# Assess quality. The model returns one tensor per metric, each of
# shape (batch,), so unpack and extract scalars with .item() before
# applying a float format spec (formatting a tensor with :.3f fails).
model.eval()
with torch.no_grad():
    stoi, pesq, si_sdr = model(waveform)
print(f"STOI: {stoi[0].item():.3f}")  # Short-Time Objective Intelligibility
print(f"PESQ: {pesq[0].item():.3f}")  # Perceptual Evaluation of Speech Quality
print(f"SI-SDR: {si_sdr[0].item():.3f}")  # Scale-Invariant Signal-to-Distortion Ratio

# Example: multilingual feature extraction with XLSR.
import torch
import torchaudio
from torchaudio.pipelines import WAV2VEC2_XLSR53

# Load multilingual model
bundle = WAV2VEC2_XLSR53
model = bundle.get_model()

# Load audio in any supported language and resample to the expected rate
waveform, sample_rate = torchaudio.load("multilingual_speech.wav")
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

# Extract features (can be used for downstream tasks)
model.eval()
with torch.no_grad():
    features, lengths = model(waveform)

# Features can be used for language identification, ASR, etc.
print(f"Feature shape: {features.shape}")  # (batch, time, feature_dim)

# These pipelines provide production-ready solutions for common audio
# processing tasks, with pre-trained weights and optimized
# preprocessing/postprocessing workflows.
Install with the Tessl CLI:
npx tessl i tessl/pypi-torchaudio