An audio package for PyTorch providing GPU-accelerated audio I/O operations, signal processing transforms, and machine learning utilities for audio data.
Pre-configured model bundles with preprocessing, inference, and post-processing for production-ready audio applications. Pipelines provide complete workflows for ASR, TTS, source separation, and speech quality assessment with pre-trained weights and consistent interfaces.
Base classes that provide common functionality for all pipeline bundles.
class Wav2Vec2Bundle:
    """Base bundle for Wav2Vec2 models.

    Groups a pre-trained model with the metadata needed to use it
    (label vocabulary and expected input sample rate). Method bodies
    are stubs in this API summary; concrete bundles supply the actual
    weights and vocabularies.
    """

    # Expected sample rate (Hz) for input audio fed to the model.
    sample_rate: int

    def get_model(self) -> "Wav2Vec2Model":
        """
        Get the Wav2Vec2 model.

        Returns:
            Wav2Vec2Model: Pre-trained model instance
        """

    def get_labels(self) -> List[str]:
        """
        Get the class labels.

        Returns:
            List[str]: List of class labels (characters, phonemes, etc.)
        """
class Wav2Vec2ASRBundle(Wav2Vec2Bundle):
    """Bundle for Wav2Vec2 automatic speech recognition models.

    Extends the base bundle with a decoder that converts the model's
    per-frame logits into text.
    """

    def get_model(self) -> "Wav2Vec2Model":
        """Get the fine-tuned ASR model."""

    def get_decoder(self) -> torch.nn.Module:
        """
        Get the decoder for converting logits to text.

        Returns:
            torch.nn.Module: Decoder module (e.g., CTC decoder)
        """
class Wav2Vec2FABundle(Wav2Vec2Bundle):
    """Bundle for Wav2Vec2 forced alignment models.

    Extends the base bundle with the token dictionary used to align
    transcripts against audio frames.
    """

    def get_model(self) -> "Wav2Vec2Model":
        """Get the forced alignment model."""

    def get_dict(self) -> Dict[str, int]:
        """
        Get the token dictionary for alignment.

        Returns:
            Dict[str, int]: Mapping from tokens to indices
        """
class Tacotron2TTSBundle:
    """Bundle for Tacotron2 text-to-speech synthesis.

    End-to-end TTS pipeline: text processor (text -> tokens), Tacotron2
    (tokens -> mel spectrogram), vocoder (mel spectrogram -> waveform).
    """

    # Sample rate (Hz) of the audio produced by the vocoder.
    sample_rate: int

    def get_tacotron2(self) -> "Tacotron2":
        """
        Get the Tacotron2 model.

        Returns:
            Tacotron2: Pre-trained synthesis model
        """

    def get_vocoder(self) -> torch.nn.Module:
        """
        Get the vocoder for converting mel spectrograms to audio.

        Returns:
            torch.nn.Module: Vocoder model (WaveRNN or Griffin-Lim)
        """

    def get_text_processor(self) -> torch.nn.Module:
        """
        Get the text processor for converting text to tokens.

        Returns:
            torch.nn.Module: Text processing pipeline
        """
class RNNTBundle:
    """Bundle for RNN-Transducer streaming ASR models.

    Groups the transducer model with a beam-search decoder and the
    token vocabulary used to map decoder output back to text.
    """

    # Expected sample rate (Hz) for input audio.
    sample_rate: int

    def get_model(self) -> "RNNT":
        """
        Get the RNN-T model.

        Returns:
            RNNT: Pre-trained RNN-Transducer model
        """

    def get_decoder(self) -> "RNNTBeamSearch":
        """
        Get the beam search decoder.

        Returns:
            RNNTBeamSearch: Configured beam search decoder
        """

    def get_tokens(self) -> List[str]:
        """
        Get the token vocabulary.

        Returns:
            List[str]: List of tokens (characters, subwords, etc.)
        """
class SourceSeparationBundle:
    """Bundle for source separation models.

    Groups a separation model with the names of the sources it
    produces, in the order of the model's output channels.
    """

    # Expected sample rate (Hz) for input audio.
    sample_rate: int

    def get_model(self) -> torch.nn.Module:
        """
        Get the source separation model.

        Returns:
            torch.nn.Module: Pre-trained separation model
        """

    def get_source_labels(self) -> List[str]:
        """
        Get the source labels.

        Returns:
            List[str]: Names of separated sources (e.g., ["vocals", "drums", "bass", "other"])
        """
class SquimObjectiveBundle:
    """Bundle for objective speech quality assessment.

    Wraps a SQUIM model that predicts reference-free objective quality
    metrics for a speech waveform.
    """

    # Expected sample rate (Hz) for input audio.
    sample_rate: int

    def get_model(self) -> "SquimObjective":
        """
        Get the SQUIM objective model.

        Returns:
            SquimObjective: Pre-trained quality assessment model
        """
class SquimSubjectiveBundle:
    """Bundle for subjective speech quality assessment.

    Wraps a SQUIM model that predicts subjective (listener-rated)
    quality scores for a speech waveform.
    """

    # Expected sample rate (Hz) for input audio.
    sample_rate: int

    def get_model(self) -> "SquimSubjective":
        """
        Get the SQUIM subjective model.

        Returns:
            SquimSubjective: Pre-trained quality assessment model
        """


# wav2vec 2.0 bundles: Self-supervised speech representation models
# trained on large-scale unlabeled audio.
# Base models (self-supervised representations)
WAV2VEC2_BASE: Wav2Vec2Bundle  # Base model (12 layers, 768 dim) trained on LibriSpeech
WAV2VEC2_LARGE: Wav2Vec2Bundle  # Large model (24 layers, 1024 dim) trained on LibriSpeech
WAV2VEC2_LARGE_LV60K: Wav2Vec2Bundle  # Large model trained on 60k hours of Libri-Light
# Cross-lingual models
WAV2VEC2_XLSR53: Wav2Vec2Bundle  # Cross-lingual model trained on 53 languages
WAV2VEC2_XLSR_300M: Wav2Vec2Bundle  # 300M parameter multilingual model
WAV2VEC2_XLSR_1B: Wav2Vec2Bundle  # 1B parameter multilingual model
WAV2VEC2_XLSR_2B: Wav2Vec2Bundle  # 2B parameter multilingual model
# Fine-tuned ASR models (English)
WAV2VEC2_ASR_BASE_10M: Wav2Vec2ASRBundle  # Base model fine-tuned on 10min LibriSpeech
WAV2VEC2_ASR_BASE_100H: Wav2Vec2ASRBundle  # Base model fine-tuned on 100h LibriSpeech
WAV2VEC2_ASR_BASE_960H: Wav2Vec2ASRBundle  # Base model fine-tuned on 960h LibriSpeech
WAV2VEC2_ASR_LARGE_10M: Wav2Vec2ASRBundle  # Large model fine-tuned on 10min LibriSpeech
WAV2VEC2_ASR_LARGE_100H: Wav2Vec2ASRBundle  # Large model fine-tuned on 100h LibriSpeech
WAV2VEC2_ASR_LARGE_960H: Wav2Vec2ASRBundle  # Large model fine-tuned on 960h LibriSpeech
WAV2VEC2_ASR_LARGE_LV60K_10M: Wav2Vec2ASRBundle  # LV60K model fine-tuned on 10min
WAV2VEC2_ASR_LARGE_LV60K_100H: Wav2Vec2ASRBundle  # LV60K model fine-tuned on 100h
WAV2VEC2_ASR_LARGE_LV60K_960H: Wav2Vec2ASRBundle  # LV60K model fine-tuned on 960h
# Multilingual ASR models (VoxPopuli)
VOXPOPULI_ASR_BASE_10K_EN: Wav2Vec2ASRBundle  # English ASR on VoxPopuli
VOXPOPULI_ASR_BASE_10K_ES: Wav2Vec2ASRBundle  # Spanish ASR on VoxPopuli
VOXPOPULI_ASR_BASE_10K_DE: Wav2Vec2ASRBundle  # German ASR on VoxPopuli
VOXPOPULI_ASR_BASE_10K_FR: Wav2Vec2ASRBundle  # French ASR on VoxPopuli
VOXPOPULI_ASR_BASE_10K_IT: Wav2Vec2ASRBundle  # Italian ASR on VoxPopuli

# HuBERT bundles: Self-supervised speech models using hidden unit BERT approach.
# Base HuBERT models
HUBERT_BASE: Wav2Vec2Bundle  # Base HuBERT model (12 layers, 768 dim)
HUBERT_LARGE: Wav2Vec2Bundle  # Large HuBERT model (24 layers, 1024 dim)
HUBERT_XLARGE: Wav2Vec2Bundle  # Extra-large HuBERT model (24 layers, 1280 dim)
# Fine-tuned ASR models
HUBERT_ASR_LARGE: Wav2Vec2ASRBundle  # Large HuBERT fine-tuned for ASR
HUBERT_ASR_XLARGE: Wav2Vec2ASRBundle  # XLarge HuBERT fine-tuned for ASR
# Forced alignment model
MMS_FA: Wav2Vec2FABundle  # Multilingual forced alignment model (Massively Multilingual Speech)

# WavLM bundles: Models trained for various speech processing tasks including speaker verification.
WAVLM_BASE: Wav2Vec2Bundle  # Base WavLM model
WAVLM_BASE_PLUS: Wav2Vec2Bundle  # Base WavLM model with additional training
WAVLM_LARGE: Wav2Vec2Bundle  # Large WavLM model

# Tacotron2 TTS bundles: Complete text-to-speech synthesis pipelines.
# Tacotron2 + Griffin-Lim vocoder
TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH: Tacotron2TTSBundle  # Character-based, Griffin-Lim vocoder
TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH: Tacotron2TTSBundle  # Phoneme-based, Griffin-Lim vocoder
# Tacotron2 + WaveRNN vocoder
TACOTRON2_WAVERNN_CHAR_LJSPEECH: Tacotron2TTSBundle  # Character-based, WaveRNN vocoder
TACOTRON2_WAVERNN_PHONE_LJSPEECH: Tacotron2TTSBundle  # Phoneme-based, WaveRNN vocoder

# RNN-T bundles: Streaming speech recognition models.
EMFORMER_RNNT_BASE_LIBRISPEECH: RNNTBundle  # Emformer-based RNN-T trained on LibriSpeech

# Source separation bundles: Models for separating mixed audio into individual sources.
# Speech separation
CONVTASNET_BASE_LIBRI2MIX: SourceSeparationBundle  # ConvTasNet trained on Libri2Mix dataset
# Music separation
HDEMUCS_HIGH_MUSDB: SourceSeparationBundle  # High-quality HDemucs trained on MUSDB18
HDEMUCS_HIGH_MUSDB_PLUS: SourceSeparationBundle  # HDemucs trained on MUSDB18-HQ with extra data

# SQUIM bundles: Models for evaluating speech quality and intelligibility.
SQUIM_OBJECTIVE: SquimObjectiveBundle  # Objective quality metrics (STOI, PESQ, SI-SDR)
SQUIM_SUBJECTIVE: SquimSubjectiveBundle  # Subjective quality metrics (MOS prediction)

# Example: speech recognition with a Wav2Vec2 ASR bundle.
import torch
import torchaudio
from torchaudio.pipelines import WAV2VEC2_ASR_BASE_960H

# Load bundle and models
bundle = WAV2VEC2_ASR_BASE_960H
model = bundle.get_model()
decoder = bundle.get_decoder()
labels = bundle.get_labels()

# Load audio and resample it to the rate the bundle expects
waveform, sample_rate = torchaudio.load("speech.wav")
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

# Run inference (no_grad: inference only, no backprop needed)
model.eval()
with torch.no_grad():
    emission, lengths = model(waveform)

# Decode logits to text: take the best hypothesis of the first batch item
# and map its token indices back through the label vocabulary.
transcripts = decoder(emission, lengths)
transcript = "".join([labels[i] for i in transcripts[0][0].tokens])
print(f"Transcript: {transcript}")

# Example: text-to-speech synthesis with Tacotron2 + WaveRNN.
import torch
import torchaudio
from torchaudio.pipelines import TACOTRON2_WAVERNN_CHAR_LJSPEECH

# Load bundle and models
bundle = TACOTRON2_WAVERNN_CHAR_LJSPEECH
tacotron2 = bundle.get_tacotron2()
vocoder = bundle.get_vocoder()
text_processor = bundle.get_text_processor()

# Process text to tokens
text = "Hello, this is a test of text-to-speech synthesis."
tokens, token_lengths = text_processor(text)

# Generate mel spectrogram. Use infer() — calling the model directly invokes
# the training forward pass, which requires ground-truth mel targets.
tacotron2.eval()
with torch.no_grad():
    mel_specgram, mel_lengths, _alignments = tacotron2.infer(tokens, token_lengths)

# Generate audio with the vocoder; WaveRNN also takes the mel lengths and
# returns (waveforms, waveform_lengths).
vocoder.eval()
with torch.no_grad():
    waveforms, waveform_lengths = vocoder(mel_specgram, mel_lengths)

# Save the first batch item as a (channels, time) tensor
torchaudio.save("synthesized.wav", waveforms[0:1], bundle.sample_rate)

# Example: music source separation with HDemucs.
import torch
import torchaudio
from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB

# Load bundle and model
bundle = HDEMUCS_HIGH_MUSDB
model = bundle.get_model()
source_labels = bundle.get_source_labels()  # ["drums", "bass", "other", "vocals"]

# Load audio and resample to the bundle's expected rate
waveform, sample_rate = torchaudio.load("mixed_music.wav")
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

# The model expects batched stereo input: (batch, channels, time)
if waveform.shape[0] == 1:
    waveform = waveform.repeat(2, 1)  # Convert mono to stereo
waveform = waveform.unsqueeze(0)  # Add batch dimension

# Separate sources
model.eval()
with torch.no_grad():
    sources = model(waveform)  # (1, sources, channels, time)

# Save each separated source; the index order matches source_labels
for i, source_name in enumerate(source_labels):
    source_audio = sources[0, i]  # (channels, time)
    torchaudio.save(f"separated_{source_name}.wav", source_audio, bundle.sample_rate)

# Example: objective speech quality assessment with SQUIM.
import torch
import torchaudio
from torchaudio.pipelines import SQUIM_OBJECTIVE

# Load bundle and model
bundle = SQUIM_OBJECTIVE
model = bundle.get_model()

# Load audio and resample to the bundle's expected rate
waveform, sample_rate = torchaudio.load("speech_sample.wav")
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

# Assess quality. The model returns one tensor per metric, each of
# shape (batch,), so unpack and extract scalars with .item() before
# applying a float format spec (formatting a tensor with :.3f fails).
model.eval()
with torch.no_grad():
    stoi, pesq, si_sdr = model(waveform)
print(f"STOI: {stoi[0].item():.3f}")  # Short-Time Objective Intelligibility
print(f"PESQ: {pesq[0].item():.3f}")  # Perceptual Evaluation of Speech Quality
print(f"SI-SDR: {si_sdr[0].item():.3f}")  # Scale-Invariant Signal-to-Distortion Ratio

# Example: multilingual feature extraction with XLSR.
import torch
import torchaudio
from torchaudio.pipelines import WAV2VEC2_XLSR53

# Load multilingual model
bundle = WAV2VEC2_XLSR53
model = bundle.get_model()

# Load audio in any supported language and resample to the expected rate
waveform, sample_rate = torchaudio.load("multilingual_speech.wav")
if sample_rate != bundle.sample_rate:
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

# Extract features (can be used for downstream tasks)
model.eval()
with torch.no_grad():
    features, lengths = model(waveform)

# Features can be used for language identification, ASR, etc.
print(f"Feature shape: {features.shape}")  # (batch, time, feature_dim)

# These pipelines provide production-ready solutions for common audio
# processing tasks, with pre-trained weights and optimized
# preprocessing/postprocessing workflows.
Install with the Tessl CLI:
npx tessl i tessl/pypi-torchaudio