
tessl/pypi-torchaudio

An audio package for PyTorch providing audio I/O, GPU-accelerated signal processing transforms, and machine learning models and utilities for audio data.

docs/models.md

Pre-trained Models

Ready-to-use neural network architectures for speech recognition, synthesis, source separation, and quality assessment. TorchAudio provides implementations of state-of-the-art models along with factory functions for instantiating them. The factory functions return uninitialized models; pretrained weights are distributed through torchaudio.pipelines (see pipelines.md).

Capabilities

Speech Recognition Models

Neural networks for automatic speech recognition and speech representation learning.

class Wav2Vec2Model(torch.nn.Module):
    """Wav2Vec2 model for speech representation learning."""
    
    def __init__(self, feature_extractor: torch.nn.Module, encoder: torch.nn.Module,
                 aux: Optional[torch.nn.Module] = None) -> None:
        """
        Args:
            feature_extractor: CNN feature extractor
            encoder: Transformer encoder
            aux: Auxiliary output layer (for fine-tuned models)
        """

    def forward(self, waveforms: torch.Tensor, lengths: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Args:
            waveforms: Input audio (batch, time)
            lengths: Valid length of each sequence in the batch

        Returns:
            Tuple of (features, lengths). If ``aux`` is set, the first element
            is the auxiliary layer output (e.g. logits for ASR fine-tuning).
        """

    def extract_features(self, waveforms: torch.Tensor, lengths: Optional[torch.Tensor] = None,
                         num_layers: Optional[int] = None) -> Tuple[List[torch.Tensor], Optional[torch.Tensor]]:
        """Return intermediate transformer layer outputs as a list of tensors."""

def wav2vec2_model(arch: str, aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create Wav2Vec2 model with specified architecture."""

def wav2vec2_base(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create base Wav2Vec2 model (12 layers, 768 dim)."""

def wav2vec2_large(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create large Wav2Vec2 model (24 layers, 1024 dim)."""

def wav2vec2_large_lv60k(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create large Wav2Vec2 model ("lv60k" variant, as used for Libri-Light training)."""

def wav2vec2_xlsr_300m(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create XLS-R 300M-parameter multilingual model."""

def wav2vec2_xlsr_1b(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create XLS-R 1B-parameter multilingual model."""

def wav2vec2_xlsr_2b(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create XLS-R 2B-parameter multilingual model."""

class HuBERTPretrainModel(torch.nn.Module):
    """HuBERT model for self-supervised speech representation pre-training."""
    
    def __init__(self, wav2vec2: Wav2Vec2Model, mask_generator: torch.nn.Module,
                 logit_generator: torch.nn.Module, feature_grad_mult: Optional[float]) -> None:
        """
        Args:
            wav2vec2: Underlying feature extractor and transformer encoder
            mask_generator: Generates frame masks for masked prediction
            logit_generator: Produces logits over the cluster-label embeddings
            feature_grad_mult: Scale factor applied to feature extractor gradients
        """

    def forward(self, waveforms: torch.Tensor, labels: torch.Tensor,
                audio_lengths: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Args:
            waveforms: Input audio (batch, time)
            labels: Target cluster labels for masked prediction
            audio_lengths: Length of each sequence

        Returns:
            Tuple of (logits over masked frames, logits over unmasked frames,
            feature penalty)
        """

def hubert_base(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create base HuBERT model."""

def hubert_large(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create large HuBERT model."""

def hubert_xlarge(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create extra-large HuBERT model."""

def hubert_pretrain_model(arch: str, aux_num_out: Optional[int] = None) -> HuBERTPretrainModel:
    """Create HuBERT pre-training model."""

def wavlm_model(arch: str, aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create WavLM model with specified architecture."""

def wavlm_base(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create base WavLM model."""

def wavlm_large(aux_num_out: Optional[int] = None) -> Wav2Vec2Model:
    """Create large WavLM model."""

Legacy Speech Recognition Models

Traditional neural network architectures for speech recognition.

class DeepSpeech(torch.nn.Module):
    """DeepSpeech model for end-to-end speech recognition."""
    
    def __init__(self, n_feature: int, n_hidden: int = 2048,
                 n_class: int = 40, dropout: float = 0.0) -> None:
        """
        Args:
            n_feature: Number of input features per frame
            n_hidden: Number of hidden units in the internal layers
            n_class: Number of output classes (characters/phonemes)
            dropout: Dropout probability
        """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input features (batch, channel, time, feature)

        Returns:
            Tensor: Log-probabilities over classes (batch, time, n_class)
        """

class Wav2Letter(torch.nn.Module):
    """Wav2Letter model for speech recognition."""
    
    def __init__(self, num_classes: int = 40, input_type: str = "waveform",
                 num_features: int = 1) -> None:
        """
        Args:
            num_classes: Number of output classes
            input_type: Input type: "waveform", "power_spectrum", or "mfcc"
            num_features: Number of input features (channels) per frame
        """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input tensor (batch, num_features, time)

        Returns:
            Tensor: Log-probabilities over classes (batch, num_classes, frames)
        """

RNN-Transducer Models

Neural transducer models for streaming speech recognition.

class RNNT(torch.nn.Module):
    """RNN-Transducer model for streaming speech recognition."""
    
    def __init__(self, transcriber: torch.nn.Module, predictor: torch.nn.Module,
                 joiner: torch.nn.Module) -> None:
        """
        Args:
            transcriber: Encoder network (processes audio features)
            predictor: Decoder network (processes previous predictions)
            joiner: Joint network (combines encoder and decoder outputs)
        """

    def forward(self, sources: torch.Tensor, source_lengths: torch.Tensor,
                targets: torch.Tensor, target_lengths: torch.Tensor,
                predictor_state: Optional[List[List[torch.Tensor]]] = None) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        """
        Args:
            sources: Input audio features (batch, time, feature_dim)
            source_lengths: Length of each audio sequence
            targets: Target token sequences (batch, target_time)
            target_lengths: Length of each target sequence
            predictor_state: Optional prediction network state

        Returns:
            Tuple of (joint network output with shape
            (batch, max_source_frames, max_target_length, num_symbols),
            output source lengths, output target lengths, updated
            prediction network state)
        """

class Conformer(torch.nn.Module):
    """Conformer model combining CNN and self-attention."""
    
    def __init__(self, input_dim: int, num_heads: int, ffn_dim: int, num_layers: int,
                 depthwise_conv_kernel_size: int, dropout: float = 0.0,
                 use_group_norm: bool = False, convolution_first: bool = False) -> None:
        """
        Args:
            input_dim: Input feature dimension
            num_heads: Number of attention heads
            ffn_dim: Feed-forward network dimension
            num_layers: Number of conformer layers
            depthwise_conv_kernel_size: Kernel size for depthwise convolution
            dropout: Dropout probability
            use_group_norm: Whether to use group normalization
            convolution_first: Whether to apply convolution before self-attention
        """

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            input: Input features (batch, time, feature_dim)
            lengths: Length of each sequence

        Returns:
            Tuple of (output, output_lengths)
        """

class Emformer(torch.nn.Module):
    """Emformer model for streaming applications."""
    
    def __init__(self, input_dim: int, num_heads: int, ffn_dim: int, num_layers: int,
                 segment_length: int, left_context_length: int = 0,
                 right_context_length: int = 0, max_memory_size: int = 0,
                 weight_init_scale_strategy: str = "depthwise", tanh_on_mem: bool = False,
                 negative_inf: float = -1e8) -> None:
        """
        Args:
            input_dim: Input feature dimension
            num_heads: Number of attention heads
            ffn_dim: Feed-forward dimension
            num_layers: Number of layers
            segment_length: Length of each segment
            left_context_length: Left context length
            right_context_length: Right context length
            max_memory_size: Maximum memory size
            weight_init_scale_strategy: Weight initialization strategy
            tanh_on_mem: Whether to apply tanh on memory
            negative_inf: Negative infinity value for masking
        """

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Full-utterance (training) pass; use ``infer`` for streaming inference.

        Args:
            input: Input features (batch, time + right_context_length, feature_dim)
            lengths: Length of each sequence

        Returns:
            Tuple of (output, output_lengths)
        """

    def infer(self, input: torch.Tensor, lengths: torch.Tensor,
              states: Optional[List[List[torch.Tensor]]] = None) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        """
        Streaming pass over one segment plus right context.

        Args:
            input: Input features (batch, segment_length + right_context_length, feature_dim)
            lengths: Length of each sequence
            states: Internal states from the previous call

        Returns:
            Tuple of (output, output_lengths, updated states)
        """

def emformer_rnnt_base(num_symbols: int) -> RNNT:
    """Create base Emformer RNN-T model."""

def emformer_rnnt_model(arch: str, num_symbols: int) -> RNNT:
    """Create Emformer RNN-T model with specified architecture."""

Speech Synthesis Models

Neural networks for text-to-speech synthesis and vocoding.

class Tacotron2(torch.nn.Module):
    """Tacotron2 model for text-to-speech synthesis."""
    
    def __init__(self, mask_padding: bool = False, n_mels: int = 80,
                 n_symbol: int = 148, n_frames_per_step: int = 1,
                 symbol_embedding_dim: int = 512, encoder_embedding_dim: int = 512,
                 encoder_n_convolution: int = 3, encoder_kernel_size: int = 5,
                 decoder_rnn_dim: int = 1024, decoder_max_step: int = 2000,
                 decoder_dropout: float = 0.1, decoder_early_stopping: bool = True,
                 attention_rnn_dim: int = 1024, attention_hidden_dim: int = 128,
                 attention_location_n_filter: int = 32, attention_location_kernel_size: int = 31,
                 attention_dropout: float = 0.1, prenet_dim: int = 256,
                 postnet_n_convolution: int = 5, postnet_kernel_size: int = 5,
                 postnet_embedding_dim: int = 512, gate_threshold: float = 0.5) -> None:
        """
        Args:
            mask_padding: Whether to mask padded portions of the batch
            n_mels: Number of mel frequency bins
            n_symbol: Number of input symbols (vocabulary size)
            n_frames_per_step: Number of frames generated per decoder step
            (remaining parameters configure the encoder, decoder, attention,
            and postnet dimensions)
        """

    def forward(self, tokens: torch.Tensor, token_lengths: torch.Tensor,
                mel_specgram: torch.Tensor,
                mel_specgram_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Teacher-forced training pass; use ``infer`` for synthesis.

        Args:
            tokens: Input token sequences (batch, max_token_length)
            token_lengths: Length of each token sequence
            mel_specgram: Target mel spectrograms (batch, n_mels, max_mel_length)
            mel_specgram_lengths: Length of each mel spectrogram

        Returns:
            Tuple of (mel_specgram, mel_specgram_postnet, gate_outputs, alignments)
        """

    def infer(self, tokens: torch.Tensor,
              lengths: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Args:
            tokens: Input token sequences (batch, max_token_length)
            lengths: Length of each token sequence

        Returns:
            Tuple of (mel_specgram, mel_specgram_lengths, alignments)
        """

class WaveRNN(torch.nn.Module):
    """WaveRNN vocoder for high-quality audio generation."""
    
    def __init__(self, upsample_scales: List[int], n_classes: int, hop_length: int,
                 n_res_block: int = 10, n_rnn: int = 512, n_fc: int = 512,
                 kernel_size: int = 5, n_freq: int = 128, n_hidden: int = 64,
                 n_output: int = 256) -> None:
        """
        Args:
            upsample_scales: Upsampling scales; their product must equal hop_length
            n_classes: Number of output classes (for mu-law quantization)
            hop_length: Number of audio samples per spectrogram frame
            n_res_block: Number of residual blocks
            n_rnn: RNN hidden dimension
            n_fc: Fully connected layer dimension
            kernel_size: Convolution kernel size
            n_freq: Number of spectrogram frequency bins
            n_hidden: Hidden dimension of the residual blocks
            n_output: Output dimension of the upsampling network
        """

    def forward(self, waveform: torch.Tensor, specgram: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio (batch, 1, (n_time - kernel_size + 1) * hop_length)
            specgram: Conditioning spectrogram (batch, 1, n_freq, n_time)

        Returns:
            Tensor: Output logits (batch, 1, time, n_classes)
        """

Source Separation Models

Neural networks for separating mixed audio into individual sources.

class ConvTasNet(torch.nn.Module):
    """Convolutional Time-domain Audio Source Separation Network."""
    
    def __init__(self, num_sources: int = 2, enc_kernel_size: int = 16,
                 enc_num_feats: int = 512, msk_kernel_size: int = 3,
                 msk_num_feats: int = 128, msk_num_hidden_feats: int = 512,
                 msk_num_layers: int = 8, msk_num_stacks: int = 3,
                 msk_activate: str = "sigmoid") -> None:
        """
        Args:
            num_sources: Number of sources to separate
            enc_kernel_size: Encoder kernel size
            enc_num_feats: Number of encoder features
            msk_kernel_size: Mask generator kernel size
            msk_num_feats: Number of mask features
            msk_num_hidden_feats: Number of hidden features in mask generator
            msk_num_layers: Number of layers in each stack
            msk_num_stacks: Number of stacks
            msk_activate: Activation function for masks
        """

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """
        Args:
            input: Mixed audio waveform (batch, 1, time)

        Returns:
            Tensor: Separated sources (batch, num_sources, time)
        """

def conv_tasnet_base(num_sources: int = 2) -> ConvTasNet:
    """Create base ConvTasNet model."""

class HDemucs(torch.nn.Module):
    """Hybrid Demucs model for music source separation."""
    
    def __init__(self, sources: List[str], audio_channels: int = 2, channels: int = 48,
                 growth: float = 2.0, nfft: int = 4096, wiener_iters: int = 0,
                 end_iters: int = 0, wiener_residual: bool = False, cac: bool = True,
                 depth: int = 6, rewrite: bool = True, hybrid: bool = True,
hybrid_old: bool = False, multi_freqs: Optional[List[int]] = None,
                 multi_freqs_depth: int = 2, freq_emb: Optional[int] = None,
                 emb_scale: int = 10, emb_smooth: bool = False,
                 kernel_size: int = 8, time_stride: int = 2, stride: int = 4,
                 context: int = 1, context_enc: int = 0, norm_starts: int = 4,
                 norm_groups: int = 4, dconv_mode: int = 1, dconv_depth: int = 2,
                 dconv_comp: int = 4, dconv_attn: int = 4, dconv_lstm: int = 4,
                 dconv_init: float = 1e-4, bottom_channels: int = 0,
clone_kw: Optional[Dict[str, Any]] = None, num_subbands: int = 1,
                 spec_complex: bool = True, segment_length: int = 4 * 10 * 44100) -> None:
        """
        Args:
            sources: List of source names to separate
            audio_channels: Number of audio channels
            channels: Base number of channels
            growth: Channel growth factor per layer
            nfft: FFT size for spectral branch
            wiener_iters: Number of Wiener filtering iterations
            (additional parameters for model configuration)
        """

    def forward(self, wav: torch.Tensor) -> torch.Tensor:
        """
        Args:
            wav: Input audio (batch, channels, time)

        Returns:
            Tensor: Separated sources (batch, sources, channels, time)
        """

def hdemucs_low(sources: List[str]) -> HDemucs:
    """Create HDemucs tuned for low sample rates (around 8 kHz)."""

def hdemucs_medium(sources: List[str]) -> HDemucs:
    """Create HDemucs tuned for medium sample rates (around 16 kHz)."""

def hdemucs_high(sources: List[str]) -> HDemucs:
    """Create HDemucs tuned for high sample rates (44.1-48 kHz)."""

Speech Quality Assessment Models

Models for objective and subjective speech quality assessment.

class SquimObjective(torch.nn.Module):
    """SQUIM model for objective speech quality assessment."""
    
    def __init__(self, encoder: torch.nn.Module, classifier: torch.nn.Module) -> None:
        """
        Args:
            encoder: Feature encoder network
            classifier: Quality prediction classifier
        """

    def forward(self, waveforms: torch.Tensor) -> List[torch.Tensor]:
        """
        Args:
            waveforms: Input audio (batch, time)

        Returns:
            List of three score tensors: STOI, PESQ, and SI-SDR estimates
        """

class SquimSubjective(torch.nn.Module):
    """SQUIM model for subjective speech quality assessment."""
    
    def __init__(self, encoder: torch.nn.Module, classifier: torch.nn.Module) -> None:
        """
        Args:
            encoder: Feature encoder network
            classifier: Quality prediction classifier
        """

    def forward(self, waveforms: torch.Tensor, reference: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveforms: Audio to assess (batch, time)
            reference: Non-matching clean reference audio (batch, time)

        Returns:
            Tensor: Estimated MOS scores (batch,)
        """

def squim_objective_base() -> SquimObjective:
    """Create SQUIM objective model with the default (base) configuration."""

def squim_objective_model() -> SquimObjective:
    """Create SQUIM objective model (the underlying builder exposes architecture hyperparameters)."""

def squim_subjective_base() -> SquimSubjective:
    """Create SQUIM subjective model with the default (base) configuration."""

def squim_subjective_model() -> SquimSubjective:
    """Create SQUIM subjective model (the underlying builder exposes architecture hyperparameters)."""

Decoder Utilities

Utilities for decoding model outputs, particularly for sequence-to-sequence models.

class RNNTBeamSearch(torch.nn.Module):
    """Beam search decoder for RNN-Transducer models."""
    
    def __init__(self, model: RNNT, blank: int, temperature: float = 1.0,
                 hypo_sort_key: Optional[Callable] = None,
                 step_max_tokens: int = 100) -> None:
        """
        Args:
            model: RNN-T model to decode
            blank: Blank token index
            temperature: Temperature applied to joint network output
            hypo_sort_key: Callable used to rank hypotheses during pruning
            step_max_tokens: Maximum number of tokens emitted per time step
        """

    def forward(self, input: torch.Tensor, length: torch.Tensor,
                beam_width: int) -> List[Hypothesis]:
        """
        Args:
            input: Input features (time, feature_dim) or (1, time, feature_dim)
            length: Number of valid frames
            beam_width: Beam search width

        Returns:
            List of the top hypotheses, best first
        """

# Hypothesis is a tuple alias rather than a class, holding the state of
# one beam-search hypothesis:
# (tokens, prediction network output, prediction network state, score)
Hypothesis = Tuple[List[int], torch.Tensor, List[List[torch.Tensor]], float]
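
Putting the decoder together with an Emformer RNN-T model (untrained here, so the output tokens are meaningless; with a trained model a tokenizer would map the token IDs back to text, and the 80-dim feature assumption matches the base preset):

import torch
from torchaudio.models import RNNTBeamSearch, emformer_rnnt_base

model = emformer_rnnt_base(num_symbols=1024)
model.eval()
decoder = RNNTBeamSearch(model, blank=1023)

features = torch.rand(1, 128, 80)      # (1, time, feature_dim)
length = torch.tensor([128])

with torch.no_grad():
    hypotheses = decoder(features, length, beam_width=10)

tokens = hypotheses[0][0]              # token IDs of the best hypothesis
score = hypotheses[0][3]               # its score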

Usage example:

import torch
import torchaudio
from torchaudio.models import wav2vec2_base, Tacotron2

# Build a Wav2Vec2 ASR architecture with 32 output classes.
# Factory functions return untrained models; pretrained weights are
# available via torchaudio.pipelines (e.g. WAV2VEC2_ASR_BASE_960H).
model = wav2vec2_base(aux_num_out=32)
model.eval()

# Process audio with Wav2Vec2
waveform, sample_rate = torchaudio.load("speech.wav")
with torch.no_grad():
    # With aux_num_out set, forward applies the auxiliary head and
    # returns (logits, lengths) directly.
    logits, lengths = model(waveform)

# Create Tacotron2 for TTS
tts_model = Tacotron2()
tts_model.eval()

# Synthesize speech (tokens would come from text processing)
tokens = torch.randint(0, 148, (1, 50))  # random symbol IDs for illustration
token_lengths = torch.tensor([50])

with torch.no_grad():
    # forward() requires mel targets (teacher forcing); use infer() for synthesis
    mel_outputs, mel_lengths, alignments = tts_model.infer(tokens, token_lengths)

These models provide state-of-the-art capabilities for various audio processing tasks and can be used as building blocks for more complex applications.
