CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-torchaudio

An audio package for PyTorch providing GPU-accelerated audio I/O operations, signal processing transforms, and machine learning utilities for audio data.

Overview
Eval results
Files

docs/transforms.md

Audio Transforms

PyTorch-compatible transform classes for building differentiable audio processing pipelines. These transforms are torch.nn.Module subclasses that can be composed with neural networks and trained end-to-end using automatic differentiation.

Capabilities

Spectral Transforms

Core spectral analysis transforms for converting between time and frequency domains.

class Spectrogram(torch.nn.Module):
    """Compute spectrogram of audio signal."""
    
    def __init__(self, n_fft: int = 400, win_length: Optional[int] = None,
                 hop_length: Optional[int] = None, pad: int = 0,
                 window_fn: Callable[..., torch.Tensor] = torch.hann_window,
                 power: Optional[float] = 2.0, normalized: bool = False,
                 wkwargs: Optional[Dict[str, Any]] = None, center: bool = True,
                 pad_mode: str = "reflect", onesided: bool = True) -> None:
        """
        Args:
            n_fft: Size of FFT
            win_length: Window size (defaults to n_fft)
            hop_length: Length of hop between STFT windows (defaults to win_length // 4)
            pad: Two-sided padding of signal
            window_fn: Window function (e.g., torch.hann_window, torch.hamming_window)
            power: Exponent for magnitude (1.0 for energy, 2.0 for power, None for complex)
            normalized: Whether to normalize by window and n_fft
            wkwargs: Additional arguments for window function
            center: Whether to pad waveform on both sides
            pad_mode: Padding mode for centering
            onesided: Controls whether to return half of results
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: Spectrogram (..., freq, time)
        """

class InverseSpectrogram(torch.nn.Module):
    """Reconstruct waveform from spectrogram using inverse STFT."""
    
    def __init__(self, n_fft: int = 400, win_length: Optional[int] = None,
                 hop_length: Optional[int] = None, pad: int = 0,
                 window_fn: Callable[..., torch.Tensor] = torch.hann_window,
                 normalized: bool = False, wkwargs: Optional[Dict[str, Any]] = None,
                 center: bool = True, pad_mode: str = "reflect",
                 onesided: bool = True, length: Optional[int] = None) -> None:
        """
        Args:
            length: Expected length of reconstructed signal
            (other parameters same as Spectrogram)
        """

    def forward(self, spectrogram: torch.Tensor) -> torch.Tensor:
        """
        Args:
            spectrogram: Input spectrogram (..., freq, time)
        
        Returns:
            Tensor: Reconstructed waveform (..., time)
        """

class GriffinLim(torch.nn.Module):
    """Reconstruct waveform from magnitude spectrogram using Griffin-Lim algorithm."""
    
    def __init__(self, n_fft: int = 400, n_iter: int = 32, win_length: Optional[int] = None,
                 hop_length: Optional[int] = None, window_fn: Callable[..., torch.Tensor] = torch.hann_window,
                 power: float = 2.0, wkwargs: Optional[Dict[str, Any]] = None,
                 momentum: float = 0.99, length: Optional[int] = None,
                 rand_init: bool = True) -> None:
        """
        Args:
            n_iter: Number of Griffin-Lim iterations
            power: Exponent applied to spectrogram
            momentum: Momentum parameter for fast Griffin-Lim
            rand_init: Whether to initialize with random phase
            (other parameters same as Spectrogram)
        """

    def forward(self, specgram: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Magnitude spectrogram (..., freq, time)
        
        Returns:
            Tensor: Reconstructed waveform (..., time)
        """

Mel-Scale Transforms

Transforms for mel-scale processing commonly used in speech and music analysis.

class MelSpectrogram(torch.nn.Module):
    """Compute mel-scale spectrogram."""
    
    def __init__(self, sample_rate: int = 16000, n_fft: int = 400,
                 win_length: Optional[int] = None, hop_length: Optional[int] = None,
                 f_min: float = 0.0, f_max: Optional[float] = None, n_mels: int = 128,
                 window_fn: Callable[..., torch.Tensor] = torch.hann_window,
                 power: float = 2.0, normalized: bool = False,
                 wkwargs: Optional[Dict[str, Any]] = None, center: bool = True,
                 pad_mode: str = "reflect", onesided: bool = True,
                 norm: Optional[str] = None, mel_scale: str = "htk") -> None:
        """
        Args:
            sample_rate: Sample rate of audio
            f_min: Minimum frequency
            f_max: Maximum frequency (defaults to sample_rate // 2)
            n_mels: Number of mel filter banks
            norm: Normalization method ("slaney" or None)
            mel_scale: Scale to use ("htk" or "slaney")
            (other parameters same as Spectrogram)
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: Mel spectrogram (..., n_mels, time)
        """

class MelScale(torch.nn.Module):
    """Convert normal spectrogram to mel-scale spectrogram."""
    
    def __init__(self, n_mels: int = 128, sample_rate: int = 16000, f_min: float = 0.0,
                 f_max: Optional[float] = None, n_stft: Optional[int] = None,
                 norm: Optional[str] = None, mel_scale: str = "htk") -> None:
        """
        Args:
            n_mels: Number of mel filter banks
            sample_rate: Sample rate of audio
            f_min: Minimum frequency
            f_max: Maximum frequency
            n_stft: Number of STFT frequency bins (typically n_fft // 2 + 1)
            norm: Normalization method
            mel_scale: Scale to use
        """

    def forward(self, specgram: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Input spectrogram (..., freq, time)
        
        Returns:
            Tensor: Mel-scale spectrogram (..., n_mels, time)
        """

class InverseMelScale(torch.nn.Module):
    """Solve for normal spectrogram from mel-scale spectrogram using iterative method."""
    
    def __init__(self, n_stft: int, n_mels: int = 128, sample_rate: int = 16000,
                 f_min: float = 0.0, f_max: Optional[float] = None,
                 max_iter: int = 100000, tolerance_loss: float = 1e-5,
                 tolerance_change: float = 1e-8, sgdargs: Optional[Dict[str, Any]] = None,
                 norm: Optional[str] = None, mel_scale: str = "htk") -> None:
        """
        Args:
            n_stft: Number of STFT frequency bins
            max_iter: Maximum number of optimization iterations
            tolerance_loss: Tolerance for loss convergence
            tolerance_change: Tolerance for parameter change
            sgdargs: Arguments for SGD optimizer
            (other parameters same as MelScale)
        """

    def forward(self, melspec: torch.Tensor) -> torch.Tensor:
        """
        Args:
            melspec: Mel-scale spectrogram (..., n_mels, time)
        
        Returns:
            Tensor: Linear spectrogram (..., n_stft, time)
        """

Feature Extraction Transforms

Transforms for extracting common audio features.

class MFCC(torch.nn.Module):
    """Compute Mel-frequency cepstral coefficients."""
    
    def __init__(self, sample_rate: int = 16000, n_mfcc: int = 40,
                 dct_type: int = 2, norm: str = "ortho", log_mels: bool = False,
                 melkwargs: Optional[Dict[str, Any]] = None) -> None:
        """
        Args:
            sample_rate: Sample rate of audio
            n_mfcc: Number of MFCC coefficients
            dct_type: DCT type (2 or 3)
            norm: DCT normalization ("ortho" or None)
            log_mels: Whether to use log mel spectrograms
            melkwargs: Additional arguments for MelSpectrogram
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: MFCC coefficients (..., n_mfcc, time)
        """

class LFCC(torch.nn.Module):
    """Compute Linear-frequency cepstral coefficients."""
    
    def __init__(self, sample_rate: int = 16000, n_lfcc: int = 40,
                 speckwargs: Optional[Dict[str, Any]] = None, n_filter: int = 128,
                 f_min: float = 0.0, f_max: Optional[float] = None,
                 dct_type: int = 2, norm: str = "ortho", log_lf: bool = False) -> None:
        """
        Args:
            sample_rate: Sample rate of audio
            n_lfcc: Number of LFCC coefficients
            speckwargs: Additional arguments for Spectrogram
            n_filter: Number of linear filter banks
            f_min: Minimum frequency
            f_max: Maximum frequency
            dct_type: DCT type
            norm: DCT normalization
            log_lf: Whether to use log linear spectrograms
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: LFCC coefficients (..., n_lfcc, time)
        """

class ComputeDeltas(torch.nn.Module):
    """Compute delta features (first derivatives) of input features."""
    
    def __init__(self, win_length: int = 5, mode: str = "replicate") -> None:
        """
        Args:
            win_length: Window length for delta computation
            mode: Padding mode for computing deltas
        """

    def forward(self, specgram: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Input features (..., freq, time)
        
        Returns:
            Tensor: Delta features with same shape
        """

class SpectralCentroid(torch.nn.Module):
    """Compute spectral centroid."""
    
    def __init__(self, sample_rate: int, n_fft: int = 400, win_length: Optional[int] = None,
                 hop_length: Optional[int] = None, pad: int = 0,
                 window_fn: Callable[..., torch.Tensor] = torch.hann_window,
                 wkwargs: Optional[Dict[str, Any]] = None) -> None:
        """
        Args:
            sample_rate: Sample rate of audio
            (other parameters same as Spectrogram)
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: Spectral centroid (..., time)
        """

class Loudness(torch.nn.Module):
    """Compute loudness using ITU-R BS.1770-4 standard."""
    
    def __init__(self, sample_rate: int) -> None:
        """
        Args:
            sample_rate: Sample rate of audio
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: Loudness in LUFS
        """

Amplitude and Encoding Transforms

Transforms for amplitude scaling and audio encoding.

class AmplitudeToDB(torch.nn.Module):
    """Convert amplitude spectrogram to decibel scale."""
    
    def __init__(self, stype: str = "power", top_db: Optional[float] = None) -> None:
        """
        Args:
            stype: Spectrogram type ("power" or "magnitude")
            top_db: Minimum negative cut-off in decibels
        """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input spectrogram (..., freq, time)
        
        Returns:
            Tensor: Spectrogram in decibel scale
        """

class MuLawEncoding(torch.nn.Module):
    """Encode waveform using mu-law companding."""
    
    def __init__(self, quantization_channels: int = 256) -> None:
        """
        Args:
            quantization_channels: Number of quantization levels
        """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input waveform (..., time)
        
        Returns:
            Tensor: Mu-law encoded signal
        """

class MuLawDecoding(torch.nn.Module):
    """Decode mu-law encoded waveform."""
    
    def __init__(self, quantization_channels: int = 256) -> None:
        """
        Args:
            quantization_channels: Number of quantization levels
        """

    def forward(self, x_mu: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x_mu: Mu-law encoded signal (..., time)
        
        Returns:
            Tensor: Decoded waveform
        """

Resampling and Time Manipulation

Transforms for changing sample rates and temporal characteristics.

class Resample(torch.nn.Module):
    """Resample waveform to different sample rate."""
    
    def __init__(self, orig_freq: int = 16000, new_freq: int = 16000,
                 resampling_method: str = "sinc_interp_hann",
                 lowpass_filter_width: int = 6, rolloff: float = 0.99,
                 beta: Optional[float] = None, dtype: torch.dtype = torch.float32) -> None:
        """
        Args:
            orig_freq: Original sample rate
            new_freq: Target sample rate
            resampling_method: Resampling algorithm
            lowpass_filter_width: Width of lowpass filter
            rolloff: Roll-off frequency
            beta: Shape parameter for Kaiser window
            dtype: Output data type
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: Resampled waveform
        """

class Speed(torch.nn.Module):
    """Adjust playback speed by resampling."""
    
    def __init__(self, orig_freq: int, factor: float) -> None:
        """
        Args:
            orig_freq: Original sample rate
            factor: Speed factor (>1.0 = faster, <1.0 = slower)
        """

    def forward(self, waveform: torch.Tensor, lengths: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Args:
            waveform: Input tensor (..., time)
            lengths: Length of each sequence in batch
        
        Returns:
            Tuple[Tensor, Optional[Tensor]]: Speed-adjusted waveform and updated lengths
        """

class TimeStretch(torch.nn.Module):
    """Stretch time axis of spectrogram without changing pitch."""
    
    def __init__(self, hop_length: Optional[int] = None, n_freq: int = 201,
                 fixed_rate: Optional[float] = None) -> None:
        """
        Args:
            hop_length: Hop length for phase vocoder
            n_freq: Number of frequency bins
            fixed_rate: Fixed stretch rate (None for variable rate)
        """

    def forward(self, complex_specgrams: torch.Tensor, rate: float = 1.0) -> torch.Tensor:
        """
        Args:
            complex_specgrams: Complex spectrogram (..., freq, time)
            rate: Stretch rate (>1.0 speeds up / compresses time, <1.0 slows down / stretches time)
        
        Returns:
            Tensor: Time-stretched spectrogram
        """

class PitchShift(torch.nn.Module):
    """Shift pitch without changing duration."""
    
    def __init__(self, sample_rate: int, n_steps: float, bins_per_octave: int = 12,
                 n_fft: int = 512, win_length: Optional[int] = None,
                 hop_length: Optional[int] = None,
                 window: Optional[torch.Tensor] = None) -> None:
        """
        Args:
            sample_rate: Sample rate
            n_steps: Number of semitones to shift
            bins_per_octave: Number of steps per octave
            n_fft: FFT size
            win_length: Window length
            hop_length: Hop length
            window: Window function
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: Pitch-shifted waveform
        """

Data Augmentation Transforms

Transforms for data augmentation in machine learning training.

class FrequencyMasking(torch.nn.Module):
    """Apply frequency masking to spectrograms."""
    
    def __init__(self, freq_mask_param: int, iid_masks: bool = False) -> None:
        """
        Args:
            freq_mask_param: Maximum frequency mask length
            iid_masks: Whether to apply independent masks to each example in batch
        """

    def forward(self, specgram: torch.Tensor, mask_value: float = 0.0) -> torch.Tensor:
        """
        Args:
            specgram: Input spectrogram (..., freq, time)
            mask_value: Value to use for masked regions
        
        Returns:
            Tensor: Masked spectrogram
        """

class TimeMasking(torch.nn.Module):
    """Apply time masking to spectrograms."""
    
    def __init__(self, time_mask_param: int, iid_masks: bool = False, p: float = 1.0) -> None:
        """
        Args:
            time_mask_param: Maximum time mask length
            iid_masks: Whether to apply independent masks
            p: Maximum proportion of time steps that can be masked. Default: 1.0
        """

    def forward(self, specgram: torch.Tensor, mask_value: float = 0.0) -> torch.Tensor:
        """
        Args:
            specgram: Input spectrogram (..., freq, time)
            mask_value: Value to use for masked regions
        
        Returns:
            Tensor: Masked spectrogram
        """

class SpecAugment(torch.nn.Module):
    """Apply SpecAugment data augmentation."""
    
    def __init__(self, n_time_masks: int = 1, time_mask_param: int = 80,
                 n_freq_masks: int = 1, freq_mask_param: int = 80,
                 iid_masks: bool = False) -> None:
        """
        Args:
            n_time_masks: Number of time masks
            time_mask_param: Maximum time mask length
            n_freq_masks: Number of frequency masks
            freq_mask_param: Maximum frequency mask length
            iid_masks: Whether to apply independent masks
        """

    def forward(self, specgram: torch.Tensor, mask_value: float = 0.0) -> torch.Tensor:
        """
        Args:
            specgram: Input spectrogram (..., freq, time)
            mask_value: Value to use for masked regions
        
        Returns:
            Tensor: Augmented spectrogram
        """

class AddNoise(torch.nn.Module):
    """Add noise to waveform."""
    
    def __init__(self, noise: torch.Tensor, snr: torch.Tensor,
                 lengths: Optional[torch.Tensor] = None) -> None:
        """
        Args:
            noise: Noise tensor to add
            snr: Signal-to-noise ratio in dB
            lengths: Length of each sequence in batch
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: Waveform with added noise
        """

class SpeedPerturbation(torch.nn.Module):
    """Apply speed perturbation augmentation by randomly sampling from given factors."""
    
    def __init__(self, orig_freq: int, factors: Sequence[float]) -> None:
        """
        Args:
            orig_freq: Original frequency of the signals
            factors: Factors by which to adjust speed. Values >1.0 compress time, <1.0 stretch time
        """

    def forward(self, waveform: torch.Tensor, lengths: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Args:
            waveform: Input signals (..., time)
            lengths: Valid lengths of signals (...). Default: None
        
        Returns:
            Tuple[Tensor, Optional[Tensor]]: Speed-adjusted waveform and updated lengths
        """

Audio Processing Transforms

Basic audio processing transforms for volume, fading, and emphasis.

class Fade(torch.nn.Module):
    """Add a fade in and/or fade out to a waveform."""
    
    def __init__(self, fade_in_len: int = 0, fade_out_len: int = 0, fade_shape: str = "linear") -> None:
        """
        Args:
            fade_in_len: Length of fade-in (time frames). Default: 0
            fade_out_len: Length of fade-out (time frames). Default: 0
            fade_shape: Shape of fade. Must be one of: "quarter_sine", "half_sine", 
                       "linear", "logarithmic", "exponential". Default: "linear"
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: Faded waveform with same shape
        """

class Vol(torch.nn.Module):
    """Adjust volume of waveform."""
    
    def __init__(self, gain: float, gain_type: str = "amplitude") -> None:
        """
        Args:
            gain: Interpreted according to gain_type:
                 - amplitude: positive amplitude ratio
                 - power: power (voltage squared)
                 - db: gain in decibels
            gain_type: Type of gain. One of: "amplitude", "power", "db". Default: "amplitude"
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: Volume-adjusted waveform with same shape
        """

class Preemphasis(torch.nn.Module):
    """Pre-emphasizes a waveform along its last dimension."""
    
    def __init__(self, coeff: float = 0.97) -> None:
        """
        Args:
            coeff: Pre-emphasis coefficient. Typically between 0.0 and 1.0. Default: 0.97
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: Pre-emphasized waveform with same shape
        """

class Deemphasis(torch.nn.Module):
    """De-emphasizes a waveform along its last dimension."""
    
    def __init__(self, coeff: float = 0.97) -> None:
        """
        Args:
            coeff: De-emphasis coefficient. Typically between 0.0 and 1.0. Default: 0.97
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: De-emphasized waveform with same shape
        """

Convolution Transforms

Convolution-based transforms for audio processing.

class Convolve(torch.nn.Module):
    """Convolves inputs along their last dimension using the direct method."""
    
    def __init__(self, mode: str = "full") -> None:
        """
        Args:
            mode: Must be one of ("full", "valid", "same").
                 - "full": Returns full convolution result (..., N + M - 1)
                 - "valid": Returns overlap segment (..., max(N, M) - min(N, M) + 1)  
                 - "same": Returns center segment (..., N)
                 Default: "full"
        """

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: First convolution operand (..., N)
            y: Second convolution operand (..., M)
        
        Returns:
            Tensor: Convolution result with shape dictated by mode
        """

class FFTConvolve(torch.nn.Module):
    """Convolves inputs along their last dimension using FFT. Much faster than Convolve for large inputs."""
    
    def __init__(self, mode: str = "full") -> None:
        """
        Args:
            mode: Must be one of ("full", "valid", "same"). Same as Convolve. Default: "full"
        """

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: First convolution operand (..., N)
            y: Second convolution operand (..., M)
        
        Returns:
            Tensor: FFT convolution result (always float tensors)
        """

Multi-Channel Beamforming Transforms

Advanced multi-channel transforms for beamforming and array processing.

class PSD(torch.nn.Module):
    """Compute cross-channel power spectral density (PSD) matrix."""
    
    def __init__(self, multi_mask: bool = False, normalize: bool = True, eps: float = 1e-15) -> None:
        """
        Args:
            multi_mask: If True, only accepts multi-channel Time-Frequency masks. Default: False
            normalize: If True, normalize the mask along the time dimension. Default: True
            eps: Value to add to denominator in mask normalization. Default: 1e-15
        """

    def forward(self, specgram: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Args:
            specgram: Multi-channel complex-valued spectrum (..., channel, freq, time)
            mask: Time-Frequency mask for normalization (..., freq, time) or (..., channel, freq, time)
        
        Returns:
            Tensor: Complex-valued PSD matrix (..., freq, channel, channel)
        """

class MVDR(torch.nn.Module):
    """Minimum Variance Distortionless Response (MVDR) beamforming with Time-Frequency masks."""
    
    def __init__(self, ref_channel: int = 0, solution: str = "ref_channel", 
                 multi_mask: bool = False, diag_loading: bool = True, 
                 diag_eps: float = 1e-7, online: bool = False) -> None:
        """
        Args:
            ref_channel: Reference channel for beamforming. Default: 0
            solution: Solution method. One of ["ref_channel", "stv_evd", "stv_power"]. Default: "ref_channel"
            multi_mask: If True, accepts multi-channel masks. Default: False
            diag_loading: If True, applies diagonal loading to noise covariance. Default: True
            diag_eps: Diagonal loading coefficient. Default: 1e-7
            online: If True, updates weights based on previous covariance matrices. Default: False
        """

    def forward(self, specgram: torch.Tensor, mask_s: torch.Tensor, mask_n: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Multi-channel noisy spectrum (..., channel, freq, time)
            mask_s: Time-Frequency mask for target speech
            mask_n: Time-Frequency mask for noise
        
        Returns:
            Tensor: Enhanced single-channel spectrum (..., freq, time)
        """

class SoudenMVDR(torch.nn.Module):
    """MVDR beamforming using Souden's method."""
    
    def __init__(self, ref_channel: int = 0, multi_mask: bool = False, 
                 diag_loading: bool = True, diag_eps: float = 1e-7) -> None:
        """
        Args:
            ref_channel: Reference channel for beamforming. Default: 0
            multi_mask: If True, accepts multi-channel masks. Default: False
            diag_loading: If True, applies diagonal loading. Default: True
            diag_eps: Diagonal loading coefficient. Default: 1e-7
        """

    def forward(self, specgram: torch.Tensor, mask_s: torch.Tensor, mask_n: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Multi-channel noisy spectrum (..., channel, freq, time)
            mask_s: Time-Frequency mask for target speech
            mask_n: Time-Frequency mask for noise
        
        Returns:
            Tensor: Enhanced single-channel spectrum using Souden method
        """

class RTFMVDR(torch.nn.Module):
    """MVDR beamforming using Relative Transfer Function (RTF)."""
    
    def __init__(self, ref_channel: int = 0, multi_mask: bool = False,
                 diag_loading: bool = True, diag_eps: float = 1e-7) -> None:
        """
        Args:
            ref_channel: Reference channel for beamforming. Default: 0
            multi_mask: If True, accepts multi-channel masks. Default: False  
            diag_loading: If True, applies diagonal loading. Default: True
            diag_eps: Diagonal loading coefficient. Default: 1e-7
        """

    def forward(self, specgram: torch.Tensor, mask_s: torch.Tensor, mask_n: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Multi-channel noisy spectrum (..., channel, freq, time)
            mask_s: Time-Frequency mask for target speech
            mask_n: Time-Frequency mask for noise
        
        Returns:
            Tensor: Enhanced single-channel spectrum using RTF method
        """

Advanced Processing Transforms

Specialized transforms for feature processing and analysis.

class SlidingWindowCmn(torch.nn.Module):
    """Apply sliding-window cepstral mean (and optionally variance) normalization per utterance."""
    
    def __init__(self, cmn_window: int = 600, min_cmn_window: int = 100, 
                 center: bool = False, norm_vars: bool = False) -> None:
        """
        Args:
            cmn_window: Window in frames for running average CMN computation. Default: 600
            min_cmn_window: Minimum CMN window used at start of decoding. Default: 100
            center: If True, use centered window; if False, window is to the left. Default: False
            norm_vars: If True, normalize variance to one. Default: False
        """

    def forward(self, specgram: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Spectrogram (..., time, freq)
        
        Returns:
            Tensor: CMN normalized spectrogram with same shape
        """

class Vad(torch.nn.Module):
    """Voice Activity Detector. Similar to SoX implementation."""
    
    def __init__(self, sample_rate: int, trigger_level: float = 7.0, trigger_time: float = 0.25,
                 search_time: float = 1.0, allowed_gap: float = 0.25, pre_trigger_time: float = 0.0,
                 boot_time: float = 0.35, noise_up_time: float = 0.1, noise_down_time: float = 0.01,
                 noise_reduction_amount: float = 1.35, measure_freq: float = 20.0,
                 measure_duration: Optional[float] = None, measure_smooth_time: float = 0.4,
                 hp_filter_freq: float = 50.0, lp_filter_freq: float = 6000.0,
                 hp_lifter_freq: float = 150.0, lp_lifter_freq: float = 2000.0) -> None:
        """
        Args:
            sample_rate: Sample rate of audio signal
            trigger_level: Measurement level used to trigger activity detection. Default: 7.0
            trigger_time: Time constant to help ignore short bursts. Default: 0.25
            search_time: Amount of audio to search for quieter bursts. Default: 1.0
            allowed_gap: Allowed gap between quieter bursts. Default: 0.25
            pre_trigger_time: Amount of audio to preserve before trigger. Default: 0.0
            boot_time: Time for initial noise estimate. Default: 0.35
            noise_up_time: Time constant for increasing noise level. Default: 0.1
            noise_down_time: Time constant for decreasing noise level. Default: 0.01
            noise_reduction_amount: Amount of noise reduction. Default: 1.35
            measure_freq: Frequency of algorithm processing. Default: 20.0
            measure_duration: Measurement duration. Default: None (twice measurement period)
            measure_smooth_time: Time constant for spectral smoothing. Default: 0.4
            hp_filter_freq: High-pass filter frequency. Default: 50.0
            lp_filter_freq: Low-pass filter frequency. Default: 6000.0
            hp_lifter_freq: High-pass lifter frequency. Default: 150.0
            lp_lifter_freq: Low-pass lifter frequency. Default: 2000.0
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input tensor (..., time)
        
        Returns:
            Tensor: Voice activity detection result
        """

Loss Functions

Loss functions for training neural networks with audio data.

class RNNTLoss(torch.nn.Module):
    """Compute the RNN Transducer loss from Sequence Transduction with Recurrent Neural Networks."""
    
    def __init__(self, blank: int = -1, clamp: float = -1.0, reduction: str = "mean", 
                 fused_log_softmax: bool = True) -> None:
        """
        Args:
            blank: Blank label. Default: -1
            clamp: Clamp for gradients. Default: -1
            reduction: Specifies reduction to apply: "none", "mean", or "sum". Default: "mean"
            fused_log_softmax: Set to False if calling log_softmax outside of loss. Default: True
        """

    def forward(self, logits: torch.Tensor, targets: torch.Tensor, logit_lengths: torch.Tensor, 
                target_lengths: torch.Tensor) -> torch.Tensor:
        """
        Args:
            logits: Tensor with shape (N, T, U, V) where N=batch, T=time, U=target, V=vocab
            targets: Tensor with shape (N, S) where S=target sequence length
            logit_lengths: Tensor with shape (N,) representing lengths of logits
            target_lengths: Tensor with shape (N,) representing lengths of targets
        
        Returns:
            Tensor: RNN Transducer loss
        """

Usage example combining multiple transforms:

import torch
import torchaudio
from torchaudio import transforms as T

# Create a processing pipeline
transform_pipeline = torch.nn.Sequential(
    T.Resample(orig_freq=44100, new_freq=16000),  # Resample to 16kHz
    T.MelSpectrogram(
        sample_rate=16000,
        n_fft=1024,
        hop_length=256,
        n_mels=80
    ),  # Convert to mel spectrogram
    T.AmplitudeToDB(stype="power"),  # Convert to dB scale
    T.FrequencyMasking(freq_mask_param=15),  # Apply frequency masking
    T.TimeMasking(time_mask_param=35)  # Apply time masking
)

# Load and process audio
waveform, orig_sr = torchaudio.load("audio.wav")
processed = transform_pipeline(waveform)

These transforms provide the building blocks for creating sophisticated audio processing pipelines that integrate seamlessly with PyTorch's neural network ecosystem.

Install with Tessl CLI

npx tessl i tessl/pypi-torchaudio

docs

audio-io.md

datasets.md

effects.md

functional.md

index.md

models.md

pipelines.md

streaming.md

transforms.md

utils.md

tile.json