An audio package for PyTorch providing GPU-accelerated audio I/O operations, signal processing transforms, and machine learning utilities for audio data.
PyTorch-compatible transform classes for building differentiable audio processing pipelines. These transforms are torch.nn.Module subclasses that can be composed with neural networks and trained end-to-end using automatic differentiation.
Core spectral analysis transforms for converting between time and frequency domains.
class Spectrogram(torch.nn.Module):
    """Compute the spectrogram of an audio signal via the short-time Fourier transform (STFT)."""

    def __init__(self, n_fft: int = 400, win_length: Optional[int] = None,
                 hop_length: Optional[int] = None, pad: int = 0,
                 window_fn: Callable[..., torch.Tensor] = torch.hann_window,
                 power: Optional[float] = 2.0, normalized: bool = False,
                 wkwargs: Optional[Dict[str, Any]] = None, center: bool = True,
                 pad_mode: str = "reflect", onesided: bool = True) -> None:
        """
        Args:
            n_fft: Size of FFT; yields n_fft // 2 + 1 frequency bins when onesided
            win_length: Window size (defaults to n_fft)
            hop_length: Length of hop between STFT windows
                (defaults to win_length // 2, per torchaudio convention)
            pad: Two-sided padding of the signal, in samples
            window_fn: Window function (e.g., torch.hann_window, torch.hamming_window)
            power: Exponent for the magnitude spectrogram
                (1.0 for magnitude, 2.0 for power, None to return the complex spectrum)
            normalized: Whether to normalize by window and n_fft
            wkwargs: Additional keyword arguments passed to window_fn
            center: Whether to pad the waveform on both sides so frames are centered
            pad_mode: Padding mode used when center is True
            onesided: If True, return only the non-redundant half of the FFT bins
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
        Returns:
            Tensor: Spectrogram (..., freq, time), where freq is
                n_fft // 2 + 1 when onesided
        """
class InverseSpectrogram(torch.nn.Module):
    """Reconstruct a waveform from a complex spectrogram using the inverse STFT."""

    def __init__(self, n_fft: int = 400, win_length: Optional[int] = None,
                 hop_length: Optional[int] = None, pad: int = 0,
                 window_fn: Callable[..., torch.Tensor] = torch.hann_window,
                 normalized: bool = False, wkwargs: Optional[Dict[str, Any]] = None,
                 center: bool = True, pad_mode: str = "reflect",
                 onesided: bool = True, length: Optional[int] = None) -> None:
        """
        Args:
            length: Expected length of the reconstructed signal; when given, the
                output is adjusted to this length
            (other parameters have the same meaning as in Spectrogram)
        """

    def forward(self, spectrogram: torch.Tensor) -> torch.Tensor:
        """
        Args:
            spectrogram: Complex-valued input spectrogram (..., freq, time)
        Returns:
            Tensor: Reconstructed waveform (..., time)
        """
class GriffinLim(torch.nn.Module):
    """Reconstruct a waveform from a magnitude spectrogram using the Griffin-Lim algorithm.

    Iteratively estimates the phase information that was discarded when only
    the magnitude spectrogram was kept.
    """

    def __init__(self, n_fft: int = 400, n_iter: int = 32, win_length: Optional[int] = None,
                 hop_length: Optional[int] = None, window_fn: Callable[..., torch.Tensor] = torch.hann_window,
                 power: float = 2.0, wkwargs: Optional[Dict[str, Any]] = None,
                 momentum: float = 0.99, length: Optional[int] = None,
                 rand_init: bool = True) -> None:
        """
        Args:
            n_iter: Number of Griffin-Lim phase-recovery iterations. Default: 32
            power: Exponent that was applied to the input spectrogram. Default: 2.0
            momentum: Momentum parameter for the fast Griffin-Lim variant. Default: 0.99
            length: Expected length of the reconstructed signal. Default: None
            rand_init: Whether to initialize with random phase. Default: True
            (other parameters have the same meaning as in Spectrogram)
        """

    def forward(self, specgram: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Magnitude spectrogram (..., freq, time)
        Returns:
            Tensor: Reconstructed waveform (..., time)
        """


# Transforms for mel-scale processing commonly used in speech and music analysis.
class MelSpectrogram(torch.nn.Module):
    """Compute a mel-scale spectrogram (STFT followed by a mel filterbank projection)."""

    def __init__(self, sample_rate: int = 16000, n_fft: int = 400,
                 win_length: Optional[int] = None, hop_length: Optional[int] = None,
                 f_min: float = 0.0, f_max: Optional[float] = None, n_mels: int = 128,
                 window_fn: Callable[..., torch.Tensor] = torch.hann_window,
                 power: float = 2.0, normalized: bool = False,
                 wkwargs: Optional[Dict[str, Any]] = None, center: bool = True,
                 pad_mode: str = "reflect", onesided: bool = True,
                 norm: Optional[str] = None, mel_scale: str = "htk") -> None:
        """
        Args:
            sample_rate: Sample rate of the audio signal
            f_min: Minimum frequency of the mel filterbank, in Hz
            f_max: Maximum frequency, in Hz (defaults to sample_rate // 2)
            n_mels: Number of mel filter banks
            norm: Filterbank normalization method ("slaney" or None)
            mel_scale: Mel scale variant to use ("htk" or "slaney")
            (other parameters have the same meaning as in Spectrogram)
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
        Returns:
            Tensor: Mel spectrogram (..., n_mels, time)
        """
class MelScale(torch.nn.Module):
    """Convert a linear-frequency spectrogram to a mel-scale spectrogram."""

    def __init__(self, n_mels: int = 128, sample_rate: int = 16000, f_min: float = 0.0,
                 f_max: Optional[float] = None, n_stft: Optional[int] = None,
                 norm: Optional[str] = None, mel_scale: str = "htk") -> None:
        """
        Args:
            n_mels: Number of mel filter banks
            sample_rate: Sample rate of the audio signal
            f_min: Minimum frequency, in Hz
            f_max: Maximum frequency, in Hz
            n_stft: Number of STFT frequency bins (typically n_fft // 2 + 1)
            norm: Filterbank normalization method ("slaney" or None)
            mel_scale: Mel scale variant to use ("htk" or "slaney")
        """

    def forward(self, specgram: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Linear-frequency input spectrogram (..., freq, time)
        Returns:
            Tensor: Mel-scale spectrogram (..., n_mels, time)
        """
class InverseMelScale(torch.nn.Module):
    """Estimate a linear-frequency spectrogram from a mel-scale spectrogram.

    The mel filterbank is not invertible, so the linear spectrogram is solved
    for iteratively (optimization-based approximation).
    """

    def __init__(self, n_stft: int, n_mels: int = 128, sample_rate: int = 16000,
                 f_min: float = 0.0, f_max: Optional[float] = None,
                 max_iter: int = 100000, tolerance_loss: float = 1e-5,
                 tolerance_change: float = 1e-8, sgdargs: Optional[Dict[str, Any]] = None,
                 norm: Optional[str] = None, mel_scale: str = "htk") -> None:
        """
        Args:
            n_stft: Number of STFT frequency bins in the output
            max_iter: Maximum number of optimization iterations. Default: 100000
            tolerance_loss: Stop when the loss improves by less than this. Default: 1e-5
            tolerance_change: Stop when parameters change by less than this. Default: 1e-8
            sgdargs: Keyword arguments for the SGD optimizer. Default: None
            (other parameters have the same meaning as in MelScale)
        """

    def forward(self, melspec: torch.Tensor) -> torch.Tensor:
        """
        Args:
            melspec: Mel-scale spectrogram (..., n_mels, time)
        Returns:
            Tensor: Estimated linear spectrogram (..., n_stft, time)
        """


# Transforms for extracting common audio features.
class MFCC(torch.nn.Module):
    """Compute Mel-frequency cepstral coefficients (DCT of a mel spectrogram)."""

    def __init__(self, sample_rate: int = 16000, n_mfcc: int = 40,
                 dct_type: int = 2, norm: str = "ortho", log_mels: bool = False,
                 melkwargs: Optional[Dict[str, Any]] = None) -> None:
        """
        Args:
            sample_rate: Sample rate of the audio signal
            n_mfcc: Number of MFCC coefficients to retain
            dct_type: DCT type (2 or 3)
            norm: DCT normalization ("ortho" or None)
            log_mels: If True, use log mel spectrograms instead of dB-scaled ones
            melkwargs: Additional keyword arguments for the underlying MelSpectrogram
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
        Returns:
            Tensor: MFCC coefficients (..., n_mfcc, time)
        """
class LFCC(torch.nn.Module):
    """Compute Linear-frequency cepstral coefficients (DCT of a linear-filterbank spectrogram)."""

    def __init__(self, sample_rate: int = 16000, n_lfcc: int = 40,
                 speckwargs: Optional[Dict[str, Any]] = None, n_filter: int = 128,
                 f_min: float = 0.0, f_max: Optional[float] = None,
                 dct_type: int = 2, norm: str = "ortho", log_lf: bool = False) -> None:
        """
        Args:
            sample_rate: Sample rate of the audio signal
            n_lfcc: Number of LFCC coefficients to retain
            speckwargs: Additional keyword arguments for the underlying Spectrogram
            n_filter: Number of linear filter banks
            f_min: Minimum frequency, in Hz
            f_max: Maximum frequency, in Hz
            dct_type: DCT type (2 or 3)
            norm: DCT normalization ("ortho" or None)
            log_lf: If True, use log linear spectrograms instead of dB-scaled ones
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
        Returns:
            Tensor: LFCC coefficients (..., n_lfcc, time)
        """
class ComputeDeltas(torch.nn.Module):
    """Compute delta features (first-order differences over time) of input features."""

    def __init__(self, win_length: int = 5, mode: str = "replicate") -> None:
        """
        Args:
            win_length: Window length used for the delta computation
            mode: Padding mode applied at the edges when computing deltas
        """

    def forward(self, specgram: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Input features (..., freq, time)
        Returns:
            Tensor: Delta features with the same shape as the input
        """
class SpectralCentroid(torch.nn.Module):
    """Compute the spectral centroid (frequency-weighted mean of the spectrum) per frame."""

    def __init__(self, sample_rate: int, n_fft: int = 400, win_length: Optional[int] = None,
                 hop_length: Optional[int] = None, pad: int = 0,
                 window_fn: Callable[..., torch.Tensor] = torch.hann_window,
                 wkwargs: Optional[Dict[str, Any]] = None) -> None:
        """
        Args:
            sample_rate: Sample rate of the audio signal (required to map bins to Hz)
            (other parameters have the same meaning as in Spectrogram)
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
        Returns:
            Tensor: Spectral centroid per frame (..., time)
        """
class Loudness(torch.nn.Module):
    """Measure integrated loudness following the ITU-R BS.1770-4 standard."""

    def __init__(self, sample_rate: int) -> None:
        """
        Args:
            sample_rate: Sample rate of the audio signal
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
                NOTE(review): torchaudio's implementation expects a channel
                dimension, i.e. (..., channels, time) — confirm against caller
        Returns:
            Tensor: Loudness estimate in LUFS
        """


# Transforms for amplitude scaling and audio encoding.
class AmplitudeToDB(torch.nn.Module):
    """Convert an amplitude (or power) spectrogram to the decibel scale."""

    def __init__(self, stype: str = "power", top_db: Optional[float] = None) -> None:
        """
        Args:
            stype: Scale of the input spectrogram ("power" or "magnitude")
            top_db: Minimum negative cut-off in decibels; values below
                (max - top_db) are clamped when given
        """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input spectrogram (..., freq, time)
        Returns:
            Tensor: Spectrogram in decibel scale, same shape as the input
        """
class MuLawEncoding(torch.nn.Module):
    """Encode a waveform using mu-law companding."""

    def __init__(self, quantization_channels: int = 256) -> None:
        """
        Args:
            quantization_channels: Number of quantization levels (mu + 1)
        """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input waveform (..., time)
        Returns:
            Tensor: Mu-law encoded signal
        """
class MuLawDecoding(torch.nn.Module):
    """Decode a mu-law encoded signal back to a waveform (inverse of MuLawEncoding)."""

    def __init__(self, quantization_channels: int = 256) -> None:
        """
        Args:
            quantization_channels: Number of quantization levels (mu + 1);
                must match the value used for encoding
        """

    def forward(self, x_mu: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x_mu: Mu-law encoded signal (..., time)
        Returns:
            Tensor: Decoded waveform
        """


# Transforms for changing sample rates and temporal characteristics.
class Resample(torch.nn.Module):
    """Resample a waveform from one sample rate to another using windowed sinc interpolation."""

    def __init__(self, orig_freq: int = 16000, new_freq: int = 16000,
                 # Default corrected to "sinc_interp_hann" to match torchaudio >= 2.0
                 # (the "sinc_interp_*" naming used here was introduced in 2.0, whose
                 # default method is Hann-windowed sinc interpolation, not Kaiser).
                 resampling_method: str = "sinc_interp_hann",
                 lowpass_filter_width: int = 6, rolloff: float = 0.99,
                 beta: Optional[float] = None, dtype: torch.dtype = torch.float32) -> None:
        """
        Args:
            orig_freq: Original sample rate. Default: 16000
            new_freq: Target sample rate. Default: 16000
            resampling_method: Resampling algorithm, one of
                "sinc_interp_hann" or "sinc_interp_kaiser". Default: "sinc_interp_hann"
            lowpass_filter_width: Width of the lowpass filter. Default: 6
            rolloff: Roll-off frequency as a fraction of Nyquist. Default: 0.99
            beta: Shape parameter for the Kaiser window. Default: None
            dtype: dtype of the precomputed resampling kernel. Default: torch.float32
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
        Returns:
            Tensor: Resampled waveform (..., time')
        """
class Speed(torch.nn.Module):
    """Adjust playback speed of a waveform by resampling (changes duration and pitch)."""

    def __init__(self, orig_freq: int, factor: float) -> None:
        """
        Args:
            orig_freq: Original sample rate of the signal
            factor: Speed factor (>1.0 = faster/shorter, <1.0 = slower/longer)
        """

    def forward(self, waveform: torch.Tensor, lengths: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
            lengths: Valid length of each sequence in the batch, if batched
        Returns:
            Tensor: Speed-adjusted waveform
        """
class TimeStretch(torch.nn.Module):
    """Stretch the time axis of a spectrogram without changing pitch (phase vocoder)."""

    def __init__(self, hop_length: Optional[int] = None, n_freq: int = 201,
                 fixed_rate: Optional[float] = None) -> None:
        """
        Args:
            hop_length: Hop length used by the phase vocoder
            n_freq: Number of frequency bins in the input spectrogram
            fixed_rate: Fixed stretch rate (None to pass the rate per call)
        """

    def forward(self, complex_specgrams: torch.Tensor, rate: float = 1.0) -> torch.Tensor:
        """
        Args:
            complex_specgrams: Complex spectrogram (..., freq, time)
            rate: Speed-up rate (>1.0 = faster/fewer frames, <1.0 = slower/more frames)
        Returns:
            Tensor: Time-stretched spectrogram
        """
class PitchShift(torch.nn.Module):
    """Shift the pitch of a waveform without changing its duration."""

    def __init__(self, sample_rate: int, n_steps: float, bins_per_octave: int = 12,
                 n_fft: int = 512, win_length: Optional[int] = None,
                 hop_length: Optional[int] = None,
                 window: Optional[torch.Tensor] = None) -> None:
        """
        Args:
            sample_rate: Sample rate of the audio signal
            n_steps: Number of steps to shift (in units of bins_per_octave;
                12 bins per octave makes one step a semitone)
            bins_per_octave: Number of steps per octave. Default: 12
            n_fft: FFT size. Default: 512
            win_length: Window length. Default: None
            hop_length: Hop length. Default: None
            window: Precomputed window tensor. Default: None
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
        Returns:
            Tensor: Pitch-shifted waveform, same duration as the input
        """


# Transforms for data augmentation in machine learning training.
class FrequencyMasking(torch.nn.Module):
    """Apply masking along the frequency axis of a spectrogram (SpecAugment-style)."""

    def __init__(self, freq_mask_param: int, iid_masks: bool = False) -> None:
        """
        Args:
            freq_mask_param: Maximum length of the frequency mask, in bins
            iid_masks: Whether to apply an independent mask to each example in a batch
        """

    def forward(self, specgram: torch.Tensor, mask_value: float = 0.0) -> torch.Tensor:
        """
        Args:
            specgram: Input spectrogram (..., freq, time)
            mask_value: Fill value for masked regions
        Returns:
            Tensor: Masked spectrogram, same shape as the input
        """
class TimeMasking(torch.nn.Module):
    """Apply masking along the time axis of a spectrogram (SpecAugment-style)."""

    def __init__(self, time_mask_param: int, iid_masks: bool = False, p: float = 1.0) -> None:
        """
        Args:
            time_mask_param: Maximum length of the time mask, in frames
            iid_masks: Whether to apply an independent mask to each example in a batch
            p: Maximum proportion of time steps that can be masked
        """

    def forward(self, specgram: torch.Tensor, mask_value: float = 0.0) -> torch.Tensor:
        """
        Args:
            specgram: Input spectrogram (..., freq, time)
            mask_value: Fill value for masked regions
        Returns:
            Tensor: Masked spectrogram, same shape as the input
        """
class SpecAugment(torch.nn.Module):
    """Apply the SpecAugment augmentation (multiple time and frequency masks)."""

    def __init__(self, n_time_masks: int = 1, time_mask_param: int = 80,
                 n_freq_masks: int = 1, freq_mask_param: int = 80,
                 iid_masks: bool = False) -> None:
        """
        Args:
            n_time_masks: Number of time masks to apply
            time_mask_param: Maximum length of each time mask, in frames
            n_freq_masks: Number of frequency masks to apply
            freq_mask_param: Maximum length of each frequency mask, in bins
            iid_masks: Whether to apply independent masks per batch example
        """

    def forward(self, specgram: torch.Tensor, mask_value: float = 0.0) -> torch.Tensor:
        """
        Args:
            specgram: Input spectrogram (..., freq, time)
            mask_value: Fill value for masked regions
        Returns:
            Tensor: Augmented spectrogram, same shape as the input
        """
class AddNoise(torch.nn.Module):
    """Scale and add a noise signal to a waveform at a given signal-to-noise ratio."""

    def __init__(self, noise: torch.Tensor, snr: torch.Tensor,
                 lengths: Optional[torch.Tensor] = None) -> None:
        """
        Args:
            noise: Noise tensor to add to the input
            snr: Target signal-to-noise ratio, in dB
            lengths: Valid length of each sequence in the batch, if batched
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
        Returns:
            Tensor: Waveform with noise added at the configured SNR
        """
class SpeedPerturbation(torch.nn.Module):
    """Speed-perturbation augmentation: each call adjusts speed by a factor sampled from a given set."""

    def __init__(self, orig_freq: int, factors: Sequence[float]) -> None:
        """
        Args:
            orig_freq: Original sample rate of the signals
            factors: Candidate speed factors to sample from; values > 1.0
                compress time (faster), values < 1.0 stretch time (slower)
        """

    def forward(self, waveform: torch.Tensor, lengths: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Args:
            waveform: Input signals (..., time)
            lengths: Valid lengths of the signals (...). Default: None
        Returns:
            Tuple[Tensor, Optional[Tensor]]: Speed-adjusted waveform and the
                correspondingly updated lengths (None if lengths was None)
        """


# Basic audio processing transforms for volume, fading, and emphasis.
class Fade(torch.nn.Module):
    """Apply a fade-in and/or fade-out envelope to a waveform."""

    def __init__(self, fade_in_len: int = 0, fade_out_len: int = 0, fade_shape: str = "linear") -> None:
        """
        Args:
            fade_in_len: Length of the fade-in, in time frames. Default: 0
            fade_out_len: Length of the fade-out, in time frames. Default: 0
            fade_shape: Shape of the fade curve. One of: "quarter_sine", "half_sine",
                "linear", "logarithmic", "exponential". Default: "linear"
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
        Returns:
            Tensor: Faded waveform with the same shape as the input
        """
class Vol(torch.nn.Module):
    """Adjust the volume of a waveform by a fixed gain."""

    def __init__(self, gain: float, gain_type: str = "amplitude") -> None:
        """
        Args:
            gain: Gain value, interpreted according to gain_type:
                - amplitude: positive amplitude ratio
                - power: power ratio (voltage squared)
                - db: gain in decibels
            gain_type: Type of gain. One of: "amplitude", "power", "db".
                Default: "amplitude"
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
        Returns:
            Tensor: Volume-adjusted waveform with the same shape as the input
        """
class Preemphasis(torch.nn.Module):
    """Pre-emphasize a waveform along its last dimension (boosts high frequencies)."""

    def __init__(self, coeff: float = 0.97) -> None:
        """
        Args:
            coeff: Pre-emphasis coefficient, typically between 0.0 and 1.0.
                Default: 0.97
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
        Returns:
            Tensor: Pre-emphasized waveform with the same shape as the input
        """
class Deemphasis(torch.nn.Module):
    """De-emphasize a waveform along its last dimension (inverse of Preemphasis)."""

    def __init__(self, coeff: float = 0.97) -> None:
        """
        Args:
            coeff: De-emphasis coefficient, typically between 0.0 and 1.0;
                should match the coefficient used for pre-emphasis. Default: 0.97
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
        Returns:
            Tensor: De-emphasized waveform with the same shape as the input
        """


# Convolution-based transforms for audio processing.
class Convolve(torch.nn.Module):
    """Convolve two inputs along their last dimension using the direct (time-domain) method."""

    def __init__(self, mode: str = "full") -> None:
        """
        Args:
            mode: Must be one of ("full", "valid", "same").
                - "full": Returns the full convolution result (..., N + M - 1)
                - "valid": Returns the fully-overlapping segment
                  (..., max(N, M) - min(N, M) + 1)
                - "same": Returns the center segment (..., N)
                Default: "full"
        """

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: First convolution operand (..., N)
            y: Second convolution operand (..., M)
        Returns:
            Tensor: Convolution result with shape dictated by mode
        """
class FFTConvolve(torch.nn.Module):
    """Convolve two inputs along their last dimension using the FFT.

    Mathematically equivalent to Convolve but much faster for large inputs.
    """

    def __init__(self, mode: str = "full") -> None:
        """
        Args:
            mode: Must be one of ("full", "valid", "same"); same semantics as
                Convolve. Default: "full"
        """

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: First convolution operand (..., N)
            y: Second convolution operand (..., M)
        Returns:
            Tensor: FFT convolution result (always real/float tensors)
        """


# Advanced multi-channel transforms for beamforming and array processing.
class PSD(torch.nn.Module):
    """Compute the cross-channel power spectral density (PSD) matrix of a multi-channel spectrum."""

    def __init__(self, multi_mask: bool = False, normalize: bool = True, eps: float = 1e-15) -> None:
        """
        Args:
            multi_mask: If True, only accepts multi-channel Time-Frequency masks.
                Default: False
            normalize: If True, normalize the mask along the time dimension.
                Default: True
            eps: Value added to the denominator in mask normalization to avoid
                division by zero. Default: 1e-15
        """

    def forward(self, specgram: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Args:
            specgram: Multi-channel complex-valued spectrum (..., channel, freq, time)
            mask: Optional Time-Frequency mask for normalization, shaped
                (..., freq, time) or (..., channel, freq, time)
        Returns:
            Tensor: Complex-valued PSD matrix (..., freq, channel, channel)
        """
class MVDR(torch.nn.Module):
    """Minimum Variance Distortionless Response (MVDR) beamforming driven by Time-Frequency masks."""

    def __init__(self, ref_channel: int = 0, solution: str = "ref_channel",
                 multi_mask: bool = False, diag_loading: bool = True,
                 diag_eps: float = 1e-7, online: bool = False) -> None:
        """
        Args:
            ref_channel: Reference channel for beamforming. Default: 0
            solution: Solution method. One of ["ref_channel", "stv_evd", "stv_power"].
                Default: "ref_channel"
            multi_mask: If True, accepts multi-channel masks. Default: False
            diag_loading: If True, applies diagonal loading to the noise covariance
                matrix (improves numerical stability of the inverse). Default: True
            diag_eps: Diagonal loading coefficient. Default: 1e-7
            online: If True, updates weights based on previous covariance matrices
                (streaming use). Default: False
        """

    def forward(self, specgram: torch.Tensor, mask_s: torch.Tensor, mask_n: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Multi-channel noisy spectrum (..., channel, freq, time)
            mask_s: Time-Frequency mask for the target speech
            mask_n: Time-Frequency mask for the noise
        Returns:
            Tensor: Enhanced single-channel spectrum (..., freq, time)
        """
class SoudenMVDR(torch.nn.Module):
    """MVDR beamforming using Souden's solution (built directly from speech/noise PSD matrices)."""

    def __init__(self, ref_channel: int = 0, multi_mask: bool = False,
                 diag_loading: bool = True, diag_eps: float = 1e-7) -> None:
        """
        Args:
            ref_channel: Reference channel for beamforming. Default: 0
            multi_mask: If True, accepts multi-channel masks. Default: False
            diag_loading: If True, applies diagonal loading. Default: True
            diag_eps: Diagonal loading coefficient. Default: 1e-7
        """

    def forward(self, specgram: torch.Tensor, mask_s: torch.Tensor, mask_n: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Multi-channel noisy spectrum (..., channel, freq, time)
            mask_s: Time-Frequency mask for the target speech
            mask_n: Time-Frequency mask for the noise
        Returns:
            Tensor: Enhanced single-channel spectrum (..., freq, time)
        """
class RTFMVDR(torch.nn.Module):
    """MVDR beamforming based on the Relative Transfer Function (RTF) of the target source."""

    def __init__(self, ref_channel: int = 0, multi_mask: bool = False,
                 diag_loading: bool = True, diag_eps: float = 1e-7) -> None:
        """
        Args:
            ref_channel: Reference channel for beamforming. Default: 0
            multi_mask: If True, accepts multi-channel masks. Default: False
            diag_loading: If True, applies diagonal loading. Default: True
            diag_eps: Diagonal loading coefficient. Default: 1e-7
        """

    def forward(self, specgram: torch.Tensor, mask_s: torch.Tensor, mask_n: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Multi-channel noisy spectrum (..., channel, freq, time)
            mask_s: Time-Frequency mask for the target speech
            mask_n: Time-Frequency mask for the noise
        Returns:
            Tensor: Enhanced single-channel spectrum (..., freq, time)
        """


# Specialized transforms for feature processing and analysis.
class SlidingWindowCmn(torch.nn.Module):
    """Apply sliding-window cepstral mean (and optionally variance) normalization per utterance."""

    def __init__(self, cmn_window: int = 600, min_cmn_window: int = 100,
                 center: bool = False, norm_vars: bool = False) -> None:
        """
        Args:
            cmn_window: Window size, in frames, for the running-average CMN
                computation. Default: 600
            min_cmn_window: Minimum CMN window used at the start of decoding.
                Default: 100
            center: If True, use a centered window; if False, the window is to
                the left of the current frame. Default: False
            norm_vars: If True, also normalize the variance to one. Default: False
        """

    def forward(self, specgram: torch.Tensor) -> torch.Tensor:
        """
        Args:
            specgram: Spectrogram (..., time, freq) — note time-major layout
        Returns:
            Tensor: CMN-normalized spectrogram with the same shape
        """
class Vad(torch.nn.Module):
    """Voice Activity Detector, modeled after the SoX `vad` effect."""

    def __init__(self, sample_rate: int, trigger_level: float = 7.0, trigger_time: float = 0.25,
                 search_time: float = 1.0, allowed_gap: float = 0.25, pre_trigger_time: float = 0.0,
                 boot_time: float = 0.35, noise_up_time: float = 0.1, noise_down_time: float = 0.01,
                 noise_reduction_amount: float = 1.35, measure_freq: float = 20.0,
                 measure_duration: Optional[float] = None, measure_smooth_time: float = 0.4,
                 hp_filter_freq: float = 50.0, lp_filter_freq: float = 6000.0,
                 hp_lifter_freq: float = 150.0, lp_lifter_freq: float = 2000.0) -> None:
        """
        Args:
            sample_rate: Sample rate of the audio signal
            trigger_level: Measurement level used to trigger activity detection. Default: 7.0
            trigger_time: Time constant, in seconds, to help ignore short bursts. Default: 0.25
            search_time: Amount of audio, in seconds, to search for quieter bursts. Default: 1.0
            allowed_gap: Allowed gap, in seconds, between quieter bursts. Default: 0.25
            pre_trigger_time: Amount of audio to preserve before the trigger. Default: 0.0
            boot_time: Time used for the initial noise estimate. Default: 0.35
            noise_up_time: Time constant for an increasing noise level. Default: 0.1
            noise_down_time: Time constant for a decreasing noise level. Default: 0.01
            noise_reduction_amount: Amount of noise reduction applied while measuring. Default: 1.35
            measure_freq: Frequency of the algorithm's internal processing. Default: 20.0
            measure_duration: Measurement duration. Default: None (twice the measurement period)
            measure_smooth_time: Time constant for spectral smoothing. Default: 0.4
            hp_filter_freq: High-pass filter frequency, in Hz. Default: 50.0
            lp_filter_freq: Low-pass filter frequency, in Hz. Default: 6000.0
            hp_lifter_freq: High-pass lifter frequency, in Hz. Default: 150.0
            lp_lifter_freq: Low-pass lifter frequency, in Hz. Default: 2000.0
        """

    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
        """
        Args:
            waveform: Input audio tensor (..., time)
        Returns:
            Tensor: Audio with the leading non-speech portion trimmed,
                following SoX vad semantics
        """


# Loss functions for training neural networks with audio data.
class RNNTLoss(torch.nn.Module):
    """Compute the RNN Transducer loss (Graves, "Sequence Transduction with Recurrent Neural Networks")."""

    def __init__(self, blank: int = -1, clamp: float = -1.0, reduction: str = "mean",
                 fused_log_softmax: bool = True) -> None:
        """
        Args:
            blank: Index of the blank label. Default: -1
            clamp: Clamp applied to gradients; negative disables clamping. Default: -1.0
            reduction: Reduction to apply to the output: "none", "mean", or "sum".
                Default: "mean"
            fused_log_softmax: Set to False when calling log_softmax outside of
                the loss. Default: True
        """

    def forward(self, logits: torch.Tensor, targets: torch.Tensor, logit_lengths: torch.Tensor,
                target_lengths: torch.Tensor) -> torch.Tensor:
        """
        Args:
            logits: Tensor of shape (N, T, U, V) where N=batch, T=time,
                U=target length, V=vocabulary size
            targets: Tensor of shape (N, S) where S=target sequence length
            logit_lengths: Tensor of shape (N,) with the valid lengths of logits
            target_lengths: Tensor of shape (N,) with the valid lengths of targets
        Returns:
            Tensor: RNN Transducer loss (reduced per `reduction`)
        """


# Usage example combining multiple transforms:
# Example: compose several transforms into a single preprocessing pipeline.
import torch
import torchaudio
from torchaudio import transforms as T

# Create a processing pipeline
transform_pipeline = torch.nn.Sequential(
    T.Resample(orig_freq=44100, new_freq=16000),   # Resample to 16 kHz
    T.MelSpectrogram(
        sample_rate=16000,
        n_fft=1024,
        hop_length=256,
        n_mels=80,
    ),                                             # Convert to mel spectrogram
    T.AmplitudeToDB(stype="power"),                # Convert to dB scale
    T.FrequencyMasking(freq_mask_param=15),        # Apply frequency masking
    T.TimeMasking(time_mask_param=35),             # Apply time masking
)

# Load and process audio
waveform, orig_sr = torchaudio.load("audio.wav")
processed = transform_pipeline(waveform)

# These transforms provide the building blocks for creating sophisticated audio
# processing pipelines that integrate seamlessly with PyTorch's neural network
# ecosystem.
Install with Tessl CLI
npx tessl i tessl/pypi-torchaudio