An audio package for PyTorch providing GPU-accelerated audio I/O operations, signal processing transforms, and machine learning utilities for audio data.
Extensive collection of stateless audio processing functions for spectral analysis, filtering, resampling, pitch manipulation, and advanced signal processing algorithms. These functions operate directly on tensors and are compatible with PyTorch's autograd system for gradient-based optimization.
Core spectral analysis functions for converting between time and frequency domains.
def spectrogram(waveform: torch.Tensor, pad: int, window: torch.Tensor,
n_fft: int, hop_length: int, win_length: int,
power: Optional[float], normalized: Union[bool, str],
center: bool = True, pad_mode: str = "reflect",
onesided: bool = True, return_complex: Optional[bool] = None) -> torch.Tensor:
"""
Create spectrogram from waveform.
Args:
waveform: Tensor of audio of dimension (..., time)
pad: Two sided padding of signal
window: Window tensor that is applied/multiplied to each frame/window
n_fft: Size of FFT
hop_length: Length of hop between STFT windows
win_length: Window size
power: Exponent for the magnitude spectrogram (must be > 0) e.g., 1 for magnitude, 2 for power, etc. If None, then the complex spectrum is returned instead.
normalized: Whether to normalize by magnitude after stft. If input is str, choices are "window" and "frame_length", if specific normalization type is desirable. True maps to "window".
center: whether to pad waveform on both sides so that the t-th frame is centered at time t × hop_length
pad_mode: controls the padding method used when center is True
onesided: controls whether to return half of results to avoid redundancy
return_complex: Deprecated, use power=None instead
Returns:
Tensor: Spectrogram with shape (..., freq, time)
"""
def inverse_spectrogram(spectrogram: torch.Tensor, length: Optional[int] = None,
pad: int = 0, window: Optional[torch.Tensor] = None,
n_fft: int = 400, hop_length: Optional[int] = None,
win_length: Optional[int] = None, normalized: bool = False,
center: bool = True, pad_mode: str = "reflect",
onesided: bool = True) -> torch.Tensor:
"""
Reconstruct waveform from spectrogram using inverse STFT.
Args:
spectrogram: Input spectrogram (..., freq, time)
length: Expected length of output
(other parameters same as spectrogram)
Returns:
Tensor: Reconstructed waveform (..., time)
"""
def griffinlim(spectrogram: torch.Tensor, window: Optional[torch.Tensor] = None,
n_fft: int = 400, hop_length: Optional[int] = None,
win_length: Optional[int] = None, power: float = 2.0,
n_iter: int = 32, momentum: float = 0.99,
length: Optional[int] = None, rand_init: bool = True) -> torch.Tensor:
"""
Reconstruct waveform from magnitude spectrogram using Griffin-Lim algorithm.
Args:
spectrogram: Magnitude spectrogram (..., freq, time)
window: Window function
n_fft: Size of FFT
hop_length: Length of hop between STFT windows
win_length: Window size
power: Exponent applied to spectrogram
n_iter: Number of Griffin-Lim iterations
momentum: Momentum parameter for fast Griffin-Lim
length: Expected output length
rand_init: Whether to initialize with random phase
Returns:
Tensor: Reconstructed waveform (..., time)
"""Functions for mel-scale analysis commonly used in speech and music processing.
def melscale_fbanks(n_freqs: int, f_min: float, f_max: float, n_mels: int,
sample_rate: int, norm: Optional[str] = None,
mel_scale: str = "htk") -> torch.Tensor:
"""
Create mel-scale filter banks.
Args:
n_freqs: Number of frequency bins (typically n_fft // 2 + 1)
f_min: Minimum frequency
f_max: Maximum frequency
n_mels: Number of mel filter banks
sample_rate: Sample rate of audio
norm: Normalization method ("slaney" or None)
mel_scale: Scale to use ("htk" or "slaney")
Returns:
Tensor: Mel filter bank matrix (n_mels, n_freqs)
"""
def linear_fbanks(n_freqs: int, f_min: float, f_max: float, n_filter: int,
sample_rate: int) -> torch.Tensor:
"""
Create linear-spaced filter banks.
Args:
n_freqs: Number of frequency bins
f_min: Minimum frequency
f_max: Maximum frequency
n_filter: Number of linear filter banks
sample_rate: Sample rate of audio
Returns:
Tensor: Linear filter bank matrix (n_filter, n_freqs)
"""Functions for converting between linear amplitude and logarithmic decibel scales.
def amplitude_to_DB(x: torch.Tensor, multiplier: float = 10.0, amin: float = 1e-10,
db_multiplier: float = 0.0, top_db: Optional[float] = None) -> torch.Tensor:
"""
Convert amplitude spectrogram to decibel scale.
Args:
x: Input tensor (amplitude or power spectrogram)
multiplier: Multiplier for log10 (10.0 for power, 20.0 for amplitude)
amin: Minimum value to clamp x
db_multiplier: Additional multiplier for result
top_db: Minimum negative cut-off in decibels
Returns:
Tensor: Spectrogram in decibel scale
"""
def DB_to_amplitude(x: torch.Tensor, ref: float = 1.0, power: float = 1.0) -> torch.Tensor:
"""
Convert decibel scale back to amplitude.
Args:
x: Input tensor in decibel scale
ref: Reference value
power: Power exponent (1.0 for amplitude, 2.0 for power)
Returns:
Tensor: Amplitude spectrogram
"""Audio resampling for sample rate conversion.
def resample(waveform: torch.Tensor, orig_freq: int, new_freq: int,
resampling_method: str = "sinc_interp_hann",
lowpass_filter_width: int = 6, rolloff: float = 0.99,
beta: Optional[float] = None) -> torch.Tensor:
"""
Resample waveform to different sample rate.
Args:
waveform: Input waveform tensor (..., time)
orig_freq: Original sample rate
new_freq: Target sample rate
resampling_method: Resampling algorithm ("sinc_interp_hann" or "sinc_interp_kaiser")
lowpass_filter_width: Width of lowpass filter
rolloff: Roll-off frequency of lowpass filter
beta: Shape parameter for Kaiser window
Returns:
Tensor: Resampled waveform
"""Comprehensive collection of audio filters and effects.
def biquad(waveform: torch.Tensor, b0: float, b1: float, b2: float,
a0: float, a1: float, a2: float) -> torch.Tensor:
"""
Apply biquad IIR filter.
Args:
waveform: Input audio (..., time)
b0, b1, b2: Numerator coefficients
a0, a1, a2: Denominator coefficients
Returns:
Tensor: Filtered audio
"""
def allpass_biquad(waveform: torch.Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> torch.Tensor:
"""
Design two-pole all-pass filter. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
sample_rate: Sampling rate of the waveform, e.g. 44100 (Hz)
central_freq: Central frequency (in Hz)
Q: Q factor (Default: 0.707)
Returns:
Tensor: Waveform of dimension (..., time)
"""
def band_biquad(waveform: torch.Tensor, sample_rate: int, central_freq: float,
Q: float = 0.707, noise: bool = False) -> torch.Tensor:
"""
Design two-pole band filter. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
sample_rate: Sampling rate of the waveform, e.g. 44100 (Hz)
central_freq: Central frequency (in Hz)
Q: Q factor (Default: 0.707)
noise: Add noise to the filter
Returns:
Tensor: Waveform of dimension (..., time)
"""
def bandpass_biquad(waveform: torch.Tensor, sample_rate: int, central_freq: float,
Q: float = 0.707, const_skirt_gain: bool = False) -> torch.Tensor:
"""
Design two-pole band-pass filter. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
sample_rate: Sampling rate of the waveform, e.g. 44100 (Hz)
central_freq: Central frequency (in Hz)
Q: Q factor (Default: 0.707)
const_skirt_gain: Constant skirt gain
Returns:
Tensor: Waveform of dimension (..., time)
"""
def bandreject_biquad(waveform: torch.Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> torch.Tensor:
"""
Design two-pole band-reject filter. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
sample_rate: Sampling rate of the waveform, e.g. 44100 (Hz)
central_freq: Central frequency (in Hz)
Q: Q factor (Default: 0.707)
Returns:
Tensor: Waveform of dimension (..., time)
"""
def bass_biquad(waveform: torch.Tensor, sample_rate: int, gain: float,
central_freq: float = 100, Q: float = 0.707) -> torch.Tensor:
"""
Design a bass tone-control effect. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
sample_rate: Sampling rate of the waveform, e.g. 44100 (Hz)
gain: Gain in dB
central_freq: Central frequency (in Hz, default: 100)
Q: Q factor (Default: 0.707)
Returns:
Tensor: Waveform of dimension (..., time)
"""
def contrast(waveform: torch.Tensor, enhancement_amount: float = 75.0) -> torch.Tensor:
"""
Apply contrast effect. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
enhancement_amount: Enhancement amount (default: 75.0)
Returns:
Tensor: Waveform of dimension (..., time)
"""
def dcshift(waveform: torch.Tensor, shift: float, limiter_gain: Optional[float] = None) -> torch.Tensor:
"""
Apply a DC shift to the audio. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
shift: DC shift amount
limiter_gain: Optional limiter gain
Returns:
Tensor: Waveform of dimension (..., time)
"""
def deemph_biquad(waveform: torch.Tensor, sample_rate: int) -> torch.Tensor:
"""
Apply ISO 908 CD de-emphasis (shelving) IIR filter. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
sample_rate: Sampling rate of the waveform
Returns:
Tensor: Waveform of dimension (..., time)
"""
def dither(waveform: torch.Tensor, density_function: str = "TPDF", noise_shaping: bool = False) -> torch.Tensor:
"""
Apply dither. Dither increases the perceived dynamic range of audio stored at a particular bit-depth.
Args:
waveform: Audio waveform of dimension (..., time)
density_function: Density function ("TPDF", "RPDF", "GPDF")
noise_shaping: Apply noise shaping
Returns:
Tensor: Dithered waveform
"""
def equalizer_biquad(waveform: torch.Tensor, sample_rate: int, center_freq: float,
gain: float, Q: float = 0.707) -> torch.Tensor:
"""
Design biquad peaking equalizer filter and perform filtering. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
sample_rate: Sampling rate of the waveform
center_freq: Center frequency (in Hz)
gain: Gain in dB
Q: Q factor (Default: 0.707)
Returns:
Tensor: Waveform of dimension (..., time)
"""
def filtfilt(waveform: torch.Tensor, a_coeffs: torch.Tensor, b_coeffs: torch.Tensor,
clamp: bool = True) -> torch.Tensor:
"""
Apply an IIR filter forward and backward to a waveform. Inspired by scipy.signal.filtfilt.
Args:
waveform: Input waveform (..., time)
a_coeffs: Denominator coefficients of the filter
b_coeffs: Numerator coefficients of the filter
clamp: Clamp intermediate values
Returns:
Tensor: Zero-phase filtered waveform
"""
def flanger(waveform: torch.Tensor, sample_rate: int, delay: float = 0.0,
depth: float = 2.0, regen: float = 0.0, width: float = 71.0,
speed: float = 0.5, phase: float = 25.0, modulation: str = "sinusoidal",
interpolation: str = "linear") -> torch.Tensor:
"""
Apply a flanger effect to the audio. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., channel, time)
sample_rate: Sampling rate of the waveform
delay: Base delay in milliseconds
depth: Delay depth in milliseconds
regen: Regeneration (feedback) in percent
width: Delay line width in percent
speed: Modulation speed in Hz
phase: Phase in percent
modulation: Modulation type ("sinusoidal" or "triangular")
interpolation: Interpolation type ("linear" or "quadratic")
Returns:
Tensor: Waveform of dimension (..., channel, time)
"""
def gain(waveform: torch.Tensor, gain_db: float = 1.0) -> torch.Tensor:
"""
Apply amplification or attenuation to the whole waveform.
Args:
waveform: Audio waveform of dimension (..., time)
gain_db: Gain in decibels
Returns:
Tensor: Amplified waveform
"""
def highpass_biquad(waveform: torch.Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> torch.Tensor:
"""
Design biquad highpass filter and perform filtering. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
sample_rate: Sampling rate of the waveform
cutoff_freq: Cutoff frequency
Q: Q factor (Default: 0.707)
Returns:
Tensor: Waveform of dimension (..., time)
"""
def lfilter(waveform: torch.Tensor, a_coeffs: torch.Tensor, b_coeffs: torch.Tensor,
clamp: bool = True, batching: bool = True) -> torch.Tensor:
"""
Perform an IIR filter by evaluating difference equation.
Args:
waveform: Input waveform (..., time)
a_coeffs: Denominator coefficients of the filter
b_coeffs: Numerator coefficients of the filter
clamp: Clamp intermediate values
batching: Enable batching optimization
Returns:
Tensor: Filtered waveform
"""
def lowpass_biquad(waveform: torch.Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> torch.Tensor:
"""
Design biquad lowpass filter and perform filtering. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
sample_rate: Sampling rate of the waveform
cutoff_freq: Cutoff frequency
Q: Q factor (Default: 0.707)
Returns:
Tensor: Waveform of dimension (..., time)
"""
def overdrive(waveform: torch.Tensor, gain: float = 20, colour: float = 20) -> torch.Tensor:
"""
Apply an overdrive effect to the audio. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
gain: Gain amount
colour: Colour amount
Returns:
Tensor: Waveform of dimension (..., time)
"""
def phaser(waveform: torch.Tensor, sample_rate: int, gain_in: float = 0.4,
gain_out: float = 0.74, delay_ms: float = 3.0, decay: float = 0.4,
mod_speed: float = 0.5, sinusoidal: bool = True) -> torch.Tensor:
"""
Apply a phasing effect to the audio. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
sample_rate: Sampling rate of the waveform
gain_in: Input gain
gain_out: Output gain
delay_ms: Delay in milliseconds
decay: Decay amount
mod_speed: Modulation speed
sinusoidal: Use sinusoidal modulation
Returns:
Tensor: Waveform of dimension (..., time)
"""
def riaa_biquad(waveform: torch.Tensor, sample_rate: int) -> torch.Tensor:
"""
Apply RIAA vinyl playback equalization. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
sample_rate: Sampling rate of the waveform
Returns:
Tensor: Waveform of dimension (..., time)
"""
def treble_biquad(waveform: torch.Tensor, sample_rate: int, gain: float,
central_freq: float = 3000, Q: float = 0.707) -> torch.Tensor:
"""
Design a treble tone-control effect. Similar to SoX implementation.
Args:
waveform: Audio waveform of dimension (..., time)
sample_rate: Sampling rate of the waveform
gain: Gain in dB
central_freq: Central frequency (in Hz, default: 3000)
Q: Q factor (Default: 0.707)
Returns:
Tensor: Waveform of dimension (..., time)
"""
def vad(waveform: torch.Tensor, sample_rate: int, trigger_level: float = 7.0,
trigger_time: float = 0.25, search_time: float = 1.0,
allowed_gap: float = 0.25, pre_trigger_time: float = 0.0,
boot_time: float = 0.35, noise_up_time: float = 0.1,
noise_down_time: float = 0.01, noise_reduction_amount: float = 1.35,
measure_freq: float = 20.0, measure_duration: Optional[float] = None,
measure_smooth_time: float = 0.4, hp_filter_freq: float = 50.0,
lp_filter_freq: float = 6000.0, hp_lifter_freq: float = 150.0,
lp_lifter_freq: float = 2000.0) -> torch.Tensor:
"""
Voice Activity Detector. Similar to SoX implementation.
Args:
waveform: Tensor of audio of dimension (..., time)
sample_rate: Sample rate of audio
trigger_level: Trigger level (default: 7.0)
trigger_time: Trigger time (default: 0.25)
search_time: Search time (default: 1.0)
allowed_gap: Allowed gap (default: 0.25)
pre_trigger_time: Pre-trigger time (default: 0.0)
boot_time: Boot time (default: 0.35)
noise_up_time: Noise up time (default: 0.1)
noise_down_time: Noise down time (default: 0.01)
noise_reduction_amount: Noise reduction amount (default: 1.35)
measure_freq: Measure frequency (default: 20.0)
measure_duration: Measure duration (optional)
measure_smooth_time: Measure smooth time (default: 0.4)
hp_filter_freq: High-pass filter frequency (default: 50.0)
lp_filter_freq: Low-pass filter frequency (default: 6000.0)
hp_lifter_freq: High-pass lifter frequency (default: 150.0)
lp_lifter_freq: Low-pass lifter frequency (default: 2000.0)
Returns:
Tensor: Audio with silence trimmed
"""Advanced beamforming algorithms for multi-channel audio processing and spatial filtering.
def apply_beamforming(multi_channel_audio: torch.Tensor, beamforming_weights: torch.Tensor) -> torch.Tensor:
"""
Apply beamforming weights to multi-channel audio.
Args:
multi_channel_audio: Multi-channel audio tensor (..., channel, freq, time)
beamforming_weights: Beamforming weights (..., channel, freq)
Returns:
Tensor: Beamformed audio (..., freq, time)
"""
def mvdr_weights_souden(psd_s: torch.Tensor, psd_n: torch.Tensor, reference_vector: torch.Tensor,
diagonal_loading: bool = True, diag_eps: float = 1e-7) -> torch.Tensor:
"""
Compute MVDR (Minimum Variance Distortionless Response) beamforming weights using Souden's method.
Args:
psd_s: Power spectral density matrix of target speech (..., freq, channel, channel)
psd_n: Power spectral density matrix of noise (..., freq, channel, channel)
reference_vector: Reference microphone vector (..., channel)
diagonal_loading: Whether to apply diagonal loading
diag_eps: Diagonal loading factor
Returns:
Tensor: MVDR beamforming weights (..., freq, channel)
"""
def mvdr_weights_rtf(rtf_mat: torch.Tensor, psd_n: torch.Tensor, reference_vector: torch.Tensor,
diagonal_loading: bool = True, diag_eps: float = 1e-7) -> torch.Tensor:
"""
Compute MVDR beamforming weights using Relative Transfer Function (RTF).
Args:
rtf_mat: Relative transfer function matrix (..., freq, channel)
psd_n: Power spectral density matrix of noise (..., freq, channel, channel)
reference_vector: Reference microphone vector (..., channel)
diagonal_loading: Whether to apply diagonal loading
diag_eps: Diagonal loading factor
Returns:
Tensor: MVDR beamforming weights (..., freq, channel)
"""
def rtf_evd(psd_s: torch.Tensor, psd_n: torch.Tensor) -> torch.Tensor:
"""
Estimate relative transfer function (RTF) using eigenvalue decomposition.
Args:
psd_s: Power spectral density matrix of target speech (..., freq, channel, channel)
psd_n: Power spectral density matrix of noise (..., freq, channel, channel)
Returns:
Tensor: RTF matrix (..., freq, channel)
"""
def rtf_power(psd_s: torch.Tensor, psd_n: torch.Tensor, reference_channel: int = 0) -> torch.Tensor:
"""
Estimate relative transfer function (RTF) using power method.
Args:
psd_s: Power spectral density matrix of target speech (..., freq, channel, channel)
psd_n: Power spectral density matrix of noise (..., freq, channel, channel)
reference_channel: Reference channel index
Returns:
Tensor: RTF matrix (..., freq, channel)
"""
def psd(specgrams: torch.Tensor, mask: Optional[torch.Tensor] = None,
normalize: bool = True, eps: float = 1e-15) -> torch.Tensor:
"""
Compute power spectral density (PSD) matrix.
Args:
specgrams: Multi-channel spectrograms (..., channel, freq, time)
mask: Optional mask for PSD estimation (..., freq, time)
normalize: Whether to normalize by time frames
eps: Small value for numerical stability
Returns:
Tensor: PSD matrix (..., freq, channel, channel)
"""Functions for pitch shifting and time-scale modification.
def pitch_shift(waveform: torch.Tensor, sample_rate: int, n_steps: float,
bins_per_octave: int = 12, n_fft: int = 512,
win_length: Optional[int] = None, hop_length: Optional[int] = None,
window: Optional[torch.Tensor] = None) -> torch.Tensor:
"""
Shift the pitch of waveform by n_steps steps.
Args:
waveform: Input waveform (..., time)
sample_rate: Sample rate of waveform
n_steps: Number of pitch steps to shift
bins_per_octave: Number of steps per octave
n_fft: Size of FFT
win_length: Window size
hop_length: Length of hop between STFT windows
window: Window function
Returns:
Tensor: Pitch-shifted waveform
"""
def speed(waveform: torch.Tensor, orig_freq: int, factor: float,
lengths: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
"""
Adjust waveform speed by a given factor.
Args:
waveform: Input waveform (..., time)
orig_freq: Original sample rate
factor: Speed factor (>1.0 makes faster, <1.0 makes slower)
lengths: Original lengths of waveforms
Returns:
Tuple: (speed-adjusted waveform, adjusted lengths)
"""
def detect_pitch_frequency(waveform: torch.Tensor, sample_rate: int,
frame_time: float = 10 ** (-2), win_length: int = 30,
freq_low: int = 85, freq_high: int = 3400) -> torch.Tensor:
"""
Detect pitch frequency using autocorrelation method.
Args:
waveform: Input waveform (..., time)
sample_rate: Sample rate of the waveform
frame_time: Duration of a frame in seconds
win_length: Length of the window in frames
freq_low: Lowest detectable frequency
freq_high: Highest detectable frequency
Returns:
Tensor: Detected pitch frequencies (..., frame)
"""Functions for codec simulation and audio format processing.
def apply_codec(waveform: torch.Tensor, sample_rate: int, format: str,
encoder: Optional[str] = None, encoder_config: Optional[dict] = None,
decoder: Optional[str] = None, decoder_config: Optional[dict] = None) -> torch.Tensor:
"""
Apply codec compression and decompression to waveform.
Args:
waveform: Input waveform (..., time)
sample_rate: Sample rate
format: Audio format ("wav", "mp3", "ogg", etc.)
encoder: Encoder name
encoder_config: Encoder configuration
decoder: Decoder name
decoder_config: Decoder configuration
Returns:
Tensor: Codec-processed waveform
"""
def mu_law_encoding(x: torch.Tensor, quantization_channels: int = 256) -> torch.Tensor:
"""
Encode signal based on mu-law companding.
Args:
x: Input tensor (..., time)
quantization_channels: Number of quantization channels
Returns:
Tensor: Mu-law encoded tensor
"""
def mu_law_decoding(x_mu: torch.Tensor, quantization_channels: int = 256) -> torch.Tensor:
"""
Decode mu-law encoded signal.
Args:
x_mu: Mu-law encoded input (..., time)
quantization_channels: Number of quantization channels
Returns:
Tensor: Decoded tensor
"""Additional signal processing utilities and analysis functions.
def preemphasis(waveform: torch.Tensor, coeff: float = 0.97) -> torch.Tensor:
"""
Apply pre-emphasis filter to waveform.
Args:
waveform: Input waveform (..., time)
coeff: Pre-emphasis coefficient
Returns:
Tensor: Pre-emphasized waveform
"""
def deemphasis(waveform: torch.Tensor, coeff: float = 0.97) -> torch.Tensor:
"""
Apply de-emphasis filter to waveform.
Args:
waveform: Input waveform (..., time)
coeff: De-emphasis coefficient
Returns:
Tensor: De-emphasized waveform
"""
def phase_vocoder(complex_specgrams: torch.Tensor, rate: float, phase_advance: torch.Tensor) -> torch.Tensor:
"""
Given a STFT tensor, speed up in time without modifying pitch by applying phase vocoder.
Args:
complex_specgrams: Complex-valued spectrogram (..., freq, time)
rate: Speed-up factor
phase_advance: Expected phase advance in each bin
Returns:
Tensor: Time-stretched complex spectrogram
"""
def mask_along_axis(specgrams: torch.Tensor, mask_param: int, mask_value: float,
axis: int) -> torch.Tensor:
"""
Apply masking along the given axis.
Args:
specgrams: Tensor spectrogram (..., freq, time)
mask_param: Number of columns to be masked
mask_value: Value to assign to masked columns
axis: Axis to apply masking on (1 for freq, 2 for time)
Returns:
Tensor: Masked spectrogram
"""
def mask_along_axis_iid(specgrams: torch.Tensor, mask_param: int, mask_value: float,
axis: int) -> torch.Tensor:
"""
Apply masking along the given axis with independent masks for each example.
Args:
specgrams: Tensor spectrogram (..., freq, time)
mask_param: Number of columns to be masked
mask_value: Value to assign to masked columns
axis: Axis to apply masking on (1 for freq, 2 for time)
Returns:
Tensor: Masked spectrogram
"""
def compute_deltas(specgram: torch.Tensor, win_length: int = 5, mode: str = "replicate") -> torch.Tensor:
"""
Compute delta coefficients of a tensor.
Args:
specgram: Input tensor (..., freq, time)
win_length: The window length used for computing delta
mode: Mode for padding ("replicate", "constant", etc.)
Returns:
Tensor: Delta coefficients
"""
def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str] = None) -> torch.Tensor:
"""
Create DCT transformation matrix.
Args:
n_mfcc: Number of MFCC coefficients
n_mels: Number of mel filter banks
norm: Normalization mode ("ortho" or None)
Returns:
Tensor: DCT transformation matrix (n_mfcc, n_mels)
"""
def sliding_window_cmn(specgram: torch.Tensor, cmn_window: int = 600,
min_cmn_window: int = 100, center: bool = False,
norm_vars: bool = False) -> torch.Tensor:
"""
Apply sliding-window cepstral mean (and optionally variance) normalization per utterance.
Args:
specgram: Input tensor (..., freq, time)
cmn_window: Window length for normalization
min_cmn_window: Minimum window length
center: Whether to center the window
norm_vars: Whether to normalize variance
Returns:
Tensor: Normalized tensor
"""
def spectral_centroid(waveform: torch.Tensor, sample_rate: int, pad: int = 0,
window: Optional[torch.Tensor] = None, n_fft: int = 400,
hop_length: Optional[int] = None, win_length: Optional[int] = None) -> torch.Tensor:
"""
Compute the spectral centroid for each frame.
Args:
waveform: Input tensor (..., time)
sample_rate: Sample rate of waveform
pad: Two sided padding of signal
window: Window tensor
n_fft: Size of FFT
hop_length: Length of hop between STFT windows
win_length: Window size
Returns:
Tensor: Spectral centroid (..., time)
"""
def add_noise(waveform: torch.Tensor, noise: torch.Tensor, snr: torch.Tensor,
lengths: Optional[torch.Tensor] = None) -> torch.Tensor:
"""
Add noise to waveform with given Signal-to-Noise Ratio (SNR).
Args:
waveform: Input waveform (..., time)
noise: Noise tensor (..., time)
snr: Signal-to-noise ratio in dB
lengths: Lengths of waveforms
Returns:
Tensor: Noisy waveform
"""
def convolve(waveform: torch.Tensor, kernel: torch.Tensor, mode: str = "full") -> torch.Tensor:
"""
Convolve waveform with kernel using PyTorch operations.
Args:
waveform: Input waveform (..., time)
kernel: Convolution kernel (..., time)
mode: Convolution mode ("full", "valid", "same")
Returns:
Tensor: Convolved waveform
"""
def fftconvolve(waveform: torch.Tensor, kernel: torch.Tensor, mode: str = "full") -> torch.Tensor:
"""
Convolve waveform with kernel using FFT.
Args:
waveform: Input waveform (..., time)
kernel: Convolution kernel (..., time)
mode: Convolution mode ("full", "valid", "same")
Returns:
Tensor: Convolved waveform
"""
def loudness(waveform: torch.Tensor, sample_rate: int) -> torch.Tensor:
"""
Compute loudness according to ITU-R BS.1770-4.
Args:
waveform: Input waveform (..., time)
sample_rate: Sample rate
Returns:
Tensor: Loudness values
"""
def edit_distance(seq1: List[int], seq2: List[int]) -> int:
"""
Calculate edit distance between two sequences.
Args:
seq1: First sequence
seq2: Second sequence
Returns:
int: Edit distance
"""
def rnnt_loss(logits: torch.Tensor, targets: torch.Tensor, logit_lengths: torch.Tensor,
target_lengths: torch.Tensor, blank: int = -1, clamp: float = -1) -> torch.Tensor:
"""
Compute RNN-Transducer loss.
Args:
logits: Predicted logits (..., time, target_length, n_class)
targets: Target sequences (..., target_length)
logit_lengths: Length of logits for each sample
target_lengths: Length of targets for each sample
blank: Blank label index
clamp: Clamp gradients
Returns:
Tensor: RNN-T loss
"""
def frechet_distance(mu_x: torch.Tensor, sigma_x: torch.Tensor,
mu_y: torch.Tensor, sigma_y: torch.Tensor) -> torch.Tensor:
"""
Compute Fréchet distance between two multivariate Gaussians.
Args:
mu_x: Mean of first distribution
sigma_x: Covariance of first distribution
mu_y: Mean of second distribution
sigma_y: Covariance of second distribution
Returns:
Tensor: Fréchet distance
"""def lfilter(waveform: torch.Tensor, a_coeffs: torch.Tensor, b_coeffs: torch.Tensor,
zi: Optional[torch.Tensor] = None) -> torch.Tensor:
"""
Apply IIR filter using difference equation.
Args:
waveform: Input signal (..., time)
a_coeffs: Denominator coefficients (autoregressive)
b_coeffs: Numerator coefficients (moving average)
zi: Initial conditions for filter delays
Returns:
Tensor: Filtered signal
"""
def filtfilt(waveform: torch.Tensor, a_coeffs: torch.Tensor, b_coeffs: torch.Tensor,
clamp: bool = True) -> torch.Tensor:
"""
Apply zero-phase filtering using forward-backward filter.
Args:
waveform: Input signal (..., time)
a_coeffs: Denominator coefficients
b_coeffs: Numerator coefficients
clamp: Whether to clamp output to prevent numerical issues
Returns:
Tensor: Zero-phase filtered signal
"""Functions for manipulating pitch and temporal characteristics of audio.
def pitch_shift(waveform: torch.Tensor, sample_rate: int, n_steps: float,
bins_per_octave: int = 12, n_fft: int = 512,
win_length: Optional[int] = None, hop_length: Optional[int] = None,
window: Optional[torch.Tensor] = None) -> torch.Tensor:
"""
Shift pitch of waveform by n_steps semitones.
Args:
waveform: Input audio (..., time)
sample_rate: Sample rate
n_steps: Number of semitones to shift (positive = higher, negative = lower)
bins_per_octave: Number of steps per octave
n_fft: FFT size for STFT
win_length: Window length
hop_length: Hop length
window: Window function
Returns:
Tensor: Pitch-shifted audio
"""
def speed(waveform: torch.Tensor, orig_freq: int, factor: float,
lengths: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
"""
Adjust playback speed by resampling.
Args:
waveform: Input audio (..., time)
orig_freq: Original sample rate
factor: Speed factor (>1.0 = faster, <1.0 = slower)
lengths: Length of each sequence in batch
Returns:
Tuple: (speed-adjusted audio, adjusted lengths)
"""
def phase_vocoder(complex_specgrams: torch.Tensor, rate: float,
phase_advance: torch.Tensor) -> torch.Tensor:
"""
Apply phase vocoder for time stretching/compression.
Args:
complex_specgrams: Complex STFT (..., freq, time)
rate: Rate factor (>1.0 = faster, <1.0 = slower)
phase_advance: Expected phase advance per hop
Returns:
Tensor: Time-stretched complex spectrogram
"""Functions for analyzing audio characteristics and extracting features.
def spectral_centroid(waveform: torch.Tensor, sample_rate: int, pad: int = 0,
window: Optional[torch.Tensor] = None, n_fft: int = 400,
hop_length: Optional[int] = None, win_length: Optional[int] = None) -> torch.Tensor:
"""
Compute spectral centroid (center of mass of spectrum).
Args:
waveform: Input audio (..., time)
sample_rate: Sample rate
(other parameters same as spectrogram)
Returns:
Tensor: Spectral centroid over time (..., time)
"""
def detect_pitch_frequency(waveform: torch.Tensor, sample_rate: int, frame_time: float = 10**(-2),
win_length: int = 30, freq_low: int = 85, freq_high: int = 3400) -> torch.Tensor:
"""
Detect pitch frequency using autocorrelation method.
Args:
waveform: Input audio (..., time)
sample_rate: Sample rate
frame_time: Length of frame in seconds
win_length: Length of window for median filtering
freq_low: Lowest frequency that can be detected
freq_high: Highest frequency that can be detected
Returns:
Tensor: Detected pitch frequency over time
"""
def loudness(waveform: torch.Tensor, sample_rate: int) -> torch.Tensor:
"""
Compute loudness using ITU-R BS.1770-4 standard.
Args:
waveform: Input audio (..., time)
sample_rate: Sample rate
Returns:
Tensor: Loudness in LUFS (Loudness Units Full Scale)
"""Convolution-based processing for impulse response application and acoustic modeling.
def convolve(x: torch.Tensor, y: torch.Tensor, mode: str = "full") -> torch.Tensor:
"""
Convolve two 1D tensors.
Args:
x: First input tensor (..., time)
y: Second input tensor (..., time)
mode: Convolution mode ("full", "valid", "same")
Returns:
Tensor: Convolved signal
"""
def fftconvolve(x: torch.Tensor, y: torch.Tensor, mode: str = "full") -> torch.Tensor:
"""
Convolve using FFT for efficiency with long signals.
Args:
x: First input tensor (..., time)
y: Second input tensor (..., time)
mode: Convolution mode ("full", "valid", "same")
Returns:
Tensor: Convolved signal
"""Logarithmic quantization commonly used in telecommunications.
def mu_law_encoding(x: torch.Tensor, quantization_channels: int = 256) -> torch.Tensor:
"""
Encode waveform using mu-law companding.
Args:
x: Input waveform (..., time)
quantization_channels: Number of quantization levels
Returns:
Tensor: Mu-law encoded signal (integer values)
"""
def mu_law_decoding(x_mu: torch.Tensor, quantization_channels: int = 256) -> torch.Tensor:
"""
Decode mu-law encoded waveform.
Args:
x_mu: Mu-law encoded signal (..., time)
quantization_channels: Number of quantization levels
Returns:
Tensor: Decoded waveform
"""Functions for processing extracted audio features.
def compute_deltas(specgram: torch.Tensor, win_length: int = 5, mode: str = "replicate") -> torch.Tensor:
"""
Compute delta features (first derivatives) of spectrogram.
Args:
specgram: Input spectrogram (..., freq, time)
win_length: Window length for delta computation
mode: Padding mode for boundary frames ("replicate", "constant", etc.)
Returns:
Tensor: Delta features with same shape as input
"""
def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str] = None) -> torch.Tensor:
"""
Create Discrete Cosine Transform matrix for MFCC computation.
Args:
n_mfcc: Number of MFCC coefficients
n_mels: Number of mel filter banks
norm: Normalization method ("ortho" or None)
Returns:
Tensor: DCT matrix (n_mfcc, n_mels)
"""
def sliding_window_cmn(specgram: torch.Tensor, cmn_window: int = 600, min_cmn_window: int = 100,
center: bool = False, norm_vars: bool = False) -> torch.Tensor:
"""
Apply sliding window cepstral mean normalization.
Args:
specgram: Input spectrogram (..., freq, time)
cmn_window: Window size for normalization
min_cmn_window: Minimum window size
center: Whether to center the window
norm_vars: Whether to normalize variance
Returns:
Tensor: Normalized spectrogram
"""This covers the extensive functional API of TorchAudio, providing stateless functions for all major audio processing operations from basic spectral analysis to advanced effects and feature extraction.
Install with Tessl CLI
npx tessl i tessl/pypi-torchaudio