A PyTorch-native metrics library providing 400+ rigorously tested metrics across classification, regression, audio, image, text, and other ML domains.
Specialized metrics for audio processing and speech evaluation, including signal-to-noise ratios, perceptual quality measures, and separation metrics for speech and audio applications.
Measures the quality of audio signal reconstruction and separation.
class ScaleInvariantSignalDistortionRatio(Metric):
    """Scale-Invariant Signal-to-Distortion Ratio (SI-SDR), reported in dB.

    Args:
        zero_mean: if True, presumably mean-centers preds/target before the
            ratio is computed — confirm against the torchmetrics docs.
        **kwargs: forwarded to the ``Metric`` base class.
    """

    def __init__(
        self,
        zero_mean: bool = True,
        **kwargs,
    ) -> None: ...
class SignalDistortionRatio(Metric):
    """Traditional Signal-to-Distortion Ratio (SDR), reported in dB.

    Args:
        use_cg_iter: optional number of conjugate-gradient iterations; when
            None a direct solver is presumably used — confirm in the docs.
        filter_length: length of the distortion filter (default 512).
        zero_mean: if True, presumably mean-centers signals before computing.
        load_diag: optional diagonal loading for numerical stability
            (presumed from the name — confirm).
        **kwargs: forwarded to the ``Metric`` base class.
    """

    def __init__(
        self,
        use_cg_iter: Optional[int] = None,
        filter_length: int = 512,
        zero_mean: bool = True,
        load_diag: Optional[float] = None,
        **kwargs,
    ) -> None: ...
class SourceAggregatedSignalDistortionRatio(Metric):
    """Source-aggregated SDR (SA-SDR) over separated sources, reported in dB.

    Args:
        scale_invariant: if True, use the scale-invariant formulation.
        zero_mean: if True, presumably mean-centers signals before computing.
        **kwargs: forwarded to the ``Metric`` base class.
    """

    def __init__(
        self,
        scale_invariant: bool = True,
        zero_mean: bool = True,
        **kwargs,
    ) -> None: ...


# Evaluates the ratio of signal power to noise power in audio signals.
class ScaleInvariantSignalNoiseRatio(Metric):
    """Scale-Invariant Signal-to-Noise Ratio (SI-SNR), reported in dB.

    Args:
        zero_mean: if True, presumably mean-centers signals before computing.
        **kwargs: forwarded to the ``Metric`` base class.
    """

    def __init__(
        self,
        zero_mean: bool = True,
        **kwargs,
    ) -> None: ...
class SignalNoiseRatio(Metric):
    """Signal-to-Noise Ratio (SNR), reported in dB.

    Args:
        zero_mean: if True, presumably mean-centers signals before computing.
        **kwargs: forwarded to the ``Metric`` base class.
    """

    def __init__(
        self,
        zero_mean: bool = True,
        **kwargs,
    ) -> None: ...
class ComplexScaleInvariantSignalNoiseRatio(Metric):
    """Complex-valued scale-invariant SNR (C-SI-SNR), reported in dB.

    Args:
        zero_mean: if True, presumably mean-centers signals before computing.
        **kwargs: forwarded to the ``Metric`` base class.
    """

    def __init__(
        self,
        zero_mean: bool = True,
        **kwargs,
    ) -> None: ...


# Specialized metrics for evaluating audio source separation tasks.
class PermutationInvariantTraining(Metric):
    """Permutation-invariant training (PIT) wrapper around a base metric.

    Evaluates the wrapped metric over source/estimate pairings and reduces
    over the permutations according to ``eval_func``.

    Args:
        metric: base metric (callable or ``Metric``) evaluated per pairing.
        mode: pairing strategy; default "speaker-wise".
        eval_func: "max" or "min" — whether the best permutation maximizes
            or minimizes the base metric.
        **kwargs: forwarded to the ``Metric`` base class.
    """

    def __init__(
        self,
        metric: Union[Callable, Metric],
        mode: str = "speaker-wise",
        eval_func: str = "max",
        **kwargs,
    ) -> None: ...


# Metrics that evaluate audio quality from a human perception perspective
# (require optional dependencies).
class PerceptualEvaluationSpeechQuality(Metric):
    """PESQ perceptual speech-quality score.

    Requires the optional ``pesq`` package.

    Args:
        fs: sampling frequency in Hz.
        mode: "wb" (wideband) or "nb" (narrowband).
        keep_same_device: if True, presumably keeps the result on the input
            tensor's device — confirm against the torchmetrics docs.
        **kwargs: forwarded to the ``Metric`` base class.
    """

    def __init__(
        self,
        fs: int,
        mode: str = "wb",
        keep_same_device: bool = False,
        **kwargs,
    ) -> None: ...
class ShortTimeObjectiveIntelligibility(Metric):
    """STOI speech-intelligibility score.

    Requires the optional ``pystoi`` package.

    Args:
        fs: sampling frequency in Hz.
        extended: if True, use the extended STOI (ESTOI) variant
            (presumed from the name — confirm).
        keep_same_device: if True, presumably keeps the result on the input
            tensor's device — confirm against the torchmetrics docs.
        **kwargs: forwarded to the ``Metric`` base class.
    """

    def __init__(
        self,
        fs: int,
        extended: bool = False,
        keep_same_device: bool = False,
        **kwargs,
    ) -> None: ...


# Sophisticated metrics for speech and audio quality assessment
# (require optional dependencies).
class SpeechReverberationModulationEnergyRatio(Metric):
    """Speech-to-reverberation modulation energy ratio (SRMR).

    Args:
        fs: sampling frequency in Hz.
        max_cf: maximum center frequency of the modulation filterbank
            (presumed from the name — confirm), default 128.0.
        norm: if True, use the normalized variant.
        fast: if True, use the faster implementation.
        **kwargs: forwarded to the ``Metric`` base class.
    """

    def __init__(
        self,
        fs: int,
        max_cf: float = 128.0,
        norm: bool = False,
        fast: bool = True,
        **kwargs,
    ) -> None: ...
class DeepNoiseSuppressionMeanOpinionScore(Metric):
    """DNS-MOS: model-predicted mean opinion score for noise suppression.

    Requires the optional ``librosa`` and ``onnxruntime`` packages.

    Args:
        fs: sampling frequency in Hz (default 16000).
        personalized: if True, use the personalized DNS-MOS model variant
            (presumed from the name — confirm).
        **kwargs: forwarded to the ``Metric`` base class.
    """

    def __init__(
        self,
        fs: int = 16000,
        personalized: bool = False,
        **kwargs,
    ) -> None: ...
class NonIntrusiveSpeechQualityAssessment(Metric):
    """NISQA: non-intrusive (reference-free) speech quality assessment.

    Args:
        fs: sampling frequency in Hz (default 16000).
        **kwargs: forwarded to the ``Metric`` base class.
    """

    def __init__(
        self,
        fs: int = 16000,
        **kwargs,
    ) -> None: ...


import torch
from torchmetrics.audio import (
    ScaleInvariantSignalDistortionRatio,
    ScaleInvariantSignalNoiseRatio,
)

# Initialize metrics
si_sdr = ScaleInvariantSignalDistortionRatio()
si_snr = ScaleInvariantSignalNoiseRatio()

# Sample audio data (batch_size, time)
preds = torch.randn(4, 8000)  # 4 samples, 8000 time steps
target = torch.randn(4, 8000)

# Compute signal quality metrics (forward call returns a scalar tensor in dB)
sdr_score = si_sdr(preds, target)
snr_score = si_snr(preds, target)
print(f"SI-SDR: {sdr_score:.4f} dB")
print(f"SI-SNR: {snr_score:.4f} dB")

from torchmetrics.audio import PermutationInvariantTraining, ScaleInvariantSignalDistortionRatio
# Initialize PIT wrapper with SI-SDR
base_metric = ScaleInvariantSignalDistortionRatio()
pit_metric = PermutationInvariantTraining(
    metric=base_metric,
    mode="speaker-wise",
    eval_func="max",
)

# Source separation scenario: 2 sources, 2 estimates
# Shape: (batch, num_speakers, time)
preds = torch.randn(4, 2, 8000)  # 4 batches, 2 estimated sources
target = torch.randn(4, 2, 8000)  # 4 batches, 2 true sources

# Compute PIT score (handles the source/estimate permutation internally)
pit_score = pit_metric(preds, target)
print(f"PIT SI-SDR: {pit_score:.4f} dB")

from torchmetrics.audio import PerceptualEvaluationSpeechQuality
# Initialize PESQ metric (requires pesq package)
try:
    pesq_metric = PerceptualEvaluationSpeechQuality(fs=16000, mode="wb")
    # Sample speech signals at 16kHz
    preds = torch.randn(4, 16000)  # 1 second of audio
    target = torch.randn(4, 16000)
    # Compute PESQ score
    pesq_score = pesq_metric(preds, target)
    print(f"PESQ: {pesq_score:.4f}")
except ImportError:
    print("PESQ requires the 'pesq' package: pip install pesq")

from torchmetrics.audio import ShortTimeObjectiveIntelligibility
# Initialize STOI metric (requires pystoi package)
try:
    stoi_metric = ShortTimeObjectiveIntelligibility(fs=16000, extended=False)
    # Sample speech signals
    preds = torch.randn(2, 16000)
    target = torch.randn(2, 16000)
    # Compute STOI score
    stoi_score = stoi_metric(preds, target)
    print(f"STOI: {stoi_score:.4f}")
except ImportError:
    print("STOI requires the 'pystoi' package: pip install pystoi")

from torchmetrics.audio import SignalDistortionRatio
# Traditional SDR for multi-channel audio
sdr_metric = SignalDistortionRatio()

# Multi-channel audio (batch, channels, time)
preds = torch.randn(2, 2, 8000)  # Stereo audio
target = torch.randn(2, 2, 8000)

# Compute SDR
sdr_score = sdr_metric(preds, target)
print(f"SDR: {sdr_score:.4f} dB")

from torchmetrics.audio import DeepNoiseSuppressionMeanOpinionScore
# DNS-MOS for speech enhancement (requires librosa and onnxruntime)
try:
    dns_mos = DeepNoiseSuppressionMeanOpinionScore(fs=16000)
    # Enhanced speech samples
    preds = torch.randn(3, 16000)
    target = torch.randn(3, 16000)
    # Compute DNS-MOS
    # NOTE(review): DNS-MOS is normally non-intrusive (no reference needed);
    # confirm whether the installed torchmetrics version accepts `target`.
    mos_score = dns_mos(preds, target)
    print(f"DNS-MOS: {mos_score:.4f}")
except ImportError:
    print("DNS-MOS requires 'librosa' and 'onnxruntime' packages")

from typing import Union, Optional, Callable
import typing
import torch
from torch import Tensor

# Shape conventions used throughout the audio metrics.
AudioTensor = Tensor  # Shape: (..., time) or (..., channels, time)
SeparationTensor = Tensor  # Shape: (..., num_sources, time)

# BUG FIX: these were written as Union["max", ...]; string literals inside a
# Union are forward references, not value constraints — Literal is correct.
EvalFunc = typing.Literal["max", "min", "mean"]
SeparationMode = typing.Literal["speaker-wise", "permutation-wise"]
PESQMode = typing.Literal["wb", "nb"]  # wideband or narrowband

# Install with Tessl CLI
npx tessl i tessl/pypi-torchmetrics