# Audio Metrics

Specialized metrics for audio processing and speech evaluation including signal-to-noise ratios, perceptual quality measures, and separation metrics for speech and audio applications.

## Capabilities

### Signal Distortion Ratio Metrics

Measures the quality of audio signal reconstruction and separation.
```python { .api }
class ScaleInvariantSignalDistortionRatio(Metric):
    def __init__(
        self,
        zero_mean: bool = True,
        **kwargs
    ): ...

class SignalDistortionRatio(Metric):
    def __init__(
        self,
        use_cg_iter: Optional[int] = None,
        filter_length: int = 512,
        zero_mean: bool = True,
        load_diag: Optional[float] = None,
        **kwargs
    ): ...

class SourceAggregatedSignalDistortionRatio(Metric):
    def __init__(
        self,
        scale_invariant: bool = True,
        zero_mean: bool = True,
        **kwargs
    ): ...
```

### Signal-to-Noise Ratio Metrics

Evaluates the ratio of signal power to noise power in audio signals.

```python { .api }
class ScaleInvariantSignalNoiseRatio(Metric):
    def __init__(
        self,
        zero_mean: bool = True,
        **kwargs
    ): ...

class SignalNoiseRatio(Metric):
    def __init__(
        self,
        zero_mean: bool = True,
        **kwargs
    ): ...

class ComplexScaleInvariantSignalNoiseRatio(Metric):
    def __init__(
        self,
        zero_mean: bool = True,
        **kwargs
    ): ...
```

### Source Separation Metrics

Specialized metrics for evaluating audio source separation tasks.

```python { .api }
class PermutationInvariantTraining(Metric):
    def __init__(
        self,
        metric: Union[Callable, Metric],
        mode: str = "speaker-wise",
        eval_func: str = "max",
        **kwargs
    ): ...
```

### Perceptual Quality Metrics

Metrics that evaluate audio quality from a human perception perspective (require optional dependencies).

```python { .api }
class PerceptualEvaluationSpeechQuality(Metric):
    def __init__(
        self,
        fs: int,
        mode: str = "wb",
        keep_same_device: bool = False,
        **kwargs
    ): ...

class ShortTimeObjectiveIntelligibility(Metric):
    def __init__(
        self,
        fs: int,
        extended: bool = False,
        keep_same_device: bool = False,
        **kwargs
    ): ...
```

### Advanced Audio Quality Metrics

Sophisticated metrics for speech and audio quality assessment (require optional dependencies).

```python { .api }
class SpeechReverberationModulationEnergyRatio(Metric):
    def __init__(
        self,
        fs: int,
        max_cf: float = 128.0,
        norm: bool = False,
        fast: bool = True,
        **kwargs
    ): ...

class DeepNoiseSuppressionMeanOpinionScore(Metric):
    def __init__(
        self,
        fs: int = 16000,
        personalized: bool = False,
        **kwargs
    ): ...

class NonIntrusiveSpeechQualityAssessment(Metric):
    def __init__(
        self,
        fs: int = 16000,
        **kwargs
    ): ...
```

## Usage Examples

### Basic Signal Quality Metrics

```python
import torch
from torchmetrics.audio import (
    ScaleInvariantSignalDistortionRatio,
    ScaleInvariantSignalNoiseRatio
)

# Initialize metrics
si_sdr = ScaleInvariantSignalDistortionRatio()
si_snr = ScaleInvariantSignalNoiseRatio()

# Sample audio data (batch_size, time)
preds = torch.randn(4, 8000)  # 4 samples, 8000 time steps
target = torch.randn(4, 8000)

# Compute signal quality metrics
sdr_score = si_sdr(preds, target)
snr_score = si_snr(preds, target)

print(f"SI-SDR: {sdr_score:.4f} dB")
print(f"SI-SNR: {snr_score:.4f} dB")
```

### Source Separation Evaluation

```python
from torchmetrics.audio import PermutationInvariantTraining, ScaleInvariantSignalDistortionRatio

# Initialize PIT wrapper with SI-SDR
base_metric = ScaleInvariantSignalDistortionRatio()
pit_metric = PermutationInvariantTraining(
    metric=base_metric,
    mode="speaker-wise",
    eval_func="max"
)

# Source separation scenario: 2 sources, 2 estimates
# Shape: (batch, num_speakers, time)
preds = torch.randn(4, 2, 8000)   # 4 batches, 2 estimated sources
target = torch.randn(4, 2, 8000)  # 4 batches, 2 true sources

# Compute PIT score (handles permutation)
pit_score = pit_metric(preds, target)
print(f"PIT SI-SDR: {pit_score:.4f} dB")
```

### Perceptual Quality Assessment

```python
from torchmetrics.audio import PerceptualEvaluationSpeechQuality

# Initialize PESQ metric (requires pesq package)
try:
    pesq_metric = PerceptualEvaluationSpeechQuality(fs=16000, mode="wb")

    # Sample speech signals at 16kHz
    preds = torch.randn(4, 16000)  # 1 second of audio
    target = torch.randn(4, 16000)

    # Compute PESQ score
    pesq_score = pesq_metric(preds, target)
    print(f"PESQ: {pesq_score:.4f}")

except ImportError:
    print("PESQ requires the 'pesq' package: pip install pesq")
```

### Speech Intelligibility

```python
from torchmetrics.audio import ShortTimeObjectiveIntelligibility

# Initialize STOI metric (requires pystoi package)
try:
    stoi_metric = ShortTimeObjectiveIntelligibility(fs=16000, extended=False)

    # Sample speech signals
    preds = torch.randn(2, 16000)
    target = torch.randn(2, 16000)

    # Compute STOI score
    stoi_score = stoi_metric(preds, target)
    print(f"STOI: {stoi_score:.4f}")

except ImportError:
    print("STOI requires the 'pystoi' package: pip install pystoi")
```

### Multi-channel Audio Processing

```python
from torchmetrics.audio import SignalDistortionRatio

# Traditional SDR for multi-channel audio
sdr_metric = SignalDistortionRatio()

# Multi-channel audio (batch, channels, time)
preds = torch.randn(2, 2, 8000)  # Stereo audio
target = torch.randn(2, 2, 8000)

# Compute SDR
sdr_score = sdr_metric(preds, target)
print(f"SDR: {sdr_score:.4f} dB")
```

### Advanced Speech Quality

```python
from torchmetrics.audio import DeepNoiseSuppressionMeanOpinionScore

# DNS-MOS for speech enhancement (requires librosa and onnxruntime)
try:
    dns_mos = DeepNoiseSuppressionMeanOpinionScore(fs=16000)

    # Enhanced speech samples
    preds = torch.randn(3, 16000)

    # Compute DNS-MOS — non-intrusive metric, so no reference signal is passed
    mos_score = dns_mos(preds)
    print(f"DNS-MOS: {mos_score}")

except ImportError:
    print("DNS-MOS requires 'librosa' and 'onnxruntime' packages")
```

## Types

```python { .api }
from typing import Union, Optional, Callable, Literal
import torch
from torch import Tensor

AudioTensor = Tensor       # Shape: (..., time) or (..., channels, time)
SeparationTensor = Tensor  # Shape: (..., num_sources, time)

EvalFunc = Literal["max", "min", "mean"]
SeparationMode = Literal["speaker-wise", "permutation-wise"]
PESQMode = Literal["wb", "nb"]  # wideband or narrowband
```