or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

audio.md classification.md clustering.md detection.md functional.md image.md index.md multimodal.md nominal.md regression.md retrieval.md segmentation.md shape.md text.md utilities.md video.md

audio.md docs/

0

# Audio Metrics

1

2

Specialized metrics for audio processing and speech evaluation including signal-to-noise ratios, perceptual quality measures, and separation metrics for speech and audio applications.

3

4

## Capabilities

5

6

### Signal Distortion Ratio Metrics

7

8

Measures the quality of audio signal reconstruction and separation.

9

10

```python { .api }

11

class ScaleInvariantSignalDistortionRatio(Metric):

12

def __init__(

13

self,

14

zero_mean: bool = True,

15

**kwargs

16

): ...

17

18

class SignalDistortionRatio(Metric):

19

def __init__(

20

self,

21

use_cg_iter: Optional[int] = None,

22

filter_length: int = 512,

23

zero_mean: bool = True,

24

load_diag: Optional[float] = None,

25

**kwargs

26

): ...

27

28

class SourceAggregatedSignalDistortionRatio(Metric):

29

def __init__(

30

self,

31

scale_invariant: bool = True,

32

zero_mean: bool = True,

33

**kwargs

34

): ...

35

```

36

37

### Signal-to-Noise Ratio Metrics

38

39

Evaluates the ratio of signal power to noise power in audio signals.

40

41

```python { .api }

42

class ScaleInvariantSignalNoiseRatio(Metric):

43

def __init__(

44

self,

45

zero_mean: bool = True,

46

**kwargs

47

): ...

48

49

class SignalNoiseRatio(Metric):

50

def __init__(

51

self,

52

zero_mean: bool = True,

53

**kwargs

54

): ...

55

56

class ComplexScaleInvariantSignalNoiseRatio(Metric):

57

def __init__(

58

self,

59

zero_mean: bool = True,

60

**kwargs

61

): ...

62

```

63

64

### Source Separation Metrics

65

66

Specialized metrics for evaluating audio source separation tasks.

67

68

```python { .api }

69

class PermutationInvariantTraining(Metric):

70

def __init__(

71

self,

72

metric: Union[Callable, Metric],

73

mode: str = "speaker-wise",

74

eval_func: str = "max",

75

**kwargs

76

): ...

77

```

78

79

### Perceptual Quality Metrics

80

81

Metrics that evaluate audio quality from a human perception perspective (require optional dependencies).

82

83

```python { .api }

84

class PerceptualEvaluationSpeechQuality(Metric):

85

def __init__(

86

self,

87

fs: int,

88

mode: str = "wb",

89

keep_same_device: bool = False,

90

**kwargs

91

): ...

92

93

class ShortTimeObjectiveIntelligibility(Metric):

94

def __init__(

95

self,

96

fs: int,

97

extended: bool = False,

98

keep_same_device: bool = False,

99

**kwargs

100

): ...

101

```

102

103

### Advanced Audio Quality Metrics

104

105

Sophisticated metrics for speech and audio quality assessment (require optional dependencies).

106

107

```python { .api }

108

class SpeechReverberationModulationEnergyRatio(Metric):

109

def __init__(

110

self,

111

fs: int,

112

max_cf: float = 128.0,

113

norm: bool = False,

114

fast: bool = True,

115

**kwargs

116

): ...

117

118

class DeepNoiseSuppressionMeanOpinionScore(Metric):

119

def __init__(

120

self,

121

fs: int = 16000,

122

personalized: bool = False,

123

**kwargs

124

): ...

125

126

class NonIntrusiveSpeechQualityAssessment(Metric):

127

def __init__(

128

self,

129

fs: int = 16000,

130

**kwargs

131

): ...

132

```

133

134

## Usage Examples

135

136

### Basic Signal Quality Metrics

137

138

```python

139

import torch

140

from torchmetrics.audio import (

141

ScaleInvariantSignalDistortionRatio,

142

ScaleInvariantSignalNoiseRatio

143

)

144

145

# Initialize metrics

146

si_sdr = ScaleInvariantSignalDistortionRatio()

147

si_snr = ScaleInvariantSignalNoiseRatio()

148

149

# Sample audio data (batch_size, time)

150

preds = torch.randn(4, 8000) # 4 samples, 8000 time steps

151

target = torch.randn(4, 8000)

152

153

# Compute signal quality metrics

154

sdr_score = si_sdr(preds, target)

155

snr_score = si_snr(preds, target)

156

157

print(f"SI-SDR: {sdr_score:.4f} dB")

158

print(f"SI-SNR: {snr_score:.4f} dB")

159

```

160

161

### Source Separation Evaluation

162

163

```python

164

from torchmetrics.audio import PermutationInvariantTraining, ScaleInvariantSignalDistortionRatio

165

166

# Initialize PIT wrapper with SI-SDR

167

base_metric = ScaleInvariantSignalDistortionRatio()

168

pit_metric = PermutationInvariantTraining(

169

metric=base_metric,

170

mode="speaker-wise",

171

eval_func="max"

172

)

173

174

# Source separation scenario: 2 sources, 2 estimates

175

# Shape: (batch, num_speakers, time)

176

preds = torch.randn(4, 2, 8000) # 4 batches, 2 estimated sources

177

target = torch.randn(4, 2, 8000) # 4 batches, 2 true sources

178

179

# Compute PIT score (handles permutation)

180

pit_score = pit_metric(preds, target)

181

print(f"PIT SI-SDR: {pit_score:.4f} dB")

182

```

183

184

### Perceptual Quality Assessment

185

186

```python

187

from torchmetrics.audio import PerceptualEvaluationSpeechQuality

188

189

# Initialize PESQ metric (requires pesq package)

190

try:

191

pesq_metric = PerceptualEvaluationSpeechQuality(fs=16000, mode="wb")

192

193

# Sample speech signals at 16kHz

194

preds = torch.randn(4, 16000) # 1 second of audio

195

target = torch.randn(4, 16000)

196

197

# Compute PESQ score

198

pesq_score = pesq_metric(preds, target)

199

print(f"PESQ: {pesq_score:.4f}")

200

201

except ImportError:

202

print("PESQ requires the 'pesq' package: pip install pesq")

203

```

204

205

### Speech Intelligibility

206

207

```python

208

from torchmetrics.audio import ShortTimeObjectiveIntelligibility

209

210

# Initialize STOI metric (requires pystoi package)

211

try:

212

stoi_metric = ShortTimeObjectiveIntelligibility(fs=16000, extended=False)

213

214

# Sample speech signals

215

preds = torch.randn(2, 16000)

216

target = torch.randn(2, 16000)

217

218

# Compute STOI score

219

stoi_score = stoi_metric(preds, target)

220

print(f"STOI: {stoi_score:.4f}")

221

222

except ImportError:

223

print("STOI requires the 'pystoi' package: pip install pystoi")

224

```

225

226

### Multi-channel Audio Processing

227

228

```python

229

from torchmetrics.audio import SignalDistortionRatio

230

231

# Traditional SDR for multi-channel audio

232

sdr_metric = SignalDistortionRatio()

233

234

# Multi-channel audio (batch, channels, time)

235

preds = torch.randn(2, 2, 8000) # Stereo audio

236

target = torch.randn(2, 2, 8000)

237

238

# Compute SDR

239

sdr_score = sdr_metric(preds, target)

240

print(f"SDR: {sdr_score:.4f} dB")

241

```

242

243

### Advanced Speech Quality

244

245

```python

246

from torchmetrics.audio import DeepNoiseSuppressionMeanOpinionScore

247

248

# DNS-MOS for speech enhancement (requires librosa and onnxruntime)

249

try:

250

dns_mos = DeepNoiseSuppressionMeanOpinionScore(fs=16000)

251

252

# Enhanced speech samples

253

preds = torch.randn(3, 16000)

254

target = torch.randn(3, 16000)

255

256

# Compute DNS-MOS

257

mos_score = dns_mos(preds, target)

258

print(f"DNS-MOS: {mos_score:.4f}")

259

260

except ImportError:

261

print("DNS-MOS requires 'librosa' and 'onnxruntime' packages")

262

```

263

264

## Types

265

266

```python { .api }

267

from typing import Union, Optional, Callable, Literal

268

import torch

269

from torch import Tensor

270

271

AudioTensor = Tensor # Shape: (..., time) or (..., channels, time)

272

SeparationTensor = Tensor # Shape: (..., num_sources, time)

273

274

EvalFunc = Literal["max", "min", "mean"]

275

SeparationMode = Literal["speaker-wise", "permutation-wise"]

276

PESQMode = Literal["wb", "nb"] # wideband or narrowband

277

```