or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

audio-io.md, datasets.md, effects.md, functional.md, index.md, models.md, pipelines.md, streaming.md, transforms.md, utils.md

docs/datasets.md

0

# Audio Datasets

1

2

Standard dataset loaders for common audio datasets with consistent interfaces and preprocessing. TorchAudio provides PyTorch-compatible dataset classes for speech recognition, synthesis, music analysis, and source separation research.

3

4

## Capabilities

5

6

### Speech Recognition Datasets

7

8

Datasets for training and evaluating automatic speech recognition systems.

9

10

```python { .api }

11

class LIBRISPEECH(torch.utils.data.Dataset):

12

"""LibriSpeech ASR corpus - large-scale English speech recognition dataset."""

13

14

def __init__(self, root: str, url: str = "train-clean-100",

15

folder_in_archive: str = "LibriSpeech", download: bool = False) -> None:

16

"""

17

Args:

18

root: Root directory for dataset storage

19

url: Dataset subset ("train-clean-100", "train-clean-360", "train-other-500",

20

"dev-clean", "dev-other", "test-clean", "test-other")

21

folder_in_archive: Folder name in archive

22

download: Whether to download if not found

23

"""

24

25

def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, int, int, int]:

26

"""

27

Returns:

28

Tuple of (waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)

29

"""

30

31

class LibriSpeechBiasing(torch.utils.data.Dataset):

32

"""LibriSpeech dataset with word-level biasing lists for contextualized ASR."""

33

34

def __init__(self, root: str, subset: str, audio_dir: str, download: bool = False) -> None:

35

"""

36

Args:

37

root: Root directory

38

subset: Dataset subset

39

audio_dir: Directory containing audio files

40

download: Whether to download if not found

41

"""

42

43

class SPEECHCOMMANDS(torch.utils.data.Dataset):

44

"""Google Speech Commands dataset - keyword spotting."""

45

46

def __init__(self, root: str, url: str = "speech_commands_v0.02",

47

folder_in_archive: str = "SpeechCommands", download: bool = False,

48

subset: Optional[str] = None) -> None:

49

"""

50

Args:

51

root: Root directory

52

url: Dataset version

53

folder_in_archive: Folder name in archive

54

download: Whether to download

55

subset: "training", "validation", "testing", or None for all

56

"""

57

58

def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str, int]:

59

"""

60

Returns:

61

Tuple of (waveform, sample_rate, label, speaker_id, utterance_number)

62

"""

63

64

class COMMONVOICE(torch.utils.data.Dataset):

65

"""Mozilla Common Voice multilingual speech corpus."""

66

67

def __init__(self, root: str, tsv: str = "train.tsv", url: str = "cv-corpus-4-2019-12-10",

68

folder_in_archive: str = "cv-corpus-4-2019-12-10", download: bool = False,

69

version: str = "cv-corpus-4-2019-12-10") -> None:

70

"""

71

Args:

72

root: Root directory

73

tsv: TSV file to load ("train.tsv", "dev.tsv", "test.tsv")

74

url: Download URL identifier

75

folder_in_archive: Archive folder name

76

download: Whether to download

77

version: Dataset version

78

"""

79

80

class TEDLIUM(torch.utils.data.Dataset):

81

"""TED-LIUM ASR corpus - TED talks with transcripts."""

82

83

def __init__(self, root: str, release: str = "release3", subset: str = "train",

84

download: bool = False, audio_ext: str = ".sph") -> None:

85

"""

86

Args:

87

root: Root directory

88

release: Dataset release ("release1", "release2", "release3")

89

subset: Data subset ("train", "dev", "test")

90

download: Whether to download

91

audio_ext: Audio file extension

92

"""

93

94

class VoxCeleb1Identification(torch.utils.data.Dataset):

95

"""VoxCeleb1 speaker identification dataset."""

96

97

def __init__(self, root: str, subset: str = "train", meta_url: str = "vox1_meta.csv",

98

base_url: str = "https://mm.kaist.ac.kr/datasets/voxceleb/",

99

download: bool = False) -> None:

100

"""

101

Args:

102

root: Root directory

103

subset: "train", "dev", or "test"

104

meta_url: Metadata file URL

105

base_url: Base download URL

106

download: Whether to download

107

"""

108

```

109

110

### Speech Synthesis Datasets

111

112

Datasets for text-to-speech synthesis and voice conversion.

113

114

```python { .api }

115

class LJSPEECH(torch.utils.data.Dataset):

116

"""LJ Speech dataset - single speaker English TTS corpus."""

117

118

def __init__(self, root: str, url: str = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2",

119

folder_in_archive: str = "LJSpeech-1.1", download: bool = False) -> None:

120

"""

121

Args:

122

root: Root directory

123

url: Download URL

124

folder_in_archive: Archive folder name

125

download: Whether to download

126

"""

127

128

def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str]:

129

"""

130

Returns:

131

Tuple of (waveform, sample_rate, transcript, normalized_transcript)

132

"""

133

134

class LIBRITTS(torch.utils.data.Dataset):

135

"""LibriTTS multi-speaker English TTS corpus."""

136

137

def __init__(self, root: str, url: str = "train-clean-100",

138

folder_in_archive: str = "LibriTTS", download: bool = False,

139

subset: str = "train-clean-100") -> None:

140

"""

141

Args:

142

root: Root directory

143

url: Dataset subset URL

144

folder_in_archive: Archive folder name

145

download: Whether to download

146

subset: Data subset

147

"""

148

149

class VCTK_092(torch.utils.data.Dataset):

150

"""VCTK Corpus 0.92 - multi-speaker English TTS dataset."""

151

152

def __init__(self, root: str, mic_id: str = "mic1", download: bool = False,

153

url: str = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip",

154

folder_in_archive: str = "VCTK-Corpus-0.92") -> None:

155

"""

156

Args:

157

root: Root directory

158

mic_id: Microphone ID ("mic1" or "mic2")

159

download: Whether to download

160

url: Download URL

161

folder_in_archive: Archive folder name

162

"""

163

164

class CMUARCTIC(torch.utils.data.Dataset):

165

"""CMU ARCTIC speech synthesis database."""

166

167

def __init__(self, root: str, subset: str = "aew", download: bool = False,

168

url: str = "cmu_arctic", folder_in_archive: str = "ARCTIC") -> None:

169

"""

170

Args:

171

root: Root directory

172

subset: Speaker subset (e.g., "aew", "ahw", "aup", "awb")

173

download: Whether to download

174

url: Download URL

175

folder_in_archive: Archive folder name

176

"""

177

```

178

179

### Music and Audio Datasets

180

181

Datasets for music information retrieval and general audio analysis.

182

183

```python { .api }

184

class GTZAN(torch.utils.data.Dataset):

185

"""GTZAN Genre Collection - music genre classification dataset."""

186

187

def __init__(self, root: str, url: str = "http://opihi.cs.uvic.ca/sound/genres.tar.gz",

188

folder_in_archive: str = "genres", download: bool = False,

189

subset: Optional[str] = None) -> None:

190

"""

191

Args:

192

root: Root directory

193

url: Download URL

194

folder_in_archive: Archive folder name

195

download: Whether to download

196

subset: Specific genre subset or None for all

197

"""

198

199

def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str]:

200

"""

201

Returns:

202

Tuple of (waveform, sample_rate, genre_label)

203

"""

204

205

class MUSDB_HQ(torch.utils.data.Dataset):

206

"""MUSDB18-HQ source separation dataset."""

207

208

def __init__(self, root: str, subset: str = "train", sources: List[str] = None,

209

targets: List[str] = None, duration: Optional[float] = None,

210

sample_rate: int = 44100, overlap: float = 0.25,

211

num_workers: int = 0, split: str = "train", seed: int = 42,

212

download: bool = False) -> None:

213

"""

214

Args:

215

root: Root directory

216

subset: "train" or "test"

217

sources: List of source stems to load

218

targets: List of target stems for separation

219

duration: Duration of segments in seconds

220

sample_rate: Target sample rate

221

overlap: Overlap between segments

222

num_workers: Number of worker processes

223

split: Data split

224

seed: Random seed

225

download: Whether to download

226

"""

227

```

228

229

### Specialized Datasets

230

231

Datasets for specific audio processing tasks.

232

233

```python { .api }

234

class FluentSpeechCommands(torch.utils.data.Dataset):

235

"""Fluent Speech Commands - intent classification dataset."""

236

237

def __init__(self, root: str, subset: str = "train", download: bool = False) -> None:

238

"""

239

Args:

240

root: Root directory

241

subset: "train", "valid", or "test"

242

download: Whether to download

243

"""

244

245

class YESNO(torch.utils.data.Dataset):

246

"""Hebrew Yes/No dataset - simple binary classification."""

247

248

def __init__(self, root: str, url: str = "http://www.openslr.org/resources/1/waves_yesno.tar.gz",

249

folder_in_archive: str = "waves_yesno", download: bool = False) -> None:

250

"""

251

Args:

252

root: Root directory

253

url: Download URL

254

folder_in_archive: Archive folder name

255

download: Whether to download

256

"""

257

258

def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, List[int]]:

259

"""

260

Returns:

261

Tuple of (waveform, sample_rate, labels) where labels is list of 0s and 1s

262

"""

263

264

class CMUDict(torch.utils.data.Dataset):

265

"""CMU Pronouncing Dictionary - phonetic dictionary."""

266

267

def __init__(self, root: str, url: str = "cmudict-0.7b",

268

folder_in_archive: str = "cmudict", download: bool = False) -> None:

269

"""

270

Args:

271

root: Root directory

272

url: Dataset version

273

folder_in_archive: Archive folder name

274

download: Whether to download

275

"""

276

277

class LibriMix(torch.utils.data.Dataset):

278

"""LibriMix speech separation dataset."""

279

280

def __init__(self, root: str, subset: str = "train-360", num_speakers: int = 2,

281

sample_rate: int = 8000, task: str = "sep_clean", download: bool = False) -> None:

282

"""

283

Args:

284

root: Root directory

285

subset: Data subset

286

num_speakers: Number of speakers in mixture (2 or 3)

287

sample_rate: Sample rate (8000 or 16000)

288

task: Task type ("sep_clean", "sep_noisy", etc.)

289

download: Whether to download

290

"""

291

292

class QUESST14(torch.utils.data.Dataset):

293

"""QUESST 2014 Query by Example Spoken Term Detection."""

294

295

def __init__(self, root: str, subset: str = "docs", download: bool = False,

296

url: str = "quesst14_database", folder_in_archive: str = "quesst14Database") -> None:

297

"""

298

Args:

299

root: Root directory

300

subset: "docs", "dev", or "eval"

301

download: Whether to download

302

url: Download URL

303

folder_in_archive: Archive folder name

304

"""

305

306

class IEMOCAP(torch.utils.data.Dataset):

307

"""IEMOCAP emotion recognition dataset."""

308

309

def __init__(self, root: str, sessions: List[int] = [1, 2, 3, 4, 5],

310

utterance_type: str = "scripted", download: bool = False) -> None:

311

"""

312

Args:

313

root: Root directory

314

sessions: List of session numbers to include

315

utterance_type: "scripted" or "improvised"

316

download: Whether to download

317

"""

318

```

319

320

## Usage Examples

321

322

### LibriSpeech for ASR

323

324

```python

325

import torchaudio

326

from torchaudio.datasets import LIBRISPEECH

327

from torch.utils.data import DataLoader

328

329

# Create dataset

330

dataset = LIBRISPEECH(

331

root="./data",

332

url="train-clean-100", # 100 hours of clean training data

333

download=True

334

)

335

336

# Create data loader

337

dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: x)

338

339

# Iterate through data

340

for batch in dataloader:

341

for waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id in batch:

342

print(f"Waveform shape: {waveform.shape}")

343

print(f"Sample rate: {sample_rate}")

344

print(f"Transcript: {transcript}")

345

print(f"Speaker ID: {speaker_id}")

346

break

347

break

348

```

349

350

### LJ Speech for TTS

351

352

```python

353

import torchaudio

354

from torchaudio.datasets import LJSPEECH

355

356

# Create dataset

357

dataset = LJSPEECH(root="./data", download=True)

358

359

# Get a sample

360

waveform, sample_rate, transcript, normalized_transcript = dataset[0]

361

362

print(f"Audio shape: {waveform.shape}")

363

print(f"Original transcript: {transcript}")

364

print(f"Normalized transcript: {normalized_transcript}")

365

366

# Can be used with DataLoader for training TTS models

367

from torch.utils.data import DataLoader

368

dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

369

```

370

371

### GTZAN for Music Classification

372

373

```python

374

import torchaudio

375

from torchaudio.datasets import GTZAN

376

377

# Create dataset

378

dataset = GTZAN(root="./data", download=True)

379

380

# Get a sample

381

waveform, sample_rate, genre = dataset[0]

382

383

print(f"Audio shape: {waveform.shape}")

384

print(f"Sample rate: {sample_rate}")

385

print(f"Genre: {genre}")

386

387

# Genres: blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, rock

388

```

389

390

### Speech Commands for Keyword Spotting

391

392

```python

393

import torchaudio

394

from torchaudio.datasets import SPEECHCOMMANDS

395

396

# Create training dataset

397

train_set = SPEECHCOMMANDS(root="./data", subset="training", download=True)

398

399

# Get a sample

400

waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]

401

402

print(f"Audio shape: {waveform.shape}")

403

print(f"Command: {label}")

404

print(f"Speaker: {speaker_id}")

405

406

# Commands include: "yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"

407

```

408

409

These datasets provide standardized interfaces for common audio processing tasks and can be easily integrated into PyTorch training pipelines with consistent preprocessing and data loading patterns.