# Audio Datasets

Standard dataset loaders for common audio datasets with consistent interfaces and preprocessing. TorchAudio provides PyTorch-compatible dataset classes for speech recognition, synthesis, music analysis, and source separation research.

## Capabilities

### Speech Recognition Datasets

Datasets for training and evaluating automatic speech recognition systems.
```python { .api }
class LIBRISPEECH(torch.utils.data.Dataset):
    """LibriSpeech ASR corpus - large-scale English speech recognition dataset."""

    def __init__(self, root: str, url: str = "train-clean-100",
                 folder_in_archive: str = "LibriSpeech", download: bool = False) -> None:
        """
        Args:
            root: Root directory for dataset storage
            url: Dataset subset ("train-clean-100", "train-clean-360", "train-other-500",
                "dev-clean", "dev-other", "test-clean", "test-other")
            folder_in_archive: Folder name in archive
            download: Whether to download if not found
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, int, int, int]:
        """
        Returns:
            Tuple of (waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)
        """

class LibriSpeechBiasing(torch.utils.data.Dataset):
    """LibriSpeech dataset with word-level biasing lists for contextualized ASR."""

    def __init__(self, root: str, subset: str, audio_dir: str, download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: Dataset subset
            audio_dir: Directory containing audio files
            download: Whether to download if not found
        """

class SPEECHCOMMANDS(torch.utils.data.Dataset):
    """Google Speech Commands dataset - keyword spotting."""

    def __init__(self, root: str, url: str = "speech_commands_v0.02",
                 folder_in_archive: str = "SpeechCommands", download: bool = False,
                 subset: Optional[str] = None) -> None:
        """
        Args:
            root: Root directory
            url: Dataset version
            folder_in_archive: Folder name in archive
            download: Whether to download
            subset: "training", "validation", "testing", or None for all
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str, int]:
        """
        Returns:
            Tuple of (waveform, sample_rate, label, speaker_id, utterance_number)
        """

class COMMONVOICE(torch.utils.data.Dataset):
    """Mozilla Common Voice multilingual speech corpus."""

    def __init__(self, root: str, tsv: str = "train.tsv", url: str = "cv-corpus-4-2019-12-10",
                 folder_in_archive: str = "cv-corpus-4-2019-12-10", download: bool = False,
                 version: str = "cv-corpus-4-2019-12-10") -> None:
        """
        Args:
            root: Root directory
            tsv: TSV file to load ("train.tsv", "dev.tsv", "test.tsv")
            url: Download URL identifier
            folder_in_archive: Archive folder name
            download: Whether to download
            version: Dataset version
        """

class TEDLIUM(torch.utils.data.Dataset):
    """TED-LIUM ASR corpus - TED talks with transcripts."""

    def __init__(self, root: str, release: str = "release3", subset: str = "train",
                 download: bool = False, audio_ext: str = ".sph") -> None:
        """
        Args:
            root: Root directory
            release: Dataset release ("release1", "release2", "release3")
            subset: Data subset ("train", "dev", "test")
            download: Whether to download
            audio_ext: Audio file extension
        """

class VoxCeleb1Identification(torch.utils.data.Dataset):
    """VoxCeleb1 speaker identification dataset."""

    def __init__(self, root: str, subset: str = "train", meta_url: str = "vox1_meta.csv",
                 base_url: str = "https://mm.kaist.ac.kr/datasets/voxceleb/",
                 download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: "train", "dev", or "test"
            meta_url: Metadata file URL
            base_url: Base download URL
            download: Whether to download
        """
```

### Speech Synthesis Datasets

Datasets for text-to-speech synthesis and voice conversion.

```python { .api }
class LJSPEECH(torch.utils.data.Dataset):
    """LJ Speech dataset - single speaker English TTS corpus."""

    def __init__(self, root: str, url: str = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2",
                 folder_in_archive: str = "LJSpeech-1.1", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str]:
        """
        Returns:
            Tuple of (waveform, sample_rate, transcript, normalized_transcript)
        """

class LIBRITTS(torch.utils.data.Dataset):
    """LibriTTS multi-speaker English TTS corpus."""

    def __init__(self, root: str, url: str = "train-clean-100",
                 folder_in_archive: str = "LibriTTS", download: bool = False,
                 subset: str = "train-clean-100") -> None:
        """
        Args:
            root: Root directory
            url: Dataset subset URL
            folder_in_archive: Archive folder name
            download: Whether to download
            subset: Data subset
        """

class VCTK_092(torch.utils.data.Dataset):
    """VCTK Corpus 0.92 - multi-speaker English TTS dataset."""

    def __init__(self, root: str, mic_id: str = "mic1", download: bool = False,
                 url: str = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip",
                 folder_in_archive: str = "VCTK-Corpus-0.92") -> None:
        """
        Args:
            root: Root directory
            mic_id: Microphone ID ("mic1" or "mic2")
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """

class CMUARCTIC(torch.utils.data.Dataset):
    """CMU ARCTIC speech synthesis database."""

    def __init__(self, root: str, subset: str = "aew", download: bool = False,
                 url: str = "cmu_arctic", folder_in_archive: str = "ARCTIC") -> None:
        """
        Args:
            root: Root directory
            subset: Speaker subset (e.g., "aew", "ahw", "aup", "awb")
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """
```

### Music and Audio Datasets

Datasets for music information retrieval and general audio analysis.

```python { .api }
class GTZAN(torch.utils.data.Dataset):
    """GTZAN Genre Collection - music genre classification dataset."""

    def __init__(self, root: str, url: str = "http://opihi.cs.uvic.ca/sound/genres.tar.gz",
                 folder_in_archive: str = "genres", download: bool = False,
                 subset: Optional[str] = None) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
            subset: Specific genre subset or None for all
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str]:
        """
        Returns:
            Tuple of (waveform, sample_rate, genre_label)
        """

class MUSDB_HQ(torch.utils.data.Dataset):
    """MUSDB18-HQ source separation dataset."""

    def __init__(self, root: str, subset: str = "train", sources: List[str] = None,
                 targets: List[str] = None, duration: Optional[float] = None,
                 sample_rate: int = 44100, overlap: float = 0.25,
                 num_workers: int = 0, split: str = "train", seed: int = 42,
                 download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: "train" or "test"
            sources: List of source stems to load
            targets: List of target stems for separation
            duration: Duration of segments in seconds
            sample_rate: Target sample rate
            overlap: Overlap between segments
            num_workers: Number of worker processes
            split: Data split
            seed: Random seed
            download: Whether to download
        """
```

### Specialized Datasets

Datasets for specific audio processing tasks.

```python { .api }
class FluentSpeechCommands(torch.utils.data.Dataset):
    """Fluent Speech Commands - intent classification dataset."""

    def __init__(self, root: str, subset: str = "train", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: "train", "valid", or "test"
            download: Whether to download
        """

class YESNO(torch.utils.data.Dataset):
    """Hebrew Yes/No dataset - simple binary classification."""

    def __init__(self, root: str, url: str = "http://www.openslr.org/resources/1/waves_yesno.tar.gz",
                 folder_in_archive: str = "waves_yesno", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, List[int]]:
        """
        Returns:
            Tuple of (waveform, sample_rate, labels) where labels is list of 0s and 1s
        """

class CMUDict(torch.utils.data.Dataset):
    """CMU Pronouncing Dictionary - phonetic dictionary."""

    def __init__(self, root: str, url: str = "cmudict-0.7b",
                 folder_in_archive: str = "cmudict", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            url: Dataset version
            folder_in_archive: Archive folder name
            download: Whether to download
        """

class LibriMix(torch.utils.data.Dataset):
    """LibriMix speech separation dataset."""

    def __init__(self, root: str, subset: str = "train-360", num_speakers: int = 2,
                 sample_rate: int = 8000, task: str = "sep_clean", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: Data subset
            num_speakers: Number of speakers in mixture (2 or 3)
            sample_rate: Sample rate (8000 or 16000)
            task: Task type ("sep_clean", "sep_noisy", etc.)
            download: Whether to download
        """

class QUESST14(torch.utils.data.Dataset):
    """QUESST 2014 Query by Example Spoken Term Detection."""

    def __init__(self, root: str, subset: str = "docs", download: bool = False,
                 url: str = "quesst14_database", folder_in_archive: str = "quesst14Database") -> None:
        """
        Args:
            root: Root directory
            subset: "docs", "dev", or "eval"
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """

class IEMOCAP(torch.utils.data.Dataset):
    """IEMOCAP emotion recognition dataset."""

    def __init__(self, root: str, sessions: List[int] = [1, 2, 3, 4, 5],
                 utterance_type: str = "scripted", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            sessions: List of session numbers to include
            utterance_type: "scripted" or "improvised"
            download: Whether to download
        """
```

## Usage Examples

### LibriSpeech for ASR

```python
import torchaudio
from torchaudio.datasets import LIBRISPEECH
from torch.utils.data import DataLoader

# Create dataset
dataset = LIBRISPEECH(
    root="./data",
    url="train-clean-100",  # 100 hours of clean training data
    download=True
)

# Create data loader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: x)

# Iterate through data
for batch in dataloader:
    for waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id in batch:
        print(f"Waveform shape: {waveform.shape}")
        print(f"Sample rate: {sample_rate}")
        print(f"Transcript: {transcript}")
        print(f"Speaker ID: {speaker_id}")
        break
    break
```

### LJ Speech for TTS

```python
import torchaudio
from torchaudio.datasets import LJSPEECH

# Create dataset
dataset = LJSPEECH(root="./data", download=True)

# Get a sample
waveform, sample_rate, transcript, normalized_transcript = dataset[0]

print(f"Audio shape: {waveform.shape}")
print(f"Original transcript: {transcript}")
print(f"Normalized transcript: {normalized_transcript}")

# Can be used with DataLoader for training TTS models
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
```

### GTZAN for Music Classification

```python
import torchaudio
from torchaudio.datasets import GTZAN

# Create dataset
dataset = GTZAN(root="./data", download=True)

# Get a sample
waveform, sample_rate, genre = dataset[0]

print(f"Audio shape: {waveform.shape}")
print(f"Sample rate: {sample_rate}")
print(f"Genre: {genre}")

# Genres: blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, rock
```

### Speech Commands for Keyword Spotting

```python
import torchaudio
from torchaudio.datasets import SPEECHCOMMANDS

# Create training dataset
train_set = SPEECHCOMMANDS(root="./data", subset="training", download=True)

# Get a sample
waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]

print(f"Audio shape: {waveform.shape}")
print(f"Command: {label}")
print(f"Speaker: {speaker_id}")

# Commands include: "yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"
```

These datasets provide standardized interfaces for common audio processing tasks and can be easily integrated into PyTorch training pipelines with consistent preprocessing and data loading patterns.