# Audio Datasets

Standard dataset loaders for common audio datasets with consistent interfaces and preprocessing. TorchAudio provides PyTorch-compatible dataset classes for speech recognition, synthesis, music analysis, and source separation research.

## Capabilities

### Speech Recognition Datasets

Datasets for training and evaluating automatic speech recognition systems.
```python { .api }
class LIBRISPEECH(torch.utils.data.Dataset):
    """LibriSpeech ASR corpus - large-scale English speech recognition dataset."""

    def __init__(self, root: str, url: str = "train-clean-100",
                 folder_in_archive: str = "LibriSpeech", download: bool = False) -> None:
        """
        Args:
            root: Root directory for dataset storage
            url: Dataset subset ("train-clean-100", "train-clean-360", "train-other-500",
                "dev-clean", "dev-other", "test-clean", "test-other")
            folder_in_archive: Folder name in archive
            download: Whether to download if not found
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, int, int, int]:
        """
        Returns:
            Tuple of (waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)
        """

class LibriSpeechBiasing(torch.utils.data.Dataset):
    """LibriSpeech dataset with word-level biasing lists for contextualized ASR."""

    def __init__(self, root: str, subset: str, audio_dir: str, download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: Dataset subset
            audio_dir: Directory containing audio files
            download: Whether to download if not found
        """

class SPEECHCOMMANDS(torch.utils.data.Dataset):
    """Google Speech Commands dataset - keyword spotting."""

    def __init__(self, root: str, url: str = "speech_commands_v0.02",
                 folder_in_archive: str = "SpeechCommands", download: bool = False,
                 subset: Optional[str] = None) -> None:
        """
        Args:
            root: Root directory
            url: Dataset version
            folder_in_archive: Folder name in archive
            download: Whether to download
            subset: "training", "validation", "testing", or None for all
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str, int]:
        """
        Returns:
            Tuple of (waveform, sample_rate, label, speaker_id, utterance_number)
        """

class COMMONVOICE(torch.utils.data.Dataset):
    """Mozilla Common Voice multilingual speech corpus."""

    def __init__(self, root: str, tsv: str = "train.tsv", url: str = "cv-corpus-4-2019-12-10",
                 folder_in_archive: str = "cv-corpus-4-2019-12-10", download: bool = False,
                 version: str = "cv-corpus-4-2019-12-10") -> None:
        """
        Args:
            root: Root directory
            tsv: TSV file to load ("train.tsv", "dev.tsv", "test.tsv")
            url: Download URL identifier
            folder_in_archive: Archive folder name
            download: Whether to download
            version: Dataset version
        """

class TEDLIUM(torch.utils.data.Dataset):
    """TED-LIUM ASR corpus - TED talks with transcripts."""

    def __init__(self, root: str, release: str = "release3", subset: str = "train",
                 download: bool = False, audio_ext: str = ".sph") -> None:
        """
        Args:
            root: Root directory
            release: Dataset release ("release1", "release2", "release3")
            subset: Data subset ("train", "dev", "test")
            download: Whether to download
            audio_ext: Audio file extension
        """

class VoxCeleb1Identification(torch.utils.data.Dataset):
    """VoxCeleb1 speaker identification dataset."""

    def __init__(self, root: str, subset: str = "train", meta_url: str = "vox1_meta.csv",
                 base_url: str = "https://mm.kaist.ac.kr/datasets/voxceleb/",
                 download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: "train", "dev", or "test"
            meta_url: Metadata file URL
            base_url: Base download URL
            download: Whether to download
        """
```

### Speech Synthesis Datasets

Datasets for text-to-speech synthesis and voice conversion.

```python { .api }
class LJSPEECH(torch.utils.data.Dataset):
    """LJ Speech dataset - single speaker English TTS corpus."""

    def __init__(self, root: str, url: str = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2",
                 folder_in_archive: str = "LJSpeech-1.1", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str]:
        """
        Returns:
            Tuple of (waveform, sample_rate, transcript, normalized_transcript)
        """

class LIBRITTS(torch.utils.data.Dataset):
    """LibriTTS multi-speaker English TTS corpus."""

    def __init__(self, root: str, url: str = "train-clean-100",
                 folder_in_archive: str = "LibriTTS", download: bool = False,
                 subset: str = "train-clean-100") -> None:
        """
        Args:
            root: Root directory
            url: Dataset subset URL
            folder_in_archive: Archive folder name
            download: Whether to download
            subset: Data subset
        """

class VCTK_092(torch.utils.data.Dataset):
    """VCTK Corpus 0.92 - multi-speaker English TTS dataset."""

    def __init__(self, root: str, mic_id: str = "mic1", download: bool = False,
                 url: str = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip",
                 folder_in_archive: str = "VCTK-Corpus-0.92") -> None:
        """
        Args:
            root: Root directory
            mic_id: Microphone ID ("mic1" or "mic2")
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """

class CMUARCTIC(torch.utils.data.Dataset):
    """CMU ARCTIC speech synthesis database."""

    def __init__(self, root: str, subset: str = "aew", download: bool = False,
                 url: str = "cmu_arctic", folder_in_archive: str = "ARCTIC") -> None:
        """
        Args:
            root: Root directory
            subset: Speaker subset (e.g., "aew", "ahw", "aup", "awb")
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """
```

### Music and Audio Datasets

Datasets for music information retrieval and general audio analysis.

```python { .api }
class GTZAN(torch.utils.data.Dataset):
    """GTZAN Genre Collection - music genre classification dataset."""

    def __init__(self, root: str, url: str = "http://opihi.cs.uvic.ca/sound/genres.tar.gz",
                 folder_in_archive: str = "genres", download: bool = False,
                 subset: Optional[str] = None) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
            subset: Specific genre subset or None for all
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str]:
        """
        Returns:
            Tuple of (waveform, sample_rate, genre_label)
        """

class MUSDB_HQ(torch.utils.data.Dataset):
    """MUSDB18-HQ source separation dataset."""

    def __init__(self, root: str, subset: str = "train", sources: List[str] = None,
                 targets: List[str] = None, duration: Optional[float] = None,
                 sample_rate: int = 44100, overlap: float = 0.25,
                 num_workers: int = 0, split: str = "train", seed: int = 42,
                 download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: "train" or "test"
            sources: List of source stems to load
            targets: List of target stems for separation
            duration: Duration of segments in seconds
            sample_rate: Target sample rate
            overlap: Overlap between segments
            num_workers: Number of worker processes
            split: Data split
            seed: Random seed
            download: Whether to download
        """
```

### Specialized Datasets

Datasets for specific audio processing tasks.

```python { .api }
class FluentSpeechCommands(torch.utils.data.Dataset):
    """Fluent Speech Commands - intent classification dataset."""

    def __init__(self, root: str, subset: str = "train", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: "train", "valid", or "test"
            download: Whether to download
        """

class YESNO(torch.utils.data.Dataset):
    """Hebrew Yes/No dataset - simple binary classification."""

    def __init__(self, root: str, url: str = "http://www.openslr.org/resources/1/waves_yesno.tar.gz",
                 folder_in_archive: str = "waves_yesno", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, List[int]]:
        """
        Returns:
            Tuple of (waveform, sample_rate, labels) where labels is list of 0s and 1s
        """

class CMUDict(torch.utils.data.Dataset):
    """CMU Pronouncing Dictionary - phonetic dictionary."""

    def __init__(self, root: str, url: str = "cmudict-0.7b",
                 folder_in_archive: str = "cmudict", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            url: Dataset version
            folder_in_archive: Archive folder name
            download: Whether to download
        """

class LibriMix(torch.utils.data.Dataset):
    """LibriMix speech separation dataset."""

    def __init__(self, root: str, subset: str = "train-360", num_speakers: int = 2,
                 sample_rate: int = 8000, task: str = "sep_clean", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: Data subset
            num_speakers: Number of speakers in mixture (2 or 3)
            sample_rate: Sample rate (8000 or 16000)
            task: Task type ("sep_clean", "sep_noisy", etc.)
            download: Whether to download
        """

class QUESST14(torch.utils.data.Dataset):
    """QUESST 2014 Query by Example Spoken Term Detection."""

    def __init__(self, root: str, subset: str = "docs", download: bool = False,
                 url: str = "quesst14_database", folder_in_archive: str = "quesst14Database") -> None:
        """
        Args:
            root: Root directory
            subset: "docs", "dev", or "eval"
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """

class IEMOCAP(torch.utils.data.Dataset):
    """IEMOCAP emotion recognition dataset."""

    def __init__(self, root: str, sessions: List[int] = [1, 2, 3, 4, 5],
                 utterance_type: str = "scripted", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            sessions: List of session numbers to include
            utterance_type: "scripted" or "improvised"
            download: Whether to download
        """
```

## Usage Examples

### LibriSpeech for ASR

```python
import torchaudio
from torchaudio.datasets import LIBRISPEECH
from torch.utils.data import DataLoader

# Create dataset
dataset = LIBRISPEECH(
    root="./data",
    url="train-clean-100",  # 100 hours of clean training data
    download=True
)

# Create data loader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: x)

# Iterate through data
for batch in dataloader:
    for waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id in batch:
        print(f"Waveform shape: {waveform.shape}")
        print(f"Sample rate: {sample_rate}")
        print(f"Transcript: {transcript}")
        print(f"Speaker ID: {speaker_id}")
        break
    break
```

### LJ Speech for TTS

```python
import torchaudio
from torchaudio.datasets import LJSPEECH

# Create dataset
dataset = LJSPEECH(root="./data", download=True)

# Get a sample
waveform, sample_rate, transcript, normalized_transcript = dataset[0]

print(f"Audio shape: {waveform.shape}")
print(f"Original transcript: {transcript}")
print(f"Normalized transcript: {normalized_transcript}")

# Can be used with DataLoader for training TTS models
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
```

### GTZAN for Music Classification

```python
import torchaudio
from torchaudio.datasets import GTZAN

# Create dataset
dataset = GTZAN(root="./data", download=True)

# Get a sample
waveform, sample_rate, genre = dataset[0]

print(f"Audio shape: {waveform.shape}")
print(f"Sample rate: {sample_rate}")
print(f"Genre: {genre}")

# Genres: blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, rock
```

### Speech Commands for Keyword Spotting

```python
import torchaudio
from torchaudio.datasets import SPEECHCOMMANDS

# Create training dataset
train_set = SPEECHCOMMANDS(root="./data", subset="training", download=True)

# Get a sample
waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]

print(f"Audio shape: {waveform.shape}")
print(f"Command: {label}")
print(f"Speaker: {speaker_id}")

# Commands include: "yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"
```

These datasets provide standardized interfaces for common audio processing tasks and can be easily integrated into PyTorch training pipelines with consistent preprocessing and data loading patterns.