An audio package for PyTorch providing GPU-accelerated audio I/O operations, signal processing transforms, and machine learning utilities for audio data.
Standard dataset loaders for common audio datasets with consistent interfaces and preprocessing. TorchAudio provides PyTorch-compatible dataset classes for speech recognition, synthesis, music analysis, and source separation research.
Datasets for training and evaluating automatic speech recognition systems.
class LIBRISPEECH(torch.utils.data.Dataset):
    """LibriSpeech ASR corpus - large-scale English speech recognition dataset."""

    def __init__(
        self,
        root: str,
        url: str = "train-clean-100",
        folder_in_archive: str = "LibriSpeech",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory for dataset storage
            url: Dataset subset ("train-clean-100", "train-clean-360", "train-other-500",
                "dev-clean", "dev-other", "test-clean", "test-other")
            folder_in_archive: Folder name in archive
            download: Whether to download if not found
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, int, int, int]:
        """
        Returns:
            Tuple of (waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)
        """
class LibriSpeechBiasing(torch.utils.data.Dataset):
    """LibriSpeech dataset with word-level biasing lists for contextualized ASR."""

    def __init__(
        self,
        root: str,
        subset: str,
        audio_dir: str,
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            subset: Dataset subset
            audio_dir: Directory containing audio files
            download: Whether to download if not found
        """
class SPEECHCOMMANDS(torch.utils.data.Dataset):
    """Google Speech Commands dataset - keyword spotting."""

    def __init__(
        self,
        root: str,
        url: str = "speech_commands_v0.02",
        folder_in_archive: str = "SpeechCommands",
        download: bool = False,
        subset: Optional[str] = None,
    ) -> None:
        """
        Args:
            root: Root directory
            url: Dataset version
            folder_in_archive: Folder name in archive
            download: Whether to download
            subset: "training", "validation", "testing", or None for all
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str, int]:
        """
        Returns:
            Tuple of (waveform, sample_rate, label, speaker_id, utterance_number)
        """
class COMMONVOICE(torch.utils.data.Dataset):
    """Mozilla Common Voice multilingual speech corpus."""

    def __init__(
        self,
        root: str,
        tsv: str = "train.tsv",
        url: str = "cv-corpus-4-2019-12-10",
        folder_in_archive: str = "cv-corpus-4-2019-12-10",
        download: bool = False,
        version: str = "cv-corpus-4-2019-12-10",
    ) -> None:
        """
        Args:
            root: Root directory
            tsv: TSV file to load ("train.tsv", "dev.tsv", "test.tsv")
            url: Download URL identifier
            folder_in_archive: Archive folder name
            download: Whether to download
            version: Dataset version
        """
class TEDLIUM(torch.utils.data.Dataset):
    """TED-LIUM ASR corpus - TED talks with transcripts."""

    def __init__(
        self,
        root: str,
        release: str = "release3",
        subset: str = "train",
        download: bool = False,
        audio_ext: str = ".sph",
    ) -> None:
        """
        Args:
            root: Root directory
            release: Dataset release ("release1", "release2", "release3")
            subset: Data subset ("train", "dev", "test")
            download: Whether to download
            audio_ext: Audio file extension
        """
class VoxCeleb1Identification(torch.utils.data.Dataset):
    """VoxCeleb1 speaker identification dataset."""

    def __init__(
        self,
        root: str,
        subset: str = "train",
        meta_url: str = "vox1_meta.csv",
        base_url: str = "https://mm.kaist.ac.kr/datasets/voxceleb/",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            subset: "train", "dev", or "test"
            meta_url: Metadata file URL
            base_url: Base download URL
            download: Whether to download
        """


# Datasets for text-to-speech synthesis and voice conversion.
class LJSPEECH(torch.utils.data.Dataset):
    """LJ Speech dataset - single speaker English TTS corpus."""

    def __init__(
        self,
        root: str,
        url: str = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2",
        folder_in_archive: str = "LJSpeech-1.1",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str]:
        """
        Returns:
            Tuple of (waveform, sample_rate, transcript, normalized_transcript)
        """
class LIBRITTS(torch.utils.data.Dataset):
    """LibriTTS multi-speaker English TTS corpus."""

    def __init__(
        self,
        root: str,
        url: str = "train-clean-100",
        folder_in_archive: str = "LibriTTS",
        download: bool = False,
        subset: str = "train-clean-100",
    ) -> None:
        """
        Args:
            root: Root directory
            url: Dataset subset URL
            folder_in_archive: Archive folder name
            download: Whether to download
            subset: Data subset
        """
class VCTK_092(torch.utils.data.Dataset):
    """VCTK Corpus 0.92 - multi-speaker English TTS dataset."""

    def __init__(
        self,
        root: str,
        mic_id: str = "mic1",
        download: bool = False,
        url: str = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip",
        folder_in_archive: str = "VCTK-Corpus-0.92",
    ) -> None:
        """
        Args:
            root: Root directory
            mic_id: Microphone ID ("mic1" or "mic2")
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """
class CMUARCTIC(torch.utils.data.Dataset):
    """CMU ARCTIC speech synthesis database."""

    def __init__(
        self,
        root: str,
        subset: str = "aew",
        download: bool = False,
        url: str = "cmu_arctic",
        folder_in_archive: str = "ARCTIC",
    ) -> None:
        """
        Args:
            root: Root directory
            subset: Speaker subset (e.g., "aew", "ahw", "aup", "awb")
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """


# Datasets for music information retrieval and general audio analysis.
class GTZAN(torch.utils.data.Dataset):
    """GTZAN Genre Collection - music genre classification dataset."""

    def __init__(
        self,
        root: str,
        url: str = "http://opihi.cs.uvic.ca/sound/genres.tar.gz",
        folder_in_archive: str = "genres",
        download: bool = False,
        subset: Optional[str] = None,
    ) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
            subset: Specific genre subset or None for all
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str]:
        """
        Returns:
            Tuple of (waveform, sample_rate, genre_label)
        """
class MUSDB_HQ(torch.utils.data.Dataset):
    """MUSDB18-HQ source separation dataset."""

    def __init__(
        self,
        root: str,
        subset: str = "train",
        # Fixed: the defaults are None, so the annotations must be Optional[List[str]],
        # not the bare List[str] the original declared.
        sources: Optional[List[str]] = None,
        targets: Optional[List[str]] = None,
        duration: Optional[float] = None,
        sample_rate: int = 44100,
        overlap: float = 0.25,
        num_workers: int = 0,
        split: str = "train",
        seed: int = 42,
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            subset: "train" or "test"
            sources: List of source stems to load, or None for the default stems
            targets: List of target stems for separation, or None for the default targets
            duration: Duration of segments in seconds
            sample_rate: Target sample rate
            overlap: Overlap between segments
            num_workers: Number of worker processes
            split: Data split
            seed: Random seed
            download: Whether to download
        """


# Datasets for specific audio processing tasks.
class FluentSpeechCommands(torch.utils.data.Dataset):
    """Fluent Speech Commands - intent classification dataset."""

    def __init__(
        self,
        root: str,
        subset: str = "train",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            subset: "train", "valid", or "test"
            download: Whether to download
        """
class YESNO(torch.utils.data.Dataset):
    """Hebrew Yes/No dataset - simple binary classification."""

    def __init__(
        self,
        root: str,
        url: str = "http://www.openslr.org/resources/1/waves_yesno.tar.gz",
        folder_in_archive: str = "waves_yesno",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, List[int]]:
        """
        Returns:
            Tuple of (waveform, sample_rate, labels) where labels is list of 0s and 1s
        """
class CMUDict(torch.utils.data.Dataset):
    """CMU Pronouncing Dictionary - phonetic dictionary."""

    def __init__(
        self,
        root: str,
        url: str = "cmudict-0.7b",
        folder_in_archive: str = "cmudict",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            url: Dataset version
            folder_in_archive: Archive folder name
            download: Whether to download
        """
class LibriMix(torch.utils.data.Dataset):
    """LibriMix speech separation dataset."""

    def __init__(
        self,
        root: str,
        subset: str = "train-360",
        num_speakers: int = 2,
        sample_rate: int = 8000,
        task: str = "sep_clean",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            subset: Data subset
            num_speakers: Number of speakers in mixture (2 or 3)
            sample_rate: Sample rate (8000 or 16000)
            task: Task type ("sep_clean", "sep_noisy", etc.)
            download: Whether to download
        """
class QUESST14(torch.utils.data.Dataset):
    """QUESST 2014 Query by Example Spoken Term Detection."""

    def __init__(
        self,
        root: str,
        subset: str = "docs",
        download: bool = False,
        url: str = "quesst14_database",
        folder_in_archive: str = "quesst14Database",
    ) -> None:
        """
        Args:
            root: Root directory
            subset: "docs", "dev", or "eval"
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """
class IEMOCAP(torch.utils.data.Dataset):
    """IEMOCAP emotion recognition dataset."""

    def __init__(
        self,
        root: str,
        # Fixed: a mutable list default ([1, 2, 3, 4, 5]) is shared across all calls;
        # an immutable tuple default avoids accidental cross-instance mutation and
        # matches the upstream torchaudio signature. Callers may still pass a list.
        sessions: Tuple[int, ...] = (1, 2, 3, 4, 5),
        utterance_type: str = "scripted",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            sessions: Session numbers to include
            utterance_type: "scripted" or "improvised"
            download: Whether to download
        """
# Example: stream LibriSpeech through a DataLoader.
import torchaudio
from torchaudio.datasets import LIBRISPEECH
from torch.utils.data import DataLoader

# Create dataset
dataset = LIBRISPEECH(
    root="./data",
    url="train-clean-100",  # 100 hours of clean training data
    download=True,
)

# Create data loader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: x)

# Iterate through data
for batch in dataloader:
    for waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id in batch:
        print(f"Waveform shape: {waveform.shape}")
        print(f"Sample rate: {sample_rate}")
        print(f"Transcript: {transcript}")
        print(f"Speaker ID: {speaker_id}")
        break
    break
# Example: inspect an LJ Speech sample for TTS work.
import torchaudio
from torchaudio.datasets import LJSPEECH

# Create dataset
dataset = LJSPEECH(root="./data", download=True)

# Get a sample
waveform, sample_rate, transcript, normalized_transcript = dataset[0]
print(f"Audio shape: {waveform.shape}")
print(f"Original transcript: {transcript}")
print(f"Normalized transcript: {normalized_transcript}")

# Can be used with DataLoader for training TTS models
from torch.utils.data import DataLoader

dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
# Example: read one GTZAN clip and its genre label.
import torchaudio
from torchaudio.datasets import GTZAN

# Create dataset
dataset = GTZAN(root="./data", download=True)

# Get a sample
waveform, sample_rate, genre = dataset[0]
print(f"Audio shape: {waveform.shape}")
print(f"Sample rate: {sample_rate}")
print(f"Genre: {genre}")
# Genres: blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, rock
# Example: load the Speech Commands training subset for keyword spotting.
import torchaudio
from torchaudio.datasets import SPEECHCOMMANDS

# Create training dataset
train_set = SPEECHCOMMANDS(root="./data", subset="training", download=True)

# Get a sample
waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]
print(f"Audio shape: {waveform.shape}")
print(f"Command: {label}")
print(f"Speaker: {speaker_id}")
# Commands include: "yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"

# These datasets provide standardized interfaces for common audio processing tasks
# and can be easily integrated into PyTorch training pipelines with consistent
# preprocessing and data loading patterns.
Install with the Tessl CLI:

    npx tessl i tessl/pypi-torchaudio