CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-torchaudio

An audio package for PyTorch providing GPU-accelerated audio I/O operations, signal processing transforms, and machine learning utilities for audio data.

Overview
Eval results
Files

docs/datasets.md

Audio Datasets

Standard dataset loaders for common audio datasets with consistent interfaces and preprocessing. TorchAudio provides PyTorch-compatible dataset classes for speech recognition, synthesis, music analysis, and source separation research.

Capabilities

Speech Recognition Datasets

Datasets for training and evaluating automatic speech recognition systems.

class LIBRISPEECH(torch.utils.data.Dataset):
    """LibriSpeech ASR corpus - large-scale English speech recognition dataset."""
    
    def __init__(self, root: str, url: str = "train-clean-100", 
                 folder_in_archive: str = "LibriSpeech", download: bool = False) -> None:
        """
        Args:
            root: Root directory for dataset storage
            url: Dataset subset ("train-clean-100", "train-clean-360", "train-other-500", 
                                "dev-clean", "dev-other", "test-clean", "test-other")
            folder_in_archive: Folder name in archive
            download: Whether to download if not found
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, int, int, int]:
        """
        Returns:
            Tuple of (waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)
        """

class LibriSpeechBiasing(torch.utils.data.Dataset):
    """LibriSpeech dataset with word-level biasing lists for contextualized ASR."""
    
    def __init__(self, root: str, subset: str, audio_dir: str, download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: Dataset subset 
            audio_dir: Directory containing audio files
            download: Whether to download if not found
        """

class SPEECHCOMMANDS(torch.utils.data.Dataset):
    """Google Speech Commands dataset - keyword spotting."""
    
    def __init__(self, root: str, url: str = "speech_commands_v0.02", 
                 folder_in_archive: str = "SpeechCommands", download: bool = False,
                 subset: Optional[str] = None) -> None:
        """
        Args:
            root: Root directory
            url: Dataset version
            folder_in_archive: Folder name in archive
            download: Whether to download
            subset: "training", "validation", "testing", or None for all
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str, int]:
        """
        Returns:
            Tuple of (waveform, sample_rate, label, speaker_id, utterance_number)
        """

class COMMONVOICE(torch.utils.data.Dataset):
    """Mozilla Common Voice multilingual speech corpus."""
    
    def __init__(self, root: str, tsv: str = "train.tsv", url: str = "cv-corpus-4-2019-12-10",
                 folder_in_archive: str = "cv-corpus-4-2019-12-10", download: bool = False,
                 version: str = "cv-corpus-4-2019-12-10") -> None:
        """
        Args:
            root: Root directory
            tsv: TSV file to load ("train.tsv", "dev.tsv", "test.tsv")
            url: Download URL identifier
            folder_in_archive: Archive folder name
            download: Whether to download
            version: Dataset version
        """

class TEDLIUM(torch.utils.data.Dataset):
    """TED-LIUM ASR corpus - TED talks with transcripts."""
    
    def __init__(self, root: str, release: str = "release3", subset: str = "train",
                 download: bool = False, audio_ext: str = ".sph") -> None:
        """
        Args:
            root: Root directory
            release: Dataset release ("release1", "release2", "release3")
            subset: Data subset ("train", "dev", "test")
            download: Whether to download
            audio_ext: Audio file extension
        """

class VoxCeleb1Identification(torch.utils.data.Dataset):
    """VoxCeleb1 speaker identification dataset."""
    
    def __init__(self, root: str, subset: str = "train", meta_url: str = "vox1_meta.csv",
                 base_url: str = "https://mm.kaist.ac.kr/datasets/voxceleb/",
                 download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: "train", "dev", or "test"
            meta_url: Metadata file URL
            base_url: Base download URL
            download: Whether to download
        """

Speech Synthesis Datasets

Datasets for text-to-speech synthesis and voice conversion.

class LJSPEECH(torch.utils.data.Dataset):
    """LJ Speech dataset - single speaker English TTS corpus."""
    
    def __init__(self, root: str, url: str = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2",
                 folder_in_archive: str = "LJSpeech-1.1", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str]:
        """
        Returns:
            Tuple of (waveform, sample_rate, transcript, normalized_transcript)
        """

class LIBRITTS(torch.utils.data.Dataset):
    """LibriTTS multi-speaker English TTS corpus."""
    
    def __init__(self, root: str, url: str = "train-clean-100", 
                 folder_in_archive: str = "LibriTTS", download: bool = False,
                 subset: str = "train-clean-100") -> None:
        """
        Args:
            root: Root directory
            url: Dataset subset URL
            folder_in_archive: Archive folder name
            download: Whether to download
            subset: Data subset
        """

class VCTK_092(torch.utils.data.Dataset):
    """VCTK Corpus 0.92 - multi-speaker English TTS dataset."""
    
    def __init__(self, root: str, mic_id: str = "mic1", download: bool = False,
                 url: str = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip",
                 folder_in_archive: str = "VCTK-Corpus-0.92") -> None:
        """
        Args:
            root: Root directory
            mic_id: Microphone ID ("mic1" or "mic2")
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """

class CMUARCTIC(torch.utils.data.Dataset):
    """CMU ARCTIC speech synthesis database."""
    
    def __init__(self, root: str, subset: str = "aew", download: bool = False,
                 url: str = "cmu_arctic", folder_in_archive: str = "ARCTIC") -> None:
        """
        Args:
            root: Root directory
            subset: Speaker subset (e.g., "aew", "ahw", "aup", "awb")
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """

Music and Audio Datasets

Datasets for music information retrieval and general audio analysis.

class GTZAN(torch.utils.data.Dataset):
    """GTZAN Genre Collection - music genre classification dataset."""
    
    def __init__(self, root: str, url: str = "http://opihi.cs.uvic.ca/sound/genres.tar.gz",
                 folder_in_archive: str = "genres", download: bool = False,
                 subset: Optional[str] = None) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
            subset: Specific genre subset or None for all
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str]:
        """
        Returns:
            Tuple of (waveform, sample_rate, genre_label)
        """

class MUSDB_HQ(torch.utils.data.Dataset):
    """MUSDB18-HQ source separation dataset."""
    
    def __init__(self, root: str, subset: str = "train", sources: Optional[List[str]] = None,
                 targets: Optional[List[str]] = None, duration: Optional[float] = None,
                 sample_rate: int = 44100, overlap: float = 0.25,
                 num_workers: int = 0, split: str = "train", seed: int = 42,
                 download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: "train" or "test"
            sources: List of source stems to load
            targets: List of target stems for separation
            duration: Duration of segments in seconds
            sample_rate: Target sample rate
            overlap: Overlap between segments
            num_workers: Number of worker processes
            split: Data split
            seed: Random seed
            download: Whether to download
        """

Specialized Datasets

Datasets for specific audio processing tasks.

class FluentSpeechCommands(torch.utils.data.Dataset):
    """Fluent Speech Commands - intent classification dataset."""
    
    def __init__(self, root: str, subset: str = "train", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: "train", "valid", or "test"
            download: Whether to download
        """

class YESNO(torch.utils.data.Dataset):
    """Hebrew Yes/No dataset - simple binary classification."""
    
    def __init__(self, root: str, url: str = "http://www.openslr.org/resources/1/waves_yesno.tar.gz",
                 folder_in_archive: str = "waves_yesno", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, List[int]]:
        """
        Returns:
            Tuple of (waveform, sample_rate, labels) where labels is list of 0s and 1s
        """

class CMUDict(torch.utils.data.Dataset):
    """CMU Pronouncing Dictionary - phonetic dictionary."""
    
    def __init__(self, root: str, url: str = "cmudict-0.7b", 
                 folder_in_archive: str = "cmudict", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            url: Dataset version
            folder_in_archive: Archive folder name
            download: Whether to download
        """

class LibriMix(torch.utils.data.Dataset):
    """LibriMix speech separation dataset."""
    
    def __init__(self, root: str, subset: str = "train-360", num_speakers: int = 2,
                 sample_rate: int = 8000, task: str = "sep_clean", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            subset: Data subset
            num_speakers: Number of speakers in mixture (2 or 3)
            sample_rate: Sample rate (8000 or 16000)
            task: Task type ("sep_clean", "sep_noisy", etc.)
            download: Whether to download
        """

class QUESST14(torch.utils.data.Dataset):
    """QUESST 2014 Query by Example Spoken Term Detection."""
    
    def __init__(self, root: str, subset: str = "docs", download: bool = False,
                 url: str = "quesst14_database", folder_in_archive: str = "quesst14Database") -> None:
        """
        Args:
            root: Root directory
            subset: "docs", "dev", or "eval"
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """

class IEMOCAP(torch.utils.data.Dataset):
    """IEMOCAP emotion recognition dataset."""
    
    def __init__(self, root: str, sessions: Tuple[int, ...] = (1, 2, 3, 4, 5),
                 utterance_type: str = "scripted", download: bool = False) -> None:
        """
        Args:
            root: Root directory
            sessions: List of session numbers to include
            utterance_type: "scripted" or "improvised"
            download: Whether to download
        """

Usage Examples

LibriSpeech for ASR

import torchaudio
from torchaudio.datasets import LIBRISPEECH
from torch.utils.data import DataLoader

# Create dataset
dataset = LIBRISPEECH(
    root="./data",
    url="train-clean-100",  # 100 hours of clean training data
    download=True
)

# Create data loader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: x)

# Iterate through data
for batch in dataloader:
    for waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id in batch:
        print(f"Waveform shape: {waveform.shape}")
        print(f"Sample rate: {sample_rate}")
        print(f"Transcript: {transcript}")
        print(f"Speaker ID: {speaker_id}")
        break
    break

LJ Speech for TTS

import torchaudio
from torchaudio.datasets import LJSPEECH

# Create dataset
dataset = LJSPEECH(root="./data", download=True)

# Get a sample
waveform, sample_rate, transcript, normalized_transcript = dataset[0]

print(f"Audio shape: {waveform.shape}")
print(f"Original transcript: {transcript}")
print(f"Normalized transcript: {normalized_transcript}")

# Can be used with DataLoader for training TTS models. Note that waveforms vary in
# length, so a custom collate_fn (e.g. one that pads to the longest clip, or the
# identity function shown here) is required — the default collate cannot stack them.
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=lambda x: x)

GTZAN for Music Classification

import torchaudio
from torchaudio.datasets import GTZAN

# Create dataset  
dataset = GTZAN(root="./data", download=True)

# Get a sample
waveform, sample_rate, genre = dataset[0]

print(f"Audio shape: {waveform.shape}")
print(f"Sample rate: {sample_rate}")
print(f"Genre: {genre}")

# Genres: blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, rock

Speech Commands for Keyword Spotting

import torchaudio
from torchaudio.datasets import SPEECHCOMMANDS

# Create training dataset
train_set = SPEECHCOMMANDS(root="./data", subset="training", download=True)

# Get a sample
waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]

print(f"Audio shape: {waveform.shape}")
print(f"Command: {label}")
print(f"Speaker: {speaker_id}")

# Commands include: "yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"

These datasets provide standardized interfaces for common audio processing tasks and can be easily integrated into PyTorch training pipelines with consistent preprocessing and data loading patterns.

Install with Tessl CLI

npx tessl i tessl/pypi-torchaudio

docs

audio-io.md

datasets.md

effects.md

functional.md

index.md

models.md

pipelines.md

streaming.md

transforms.md

utils.md

tile.json