An audio package for PyTorch providing GPU-accelerated audio I/O operations, signal processing transforms, and machine learning utilities for audio data.
Standard dataset loaders for common audio datasets with consistent interfaces and preprocessing. TorchAudio provides PyTorch-compatible dataset classes for speech recognition, synthesis, music analysis, and source separation research.
Datasets for training and evaluating automatic speech recognition systems.
class LIBRISPEECH(torch.utils.data.Dataset):
    """LibriSpeech ASR corpus - large-scale English speech recognition dataset."""

    def __init__(
        self,
        root: str,
        url: str = "train-clean-100",
        folder_in_archive: str = "LibriSpeech",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory for dataset storage
            url: Dataset subset ("train-clean-100", "train-clean-360", "train-other-500",
                "dev-clean", "dev-other", "test-clean", "test-other")
            folder_in_archive: Folder name in archive
            download: Whether to download if not found
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, int, int, int]:
        """
        Returns:
            Tuple of (waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)
        """
class LibriSpeechBiasing(torch.utils.data.Dataset):
    """LibriSpeech dataset with word-level biasing lists for contextualized ASR."""

    def __init__(
        self,
        root: str,
        subset: str,
        audio_dir: str,
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            subset: Dataset subset
            audio_dir: Directory containing audio files
            download: Whether to download if not found
        """
class SPEECHCOMMANDS(torch.utils.data.Dataset):
    """Google Speech Commands dataset - keyword spotting."""

    def __init__(
        self,
        root: str,
        url: str = "speech_commands_v0.02",
        folder_in_archive: str = "SpeechCommands",
        download: bool = False,
        subset: Optional[str] = None,
    ) -> None:
        """
        Args:
            root: Root directory
            url: Dataset version
            folder_in_archive: Folder name in archive
            download: Whether to download
            subset: "training", "validation", "testing", or None for all
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str, int]:
        """
        Returns:
            Tuple of (waveform, sample_rate, label, speaker_id, utterance_number)
        """
class COMMONVOICE(torch.utils.data.Dataset):
    """Mozilla Common Voice multilingual speech corpus."""

    def __init__(
        self,
        root: str,
        tsv: str = "train.tsv",
        url: str = "cv-corpus-4-2019-12-10",
        folder_in_archive: str = "cv-corpus-4-2019-12-10",
        download: bool = False,
        version: str = "cv-corpus-4-2019-12-10",
    ) -> None:
        """
        Args:
            root: Root directory
            tsv: TSV file to load ("train.tsv", "dev.tsv", "test.tsv")
            url: Download URL identifier
            folder_in_archive: Archive folder name
            download: Whether to download
            version: Dataset version
        """
class TEDLIUM(torch.utils.data.Dataset):
    """TED-LIUM ASR corpus - TED talks with transcripts."""

    def __init__(
        self,
        root: str,
        release: str = "release3",
        subset: str = "train",
        download: bool = False,
        audio_ext: str = ".sph",
    ) -> None:
        """
        Args:
            root: Root directory
            release: Dataset release ("release1", "release2", "release3")
            subset: Data subset ("train", "dev", "test")
            download: Whether to download
            audio_ext: Audio file extension
        """
class VoxCeleb1Identification(torch.utils.data.Dataset):
    """VoxCeleb1 speaker identification dataset."""

    def __init__(
        self,
        root: str,
        subset: str = "train",
        meta_url: str = "vox1_meta.csv",
        base_url: str = "https://mm.kaist.ac.kr/datasets/voxceleb/",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            subset: "train", "dev", or "test"
            meta_url: Metadata file URL
            base_url: Base download URL
            download: Whether to download
        """


# Datasets for text-to-speech synthesis and voice conversion.
class LJSPEECH(torch.utils.data.Dataset):
    """LJ Speech dataset - single speaker English TTS corpus."""

    def __init__(
        self,
        root: str,
        url: str = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2",
        folder_in_archive: str = "LJSpeech-1.1",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str]:
        """
        Returns:
            Tuple of (waveform, sample_rate, transcript, normalized_transcript)
        """
class LIBRITTS(torch.utils.data.Dataset):
    """LibriTTS multi-speaker English TTS corpus."""

    def __init__(
        self,
        root: str,
        url: str = "train-clean-100",
        folder_in_archive: str = "LibriTTS",
        download: bool = False,
        subset: str = "train-clean-100",
    ) -> None:
        """
        Args:
            root: Root directory
            url: Dataset subset URL
            folder_in_archive: Archive folder name
            download: Whether to download
            subset: Data subset
        """
class VCTK_092(torch.utils.data.Dataset):
    """VCTK Corpus 0.92 - multi-speaker English TTS dataset."""

    def __init__(
        self,
        root: str,
        mic_id: str = "mic1",
        download: bool = False,
        url: str = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip",
        folder_in_archive: str = "VCTK-Corpus-0.92",
    ) -> None:
        """
        Args:
            root: Root directory
            mic_id: Microphone ID ("mic1" or "mic2")
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """
class CMUARCTIC(torch.utils.data.Dataset):
    """CMU ARCTIC speech synthesis database."""

    def __init__(
        self,
        root: str,
        subset: str = "aew",
        download: bool = False,
        url: str = "cmu_arctic",
        folder_in_archive: str = "ARCTIC",
    ) -> None:
        """
        Args:
            root: Root directory
            subset: Speaker subset (e.g., "aew", "ahw", "aup", "awb")
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """


# Datasets for music information retrieval and general audio analysis.
class GTZAN(torch.utils.data.Dataset):
    """GTZAN Genre Collection - music genre classification dataset."""

    def __init__(
        self,
        root: str,
        url: str = "http://opihi.cs.uvic.ca/sound/genres.tar.gz",
        folder_in_archive: str = "genres",
        download: bool = False,
        subset: Optional[str] = None,
    ) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
            subset: Specific genre subset or None for all
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str]:
        """
        Returns:
            Tuple of (waveform, sample_rate, genre_label)
        """
class MUSDB_HQ(torch.utils.data.Dataset):
    """MUSDB18-HQ source separation dataset."""

    def __init__(
        self,
        root: str,
        subset: str = "train",
        # Fixed: the defaults are None, so the annotations must be Optional[List[str]],
        # not the bare List[str] the original declared.
        sources: Optional[List[str]] = None,
        targets: Optional[List[str]] = None,
        duration: Optional[float] = None,
        sample_rate: int = 44100,
        overlap: float = 0.25,
        num_workers: int = 0,
        split: str = "train",
        seed: int = 42,
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            subset: "train" or "test"
            sources: List of source stems to load, or None for the default stems
            targets: List of target stems for separation, or None for the default targets
            duration: Duration of segments in seconds
            sample_rate: Target sample rate
            overlap: Overlap between segments
            num_workers: Number of worker processes
            split: Data split
            seed: Random seed
            download: Whether to download
        """


# Datasets for specific audio processing tasks.
class FluentSpeechCommands(torch.utils.data.Dataset):
    """Fluent Speech Commands - intent classification dataset."""

    def __init__(
        self,
        root: str,
        subset: str = "train",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            subset: "train", "valid", or "test"
            download: Whether to download
        """
class YESNO(torch.utils.data.Dataset):
    """Hebrew Yes/No dataset - simple binary classification."""

    def __init__(
        self,
        root: str,
        url: str = "http://www.openslr.org/resources/1/waves_yesno.tar.gz",
        folder_in_archive: str = "waves_yesno",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            url: Download URL
            folder_in_archive: Archive folder name
            download: Whether to download
        """

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, List[int]]:
        """
        Returns:
            Tuple of (waveform, sample_rate, labels) where labels is list of 0s and 1s
        """
class CMUDict(torch.utils.data.Dataset):
    """CMU Pronouncing Dictionary - phonetic dictionary."""

    def __init__(
        self,
        root: str,
        url: str = "cmudict-0.7b",
        folder_in_archive: str = "cmudict",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            url: Dataset version
            folder_in_archive: Archive folder name
            download: Whether to download
        """
class LibriMix(torch.utils.data.Dataset):
    """LibriMix speech separation dataset."""

    def __init__(
        self,
        root: str,
        subset: str = "train-360",
        num_speakers: int = 2,
        sample_rate: int = 8000,
        task: str = "sep_clean",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            subset: Data subset
            num_speakers: Number of speakers in mixture (2 or 3)
            sample_rate: Sample rate (8000 or 16000)
            task: Task type ("sep_clean", "sep_noisy", etc.)
            download: Whether to download
        """
class QUESST14(torch.utils.data.Dataset):
    """QUESST 2014 Query by Example Spoken Term Detection."""

    def __init__(
        self,
        root: str,
        subset: str = "docs",
        download: bool = False,
        url: str = "quesst14_database",
        folder_in_archive: str = "quesst14Database",
    ) -> None:
        """
        Args:
            root: Root directory
            subset: "docs", "dev", or "eval"
            download: Whether to download
            url: Download URL
            folder_in_archive: Archive folder name
        """
class IEMOCAP(torch.utils.data.Dataset):
    """IEMOCAP emotion recognition dataset."""

    def __init__(
        self,
        root: str,
        # Fixed: a mutable list default ([1, 2, 3, 4, 5]) is shared across all calls;
        # an immutable tuple default avoids accidental cross-instance mutation and
        # matches the upstream torchaudio signature. Callers may still pass a list.
        sessions: Tuple[int, ...] = (1, 2, 3, 4, 5),
        utterance_type: str = "scripted",
        download: bool = False,
    ) -> None:
        """
        Args:
            root: Root directory
            sessions: Session numbers to include
            utterance_type: "scripted" or "improvised"
            download: Whether to download
        """
# Example: stream LibriSpeech through a DataLoader.
import torchaudio
from torchaudio.datasets import LIBRISPEECH
from torch.utils.data import DataLoader

# Create dataset
dataset = LIBRISPEECH(
    root="./data",
    url="train-clean-100",  # 100 hours of clean training data
    download=True,
)

# Create data loader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: x)

# Iterate through data
for batch in dataloader:
    for waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id in batch:
        print(f"Waveform shape: {waveform.shape}")
        print(f"Sample rate: {sample_rate}")
        print(f"Transcript: {transcript}")
        print(f"Speaker ID: {speaker_id}")
        break
    break
# Example: inspect an LJ Speech sample for TTS work.
import torchaudio
from torchaudio.datasets import LJSPEECH

# Create dataset
dataset = LJSPEECH(root="./data", download=True)

# Get a sample
waveform, sample_rate, transcript, normalized_transcript = dataset[0]
print(f"Audio shape: {waveform.shape}")
print(f"Original transcript: {transcript}")
print(f"Normalized transcript: {normalized_transcript}")

# Can be used with DataLoader for training TTS models
from torch.utils.data import DataLoader

dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
# Example: read one GTZAN clip and its genre label.
import torchaudio
from torchaudio.datasets import GTZAN

# Create dataset
dataset = GTZAN(root="./data", download=True)

# Get a sample
waveform, sample_rate, genre = dataset[0]
print(f"Audio shape: {waveform.shape}")
print(f"Sample rate: {sample_rate}")
print(f"Genre: {genre}")
# Genres: blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, rock
# Example: load the Speech Commands training subset for keyword spotting.
import torchaudio
from torchaudio.datasets import SPEECHCOMMANDS

# Create training dataset
train_set = SPEECHCOMMANDS(root="./data", subset="training", download=True)

# Get a sample
waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]
print(f"Audio shape: {waveform.shape}")
print(f"Command: {label}")
print(f"Speaker: {speaker_id}")
# Commands include: "yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go"

# These datasets provide standardized interfaces for common audio processing tasks
# and can be easily integrated into PyTorch training pipelines with consistent
# preprocessing and data loading patterns.
Install with the Tessl CLI:

    npx tessl i tessl/pypi-torchaudio