Tessl Tile for pypi/torchaudio@2.8.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

audio-io.md datasets.md effects.md functional.md index.md models.md pipelines.md streaming.md transforms.md utils.md

pipelines.mddocs/

0
# Model Pipelines
1

2
Pre-configured model bundles with preprocessing, inference, and post-processing for production-ready audio applications. Pipelines provide complete workflows for ASR, TTS, source separation, and speech quality assessment with pre-trained weights and consistent interfaces.
3

4
## Capabilities
5

6
### Pipeline Bundle Base Classes
7

8
Base classes that provide common functionality for all pipeline bundles.
9

10
```python { .api }
11
class Wav2Vec2Bundle:
12
    """Base bundle for Wav2Vec2 models."""
13
    
14
    def get_model(self) -> Wav2Vec2Model:
15
        """
16
        Get the Wav2Vec2 model.
17
        
18
        Returns:
19
            Wav2Vec2Model: Pre-trained model instance
20
        """
21
    
22
    def get_labels(self) -> List[str]:
23
        """
24
        Get the class labels.
25
        
26
        Returns:
27
            List[str]: List of class labels (characters, phonemes, etc.)
28
        """
29
    
30
    sample_rate: int  # Expected sample rate for input audio
31

32
class Wav2Vec2ASRBundle(Wav2Vec2Bundle):
33
    """Bundle for Wav2Vec2 automatic speech recognition models."""
34
    
35
    def get_model(self) -> Wav2Vec2Model:
36
        """Get the fine-tuned ASR model."""
37
    
38
    def get_decoder(self) -> torch.nn.Module:
39
        """
40
        Get the decoder for converting logits to text.
41
        
42
        Returns:
43
            torch.nn.Module: Decoder module (e.g., CTC decoder)
44
        """
45

46
class Wav2Vec2FABundle(Wav2Vec2Bundle):
47
    """Bundle for Wav2Vec2 forced alignment models."""
48
    
49
    def get_model(self) -> Wav2Vec2Model:
50
        """Get the forced alignment model."""
51
    
52
    def get_dict(self) -> Dict[str, int]:
53
        """
54
        Get the token dictionary for alignment.
55
        
56
        Returns:
57
            Dict[str, int]: Mapping from tokens to indices
58
        """
59

60
class Tacotron2TTSBundle:
61
    """Bundle for Tacotron2 text-to-speech synthesis."""
62
    
63
    def get_tacotron2(self) -> Tacotron2:
64
        """
65
        Get the Tacotron2 model.
66
        
67
        Returns:
68
            Tacotron2: Pre-trained synthesis model
69
        """
70
    
71
    def get_vocoder(self) -> torch.nn.Module:
72
        """
73
        Get the vocoder for converting mel spectrograms to audio.
74
        
75
        Returns:
76
            torch.nn.Module: Vocoder model (WaveRNN or Griffin-Lim)
77
        """
78
    
79
    def get_text_processor(self) -> torch.nn.Module:
80
        """
81
        Get the text processor for converting text to tokens.
82
        
83
        Returns:
84
            torch.nn.Module: Text processing pipeline
85
        """
86
    
87
    sample_rate: int  # Output sample rate
88

89
class RNNTBundle:
90
    """Bundle for RNN-Transducer streaming ASR models."""
91
    
92
    def get_model(self) -> RNNT:
93
        """
94
        Get the RNN-T model.
95
        
96
        Returns:
97
            RNNT: Pre-trained RNN-Transducer model
98
        """
99
    
100
    def get_decoder(self) -> RNNTBeamSearch:
101
        """
102
        Get the beam search decoder.
103
        
104
        Returns:
105
            RNNTBeamSearch: Configured beam search decoder
106
        """
107
    
108
    def get_tokens(self) -> List[str]:
109
        """
110
        Get the token vocabulary.
111
        
112
        Returns:
113
            List[str]: List of tokens (characters, subwords, etc.)
114
        """
115
    
116
    sample_rate: int
117

118
class SourceSeparationBundle:
119
    """Bundle for source separation models."""
120
    
121
    def get_model(self) -> torch.nn.Module:
122
        """
123
        Get the source separation model.
124
        
125
        Returns:
126
            torch.nn.Module: Pre-trained separation model
127
        """
128
    
129
    def get_source_labels(self) -> List[str]:
130
        """
131
        Get the source labels.
132
        
133
        Returns:
134
            List[str]: Names of separated sources (e.g., ["vocals", "drums", "bass", "other"])
135
        """
136
    
137
    sample_rate: int
138

139
class SquimObjectiveBundle:
140
    """Bundle for objective speech quality assessment."""
141
    
142
    def get_model(self) -> SquimObjective:
143
        """
144
        Get the SQUIM objective model.
145
        
146
        Returns:
147
            SquimObjective: Pre-trained quality assessment model
148
        """
149
    
150
    sample_rate: int
151

152
class SquimSubjectiveBundle:
153
    """Bundle for subjective speech quality assessment."""
154
    
155
    def get_model(self) -> SquimSubjective:
156
        """
157
        Get the SQUIM subjective model.
158
        
159
        Returns:
160
            SquimSubjective: Pre-trained quality assessment model
161
        """
162
    
163
    sample_rate: int
164
```
165

166
### Wav2Vec2 Pre-trained Bundles
167

168
Self-supervised speech representation models trained on large-scale unlabeled audio.
169

170
```python { .api }
171
# Base models (self-supervised representations)
172
WAV2VEC2_BASE: Wav2Vec2Bundle          # Base model (12 layers, 768 dim) trained on LibriSpeech
173
WAV2VEC2_LARGE: Wav2Vec2Bundle         # Large model (24 layers, 1024 dim) trained on LibriSpeech
174
WAV2VEC2_LARGE_LV60K: Wav2Vec2Bundle   # Large model trained on 60k hours of Libri-Light
175

176
# Cross-lingual models
177
WAV2VEC2_XLSR53: Wav2Vec2Bundle        # Cross-lingual model trained on 53 languages
178
WAV2VEC2_XLSR_300M: Wav2Vec2Bundle     # 300M parameter multilingual model
179
WAV2VEC2_XLSR_1B: Wav2Vec2Bundle       # 1B parameter multilingual model
180
WAV2VEC2_XLSR_2B: Wav2Vec2Bundle       # 2B parameter multilingual model
181

182
# Fine-tuned ASR models (English)
183
WAV2VEC2_ASR_BASE_10M: Wav2Vec2ASRBundle    # Base model fine-tuned on 10min LibriSpeech
184
WAV2VEC2_ASR_BASE_100H: Wav2Vec2ASRBundle   # Base model fine-tuned on 100h LibriSpeech
185
WAV2VEC2_ASR_BASE_960H: Wav2Vec2ASRBundle   # Base model fine-tuned on 960h LibriSpeech
186
WAV2VEC2_ASR_LARGE_10M: Wav2Vec2ASRBundle   # Large model fine-tuned on 10min LibriSpeech
187
WAV2VEC2_ASR_LARGE_100H: Wav2Vec2ASRBundle  # Large model fine-tuned on 100h LibriSpeech
188
WAV2VEC2_ASR_LARGE_960H: Wav2Vec2ASRBundle  # Large model fine-tuned on 960h LibriSpeech
189
WAV2VEC2_ASR_LARGE_LV60K_10M: Wav2Vec2ASRBundle   # LV60K model fine-tuned on 10min
190
WAV2VEC2_ASR_LARGE_LV60K_100H: Wav2Vec2ASRBundle  # LV60K model fine-tuned on 100h
191
WAV2VEC2_ASR_LARGE_LV60K_960H: Wav2Vec2ASRBundle  # LV60K model fine-tuned on 960h
192

193
# Multilingual ASR models (VoxPopuli)
194
VOXPOPULI_ASR_BASE_10K_EN: Wav2Vec2ASRBundle  # English ASR on VoxPopuli
195
VOXPOPULI_ASR_BASE_10K_ES: Wav2Vec2ASRBundle  # Spanish ASR on VoxPopuli
196
VOXPOPULI_ASR_BASE_10K_DE: Wav2Vec2ASRBundle  # German ASR on VoxPopuli
197
VOXPOPULI_ASR_BASE_10K_FR: Wav2Vec2ASRBundle  # French ASR on VoxPopuli
198
VOXPOPULI_ASR_BASE_10K_IT: Wav2Vec2ASRBundle  # Italian ASR on VoxPopuli
199
```
200

201
### HuBERT Pre-trained Bundles
202

203
Self-supervised speech models using hidden unit BERT approach.
204

205
```python { .api }
206
# Base HuBERT models
207
HUBERT_BASE: Wav2Vec2Bundle    # Base HuBERT model (12 layers, 768 dim)
208
HUBERT_LARGE: Wav2Vec2Bundle   # Large HuBERT model (24 layers, 1024 dim)  
209
HUBERT_XLARGE: Wav2Vec2Bundle  # Extra-large HuBERT model (48 layers, 1280 dim)
210

211
# Fine-tuned ASR models
212
HUBERT_ASR_LARGE: Wav2Vec2ASRBundle   # Large HuBERT fine-tuned for ASR
213
HUBERT_ASR_XLARGE: Wav2Vec2ASRBundle  # XLarge HuBERT fine-tuned for ASR
214

215
# Forced alignment model (Wav2Vec2 architecture, not HuBERT)
216
MMS_FA: Wav2Vec2FABundle  # Multilingual forced alignment model (Massively Multilingual Speech)
217
```
218

219
### WavLM Pre-trained Bundles
220

221
Models trained for various speech processing tasks including speaker verification.
222

223
```python { .api }
224
WAVLM_BASE: Wav2Vec2Bundle       # Base WavLM model
225
WAVLM_BASE_PLUS: Wav2Vec2Bundle  # Base WavLM model with additional training
226
WAVLM_LARGE: Wav2Vec2Bundle      # Large WavLM model
227
```
228

229
### Text-to-Speech Bundles
230

231
Complete text-to-speech synthesis pipelines.
232

233
```python { .api }
234
# Tacotron2 + Griffin-Lim vocoder
235
TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH: Tacotron2TTSBundle  # Character-based, Griffin-Lim vocoder
236
TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH: Tacotron2TTSBundle # Phoneme-based, Griffin-Lim vocoder
237

238
# Tacotron2 + WaveRNN vocoder  
239
TACOTRON2_WAVERNN_CHAR_LJSPEECH: Tacotron2TTSBundle   # Character-based, WaveRNN vocoder
240
TACOTRON2_WAVERNN_PHONE_LJSPEECH: Tacotron2TTSBundle  # Phoneme-based, WaveRNN vocoder
241
```
242

243
### RNN-Transducer Bundles
244

245
Streaming speech recognition models.
246

247
```python { .api }
248
EMFORMER_RNNT_BASE_LIBRISPEECH: RNNTBundle  # Emformer-based RNN-T trained on LibriSpeech
249
```
250

251
### Source Separation Bundles
252

253
Models for separating mixed audio into individual sources.
254

255
```python { .api }
256
# Speech separation
257
CONVTASNET_BASE_LIBRI2MIX: SourceSeparationBundle  # ConvTasNet trained on Libri2Mix dataset
258

259
# Music separation  
260
HDEMUCS_HIGH_MUSDB: SourceSeparationBundle      # High-quality HDemucs trained on MUSDB18
261
HDEMUCS_HIGH_MUSDB_PLUS: SourceSeparationBundle # HDemucs trained on MUSDB18-HQ with extra data
262
```
263

264
### Speech Quality Assessment Bundles
265

266
Models for evaluating speech quality and intelligibility.
267

268
```python { .api }
269
SQUIM_OBJECTIVE: SquimObjectiveBundle    # Objective quality metrics (STOI, PESQ, SI-SDR)
270
SQUIM_SUBJECTIVE: SquimSubjectiveBundle  # Subjective quality metrics (MOS prediction)
271
```
272

273
## Usage Examples
274

275
### Speech Recognition with Wav2Vec2
276

277
```python
278
import torch
279
import torchaudio
280
from torchaudio.pipelines import WAV2VEC2_ASR_BASE_960H
281

282
# Load bundle and models
283
bundle = WAV2VEC2_ASR_BASE_960H
284
model = bundle.get_model()
285
decoder = bundle.get_decoder()
286
labels = bundle.get_labels()
287

288
# Load and preprocess audio
289
waveform, sample_rate = torchaudio.load("speech.wav")
290
if sample_rate != bundle.sample_rate:
291
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
292

293
# Run inference
294
model.eval()
295
with torch.no_grad():
296
    emission, lengths = model(waveform)
297
    
298
# Decode to text
299
transcripts = decoder(emission, lengths)
300
transcript = "".join([labels[i] for i in transcripts[0][0].tokens])
301
print(f"Transcript: {transcript}")
302
```
303

304
### Text-to-Speech with Tacotron2
305

306
```python
307
import torch
308
import torchaudio
309
from torchaudio.pipelines import TACOTRON2_WAVERNN_CHAR_LJSPEECH
310

311
# Load bundle and models
312
bundle = TACOTRON2_WAVERNN_CHAR_LJSPEECH
313
tacotron2 = bundle.get_tacotron2()
314
vocoder = bundle.get_vocoder()
315
text_processor = bundle.get_text_processor()
316

317
# Process text to tokens
318
text = "Hello, this is a test of text-to-speech synthesis."
319
tokens, token_lengths = text_processor(text)
320

321
# Generate mel spectrogram
322
tacotron2.eval()
323
with torch.no_grad():
324
    mel_outputs, mel_outputs_postnet, gate_outputs = tacotron2(tokens, token_lengths)
325

326
# Generate audio with vocoder
327
vocoder.eval()
328
with torch.no_grad():
329
    waveform = vocoder(mel_outputs_postnet)
330

331
# Save generated audio
332
torchaudio.save("synthesized.wav", waveform, bundle.sample_rate)
333
```
334

335
### Source Separation
336

337
```python
338
import torch
339
import torchaudio
340
from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB
341

342
# Load bundle and model
343
bundle = HDEMUCS_HIGH_MUSDB
344
model = bundle.get_model()
345
source_labels = bundle.get_source_labels()  # ["drums", "bass", "other", "vocals"]
346

347
# Load audio
348
waveform, sample_rate = torchaudio.load("mixed_music.wav")
349
if sample_rate != bundle.sample_rate:
350
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
351

352
# Ensure stereo and correct shape
353
if waveform.shape[0] == 1:
354
    waveform = waveform.repeat(2, 1)  # Convert mono to stereo
355
waveform = waveform.unsqueeze(0)  # Add batch dimension: (1, channels, time)
356

357
# Separate sources
358
model.eval()
359
with torch.no_grad():
360
    sources = model(waveform)  # (1, sources, channels, time)
361

362
# Save separated sources
363
for i, source_name in enumerate(source_labels):
364
    source_audio = sources[0, i]  # (channels, time)
365
    torchaudio.save(f"separated_{source_name}.wav", source_audio, bundle.sample_rate)
366
```
367

368
### Speech Quality Assessment
369

370
```python
371
import torch
372
import torchaudio
373
from torchaudio.pipelines import SQUIM_OBJECTIVE
374

375
# Load bundle and model
376
bundle = SQUIM_OBJECTIVE
377
model = bundle.get_model()
378

379
# Load audio
380
waveform, sample_rate = torchaudio.load("speech_sample.wav")
381
if sample_rate != bundle.sample_rate:
382
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
383

384
# Assess quality
385
model.eval()
386
with torch.no_grad():
387
    scores = model(waveform)  # Returns [STOI, PESQ, SI-SDR] scores
388

389
print(f"STOI: {scores[0]:.3f}")    # Short-Time Objective Intelligibility
390
print(f"PESQ: {scores[1]:.3f}")    # Perceptual Evaluation of Speech Quality  
391
print(f"SI-SDR: {scores[2]:.3f}")  # Scale-Invariant Signal-to-Distortion Ratio
392
```
393

394
### Multilingual Speech Feature Extraction
395

396
```python
397
import torch
398
import torchaudio
399
from torchaudio.pipelines import WAV2VEC2_XLSR53
400

401
# Load multilingual model
402
bundle = WAV2VEC2_XLSR53
403
model = bundle.get_model()
404

405
# Load audio in any supported language
406
waveform, sample_rate = torchaudio.load("multilingual_speech.wav")
407
if sample_rate != bundle.sample_rate:
408
    waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
409

410
# Extract features (can be used for downstream tasks)
411
model.eval()
412
with torch.no_grad():
413
    features, lengths = model(waveform)
414
    
415
# Features can be used for language identification, ASR, etc.
416
print(f"Feature shape: {features.shape}")  # (batch, time, feature_dim)
417
```
418

419
These pipelines provide production-ready solutions for common audio processing tasks, with pre-trained weights and optimized preprocessing/postprocessing workflows.

Version

Tile

Files

pipelines.md docs/

pipelines.mddocs/