0
# Model Pipelines
1
2
Pre-configured model bundles with preprocessing, inference, and post-processing for production-ready audio applications. Pipelines provide complete workflows for ASR, TTS, source separation, and speech quality assessment with pre-trained weights and consistent interfaces.
3
4
## Capabilities
5
6
### Pipeline Bundle Base Classes
7
8
Base classes that provide common functionality for all pipeline bundles.
9
10
```python { .api }
11
class Wav2Vec2Bundle:
12
"""Base bundle for Wav2Vec2 models."""
13
14
def get_model(self) -> Wav2Vec2Model:
15
"""
16
Get the Wav2Vec2 model.
17
18
Returns:
19
Wav2Vec2Model: Pre-trained model instance
20
"""
21
22
def get_labels(self) -> List[str]:
23
"""
24
Get the class labels.
25
26
Returns:
27
List[str]: List of class labels (characters, phonemes, etc.)
28
"""
29
30
sample_rate: int # Expected sample rate for input audio
31
32
class Wav2Vec2ASRBundle(Wav2Vec2Bundle):
33
"""Bundle for Wav2Vec2 automatic speech recognition models."""
34
35
def get_model(self) -> Wav2Vec2Model:
36
"""Get the fine-tuned ASR model."""
37
38
def get_decoder(self) -> torch.nn.Module:
39
"""
40
Get the decoder for converting logits to text.
41
42
Returns:
43
torch.nn.Module: Decoder module (e.g., CTC decoder)
44
"""
45
46
class Wav2Vec2FABundle(Wav2Vec2Bundle):
47
"""Bundle for Wav2Vec2 forced alignment models."""
48
49
def get_model(self) -> Wav2Vec2Model:
50
"""Get the forced alignment model."""
51
52
def get_dict(self) -> Dict[str, int]:
53
"""
54
Get the token dictionary for alignment.
55
56
Returns:
57
Dict[str, int]: Mapping from tokens to indices
58
"""
59
60
class Tacotron2TTSBundle:
61
"""Bundle for Tacotron2 text-to-speech synthesis."""
62
63
def get_tacotron2(self) -> Tacotron2:
64
"""
65
Get the Tacotron2 model.
66
67
Returns:
68
Tacotron2: Pre-trained synthesis model
69
"""
70
71
def get_vocoder(self) -> torch.nn.Module:
72
"""
73
Get the vocoder for converting mel spectrograms to audio.
74
75
Returns:
76
torch.nn.Module: Vocoder model (WaveRNN or Griffin-Lim)
77
"""
78
79
def get_text_processor(self) -> torch.nn.Module:
80
"""
81
Get the text processor for converting text to tokens.
82
83
Returns:
84
torch.nn.Module: Text processing pipeline
85
"""
86
87
sample_rate: int # Output sample rate
88
89
class RNNTBundle:
90
"""Bundle for RNN-Transducer streaming ASR models."""
91
92
def get_model(self) -> RNNT:
93
"""
94
Get the RNN-T model.
95
96
Returns:
97
RNNT: Pre-trained RNN-Transducer model
98
"""
99
100
def get_decoder(self) -> RNNTBeamSearch:
101
"""
102
Get the beam search decoder.
103
104
Returns:
105
RNNTBeamSearch: Configured beam search decoder
106
"""
107
108
def get_tokens(self) -> List[str]:
109
"""
110
Get the token vocabulary.
111
112
Returns:
113
List[str]: List of tokens (characters, subwords, etc.)
114
"""
115
116
sample_rate: int
117
118
class SourceSeparationBundle:
119
"""Bundle for source separation models."""
120
121
def get_model(self) -> torch.nn.Module:
122
"""
123
Get the source separation model.
124
125
Returns:
126
torch.nn.Module: Pre-trained separation model
127
"""
128
129
def get_source_labels(self) -> List[str]:
130
"""
131
Get the source labels.
132
133
Returns:
134
List[str]: Names of separated sources (e.g., ["drums", "bass", "other", "vocals"])
135
"""
136
137
sample_rate: int
138
139
class SquimObjectiveBundle:
140
"""Bundle for objective speech quality assessment."""
141
142
def get_model(self) -> SquimObjective:
143
"""
144
Get the SQUIM objective model.
145
146
Returns:
147
SquimObjective: Pre-trained quality assessment model
148
"""
149
150
sample_rate: int
151
152
class SquimSubjectiveBundle:
153
"""Bundle for subjective speech quality assessment."""
154
155
def get_model(self) -> SquimSubjective:
156
"""
157
Get the SQUIM subjective model.
158
159
Returns:
160
SquimSubjective: Pre-trained quality assessment model
161
"""
162
163
sample_rate: int
164
```
165
166
### Wav2Vec2 Pre-trained Bundles
167
168
Self-supervised speech representation models trained on large-scale unlabeled audio.
169
170
```python { .api }
171
# Base models (self-supervised representations)
172
WAV2VEC2_BASE: Wav2Vec2Bundle # Base model (12 layers, 768 dim) trained on LibriSpeech
173
WAV2VEC2_LARGE: Wav2Vec2Bundle # Large model (24 layers, 1024 dim) trained on LibriSpeech
174
WAV2VEC2_LARGE_LV60K: Wav2Vec2Bundle # Large model trained on 60k hours of Libri-Light
175
176
# Cross-lingual models
177
WAV2VEC2_XLSR53: Wav2Vec2Bundle # Cross-lingual model trained on 53 languages
178
WAV2VEC2_XLSR_300M: Wav2Vec2Bundle # 300M parameter multilingual model
179
WAV2VEC2_XLSR_1B: Wav2Vec2Bundle # 1B parameter multilingual model
180
WAV2VEC2_XLSR_2B: Wav2Vec2Bundle # 2B parameter multilingual model
181
182
# Fine-tuned ASR models (English)
183
WAV2VEC2_ASR_BASE_10M: Wav2Vec2ASRBundle # Base model fine-tuned on 10 minutes of Libri-Light
184
WAV2VEC2_ASR_BASE_100H: Wav2Vec2ASRBundle # Base model fine-tuned on 100h LibriSpeech
185
WAV2VEC2_ASR_BASE_960H: Wav2Vec2ASRBundle # Base model fine-tuned on 960h LibriSpeech
186
WAV2VEC2_ASR_LARGE_10M: Wav2Vec2ASRBundle # Large model fine-tuned on 10 minutes of Libri-Light
187
WAV2VEC2_ASR_LARGE_100H: Wav2Vec2ASRBundle # Large model fine-tuned on 100h LibriSpeech
188
WAV2VEC2_ASR_LARGE_960H: Wav2Vec2ASRBundle # Large model fine-tuned on 960h LibriSpeech
189
WAV2VEC2_ASR_LARGE_LV60K_10M: Wav2Vec2ASRBundle # LV60K model fine-tuned on 10min
190
WAV2VEC2_ASR_LARGE_LV60K_100H: Wav2Vec2ASRBundle # LV60K model fine-tuned on 100h
191
WAV2VEC2_ASR_LARGE_LV60K_960H: Wav2Vec2ASRBundle # LV60K model fine-tuned on 960h
192
193
# Multilingual ASR models (VoxPopuli)
194
VOXPOPULI_ASR_BASE_10K_EN: Wav2Vec2ASRBundle # English ASR on VoxPopuli
195
VOXPOPULI_ASR_BASE_10K_ES: Wav2Vec2ASRBundle # Spanish ASR on VoxPopuli
196
VOXPOPULI_ASR_BASE_10K_DE: Wav2Vec2ASRBundle # German ASR on VoxPopuli
197
VOXPOPULI_ASR_BASE_10K_FR: Wav2Vec2ASRBundle # French ASR on VoxPopuli
198
VOXPOPULI_ASR_BASE_10K_IT: Wav2Vec2ASRBundle # Italian ASR on VoxPopuli
199
```
200
201
### HuBERT Pre-trained Bundles
202
203
Self-supervised speech models based on the HuBERT (Hidden-Unit BERT) approach.
204
205
```python { .api }
206
# Base HuBERT models
207
HUBERT_BASE: Wav2Vec2Bundle # Base HuBERT model (12 layers, 768 dim)
208
HUBERT_LARGE: Wav2Vec2Bundle # Large HuBERT model (24 layers, 1024 dim)
209
HUBERT_XLARGE: Wav2Vec2Bundle # Extra-large HuBERT model (48 layers, 1280 dim)
210
211
# Fine-tuned ASR models
212
HUBERT_ASR_LARGE: Wav2Vec2ASRBundle # Large HuBERT fine-tuned for ASR
213
HUBERT_ASR_XLARGE: Wav2Vec2ASRBundle # XLarge HuBERT fine-tuned for ASR
214
215
# Forced alignment model (wav2vec 2.0-based, from the MMS project; not a HuBERT model)
216
MMS_FA: Wav2Vec2FABundle # Multilingual forced alignment model (Massively Multilingual Speech)
217
```
218
219
### WavLM Pre-trained Bundles
220
221
Self-supervised WavLM models whose representations suit a range of speech processing tasks, including speaker verification.
222
223
```python { .api }
224
WAVLM_BASE: Wav2Vec2Bundle # Base WavLM model
225
WAVLM_BASE_PLUS: Wav2Vec2Bundle # Base WavLM model with additional training
226
WAVLM_LARGE: Wav2Vec2Bundle # Large WavLM model
227
```
228
229
### Text-to-Speech Bundles
230
231
Complete text-to-speech synthesis pipelines.
232
233
```python { .api }
234
# Tacotron2 + Griffin-Lim vocoder
235
TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH: Tacotron2TTSBundle # Character-based, Griffin-Lim vocoder
236
TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH: Tacotron2TTSBundle # Phoneme-based, Griffin-Lim vocoder
237
238
# Tacotron2 + WaveRNN vocoder
239
TACOTRON2_WAVERNN_CHAR_LJSPEECH: Tacotron2TTSBundle # Character-based, WaveRNN vocoder
240
TACOTRON2_WAVERNN_PHONE_LJSPEECH: Tacotron2TTSBundle # Phoneme-based, WaveRNN vocoder
241
```
242
243
### RNN-Transducer Bundles
244
245
Streaming speech recognition models.
246
247
```python { .api }
248
EMFORMER_RNNT_BASE_LIBRISPEECH: RNNTBundle # Emformer-based RNN-T trained on LibriSpeech
249
```
250
251
### Source Separation Bundles
252
253
Models for separating mixed audio into individual sources.
254
255
```python { .api }
256
# Speech separation
257
CONVTASNET_BASE_LIBRI2MIX: SourceSeparationBundle # ConvTasNet trained on Libri2Mix dataset
258
259
# Music separation
260
HDEMUCS_HIGH_MUSDB: SourceSeparationBundle # High-quality HDemucs trained on MUSDB18
261
HDEMUCS_HIGH_MUSDB_PLUS: SourceSeparationBundle # HDemucs trained on MUSDB18-HQ with extra data
262
```
263
264
### Speech Quality Assessment Bundles
265
266
Models for evaluating speech quality and intelligibility.
267
268
```python { .api }
269
SQUIM_OBJECTIVE: SquimObjectiveBundle # Objective quality metrics (STOI, PESQ, SI-SDR)
270
SQUIM_SUBJECTIVE: SquimSubjectiveBundle # Subjective quality metrics (MOS prediction)
271
```
272
273
## Usage Examples
274
275
### Speech Recognition with Wav2Vec2
276
277
```python
278
import torch
279
import torchaudio
280
from torchaudio.pipelines import WAV2VEC2_ASR_BASE_960H
281
282
# Load bundle and models
283
bundle = WAV2VEC2_ASR_BASE_960H
284
model = bundle.get_model()
285
decoder = bundle.get_decoder()
286
labels = bundle.get_labels()
287
288
# Load and preprocess audio
289
waveform, sample_rate = torchaudio.load("speech.wav")
290
if sample_rate != bundle.sample_rate:
291
waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
292
293
# Run inference
294
model.eval()
295
with torch.no_grad():
296
emission, lengths = model(waveform)
297
298
# Decode to text
299
transcripts = decoder(emission, lengths)
300
transcript = "".join([labels[i] for i in transcripts[0][0].tokens])
301
print(f"Transcript: {transcript}")
302
```
303
304
### Text-to-Speech with Tacotron2
305
306
```python
307
import torch
308
import torchaudio
309
from torchaudio.pipelines import TACOTRON2_WAVERNN_CHAR_LJSPEECH
310
311
# Load bundle and models
312
bundle = TACOTRON2_WAVERNN_CHAR_LJSPEECH
313
tacotron2 = bundle.get_tacotron2()
314
vocoder = bundle.get_vocoder()
315
text_processor = bundle.get_text_processor()
316
317
# Process text to tokens
318
text = "Hello, this is a test of text-to-speech synthesis."
319
tokens, token_lengths = text_processor(text)
320
321
# Generate mel spectrogram
322
tacotron2.eval()
323
with torch.no_grad():
324
mel_outputs, mel_outputs_postnet, gate_outputs = tacotron2(tokens, token_lengths)
325
326
# Generate audio with vocoder
327
vocoder.eval()
328
with torch.no_grad():
329
waveform = vocoder(mel_outputs_postnet)
330
331
# Save generated audio
332
torchaudio.save("synthesized.wav", waveform, bundle.sample_rate)
333
```
334
335
### Source Separation
336
337
```python
338
import torch
339
import torchaudio
340
from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB
341
342
# Load bundle and model
343
bundle = HDEMUCS_HIGH_MUSDB
344
model = bundle.get_model()
345
source_labels = bundle.get_source_labels() # ["drums", "bass", "other", "vocals"]
346
347
# Load audio
348
waveform, sample_rate = torchaudio.load("mixed_music.wav")
349
if sample_rate != bundle.sample_rate:
350
waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
351
352
# Ensure stereo and correct shape
353
if waveform.shape[0] == 1:
354
waveform = waveform.repeat(2, 1) # Convert mono to stereo
355
waveform = waveform.unsqueeze(0) # Add batch dimension: (1, channels, time)
356
357
# Separate sources
358
model.eval()
359
with torch.no_grad():
360
sources = model(waveform) # (1, sources, channels, time)
361
362
# Save separated sources
363
for i, source_name in enumerate(source_labels):
364
source_audio = sources[0, i] # (channels, time)
365
torchaudio.save(f"separated_{source_name}.wav", source_audio, bundle.sample_rate)
366
```
367
368
### Speech Quality Assessment
369
370
```python
371
import torch
372
import torchaudio
373
from torchaudio.pipelines import SQUIM_OBJECTIVE
374
375
# Load bundle and model
376
bundle = SQUIM_OBJECTIVE
377
model = bundle.get_model()
378
379
# Load audio
380
waveform, sample_rate = torchaudio.load("speech_sample.wav")
381
if sample_rate != bundle.sample_rate:
382
waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
383
384
# Assess quality
385
model.eval()
386
with torch.no_grad():
387
scores = model(waveform) # Returns [STOI, PESQ, SI-SDR] scores
388
389
print(f"STOI: {scores[0]:.3f}") # Short-Time Objective Intelligibility
390
print(f"PESQ: {scores[1]:.3f}") # Perceptual Evaluation of Speech Quality
391
print(f"SI-SDR: {scores[2]:.3f}") # Scale-Invariant Signal-to-Distortion Ratio
392
```
393
394
### Multilingual Speech Feature Extraction
395
396
```python
397
import torch
398
import torchaudio
399
from torchaudio.pipelines import WAV2VEC2_XLSR53
400
401
# Load multilingual model
402
bundle = WAV2VEC2_XLSR53
403
model = bundle.get_model()
404
405
# Load audio in any supported language
406
waveform, sample_rate = torchaudio.load("multilingual_speech.wav")
407
if sample_rate != bundle.sample_rate:
408
waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
409
410
# Extract features (can be used for downstream tasks)
411
model.eval()
412
with torch.no_grad():
413
features, lengths = model(waveform)
414
415
# Features can be used for language identification, ASR, etc.
416
print(f"Feature shape: {features.shape}") # (batch, time, feature_dim)
417
```
418
419
These pipelines provide production-ready solutions for common audio processing tasks, with pre-trained weights and optimized preprocessing/postprocessing workflows.