or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

audio-io.md datasets.md effects.md functional.md index.md models.md pipelines.md streaming.md transforms.md utils.md

pipelines.mddocs/

0

# Model Pipelines

1

2

Pre-configured model bundles with preprocessing, inference, and postprocessing for production-ready audio applications. Pipelines provide complete workflows for ASR, TTS, source separation, and speech quality assessment with pre-trained weights and consistent interfaces.

3

4

## Capabilities

5

6

### Pipeline Bundle Base Classes

7

8

Base classes that provide common functionality for all pipeline bundles.

9

10

```python { .api }

11

class Wav2Vec2Bundle:

12

"""Base bundle for Wav2Vec2 models."""

13

14

def get_model(self) -> Wav2Vec2Model:

15

"""

16

Get the Wav2Vec2 model.

17

18

Returns:

19

Wav2Vec2Model: Pre-trained model instance

20

"""

21

22

def get_labels(self) -> List[str]:

23

"""

24

Get the class labels.

25

26

Returns:

27

List[str]: List of class labels (characters, phonemes, etc.)

28

"""

29

30

sample_rate: int # Expected sample rate for input audio

31

32

class Wav2Vec2ASRBundle(Wav2Vec2Bundle):

33

"""Bundle for Wav2Vec2 automatic speech recognition models."""

34

35

def get_model(self) -> Wav2Vec2Model:

36

"""Get the fine-tuned ASR model."""

37

38

def get_decoder(self) -> torch.nn.Module:

39

"""

40

Get the decoder for converting logits to text.

41

42

Returns:

43

torch.nn.Module: Decoder module (e.g., CTC decoder)

44

"""

45

46

class Wav2Vec2FABundle(Wav2Vec2Bundle):

47

"""Bundle for Wav2Vec2 forced alignment models."""

48

49

def get_model(self) -> Wav2Vec2Model:

50

"""Get the forced alignment model."""

51

52

def get_dict(self) -> Dict[str, int]:

53

"""

54

Get the token dictionary for alignment.

55

56

Returns:

57

Dict[str, int]: Mapping from tokens to indices

58

"""

59

60

class Tacotron2TTSBundle:

61

"""Bundle for Tacotron2 text-to-speech synthesis."""

62

63

def get_tacotron2(self) -> Tacotron2:

64

"""

65

Get the Tacotron2 model.

66

67

Returns:

68

Tacotron2: Pre-trained synthesis model

69

"""

70

71

def get_vocoder(self) -> torch.nn.Module:

72

"""

73

Get the vocoder for converting mel spectrograms to audio.

74

75

Returns:

76

torch.nn.Module: Vocoder model (WaveRNN or Griffin-Lim)

77

"""

78

79

def get_text_processor(self) -> torch.nn.Module:

80

"""

81

Get the text processor for converting text to tokens.

82

83

Returns:

84

torch.nn.Module: Text processing pipeline

85

"""

86

87

sample_rate: int # Output sample rate

88

89

class RNNTBundle:

90

"""Bundle for RNN-Transducer streaming ASR models."""

91

92

def get_model(self) -> RNNT:

93

"""

94

Get the RNN-T model.

95

96

Returns:

97

RNNT: Pre-trained RNN-Transducer model

98

"""

99

100

def get_decoder(self) -> RNNTBeamSearch:

101

"""

102

Get the beam search decoder.

103

104

Returns:

105

RNNTBeamSearch: Configured beam search decoder

106

"""

107

108

def get_tokens(self) -> List[str]:

109

"""

110

Get the token vocabulary.

111

112

Returns:

113

List[str]: List of tokens (characters, subwords, etc.)

114

"""

115

116

sample_rate: int

117

118

class SourceSeparationBundle:

119

"""Bundle for source separation models."""

120

121

def get_model(self) -> torch.nn.Module:

122

"""

123

Get the source separation model.

124

125

Returns:

126

torch.nn.Module: Pre-trained separation model

127

"""

128

129

def get_source_labels(self) -> List[str]:

130

"""

131

Get the source labels.

132

133

Returns:

134

List[str]: Names of separated sources (e.g., ["drums", "bass", "other", "vocals"])

135

"""

136

137

sample_rate: int

138

139

class SquimObjectiveBundle:

140

"""Bundle for objective speech quality assessment."""

141

142

def get_model(self) -> SquimObjective:

143

"""

144

Get the SQUIM objective model.

145

146

Returns:

147

SquimObjective: Pre-trained quality assessment model

148

"""

149

150

sample_rate: int

151

152

class SquimSubjectiveBundle:

153

"""Bundle for subjective speech quality assessment."""

154

155

def get_model(self) -> SquimSubjective:

156

"""

157

Get the SQUIM subjective model.

158

159

Returns:

160

SquimSubjective: Pre-trained quality assessment model

161

"""

162

163

sample_rate: int

164

```

165

166

### Wav2Vec2 Pre-trained Bundles

167

168

Self-supervised speech representation models trained on large-scale unlabeled audio.

169

170

```python { .api }

171

# Base models (self-supervised representations)

172

WAV2VEC2_BASE: Wav2Vec2Bundle # Base model (12 layers, 768 dim) trained on LibriSpeech

173

WAV2VEC2_LARGE: Wav2Vec2Bundle # Large model (24 layers, 1024 dim) trained on LibriSpeech

174

WAV2VEC2_LARGE_LV60K: Wav2Vec2Bundle # Large model trained on 60k hours of Libri-Light

175

176

# Cross-lingual models

177

WAV2VEC2_XLSR53: Wav2Vec2Bundle # Cross-lingual model trained on 53 languages

178

WAV2VEC2_XLSR_300M: Wav2Vec2Bundle # 300M parameter multilingual model

179

WAV2VEC2_XLSR_1B: Wav2Vec2Bundle # 1B parameter multilingual model

180

WAV2VEC2_XLSR_2B: Wav2Vec2Bundle # 2B parameter multilingual model

181

182

# Fine-tuned ASR models (English)

183

WAV2VEC2_ASR_BASE_10M: Wav2Vec2ASRBundle # Base model fine-tuned on 10min LibriSpeech

184

WAV2VEC2_ASR_BASE_100H: Wav2Vec2ASRBundle # Base model fine-tuned on 100h LibriSpeech

185

WAV2VEC2_ASR_BASE_960H: Wav2Vec2ASRBundle # Base model fine-tuned on 960h LibriSpeech

186

WAV2VEC2_ASR_LARGE_10M: Wav2Vec2ASRBundle # Large model fine-tuned on 10min LibriSpeech

187

WAV2VEC2_ASR_LARGE_100H: Wav2Vec2ASRBundle # Large model fine-tuned on 100h LibriSpeech

188

WAV2VEC2_ASR_LARGE_960H: Wav2Vec2ASRBundle # Large model fine-tuned on 960h LibriSpeech

189

WAV2VEC2_ASR_LARGE_LV60K_10M: Wav2Vec2ASRBundle # LV60K model fine-tuned on 10min

190

WAV2VEC2_ASR_LARGE_LV60K_100H: Wav2Vec2ASRBundle # LV60K model fine-tuned on 100h

191

WAV2VEC2_ASR_LARGE_LV60K_960H: Wav2Vec2ASRBundle # LV60K model fine-tuned on 960h

192

193

# Multilingual ASR models (VoxPopuli)

194

VOXPOPULI_ASR_BASE_10K_EN: Wav2Vec2ASRBundle # English ASR on VoxPopuli

195

VOXPOPULI_ASR_BASE_10K_ES: Wav2Vec2ASRBundle # Spanish ASR on VoxPopuli

196

VOXPOPULI_ASR_BASE_10K_DE: Wav2Vec2ASRBundle # German ASR on VoxPopuli

197

VOXPOPULI_ASR_BASE_10K_FR: Wav2Vec2ASRBundle # French ASR on VoxPopuli

198

VOXPOPULI_ASR_BASE_10K_IT: Wav2Vec2ASRBundle # Italian ASR on VoxPopuli

199

```

200

201

### HuBERT Pre-trained Bundles

202

203

Self-supervised speech models using the Hidden-Unit BERT (HuBERT) approach.

204

205

```python { .api }

206

# Base HuBERT models

207

HUBERT_BASE: Wav2Vec2Bundle # Base HuBERT model (12 layers, 768 dim)

208

HUBERT_LARGE: Wav2Vec2Bundle # Large HuBERT model (24 layers, 1024 dim)

209

HUBERT_XLARGE: Wav2Vec2Bundle # Extra-large HuBERT model (48 layers, 1280 dim)

210

211

# Fine-tuned ASR models

212

HUBERT_ASR_LARGE: Wav2Vec2ASRBundle # Large HuBERT fine-tuned for ASR

213

HUBERT_ASR_XLARGE: Wav2Vec2ASRBundle # XLarge HuBERT fine-tuned for ASR

214

215

# Forced alignment model

216

MMS_FA: Wav2Vec2FABundle # Multilingual forced alignment model (Massively Multilingual Speech)

217

```

218

219

### WavLM Pre-trained Bundles

220

221

Models trained for various speech processing tasks, including speaker verification.

222

223

```python { .api }

224

WAVLM_BASE: Wav2Vec2Bundle # Base WavLM model

225

WAVLM_BASE_PLUS: Wav2Vec2Bundle # Base WavLM model with additional training

226

WAVLM_LARGE: Wav2Vec2Bundle # Large WavLM model

227

```

228

229

### Text-to-Speech Bundles

230

231

Complete text-to-speech synthesis pipelines.

232

233

```python { .api }

234

# Tacotron2 + Griffin-Lim vocoder

235

TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH: Tacotron2TTSBundle # Character-based, Griffin-Lim vocoder

236

TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH: Tacotron2TTSBundle # Phoneme-based, Griffin-Lim vocoder

237

238

# Tacotron2 + WaveRNN vocoder

239

TACOTRON2_WAVERNN_CHAR_LJSPEECH: Tacotron2TTSBundle # Character-based, WaveRNN vocoder

240

TACOTRON2_WAVERNN_PHONE_LJSPEECH: Tacotron2TTSBundle # Phoneme-based, WaveRNN vocoder

241

```

242

243

### RNN-Transducer Bundles

244

245

Streaming speech recognition models.

246

247

```python { .api }

248

EMFORMER_RNNT_BASE_LIBRISPEECH: RNNTBundle # Emformer-based RNN-T trained on LibriSpeech

249

```

250

251

### Source Separation Bundles

252

253

Models for separating mixed audio into individual sources.

254

255

```python { .api }

256

# Speech separation

257

CONVTASNET_BASE_LIBRI2MIX: SourceSeparationBundle # ConvTasNet trained on Libri2Mix dataset

258

259

# Music separation

260

HDEMUCS_HIGH_MUSDB: SourceSeparationBundle # High-quality HDemucs trained on MUSDB18

261

HDEMUCS_HIGH_MUSDB_PLUS: SourceSeparationBundle # HDemucs trained on MUSDB18-HQ with extra data

262

```

263

264

### Speech Quality Assessment Bundles

265

266

Models for evaluating speech quality and intelligibility.

267

268

```python { .api }

269

SQUIM_OBJECTIVE: SquimObjectiveBundle # Objective quality metrics (STOI, PESQ, SI-SDR)

270

SQUIM_SUBJECTIVE: SquimSubjectiveBundle # Subjective quality metrics (MOS prediction)

271

```

272

273

## Usage Examples

274

275

### Speech Recognition with Wav2Vec2

276

277

```python

278

import torch

279

import torchaudio

280

from torchaudio.pipelines import WAV2VEC2_ASR_BASE_960H

281

282

# Load bundle and models

283

bundle = WAV2VEC2_ASR_BASE_960H

284

model = bundle.get_model()

285

decoder = bundle.get_decoder()

286

labels = bundle.get_labels()

287

288

# Load and preprocess audio

289

waveform, sample_rate = torchaudio.load("speech.wav")

290

if sample_rate != bundle.sample_rate:

291

waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

292

293

# Run inference

294

model.eval()

295

with torch.no_grad():

296

emission, lengths = model(waveform)

297

298

# Decode to text

299

transcripts = decoder(emission, lengths)

300

transcript = "".join([labels[i] for i in transcripts[0][0].tokens])

301

print(f"Transcript: {transcript}")

302

```

303

304

### Text-to-Speech with Tacotron2

305

306

```python

307

import torch

308

import torchaudio

309

from torchaudio.pipelines import TACOTRON2_WAVERNN_CHAR_LJSPEECH

310

311

# Load bundle and models

312

bundle = TACOTRON2_WAVERNN_CHAR_LJSPEECH

313

tacotron2 = bundle.get_tacotron2()

314

vocoder = bundle.get_vocoder()

315

text_processor = bundle.get_text_processor()

316

317

# Process text to tokens

318

text = "Hello, this is a test of text-to-speech synthesis."

319

tokens, token_lengths = text_processor(text)

320

321

# Generate mel spectrogram

322

tacotron2.eval()

323

with torch.no_grad():

324

mel_outputs, mel_outputs_postnet, gate_outputs = tacotron2(tokens, token_lengths)

325

326

# Generate audio with vocoder

327

vocoder.eval()

328

with torch.no_grad():

329

waveform = vocoder(mel_outputs_postnet)

330

331

# Save generated audio

332

torchaudio.save("synthesized.wav", waveform, bundle.sample_rate)

333

```

334

335

### Source Separation

336

337

```python

338

import torch

339

import torchaudio

340

from torchaudio.pipelines import HDEMUCS_HIGH_MUSDB

341

342

# Load bundle and model

343

bundle = HDEMUCS_HIGH_MUSDB

344

model = bundle.get_model()

345

source_labels = bundle.get_source_labels() # ["drums", "bass", "other", "vocals"]

346

347

# Load audio

348

waveform, sample_rate = torchaudio.load("mixed_music.wav")

349

if sample_rate != bundle.sample_rate:

350

waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

351

352

# Ensure stereo and correct shape

353

if waveform.shape[0] == 1:

354

waveform = waveform.repeat(2, 1) # Convert mono to stereo

355

waveform = waveform.unsqueeze(0) # Add batch dimension: (1, channels, time)

356

357

# Separate sources

358

model.eval()

359

with torch.no_grad():

360

sources = model(waveform) # (1, sources, channels, time)

361

362

# Save separated sources

363

for i, source_name in enumerate(source_labels):

364

source_audio = sources[0, i] # (channels, time)

365

torchaudio.save(f"separated_{source_name}.wav", source_audio, bundle.sample_rate)

366

```

367

368

### Speech Quality Assessment

369

370

```python

371

import torch

372

import torchaudio

373

from torchaudio.pipelines import SQUIM_OBJECTIVE

374

375

# Load bundle and model

376

bundle = SQUIM_OBJECTIVE

377

model = bundle.get_model()

378

379

# Load audio

380

waveform, sample_rate = torchaudio.load("speech_sample.wav")

381

if sample_rate != bundle.sample_rate:

382

waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

383

384

# Assess quality

385

model.eval()

386

with torch.no_grad():

387

scores = model(waveform) # Returns [STOI, PESQ, SI-SDR] scores

388

389

print(f"STOI: {scores[0]:.3f}") # Short-Time Objective Intelligibility

390

print(f"PESQ: {scores[1]:.3f}") # Perceptual Evaluation of Speech Quality

391

print(f"SI-SDR: {scores[2]:.3f}") # Scale-Invariant Signal-to-Distortion Ratio

392

```

393

394

### Multilingual Speech Recognition

395

396

```python

397

import torch

398

import torchaudio

399

from torchaudio.pipelines import WAV2VEC2_XLSR53

400

401

# Load multilingual model

402

bundle = WAV2VEC2_XLSR53

403

model = bundle.get_model()

404

405

# Load audio in any supported language

406

waveform, sample_rate = torchaudio.load("multilingual_speech.wav")

407

if sample_rate != bundle.sample_rate:

408

waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

409

410

# Extract features (can be used for downstream tasks)

411

model.eval()

412

with torch.no_grad():

413

features, lengths = model(waveform)

414

415

# Features can be used for language identification, ASR, etc.

416

print(f"Feature shape: {features.shape}") # (batch, time, feature_dim)

417

```

418

419

These pipelines provide production-ready solutions for common audio processing tasks, with pre-trained weights and optimized preprocessing/postprocessing workflows.