or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

audio-processing.md, batched-processing.md, core-speech-recognition.md, index.md, utilities.md, voice-activity-detection.md

docs/core-speech-recognition.md

0

# Core Speech Recognition

1

2

Primary speech recognition functionality including transcription, language detection, and model management. These are the main operations for converting audio to text and managing Whisper models.

3

4

## Capabilities

5

6

### WhisperModel Initialization

7

8

Create and configure a Whisper model for speech recognition with support for different model sizes, devices, and compute types.

9

10

```python { .api }

11

class WhisperModel:

12

def __init__(

13

self,

14

model_size_or_path: str,

15

device: str = "auto",

16

device_index: int | list[int] = 0,

17

compute_type: str = "default",

18

cpu_threads: int = 0,

19

num_workers: int = 1,

20

download_root: str | None = None,

21

local_files_only: bool = False,

22

files: dict | None = None,

23

revision: str | None = None,

24

use_auth_token: str | bool | None = None,

25

**model_kwargs

26

):

27

"""

28

Initialize a Whisper model.

29

30

Args:

31

model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,

32

small, small.en, distil-small.en, medium, medium.en, distil-medium.en,

33

large-v1, large-v2, large-v3, large, distil-large-v2, distil-large-v3,

34

distil-large-v3.5, large-v3-turbo, turbo) or path to model directory

35

device: Device to use for computation ("auto", "cpu", "cuda")

36

device_index: Device index(es) to use for CUDA

37

compute_type: Type to use for computation ("default", "auto", "int8", "int8_float32",

38

"int8_float16", "int8_bfloat16", "int16", "float16", "bfloat16", "float32")

39

cpu_threads: Number of threads to use when running on CPU

40

num_workers: Number of workers to use for transcription

41

download_root: Directory where models should be downloaded

42

local_files_only: If True, avoid downloading files and use only local cached files

43

files: Optional dictionary of model files to use instead of downloading

44

revision: Git revision to use when downloading from Hugging Face Hub

45

use_auth_token: Hugging Face authentication token

46

"""

47

```

48

49

### Audio Transcription

50

51

Transcribe audio files or numpy arrays to text with extensive configuration options for different use cases.

52

53

```python { .api }

54

def transcribe(

55

self,

56

audio: str | BinaryIO | np.ndarray,

57

language: str | None = None,

58

task: str = "transcribe",

59

log_progress: bool = False,

60

beam_size: int = 5,

61

best_of: int = 5,

62

patience: float = 1,

63

length_penalty: float = 1,

64

repetition_penalty: float = 1,

65

no_repeat_ngram_size: int = 0,

66

temperature: float | list[float] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],

67

compression_ratio_threshold: float | None = 2.4,

68

log_prob_threshold: float | None = -1.0,

69

no_speech_threshold: float | None = 0.6,

70

condition_on_previous_text: bool = True,

71

prompt_reset_on_temperature: float = 0.5,

72

initial_prompt: str | list[int] | None = None,

73

prefix: str | None = None,

74

suppress_blank: bool = True,

75

suppress_tokens: list[int] | None = [-1],

76

without_timestamps: bool = False,

77

max_initial_timestamp: float = 1.0,

78

word_timestamps: bool = False,

79

prepend_punctuations: str = "\"'“¿([{-",

80

append_punctuations: str = "\"'.。,,!!??::”)]}、",

81

vad_filter: bool = False,

82

vad_parameters: dict | VadOptions | None = None,

83

max_new_tokens: int | None = None,

84

chunk_length: int | None = None,

85

clip_timestamps: str | list[float] = "0",

86

hallucination_silence_threshold: float | None = None,

87

hotwords: str | None = None,

88

multilingual: bool = False,

89

language_detection_threshold: float | None = 0.5,

90

language_detection_segments: int = 1

91

) -> tuple[Iterator[Segment], TranscriptionInfo]:

92

"""

93

Transcribe an audio file.

94

95

Args:

96

audio: Path to audio file, file-like object, or numpy array of audio data

97

language: Language of the audio (ISO 639-1 code). If None, language is detected

98

task: Task to perform ("transcribe" or "translate")

99

log_progress: Whether to display progress information

100

beam_size: Beam size for beam search decoding

101

best_of: Number of candidates to consider when sampling with non-zero temperature

102

patience: Beam search patience factor

103

length_penalty: Length penalty for beam search

104

repetition_penalty: Repetition penalty for beam search

105

no_repeat_ngram_size: Prevent repetitions of n-grams

106

temperature: Temperature(s) for sampling. Can be float or list of floats

107

compression_ratio_threshold: If compression ratio is above this value, treat as failed transcription

108

log_prob_threshold: If average log probability is below this value, treat as failed transcription

109

no_speech_threshold: If no-speech probability is above this value, treat as silence

110

condition_on_previous_text: Whether to condition on previous transcribed text

111

prompt_reset_on_temperature: Reset prompt when temperature is above this value

112

initial_prompt: Optional initial prompt to condition transcription

113

prefix: Optional prefix to prepend to transcription

114

suppress_blank: Whether to suppress blank outputs

115

suppress_tokens: List of token IDs to suppress during generation

116

without_timestamps: Whether to sample only text tokens, omitting timestamps from the output

117

max_initial_timestamp: Maximum initial timestamp

118

word_timestamps: Whether to extract word-level timestamps

119

prepend_punctuations: Punctuations to prepend to word timestamps

120

append_punctuations: Punctuations to append to word timestamps

121

vad_filter: Whether to use voice activity detection to filter audio

122

vad_parameters: Parameters for voice activity detection

123

max_new_tokens: Maximum number of tokens to generate per segment

124

chunk_length: Length of audio chunks to process in seconds (default: 30s if not specified)

125

clip_timestamps: Comma-separated list of start,end timestamps (in seconds) of clips to process; the last end timestamp defaults to the end of the audio

126

hallucination_silence_threshold: Threshold for detecting hallucinations

127

hotwords: String of hotwords to boost during transcription

128

multilingual: Whether to perform language detection on every segment (useful for code-switched audio)

129

language_detection_threshold: Threshold for language detection confidence

130

language_detection_segments: Number of segments to use for language detection

131

132

Returns:

133

Tuple of (segments_iterator, transcription_info)

134

- segments_iterator: Iterator of Segment objects containing transcribed text and metadata

135

- transcription_info: TranscriptionInfo object with language, duration, and other metadata

136

"""

137

```

138

139

### Language Detection

140

141

Detect the language of audio content with confidence scores for all supported languages.

142

143

```python { .api }

144

def detect_language(

145

self,

146

audio: np.ndarray | None = None,

147

features: np.ndarray | None = None,

148

vad_filter: bool = False,

149

vad_parameters: dict | VadOptions | None = None

150

) -> tuple[str, float]:

151

"""

152

Detect the language of audio.

153

154

Args:

155

audio: Audio data as numpy array

156

features: Pre-computed audio features (alternative to audio)

157

vad_filter: Whether to use voice activity detection

158

vad_parameters: Parameters for voice activity detection

159

160

Returns:

161

Tuple of (language_code, confidence_score)

162

- language_code: ISO 639-1 language code

163

- confidence_score: Confidence probability (0-1)

164

"""

165

```

166

167

### Model Management

168

169

Functions for discovering and downloading available pre-trained models.

170

171

```python { .api }

172

def available_models() -> list[str]:

173

"""

174

Get list of available model names.

175

176

Returns:

177

List of model size strings that can be used with WhisperModel

178

"""

179

180

def download_model(

181

size_or_id: str,

182

output_dir: str | None = None,

183

local_files_only: bool = False,

184

cache_dir: str | None = None,

185

revision: str | None = None,

186

use_auth_token: str | bool | None = None

187

) -> str:

188

"""

189

Download a CTranslate2 Whisper model from Hugging Face Hub.

190

191

Args:

192

size_or_id: Model size (tiny, base, small, medium, large, etc.) or

193

full model ID from Hugging Face Hub

194

output_dir: Directory where model should be saved

195

local_files_only: If True, avoid downloading and use only cached files

196

cache_dir: Path to cache directory for storing downloaded models

197

revision: Git revision to download (branch, tag, or commit hash)

198

use_auth_token: Hugging Face authentication token

199

200

Returns:

201

Path to downloaded model directory

202

203

Raises:

204

ValueError: If model size is invalid

205

"""

206

```

207

208

## Usage Examples

209

210

### Basic Transcription

211

212

```python

213

from faster_whisper import WhisperModel

214

215

# Initialize model

216

model = WhisperModel("base", device="cpu", compute_type="int8")

217

218

# Transcribe with default settings

219

segments, info = model.transcribe("audio.mp3")

220

221

for segment in segments:

222

print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

223

```

224

225

### Advanced Transcription with Options

226

227

```python

228

from faster_whisper import WhisperModel

229

230

model = WhisperModel("medium", device="cuda", compute_type="float16")

231

232

# Transcribe with custom options

233

segments, info = model.transcribe(

234

"audio.mp3",

235

language="en",

236

word_timestamps=True,

237

beam_size=10,

238

vad_filter=True,

239

temperature=[0.0, 0.2, 0.4],

240

initial_prompt="This is a technical presentation about machine learning."

241

)

242

243

print(f"Language: {info.language} (confidence: {info.language_probability:.2f})")

244

print(f"Duration: {info.duration:.2f}s")

245

246

for segment in segments:

247

print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

248

if segment.words:

249

for word in segment.words:

250

print(f" {word.word} ({word.start:.2f}s-{word.end:.2f}s, p={word.probability:.2f})")

251

```

252

253

### Language Detection

254

255

```python

256

from faster_whisper import WhisperModel, decode_audio

257

258

model = WhisperModel("base")

259

260

# Decode audio first

261

audio = decode_audio("multilingual_audio.mp3")

262

263

# Detect language

264

language, confidence = model.detect_language(audio)

265

print(f"Detected language: {language} (confidence: {confidence:.2f})")

266

267

# Use detected language for transcription

268

segments, info = model.transcribe("multilingual_audio.mp3", language=language)

269

```