# Core Speech Recognition

Primary speech recognition functionality, including transcription, language detection, and model management. These are the main operations for converting audio to text and managing Whisper models.

## Capabilities

### WhisperModel Initialization

Create and configure a Whisper model for speech recognition, with support for different model sizes, devices, and compute types.

```python { .api }
class WhisperModel:
    def __init__(
        self,
        model_size_or_path: str,
        device: str = "auto",
        device_index: int | list[int] = 0,
        compute_type: str = "default",
        cpu_threads: int = 0,
        num_workers: int = 1,
        download_root: str | None = None,
        local_files_only: bool = False,
        files: dict | None = None,
        revision: str | None = None,
        use_auth_token: str | bool | None = None,
        **model_kwargs
    ):
        """
        Initialize a Whisper model.

        Args:
            model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
                small, small.en, distil-small.en, medium, medium.en, distil-medium.en,
                large-v1, large-v2, large-v3, large, distil-large-v2, distil-large-v3,
                distil-large-v3.5, large-v3-turbo, turbo) or path to a model directory
            device: Device to use for computation ("auto", "cpu", "cuda")
            device_index: Device index(es) to use for CUDA
            compute_type: Type to use for computation ("default", "auto", "int8", "int8_float32",
                "int8_float16", "int8_bfloat16", "int16", "float16", "bfloat16", "float32")
            cpu_threads: Number of threads to use when running on CPU
            num_workers: Number of workers to use for transcription
            download_root: Directory where models should be downloaded
            local_files_only: If True, avoid downloading files and use only local cached files
            files: Optional dictionary of model files to use instead of downloading
            revision: Git revision to use when downloading from the Hugging Face Hub
            use_auth_token: Hugging Face authentication token
        """
```
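
A typical setup pairs the compute type with the hardware: `int8` keeps memory usage low on CPU, while `float16` is a common choice on CUDA GPUs. A minimal sketch (the model sizes and settings here are illustrative, not required):

```python
from faster_whisper import WhisperModel

# CPU with int8 quantization keeps memory usage low.
cpu_model = WhisperModel("base", device="cpu", compute_type="int8", cpu_threads=4)

# On a CUDA GPU, float16 trades a little precision for speed.
gpu_model = WhisperModel("large-v3", device="cuda", compute_type="float16")
```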

### Audio Transcription

Transcribe audio files, file-like objects, or NumPy arrays to text, with extensive configuration options for different use cases.

```python { .api }
def transcribe(
    self,
    audio: str | BinaryIO | np.ndarray,
    language: str | None = None,
    task: str = "transcribe",
    log_progress: bool = False,
    beam_size: int = 5,
    best_of: int = 5,
    patience: float = 1,
    length_penalty: float = 1,
    repetition_penalty: float = 1,
    no_repeat_ngram_size: int = 0,
    temperature: float | list[float] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    compression_ratio_threshold: float | None = 2.4,
    log_prob_threshold: float | None = -1.0,
    no_speech_threshold: float | None = 0.6,
    condition_on_previous_text: bool = True,
    prompt_reset_on_temperature: float = 0.5,
    initial_prompt: str | list[int] | None = None,
    prefix: str | None = None,
    suppress_blank: bool = True,
    suppress_tokens: list[int] | None = [-1],
    without_timestamps: bool = False,
    max_initial_timestamp: float = 1.0,
    word_timestamps: bool = False,
    prepend_punctuations: str = "\"'“¿([{-",
    append_punctuations: str = "\"'.。,，!！?？:：”)]}、",
    vad_filter: bool = False,
    vad_parameters: dict | VadOptions | None = None,
    max_new_tokens: int | None = None,
    chunk_length: int | None = None,
    clip_timestamps: str | list[float] = "0",
    hallucination_silence_threshold: float | None = None,
    hotwords: str | None = None,
    multilingual: bool = False,
    language_detection_threshold: float | None = 0.5,
    language_detection_segments: int = 1
) -> tuple[Iterator[Segment], TranscriptionInfo]:
    """
    Transcribe an audio file.

    Args:
        audio: Path to the audio file, a file-like object, or a numpy array of audio data
        language: Language of the audio (ISO 639-1 code). If None, the language is detected
        task: Task to perform ("transcribe" or "translate")
        log_progress: Whether to display progress information
        beam_size: Beam size for beam search decoding
        best_of: Number of candidates when sampling with non-zero temperature
        patience: Beam search patience factor
        length_penalty: Length penalty for beam search
        repetition_penalty: Penalty applied to the score of previously generated tokens
            (values above 1 penalize repetition)
        no_repeat_ngram_size: Prevent repetitions of n-grams of this size (0 to disable)
        temperature: Temperature(s) for sampling. Can be a float or a list of floats that
            are used as successive fallbacks when decoding fails at lower temperatures
        compression_ratio_threshold: If the compression ratio of the decoded text is above
            this value, treat it as a failed transcription
        log_prob_threshold: If the average log probability is below this value, treat the
            decoding as failed
        no_speech_threshold: If the no-speech probability is above this value, treat the
            segment as silence
        condition_on_previous_text: Whether to condition on previously transcribed text
        prompt_reset_on_temperature: Reset the prompt when the sampling temperature is above
            this value (only has an effect if condition_on_previous_text is True)
        initial_prompt: Optional text or token IDs to provide as a prompt for the first window
        prefix: Optional text to provide as a prefix for the first window
        suppress_blank: Whether to suppress blank outputs at the beginning of sampling
        suppress_tokens: List of token IDs to suppress during generation
        without_timestamps: If True, sample only text tokens and omit timestamps
        max_initial_timestamp: The initial timestamp cannot be later than this value (in seconds)
        word_timestamps: Whether to extract word-level timestamps
        prepend_punctuations: If word_timestamps is True, merge these punctuation symbols
            with the next word
        append_punctuations: If word_timestamps is True, merge these punctuation symbols
            with the previous word
        vad_filter: Whether to use voice activity detection to filter out non-speech audio
        vad_parameters: Parameters for voice activity detection
        max_new_tokens: Maximum number of tokens to generate per segment
        chunk_length: Length of audio chunks to process in seconds (defaults to 30s if not specified)
        clip_timestamps: Comma-separated string or list of start,end timestamps (in seconds)
            of clips within the audio to process
        hallucination_silence_threshold: When word_timestamps is True, skip silent periods
            longer than this threshold (in seconds) when a possible hallucination is detected
        hotwords: Hotwords or hint phrases to provide to the model; has no effect if prefix is set
        multilingual: Whether to perform language detection on every segment, enabling
            transcription of audio that mixes languages
        language_detection_threshold: If the maximum probability of the language tokens is
            higher than this value, the language is detected
        language_detection_segments: Number of segments to use for language detection

    Returns:
        Tuple of (segments_iterator, transcription_info)
        - segments_iterator: Iterator of Segment objects containing transcribed text and metadata
        - transcription_info: TranscriptionInfo object with language, duration, and other metadata
    """
```
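
Note that the returned segments value is a lazy generator: the model only starts transcribing when you iterate over it. A short sketch of running the transcription to completion up front (the file name is illustrative):

```python
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="int8")
segments, info = model.transcribe("audio.mp3")

# Nothing has been transcribed yet; the generator runs the model on demand.
# Collecting into a list runs the transcription to completion.
segments = list(segments)
print(f"{len(segments)} segments, detected language: {info.language}")
```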

### Language Detection

Detect the language of audio content, returning the most probable language code together with its confidence score.

```python { .api }
def detect_language(
    self,
    audio: np.ndarray | None = None,
    features: np.ndarray | None = None,
    vad_filter: bool = False,
    vad_parameters: dict | VadOptions | None = None
) -> tuple[str, float]:
    """
    Detect the language of audio.

    Args:
        audio: Audio data as a numpy array
        features: Pre-computed audio features (alternative to audio)
        vad_filter: Whether to use voice activity detection
        vad_parameters: Parameters for voice activity detection

    Returns:
        Tuple of (language_code, confidence_score)
        - language_code: ISO 639-1 language code
        - confidence_score: Confidence probability (0-1)
    """
```
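
On noisy recordings, detection can be more reliable when non-speech is filtered out first. A minimal sketch using the vad_filter option (the input file name is hypothetical):

```python
from faster_whisper import WhisperModel, decode_audio

model = WhisperModel("base")
audio = decode_audio("noisy_audio.mp3")  # hypothetical input file

# Run VAD before detection so silence and background noise do not skew the result.
language, confidence = model.detect_language(audio, vad_filter=True)
print(f"{language} ({confidence:.2f})")
```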

### Model Management

Functions for discovering and downloading available pre-trained models.

```python { .api }
def available_models() -> list[str]:
    """
    Get the list of available model names.

    Returns:
        List of model size strings that can be used with WhisperModel
    """

def download_model(
    size_or_id: str,
    output_dir: str | None = None,
    local_files_only: bool = False,
    cache_dir: str | None = None,
    revision: str | None = None,
    use_auth_token: str | bool | None = None
) -> str:
    """
    Download a CTranslate2 Whisper model from the Hugging Face Hub.

    Args:
        size_or_id: Model size (tiny, base, small, medium, large, etc.) or
            a full model ID from the Hugging Face Hub
        output_dir: Directory where the model should be saved
        local_files_only: If True, avoid downloading and use only cached files
        cache_dir: Path to the cache directory for storing downloaded models
        revision: Git revision to download (branch, tag, or commit hash)
        use_auth_token: Hugging Face authentication token

    Returns:
        Path to the downloaded model directory

    Raises:
        ValueError: If the model size is invalid
    """
```
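
These functions are handy for pre-fetching models, for example on a machine that will later run offline. A minimal sketch (the ./models/small output directory is illustrative):

```python
from faster_whisper import WhisperModel, available_models, download_model

print(available_models())  # model sizes accepted by WhisperModel

# Pre-download once, then load from the local path without hitting the network.
model_path = download_model("small", output_dir="./models/small")
model = WhisperModel(model_path, device="cpu", compute_type="int8", local_files_only=True)
```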

## Usage Examples

### Basic Transcription

```python
from faster_whisper import WhisperModel

# Initialize model
model = WhisperModel("base", device="cpu", compute_type="int8")

# Transcribe with default settings
segments, info = model.transcribe("audio.mp3")

for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
```
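
The same call can also translate: setting task="translate" produces English text from non-English speech. A short sketch (the input file name is illustrative):

```python
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="int8")

# Translate non-English speech directly to English text.
segments, info = model.transcribe("interview_de.mp3", task="translate")
print("".join(segment.text for segment in segments))
```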

### Advanced Transcription with Options

```python
from faster_whisper import WhisperModel

model = WhisperModel("medium", device="cuda", compute_type="float16")

# Transcribe with custom options
segments, info = model.transcribe(
    "audio.mp3",
    language="en",
    word_timestamps=True,
    beam_size=10,
    vad_filter=True,
    temperature=[0.0, 0.2, 0.4],
    initial_prompt="This is a technical presentation about machine learning."
)

print(f"Language: {info.language} (confidence: {info.language_probability:.2f})")
print(f"Duration: {info.duration:.2f}s")

for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
    if segment.words:
        for word in segment.words:
            print(f"  {word.word} ({word.start:.2f}s-{word.end:.2f}s, p={word.probability:.2f})")
```

### Language Detection

```python
from faster_whisper import WhisperModel, decode_audio

model = WhisperModel("base")

# Decode audio first
audio = decode_audio("multilingual_audio.mp3")

# Detect language
language, confidence = model.detect_language(audio)
print(f"Detected language: {language} (confidence: {confidence:.2f})")

# Use detected language for transcription
segments, info = model.transcribe("multilingual_audio.mp3", language=language)
```