# Core Speech Recognition

Primary speech recognition functionality, including transcription, language detection, and model management. These are the main operations for converting audio to text and managing Whisper models.

## Capabilities

### WhisperModel Initialization

Create and configure a Whisper model for speech recognition, with support for different model sizes, devices, and compute types.

```python { .api }
class WhisperModel:
    def __init__(
        self,
        model_size_or_path: str,
        device: str = "auto",
        device_index: int | list[int] = 0,
        compute_type: str = "default",
        cpu_threads: int = 0,
        num_workers: int = 1,
        download_root: str | None = None,
        local_files_only: bool = False,
        files: dict | None = None,
        revision: str | None = None,
        use_auth_token: str | bool | None = None,
        **model_kwargs
    ):
        """
        Initialize a Whisper model.

        Args:
            model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
                small, small.en, distil-small.en, medium, medium.en, distil-medium.en,
                large-v1, large-v2, large-v3, large, distil-large-v2, distil-large-v3,
                distil-large-v3.5, large-v3-turbo, turbo) or path to a model directory
            device: Device to use for computation ("auto", "cpu", "cuda")
            device_index: Device index(es) to use for CUDA
            compute_type: Type to use for computation ("default", "auto", "int8", "int8_float32",
                "int8_float16", "int8_bfloat16", "int16", "float16", "bfloat16", "float32")
            cpu_threads: Number of threads to use when running on CPU
            num_workers: Number of workers to use for transcription
            download_root: Directory where models should be downloaded
            local_files_only: If True, avoid downloading files and use only local cached files
            files: Optional dictionary of model files to use instead of downloading
            revision: Git revision to use when downloading from the Hugging Face Hub
            use_auth_token: Hugging Face authentication token
        """
```
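
A typical setup pairs the compute type with the hardware: `int8` keeps memory usage low on CPU, while `float16` is a common choice on CUDA GPUs. A minimal sketch (the model sizes and settings here are illustrative, not required):

```python
from faster_whisper import WhisperModel

# CPU with int8 quantization keeps memory usage low.
cpu_model = WhisperModel("base", device="cpu", compute_type="int8", cpu_threads=4)

# On a CUDA GPU, float16 trades a little precision for speed.
gpu_model = WhisperModel("large-v3", device="cuda", compute_type="float16")
```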

### Audio Transcription

Transcribe audio files, file-like objects, or NumPy arrays to text, with extensive configuration options for different use cases.

```python { .api }
def transcribe(
    self,
    audio: str | BinaryIO | np.ndarray,
    language: str | None = None,
    task: str = "transcribe",
    log_progress: bool = False,
    beam_size: int = 5,
    best_of: int = 5,
    patience: float = 1,
    length_penalty: float = 1,
    repetition_penalty: float = 1,
    no_repeat_ngram_size: int = 0,
    temperature: float | list[float] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    compression_ratio_threshold: float | None = 2.4,
    log_prob_threshold: float | None = -1.0,
    no_speech_threshold: float | None = 0.6,
    condition_on_previous_text: bool = True,
    prompt_reset_on_temperature: float = 0.5,
    initial_prompt: str | list[int] | None = None,
    prefix: str | None = None,
    suppress_blank: bool = True,
    suppress_tokens: list[int] | None = [-1],
    without_timestamps: bool = False,
    max_initial_timestamp: float = 1.0,
    word_timestamps: bool = False,
    prepend_punctuations: str = "\"'“¿([{-",
    append_punctuations: str = "\"'.。,，!！?？:：”)]}、",
    vad_filter: bool = False,
    vad_parameters: dict | VadOptions | None = None,
    max_new_tokens: int | None = None,
    chunk_length: int | None = None,
    clip_timestamps: str | list[float] = "0",
    hallucination_silence_threshold: float | None = None,
    hotwords: str | None = None,
    multilingual: bool = False,
    language_detection_threshold: float | None = 0.5,
    language_detection_segments: int = 1
) -> tuple[Iterator[Segment], TranscriptionInfo]:
    """
    Transcribe an audio file.

    Args:
        audio: Path to the audio file, a file-like object, or a numpy array of audio data
        language: Language of the audio (ISO 639-1 code). If None, the language is detected
        task: Task to perform ("transcribe" or "translate")
        log_progress: Whether to display progress information
        beam_size: Beam size for beam search decoding
        best_of: Number of candidates when sampling with non-zero temperature
        patience: Beam search patience factor
        length_penalty: Length penalty for beam search
        repetition_penalty: Penalty applied to the score of previously generated tokens
            (values above 1 penalize repetition)
        no_repeat_ngram_size: Prevent repetitions of n-grams of this size (0 to disable)
        temperature: Temperature(s) for sampling. Can be a float or a list of floats that
            are used as successive fallbacks when decoding fails at lower temperatures
        compression_ratio_threshold: If the compression ratio of the decoded text is above
            this value, treat it as a failed transcription
        log_prob_threshold: If the average log probability is below this value, treat the
            decoding as failed
        no_speech_threshold: If the no-speech probability is above this value, treat the
            segment as silence
        condition_on_previous_text: Whether to condition on previously transcribed text
        prompt_reset_on_temperature: Reset the prompt when the sampling temperature is above
            this value (only has an effect if condition_on_previous_text is True)
        initial_prompt: Optional text or token IDs to provide as a prompt for the first window
        prefix: Optional text to provide as a prefix for the first window
        suppress_blank: Whether to suppress blank outputs at the beginning of sampling
        suppress_tokens: List of token IDs to suppress during generation
        without_timestamps: If True, sample only text tokens and omit timestamps
        max_initial_timestamp: The initial timestamp cannot be later than this value (in seconds)
        word_timestamps: Whether to extract word-level timestamps
        prepend_punctuations: If word_timestamps is True, merge these punctuation symbols
            with the next word
        append_punctuations: If word_timestamps is True, merge these punctuation symbols
            with the previous word
        vad_filter: Whether to use voice activity detection to filter out non-speech audio
        vad_parameters: Parameters for voice activity detection
        max_new_tokens: Maximum number of tokens to generate per segment
        chunk_length: Length of audio chunks to process in seconds (defaults to 30s if not specified)
        clip_timestamps: Comma-separated string or list of start,end timestamps (in seconds)
            of clips within the audio to process
        hallucination_silence_threshold: When word_timestamps is True, skip silent periods
            longer than this threshold (in seconds) when a possible hallucination is detected
        hotwords: Hotwords or hint phrases to provide to the model; has no effect if prefix is set
        multilingual: Whether to perform language detection on every segment, enabling
            transcription of audio that mixes languages
        language_detection_threshold: If the maximum probability of the language tokens is
            higher than this value, the language is detected
        language_detection_segments: Number of segments to use for language detection

    Returns:
        Tuple of (segments_iterator, transcription_info)
        - segments_iterator: Iterator of Segment objects containing transcribed text and metadata
        - transcription_info: TranscriptionInfo object with language, duration, and other metadata
    """
```
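
Note that the returned segments value is a lazy generator: the model only starts transcribing when you iterate over it. A short sketch of running the transcription to completion up front (the file name is illustrative):

```python
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="int8")
segments, info = model.transcribe("audio.mp3")

# Nothing has been transcribed yet; the generator runs the model on demand.
# Collecting into a list runs the transcription to completion.
segments = list(segments)
print(f"{len(segments)} segments, detected language: {info.language}")
```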

### Language Detection

Detect the language of audio content, returning the most probable language code together with its confidence score.

```python { .api }
def detect_language(
    self,
    audio: np.ndarray | None = None,
    features: np.ndarray | None = None,
    vad_filter: bool = False,
    vad_parameters: dict | VadOptions | None = None
) -> tuple[str, float]:
    """
    Detect the language of audio.

    Args:
        audio: Audio data as a numpy array
        features: Pre-computed audio features (alternative to audio)
        vad_filter: Whether to use voice activity detection
        vad_parameters: Parameters for voice activity detection

    Returns:
        Tuple of (language_code, confidence_score)
        - language_code: ISO 639-1 language code
        - confidence_score: Confidence probability (0-1)
    """
```
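
On noisy recordings, detection can be more reliable when non-speech is filtered out first. A minimal sketch using the vad_filter option (the input file name is hypothetical):

```python
from faster_whisper import WhisperModel, decode_audio

model = WhisperModel("base")
audio = decode_audio("noisy_audio.mp3")  # hypothetical input file

# Run VAD before detection so silence and background noise do not skew the result.
language, confidence = model.detect_language(audio, vad_filter=True)
print(f"{language} ({confidence:.2f})")
```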

### Model Management

Functions for discovering and downloading available pre-trained models.

```python { .api }
def available_models() -> list[str]:
    """
    Get the list of available model names.

    Returns:
        List of model size strings that can be used with WhisperModel
    """

def download_model(
    size_or_id: str,
    output_dir: str | None = None,
    local_files_only: bool = False,
    cache_dir: str | None = None,
    revision: str | None = None,
    use_auth_token: str | bool | None = None
) -> str:
    """
    Download a CTranslate2 Whisper model from the Hugging Face Hub.

    Args:
        size_or_id: Model size (tiny, base, small, medium, large, etc.) or
            a full model ID from the Hugging Face Hub
        output_dir: Directory where the model should be saved
        local_files_only: If True, avoid downloading and use only cached files
        cache_dir: Path to the cache directory for storing downloaded models
        revision: Git revision to download (branch, tag, or commit hash)
        use_auth_token: Hugging Face authentication token

    Returns:
        Path to the downloaded model directory

    Raises:
        ValueError: If the model size is invalid
    """
```
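
These functions are handy for pre-fetching models, for example on a machine that will later run offline. A minimal sketch (the ./models/small output directory is illustrative):

```python
from faster_whisper import WhisperModel, available_models, download_model

print(available_models())  # model sizes accepted by WhisperModel

# Pre-download once, then load from the local path without hitting the network.
model_path = download_model("small", output_dir="./models/small")
model = WhisperModel(model_path, device="cpu", compute_type="int8", local_files_only=True)
```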

## Usage Examples

### Basic Transcription

```python
from faster_whisper import WhisperModel

# Initialize model
model = WhisperModel("base", device="cpu", compute_type="int8")

# Transcribe with default settings
segments, info = model.transcribe("audio.mp3")

for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
```
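
The same call can also translate: setting task="translate" produces English text from non-English speech. A short sketch (the input file name is illustrative):

```python
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="int8")

# Translate non-English speech directly to English text.
segments, info = model.transcribe("interview_de.mp3", task="translate")
print("".join(segment.text for segment in segments))
```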

### Advanced Transcription with Options

```python
from faster_whisper import WhisperModel

model = WhisperModel("medium", device="cuda", compute_type="float16")

# Transcribe with custom options
segments, info = model.transcribe(
    "audio.mp3",
    language="en",
    word_timestamps=True,
    beam_size=10,
    vad_filter=True,
    temperature=[0.0, 0.2, 0.4],
    initial_prompt="This is a technical presentation about machine learning."
)

print(f"Language: {info.language} (confidence: {info.language_probability:.2f})")
print(f"Duration: {info.duration:.2f}s")

for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
    if segment.words:
        for word in segment.words:
            print(f"  {word.word} ({word.start:.2f}s-{word.end:.2f}s, p={word.probability:.2f})")
```

### Language Detection

```python
from faster_whisper import WhisperModel, decode_audio

model = WhisperModel("base")

# Decode audio first
audio = decode_audio("multilingual_audio.mp3")

# Detect language
language, confidence = model.detect_language(audio)
print(f"Detected language: {language} (confidence: {confidence:.2f})")

# Use detected language for transcription
segments, info = model.transcribe("multilingual_audio.mp3", language=language)
```