Faster Whisper transcription with CTranslate2 for high-performance speech recognition
—
Primary speech recognition functionality including transcription, language detection, and model management. These are the main operations for converting audio to text and managing Whisper models.
Create and configure a Whisper model for speech recognition with support for different model sizes, devices, and compute types.
class WhisperModel:
def __init__(
self,
model_size_or_path: str,
device: str = "auto",
device_index: int | list[int] = 0,
compute_type: str = "default",
cpu_threads: int = 0,
num_workers: int = 1,
download_root: str | None = None,
local_files_only: bool = False,
files: dict | None = None,
revision: str | None = None,
use_auth_token: str | bool | None = None,
**model_kwargs
):
"""
Initialize a Whisper model.
Args:
model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
small, small.en, distil-small.en, medium, medium.en, distil-medium.en,
large-v1, large-v2, large-v3, large, distil-large-v2, distil-large-v3,
distil-large-v3.5, large-v3-turbo, turbo) or path to model directory
device: Device to use for computation ("auto", "cpu", "cuda")
device_index: Device index(es) to use for CUDA
compute_type: Type to use for computation ("default", "auto", "int8", "int8_float32",
"int8_float16", "int8_bfloat16", "int16", "float16", "bfloat16", "float32")
cpu_threads: Number of threads to use when running on CPU
num_workers: Number of workers to use for transcription
download_root: Directory where models should be downloaded
local_files_only: If True, avoid downloading files and use only local cached files
files: Optional dictionary of model files to use instead of downloading
revision: Git revision to use when downloading from Hugging Face Hub
use_auth_token: Hugging Face authentication token
"""Transcribe audio files or numpy arrays to text with extensive configuration options for different use cases.
def transcribe(
self,
audio: str | BinaryIO | np.ndarray,
language: str | None = None,
task: str = "transcribe",
log_progress: bool = False,
beam_size: int = 5,
best_of: int = 5,
patience: float = 1,
length_penalty: float = 1,
repetition_penalty: float = 1,
no_repeat_ngram_size: int = 0,
temperature: float | list[float] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
compression_ratio_threshold: float | None = 2.4,
log_prob_threshold: float | None = -1.0,
no_speech_threshold: float | None = 0.6,
condition_on_previous_text: bool = True,
prompt_reset_on_temperature: float = 0.5,
initial_prompt: str | list[int] | None = None,
prefix: str | None = None,
suppress_blank: bool = True,
suppress_tokens: list[int] | None = [-1],
without_timestamps: bool = False,
max_initial_timestamp: float = 1.0,
word_timestamps: bool = False,
prepend_punctuations: str = "\"'"¿([{-",
append_punctuations: str = "\"'.。,,!!??::")]}、",
vad_filter: bool = False,
vad_parameters: dict | VadOptions | None = None,
max_new_tokens: int | None = None,
chunk_length: int | None = None,
clip_timestamps: str | list[float] = "0",
hallucination_silence_threshold: float | None = None,
hotwords: str | None = None,
multilingual: bool = False,
language_detection_threshold: float | None = 0.5,
language_detection_segments: int = 1
) -> tuple[Iterator[Segment], TranscriptionInfo]:
"""
Transcribe an audio file.
Args:
audio: Path to audio file, file-like object, or numpy array of audio data
language: Language of the audio (ISO 639-1 code). If None, language is detected
task: Task to perform ("transcribe" or "translate")
log_progress: Whether to display progress information
beam_size: Beam size for beam search decoding
best_of: Number of candidates to generate using beam search
patience: Beam search patience factor
length_penalty: Length penalty for beam search
repetition_penalty: Repetition penalty for beam search
no_repeat_ngram_size: Prevent repetitions of n-grams
temperature: Temperature(s) for sampling. Can be float or list of floats
compression_ratio_threshold: If compression ratio is above this value, treat as failed transcription
log_prob_threshold: If average log probability is below this value, treat as failed transcription
no_speech_threshold: If no-speech probability is above this value, treat as silence
condition_on_previous_text: Whether to condition on previous transcribed text
prompt_reset_on_temperature: Reset prompt when temperature is above this value
initial_prompt: Optional initial prompt to condition transcription
prefix: Optional prefix to prepend to transcription
suppress_blank: Whether to suppress blank outputs
suppress_tokens: List of token IDs to suppress during generation
without_timestamps: Whether to include timestamps in output
max_initial_timestamp: Maximum initial timestamp
word_timestamps: Whether to extract word-level timestamps
prepend_punctuations: Punctuations to prepend to word timestamps
append_punctuations: Punctuations to append to word timestamps
vad_filter: Whether to use voice activity detection to filter audio
vad_parameters: Parameters for voice activity detection
max_new_tokens: Maximum number of tokens to generate per segment
chunk_length: Length of audio chunks to process in seconds (default: 30s if not specified)
clip_timestamps: How to handle timestamps that go beyond audio duration
hallucination_silence_threshold: Threshold for detecting hallucinations
hotwords: String of hotwords to boost during transcription
multilingual: Whether the model supports multiple languages
language_detection_threshold: Threshold for language detection confidence
language_detection_segments: Number of segments to use for language detection
Returns:
Tuple of (segments_iterator, transcription_info)
- segments_iterator: Iterator of Segment objects containing transcribed text and metadata
- transcription_info: TranscriptionInfo object with language, duration, and other metadata
"""Detect the language of audio content with confidence scores for all supported languages.
def detect_language(
self,
audio: np.ndarray | None = None,
features: np.ndarray | None = None,
vad_filter: bool = False,
vad_parameters: dict | VadOptions | None = None
) -> tuple[str, float]:
"""
Detect the language of audio.
Args:
audio: Audio data as numpy array
features: Pre-computed audio features (alternative to audio)
vad_filter: Whether to use voice activity detection
vad_parameters: Parameters for voice activity detection
Returns:
Tuple of (language_code, confidence_score)
- language_code: ISO 639-1 language code
- confidence_score: Confidence probability (0-1)
"""Functions for discovering and downloading available pre-trained models.
def available_models() -> list[str]:
"""
Get list of available model names.
Returns:
List of model size strings that can be used with WhisperModel
"""
def download_model(
size_or_id: str,
output_dir: str | None = None,
local_files_only: bool = False,
cache_dir: str | None = None,
revision: str | None = None,
use_auth_token: str | bool | None = None
) -> str:
"""
Download a CTranslate2 Whisper model from Hugging Face Hub.
Args:
size_or_id: Model size (tiny, base, small, medium, large, etc.) or
full model ID from Hugging Face Hub
output_dir: Directory where model should be saved
local_files_only: If True, avoid downloading and use only cached files
cache_dir: Path to cache directory for storing downloaded models
revision: Git revision to download (branch, tag, or commit hash)
use_auth_token: Hugging Face authentication token
Returns:
Path to downloaded model directory
Raises:
ValueError: If model size is invalid
"""from faster_whisper import WhisperModel
# Initialize model
model = WhisperModel("base", device="cpu", compute_type="int8")
# Transcribe with default settings
segments, info = model.transcribe("audio.mp3")
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

from faster_whisper import WhisperModel
model = WhisperModel("medium", device="cuda", compute_type="float16")
# Transcribe with custom options
segments, info = model.transcribe(
"audio.mp3",
language="en",
word_timestamps=True,
beam_size=10,
vad_filter=True,
temperature=[0.0, 0.2, 0.4],
initial_prompt="This is a technical presentation about machine learning."
)
print(f"Language: {info.language} (confidence: {info.language_probability:.2f})")
print(f"Duration: {info.duration:.2f}s")
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
    if segment.words:
        for word in segment.words:
            print(f"  {word.word} ({word.start:.2f}s-{word.end:.2f}s, p={word.probability:.2f})")

from faster_whisper import WhisperModel, decode_audio
model = WhisperModel("base")
# Decode audio first
audio = decode_audio("multilingual_audio.mp3")
# Detect language
language, confidence = model.detect_language(audio)
print(f"Detected language: {language} (confidence: {confidence:.2f})")
# Use detected language for transcription
segments, info = model.transcribe("multilingual_audio.mp3", language=language)

Install with Tessl CLI
npx tessl i tessl/pypi-faster-whisper