Faster Whisper transcription with CTranslate2 for high-performance speech recognition
npx @tessl/cli install tessl/pypi-faster-whisper@1.2.0

A high-performance reimplementation of OpenAI's Whisper automatic speech recognition model using CTranslate2 for fast inference. Faster Whisper delivers up to 4x faster transcription than the original openai/whisper implementation while maintaining the same accuracy and using less memory, with support for various precision levels (FP16, INT8) for both CPU and GPU execution.
pip install faster-whisper

from faster_whisper import WhisperModel

Common additional imports:
# Full public API surface used throughout this spec.
from faster_whisper import (
    WhisperModel,
    BatchedInferencePipeline,
    decode_audio,
    available_models,
    download_model,
    format_timestamp,
)

# The quickstart example below only needs the model class itself.
from faster_whisper import WhisperModel
# Initialize model: "base" checkpoint on CPU with int8 quantization (lower memory).
model = WhisperModel("base", device="cpu", compute_type="int8")
# Transcribe audio file; transcribe() returns (segments, info) — see L15 usage.
# NOTE(review): in faster-whisper, `segments` is typically a lazy iterator and
# decoding happens as you iterate — confirm against the library docs.
segments, info = model.transcribe("audio.mp3", beam_size=5)
print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
# Process transcription segments one by one.
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

The library is built around several key components:
This design enables efficient speech-to-text processing with extensive customization options for different deployment scenarios.
Primary speech recognition functionality including transcription, language detection, and model management. These are the main operations for converting audio to text.
class WhisperModel:
    """CTranslate2-backed Whisper model (API stub).

    Construction loads the model; transcribe() and detect_language() are the
    primary entry points for speech-to-text and language identification.
    """

    def __init__(self, model_size_or_path, device="auto", compute_type="default", **kwargs):
        # model_size_or_path: model name (e.g. "base") or a local model path.
        # device: "auto"/"cpu"/"cuda"; compute_type: precision, e.g. "int8", "float16".
        ...

    def transcribe(self, audio, language=None, task="transcribe", **kwargs):
        # Returns (segments, info): an iterable of Segment plus a TranscriptionInfo
        # (as used in the quickstart example above).
        ...

    def detect_language(self, audio=None, features=None, **kwargs):
        # Identify the spoken language from raw audio or precomputed features.
        ...
def available_models():
    # Stub: lists the model names accepted by WhisperModel / download_model.
    # NOTE(review): takes no `self` — this is a module-level helper, not a method.
    ...
def download_model(size_or_id, output_dir=None, **kwargs): ...

High-throughput batch processing capabilities for processing multiple audio files or chunks efficiently.
class BatchedInferencePipeline:
    """Batched wrapper around a WhisperModel for high-throughput inference (API stub)."""

    def __init__(self, model):
        # model: the WhisperModel instance to run batched inference with.
        ...
    def forward(self, features, tokenizer, chunks_metadata, options): ...

Audio decoding, format conversion, and preprocessing utilities for preparing audio data for transcription.
def decode_audio(input_file, sampling_rate=16000, split_stereo=False):
    # Stub: decode/resample an audio file to a waveform at `sampling_rate` Hz.
    # NOTE(review): split_stereo=True presumably returns the two channels
    # separately — confirm against the library documentation.
    ...
def pad_or_trim(array, length=3000, *, axis=-1): ...

Voice activity detection functionality using Silero VAD for automatic silence detection and audio segmentation.
@dataclass
class VadOptions:
    """Configuration for Silero-based voice activity detection."""

    # Speech probability threshold above which a frame counts as speech.
    threshold: float = 0.5
    # Discard speech chunks shorter than this (ms); 0 keeps everything.
    min_speech_duration_ms: int = 0
    # Split speech chunks longer than this (seconds); inf = never split.
    max_speech_duration_s: float = float("inf")
    # Silence required before a speech chunk is considered ended (ms).
    min_silence_duration_ms: int = 2000
    # Padding added to each side of detected speech chunks (ms).
    speech_pad_ms: int = 400
def get_speech_timestamps(audio, vad_options=None, sampling_rate=16000, **kwargs): ...

Helper functions for timestamp formatting, model information, and other utility operations.
def format_timestamp(seconds, always_include_hours=False, decimal_marker="."):
    # Stub: render a duration in seconds as a subtitle-style timestamp string,
    # presumably MM:SS + fractional part joined by `decimal_marker`, with hours
    # included when always_include_hours=True — confirm against the library.
    ...
def get_logger():
    # Stub: return the library's logger (for configuring log level/handlers).
    ...
def get_assets_path():
    """Return the path to the library's bundled asset files (stub)."""
    ...


@dataclass
class Word:
    """A single word from word-level timestamps."""

    start: float        # word start time in seconds
    end: float          # word end time in seconds
    word: str           # the word text
    probability: float  # model confidence for this word
@dataclass
class Segment:
    """One transcribed segment of audio with timing and decoding statistics."""

    id: int                    # sequential segment index
    seek: int                  # decoder seek offset where this segment started
    start: float               # segment start time in seconds
    end: float                 # segment end time in seconds
    text: str                  # transcribed text
    tokens: list[int]          # token ids produced for this segment
    avg_logprob: float         # average token log-probability
    compression_ratio: float   # compression ratio of the decoded text
    no_speech_prob: float      # probability that the segment contains no speech
    words: list[Word] | None   # per-word timestamps, or None when not requested
    temperature: float | None  # sampling temperature used for this segment
@dataclass
class TranscriptionInfo:
    """Metadata describing a completed transcription run."""

    language: str                # detected (or caller-forced) language code
    language_probability: float  # confidence of the language detection
    duration: float              # input audio duration in seconds
    duration_after_vad: float    # duration remaining after VAD silence removal
    # Full (language, probability) ranking when computed, else None.
    all_language_probs: list[tuple[str, float]] | None
    transcription_options: TranscriptionOptions  # options used for this run
    vad_options: VadOptions                      # VAD options used for this run
@dataclass
class TranscriptionOptions:
    """Decoding options controlling the transcription loop."""

    # --- beam search / sampling ---
    beam_size: int
    best_of: int
    patience: float
    length_penalty: float
    repetition_penalty: float
    no_repeat_ngram_size: int
    # --- quality thresholds (Optional fields may be None) ---
    log_prob_threshold: float | None
    no_speech_threshold: float | None
    compression_ratio_threshold: float | None
    # --- prompting / context ---
    condition_on_previous_text: bool
    prompt_reset_on_temperature: float
    temperatures: list[float]
    initial_prompt: str | list[int] | None
    prefix: str | None
    # --- token suppression / timestamps ---
    suppress_blank: bool
    suppress_tokens: list[int] | None
    without_timestamps: bool
    max_initial_timestamp: float
    word_timestamps: bool
    prepend_punctuations: str
    append_punctuations: str
    # --- miscellaneous ---
    multilingual: bool
    max_new_tokens: int | None
    clip_timestamps: str | list[float]
    hallucination_silence_threshold: float | None
    hotwords: str | None