CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-faster-whisper

Faster Whisper transcription with CTranslate2 for high-performance speech recognition

Pending
Overview
Eval results
Files

docs/core-speech-recognition.md

Core Speech Recognition

Primary speech recognition functionality including transcription, language detection, and model management. These are the main operations for converting audio to text and managing Whisper models.

Capabilities

WhisperModel Initialization

Create and configure a Whisper model for speech recognition with support for different model sizes, devices, and compute types.

class WhisperModel:
    def __init__(
        self,
        model_size_or_path: str,
        device: str = "auto",
        device_index: int | list[int] = 0,
        compute_type: str = "default",
        cpu_threads: int = 0,
        num_workers: int = 1,
        download_root: str | None = None,
        local_files_only: bool = False,
        files: dict | None = None,
        revision: str | None = None,
        use_auth_token: str | bool | None = None,
        **model_kwargs
    ):
        """
        Initialize a Whisper model for speech recognition.

        Args:
            model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
                small, small.en, distil-small.en, medium, medium.en, distil-medium.en,
                large-v1, large-v2, large-v3, large, distil-large-v2, distil-large-v3,
                distil-large-v3.5, large-v3-turbo, turbo) or path to a local model
                directory. Size names are downloaded automatically; a path skips the
                download entirely.
            device: Device to use for computation ("auto", "cpu", "cuda")
            device_index: Device index to use for CUDA; a list of indices spreads the
                model across multiple GPUs
            compute_type: Type to use for computation ("default", "auto", "int8", "int8_float32",
                "int8_float16", "int8_bfloat16", "int16", "float16", "bfloat16", "float32")
            cpu_threads: Number of threads to use when running on CPU (0 lets the
                backend choose)
            num_workers: Number of workers to use for transcription
            download_root: Directory where models should be downloaded
            local_files_only: If True, avoid downloading files and use only local cached files
            files: Optional dictionary of model files to use instead of downloading
            revision: Git revision to use when downloading from Hugging Face Hub
            use_auth_token: Hugging Face authentication token
            model_kwargs: Additional keyword arguments forwarded to the underlying
                model constructor (presumably CTranslate2 — confirm against the
                implementation before relying on specific options)
        """

Audio Transcription

Transcribe audio files or numpy arrays to text with extensive configuration options for different use cases.

def transcribe(
    self,
    audio: str | BinaryIO | np.ndarray,
    language: str | None = None,
    task: str = "transcribe",
    log_progress: bool = False,
    beam_size: int = 5,
    best_of: int = 5,
    patience: float = 1,
    length_penalty: float = 1,
    repetition_penalty: float = 1,
    no_repeat_ngram_size: int = 0,
    # NOTE: the mutable defaults below ([...] literals) mirror the upstream
    # faster-whisper signature exactly; they are read-only fallbacks.
    temperature: float | list[float] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
    compression_ratio_threshold: float | None = 2.4,
    log_prob_threshold: float | None = -1.0,
    no_speech_threshold: float | None = 0.6,
    condition_on_previous_text: bool = True,
    prompt_reset_on_temperature: float = 0.5,
    initial_prompt: str | list[int] | None = None,
    prefix: str | None = None,
    suppress_blank: bool = True,
    suppress_tokens: list[int] | None = [-1],
    without_timestamps: bool = False,
    max_initial_timestamp: float = 1.0,
    word_timestamps: bool = False,
    # The curly quotes in the two defaults below are part of the upstream
    # values; flattening them to straight quotes breaks the string literals.
    prepend_punctuations: str = "\"'“¿([{-",
    append_punctuations: str = "\"'.。,，!!??::”)]}、",
    vad_filter: bool = False,
    vad_parameters: dict | VadOptions | None = None,
    max_new_tokens: int | None = None,
    chunk_length: int | None = None,
    clip_timestamps: str | list[float] = "0",
    hallucination_silence_threshold: float | None = None,
    hotwords: str | None = None,
    multilingual: bool = False,
    language_detection_threshold: float | None = 0.5,
    language_detection_segments: int = 1
) -> tuple[Iterator[Segment], TranscriptionInfo]:
    """
    Transcribe an audio file.

    Args:
        audio: Path to audio file, file-like object, or numpy array of audio data
        language: Language of the audio (ISO 639-1 code). If None, language is detected
        task: Task to perform ("transcribe" or "translate")
        log_progress: Whether to display progress information
        beam_size: Beam size for beam search decoding
        best_of: Number of candidates to generate using beam search
        patience: Beam search patience factor
        length_penalty: Length penalty for beam search
        repetition_penalty: Repetition penalty for beam search
        no_repeat_ngram_size: Prevent repetitions of n-grams
        temperature: Temperature(s) for sampling. A list is used as successive
            fallbacks when decoding fails the thresholds below
        compression_ratio_threshold: If compression ratio is above this value, treat as failed transcription
        log_prob_threshold: If average log probability is below this value, treat as failed transcription
        no_speech_threshold: If no-speech probability is above this value, treat as silence
        condition_on_previous_text: Whether to condition on previous transcribed text
        prompt_reset_on_temperature: Reset prompt when temperature is above this value
        initial_prompt: Optional initial prompt to condition transcription
        prefix: Optional text to use as a prefix for the first window of transcription
        suppress_blank: Whether to suppress blank outputs
        suppress_tokens: List of token IDs to suppress during generation;
            the sentinel -1 selects the model's default set of special tokens
        without_timestamps: If True, omit timestamp tokens and sample text only
            (note the inverted sense: True disables timestamps)
        max_initial_timestamp: Maximum initial timestamp
        word_timestamps: Whether to extract word-level timestamps
        prepend_punctuations: When word_timestamps is enabled, punctuation
            characters merged with the following word
        append_punctuations: When word_timestamps is enabled, punctuation
            characters merged with the preceding word
        vad_filter: Whether to use voice activity detection to filter audio
        vad_parameters: Parameters for voice activity detection
        max_new_tokens: Maximum number of tokens to generate per segment
        chunk_length: Length of audio chunks to process in seconds (default: 30s if not specified)
        clip_timestamps: Comma-separated "start,end,start,end,..." string (or
            list of floats) selecting the clip regions of the audio to process;
            the default "0" processes from the beginning to the end of the file
        hallucination_silence_threshold: Threshold for detecting hallucinations
        hotwords: String of hotwords to boost during transcription
        multilingual: If True, handle multilingual (code-switched) audio —
            presumably by detecting the language per segment; confirm against
            the upstream documentation
        language_detection_threshold: Threshold for language detection confidence
        language_detection_segments: Number of segments to use for language detection

    Returns:
        Tuple of (segments_iterator, transcription_info)
        - segments_iterator: Iterator of Segment objects containing transcribed text and metadata
        - transcription_info: TranscriptionInfo object with language, duration, and other metadata
    """

Language Detection

Detect the language of audio content with confidence scores for all supported languages.

def detect_language(
    self,
    audio: np.ndarray | None = None,
    features: np.ndarray | None = None,
    vad_filter: bool = False,
    vad_parameters: dict | VadOptions | None = None
) -> tuple[str, float]:
    """
    Detect the language of audio.

    Args:
        audio: Audio data as numpy array
        features: Pre-computed audio features, used as an alternative to
            ``audio`` (presumably exactly one of the two should be supplied —
            confirm against the implementation)
        vad_filter: Whether to use voice activity detection to drop silence
            before detection
        vad_parameters: Parameters for voice activity detection

    Returns:
        Tuple of (language_code, confidence_score)
        - language_code: ISO 639-1 language code
        - confidence_score: Confidence probability (0-1)
    """

Model Management

Functions for discovering and downloading available pre-trained models.

def available_models() -> list[str]:
    """
    Get list of available model names.

    Returns:
        List of model size strings (e.g. "tiny", "base", "large-v3") that can
        be passed as ``model_size_or_path`` to WhisperModel or as
        ``size_or_id`` to download_model
    """

def download_model(
    size_or_id: str,
    output_dir: str | None = None,
    local_files_only: bool = False,
    cache_dir: str | None = None,
    revision: str | None = None,
    use_auth_token: str | bool | None = None
) -> str:
    """
    Download a CTranslate2 Whisper model from Hugging Face Hub.

    Args:
        size_or_id: Model size (tiny, base, small, medium, large, etc.) or
                   full model ID from Hugging Face Hub
        output_dir: Directory where model should be saved; when omitted the
                   model presumably stays in the Hub cache (see cache_dir) —
                   confirm against the implementation
        local_files_only: If True, avoid downloading and use only cached files
        cache_dir: Path to cache directory for storing downloaded models
        revision: Git revision to download (branch, tag, or commit hash)
        use_auth_token: Hugging Face authentication token

    Returns:
        Path to downloaded model directory

    Raises:
        ValueError: If model size is invalid
    """

Usage Examples

Basic Transcription

from faster_whisper import WhisperModel

# Initialize model; int8 on CPU keeps memory and compute requirements low
model = WhisperModel("base", device="cpu", compute_type="int8")

# Transcribe with default settings.
# NOTE: `segments` is an iterator of Segment objects (see the transcribe
# signature above) — presumably decoding proceeds lazily as you iterate,
# so consume it fully to obtain the whole transcript.
segments, info = model.transcribe("audio.mp3")

for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

Advanced Transcription with Options

from faster_whisper import WhisperModel

# float16 on CUDA trades a little precision for GPU speed
model = WhisperModel("medium", device="cuda", compute_type="float16")

# Transcribe with custom options:
# - word_timestamps=True populates segment.words (used below)
# - vad_filter=True drops non-speech audio before decoding
# - the temperature list is tried in order as fallbacks on failed decodes
segments, info = model.transcribe(
    "audio.mp3",
    language="en",
    word_timestamps=True,
    beam_size=10,
    vad_filter=True,
    temperature=[0.0, 0.2, 0.4],
    initial_prompt="This is a technical presentation about machine learning."
)

# info carries language, confidence, and duration metadata
print(f"Language: {info.language} (confidence: {info.language_probability:.2f})")
print(f"Duration: {info.duration:.2f}s")

for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
    # segment.words is only populated when word_timestamps=True
    if segment.words:
        for word in segment.words:
            print(f"  {word.word} ({word.start:.2f}s-{word.end:.2f}s, p={word.probability:.2f})")

Language Detection

from faster_whisper import WhisperModel, decode_audio

model = WhisperModel("base")

# detect_language takes a numpy array, so decode the file first
audio = decode_audio("multilingual_audio.mp3")

# Detect language: returns an ISO 639-1 code and a 0-1 confidence score
language, confidence = model.detect_language(audio)
print(f"Detected language: {language} (confidence: {confidence:.2f})")

# Passing the detected language explicitly skips re-detection during transcription
segments, info = model.transcribe("multilingual_audio.mp3", language=language)

Install with Tessl CLI

npx tessl i tessl/pypi-faster-whisper

docs

audio-processing.md

batched-processing.md

core-speech-recognition.md

index.md

utilities.md

voice-activity-detection.md

tile.json