tessl/pypi-faster-whisper

Faster Whisper transcription with CTranslate2 for high-performance speech recognition

Voice Activity Detection

Voice activity detection using the Silero VAD model for automatic silence detection and audio segmentation. VAD helps improve transcription accuracy by filtering out silence so that processing focuses on speech segments.

Capabilities

VAD Configuration

Configure voice activity detection parameters for different audio scenarios and quality requirements.

@dataclass
class VadOptions:
    """
    Voice Activity Detection options for Silero VAD.
    
    Attributes:
        threshold: Speech threshold (0-1). Probabilities above this are considered speech.
                  Higher values are more conservative. Default: 0.5
        neg_threshold: Silence threshold for speech end detection. If None, uses threshold.
                      Values below this are always silence. Values above are speech only if
                      previous sample was speech. Default: None
        min_speech_duration_ms: Minimum speech segment duration in milliseconds.
                               Shorter segments are discarded. Default: 0
        max_speech_duration_s: Maximum speech segment duration in seconds.
                              Longer segments are split at silence gaps > 100ms or
                              aggressively if no suitable split point. Default: inf
        min_silence_duration_ms: Minimum silence duration before ending speech segment.
                                Must be silent this long to end segment. Default: 2000
        speech_pad_ms: Padding added to both ends of speech segments in milliseconds.
                      Helps avoid cutting off speech edges. Default: 400
    """
    threshold: float = 0.5
    neg_threshold: float | None = None
    min_speech_duration_ms: int = 0
    max_speech_duration_s: float = float("inf")
    min_silence_duration_ms: int = 2000
    speech_pad_ms: int = 400

Speech Timestamp Detection

Extract speech timestamps from audio using Silero VAD model for automatic speech segmentation.

def get_speech_timestamps(
    audio: np.ndarray,
    vad_options: VadOptions | None = None,
    sampling_rate: int = 16000,
    **kwargs
) -> list[dict]:
    """
    Get speech timestamps using Silero VAD.
    
    Args:
        audio: Audio data as numpy array (mono, float32)
        vad_options: VAD configuration options. If None, uses defaults
        sampling_rate: Audio sample rate in Hz
        **kwargs: Additional arguments passed to Silero VAD
        
    Returns:
        List of dictionaries with speech segments:
        [
            {"start": start_sample, "end": end_sample},
            {"start": start_sample, "end": end_sample},
            ...
        ]
        
    Notes:
        - Timestamps are in sample indices, not seconds
        - Convert to seconds by dividing by sampling_rate
        - Empty list returned if no speech detected
    """

Speech Chunk Collection

Collect and process audio chunks based on detected speech timestamps.

def collect_chunks(
    audio: np.ndarray,
    chunks: list[dict],
    sampling_rate: int = 16000,
    max_duration: float = float("inf")
) -> tuple[list[np.ndarray], list[dict[str, float]]]:
    """
    Collect and merge audio chunks based on speech timestamps.
    
    Args:
        audio: Original audio array
        chunks: List of timestamp dictionaries from get_speech_timestamps
        sampling_rate: Audio sampling rate in Hz (default: 16000)
        max_duration: Maximum duration in seconds for merged chunks (default: inf)
        
    Returns:
        Tuple of (audio_chunks, chunks_metadata)
        - audio_chunks: List of audio chunk arrays corresponding to speech segments
        - chunks_metadata: List of metadata dictionaries with offset, duration, and segments info
        
    Notes:
        - Merges speech chunks that would exceed max_duration
        - Returns empty chunk if no speech timestamps provided
        - Metadata includes timing information for each merged chunk
    """

Usage Examples

Basic VAD Usage

from faster_whisper import decode_audio
from faster_whisper.vad import get_speech_timestamps, VadOptions

# Decode audio
audio = decode_audio("interview.mp3", sampling_rate=16000)

# Get speech timestamps with default settings
speech_timestamps = get_speech_timestamps(audio)

# Convert to seconds and display
for i, segment in enumerate(speech_timestamps):
    start_sec = segment["start"] / 16000
    end_sec = segment["end"] / 16000
    duration = end_sec - start_sec
    print(f"Speech segment {i+1}: {start_sec:.2f}s - {end_sec:.2f}s ({duration:.2f}s)")

Custom VAD Configuration

from faster_whisper import decode_audio
from faster_whisper.vad import get_speech_timestamps, VadOptions

audio = decode_audio("noisy_audio.wav")

# Configure VAD for noisy environment
vad_options = VadOptions(
    threshold=0.6,  # Higher threshold for noisy audio
    min_speech_duration_ms=500,  # Ignore very short speech
    min_silence_duration_ms=1000,  # End segments after shorter silences (default: 2000)
    speech_pad_ms=200  # Less padding for tight segments
)

speech_timestamps = get_speech_timestamps(audio, vad_options=vad_options)

print(f"Found {len(speech_timestamps)} speech segments")
for segment in speech_timestamps:
    start_sec = segment["start"] / 16000
    end_sec = segment["end"] / 16000
    print(f"  {start_sec:.2f}s - {end_sec:.2f}s")

VAD with Transcription

from faster_whisper import WhisperModel, decode_audio
from faster_whisper.vad import VadOptions

model = WhisperModel("base")

# Use VAD filtering during transcription
vad_options = VadOptions(
    threshold=0.5,
    min_speech_duration_ms=1000,
    max_speech_duration_s=30
)

segments, info = model.transcribe(
    "lecture.mp3",
    vad_filter=True,
    vad_parameters=vad_options,
    word_timestamps=True
)

print(f"Duration before VAD: {info.duration:.2f}s")
print(f"Duration after VAD: {info.duration_after_vad:.2f}s")
print(f"VAD filtered out {info.duration - info.duration_after_vad:.2f}s of silence")

for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

Processing Long Audio with VAD

from faster_whisper import WhisperModel, decode_audio
from faster_whisper.vad import get_speech_timestamps, collect_chunks, VadOptions

# Process very long audio file efficiently
audio = decode_audio("long_podcast.mp3")
print(f"Total audio duration: {len(audio) / 16000 / 60:.1f} minutes")

# Configure VAD for podcast content
vad_options = VadOptions(
    threshold=0.4,  # Lower threshold for clear speech
    min_speech_duration_ms=2000,  # Ignore short utterances
    max_speech_duration_s=60,  # Split very long segments
    min_silence_duration_ms=3000,  # Allow longer pauses
    speech_pad_ms=500  # More padding for natural speech
)

# Get speech segments
speech_timestamps = get_speech_timestamps(audio, vad_options=vad_options)
speech_chunks, chunks_metadata = collect_chunks(audio, speech_timestamps)

print(f"Found {len(speech_chunks)} speech segments")

# Transcribe only speech chunks
model = WhisperModel("medium")
all_segments = []

for i, (chunk, chunk_metadata) in enumerate(zip(speech_chunks, chunks_metadata)):
    print(f"Processing speech chunk {i+1}/{len(speech_chunks)}")
    
    # Transcribe chunk
    segments, info = model.transcribe(chunk)
    
    # Adjust timestamps to global timeline
    chunk_start_sec = chunk_metadata["offset"]
    
    for segment in segments:
        # Segment objects are immutable NamedTuples, so append shifted
        # copies instead of assigning to their attributes in place
        all_segments.append(
            segment._replace(
                start=segment.start + chunk_start_sec,
                end=segment.end + chunk_start_sec,
            )
        )

# Display results
for segment in all_segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

VAD Quality Analysis

from faster_whisper import decode_audio
from faster_whisper.vad import get_speech_timestamps, VadOptions
import numpy as np

def analyze_vad_quality(audio_path, vad_options=None):
    """Analyze VAD performance on audio file."""
    audio = decode_audio(audio_path)
    total_duration = len(audio) / 16000
    
    speech_timestamps = get_speech_timestamps(audio, vad_options=vad_options)
    
    if not speech_timestamps:
        print("No speech detected!")
        return
    
    # Calculate statistics
    speech_samples = sum(seg["end"] - seg["start"] for seg in speech_timestamps)
    speech_duration = speech_samples / 16000
    silence_duration = total_duration - speech_duration
    
    segment_durations = [(seg["end"] - seg["start"]) / 16000 for seg in speech_timestamps]
    avg_segment_duration = np.mean(segment_durations)
    
    print(f"Audio Analysis for {audio_path}:")
    print(f"  Total duration: {total_duration:.2f}s")
    print(f"  Speech duration: {speech_duration:.2f}s ({speech_duration/total_duration*100:.1f}%)")
    print(f"  Silence duration: {silence_duration:.2f}s ({silence_duration/total_duration*100:.1f}%)")
    print(f"  Number of segments: {len(speech_timestamps)}")
    print(f"  Average segment duration: {avg_segment_duration:.2f}s")
    print(f"  Shortest segment: {min(segment_durations):.2f}s")
    print(f"  Longest segment: {max(segment_durations):.2f}s")

# Test different VAD configurations
analyze_vad_quality("meeting.wav")

# More aggressive VAD
strict_options = VadOptions(threshold=0.7, min_speech_duration_ms=1500)
analyze_vad_quality("meeting.wav", strict_options)

VAD Parameter Tuning Guidelines

Threshold Selection

  • 0.3-0.4: Sensitive, good for quiet/distant speech
  • 0.5: Balanced, good for most scenarios (default)
  • 0.6-0.7: Conservative, good for noisy environments
  • 0.8+: Very conservative, may miss quiet speech

Duration Parameters

  • min_speech_duration_ms: Filter out mouth sounds and very short utterances
  • max_speech_duration_s: Prevent excessively long segments that hurt transcription
  • min_silence_duration_ms: Control sensitivity to brief pauses in speech
  • speech_pad_ms: Ensure speech edges aren't cut off

Use Cases

  • Interviews/Meetings: Lower threshold (0.4), longer min_speech_duration_ms
  • Phone Calls: Higher threshold (0.6), more padding
  • Lectures: Lower threshold, longer max_speech_duration_s
  • Noisy Environments: Higher threshold, more filtering
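The use-case guidelines above can be captured as a small set of preset configurations. This is an illustrative sketch: the preset names and exact values are suggestions derived from the guidelines, not part of the faster-whisper API.

```python
# Illustrative VAD presets based on the tuning guidelines above.
# Each entry is a plain keyword-argument dict that can be unpacked
# into VadOptions(**preset); names and values are suggestions only.
VAD_PRESETS = {
    "meeting": {
        "threshold": 0.4,                # sensitive: pick up quiet/distant speakers
        "min_speech_duration_ms": 1000,  # drop brief interjections
    },
    "phone_call": {
        "threshold": 0.6,                # conservative: tolerate line noise
        "speech_pad_ms": 600,            # extra padding around clipped speech
    },
    "lecture": {
        "threshold": 0.4,
        "max_speech_duration_s": 120,    # allow long uninterrupted passages
    },
    "noisy": {
        "threshold": 0.7,                # very conservative
        "min_speech_duration_ms": 1500,  # aggressive filtering of short bursts
    },
}

# Usage (assuming faster_whisper is installed):
#   from faster_whisper.vad import VadOptions
#   options = VadOptions(**VAD_PRESETS["meeting"])
```

Unpacking a dict keeps the presets decoupled from the VadOptions class, so unspecified fields fall back to the library defaults.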

Install with Tessl CLI

npx tessl i tessl/pypi-faster-whisper
