Faster Whisper transcription with CTranslate2 for high-performance speech recognition
---
Voice activity detection functionality using Silero VAD for automatic silence detection and audio segmentation. VAD helps improve transcription accuracy by filtering out silence and focusing processing on speech segments.
Configure voice activity detection parameters for different audio scenarios and quality requirements.
@dataclass
class VadOptions:
"""
Voice Activity Detection options for Silero VAD.
Attributes:
threshold: Speech threshold (0-1). Probabilities above this are considered speech.
Higher values are more conservative. Default: 0.5
neg_threshold: Silence threshold for speech end detection. If None, uses threshold.
Values below this are always silence. Values above are speech only if
previous sample was speech. Default: None
min_speech_duration_ms: Minimum speech segment duration in milliseconds.
Shorter segments are discarded. Default: 0
max_speech_duration_s: Maximum speech segment duration in seconds.
Longer segments are split at silence gaps > 100ms or
aggressively if no suitable split point. Default: inf
min_silence_duration_ms: Minimum silence duration before ending speech segment.
Must be silent this long to end segment. Default: 2000
speech_pad_ms: Padding added to both ends of speech segments in milliseconds.
Helps avoid cutting off speech edges. Default: 400
"""
threshold: float = 0.5
neg_threshold: float | None = None
min_speech_duration_ms: int = 0
max_speech_duration_s: float = float("inf")
min_silence_duration_ms: int = 2000
speech_pad_ms: int = 400Extract speech timestamps from audio using Silero VAD model for automatic speech segmentation.
def get_speech_timestamps(
    audio: np.ndarray,
    vad_options: VadOptions | None = None,
    sampling_rate: int = 16000,
    **kwargs,
) -> list[dict]:
    """Get speech timestamps using Silero VAD.

    Args:
        audio: Audio data as a numpy array (mono, float32).
        vad_options: VAD configuration options. If None, defaults are used.
        sampling_rate: Audio sample rate in Hz.
        **kwargs: Additional arguments passed to Silero VAD.

    Returns:
        List of dictionaries describing speech segments::

            [
                {"start": start_sample, "end": end_sample},
                {"start": start_sample, "end": end_sample},
                ...
            ]

    Notes:
        - Timestamps are sample indices, not seconds.
        - Convert to seconds by dividing by ``sampling_rate``.
        - An empty list is returned if no speech is detected.
    """


# Collect and process audio chunks based on detected speech timestamps.
def collect_chunks(
    audio: np.ndarray,
    chunks: list[dict],
    sampling_rate: int = 16000,
    max_duration: float = float("inf"),
) -> tuple[list[np.ndarray], list[dict[str, float]]]:
    """Collect and merge audio chunks based on speech timestamps.

    Args:
        audio: Original audio array.
        chunks: List of timestamp dictionaries from ``get_speech_timestamps``.
        sampling_rate: Audio sampling rate in Hz (default: 16000).
        max_duration: Maximum duration in seconds for merged chunks
            (default: inf).

    Returns:
        Tuple of ``(audio_chunks, chunks_metadata)``:
        - audio_chunks: List of audio arrays corresponding to speech segments.
        - chunks_metadata: List of metadata dictionaries with offset, duration,
          and segments info.

    Notes:
        - Merges speech chunks that would exceed ``max_duration``.
        - Returns an empty chunk if no speech timestamps are provided.
        - Metadata includes timing information for each merged chunk.
    """


from faster_whisper import decode_audio
from faster_whisper.vad import get_speech_timestamps, VadOptions

# Decode audio at the 16 kHz rate expected by Silero VAD.
audio = decode_audio("interview.mp3", sampling_rate=16000)

# Get speech timestamps with default VAD settings.
speech_timestamps = get_speech_timestamps(audio)

# Timestamps are sample indices; divide by the sample rate to get seconds.
for i, segment in enumerate(speech_timestamps):
    start_sec = segment["start"] / 16000
    end_sec = segment["end"] / 16000
    duration = end_sec - start_sec
    print(f"Speech segment {i+1}: {start_sec:.2f}s - {end_sec:.2f}s ({duration:.2f}s)")

from faster_whisper import decode_audio
from faster_whisper.vad import get_speech_timestamps, VadOptions

# decode_audio resamples to 16 kHz by default, matching the /16000 math below.
audio = decode_audio("noisy_audio.wav")

# Configure VAD for a noisy environment.
vad_options = VadOptions(
    threshold=0.6,  # Higher threshold for noisy audio
    min_speech_duration_ms=500,  # Ignore very short speech
    min_silence_duration_ms=1000,  # Shorter silence gaps
    speech_pad_ms=200,  # Less padding for tight segments
)

speech_timestamps = get_speech_timestamps(audio, vad_options=vad_options)

print(f"Found {len(speech_timestamps)} speech segments")
for segment in speech_timestamps:
    start_sec = segment["start"] / 16000
    end_sec = segment["end"] / 16000
    print(f" {start_sec:.2f}s - {end_sec:.2f}s")

from faster_whisper import WhisperModel, decode_audio
from faster_whisper.vad import VadOptions

model = WhisperModel("base")

# Use VAD filtering during transcription to skip silent regions.
vad_options = VadOptions(
    threshold=0.5,
    min_speech_duration_ms=1000,
    max_speech_duration_s=30,
)

segments, info = model.transcribe(
    "lecture.mp3",
    vad_filter=True,
    vad_parameters=vad_options,
    word_timestamps=True,
)

# info exposes both the original duration and the duration after VAD trimming.
print(f"Duration before VAD: {info.duration:.2f}s")
print(f"Duration after VAD: {info.duration_after_vad:.2f}s")
print(f"VAD filtered out {info.duration - info.duration_after_vad:.2f}s of silence")

for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

from faster_whisper import WhisperModel, decode_audio
from faster_whisper.vad import get_speech_timestamps, collect_chunks, VadOptions
import numpy as np

# Process a very long audio file efficiently.
audio = decode_audio("long_podcast.mp3")
print(f"Total audio duration: {len(audio) / 16000 / 60:.1f} minutes")

# Configure VAD for podcast content.
vad_options = VadOptions(
    threshold=0.4,  # Lower threshold for clear speech
    min_speech_duration_ms=2000,  # Ignore short utterances
    max_speech_duration_s=60,  # Split very long segments
    min_silence_duration_ms=3000,  # Allow longer pauses
    speech_pad_ms=500,  # More padding for natural speech
)

# Get speech segments and the corresponding audio chunks.
speech_timestamps = get_speech_timestamps(audio, vad_options=vad_options)
speech_chunks, chunks_metadata = collect_chunks(audio, speech_timestamps)
print(f"Found {len(speech_chunks)} speech segments")

# Transcribe only the speech chunks.
model = WhisperModel("medium")
all_segments = []
for i, (chunk, chunk_metadata) in enumerate(zip(speech_chunks, chunks_metadata)):
    print(f"Processing speech chunk {i+1}/{len(speech_chunks)}")
    segments, info = model.transcribe(chunk)
    # Shift chunk-relative timestamps onto the global timeline.
    # NOTE(review): Segment is an immutable NamedTuple in faster-whisper, so
    # assigning to segment.start/segment.end in place (as the original example
    # did) would raise AttributeError; build adjusted records instead.
    chunk_start_sec = chunk_metadata["offset"]
    for segment in segments:
        all_segments.append(
            (segment.start + chunk_start_sec, segment.end + chunk_start_sec, segment.text)
        )

# Display results on the global timeline.
for start, end, text in all_segments:
    print(f"[{start:.2f}s -> {end:.2f}s] {text}")

from faster_whisper import decode_audio
from faster_whisper.vad import get_speech_timestamps, VadOptions
import numpy as np


def analyze_vad_quality(audio_path, vad_options=None):
    """Analyze VAD performance on an audio file and print summary statistics.

    Args:
        audio_path: Path to the audio file to analyze.
        vad_options: Optional ``VadOptions``; None uses the VAD defaults.
    """
    # decode_audio resamples to 16 kHz by default, which the /16000 math assumes.
    audio = decode_audio(audio_path)
    total_duration = len(audio) / 16000

    speech_timestamps = get_speech_timestamps(audio, vad_options=vad_options)
    if not speech_timestamps:
        print("No speech detected!")
        return

    # Aggregate statistics over the detected segments.
    speech_samples = sum(seg["end"] - seg["start"] for seg in speech_timestamps)
    speech_duration = speech_samples / 16000
    silence_duration = total_duration - speech_duration
    segment_durations = [(seg["end"] - seg["start"]) / 16000 for seg in speech_timestamps]
    avg_segment_duration = np.mean(segment_durations)

    print(f"Audio Analysis for {audio_path}:")
    print(f" Total duration: {total_duration:.2f}s")
    print(f" Speech duration: {speech_duration:.2f}s ({speech_duration/total_duration*100:.1f}%)")
    print(f" Silence duration: {silence_duration:.2f}s ({silence_duration/total_duration*100:.1f}%)")
    print(f" Number of segments: {len(speech_timestamps)}")
    print(f" Average segment duration: {avg_segment_duration:.2f}s")
    print(f" Shortest segment: {min(segment_durations):.2f}s")
    print(f" Longest segment: {max(segment_durations):.2f}s")


# Test different VAD configurations.
analyze_vad_quality("meeting.wav")

# More aggressive VAD.
strict_options = VadOptions(threshold=0.7, min_speech_duration_ms=1500)
analyze_vad_quality("meeting.wav", strict_options)

# Install with Tessl CLI
npx tessl i tessl/pypi-faster-whisper