Faster-Whisper: high-performance speech recognition built on CTranslate2.

This module covers audio decoding, format conversion, and preprocessing utilities for preparing audio data for transcription. These functions convert various audio formats into the numpy arrays required by the Whisper models.

decode_audio — decode audio files from various formats into numpy arrays suitable for speech-recognition processing.
def decode_audio(
input_file: str | BinaryIO,
sampling_rate: int = 16000,
split_stereo: bool = False,
) -> np.ndarray | tuple[np.ndarray, np.ndarray]:
"""
Decode audio from file or file-like object.
Uses PyAV library to decode audio with FFmpeg backend, supporting most audio formats
without requiring system FFmpeg installation.
Args:
input_file: Path to audio file or file-like object containing audio data
sampling_rate: Target sample rate for resampling (default: 16000 Hz)
split_stereo: If True, return separate left and right channels for stereo audio
Returns:
- If split_stereo=False: Single numpy array of shape (samples,) containing mono audio
- If split_stereo=True: Tuple of (left_channel, right_channel) numpy arrays
Notes:
- Output is always float32 normalized to [-1.0, 1.0] range
- Stereo audio is automatically converted to mono unless split_stereo=True
- Automatically handles resampling to target sampling rate
- Supports all formats supported by FFmpeg/PyAV
"""Utility function for padding or trimming arrays to specific lengths, commonly used for feature processing.
def pad_or_trim(
    array: np.ndarray,
    length: int = 3000,
    *,
    axis: int = -1,
) -> np.ndarray:
    """Pad or trim ``array`` to ``length`` along the given axis.

    Used internally for preparing mel-spectrogram features to the expected
    encoder input size (typically 3000 frames for 30-second audio chunks).

    Args:
        array: Input numpy array to pad or trim.
        length: Target length for the specified axis (default: 3000).
        axis: Axis along which to pad or trim (default: last axis).

    Returns:
        Array padded or trimmed to ``length`` along ``axis``.

    Notes:
        - If the array is longer than ``length``, it is trimmed from the end.
        - If the array is shorter than ``length``, it is zero-padded at the end.
        - Padding uses numpy's ``pad`` function with zeros.
    """
    if array.shape[axis] > length:
        # Keep only the first `length` entries along `axis` (trim the end).
        array = array.take(indices=range(length), axis=axis)

    if array.shape[axis] < length:
        # Zero-pad at the end of `axis` only; leave other axes untouched.
        pad_widths = [(0, 0)] * array.ndim
        pad_widths[axis] = (0, length - array.shape[axis])
        array = np.pad(array, pad_widths)

    return array
# Example: decode an audio file to a mono numpy array.
from faster_whisper import decode_audio

# Decode audio file to mono at the default 16 kHz sample rate.
audio = decode_audio("speech.mp3")
print(f"Audio shape: {audio.shape}")
print(f"Audio dtype: {audio.dtype}")
print(f"Duration: {len(audio) / 16000:.2f} seconds")

# Decode with a custom sample rate.
audio_8k = decode_audio("speech.mp3", sampling_rate=8000)
print(f"8kHz audio shape: {audio_8k.shape}")
# Example: decode stereo audio as separate left/right channels.
from faster_whisper import decode_audio

left_channel, right_channel = decode_audio("stereo_audio.wav", split_stereo=True)
print(f"Left channel shape: {left_channel.shape}")
print(f"Right channel shape: {right_channel.shape}")

# Process each channel separately or combine them.
combined = (left_channel + right_channel) / 2  # simple averaging back to mono
# Example: decode audio held in memory (e.g. downloaded from a URL).
import io

import requests

from faster_whisper import decode_audio

# Download the audio and wrap the raw bytes in a file-like object.
response = requests.get("https://example.com/audio.wav")
audio_bytes = io.BytesIO(response.content)

# decode_audio accepts file-like objects as well as file paths.
audio = decode_audio(audio_bytes, sampling_rate=16000)
print(f"Downloaded audio duration: {len(audio) / 16000:.2f}s")
# Example: transcribe long audio by splitting it into 30-second chunks.
from faster_whisper import WhisperModel, decode_audio
import numpy as np

model = WhisperModel("base")

# Decode the audio manually at the 16 kHz rate the model expects.
audio = decode_audio("long_audio.mp3", sampling_rate=16000)

# Split long audio into fixed-size chunks for processing.
chunk_duration = 30  # seconds
chunk_samples = chunk_duration * 16000
chunks = []
for start in range(0, len(audio), chunk_samples):
    chunk = audio[start:start + chunk_samples]
    if len(chunk) < chunk_samples:
        # Zero-pad the final chunk so every chunk has the same length.
        chunk = np.pad(chunk, (0, chunk_samples - len(chunk)))
    chunks.append(chunk)

# Transcribe each chunk and offset its timestamps by the chunk's position.
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)}")
    segments, info = model.transcribe(chunk)
    for segment in segments:
        # Segment timestamps are relative to the chunk; shift to file time.
        start_time = segment.start + (i * chunk_duration)
        end_time = segment.end + (i * chunk_duration)
        print(f"[{start_time:.2f}s -> {end_time:.2f}s] {segment.text}")
# Example: basic audio-quality validation before transcription.
from faster_whisper import decode_audio
import numpy as np


def validate_audio_quality(audio_path):
    """Decode ``audio_path``, print quality diagnostics, and return the audio.

    Prints duration, RMS level, and peak amplitude, plus heuristic warnings
    for inputs that are likely to transcribe poorly (too short, too quiet,
    or clipped).
    """
    audio = decode_audio(audio_path)

    # Basic quality metrics; decode_audio outputs 16 kHz mono by default.
    duration = len(audio) / 16000
    rms_level = np.sqrt(np.mean(audio**2))
    max_amplitude = np.max(np.abs(audio))

    print(f"Duration: {duration:.2f}s")
    print(f"RMS level: {rms_level:.4f}")
    print(f"Max amplitude: {max_amplitude:.4f}")

    # Heuristic warning thresholds.
    if duration < 1.0:
        print("WARNING: Audio is very short (< 1s)")
    if rms_level < 0.01:
        print("WARNING: Audio level is very low")
    if max_amplitude > 0.95:
        print("WARNING: Audio may be clipped")
    return audio


# Validate before transcription.
audio = validate_audio_quality("input.wav")
# decode_audio supports all input formats handled by FFmpeg/PyAV.
Install with the Tessl CLI:
    npx tessl i tessl/pypi-faster-whisper