Faster Whisper transcription with CTranslate2 for high-performance speech recognition
—
High-throughput batch processing for multiple audio files or chunks. The BatchedInferencePipeline improves performance when transcribing large amounts of audio data.
Create a batched inference pipeline that wraps a WhisperModel for improved throughput processing.
class BatchedInferencePipeline:
    """Batched inference pipeline wrapping a WhisperModel for improved throughput.

    Processes multiple audio features in a single batch operation; the
    `transcribe` method (provided by the real implementation) is API-compatible
    with WhisperModel.transcribe().
    """

    def __init__(self, model):
        """
        Initialize batched inference pipeline.

        Args:
            model: WhisperModel instance to use for batched processing
        """

    def forward(
        self,
        features: np.ndarray,
        tokenizer,
        chunks_metadata: list[dict],
        options: TranscriptionOptions,
    ) -> list[list[dict]]:
        """
        Process batched features through the model in a single batch operation.

        Args:
            features: Batched audio features array
            tokenizer: Tokenizer instance for text processing
            chunks_metadata: List of metadata dictionaries for each chunk
            options: TranscriptionOptions for processing configuration

        Returns:
            List of segmented outputs for each input chunk
        """

    def generate_segment_batched(
        self,
        features: np.ndarray,
        tokenizer,
        options: TranscriptionOptions,
    ) -> tuple[np.ndarray, list[dict]]:
        """
        Generate transcription segments from batched audio features.

        Args:
            features: Batched audio features array
            tokenizer: Tokenizer instance for processing
            options: TranscriptionOptions configuration

        Returns:
            Tuple of (encoder_output, segment_outputs)
        """

from faster_whisper import WhisperModel, BatchedInferencePipeline, decode_audio
# Initialize model and batched pipeline
model = WhisperModel("base", device="cuda")
batched_model = BatchedInferencePipeline(model=model)

# Process single audio file with batched pipeline
segments, info = batched_model.transcribe("audio.mp3", vad_filter=False)
print(f"Language: {info.language}")
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

from faster_whisper import WhisperModel, BatchedInferencePipeline
import numpy as np

model = WhisperModel("medium", device="cuda", compute_type="float16")
batched_model = BatchedInferencePipeline(model=model)

audio_files = ["audio1.mp3", "audio2.wav", "audio3.mp4"]

# Process each file with the batched pipeline
for audio_file in audio_files:
    print(f"Processing {audio_file}...")
    segments, info = batched_model.transcribe(
        audio_file,
        word_timestamps=True,
        vad_filter=True,
    )
    print(f"  Language: {info.language} (confidence: {info.language_probability:.2f})")
    print(f"  Duration: {info.duration:.2f}s")
    for segment in segments:
        print(f"  [{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

from faster_whisper import WhisperModel, BatchedInferencePipeline, decode_audio
from faster_whisper.transcribe import TranscriptionOptions
import numpy as np
model = WhisperModel("base")
batched_model = BatchedInferencePipeline(model=model)
# Prepare audio data
audio_files = ["file1.wav", "file2.wav"]
audio_arrays = []
chunks_metadata = []
for i, file_path in enumerate(audio_files):
audio = decode_audio(file_path)
audio_arrays.append(audio)
chunks_metadata.append({
"file_id": i,
"offset": 0.0,
"duration": len(audio) / 16000.0 # assuming 16kHz sample rate
})
# Convert to batched features (simplified example)
# In practice, you would use the model's feature extractor
features = np.stack([model.feature_extractor(audio) for audio in audio_arrays])
# Configure transcription options
options = TranscriptionOptions(
beam_size=5,
word_timestamps=True,
without_timestamps=False,
temperatures=[0.0]
)
# Process batch
tokenizer = model.tokenizer
results = batched_model.forward(features, tokenizer, chunks_metadata, options)
# Process results
for i, (file_path, result) in enumerate(zip(audio_files, results)):
print(f"Results for {file_path}:")
for segment_data in result:
print(f" [{segment_data['start']:.2f}s -> {segment_data['end']:.2f}s] {segment_data['text']}")The BatchedInferencePipeline provides a transcribe method that maintains compatibility with the WhisperModel API while providing improved throughput for batch processing scenarios. The method signature and return format are identical to WhisperModel.transcribe().
Install with Tessl CLI
npx tessl i tessl/pypi-faster-whisper