`tessl install tessl/pypi-pipecat-ai@0.0.0`

An open source framework for building real-time voice and multimodal conversational AI agents with support for speech-to-text, text-to-speech, LLMs, and multiple transport protocols.
Voice Activity Detection (VAD) identifies when speech is present in audio streams. VAD is essential for turn detection, interruption handling, and efficient audio processing in conversational AI.
```python { .api }
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams


class VADAnalyzer:
    """Base class for VAD analyzers.

    Detects voice activity in audio streams and emits events
    when speech starts/stops.

    Methods:
        analyze(audio): Analyze audio for voice activity
        reset(): Reset VAD state

    Example:
        vad = SomeVADAnalyzer(params=VADParams())
        is_speech = await vad.analyze(audio_bytes)
    """

    def __init__(self, params: VADParams):
        """Initialize VAD analyzer.

        Args:
            params: VAD configuration
        """
        self.params = params
        self._state = None

    async def analyze(self, audio: bytes) -> bool:
        """Analyze audio for voice activity.

        Args:
            audio: Audio bytes to analyze

        Returns:
            True if speech detected, False otherwise
        """
        # Implementation is provider-specific
        raise NotImplementedError("Subclasses must implement analyze()")

    def reset(self):
        """Reset VAD state."""
        self._state = None


class VADParams:
    """VAD configuration parameters.

    Attributes:
        threshold (float): Speech detection threshold (0.0-1.0). Default: 0.5
        min_speech_duration_ms (int): Minimum speech duration to trigger. Default: 100
        min_silence_duration_ms (int): Minimum silence to end speech. Default: 500
        prefix_padding_ms (int): Audio to include before speech. Default: 100
        sample_rate (int): Expected sample rate in Hz. Default: 16000

    Example:
        params = VADParams(
            threshold=0.5,
            min_speech_duration_ms=250,
            min_silence_duration_ms=500,
            sample_rate=16000
        )
    """

    def __init__(
        self,
        threshold: float = 0.5,
        min_speech_duration_ms: int = 100,
        min_silence_duration_ms: int = 500,
        prefix_padding_ms: int = 100,
        sample_rate: int = 16000
    ):
        """Initialize VAD parameters.

        Args:
            threshold: Speech detection threshold (0.0-1.0)
            min_speech_duration_ms: Minimum speech duration in milliseconds
            min_silence_duration_ms: Minimum silence duration in milliseconds
            prefix_padding_ms: Prefix padding in milliseconds
            sample_rate: Audio sample rate in Hz
        """
        self.threshold = threshold
        self.min_speech_duration_ms = min_speech_duration_ms
        self.min_silence_duration_ms = min_silence_duration_ms
        self.prefix_padding_ms = prefix_padding_ms
        self.sample_rate = sample_rate
```
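The base class leaves `analyze()` to subclasses. As a rough illustration of that contract, a minimal sketch of a custom analyzer is shown below; `EnergyVADAnalyzer` and its RMS heuristic are hypothetical and not shipped with the package.

```python
import array

from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams


class EnergyVADAnalyzer(VADAnalyzer):
    """Toy analyzer that treats high RMS energy as speech (illustration only)."""

    def __init__(self, params: VADParams, rms_threshold: float = 500.0):
        super().__init__(params)
        self._rms_threshold = rms_threshold  # tune for your input level

    async def analyze(self, audio: bytes) -> bool:
        # 16-bit signed mono PCM; assumes an even number of bytes
        samples = array.array("h", audio)
        if not samples:
            return False
        rms = (sum(s * s for s in samples) / len(samples)) ** 0.5
        return rms > self._rms_threshold
```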
```python { .api }
from typing import Optional

from pipecat.audio.vad.vad_analyzer import SileroVADAnalyzer, VADAnalyzer, VADParams


class SileroVADAnalyzer(VADAnalyzer):
    """Silero VAD analyzer.

    High-quality VAD using the Silero VAD model. Accurate and efficient
    for real-time speech detection.

    Args:
        params: VAD parameters

    Example:
        vad = SileroVADAnalyzer(
            params=VADParams(
                threshold=0.5,
                min_speech_duration_ms=250,
                min_silence_duration_ms=500
            )
        )

        # Use with transport
        transport = DailyTransport(
            params=DailyParams(
                vad_enabled=True,
                vad_analyzer=vad
            )
        )
    """

    def __init__(self, params: Optional[VADParams] = None):
        """Initialize Silero VAD analyzer.

        Args:
            params: Optional VAD parameters. If None, uses defaults.
        """
        super().__init__(params or VADParams())
        self._model = None  # Silero model loaded on first use
```
```python { .api }
from typing import Optional

from pipecat.audio.vad.aic_vad_analyzer import AICVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams


class AICVADAnalyzer(VADAnalyzer):
    """AIC VAD analyzer.

    VAD using AIC's voice activity detection.

    Args:
        params: VAD parameters

    Example:
        vad = AICVADAnalyzer(
            params=VADParams(threshold=0.6)
        )
    """

    def __init__(self, params: Optional[VADParams] = None):
        """Initialize AIC VAD analyzer.

        Args:
            params: Optional VAD parameters. If None, uses defaults.
        """
        super().__init__(params or VADParams())
        self._detector = None  # AIC detector initialized on first use
```
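Both analyzers accept the same optional `VADParams`, so the choice of backend can sit behind a small factory. A sketch, assuming the import paths shown above; `build_vad_analyzer` and its `backend` argument are illustrative, not part of the package.

```python
from typing import Optional

from pipecat.audio.vad.aic_vad_analyzer import AICVADAnalyzer
from pipecat.audio.vad.vad_analyzer import SileroVADAnalyzer, VADAnalyzer, VADParams


def build_vad_analyzer(backend: str = "silero", params: Optional[VADParams] = None) -> VADAnalyzer:
    """Hypothetical helper: pick a VAD backend by name."""
    if backend == "silero":
        return SileroVADAnalyzer(params=params or VADParams())
    if backend == "aic":
        return AICVADAnalyzer(params=params or VADParams())
    raise ValueError(f"Unknown VAD backend: {backend}")


vad = build_vad_analyzer("silero", VADParams(threshold=0.6))
```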
```python { .api }
from pipecat.audio.vad.vad_analyzer import SileroVADAnalyzer, VADParams
from pipecat.transports.daily import DailyTransport, DailyParams

# Configure VAD
vad = SileroVADAnalyzer(
    params=VADParams(
        threshold=0.5,               # Sensitivity (0.0-1.0)
        min_speech_duration_ms=250,  # Min speech to trigger
        min_silence_duration_ms=500  # Min silence to end
    )
)

# Use with transport
transport = DailyTransport(
    room_url="...",
    params=DailyParams(
        audio_in_enabled=True,
        vad_enabled=True,
        vad_analyzer=vad
    )
)

# VAD automatically emits:
# - VADUserStartedSpeakingFrame when speech detected
# - VADUserStoppedSpeakingFrame when silence detected
```
```python { .api }
# Sensitive VAD (catches more speech)
sensitive_vad = SileroVADAnalyzer(
    params=VADParams(
        threshold=0.3,               # Lower threshold
        min_speech_duration_ms=100,  # Shorter minimum
        min_silence_duration_ms=300  # Shorter silence
    )
)

# Conservative VAD (fewer false positives)
conservative_vad = SileroVADAnalyzer(
    params=VADParams(
        threshold=0.7,                # Higher threshold
        min_speech_duration_ms=500,   # Longer minimum
        min_silence_duration_ms=1000  # Longer silence
    )
)
```
```python { .api }
from pipecat.audio.vad.vad_analyzer import SileroVADAnalyzer

vad = SileroVADAnalyzer()

# Analyze audio chunks
audio_chunk = b'...'  # 16kHz PCM audio
is_speech = await vad.analyze(audio_chunk)

if is_speech:
    print("Speech detected!")
else:
    print("Silence")

# Reset VAD state
vad.reset()
```
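For longer audio you would typically call `analyze()` once per chunk and watch for transitions between speech and silence. A minimal sketch, assuming 20 ms chunks of 16 kHz, 16-bit mono PCM and the `analyze()`/`reset()` API shown above; `iter_chunks` and the chunk size are illustrative choices.

```python
import asyncio

from pipecat.audio.vad.vad_analyzer import SileroVADAnalyzer, VADParams

CHUNK_BYTES = 640  # 20 ms of 16 kHz, 16-bit mono PCM


def iter_chunks(pcm: bytes, size: int = CHUNK_BYTES):
    """Yield fixed-size chunks from a PCM buffer (illustrative helper)."""
    for i in range(0, len(pcm) - size + 1, size):
        yield pcm[i:i + size]


async def scan(pcm: bytes) -> None:
    vad = SileroVADAnalyzer(params=VADParams(sample_rate=16000))
    speaking = False
    for chunk in iter_chunks(pcm):
        is_speech = await vad.analyze(chunk)
        if is_speech and not speaking:
            print("speech started")
        elif not is_speech and speaking:
            print("speech stopped")
        speaking = is_speech
    vad.reset()  # clear state before reusing the analyzer


# asyncio.run(scan(pcm_bytes_from_somewhere))
```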
```python { .api }
from pipecat.processors.frame_processor import FrameProcessor
from pipecat.frames.frames import VADUserStartedSpeakingFrame, VADUserStoppedSpeakingFrame


class VADMonitor(FrameProcessor):
    """Monitor VAD events."""

    async def process_frame(self, frame, direction):
        if isinstance(frame, VADUserStartedSpeakingFrame):
            print("User started speaking (VAD)")
        elif isinstance(frame, VADUserStoppedSpeakingFrame):
            print("User stopped speaking (VAD)")
        await self.push_frame(frame, direction)
```
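To see those log lines, the monitor has to sit in the frame path between the transport input and the rest of the pipeline. A sketch of that wiring, assuming a `Pipeline` class at `pipecat.pipeline.pipeline`; `transport` is the VAD-enabled transport from the earlier block, and `stt`, `llm`, and `tts` stand in for services configured elsewhere.

```python
from pipecat.pipeline.pipeline import Pipeline  # assumed import path

# `transport`, `stt`, `llm`, and `tts` are assumed to be configured elsewhere.
pipeline = Pipeline([
    transport.input(),   # emits VADUserStartedSpeakingFrame / VADUserStoppedSpeakingFrame
    VADMonitor(),        # logs the VAD events, then passes frames through
    stt,
    llm,
    tts,
    transport.output(),
])
```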
```python { .api }
# Good: Quiet environment - sensitive VAD
vad = SileroVADAnalyzer(
    params=VADParams(
        threshold=0.4,
        min_speech_duration_ms=200
    )
)

# Good: Noisy environment - conservative VAD
vad = SileroVADAnalyzer(
    params=VADParams(
        threshold=0.7,
        min_speech_duration_ms=400
    )
)

# Bad: One-size-fits-all
vad = SileroVADAnalyzer()  # May not work well in all environments
```
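If the deployment environment is known at startup, this tuning can be captured in one place. A small sketch; `params_for_environment` and the specific numbers are illustrative choices drawn from the examples above, not package defaults.

```python
from pipecat.audio.vad.vad_analyzer import SileroVADAnalyzer, VADParams


def params_for_environment(environment: str) -> VADParams:
    """Hypothetical helper mapping an environment label to VAD tuning."""
    if environment == "quiet":
        return VADParams(threshold=0.4, min_speech_duration_ms=200)
    if environment == "noisy":
        return VADParams(threshold=0.7, min_speech_duration_ms=400)
    return VADParams()  # fall back to defaults


vad = SileroVADAnalyzer(params=params_for_environment("noisy"))
```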
```python { .api }
# Good: Include audio before speech
vad = SileroVADAnalyzer(
    params=VADParams(
        prefix_padding_ms=300  # Include 300ms before speech
    )
)
# Captures beginning of speech, better transcription

# Bad: No padding
vad = SileroVADAnalyzer(
    params=VADParams(prefix_padding_ms=0)
)
# May cut off first word
```
```python { .api }
# Good: VAD sample rate matches audio
transport = DailyTransport(
    params=DailyParams(audio_in_sample_rate=16000)
)
vad = SileroVADAnalyzer(
    params=VADParams(sample_rate=16000)  # Matches transport
)

# Bad: Mismatched sample rates
vad = SileroVADAnalyzer(
    params=VADParams(sample_rate=8000)  # Mismatch!
)
# May not work correctly
```
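If an audio source delivers a different rate than the analyzer expects, resample before calling `analyze()` rather than tolerating the mismatch. A sketch using the standard-library `audioop` module (deprecated since Python 3.11, removed in 3.13); the 8 kHz source rate is just an example.

```python
import audioop  # stdlib; removed in Python 3.13 -- swap in another resampler there

from pipecat.audio.vad.vad_analyzer import SileroVADAnalyzer, VADParams


def resample_pcm16(audio: bytes, src_rate: int, dst_rate: int) -> bytes:
    """Resample mono 16-bit PCM to the rate the VAD expects."""
    converted, _ = audioop.ratecv(audio, 2, 1, src_rate, dst_rate, None)
    return converted


vad = SileroVADAnalyzer(params=VADParams(sample_rate=16000))

# e.g. an 8 kHz chunk from a telephony source:
# is_speech = await vad.analyze(resample_pcm16(chunk_8k, 8000, 16000))
```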
```python { .api }
# Enable VAD and interruptions together
transport = DailyTransport(
    params=DailyParams(
        vad_enabled=True,
        vad_analyzer=vad
    )
)

task = PipelineTask(
    pipeline,
    params=PipelineParams(
        allow_interruptions=True  # Allow VAD to trigger interruptions
    )
)

# VAD detects speech -> Interrupts bot -> Natural conversation
```
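Putting the pieces together: the transport applies VAD, the task allows interruptions, and a runner drives the pipeline. A rough end-to-end sketch; the `Pipeline`, `PipelineRunner`, and `PipelineTask` import paths are assumptions to check against your installed version, `VADMonitor` is the processor defined above, and the room URL is a placeholder.

```python
import asyncio

from pipecat.audio.vad.vad_analyzer import SileroVADAnalyzer, VADParams
from pipecat.transports.daily import DailyTransport, DailyParams
from pipecat.pipeline.pipeline import Pipeline                   # assumed path
from pipecat.pipeline.runner import PipelineRunner               # assumed path
from pipecat.pipeline.task import PipelineTask, PipelineParams   # assumed path


async def main():
    vad = SileroVADAnalyzer(params=VADParams(threshold=0.5))

    transport = DailyTransport(
        room_url="...",  # placeholder
        params=DailyParams(
            audio_in_enabled=True,
            vad_enabled=True,
            vad_analyzer=vad,
        ),
    )

    # VADMonitor is the frame processor defined earlier on this page;
    # insert your STT/LLM/TTS processors around it in a real bot.
    pipeline = Pipeline([transport.input(), VADMonitor(), transport.output()])

    task = PipelineTask(pipeline, params=PipelineParams(allow_interruptions=True))
    await PipelineRunner().run(task)


# asyncio.run(main())
```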