or run

tessl search
Log in

Version

Workspace
tessl
Visibility
Public
Created
Last updated
Describes
pypi — pkg:pypi/pipecat-ai@0.0.x

docs

audio

dtmf.md, filters-mixers.md, turn-detection.md, vad.md
core-concepts.md, index.md, pipeline.md, runner.md, transports.md, turns.md
tile.json

tessl/pypi-pipecat-ai

tessl install tessl/pypi-pipecat-ai@0.0.0

An open-source framework for building real-time voice and multimodal conversational AI agents, with support for speech-to-text, text-to-speech, LLMs, and multiple transport protocols.

docs/audio/vad.md

Voice Activity Detection

Voice Activity Detection (VAD) identifies when speech is present in audio streams. VAD is essential for turn detection, interruption handling, and efficient audio processing in conversational AI.

VAD Analyzers

VADAnalyzer

{ .api }
from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADParams

class VADAnalyzer:
    """Common base for voice activity detection (VAD) analyzers.

    Keeps the shared configuration plus a generic state slot. The actual
    speech detection is left to subclasses, which override ``analyze``.

    Methods:
        analyze(audio): Report whether the given audio contains speech
        reset(): Clear the stored analyzer state

    Example:
        vad = SomeVADAnalyzer(params=VADParams())
        is_speech = await vad.analyze(audio_bytes)
    """

    def __init__(self, params: VADParams):
        """Store the configuration and start with no analyzer state.

        Args:
            params: VAD configuration
        """
        self._state = None
        self.params = params

    async def analyze(self, audio: bytes) -> bool:
        """Decide whether ``audio`` contains speech.

        The base class has no detection logic of its own; concrete
        analyzers provide it by overriding this coroutine.

        Args:
            audio: Audio bytes to analyze

        Returns:
            True if speech detected, False otherwise

        Raises:
            NotImplementedError: Always, when called on the base class.
        """
        raise NotImplementedError("Subclasses must implement analyze()")

    def reset(self):
        """Drop any stored analyzer state by setting it back to ``None``."""
        self._state = None


class VADParams:
    """VAD configuration parameters.

    A plain value object: every constructor argument is stored unchanged
    on the attribute of the same name. No validation is performed.

    Attributes:
        threshold (float): Speech detection threshold (0.0-1.0). Default: 0.5
        min_speech_duration_ms (int): Minimum speech duration to trigger. Default: 100
        min_silence_duration_ms (int): Minimum silence to end speech. Default: 500
        prefix_padding_ms (int): Audio to include before speech. Default: 100
        sample_rate (int): Expected sample rate in Hz. Default: 16000

    Example:
        params = VADParams(
            threshold=0.5,
            min_speech_duration_ms=250,
            min_silence_duration_ms=500,
            sample_rate=16000
        )
    """

    def __init__(
        self,
        threshold: float = 0.5,
        min_speech_duration_ms: int = 100,
        min_silence_duration_ms: int = 500,
        prefix_padding_ms: int = 100,
        sample_rate: int = 16000
    ) -> None:
        """Initialize VAD parameters.

        Args:
            threshold: Speech detection threshold (0.0-1.0)
            min_speech_duration_ms: Minimum speech duration in milliseconds
            min_silence_duration_ms: Minimum silence duration in milliseconds
            prefix_padding_ms: Prefix padding in milliseconds
            sample_rate: Audio sample rate in Hz
        """
        self.threshold = threshold
        self.min_speech_duration_ms = min_speech_duration_ms
        self.min_silence_duration_ms = min_silence_duration_ms
        self.prefix_padding_ms = prefix_padding_ms
        self.sample_rate = sample_rate

    def __repr__(self) -> str:
        """Return a constructor-style string listing every parameter.

        Makes a configuration readable in logs and debuggers instead of
        the default ``<VADParams object at 0x...>`` form.
        """
        return (
            f"{type(self).__name__}("
            f"threshold={self.threshold!r}, "
            f"min_speech_duration_ms={self.min_speech_duration_ms!r}, "
            f"min_silence_duration_ms={self.min_silence_duration_ms!r}, "
            f"prefix_padding_ms={self.prefix_padding_ms!r}, "
            f"sample_rate={self.sample_rate!r})"
        )

SileroVADAnalyzer

{ .api }
from pipecat.audio.vad.vad_analyzer import SileroVADAnalyzer

class SileroVADAnalyzer(VADAnalyzer):
    """VAD analyzer built on the Silero VAD model.

    Described as a high-quality, accurate and efficient option for
    real-time speech detection.

    Args:
        params: VAD parameters

    Example:
        vad = SileroVADAnalyzer(
            params=VADParams(
                threshold=0.5,
                min_speech_duration_ms=250,
                min_silence_duration_ms=500
            )
        )

        # Use with transport
        transport = DailyTransport(
            params=DailyParams(
                vad_enabled=True,
                vad_analyzer=vad
            )
        )
    """

    def __init__(self, params: Optional[VADParams] = None):
        """Set up the analyzer, falling back to default parameters.

        Args:
            params: Optional VAD parameters. If None, uses defaults.
        """
        if not params:
            # Nothing usable supplied: build a default configuration.
            params = VADParams()
        super().__init__(params)
        self._model = None  # Silero model loaded on first use

AICVADAnalyzer

{ .api }
from pipecat.audio.vad.aic_vad_analyzer import AICVADAnalyzer

class AICVADAnalyzer(VADAnalyzer):
    """VAD analyzer that relies on AIC's voice activity detection.

    Args:
        params: VAD parameters

    Example:
        vad = AICVADAnalyzer(
            params=VADParams(threshold=0.6)
        )
    """

    def __init__(self, params: Optional[VADParams] = None):
        """Set up the analyzer, falling back to default parameters.

        Args:
            params: Optional VAD parameters. If None, uses defaults.
        """
        if not params:
            # Nothing usable supplied: build a default configuration.
            params = VADParams()
        super().__init__(params)
        self._detector = None  # AIC detector initialized on first use

Usage Patterns

Basic VAD Setup

{ .api }
from pipecat.audio.vad.vad_analyzer import SileroVADAnalyzer, VADParams
from pipecat.transports.daily import DailyTransport, DailyParams

# Step 1: choose the VAD tuning values
vad_params = VADParams(
    threshold=0.5,                # Sensitivity (0.0-1.0)
    min_speech_duration_ms=250,   # Min speech to trigger
    min_silence_duration_ms=500,  # Min silence to end
)
vad = SileroVADAnalyzer(params=vad_params)

# Step 2: hand the analyzer to the transport
daily_params = DailyParams(
    audio_in_enabled=True,
    vad_enabled=True,
    vad_analyzer=vad,
)
transport = DailyTransport(room_url="...", params=daily_params)

# With this setup, VAD automatically emits:
# - VADUserStartedSpeakingFrame when speech detected
# - VADUserStoppedSpeakingFrame when silence detected

Custom VAD Parameters

{ .api }
# Sensitive VAD (catches more speech)
sensitive_params = VADParams(
    threshold=0.3,                # Lower threshold
    min_speech_duration_ms=100,   # Shorter minimum
    min_silence_duration_ms=300,  # Shorter silence
)
sensitive_vad = SileroVADAnalyzer(params=sensitive_params)

# Conservative VAD (fewer false positives)
conservative_params = VADParams(
    threshold=0.7,                 # Higher threshold
    min_speech_duration_ms=500,    # Longer minimum
    min_silence_duration_ms=1000,  # Longer silence
)
conservative_vad = SileroVADAnalyzer(params=conservative_params)

Manual VAD Usage

{ .api }
import asyncio

from pipecat.audio.vad.vad_analyzer import SileroVADAnalyzer


async def main():
    """Run a single manual VAD check on one audio chunk."""
    vad = SileroVADAnalyzer()

    # Analyze audio chunks
    audio_chunk = b'...'  # 16kHz PCM audio
    # analyze() is a coroutine, so it has to be awaited from inside an
    # async function; a bare top-level ``await`` is a SyntaxError in a script.
    is_speech = await vad.analyze(audio_chunk)

    if is_speech:
        print("Speech detected!")
    else:
        print("Silence")

    # Reset VAD state
    vad.reset()


if __name__ == "__main__":
    asyncio.run(main())

VAD Event Handling

{ .api }
from pipecat.processors.frame_processor import FrameProcessor
from pipecat.frames.frames import VADUserStartedSpeakingFrame, VADUserStoppedSpeakingFrame

class VADMonitor(FrameProcessor):
    """Pass-through processor that prints a message on VAD events.

    Every frame is forwarded unchanged in the direction it arrived;
    VAD start/stop frames additionally produce a console message.
    """

    async def process_frame(self, frame, direction):
        """Report VAD start/stop frames and forward every frame.

        Args:
            frame: The frame travelling through the pipeline.
            direction: The direction the frame is travelling; reused
                unchanged when the frame is pushed onward.
        """
        # Let the base FrameProcessor handle the frame first. pipecat's
        # pattern for custom processors is to call the parent
        # implementation before any custom handling.
        await super().process_frame(frame, direction)

        if isinstance(frame, VADUserStartedSpeakingFrame):
            print("User started speaking (VAD)")

        elif isinstance(frame, VADUserStoppedSpeakingFrame):
            print("User stopped speaking (VAD)")

        # Forward every frame, VAD or not, so the monitor stays transparent.
        await self.push_frame(frame, direction)

Best Practices

Tune for Your Use Case

{ .api }
# Good: Quiet environment - sensitive VAD
quiet_params = VADParams(threshold=0.4, min_speech_duration_ms=200)
vad = SileroVADAnalyzer(params=quiet_params)

# Good: Noisy environment - conservative VAD
noisy_params = VADParams(threshold=0.7, min_speech_duration_ms=400)
vad = SileroVADAnalyzer(params=noisy_params)

# Bad: One-size-fits-all
vad = SileroVADAnalyzer()  # May not work well in all environments

Include Prefix Padding

{ .api }
# Good: Include audio before speech
padded_params = VADParams(prefix_padding_ms=300)  # Include 300ms before speech
vad = SileroVADAnalyzer(params=padded_params)
# Captures beginning of speech, better transcription

# Bad: No padding
unpadded_params = VADParams(prefix_padding_ms=0)
vad = SileroVADAnalyzer(params=unpadded_params)
# May cut off first word

Match Sample Rates

{ .api }
# Good: VAD sample rate matches audio
transport_params = DailyParams(audio_in_sample_rate=16000)
transport = DailyTransport(params=transport_params)

matching_params = VADParams(sample_rate=16000)  # Matches transport
vad = SileroVADAnalyzer(params=matching_params)

# Bad: Mismatched sample rates
mismatched_params = VADParams(sample_rate=8000)  # Mismatch!
vad = SileroVADAnalyzer(params=mismatched_params)
# May not work correctly

Use with Interruptions

{ .api }
# Enable VAD and interruptions together
transport_params = DailyParams(vad_enabled=True, vad_analyzer=vad)
transport = DailyTransport(params=transport_params)

# Allow VAD to trigger interruptions
task_params = PipelineParams(allow_interruptions=True)
task = PipelineTask(pipeline, params=task_params)

# VAD detects speech -> Interrupts bot -> Natural conversation