
Audio Frames

Audio frames carry raw audio data through the pipeline. They support different sample rates and channel configurations, and they distinguish input audio (from the user) from output audio (to the user).

Base Audio Frame

{ .api }
from pipecat.frames.frames import AudioRawFrame

class AudioRawFrame:
    """Raw audio chunk container.

    Carries PCM audio data with associated metadata about
    sample rate and channel configuration.

    Attributes:
        audio (bytes): Raw PCM audio data
        sample_rate (int): Sample rate in Hz (e.g., 16000, 24000, 48000)
        num_channels (int): Number of audio channels (1 for mono, 2 for stereo)
        id (int): Unique frame identifier
        name (str): Frame name
        pts (Optional[int]): Presentation timestamp in nanoseconds
        metadata (Dict[str, Any]): Frame metadata
    """

    def __init__(
        self,
        audio: bytes,
        sample_rate: int,
        num_channels: int
    ):
        """Initialize audio frame.

        Args:
            audio: Raw PCM audio bytes
            sample_rate: Sample rate in Hz
            num_channels: Number of channels (1=mono, 2=stereo)
        """
        pass
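
Constructing a frame directly is straightforward; the byte count follows from 16-bit PCM (2 bytes per sample). A minimal sketch using synthetic silence:

{ .api }
from pipecat.frames.frames import AudioRawFrame

# 200 ms of silence at 16 kHz, mono, 16-bit PCM:
# 16000 samples/s * 0.2 s * 2 bytes/sample = 6400 bytes
frame = AudioRawFrame(audio=b"\x00" * 6400, sample_rate=16000, num_channels=1)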

Output Audio Frames

Audio flowing from the bot to the user (output direction).

OutputAudioRawFrame

{ .api }
from pipecat.frames.frames import DataFrame, OutputAudioRawFrame

class OutputAudioRawFrame(DataFrame, AudioRawFrame):
    """Audio output to transport.

    Audio data intended for output through the transport
    to the user. This is the base class for all output audio.

    Inherits from:
        DataFrame: Can be interrupted by user
        AudioRawFrame: Contains audio data

    Example:
        frame = OutputAudioRawFrame(
            audio=audio_bytes,
            sample_rate=24000,
            num_channels=1
        )
    """
    pass

TTSAudioRawFrame

{ .api }
from pipecat.frames.frames import TTSAudioRawFrame

class TTSAudioRawFrame(OutputAudioRawFrame):
    """TTS-generated audio.

    Audio generated by a Text-to-Speech service. This frame
    type helps track the source of audio in the pipeline.

    Example:
        # Generated by TTS service
        frame = TTSAudioRawFrame(
            audio=synthesized_audio,
            sample_rate=24000,
            num_channels=1
        )
    """
    pass

SpeechOutputAudioRawFrame

{ .api }
from pipecat.frames.frames import SpeechOutputAudioRawFrame

class SpeechOutputAudioRawFrame(OutputAudioRawFrame):
    """Speech audio stream output.

    Speech audio intended for streaming output. Used for
    general speech output that may not be from TTS.

    Example:
        frame = SpeechOutputAudioRawFrame(
            audio=speech_bytes,
            sample_rate=16000,
            num_channels=1
        )
    """
    pass

Input Audio Frames

Audio flowing from the user to the bot (input direction).

InputAudioRawFrame

{ .api }
from pipecat.frames.frames import InputAudioRawFrame, SystemFrame

class InputAudioRawFrame(SystemFrame, AudioRawFrame):
    """Audio input from transport.

    Audio received from the transport (user input). These are
    SystemFrames for immediate processing to minimize latency.

    Inherits from:
        SystemFrame: High priority, immediate processing
        AudioRawFrame: Contains audio data

    Example:
        # Received from transport
        frame = InputAudioRawFrame(
            audio=user_audio_bytes,
            sample_rate=16000,
            num_channels=1
        )
    """
    pass

UserAudioRawFrame

{ .api }
from pipecat.frames.frames import UserAudioRawFrame

class UserAudioRawFrame(InputAudioRawFrame):
    """User audio input.

    Audio specifically identified as coming from a user.
    Commonly used for STT processing.

    Example:
        # User speaking
        frame = UserAudioRawFrame(
            audio=user_voice_data,
            sample_rate=16000,
            num_channels=1
        )
    """
    pass
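
Because every type above derives from AudioRawFrame, pipeline code can branch on the hierarchy rather than on concrete classes. A minimal sketch using only the classes described here:

{ .api }
from pipecat.frames.frames import (
    AudioRawFrame,
    InputAudioRawFrame,
    OutputAudioRawFrame,
)

def audio_direction(frame: AudioRawFrame) -> str:
    """Classify an audio frame by direction via isinstance checks."""
    if isinstance(frame, InputAudioRawFrame):
        return "input"   # user -> bot (includes UserAudioRawFrame)
    if isinstance(frame, OutputAudioRawFrame):
        return "output"  # bot -> user (includes TTSAudioRawFrame)
    return "unspecified"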

Audio Format Details

Sample Rates

Common sample rates used in Pipecat:

{ .api }
# Standard sample rates
SAMPLE_RATE_8K = 8000      # Telephony quality
SAMPLE_RATE_16K = 16000    # Wideband (common for STT)
SAMPLE_RATE_24K = 24000    # High quality (common for TTS)
SAMPLE_RATE_48K = 48000    # Professional quality

# Example: Different services use different rates
stt_frame = InputAudioRawFrame(
    audio=data,
    sample_rate=16000,  # 16kHz for speech recognition
    num_channels=1
)

tts_frame = TTSAudioRawFrame(
    audio=data,
    sample_rate=24000,  # 24kHz for synthesis
    num_channels=1
)

Channel Configuration

{ .api }
# Mono audio (1 channel)
mono_frame = AudioRawFrame(
    audio=mono_data,
    sample_rate=16000,
    num_channels=1  # Single channel
)

# Stereo audio (2 channels)
stereo_frame = AudioRawFrame(
    audio=stereo_data,
    sample_rate=48000,
    num_channels=2  # Left and right channels
)
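
Multi-channel PCM is interleaved (left, right, left, right, ...), so channel conversion is a reshape-and-average. A hedged numpy helper (not part of pipecat's API), reusing stereo_data from above:

{ .api }
import numpy as np

def stereo_to_mono(stereo_pcm: bytes) -> bytes:
    """Downmix interleaved 16-bit stereo PCM to mono by averaging channels."""
    samples = np.frombuffer(stereo_pcm, dtype=np.int16).reshape(-1, 2)
    return samples.mean(axis=1).astype(np.int16).tobytes()

mono_frame = AudioRawFrame(
    audio=stereo_to_mono(stereo_data),
    sample_rate=48000,  # sample rate is unchanged by downmixing
    num_channels=1
)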

Audio Format

Audio data is PCM (Pulse Code Modulation) format:

{ .api }
# PCM format details:
# - Little-endian signed 16-bit samples
# - Interleaved for multi-channel audio
# - No header (raw audio data)

# Calculate audio duration
def calculate_duration_ms(audio: bytes, sample_rate: int, num_channels: int) -> float:
    """Calculate audio duration in milliseconds.

    Args:
        audio: Raw PCM audio bytes
        sample_rate: Sample rate in Hz
        num_channels: Number of channels

    Returns:
        Duration in milliseconds
    """
    bytes_per_sample = 2  # 16-bit = 2 bytes
    num_samples = len(audio) // (bytes_per_sample * num_channels)
    duration_seconds = num_samples / sample_rate
    return duration_seconds * 1000

# Example
frame = AudioRawFrame(audio=b'\x00' * 32000, sample_rate=16000, num_channels=1)
duration = calculate_duration_ms(frame.audio, frame.sample_rate, frame.num_channels)
# duration = 1000.0 ms (1 second)
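
The inverse calculation is useful when slicing audio into fixed-length chunks; a small helper in the same style:

{ .api }
def bytes_for_duration_ms(duration_ms: int, sample_rate: int, num_channels: int) -> int:
    """Number of raw PCM bytes covering a duration (16-bit samples)."""
    bytes_per_sample = 2  # 16-bit = 2 bytes
    return (sample_rate * duration_ms // 1000) * bytes_per_sample * num_channels

# 100 ms at 16 kHz mono -> 3200 bytes
assert bytes_for_duration_ms(100, 16000, 1) == 3200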

Usage Patterns

Generating Audio

{ .api }
from pipecat.processors.frame_processor import FrameProcessor
from pipecat.frames.frames import TTSAudioRawFrame

class AudioGenerator(FrameProcessor):
    """Generate audio frames."""

    async def generate_audio(self, text: str):
        """Generate audio from text.

        Args:
            text: Text to synthesize
        """
        # Generate audio (example)
        audio_bytes = await self._synthesize(text)

        # Create frame
        frame = TTSAudioRawFrame(
            audio=audio_bytes,
            sample_rate=24000,
            num_channels=1
        )

        # Push downstream
        await self.push_frame(frame)

Processing Audio

{ .api }
from pipecat.processors.frame_processor import FrameProcessor
from pipecat.frames.frames import InputAudioRawFrame, UserAudioRawFrame

class AudioProcessor(FrameProcessor):
    """Process incoming audio."""

    async def process_frame(self, frame, direction):
        # Let the base class handle system frames first
        await super().process_frame(frame, direction)

        if isinstance(frame, InputAudioRawFrame):
            # Process audio
            processed = await self._process_audio(
                frame.audio,
                frame.sample_rate,
                frame.num_channels
            )

            # Create new frame with processed audio
            new_frame = UserAudioRawFrame(
                audio=processed,
                sample_rate=frame.sample_rate,
                num_channels=frame.num_channels
            )
            await self.push_frame(new_frame, direction)
        else:
            await self.push_frame(frame, direction)
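
A processor like this typically sits between the transport input and the STT service. A hedged sketch, where transport and stt are placeholders for configured instances:

{ .api }
from pipecat.pipeline.pipeline import Pipeline

# transport and stt are placeholders for configured instances
pipeline = Pipeline([
    transport.input(),   # emits InputAudioRawFrame
    AudioProcessor(),    # rewraps audio as UserAudioRawFrame
    stt,                 # speech-to-text service consuming user audio
])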

Buffering Audio

{ .api }
from pipecat.frames.frames import AudioRawFrame
from pipecat.processors.frame_processor import FrameProcessor

class AudioBufferProcessor(FrameProcessor):
    """Buffer audio frames before processing.

    Accumulates audio frames until reaching a threshold,
    then pushes the buffered audio downstream as one frame.

    Args:
        buffer_size_ms: Buffer size in milliseconds
        sample_rate: Expected sample rate
        num_channels: Expected number of channels
    """

    def __init__(
        self,
        buffer_size_ms: int = 100,
        sample_rate: int = 16000,
        num_channels: int = 1
    ):
        super().__init__()
        self._sample_rate = sample_rate
        self._num_channels = num_channels
        # 16-bit PCM: 2 bytes per sample per channel
        self._buffer_size = (sample_rate * buffer_size_ms // 1000) * 2 * num_channels
        self._buffer = bytearray()

    async def process_frame(self, frame, direction):
        """Buffer audio frames until the threshold is reached."""
        await super().process_frame(frame, direction)

        if isinstance(frame, AudioRawFrame):
            # Add to buffer
            self._buffer.extend(frame.audio)

            # Flush when the buffer is full
            if len(self._buffer) >= self._buffer_size:
                buffered_frame = AudioRawFrame(
                    audio=bytes(self._buffer),
                    sample_rate=self._sample_rate,
                    num_channels=self._num_channels
                )
                await self.push_frame(buffered_frame, direction)
                self._buffer.clear()
        else:
            await self.push_frame(frame, direction)
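
For reference, the default threshold of the sketch above works out as follows:

{ .api }
# 100 ms at 16 kHz, mono, 16-bit PCM:
# 16000 samples/s * 0.1 s * 2 bytes * 1 channel = 3200 bytes
buffer = AudioBufferProcessor(buffer_size_ms=100, sample_rate=16000, num_channels=1)
assert buffer._buffer_size == 3200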

Resampling Audio

{ .api }
from pipecat.audio.resamplers.soxr_resampler import SOXRAudioResampler

# Resample audio to a different sample rate. resample() is a coroutine
# and takes the input and output rates per call.
resampler = SOXRAudioResampler()

output_audio = await resampler.resample(input_audio, 16000, 24000)

# Create new frame with resampled audio
resampled_frame = AudioRawFrame(
    audio=output_audio,
    sample_rate=24000,  # New sample rate
    num_channels=1
)

Audio Mixing

Pipecat's sound-file mixer blends a looped sound (for example, background office ambience) into the bot's output audio. It attaches to the transport output rather than running as a standalone processor; the file name below is a placeholder:

{ .api }
from pipecat.audio.mixers.soundfile_mixer import SoundfileMixer

# Mix a background sound into output audio. The mixer is passed to the
# transport's output params (see Transport Integration below).
mixer = SoundfileMixer(
    sound_files={"office": "office-ambience.wav"},  # placeholder file
    default_sound="office",
    volume=0.4,
)

Transport Integration

Audio frames integrate with transports for I/O:

{ .api }
from pipecat.transports.daily import DailyTransport, DailyParams

# Configure audio parameters
transport = DailyTransport(
    room_url="https://daily.co/room",
    token="token",
    bot_name="audio-bot",
    params=DailyParams(
        # Input audio
        audio_in_enabled=True,
        audio_in_sample_rate=16000,  # Rate for InputAudioRawFrame

        # Output audio
        audio_out_enabled=True,
        audio_out_sample_rate=24000,  # Rate for OutputAudioRawFrame
        audio_out_mixer=mixer,  # Optional: background sound (see Audio Mixing)

        # Bitrate
        audio_out_bitrate=64000,  # Output bitrate (bps)
    )
)

# Transport automatically:
# 1. Receives audio -> generates InputAudioRawFrame
# 2. Receives OutputAudioRawFrame -> sends audio
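
Putting it together, audio frames flow through a pipeline anchored by the transport at both ends. A hedged sketch, where stt, llm, and tts stand in for configured service instances:

{ .api }
from pipecat.pipeline.pipeline import Pipeline

# stt, llm, and tts are placeholders for configured service instances
pipeline = Pipeline([
    transport.input(),    # InputAudioRawFrame from the user
    stt,                  # audio -> transcription frames
    llm,                  # text -> response frames
    tts,                  # text -> TTSAudioRawFrame
    transport.output(),   # OutputAudioRawFrame to the user
])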

Best Practices

Memory Management

{ .api }
class EfficientAudioProcessor(FrameProcessor):
    """Process audio efficiently."""

    async def process_frame(self, frame, direction):
        await super().process_frame(frame, direction)

        if isinstance(frame, AudioRawFrame):
            # Process in chunks, not all at once
            chunk_size = 3200  # 100 ms at 16 kHz mono: 16000 * 0.1 s * 2 bytes
            audio_data = frame.audio

            for i in range(0, len(audio_data), chunk_size):
                chunk = audio_data[i:i + chunk_size]
                await self._process_chunk(chunk)

        await self.push_frame(frame, direction)

    async def cleanup(self):
        # Clear buffers on shutdown
        self._buffer = bytearray()
        await super().cleanup()

Sample Rate Consistency

{ .api }
from pipecat.audio.resamplers.soxr_resampler import SOXRAudioResampler

class SampleRateValidator(FrameProcessor):
    """Ensure a consistent sample rate by resampling when needed."""

    def __init__(self, expected_sample_rate: int = 16000):
        super().__init__()
        self._expected_rate = expected_sample_rate
        self._resampler = SOXRAudioResampler()

    async def process_frame(self, frame, direction):
        await super().process_frame(frame, direction)

        if isinstance(frame, AudioRawFrame):
            if frame.sample_rate != self._expected_rate:
                # Resample to the expected rate; rates are passed per call
                resampled = await self._resampler.resample(
                    frame.audio, frame.sample_rate, self._expected_rate
                )
                frame = AudioRawFrame(
                    audio=resampled,
                    sample_rate=self._expected_rate,
                    num_channels=frame.num_channels
                )

        await self.push_frame(frame, direction)

Silence Detection

{ .api }
import numpy as np

class SilenceDetector(FrameProcessor):
    """Detect silence in audio."""

    def __init__(self, threshold: float = 0.01):
        super().__init__()
        self._threshold = threshold

    async def process_frame(self, frame, direction):
        await super().process_frame(frame, direction)

        if isinstance(frame, AudioRawFrame):
            # Convert to numpy array
            audio_array = np.frombuffer(frame.audio, dtype=np.int16)

            # Calculate RMS energy
            rms = np.sqrt(np.mean(audio_array.astype(float) ** 2))
            normalized_rms = rms / 32768.0  # Normalize to 0-1

            # Check if silent
            is_silent = normalized_rms < self._threshold

            # Add to metadata
            frame.metadata["is_silent"] = is_silent
            frame.metadata["rms"] = normalized_rms

        await self.push_frame(frame, direction)
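
Downstream processors can act on that metadata. A hedged sketch (SilenceGate is illustrative, not a pipecat class) that drops frames flagged as silent:

{ .api }
from pipecat.frames.frames import AudioRawFrame
from pipecat.processors.frame_processor import FrameProcessor

class SilenceGate(FrameProcessor):
    """Drop audio frames that an upstream SilenceDetector flagged as silent."""

    async def process_frame(self, frame, direction):
        await super().process_frame(frame, direction)

        if isinstance(frame, AudioRawFrame) and frame.metadata.get("is_silent"):
            return  # swallow silent audio

        await self.push_frame(frame, direction)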