tessl install tessl/pypi-pipecat-ai@0.0.0

An open source framework for building real-time voice and multimodal conversational AI agents with support for speech-to-text, text-to-speech, LLMs, and multiple transport protocols.

Audio frames carry raw audio data through the pipeline. They support different sample rates and channel configurations, and they distinguish between input (from the user) and output (to the user) audio.
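
A minimal sketch of the two directions, using the frame classes documented below (the silence buffer and sample rates are arbitrary):

{ .api }
from pipecat.frames.frames import InputAudioRawFrame, OutputAudioRawFrame

pcm_silence = b"\x00" * 640  # 20 ms of 16 kHz, 16-bit mono silence

# Input direction: audio captured from the user (e.g. by a transport)
user_frame = InputAudioRawFrame(audio=pcm_silence, sample_rate=16000, num_channels=1)

# Output direction: audio the bot will play back to the user
bot_frame = OutputAudioRawFrame(audio=pcm_silence, sample_rate=16000, num_channels=1)
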
{ .api }
from pipecat.frames.frames import AudioRawFrame
class AudioRawFrame:
"""Raw audio chunk container.
Carries PCM audio data with associated metadata about
sample rate and channel configuration.
Attributes:
audio (bytes): Raw PCM audio data
sample_rate (int): Sample rate in Hz (e.g., 16000, 24000, 48000)
num_channels (int): Number of audio channels (1 for mono, 2 for stereo)
id (int): Unique frame identifier
name (str): Frame name
pts (Optional[int]): Presentation timestamp in nanoseconds
metadata (Dict[str, Any]): Frame metadata
"""
def __init__(
self,
audio: bytes,
sample_rate: int,
num_channels: int
):
"""Initialize audio frame.
Args:
audio: Raw PCM audio bytes
sample_rate: Sample rate in Hz
num_channels: Number of channels (1=mono, 2=stereo)
"""
        pass

Audio flowing from the bot to the user (output direction).
{ .api }
from pipecat.frames.frames import OutputAudioRawFrame
class OutputAudioRawFrame(DataFrame, AudioRawFrame):
"""Audio output to transport.
Audio data intended for output through the transport
to the user. This is the base class for all output audio.
Inherits from:
DataFrame: Can be interrupted by user
AudioRawFrame: Contains audio data
Example:
frame = OutputAudioRawFrame(
audio=audio_bytes,
sample_rate=24000,
num_channels=1
)
"""
    pass

{ .api }
from pipecat.frames.frames import TTSAudioRawFrame
class TTSAudioRawFrame(OutputAudioRawFrame):
"""TTS-generated audio.
Audio generated by a Text-to-Speech service. This frame
type helps track the source of audio in the pipeline.
Example:
# Generated by TTS service
frame = TTSAudioRawFrame(
audio=synthesized_audio,
sample_rate=24000,
num_channels=1
)
"""
    pass

{ .api }
from pipecat.frames.frames import SpeechOutputAudioRawFrame
class SpeechOutputAudioRawFrame(OutputAudioRawFrame):
"""Speech audio stream output.
Speech audio intended for streaming output. Used for
general speech output that may not be from TTS.
Example:
frame = SpeechOutputAudioRawFrame(
audio=speech_bytes,
sample_rate=16000,
num_channels=1
)
"""
    pass

Audio flowing from the user to the bot (input direction).
{ .api }
from pipecat.frames.frames import InputAudioRawFrame
class InputAudioRawFrame(SystemFrame, AudioRawFrame):
"""Audio input from transport.
Audio received from the transport (user input). These are
SystemFrames for immediate processing to minimize latency.
Inherits from:
SystemFrame: High priority, immediate processing
AudioRawFrame: Contains audio data
Example:
# Received from transport
frame = InputAudioRawFrame(
audio=user_audio_bytes,
sample_rate=16000,
num_channels=1
)
"""
    pass

{ .api }
from pipecat.frames.frames import UserAudioRawFrame
class UserAudioRawFrame(InputAudioRawFrame):
"""User audio input.
Audio specifically identified as coming from a user.
Commonly used for STT processing.
Example:
# User speaking
frame = UserAudioRawFrame(
audio=user_voice_data,
sample_rate=16000,
num_channels=1
)
"""
    pass

Common sample rates used in Pipecat:
{ .api }
# Standard sample rates
SAMPLE_RATE_8K = 8000 # Telephony quality
SAMPLE_RATE_16K = 16000 # Wideband (common for STT)
SAMPLE_RATE_24K = 24000 # High quality (common for TTS)
SAMPLE_RATE_48K = 48000 # Professional quality
# Example: Different services use different rates
stt_frame = InputAudioRawFrame(
audio=data,
sample_rate=16000, # 16kHz for speech recognition
num_channels=1
)
tts_frame = TTSAudioRawFrame(
audio=data,
sample_rate=24000, # 24kHz for synthesis
num_channels=1
)

{ .api }
# Mono audio (1 channel)
mono_frame = AudioRawFrame(
audio=mono_data,
sample_rate=16000,
num_channels=1 # Single channel
)
# Stereo audio (2 channels)
stereo_frame = AudioRawFrame(
audio=stereo_data,
sample_rate=48000,
num_channels=2 # Left and right channels
)

Audio data is PCM (Pulse Code Modulation) format:
{ .api }
# PCM format details:
# - Little-endian signed 16-bit samples
# - Interleaved for multi-channel audio
# - No header (raw audio data)
# Calculate audio duration
def calculate_duration_ms(audio: bytes, sample_rate: int, num_channels: int) -> float:
"""Calculate audio duration in milliseconds.
Args:
audio: Raw PCM audio bytes
sample_rate: Sample rate in Hz
num_channels: Number of channels
Returns:
Duration in milliseconds
"""
bytes_per_sample = 2 # 16-bit = 2 bytes
num_samples = len(audio) // (bytes_per_sample * num_channels)
duration_seconds = num_samples / sample_rate
return duration_seconds * 1000
# Example
frame = AudioRawFrame(audio=b'\x00' * 32000, sample_rate=16000, num_channels=1)
duration = calculate_duration_ms(frame.audio, frame.sample_rate, frame.num_channels)
# duration = 1000.0 ms (1 second)
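
Because samples are little-endian signed 16-bit and interleaved, the raw bytes can be inspected directly; a small sketch using numpy (used here only for illustration):

{ .api }
import numpy as np

from pipecat.frames.frames import AudioRawFrame

# Two interleaved stereo samples: (L=1000, R=-1000), (L=2000, R=-2000)
pcm = np.array([1000, -1000, 2000, -2000], dtype="<i2").tobytes()
frame = AudioRawFrame(audio=pcm, sample_rate=48000, num_channels=2)

# View the raw bytes as little-endian signed 16-bit samples
samples = np.frombuffer(frame.audio, dtype="<i2")

# Channels are interleaved: L, R, L, R, ...
left, right = samples[0::2], samples[1::2]  # [1000 2000], [-1000 -2000]

{ .api }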
from pipecat.processors.frame_processor import FrameProcessor
from pipecat.frames.frames import TTSAudioRawFrame
class AudioGenerator(FrameProcessor):
"""Generate audio frames."""
async def generate_audio(self, text: str):
"""Generate audio from text.
Args:
text: Text to synthesize
"""
# Generate audio (example)
audio_bytes = await self._synthesize(text)
# Create frame
frame = TTSAudioRawFrame(
audio=audio_bytes,
sample_rate=24000,
num_channels=1
)
# Push downstream
        await self.push_frame(frame)

{ .api }
from pipecat.processors.frame_processor import FrameProcessor
from pipecat.frames.frames import InputAudioRawFrame, UserAudioRawFrame
class AudioProcessor(FrameProcessor):
"""Process incoming audio."""
async def process_frame(self, frame, direction):
if isinstance(frame, InputAudioRawFrame):
# Process audio
processed = await self._process_audio(
frame.audio,
frame.sample_rate,
frame.num_channels
)
# Create new frame with processed audio
new_frame = UserAudioRawFrame(
audio=processed,
sample_rate=frame.sample_rate,
num_channels=frame.num_channels
)
await self.push_frame(new_frame, direction)
else:
            await self.push_frame(frame, direction)

{ .api }
from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
class AudioBufferProcessor(FrameProcessor):
"""Buffer audio frames before processing.
Accumulates audio frames until reaching a threshold,
then processes in batches.
Args:
buffer_size_ms: Buffer size in milliseconds
sample_rate: Expected sample rate
num_channels: Expected number of channels
"""
def __init__(
self,
buffer_size_ms: int = 100,
sample_rate: int = 16000,
num_channels: int = 1
):
"""Initialize audio buffer.
Args:
buffer_size_ms: Buffer size in milliseconds
sample_rate: Audio sample rate
num_channels: Number of audio channels
"""
        super().__init__()
        self.sample_rate = sample_rate
        self.num_channels = num_channels
        # 16-bit PCM: bytes = buffer_size_ms / 1000 * sample_rate * num_channels * 2
        self._buffer_size = int(buffer_size_ms * sample_rate * num_channels * 2 / 1000)
        self._buffer = bytearray()
async def process_frame(self, frame, direction):
"""Buffer audio frames.
Args:
frame: Frame to process
direction: Frame direction
"""
if isinstance(frame, AudioRawFrame):
# Add to buffer
self._buffer.extend(frame.audio)
# Check if buffer is full
if len(self._buffer) >= self._buffer_size:
# Process buffered audio
buffered_frame = AudioRawFrame(
audio=bytes(self._buffer),
sample_rate=self.sample_rate,
num_channels=self.num_channels
)
await self.push_frame(buffered_frame, direction)
self._buffer.clear()
else:
            await self.push_frame(frame, direction)
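
A usage sketch for the buffer processor above, using the documented constructor arguments (the 200 ms buffer size is an arbitrary choice):

{ .api }
# Accumulate ~200 ms of 16 kHz mono audio before pushing a combined frame downstream
buffer_processor = AudioBufferProcessor(
    buffer_size_ms=200,
    sample_rate=16000,
    num_channels=1
)

{ .api }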
from pipecat.audio.resamplers.soxr_resampler import SoxrResampler
# Resample audio to different sample rate
resampler = SoxrResampler(
input_sample_rate=16000,
output_sample_rate=24000
)
# Resample audio bytes
output_audio = resampler.resample(input_audio)
# Create new frame with resampled audio
resampled_frame = AudioRawFrame(
audio=output_audio,
sample_rate=24000, # New sample rate
num_channels=1
)

{ .api }
from pipecat.audio.mixers.soundfile_mixer import SoundfileMixer
class AudioMixer(FrameProcessor):
"""Mix multiple audio sources.
Args:
sample_rate: Output sample rate
num_channels: Output channel count
"""
def __init__(self, sample_rate: int = 24000, num_channels: int = 1):
super().__init__()
self._mixer = SoundfileMixer(
sample_rate=sample_rate,
num_channels=num_channels
)
async def add_audio(self, audio: bytes, mix_level: float = 1.0):
"""Add audio to mix.
Args:
audio: Audio bytes to add
mix_level: Volume level (0.0 to 1.0)
"""
mixed = self._mixer.mix(audio, mix_level)
frame = OutputAudioRawFrame(
audio=mixed,
sample_rate=self._mixer.sample_rate,
num_channels=self._mixer.num_channels
)
        await self.push_frame(frame)
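
A usage sketch for the AudioMixer wrapper above (background_audio_bytes and the 0.3 mix level are placeholders):

{ .api }
mixer = AudioMixer(sample_rate=24000, num_channels=1)

# Blend background audio into the bot's output at reduced volume
await mixer.add_audio(background_audio_bytes, mix_level=0.3)

Audio frames integrate with transports for I/O: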
{ .api }
from pipecat.transports.daily import DailyTransport, DailyParams
# Configure audio parameters
transport = DailyTransport(
room_url="https://daily.co/room",
token="token",
params=DailyParams(
# Input audio
audio_in_enabled=True,
audio_in_sample_rate=16000, # Rate for InputAudioRawFrame
# Output audio
audio_out_enabled=True,
audio_out_sample_rate=24000, # Rate for OutputAudioRawFrame
# Bitrate
audio_out_bitrate=64000, # Output bitrate (bps)
)
)
# Transport automatically:
# 1. Receives audio -> generates InputAudioRawFrame
# 2. Receives OutputAudioRawFrame -> sends audio
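
To show where those frames flow, a minimal wiring sketch using Pipecat's pipeline primitives (Pipeline, PipelineTask, PipelineRunner) and the AudioProcessor defined earlier; the processor list is illustrative, not a complete agent:

{ .api }
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask

pipeline = Pipeline([
    transport.input(),   # emits InputAudioRawFrame from the user
    AudioProcessor(),    # example processor defined above
    transport.output(),  # consumes OutputAudioRawFrame for playback
])

task = PipelineTask(pipeline)
await PipelineRunner().run(task)

{ .api }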
from pipecat.frames.frames import AudioRawFrame
from pipecat.processors.frame_processor import FrameProcessor

class EfficientAudioProcessor(FrameProcessor):
"""Process audio efficiently."""
async def process_frame(self, frame, direction):
if isinstance(frame, AudioRawFrame):
# Process in chunks, not all at once
            chunk_size = 3200  # 100ms at 16kHz (16-bit mono)
audio_data = frame.audio
for i in range(0, len(audio_data), chunk_size):
chunk = audio_data[i:i + chunk_size]
await self._process_chunk(chunk)
await self.push_frame(frame, direction)
async def stop(self):
# Clear buffers on stop
self._buffer = bytearray()
        await super().stop()

{ .api }
from pipecat.audio.resamplers.soxr_resampler import SoxrResampler
from pipecat.frames.frames import AudioRawFrame
from pipecat.processors.frame_processor import FrameProcessor

class SampleRateValidator(FrameProcessor):
"""Ensure consistent sample rate."""
def __init__(self, expected_sample_rate: int = 16000):
super().__init__()
self._expected_rate = expected_sample_rate
self._resampler = None
async def process_frame(self, frame, direction):
if isinstance(frame, AudioRawFrame):
if frame.sample_rate != self._expected_rate:
# Resample if needed
if not self._resampler:
self._resampler = SoxrResampler(
input_sample_rate=frame.sample_rate,
output_sample_rate=self._expected_rate
)
resampled = self._resampler.resample(frame.audio)
frame = AudioRawFrame(
audio=resampled,
sample_rate=self._expected_rate,
num_channels=frame.num_channels
)
        await self.push_frame(frame, direction)

{ .api }
import numpy as np

from pipecat.frames.frames import AudioRawFrame
from pipecat.processors.frame_processor import FrameProcessor

class SilenceDetector(FrameProcessor):
"""Detect silence in audio."""
def __init__(self, threshold: float = 0.01):
super().__init__()
self._threshold = threshold
async def process_frame(self, frame, direction):
if isinstance(frame, AudioRawFrame):
# Convert to numpy array
audio_array = np.frombuffer(frame.audio, dtype=np.int16)
# Calculate RMS energy
rms = np.sqrt(np.mean(audio_array.astype(float) ** 2))
normalized_rms = rms / 32768.0 # Normalize to 0-1
# Check if silent
is_silent = normalized_rms < self._threshold
# Add to metadata
frame.metadata["is_silent"] = is_silent
frame.metadata["rms"] = normalized_rms
await self.push_frame(frame, direction)
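
Downstream processors can react to the metadata the detector adds; a minimal sketch (SilenceLogger is a hypothetical example class):

{ .api }
class SilenceLogger(FrameProcessor):
    """Log silence flags added by SilenceDetector (illustrative only)."""

    async def process_frame(self, frame, direction):
        if isinstance(frame, AudioRawFrame) and frame.metadata.get("is_silent"):
            # React to silent audio, e.g. skip expensive downstream processing
            print(f"Silent chunk, rms={frame.metadata['rms']:.4f}")
        await self.push_frame(frame, direction)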