tessl install tessl/pypi-pipecat-ai@0.0.0

An open source framework for building real-time voice and multimodal conversational AI agents with support for speech-to-text, text-to-speech, LLMs, and multiple transport protocols.
Realtime services provide multimodal (audio + text) conversational AI through unified APIs. These services handle STT, LLM, and TTS in a single integration, offering ultra-low latency for natural conversations.
{ .api }
from pipecat.services.openai import OpenAIRealtimeLLMService
class OpenAIRealtimeLLMService:
"""OpenAI Realtime API service.
Multimodal service combining STT, LLM (GPT-4o), and TTS
in a single WebSocket connection for ultra-low latency.
Features:
- Native audio input/output
- Server-side VAD
- Function calling
- Conversation turn management
- Multiple voices
Args:
api_key: OpenAI API key
voice: Voice identifier ("alloy", "echo", "shimmer", etc.)
model: Model (default: "gpt-4o-realtime-preview")
params: Model parameters
Example:
realtime = OpenAIRealtimeLLMService(
api_key="sk-...",
voice="alloy",
params={
"instructions": "You are a helpful voice assistant.",
"temperature": 0.8,
"modalities": ["text", "audio"]
}
)
pipeline = Pipeline([
transport.input(),
realtime, # All-in-one STT+LLM+TTS
transport.output()
])
"""
def __init__(
self,
api_key: str,
voice: str = "alloy",
model: str = "gpt-4o-realtime-preview",
params: Optional[Dict] = None,
**kwargs
):
pass

{ .api }
from pipecat.services.azure import AzureRealtimeLLMService
class AzureRealtimeLLMService:
"""Azure OpenAI Realtime API service.
Azure deployment of OpenAI Realtime API.
Args:
api_key: Azure API key
endpoint: Azure endpoint URL
deployment: Deployment name
voice: Voice identifier
params: Model parameters
Example:
realtime = AzureRealtimeLLMService(
api_key="...",
endpoint="https://your-resource.openai.azure.com/",
deployment="gpt-4o-realtime",
voice="alloy"
)
"""
def __init__(
self,
api_key: str,
endpoint: str,
deployment: str,
voice: str = "alloy",
params: Optional[Dict] = None,
**kwargs
):
pass

{ .api }
from pipecat.services.google import GoogleGeminiLiveLLMService
class GoogleGeminiLiveLLMService:
"""Google Gemini Live multimodal service.
Gemini Live API for real-time multimodal conversations.
Features:
- Audio and video input
- Text and audio output
- Function calling
- Multimodal understanding
Args:
api_key: Google AI API key
voice_name: Voice configuration
params: Model parameters
Example:
gemini = GoogleGeminiLiveLLMService(
api_key="AIza...",
voice_name="Puck",
params={
"system_instruction": "You are helpful.",
"generation_config": {
"temperature": 0.8
}
}
)
"""
def __init__(
self,
api_key: str,
voice_name: Optional[str] = None,
params: Optional[Dict] = None,
**kwargs
):
pass

{ .api }
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService
class AWSNovaSonicLLMService(LLMService):
"""AWS Nova Sonic speech-to-speech multimodal service.
AWS Bedrock Nova Sonic provides bidirectional audio streaming,
real-time transcription, text generation, and function calling.
Supports ultra-low latency voice conversations.
Features:
- Speech-to-speech (audio in, audio out)
- Real-time transcription
- Function calling
- Configurable endpointing sensitivity
- 24kHz audio output
Args:
model: Nova Sonic model ID (default: "us.amazon.nova-sonic-v2:0")
aws_access_key: AWS access key ID
aws_secret_key: AWS secret access key
aws_session_token: Optional AWS session token
aws_region: AWS region (default: "us-east-1")
params: Model parameters including audio settings and inference config
Example:
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService, Params
nova = AWSNovaSonicLLMService(
model="us.amazon.nova-sonic-v2:0",
aws_access_key="...",
aws_secret_key="...",
aws_region="us-east-1",
params=Params(
input_sample_rate=16000,
output_sample_rate=24000,
temperature=0.7,
max_tokens=1024,
endpointing_sensitivity="MEDIUM"
)
)
# Set system instructions
context = LLMContext(
messages=[
{"role": "system", "content": "You are a helpful voice assistant."}
]
)
nova.set_context(context)
# Use in pipeline
pipeline = Pipeline([
transport.input(),
nova,
transport.output()
])
"""
def __init__(
self,
model: str = "us.amazon.nova-sonic-v2:0",
aws_access_key: Optional[str] = None,
aws_secret_key: Optional[str] = None,
aws_session_token: Optional[str] = None,
aws_region: str = "us-east-1",
params: Optional[Params] = None,
**kwargs
):
pass

Learn more: AWS Nova Sonic Documentation
{ .api }
from pipecat.services.hume import HumeRealtimeLLMService
class HumeRealtimeLLMService(RealtimeLLMService):
"""Hume AI empathic voice interface.
Multimodal realtime service with emotional intelligence and prosody awareness.
Detects and responds to emotional cues in voice for more natural,
empathetic conversations.
Features:
- Emotion detection in speech
- Emotionally responsive TTS
- Real-time multimodal processing
- Prosody control
- Multiple voice options
Args:
api_key: Hume AI API key
config_id: Optional EVI configuration ID
params: Service parameters (voice, emotion settings, etc.)
Example:
hume = HumeRealtimeLLMService(
api_key="your-api-key",
config_id="your-config-id",
params={
"voice": "default",
"instructions": "You are an empathetic assistant."
}
)
# Use in pipeline
pipeline = Pipeline([
transport.input(),
hume,
transport.output()
])
"""
def __init__(
self,
api_key: str,
config_id: Optional[str] = None,
params: Optional[Dict] = None,
**kwargs
):
pass

Learn more: Hume AI Documentation
{ .api }
from pipecat.services.grok import GrokRealtimeLLMService
class GrokRealtimeLLMService:
"""Grok Realtime API service.
xAI's Grok realtime multimodal service.
Args:
api_key: xAI API key
voice: Voice identifier
params: Model parameters
Example:
grok = GrokRealtimeLLMService(
api_key="...",
voice="ash",
params={"temperature": 0.8}
)
"""
def __init__(
self,
api_key: str,
voice: str = "ash",
params: Optional[Dict] = None,
**kwargs
):
pass

{ .api }
from pipecat.services.ultravox import UltravoxService
class UltravoxService(RealtimeLLMService):
"""Ultravox multimodal realtime service.
End-to-end realtime multimodal AI service with audio and vision support.
Args:
api_key: Ultravox API key
model: Model identifier
params: Service parameters (instructions, temperature, etc.)
Example:
ultravox = UltravoxService(
api_key="your-api-key",
model="ultravox-v0.2",
params={
"instructions": "You are a helpful assistant.",
"temperature": 0.7
}
)
"""
def __init__(
self,
api_key: str,
model: str = "ultravox-v0.2",
params: Optional[Dict] = None,
**kwargs
):
pass

{ .api }
from pipecat.services.inworld import InworldService
class InworldService(RealtimeLLMService):
"""Inworld character AI realtime service.
Realtime conversational AI with character personalities and
emotional intelligence. Designed for interactive characters in
games, virtual worlds, and applications.
Args:
api_key: Inworld API key
scene_id: Inworld scene identifier
character_id: Character identifier
params: Service parameters
Example:
inworld = InworldService(
api_key="your-api-key",
scene_id="your-scene-id",
character_id="character-123",
params={
"voice": "default",
"emotion_enabled": True
}
)
"""
def __init__(
self,
api_key: str,
scene_id: str,
character_id: str,
params: Optional[Dict] = None,
**kwargs
):
pass

{ .api }
from pipecat.services.openai import OpenAIRealtimeLLMService
from pipecat.transports.daily import DailyTransport
from pipecat.pipeline.pipeline import Pipeline
# Create realtime service
realtime = OpenAIRealtimeLLMService(
api_key="sk-...",
voice="alloy",
params={
"instructions": "You are a helpful voice assistant. Keep responses concise.",
"turn_detection": {
"type": "server_vad",
"threshold": 0.5,
"prefix_padding_ms": 300,
"silence_duration_ms": 500
}
}
)
# Simple pipeline - realtime handles everything
pipeline = Pipeline([
transport.input(),
realtime, # STT + LLM + TTS in one
transport.output()
])
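The pipeline above only wires the processors together. A minimal sketch of actually running it, assuming pipecat's usual `PipelineTask`/`PipelineRunner` pattern and the `transport` and `realtime` objects created above:

import asyncio
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask

async def main():
    # Wrap the pipeline in a task; the runner drives frames between the
    # transport and the realtime service until the session ends.
    task = PipelineTask(pipeline)
    runner = PipelineRunner()
    await runner.run(task)

asyncio.run(main())

{ .api }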
# Define function
async def get_weather(location: str) -> dict:
return {"temp": 72, "condition": "sunny"}
# Register with realtime service
realtime.register_function(
name="get_weather",
handler=get_weather,
description="Get current weather",
properties={
"location": {"type": "string"}
},
required=["location"]
)
# Function calls handled automatically
@realtime.event_handler("on_function_call")
async def handle_call(name: str, args: dict, result: Any):
print(f"Function {name} called with {args}, returned {result}"){ .api }
# OpenAI Realtime voices
voices = ["alloy", "echo", "shimmer", "ash", "ballad", "coral", "sage", "verse"]
realtime = OpenAIRealtimeLLMService(
api_key="...",
voice="shimmer" # Choose voice
)

{ .api }
# Server-side VAD (recommended)
realtime = OpenAIRealtimeLLMService(
api_key="...",
params={
"turn_detection": {
"type": "server_vad",
"threshold": 0.5, # Speech detection threshold
"prefix_padding_ms": 300, # Audio before speech
"silence_duration_ms": 500 # Silence to end turn
}
}
)
# Disable server VAD (client handles turns)
realtime = OpenAIRealtimeLLMService(
api_key="...",
params={
"turn_detection": None # Manual turn management
}
)

{ .api }
# Good: Realtime for voice conversations
realtime = OpenAIRealtimeLLMService(...)
# Ultra-low latency, natural conversations
# Bad: Separate services for real-time use
stt = DeepgramSTTService(...)
llm = OpenAILLMService(...)
tts = OpenAITTSService(...)
# Higher latency, more complex

{ .api }
# Good: Clear voice-optimized instructions
realtime = OpenAIRealtimeLLMService(
params={
"instructions": """You are a helpful voice assistant.
Keep responses brief and conversational.
Avoid long explanations or lists.
Use natural speech patterns."""
}
)
# Bad: Text-optimized instructions
realtime = OpenAIRealtimeLLMService(
params={
"instructions": "You are an assistant. Please provide detailed, formatted responses with bullet points and code blocks."
}
)
# Not suitable for voice output

{ .api }
@realtime.event_handler("on_connected")
async def handle_connected():
print("Realtime service connected")
@realtime.event_handler("on_disconnected")
async def handle_disconnected():
print("Realtime service disconnected")
@realtime.event_handler("on_connection_error")
async def handle_error(error: Exception):
print(f"Connection error: {error}")
    # Implement reconnection logic
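One way to flesh out the reconnection comment above: a minimal backoff sketch. `reconnect_service()` is a hypothetical placeholder for whatever re-establishes the session in your application (for example, recreating the service and pipeline task); it is not part of the pipecat API. The `on_connection_error` handler above could call `reconnect_with_backoff()` instead of only logging.

import asyncio

RECONNECT_DELAYS = [1, 2, 5, 10]  # seconds between attempts (capped backoff)

async def reconnect_service():
    # Hypothetical helper: recreate the realtime service / pipeline task here.
    ...

async def reconnect_with_backoff():
    for delay in RECONNECT_DELAYS:
        await asyncio.sleep(delay)
        try:
            await reconnect_service()
            print("Reconnected")
            return
        except Exception as retry_error:
            print(f"Reconnect attempt failed: {retry_error}")
    print("Giving up after repeated reconnect failures")

{ .api }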
@realtime.event_handler("on_metrics")
async def handle_metrics(metrics):
print(f"Audio duration: {metrics.audio_duration_ms}ms")
print(f"Tokens used: {metrics.tokens_used}"){ .api }
# Realtime Service
# Pros:
# - Ultra-low latency (50-100ms)
# - Simplified pipeline
# - Natural interruptions
# - Server-side VAD
# Cons:
# - Fewer provider choices
# - Less customization
# - Single provider lock-in
pipeline_realtime = Pipeline([
transport.input(),
realtime,
transport.output()
])
# Separate Services
# Pros:
# - Mix and match providers
# - More customization
# - Fallback options
# - Price optimization
# Cons:
# - Higher latency (200-500ms)
# - Complex pipeline
# - More setup
pipeline_separate = Pipeline([
transport.input(),
stt_service,
user_aggregator,
llm_service,
assistant_aggregator,
tts_service,
transport.output()
])