tessl install tessl/pypi-pipecat-ai@0.0.0

An open source framework for building real-time voice and multimodal conversational AI agents with support for speech-to-text, text-to-speech, LLMs, and multiple transport protocols.
Realtime services provide multimodal (audio + text) conversational AI through unified APIs. These services handle STT, LLM, and TTS in a single integration, offering ultra-low latency for natural conversations.
{ .api }
from pipecat.services.openai import OpenAIRealtimeLLMService
class OpenAIRealtimeLLMService:
"""OpenAI Realtime API service.
Multimodal service combining STT, LLM (GPT-4o), and TTS
in a single WebSocket connection for ultra-low latency.
Features:
- Native audio input/output
- Server-side VAD
- Function calling
- Conversation turn management
- Multiple voices
Args:
api_key: OpenAI API key
voice: Voice identifier ("alloy", "echo", "shimmer", etc.)
model: Model (default: "gpt-4o-realtime-preview")
params: Model parameters
Example:
realtime = OpenAIRealtimeLLMService(
api_key="sk-...",
voice="alloy",
params={
"instructions": "You are a helpful voice assistant.",
"temperature": 0.8,
"modalities": ["text", "audio"]
}
)
pipeline = Pipeline([
transport.input(),
realtime, # All-in-one STT+LLM+TTS
transport.output()
])
"""
def __init__(
self,
api_key: str,
voice: str = "alloy",
model: str = "gpt-4o-realtime-preview",
params: Optional[Dict] = None,
**kwargs
):
pass

{ .api }
from pipecat.services.azure import AzureRealtimeLLMService
class AzureRealtimeLLMService:
"""Azure OpenAI Realtime API service.
Azure deployment of OpenAI Realtime API.
Args:
api_key: Azure API key
endpoint: Azure endpoint URL
deployment: Deployment name
voice: Voice identifier
params: Model parameters
Example:
realtime = AzureRealtimeLLMService(
api_key="...",
endpoint="https://your-resource.openai.azure.com/",
deployment="gpt-4o-realtime",
voice="alloy"
)
"""
def __init__(
self,
api_key: str,
endpoint: str,
deployment: str,
voice: str = "alloy",
params: Optional[Dict] = None,
**kwargs
):
pass

{ .api }
from pipecat.services.google import GoogleGeminiLiveLLMService
class GoogleGeminiLiveLLMService:
"""Google Gemini Live multimodal service.
Gemini Live API for real-time multimodal conversations.
Features:
- Audio and video input
- Text and audio output
- Function calling
- Multimodal understanding
Args:
api_key: Google AI API key
voice_name: Voice configuration
params: Model parameters
Example:
gemini = GoogleGeminiLiveLLMService(
api_key="AIza...",
voice_name="Puck",
params={
"system_instruction": "You are helpful.",
"generation_config": {
"temperature": 0.8
}
}
)
"""
def __init__(
self,
api_key: str,
voice_name: Optional[str] = None,
params: Optional[Dict] = None,
**kwargs
):
pass

{ .api }
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService
class AWSNovaSonicLLMService(LLMService):
"""AWS Nova Sonic speech-to-speech multimodal service.
AWS Bedrock Nova Sonic provides bidirectional audio streaming,
real-time transcription, text generation, and function calling.
Supports ultra-low latency voice conversations.
Features:
- Speech-to-speech (audio in, audio out)
- Real-time transcription
- Function calling
- Configurable endpointing sensitivity
- 24kHz audio output
Args:
model: Nova Sonic model ID (default: "us.amazon.nova-sonic-v2:0")
aws_access_key: AWS access key ID
aws_secret_key: AWS secret access key
aws_session_token: Optional AWS session token
aws_region: AWS region (default: "us-east-1")
params: Model parameters including audio settings and inference config
Example:
from pipecat.services.aws.nova_sonic.llm import AWSNovaSonicLLMService, Params
nova = AWSNovaSonicLLMService(
model="us.amazon.nova-sonic-v2:0",
aws_access_key="...",
aws_secret_key="...",
aws_region="us-east-1",
params=Params(
input_sample_rate=16000,
output_sample_rate=24000,
temperature=0.7,
max_tokens=1024,
endpointing_sensitivity="MEDIUM"
)
)
# Set system instructions
context = LLMContext(
messages=[
{"role": "system", "content": "You are a helpful voice assistant."}
]
)
nova.set_context(context)
# Use in pipeline
pipeline = Pipeline([
transport.input(),
nova,
transport.output()
])
"""
def __init__(
self,
model: str = "us.amazon.nova-sonic-v2:0",
aws_access_key: Optional[str] = None,
aws_secret_key: Optional[str] = None,
aws_session_token: Optional[str] = None,
aws_region: str = "us-east-1",
params: Optional[Params] = None,
**kwargs
):
pass

Learn more: AWS Nova Sonic Documentation
{ .api }
from pipecat.services.hume import HumeRealtimeLLMService
class HumeRealtimeLLMService(RealtimeLLMService):
"""Hume AI empathic voice interface.
Multimodal realtime service with emotional intelligence and prosody awareness.
Detects and responds to emotional cues in voice for more natural,
empathetic conversations.
Features:
- Emotion detection in speech
- Emotionally responsive TTS
- Real-time multimodal processing
- Prosody control
- Multiple voice options
Args:
api_key: Hume AI API key
config_id: Optional EVI configuration ID
params: Service parameters (voice, emotion settings, etc.)
Example:
hume = HumeRealtimeLLMService(
api_key="your-api-key",
config_id="your-config-id",
params={
"voice": "default",
"instructions": "You are an empathetic assistant."
}
)
# Use in pipeline
pipeline = Pipeline([
transport.input(),
hume,
transport.output()
])
"""
def __init__(
self,
api_key: str,
config_id: Optional[str] = None,
params: Optional[Dict] = None,
**kwargs
):
pass

Learn more: Hume AI Documentation
{ .api }
from pipecat.services.grok import GrokRealtimeLLMService
class GrokRealtimeLLMService:
"""Grok Realtime API service.
xAI's Grok realtime multimodal service.
Args:
api_key: xAI API key
voice: Voice identifier
params: Model parameters
Example:
grok = GrokRealtimeLLMService(
api_key="...",
voice="ash",
params={"temperature": 0.8}
)
"""
def __init__(
self,
api_key: str,
voice: str = "ash",
params: Optional[Dict] = None,
**kwargs
):
pass

{ .api }
from pipecat.services.ultravox import UltravoxService
class UltravoxService(RealtimeLLMService):
"""Ultravox multimodal realtime service.
End-to-end realtime multimodal AI service with audio and vision support.
Args:
api_key: Ultravox API key
model: Model identifier
params: Service parameters (instructions, temperature, etc.)
Example:
ultravox = UltravoxService(
api_key="your-api-key",
model="ultravox-v0.2",
params={
"instructions": "You are a helpful assistant.",
"temperature": 0.7
}
)
"""
def __init__(
self,
api_key: str,
model: str = "ultravox-v0.2",
params: Optional[Dict] = None,
**kwargs
):
pass

{ .api }
from pipecat.services.inworld import InworldService
class InworldService(RealtimeLLMService):
"""Inworld character AI realtime service.
Realtime conversational AI with character personalities and
emotional intelligence. Designed for interactive characters in
games, virtual worlds, and applications.
Args:
api_key: Inworld API key
scene_id: Inworld scene identifier
character_id: Character identifier
params: Service parameters
Example:
inworld = InworldService(
api_key="your-api-key",
scene_id="your-scene-id",
character_id="character-123",
params={
"voice": "default",
"emotion_enabled": True
}
)
"""
def __init__(
self,
api_key: str,
scene_id: str,
character_id: str,
params: Optional[Dict] = None,
**kwargs
):
pass

{ .api }
from pipecat.services.openai import OpenAIRealtimeLLMService
from pipecat.transports.daily import DailyTransport
from pipecat.pipeline.pipeline import Pipeline
# Create realtime service
realtime = OpenAIRealtimeLLMService(
api_key="sk-...",
voice="alloy",
params={
"instructions": "You are a helpful voice assistant. Keep responses concise.",
"turn_detection": {
"type": "server_vad",
"threshold": 0.5,
"prefix_padding_ms": 300,
"silence_duration_ms": 500
}
}
)
# Simple pipeline - realtime handles everything
pipeline = Pipeline([
transport.input(),
realtime, # STT + LLM + TTS in one
transport.output()
])
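The pipeline above only wires the processors together. A minimal sketch of actually running it, assuming pipecat's usual `PipelineTask`/`PipelineRunner` pattern and the `transport` and `realtime` objects created above:

import asyncio
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask

async def main():
    # Wrap the pipeline in a task; the runner drives frames between the
    # transport and the realtime service until the session ends.
    task = PipelineTask(pipeline)
    runner = PipelineRunner()
    await runner.run(task)

asyncio.run(main())

{ .api }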
# Define function
async def get_weather(location: str) -> dict:
return {"temp": 72, "condition": "sunny"}
# Register with realtime service
realtime.register_function(
name="get_weather",
handler=get_weather,
description="Get current weather",
properties={
"location": {"type": "string"}
},
required=["location"]
)
# Function calls handled automatically
@realtime.event_handler("on_function_call")
async def handle_call(name: str, args: dict, result: Any):
print(f"Function {name} called with {args}, returned {result}"){ .api }
# OpenAI Realtime voices
voices = ["alloy", "echo", "shimmer", "ash", "ballad", "coral", "sage", "verse"]
realtime = OpenAIRealtimeLLMService(
api_key="...",
voice="shimmer" # Choose voice
)

{ .api }
# Server-side VAD (recommended)
realtime = OpenAIRealtimeLLMService(
api_key="...",
params={
"turn_detection": {
"type": "server_vad",
"threshold": 0.5, # Speech detection threshold
"prefix_padding_ms": 300, # Audio before speech
"silence_duration_ms": 500 # Silence to end turn
}
}
)
# Disable server VAD (client handles turns)
realtime = OpenAIRealtimeLLMService(
api_key="...",
params={
"turn_detection": None # Manual turn management
}
)

{ .api }
# Good: Realtime for voice conversations
realtime = OpenAIRealtimeLLMService(...)
# Ultra-low latency, natural conversations
# Bad: Separate services for real-time use
stt = DeepgramSTTService(...)
llm = OpenAILLMService(...)
tts = OpenAITTSService(...)
# Higher latency, more complex

{ .api }
# Good: Clear voice-optimized instructions
realtime = OpenAIRealtimeLLMService(
params={
"instructions": """You are a helpful voice assistant.
Keep responses brief and conversational.
Avoid long explanations or lists.
Use natural speech patterns."""
}
)
# Bad: Text-optimized instructions
realtime = OpenAIRealtimeLLMService(
params={
"instructions": "You are an assistant. Please provide detailed, formatted responses with bullet points and code blocks."
}
)
# Not suitable for voice output

{ .api }
@realtime.event_handler("on_connected")
async def handle_connected():
print("Realtime service connected")
@realtime.event_handler("on_disconnected")
async def handle_disconnected():
print("Realtime service disconnected")
@realtime.event_handler("on_connection_error")
async def handle_error(error: Exception):
print(f"Connection error: {error}")
    # Implement reconnection logic
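One way to flesh out the reconnection comment above: a minimal backoff sketch. `reconnect_service()` is a hypothetical placeholder for whatever re-establishes the session in your application (for example, recreating the service and pipeline task); it is not part of the pipecat API. The `on_connection_error` handler above could call `reconnect_with_backoff()` instead of only logging.

import asyncio

RECONNECT_DELAYS = [1, 2, 5, 10]  # seconds between attempts (capped backoff)

async def reconnect_service():
    # Hypothetical helper: recreate the realtime service / pipeline task here.
    ...

async def reconnect_with_backoff():
    for delay in RECONNECT_DELAYS:
        await asyncio.sleep(delay)
        try:
            await reconnect_service()
            print("Reconnected")
            return
        except Exception as retry_error:
            print(f"Reconnect attempt failed: {retry_error}")
    print("Giving up after repeated reconnect failures")

{ .api }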
@realtime.event_handler("on_metrics")
async def handle_metrics(metrics):
print(f"Audio duration: {metrics.audio_duration_ms}ms")
print(f"Tokens used: {metrics.tokens_used}"){ .api }
# Realtime Service
# Pros:
# - Ultra-low latency (50-100ms)
# - Simplified pipeline
# - Natural interruptions
# - Server-side VAD
# Cons:
# - Fewer provider choices
# - Less customization
# - Single provider lock-in
pipeline_realtime = Pipeline([
transport.input(),
realtime,
transport.output()
])
# Separate Services
# Pros:
# - Mix and match providers
# - More customization
# - Fallback options
# - Price optimization
# Cons:
# - Higher latency (200-500ms)
# - Complex pipeline
# - More setup
pipeline_separate = Pipeline([
transport.input(),
stt_service,
user_aggregator,
llm_service,
assistant_aggregator,
tts_service,
transport.output()
])