or run

tessl search
Log in

Version

Workspace
tessl
Visibility
Public
Created
Last updated
Describes
pypi · pkg:pypi/pipecat-ai@0.0.x

docs

core-concepts.md, index.md, pipeline.md, runner.md, transports.md, turns.md
tile.json

tessl/pypi-pipecat-ai

tessl install tessl/pypi-pipecat-ai@0.0.0

An open-source framework for building real-time voice and multimodal conversational AI agents, with support for speech-to-text, text-to-speech, LLMs, and multiple transport protocols.

stt-services.md (in docs/services/)

STT Services

Speech-to-Text services transcribe audio into text in Pipecat pipelines. The framework supports 15+ STT providers with uniform interfaces for continuous transcription, interim results, and language selection.

Base STT Service

STTService

{ .api }
from pipecat.services.stt_service import STTService
from pipecat.services.ai_service import AIService
from pipecat.frames.frames import TranscriptionFrame, InterimTranscriptionFrame
from typing import Optional

class STTService(AIService):
    """Common base class from which every STT service derives.

    Gives all speech-recognition providers one shared interface.

    Key Features:
    - Transcription of audio in real time
    - Partial (interim) results
    - Detecting and selecting the language
    - Support for muting
    - Monitoring through event handlers

    Methods:
        run_stt(audio): Transcribe a chunk of audio
        set_model(model): Switch the STT model
        set_language(language): Switch the language
        process_frame(frame, direction): Handle incoming frames

    Properties:
        is_muted: True while STT is muted
        sample_rate: Audio sample rate the service expects

    Event Handlers:
        on_connected: The service has connected
        on_disconnected: The service has disconnected
        on_connection_error: A connection error happened

    Frames Consumed:
        - InputAudioRawFrame: Audio arriving from the transport
        - UserAudioRawFrame: Audio from the user
        - STTMuteFrame: Signal to mute or unmute

    Frames Produced:
        - TranscriptionFrame: A final transcription
        - InterimTranscriptionFrame: A partial transcription

    Example:
        from pipecat.services.deepgram import DeepgramSTTService

        stt = DeepgramSTTService(
            api_key="your-key",
            model="nova-2",
        )

        pipeline = Pipeline([
            transport.input(),
            stt,              # Turns audio into text
            user_aggregator,
            llm_service,
        ])
    """

    def __init__(self, **kwargs):
        """Set up the shared STT state.

        Args:
            **kwargs: Provider-specific configuration, forwarded to the
                parent class constructor
        """
        super().__init__(**kwargs)
        # Nothing is selected and nothing is muted until told otherwise.
        self._is_muted = False
        self._language = None
        self._model = None

    async def run_stt(self, audio: bytes) -> Optional[str]:
        """Transcribe a chunk of audio; concrete services must override this.

        Args:
            audio: Raw audio as bytes

        Returns:
            The transcribed text, or None

        Raises:
            NotImplementedError: Always, in this base class
        """
        raise NotImplementedError("Subclasses must implement run_stt()")

    def set_model(self, model: str):
        """Remember which STT model to use.

        Args:
            model: Identifier of the model
        """
        self._model = model

    def set_language(self, language: str):
        """Remember which language to transcribe.

        Args:
            language: Code of the language (e.g., "en", "es")
        """
        self._language = language

    @property
    def is_muted(self) -> bool:
        """True while STT is muted."""
        return self._is_muted

    @property
    def sample_rate(self) -> int:
        """Audio sample rate the service expects."""
        return 16000  # Base-class default; subclasses override it

Major STT Providers

Deepgram STT

{ .api }
from pipecat.services.deepgram import DeepgramSTTService
from typing import Optional, Dict

class DeepgramSTTService(STTService):
    """Speech-to-text backed by Deepgram (high accuracy, low latency).

    Args:
        api_key: Key used to authenticate with Deepgram
        model: Name of the model (e.g., "nova-2", "base")
        language: Code of the language
        params: Extra parameters

    Example:
        stt = DeepgramSTTService(
            api_key="...",
            model="nova-2",
            language="en",
            params={
                "interim_results": True,
                "punctuate": True,
                "smart_format": True,
            },
        )
    """

    def __init__(
        self,
        api_key: str,
        model: str = "nova-2",
        language: str = "en",
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Store the Deepgram configuration on the instance.

        Args:
            api_key: Key used to authenticate with Deepgram
            model: Name of the model
            language: Code of the language
            params: Extra parameters; a missing or empty value becomes ``{}``
            **kwargs: Further arguments handed to the base service
        """
        super().__init__(**kwargs)
        # The base constructor resets model and language, so assign afterwards.
        self._language = language
        self._model = model
        self.api_key = api_key
        self.params = params if params else {}

AssemblyAI STT

{ .api }
from pipecat.services.assemblyai import AssemblyAISTTService

class AssemblyAISTTService(STTService):
    """Speech-to-text backed by AssemblyAI.

    Args:
        api_key: Key used to authenticate with AssemblyAI
        params: Options controlling transcription

    Example:
        stt = AssemblyAISTTService(
            api_key="...",
            params={
                "punctuate": True,
                "format_text": True,
            },
        )
    """

    def __init__(
        self,
        api_key: str,
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

Azure STT

{ .api }
from pipecat.services.azure import AzureSTTService

class AzureSTTService(STTService):
    """Speech-to-text backed by Azure Speech.

    Args:
        api_key: Subscription key for Azure
        region: Region of the Azure resource
        language: Code of the language
        params: Parameters for the speech config

    Example:
        stt = AzureSTTService(
            api_key="...",
            region="eastus",
            language="en-US",
        )
    """

    def __init__(
        self,
        api_key: str,
        region: str,
        language: str = "en-US",
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

Google STT

{ .api }
from pipecat.services.google import GoogleSTTService
from typing import Dict, Optional, Union

class GoogleSTTService(STTService):
    """Google Cloud STT service.

    Args:
        credentials: Path to credentials JSON or dict
        language: Language code
        params: Recognition config

    Example:
        stt = GoogleSTTService(
            credentials="path/to/credentials.json",
            language="en-US",
            params={
                "model": "latest_long",
                "use_enhanced": True
            }
        )
    """

    def __init__(
        self,
        credentials: Union[str, Dict],
        language: str = "en-US",
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation.

        Args:
            credentials: Either a path to a credentials JSON file (str) or
                the credentials as a dict
            language: Language code
            params: Recognition config
            **kwargs: Any further keyword arguments
        """
        # ``Union`` is imported above: no other snippet on this page brings
        # it into scope, so the annotation would otherwise raise NameError.
        pass

AWS Transcribe

{ .api }
from pipecat.services.aws import AWSSTTService

class AWSSTTService(STTService):
    """Streaming speech-to-text backed by AWS Transcribe.

    Args:
        aws_access_key_id: Access key for AWS
        aws_secret_access_key: Secret key for AWS
        aws_region: Region in AWS
        language: Code of the language
        params: Parameters for Transcribe

    Example:
        stt = AWSSTTService(
            aws_access_key_id="...",
            aws_secret_access_key="...",
            aws_region="us-east-1",
            language="en-US",
        )
    """

    def __init__(
        self,
        aws_access_key_id: str,
        aws_secret_access_key: str,
        aws_region: str,
        language: str = "en-US",
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

OpenAI Whisper

{ .api }
from pipecat.services.openai import OpenAISTTService

class OpenAISTTService(STTService):
    """Speech-to-text backed by OpenAI Whisper.

    Args:
        api_key: Key used to authenticate with OpenAI
        model: Name of the model (default: "whisper-1")
        params: Options controlling transcription

    Example:
        stt = OpenAISTTService(
            api_key="sk-...",
            model="whisper-1",
            params={"language": "en"},
        )
    """

    def __init__(
        self,
        api_key: str,
        model: str = "whisper-1",
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

Local Whisper

{ .api }
from pipecat.services.whisper import WhisperSTTService

class WhisperSTTService(STTService):
    """Speech-to-text using a local Whisper model (faster-whisper).

    Whisper runs on the local machine, for privacy or offline use.

    Args:
        model: Size of the model ("tiny", "base", "small", "medium", "large")
        device: Device to run on ("cpu", "cuda", "auto")
        compute_type: Type used for computation ("int8", "float16", "float32")

    Example:
        stt = WhisperSTTService(
            model="base",
            device="cuda",
            compute_type="float16",
        )
    """

    def __init__(
        self,
        model: str = "base",
        device: str = "auto",
        compute_type: str = "int8",
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

Additional STT Providers

Soniox STT

{ .api }
from pipecat.services.soniox import SonioxSTTService

class SonioxSTTService(STTService):
    """Integration with the Soniox STT service.

    Speech recognition with high accuracy and advanced features such as
    speaker diarization and custom vocabulary.

    Args:
        api_key: Key used to authenticate with Soniox
        model: Identifier of the model
        language: Code of the language (e.g., "en", "es")
        params: Parameters for STT

    Example:
        stt = SonioxSTTService(
            api_key="your-api-key",
            model="soniox-general",
            language="en",
            params={"enable_dictation": True},
        )
    """

Speechmatics STT

{ .api }
from pipecat.services.speechmatics import SpeechmaticsSTTService

class SpeechmaticsSTTService(STTService):
    """Speechmatics STT service integration.

    Advanced speech recognition with support for 50+ languages,
    real-time transcription, and speaker identification.

    Args:
        api_key: Speechmatics API key
        model: Model identifier
        language: Language code
        params: STT parameters

    Example:
        stt = SpeechmaticsSTTService(
            api_key="your-api-key",
            model="...",
            language="en",
            params={
                "enable_partials": True,
                "max_delay": 3.0
            }
        )
    """
    # NOTE(review): the example previously passed model="nova-2", which this
    # page lists as a Deepgram model name; it was presumably copied from the
    # Deepgram section. Replaced with a placeholder -- confirm the valid
    # Speechmatics model identifiers before publishing a concrete value.
    pass

Gladia STT

{ .api }
from pipecat.services.gladia import GladiaSTTService
from pipecat.services.gladia.config import GladiaInputParams

class GladiaSTTService(STTService):
    """Gladia STT service integration.

    Advanced multilingual speech recognition with support for 100+ languages,
    real-time transcription, custom vocabulary, and translation capabilities.

    Args:
        api_key: Gladia API key
        region: Region for processing ("us-west" or "eu-west")
        model: Model identifier (default: "solaria-1")
        sample_rate: Audio sample rate
        params: Additional configuration (language_config, endpointing, etc.)

    Example:
        from pipecat.services.gladia.config import GladiaInputParams, LanguageConfig

        stt = GladiaSTTService(
            api_key="your-api-key",
            region="us-west",
            model="solaria-1",
            params=GladiaInputParams(
                language_config=LanguageConfig(
                    languages=["en", "es"],
                    code_switching=True
                ),
                enable_vad=True,
                endpointing=500
            )
        )
    """

    def __init__(
        self,
        api_key: str,
        region: Optional[str] = None,
        model: str = "solaria-1",
        sample_rate: Optional[int] = None,
        params: Optional[GladiaInputParams] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation.

        Args:
            api_key: Gladia API key
            region: Region for processing, or None
            model: Model identifier
            sample_rate: Audio sample rate, or None
            params: Additional configuration as a GladiaInputParams, or None
            **kwargs: Any further keyword arguments
        """
        # GladiaInputParams is imported at the top of this snippet because the
        # signature annotation needs it at definition time; the import inside
        # the class docstring example is never executed.
        pass

Learn more: Gladia Documentation

Groq STT

{ .api }
from pipecat.services.groq import GroqSTTService

class GroqSTTService(STTService):
    """Integration with Whisper STT hosted by Groq.

    Whisper inference at very high speed on Groq's LPU hardware, giving
    speech recognition that is close to real time.

    Args:
        api_key: Key used to authenticate with Groq
        model: Whisper model to use (default: "whisper-large-v3")
        language: Code of the language to transcribe
        params: Parameters for STT

    Example:
        stt = GroqSTTService(
            api_key="your-api-key",
            model="whisper-large-v3",
            language="en",
        )
    """

    def __init__(
        self,
        api_key: str,
        model: str = "whisper-large-v3",
        language: Optional[str] = None,
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

Learn more: Groq Documentation

ElevenLabs STT

{ .api }
from pipecat.services.elevenlabs import ElevenLabsSTTService

class ElevenLabsSTTService(STTService):
    """Integration with the ElevenLabs STT service.

    Speech recognition with high accuracy from ElevenLabs, supporting
    several languages and real-time streaming.

    Args:
        api_key: Key used to authenticate with ElevenLabs
        model: Identifier of the STT model
        language: Code of the language
        params: Parameters for STT

    Example:
        stt = ElevenLabsSTTService(
            api_key="your-api-key",
            model="eleven_speech_to_text",
            language="en",
        )
    """

    def __init__(
        self,
        api_key: str,
        model: Optional[str] = None,
        language: Optional[str] = None,
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

Learn more: ElevenLabs Documentation

NVIDIA Riva STT

{ .api }
from pipecat.services.riva import RivaSTTService

class RivaSTTService(STTService):
    """Integration with the NVIDIA Riva STT service.

    Speech recognition accelerated on GPU through NVIDIA Riva, for
    transcription with ultra-low latency and high accuracy. On-premises
    deployment is supported.

    Args:
        server_url: URL of the Riva gRPC server
        language: Code of the language (e.g., "en-US")
        sample_rate: Sample rate of the audio
        params: Parameters configuring STT

    Example:
        stt = RivaSTTService(
            server_url="grpc://localhost:50051",
            language="en-US",
            sample_rate=16000,
            params={
                "automatic_punctuation": True,
                "enable_word_time_offsets": True,
            },
        )
    """

    def __init__(
        self,
        server_url: str,
        language: str = "en-US",
        sample_rate: int = 16000,
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

Learn more: NVIDIA Riva Documentation

NVIDIA NIM STT

{ .api }
from pipecat.services.nvidia import NVIDIASTTService

class NVIDIASTTService(STTService):
    """Integration with the NVIDIA NIM STT service.

    Speech-to-text through NVIDIA inference microservices, with optimized
    deployment and scaling capabilities.

    Args:
        api_key: Key used to authenticate with NVIDIA
        model: Identifier of the NIM model
        base_url: URL of the NIM endpoint
        language: Code of the language
        params: Parameters for STT

    Example:
        stt = NVIDIASTTService(
            api_key="your-api-key",
            model="nvidia/parakeet-tdt-1.1b",
            base_url="https://your-nim-endpoint",
            language="en",
        )
    """

    def __init__(
        self,
        api_key: str,
        model: str,
        base_url: str,
        language: str = "en",
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

Learn more: NVIDIA NIM Documentation

SambaNova STT

{ .api }
from pipecat.services.sambanova import SambaNovaSTTService

class SambaNovaSTTService(STTService):
    """Integration with the SambaNova STT service.

    Fast speech recognition on SambaNova's specialized AI hardware.

    Args:
        api_key: Key used to authenticate with SambaNova
        model: Identifier of the STT model
        language: Code of the language
        params: Parameters for STT

    Example:
        stt = SambaNovaSTTService(
            api_key="your-api-key",
            model="sambanova-whisper",
            language="en",
        )
    """

    def __init__(
        self,
        api_key: str,
        model: str,
        language: str = "en",
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

Fal AI STT

{ .api }
from pipecat.services.fal import FalSTTService

class FalSTTService(STTService):
    """Integration with the Fal AI STT service.

    Speech recognition that is fast and accurate, served through Fal AI's
    inference platform.

    Args:
        api_key: Key used to authenticate with Fal AI
        model: Identifier of the STT model
        language: Code of the language
        params: Parameters for STT

    Example:
        stt = FalSTTService(
            api_key="your-api-key",
            model="fal-ai/whisper-large-v3",
            language="en",
        )
    """

    def __init__(
        self,
        api_key: str,
        model: str,
        language: str = "en",
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

Learn more: Fal AI Documentation

Hathora STT

{ .api }
from pipecat.services.hathora import HathoraSTTService

class HathoraSTTService(STTService):
    """Integration with the Hathora STT service.

    Speech recognition tied into Hathora's game backend infrastructure,
    aimed at multiplayer gaming applications.

    Args:
        api_key: Key used to authenticate with Hathora
        model: Identifier of the STT model
        params: Parameters for STT

    Example:
        stt = HathoraSTTService(
            api_key="your-api-key",
            model="hathora-stt",
        )
    """

    def __init__(
        self,
        api_key: str,
        model: str,
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

Learn more: Hathora Documentation

Cartesia STT

{ .api }
from pipecat.services.cartesia import CartesiaSTTService

class CartesiaSTTService(STTService):
    """Integration with the Cartesia STT service.

    Speech recognition with low latency, optimized for real-time applications.

    Args:
        api_key: Key used to authenticate with Cartesia
        model: Identifier of the STT model
        language: Code of the language
        params: Parameters for STT

    Example:
        stt = CartesiaSTTService(
            api_key="your-api-key",
            model="cartesia-stt",
            language="en",
        )
    """

    def __init__(
        self,
        api_key: str,
        model: Optional[str] = None,
        language: str = "en",
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

Learn more: Cartesia Documentation

Gradium STT

{ .api }
from pipecat.services.gradium import GradiumSTTService

class GradiumSTTService(STTService):
    """Integration with the Gradium STT service.

    Speech recognition through the Gradium platform, with customization options.

    Args:
        api_key: Key used to authenticate with Gradium
        model: Identifier of the STT model
        params: Parameters for STT

    Example:
        stt = GradiumSTTService(
            api_key="your-api-key",
            model="gradium-stt",
        )
    """

    def __init__(
        self,
        api_key: str,
        model: str,
        params: Optional[Dict] = None,
        **kwargs
    ):
        """Constructor signature only; this listing shows no implementation."""

Other STT Providers

{ .api }
# Additional supported STT providers
from pipecat.services.sarvam import SarvamSTTService  # Indian languages

# All follow similar pattern
# NOTE(review): ``ProviderSTTService`` is not imported or defined anywhere in
# this snippet -- presumably a placeholder for a concrete provider class such
# as SarvamSTTService above; the "..." values are placeholders as well.
stt = ProviderSTTService(
    api_key="...",
    model="...",
    params={...}
)

Usage Patterns

Basic STT Integration

{ .api }
from pipecat.services.deepgram import DeepgramSTTService
from pipecat.pipeline.pipeline import Pipeline

# Create the STT service with the Deepgram "nova-2" model.
stt = DeepgramSTTService(
    api_key="...",
    model="nova-2"
)

# Place the STT service directly after the transport input.
# NOTE: ``transport``, ``user_aggregator`` and ``llm_service`` are not defined
# in this snippet; they must be created elsewhere before building the pipeline.
pipeline = Pipeline([
    transport.input(),  # Audio from user
    stt,                # Transcribes to text
    user_aggregator,
    llm_service
])

Interim Results

{ .api }
from pipecat.frames.frames import InterimTranscriptionFrame, TranscriptionFrame
from pipecat.processors.frame_processor import FrameProcessor

# Enable interim results for real-time feedback
stt = DeepgramSTTService(
    api_key="...",
    params={"interim_results": True}
)

# Monitor interim transcriptions
class InterimMonitor(FrameProcessor):
    """Pass-through processor that prints transcriptions as they arrive."""

    async def process_frame(self, frame, direction):
        """Print interim and final transcription text, then forward the frame.

        Args:
            frame: The frame travelling through the pipeline
            direction: The direction the frame is travelling in
        """
        # Custom processors must hand the frame to the base class first so
        # that its own frame handling still runs.
        await super().process_frame(frame, direction)

        if isinstance(frame, InterimTranscriptionFrame):
            print(f"Interim: {frame.text}")
        elif isinstance(frame, TranscriptionFrame):
            print(f"Final: {frame.text}")
        # Always forward the frame, whatever its type, so the rest of the
        # pipeline keeps receiving it.
        await self.push_frame(frame, direction)

Language Selection

{ .api }
from pipecat.frames.frames import STTUpdateSettingsFrame

# Change language at runtime
update_frame = STTUpdateSettingsFrame(language="es")
await task.queue_frame(update_frame)

STT Muting

{ .api }
from pipecat.frames.frames import STTMuteFrame

# Mute STT (stop transcription)
await task.queue_frame(STTMuteFrame(muted=True))

# Unmute STT
await task.queue_frame(STTMuteFrame(muted=False))

Best Practices

Handle Connection Errors

{ .api }
@stt.event_handler("on_connection_error")
async def handle_error(service, error):
    """Report an STT connection error.

    Args:
        service: The service that raised the event; Pipecat passes the
            emitting object as the first argument of every event handler
        error: Details of the failure as supplied by the service
    """
    print(f"STT connection error: {error}")
    # Reconnect or use fallback

Choose Right Model for Use Case

{ .api }
# Good: Fast model for real-time
stt = DeepgramSTTService(api_key="...", model="nova-2")  # Low latency

# Good: Accurate model for transcription
stt = AssemblyAISTTService(api_key="...")  # High accuracy

# Bad: Slow model for real-time
stt = WhisperSTTService(model="large")  # Too slow for real-time

Match Sample Rates

{ .api }
# Ensure transport and STT sample rates match
# NOTE: only the sample-rate setting is shown; other transport arguments are
# omitted from this snippet.
transport = DailyTransport(
    params=DailyParams(
        audio_in_sample_rate=16000  # 16kHz
    )
)

# The literal ``...`` stands in for the real constructor arguments.
stt = DeepgramSTTService(...)  # Expects 16kHz
# Sample rates match - no resampling needed