docs
tessl install tessl/pypi-pipecat-ai@0.0.0
An open source framework for building real-time voice and multimodal conversational AI agents with support for speech-to-text, text-to-speech, LLMs, and multiple transport protocols.
Speech-to-Text services transcribe audio into text in Pipecat pipelines. The framework supports 15+ STT providers with uniform interfaces for continuous transcription, interim results, and language selection.
{ .api }
from typing import Optional, Union

from pipecat.frames.frames import TranscriptionFrame, InterimTranscriptionFrame
from pipecat.services.ai_service import AIService
from pipecat.services.stt_service import STTService
class STTService(AIService):
"""Base class for all STT services.
Provides universal interface for speech recognition
across different providers.
Key Features:
- Real-time audio transcription
- Interim (partial) results
- Language detection and selection
- Muting support
- Event handlers for monitoring
Methods:
run_stt(audio): Transcribe audio
set_model(model): Change STT model
set_language(language): Change language
process_frame(frame, direction): Process frames
Properties:
is_muted: Whether STT is muted
sample_rate: Expected audio sample rate
Event Handlers:
on_connected: Service connected
on_disconnected: Service disconnected
on_connection_error: Connection error occurred
Frames Consumed:
- InputAudioRawFrame: Audio from transport
- UserAudioRawFrame: User audio
- STTMuteFrame: Mute/unmute signal
Frames Produced:
- TranscriptionFrame: Final transcription
- InterimTranscriptionFrame: Partial transcription
Example:
from pipecat.services.deepgram import DeepgramSTTService
stt = DeepgramSTTService(
api_key="your-key",
model="nova-2"
)
pipeline = Pipeline([
transport.input(),
stt, # Transcribes audio
user_aggregator,
llm_service
])
"""
def __init__(self, **kwargs):
"""Initialize STT service.
Args:
**kwargs: Provider-specific configuration
"""
super().__init__(**kwargs)
self._model = None
self._language = None
self._is_muted = False
async def run_stt(self, audio: bytes) -> Optional[str]:
"""Transcribe audio.
Args:
audio: Audio bytes
Returns:
Transcription text or None
"""
raise NotImplementedError("Subclasses must implement run_stt()")
def set_model(self, model: str):
"""Set STT model.
Args:
model: Model identifier
"""
self._model = model
def set_language(self, language: str):
"""Set transcription language.
Args:
language: Language code (e.g., "en", "es")
"""
self._language = language
@property
def is_muted(self) -> bool:
"""Whether STT is muted."""
return self._is_muted
@property
def sample_rate(self) -> int:
"""Expected audio sample rate."""
return 16000 # Default, override in subclasses{ .api }
from pipecat.services.deepgram import DeepgramSTTService
from typing import Optional, Dict
class DeepgramSTTService(STTService):
"""Deepgram STT service (high accuracy, low latency).
Args:
api_key: Deepgram API key
model: Model name (e.g., "nova-2", "base")
language: Language code
params: Additional parameters
Example:
stt = DeepgramSTTService(
api_key="...",
model="nova-2",
language="en",
params={
"interim_results": True,
"punctuate": True,
"smart_format": True
}
)
"""
def __init__(
self,
api_key: str,
model: str = "nova-2",
language: str = "en",
params: Optional[Dict] = None,
**kwargs
):
"""Initialize Deepgram STT service.
Args:
api_key: Deepgram API key
model: Model name
language: Language code
params: Additional parameters
**kwargs: Additional service arguments
"""
super().__init__(**kwargs)
self.api_key = api_key
self._model = model
self._language = language
self.params = params or {}{ .api }
from pipecat.services.assemblyai import AssemblyAISTTService
class AssemblyAISTTService(STTService):
"""AssemblyAI STT service.
Args:
api_key: AssemblyAI API key
params: Transcription parameters
Example:
stt = AssemblyAISTTService(
api_key="...",
params={
"punctuate": True,
"format_text": True
}
)
"""
def __init__(
self,
api_key: str,
params: Optional[Dict] = None,
**kwargs
):
pass{ .api }
from pipecat.services.azure import AzureSTTService
class AzureSTTService(STTService):
"""Azure Speech STT service.
Args:
api_key: Azure subscription key
region: Azure region
language: Language code
params: Speech config parameters
Example:
stt = AzureSTTService(
api_key="...",
region="eastus",
language="en-US"
)
"""
def __init__(
self,
api_key: str,
region: str,
language: str = "en-US",
params: Optional[Dict] = None,
**kwargs
):
pass{ .api }
from pipecat.services.google import GoogleSTTService
class GoogleSTTService(STTService):
"""Google Cloud STT service.
Args:
credentials: Path to credentials JSON or dict
language: Language code
params: Recognition config
Example:
stt = GoogleSTTService(
credentials="path/to/credentials.json",
language="en-US",
params={
"model": "latest_long",
"use_enhanced": True
}
)
"""
def __init__(
self,
credentials: Union[str, Dict],
language: str = "en-US",
params: Optional[Dict] = None,
**kwargs
):
pass{ .api }
from pipecat.services.aws import AWSSTTService
class AWSSTTService(STTService):
"""AWS Transcribe streaming STT service.
Args:
aws_access_key_id: AWS access key
aws_secret_access_key: AWS secret key
aws_region: AWS region
language: Language code
params: Transcribe parameters
Example:
stt = AWSSTTService(
aws_access_key_id="...",
aws_secret_access_key="...",
aws_region="us-east-1",
language="en-US"
)
"""
def __init__(
self,
aws_access_key_id: str,
aws_secret_access_key: str,
aws_region: str,
language: str = "en-US",
params: Optional[Dict] = None,
**kwargs
):
pass{ .api }
from pipecat.services.openai import OpenAISTTService
class OpenAISTTService(STTService):
"""OpenAI Whisper STT service.
Args:
api_key: OpenAI API key
model: Model name (default: "whisper-1")
params: Transcription parameters
Example:
stt = OpenAISTTService(
api_key="sk-...",
model="whisper-1",
params={"language": "en"}
)
"""
def __init__(
self,
api_key: str,
model: str = "whisper-1",
params: Optional[Dict] = None,
**kwargs
):
pass{ .api }
from pipecat.services.whisper import WhisperSTTService
class WhisperSTTService(STTService):
"""Local Whisper STT service (faster-whisper).
Runs Whisper locally for privacy/offline use.
Args:
model: Model size ("tiny", "base", "small", "medium", "large")
device: Device ("cpu", "cuda", "auto")
compute_type: Compute type ("int8", "float16", "float32")
Example:
stt = WhisperSTTService(
model="base",
device="cuda",
compute_type="float16"
)
"""
def __init__(
self,
model: str = "base",
device: str = "auto",
compute_type: str = "int8",
**kwargs
):
pass{ .api }
from pipecat.services.soniox import SonioxSTTService
class SonioxSTTService(STTService):
"""Soniox STT service integration.
High-accuracy speech recognition with advanced features like
speaker diarization and custom vocabulary.
Args:
api_key: Soniox API key
model: Model identifier
language: Language code (e.g., "en", "es")
params: STT parameters
Example:
stt = SonioxSTTService(
api_key="your-api-key",
model="soniox-general",
language="en",
params={"enable_dictation": True}
)
"""
pass{ .api }
from pipecat.services.speechmatics import SpeechmaticsSTTService
class SpeechmaticsSTTService(STTService):
"""Speechmatics STT service integration.
Advanced speech recognition with support for 50+ languages,
real-time transcription, and speaker identification.
Args:
api_key: Speechmatics API key
model: Model identifier (e.g., "nova-2")
language: Language code
params: STT parameters
Example:
stt = SpeechmaticsSTTService(
api_key="your-api-key",
model="nova-2",
language="en",
params={
"enable_partials": True,
"max_delay": 3.0
}
)
"""
pass{ .api }
from pipecat.services.gladia import GladiaSTTService
class GladiaSTTService(STTService):
"""Gladia STT service integration.
Advanced multilingual speech recognition with support for 100+ languages,
real-time transcription, custom vocabulary, and translation capabilities.
Args:
api_key: Gladia API key
region: Region for processing ("us-west" or "eu-west")
model: Model identifier (default: "solaria-1")
sample_rate: Audio sample rate
params: Additional configuration (language_config, endpointing, etc.)
Example:
from pipecat.services.gladia.config import GladiaInputParams, LanguageConfig
stt = GladiaSTTService(
api_key="your-api-key",
region="us-west",
model="solaria-1",
params=GladiaInputParams(
language_config=LanguageConfig(
languages=["en", "es"],
code_switching=True
),
enable_vad=True,
endpointing=500
)
)
"""
def __init__(
self,
api_key: str,
region: Optional[str] = None,
model: str = "solaria-1",
sample_rate: Optional[int] = None,
params: Optional[GladiaInputParams] = None,
**kwargs
):
passLearn more: Gladia Documentation
{ .api }
from pipecat.services.groq import GroqSTTService
class GroqSTTService(STTService):
"""Groq Whisper STT service integration.
Ultra-fast Whisper inference using Groq's LPU hardware for
near real-time speech recognition.
Args:
api_key: Groq API key
model: Whisper model (default: "whisper-large-v3")
language: Language code for transcription
params: STT parameters
Example:
stt = GroqSTTService(
api_key="your-api-key",
model="whisper-large-v3",
language="en"
)
"""
def __init__(
self,
api_key: str,
model: str = "whisper-large-v3",
language: Optional[str] = None,
params: Optional[Dict] = None,
**kwargs
):
passLearn more: Groq Documentation
{ .api }
from pipecat.services.elevenlabs import ElevenLabsSTTService
class ElevenLabsSTTService(STTService):
"""ElevenLabs STT service integration.
High-accuracy speech recognition from ElevenLabs with support
for multiple languages and real-time streaming.
Args:
api_key: ElevenLabs API key
model: STT model identifier
language: Language code
params: STT parameters
Example:
stt = ElevenLabsSTTService(
api_key="your-api-key",
model="eleven_speech_to_text",
language="en"
)
"""
def __init__(
self,
api_key: str,
model: Optional[str] = None,
language: Optional[str] = None,
params: Optional[Dict] = None,
**kwargs
):
passLearn more: ElevenLabs Documentation
{ .api }
from pipecat.services.riva import RivaSTTService
class RivaSTTService(STTService):
"""NVIDIA Riva STT service integration.
GPU-accelerated speech recognition using NVIDIA Riva for
ultra-low latency, high-accuracy transcription. Supports
on-premises deployment.
Args:
server_url: Riva gRPC server URL
language: Language code (e.g., "en-US")
sample_rate: Audio sample rate
params: STT configuration parameters
Example:
stt = RivaSTTService(
server_url="grpc://localhost:50051",
language="en-US",
sample_rate=16000,
params={
"automatic_punctuation": True,
"enable_word_time_offsets": True
}
)
"""
def __init__(
self,
server_url: str,
language: str = "en-US",
sample_rate: int = 16000,
params: Optional[Dict] = None,
**kwargs
):
passLearn more: NVIDIA Riva Documentation
{ .api }
from pipecat.services.nvidia import NVIDIASTTService
class NVIDIASTTService(STTService):
"""NVIDIA NIM STT service integration.
NVIDIA inference microservices for speech-to-text with
optimized deployment and scaling capabilities.
Args:
api_key: NVIDIA API key
model: NIM model identifier
base_url: NIM endpoint URL
language: Language code
params: STT parameters
Example:
stt = NVIDIASTTService(
api_key="your-api-key",
model="nvidia/parakeet-tdt-1.1b",
base_url="https://your-nim-endpoint",
language="en"
)
"""
def __init__(
self,
api_key: str,
model: str,
base_url: str,
language: str = "en",
params: Optional[Dict] = None,
**kwargs
):
passLearn more: NVIDIA NIM Documentation
{ .api }
from pipecat.services.sambanova import SambaNovaSTTService
class SambaNovaSTTService(STTService):
"""SambaNova STT service integration.
Fast speech recognition using SambaNova's specialized AI hardware.
Args:
api_key: SambaNova API key
model: STT model identifier
language: Language code
params: STT parameters
Example:
stt = SambaNovaSTTService(
api_key="your-api-key",
model="sambanova-whisper",
language="en"
)
"""
def __init__(
self,
api_key: str,
model: str,
language: str = "en",
params: Optional[Dict] = None,
**kwargs
):
pass{ .api }
from pipecat.services.fal import FalSTTService
class FalSTTService(STTService):
"""Fal AI STT service integration.
Fast and accurate speech recognition via Fal AI's inference platform.
Args:
api_key: Fal AI API key
model: STT model identifier
language: Language code
params: STT parameters
Example:
stt = FalSTTService(
api_key="your-api-key",
model="fal-ai/whisper-large-v3",
language="en"
)
"""
def __init__(
self,
api_key: str,
model: str,
language: str = "en",
params: Optional[Dict] = None,
**kwargs
):
passLearn more: Fal AI Documentation
{ .api }
from pipecat.services.hathora import HathoraSTTService
class HathoraSTTService(STTService):
"""Hathora STT service integration.
Speech recognition integrated with Hathora's game backend
infrastructure for multiplayer gaming applications.
Args:
api_key: Hathora API key
model: STT model identifier
params: STT parameters
Example:
stt = HathoraSTTService(
api_key="your-api-key",
model="hathora-stt"
)
"""
def __init__(
self,
api_key: str,
model: str,
params: Optional[Dict] = None,
**kwargs
):
passLearn more: Hathora Documentation
{ .api }
from pipecat.services.cartesia import CartesiaSTTService
class CartesiaSTTService(STTService):
"""Cartesia STT service integration.
Low-latency speech recognition optimized for real-time applications.
Args:
api_key: Cartesia API key
model: STT model identifier
language: Language code
params: STT parameters
Example:
stt = CartesiaSTTService(
api_key="your-api-key",
model="cartesia-stt",
language="en"
)
"""
def __init__(
self,
api_key: str,
model: Optional[str] = None,
language: str = "en",
params: Optional[Dict] = None,
**kwargs
):
passLearn more: Cartesia Documentation
{ .api }
from pipecat.services.gradium import GradiumSTTService
class GradiumSTTService(STTService):
"""Gradium STT service integration.
Speech recognition via the Gradium platform with customization options.
Args:
api_key: Gradium API key
model: STT model identifier
params: STT parameters
Example:
stt = GradiumSTTService(
api_key="your-api-key",
model="gradium-stt"
)
"""
def __init__(
self,
api_key: str,
model: str,
params: Optional[Dict] = None,
**kwargs
):
pass{ .api }
# Additional supported STT providers
from pipecat.services.sarvam import SarvamSTTService # Indian languages
# All follow similar pattern
stt = ProviderSTTService(
api_key="...",
model="...",
params={...}
){ .api }
from pipecat.services.deepgram import DeepgramSTTService
from pipecat.pipeline.pipeline import Pipeline
stt = DeepgramSTTService(
api_key="...",
model="nova-2"
)
pipeline = Pipeline([
transport.input(), # Audio from user
stt, # Transcribes to text
user_aggregator,
llm_service
]){ .api }
# Enable interim results for real-time feedback
stt = DeepgramSTTService(
    api_key="...",
    params={"interim_results": True},
)
# Monitor interim transcriptions
class InterimMonitor(FrameProcessor):
async def process_frame(self, frame, direction):
if isinstance(frame, InterimTranscriptionFrame):
print(f"Interim: {frame.text}")
elif isinstance(frame, TranscriptionFrame):
print(f"Final: {frame.text}")
await self.push_frame(frame, direction){ .api }
from pipecat.frames.frames import STTUpdateSettingsFrame
# Change language at runtime
update_frame = STTUpdateSettingsFrame(language="es")
await task.queue_frame(update_frame){ .api }
from pipecat.frames.frames import STTMuteFrame
# Mute STT (stop transcription)
await task.queue_frame(STTMuteFrame(muted=True))
# Unmute STT
await task.queue_frame(STTMuteFrame(muted=False)){ .api }
@stt.event_handler("on_connection_error")
async def handle_error(error: Exception):
print(f"STT connection error: {error}")
# Reconnect or use fallback{ .api }
# Good: Fast model for real-time
stt = DeepgramSTTService(model="nova-2") # Low latency
# Good: Accurate model for transcription
stt = AssemblyAISTTService() # High accuracy
# Bad: Slow model for real-time
stt = WhisperSTTService(model="large") # Too slow for real-time{ .api }
# Ensure transport and STT sample rates match
transport = DailyTransport(
    params=DailyParams(
        audio_in_sample_rate=16000,  # 16kHz
    ),
)

# The "..." stands in for the real constructor arguments.
stt = DeepgramSTTService(...)  # Expects 16kHz

# Sample rates match - no resampling needed