The official Python SDK for the Deepgram automated speech recognition platform.

Utility classes for audio input/output operations, including microphone capture and speaker playback, with configurable audio parameters and error handling. These utilities simplify integration with audio hardware for real-time speech applications.

Audio input utility for capturing microphone data with configurable parameters and streaming support.
class Microphone:
    """Audio input utility for capturing microphone data with configurable
    parameters and streaming support.

    NOTE(review): this is a documentation stub — method bodies are not shown
    here, so behavior descriptions come from the accompanying docstrings.
    """

    def __init__(
        self,
        rate: int = INPUT_RATE,
        chunk: int = INPUT_CHUNK,
        channels: int = INPUT_CHANNELS,
        input_device_index: "int | None" = None,
        callback: "callable | None" = None,
        verbose: int = INPUT_LOGGING,
        **kwargs
    ):
        """
        Initialize microphone capture.

        Args:
            rate: Sample rate in Hz (default: 16000)
            chunk: Buffer size in samples (default: 8192)
            channels: Number of audio channels (default: 1)
            input_device_index: Specific input device to use
            callback: Callback function for audio data
            verbose: Logging level
            **kwargs: Additional PyAudio parameters
        """

    def start(self) -> bool:
        """
        Start microphone capture.

        Returns:
            bool: True if capture started successfully
        """

    def finish(self) -> bool:
        """
        Stop microphone capture and clean up resources.

        Returns:
            bool: True if capture stopped successfully
        """

    def is_active(self) -> bool:
        """
        Check if microphone is currently capturing.

        Returns:
            bool: True if microphone is active
        """

    def get_stream(self):
        """
        Get the underlying audio stream object.

        Returns:
            PyAudio stream object
        """

# Audio output utility for playing audio data with configurable parameters and streaming support.
class Speaker:
    """Audio output utility for playing audio data with configurable
    parameters and streaming support.

    NOTE(review): this is a documentation stub — method bodies are not shown
    here, so behavior descriptions come from the accompanying docstrings.
    """

    def __init__(
        self,
        rate: int = OUTPUT_RATE,
        chunk: int = OUTPUT_CHUNK,
        channels: int = OUTPUT_CHANNELS,
        output_device_index: "int | None" = None,
        verbose: int = OUTPUT_LOGGING,
        **kwargs
    ):
        """
        Initialize speaker playback.

        Args:
            rate: Sample rate in Hz (default: 24000)
            chunk: Buffer size in samples (default: 8192)
            channels: Number of audio channels (default: 1)
            output_device_index: Specific output device to use
            verbose: Logging level
            **kwargs: Additional PyAudio parameters
        """

    def start(self) -> bool:
        """
        Start speaker playback.

        Returns:
            bool: True if playback started successfully
        """

    def finish(self) -> bool:
        """
        Stop speaker playback and clean up resources.

        Returns:
            bool: True if playback stopped successfully
        """

    def is_active(self) -> bool:
        """
        Check if speaker is currently playing.

        Returns:
            bool: True if speaker is active
        """

    def play(self, audio_data: bytes) -> bool:
        """
        Play audio data.

        Args:
            audio_data: Raw audio bytes to play

        Returns:
            bool: True if audio was queued successfully
        """

    def get_stream(self):
        """
        Get the underlying audio stream object.

        Returns:
            PyAudio stream object
        """

# Predefined constants for audio configuration with sensible defaults for speech applications.
# Microphone/Input Constants
INPUT_LOGGING: int = 10     # Logging level (10 == logging.DEBUG)
INPUT_CHANNELS: int = 1     # Mono audio
INPUT_RATE: int = 16000     # 16kHz sample rate
INPUT_CHUNK: int = 8192     # 8K samples per chunk

# Speaker/Output Constants
OUTPUT_LOGGING: int = 10    # Logging level
OUTPUT_CHANNELS: int = 1    # Mono audio
OUTPUT_RATE: int = 24000    # 24kHz sample rate (typical TTS output rate)
OUTPUT_CHUNK: int = 8192    # 8K samples per chunk
OUTPUT_PLAYBACK_DELTA: float = 0.1  # Playback timing delta (presumably seconds — confirm)

# Legacy aliases (for backward compatibility); these mirror the INPUT_* values.
LOGGING: int = INPUT_LOGGING
CHANNELS: int = INPUT_CHANNELS
RATE: int = INPUT_RATE
# FIX: in the original text "INPUT_CHUNK" was fused with the next section
# heading ("...INPUT_CHUNKSpecific exception classes..."), producing an
# invalid token. Reconstructed as the alias plus a separate heading comment.
CHUNK: int = INPUT_CHUNK

# Specific exception classes for audio-related errors.
class DeepgramMicrophoneError(Exception):
    """
    Exception raised for microphone operation errors.

    Covers issues like device not found, permission denied,
    hardware failures, or configuration problems.
    """

class DeepgramSpeakerError(Exception):
    """
    Exception raised for speaker operation errors.

    Covers issues like device not found, audio format problems,
    hardware failures, or configuration problems.
    """

# Example: basic microphone capture with error handling.
from deepgram import Microphone, DeepgramMicrophoneError
try:
    # Create microphone with default settings
    microphone = Microphone()

    # Start capturing
    if microphone.start():
        print("Microphone started successfully")

        # Check if actively capturing
        if microphone.is_active():
            print("Microphone is capturing audio")

        # Stop capturing when done
        microphone.finish()
        print("Microphone stopped")
    else:
        print("Failed to start microphone")

except DeepgramMicrophoneError as e:
    print(f"Microphone error: {e}")

# Example: custom microphone configuration.
from deepgram import Microphone, INPUT_RATE, INPUT_CHUNK, INPUT_CHANNELS
# Custom configuration for specific use case
microphone = Microphone(
    rate=22050,            # Higher sample rate
    chunk=4096,            # Smaller buffer for lower latency
    channels=2,            # Stereo input
    input_device_index=1,  # Specific device
    verbose=20             # More verbose logging
)

if microphone.start():
    print(f"Microphone started with custom settings:")
    print(f" Rate: 22050 Hz")
    print(f" Chunk: 4096 samples")
    print(f" Channels: 2")

    # Use for a period of time
    # ... your application logic ...

    microphone.finish()

# Example: streaming capture with a PyAudio-style callback.
from deepgram import Microphone
import queue
import threading

# Audio data queue for processing
audio_queue = queue.Queue()

def audio_callback(audio_data, frame_count, time_info, status):
    """Callback function to handle audio data"""
    if status:
        print(f"Audio callback status: {status}")
    # Queue audio data for processing
    audio_queue.put(audio_data)
    return (None, 0)  # Continue recording (0 == pyaudio.paContinue)

# Create microphone with callback
microphone = Microphone(
    callback=audio_callback,
    rate=16000,
    chunk=1024  # Smaller chunks for more frequent callbacks
)

def process_audio():
    """Process audio data from queue"""
    while True:
        try:
            audio_data = audio_queue.get(timeout=1.0)
            # Process the audio data
            print(f"Processing {len(audio_data)} bytes of audio")
            # Send to Deepgram, save to file, etc.
        except queue.Empty:
            continue
        except KeyboardInterrupt:
            break

# Start audio processing thread
processing_thread = threading.Thread(target=process_audio)
processing_thread.daemon = True
processing_thread.start()

# Start microphone
if microphone.start():
    print("Recording with callback... Press Ctrl+C to stop")
    try:
        while microphone.is_active():
            # Keep the main thread alive
            threading.Event().wait(0.1)
    except KeyboardInterrupt:
        print("Stopping...")
    finally:
        microphone.finish()

# Example: basic speaker playback with error handling.
from deepgram import Speaker, DeepgramSpeakerError
try:
    # Create speaker with default settings
    speaker = Speaker()

    # Start playback
    if speaker.start():
        print("Speaker started successfully")

        # Load audio data (example: from file)
        with open("audio.wav", "rb") as f:
            audio_data = f.read()

        # Play the audio
        if speaker.play(audio_data):
            print("Audio queued for playback")

        # Wait for playback to complete or stop manually
        # speaker.finish() when done
    else:
        print("Failed to start speaker")

except DeepgramSpeakerError as e:
    print(f"Speaker error: {e}")

# Example: high-quality playback configuration.
from deepgram import Speaker
# High-quality audio playback configuration
speaker = Speaker(
    rate=48000,             # High sample rate
    chunk=2048,             # Smaller chunks for lower latency
    channels=2,             # Stereo output
    output_device_index=0,  # Default output device
    verbose=10              # Standard logging
)

if speaker.start():
    print("High-quality speaker started")

    # Play multiple audio clips
    audio_files = ["intro.wav", "content.wav", "outro.wav"]
    for filename in audio_files:
        with open(filename, "rb") as f:
            audio_data = f.read()

        # FIX: the original printed a literal "(unknown)" placeholder;
        # interpolate the current file name into the f-string instead.
        print(f"Playing {filename}")
        speaker.play(audio_data)

        # Wait between clips if needed
        # time.sleep(0.5)

    # Clean up
    speaker.finish()

# Example: full voice-interaction loop (STT + TTS).
from deepgram import Microphone, Speaker, DeepgramClient
import threading
import queue

# Audio processing setup
client = DeepgramClient(api_key="your-api-key")
audio_queue = queue.Queue()
text_queue = queue.Queue()

def microphone_callback(audio_data, frame_count, time_info, status):
    """Capture audio data"""
    audio_queue.put(audio_data)
    return (None, 0)

def process_speech():
    """Process speech-to-text and text-to-speech"""
    while True:
        try:
            # Get audio from microphone
            audio_data = audio_queue.get(timeout=1.0)

            # Send to Deepgram STT (simplified example)
            # In practice, you'd use WebSocket for real-time
            response = client.listen.rest.transcribe(
                {"buffer": audio_data},
                {"model": "nova-2", "interim_results": True}
            )

            text = response.results.channels[0].alternatives[0].transcript
            if text.strip():
                print(f"Heard: {text}")

                # Generate response (example)
                response_text = f"You said: {text}"

                # Convert to speech
                tts_response = client.speak.rest.synthesize(
                    {"text": response_text},
                    {"model": "aura-asteria-en"}
                )

                # Queue for playback
                text_queue.put(tts_response.content)

        except queue.Empty:
            continue
        except KeyboardInterrupt:
            break

def play_responses():
    """Play TTS responses"""
    speaker = Speaker()
    if speaker.start():
        while True:
            try:
                audio_data = text_queue.get(timeout=1.0)
                speaker.play(audio_data)
            except queue.Empty:
                continue
            except KeyboardInterrupt:
                break
        # Runs only after the loop exits via break
        speaker.finish()

# Set up microphone
microphone = Microphone(callback=microphone_callback)

# Start processing threads
speech_thread = threading.Thread(target=process_speech)
playback_thread = threading.Thread(target=play_responses)
speech_thread.daemon = True
playback_thread.daemon = True
speech_thread.start()
playback_thread.start()

# Start microphone
if microphone.start():
    print("Voice interaction started. Speak and hear responses...")
    try:
        while True:
            threading.Event().wait(0.1)
    except KeyboardInterrupt:
        print("Stopping voice interaction...")
    finally:
        microphone.finish()

# Example: audio device discovery and selection.
import pyaudio
from deepgram import Microphone, Speaker

def list_audio_devices():
    """List available audio input and output devices"""
    p = pyaudio.PyAudio()

    print("Available Audio Devices:")
    print("=" * 50)

    for i in range(p.get_device_count()):
        info = p.get_device_info_by_index(i)
        print(f"Device {i}: {info['name']}")
        print(f" Max Input Channels: {info['maxInputChannels']}")
        print(f" Max Output Channels: {info['maxOutputChannels']}")
        print(f" Default Sample Rate: {info['defaultSampleRate']}")
        print()

    p.terminate()

def use_specific_devices():
    """Use specific audio devices"""
    list_audio_devices()

    # Use specific devices based on discovery
    input_device = 1   # Replace with desired input device index
    output_device = 2  # Replace with desired output device index

    microphone = Microphone(
        input_device_index=input_device,
        rate=16000,
        channels=1
    )

    speaker = Speaker(
        output_device_index=output_device,
        rate=24000,
        channels=1
    )

    print(f"Using input device {input_device} and output device {output_device}")

    # Use the configured devices
    if microphone.start() and speaker.start():
        print("Both devices started successfully")
        # ... use devices ...
        microphone.finish()
        speaker.finish()

# Run device discovery
use_specific_devices()

# Example: comprehensive audio system test with error handling.
from deepgram import Microphone, Speaker, DeepgramMicrophoneError, DeepgramSpeakerError
import pyaudio

def test_audio_system():
    """Test audio system with comprehensive error handling"""
    # Test microphone
    print("Testing microphone...")
    try:
        microphone = Microphone(
            rate=16000,
            chunk=1024,
            channels=1,
            verbose=20  # Verbose logging for debugging
        )

        if microphone.start():
            print("✓ Microphone test passed")
            microphone.finish()
        else:
            print("✗ Microphone failed to start")

    except DeepgramMicrophoneError as e:
        print(f"✗ Microphone error: {e}")
    except Exception as e:
        print(f"✗ Unexpected microphone error: {e}")

    # Test speaker
    print("\nTesting speaker...")
    try:
        speaker = Speaker(
            rate=24000,
            chunk=1024,
            channels=1,
            verbose=20  # Verbose logging for debugging
        )

        if speaker.start():
            print("✓ Speaker test passed")

            # Test with silent audio data
            silent_audio = b'\x00' * 1024  # 1024 bytes of silence
            if speaker.play(silent_audio):
                print("✓ Audio playback test passed")
            else:
                print("✗ Audio playback test failed")

            speaker.finish()
        else:
            print("✗ Speaker failed to start")

    except DeepgramSpeakerError as e:
        print(f"✗ Speaker error: {e}")
    except Exception as e:
        print(f"✗ Unexpected speaker error: {e}")

    # Test PyAudio availability
    print("\nTesting PyAudio...")
    try:
        p = pyaudio.PyAudio()
        device_count = p.get_device_count()
        print(f"✓ PyAudio found {device_count} audio devices")
        p.terminate()
    except Exception as e:
        print(f"✗ PyAudio error: {e}")
        print(" Try: pip install pyaudio")

# Run comprehensive audio test
test_audio_system()

# Install with Tessl CLI
npx tessl i tessl/pypi-deepgram-sdk