tessl/pypi-deepgram-sdk

The official Python SDK for the Deepgram automated speech recognition platform.

—

Pending

Overview

Eval results

Files

Audio Utilities

Name: tessl/pypi-deepgram-sdk
Author: tessl

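Utility classes for audio input/output operations including microphone capture and speaker playback, with configurable audio parameters and error handling. These utilities simplify integration with audio hardware for real-time speech applications. Both classes expose PyAudio stream objects and accept additional PyAudio parameters, so PyAudio must be installed (`pip install pyaudio`) to use them.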

Capabilities

Microphone

Audio input utility for capturing microphone data with configurable parameters and streaming support.

class Microphone:
    """
    Audio input utility for capturing microphone data.

    Typical lifecycle: construct the object, call start() to begin
    capturing, and call finish() to stop and release resources.
    """

    def __init__(
        self,
        rate: int = INPUT_RATE,
        chunk: int = INPUT_CHUNK,
        channels: int = INPUT_CHANNELS,
        input_device_index: int = None,
        callback: callable = None,
        verbose: int = INPUT_LOGGING,
        **kwargs
    ):
        """
        Initialize microphone capture.

        Args:
            rate: Sample rate in Hz (default: INPUT_RATE, 16000)
            chunk: Buffer size in samples (default: INPUT_CHUNK, 8192)
            channels: Number of audio channels (default: INPUT_CHANNELS,
                1 = mono)
            input_device_index: Index of the specific input device to use
                (default: None; presumably lets the audio backend pick
                its default device -- TODO confirm)
            callback: Callback function for audio data. The usage examples
                in this document pass a callable taking
                (audio_data, frame_count, time_info, status).
            verbose: Logging level (default: INPUT_LOGGING, 10)
            **kwargs: Additional PyAudio parameters
        """

    def start(self) -> bool:
        """
        Start microphone capture.

        Returns:
            bool: True if capture started successfully, False otherwise
        """

    def finish(self) -> bool:
        """
        Stop microphone capture and clean up resources.

        Returns:
            bool: True if capture stopped successfully, False otherwise
        """

    def is_active(self) -> bool:
        """
        Check if microphone is currently capturing.

        Returns:
            bool: True if microphone is active
        """

    def get_stream(self):
        """
        Get the underlying audio stream object.

        Returns:
            PyAudio stream object
        """

Speaker

Audio output utility for playing audio data with configurable parameters and streaming support.

class Speaker:
    """
    Audio output utility for playing audio data.

    Typical lifecycle: construct the object, call start(), hand audio to
    play(), and call finish() to stop and release resources.
    """

    def __init__(
        self,
        rate: int = OUTPUT_RATE,
        chunk: int = OUTPUT_CHUNK,
        channels: int = OUTPUT_CHANNELS,
        output_device_index: int = None,
        verbose: int = OUTPUT_LOGGING,
        **kwargs
    ):
        """
        Initialize speaker playback.

        Args:
            rate: Sample rate in Hz (default: OUTPUT_RATE, 24000)
            chunk: Buffer size in samples (default: OUTPUT_CHUNK, 8192)
            channels: Number of audio channels (default: OUTPUT_CHANNELS,
                1 = mono)
            output_device_index: Index of the specific output device to use
                (default: None; presumably lets the audio backend pick
                its default device -- TODO confirm)
            verbose: Logging level (default: OUTPUT_LOGGING, 10)
            **kwargs: Additional PyAudio parameters
        """

    def start(self) -> bool:
        """
        Start speaker playback.

        Returns:
            bool: True if playback started successfully, False otherwise
        """

    def finish(self) -> bool:
        """
        Stop speaker playback and clean up resources.

        Returns:
            bool: True if playback stopped successfully, False otherwise
        """

    def is_active(self) -> bool:
        """
        Check if speaker is currently playing.

        Returns:
            bool: True if speaker is active
        """

    def play(self, audio_data: bytes) -> bool:
        """
        Queue audio data for playback.

        Args:
            audio_data: Raw audio bytes to play

        Returns:
            bool: True if audio was queued successfully (this reports
                queuing, not that playback has completed)
        """

    def get_stream(self):
        """
        Get the underlying audio stream object.

        Returns:
            PyAudio stream object
        """

Audio Constants

Predefined constants for audio configuration with sensible defaults for speech applications.

# Microphone/Input Constants (defaults for the Microphone constructor)
INPUT_LOGGING: int = 10  # Default `verbose` logging level
INPUT_CHANNELS: int = 1  # Mono audio
INPUT_RATE: int = 16000  # 16 kHz sample rate
INPUT_CHUNK: int = 8192  # Buffer size: 8192 samples per chunk

# Speaker/Output Constants (defaults for the Speaker constructor)
OUTPUT_LOGGING: int = 10  # Default `verbose` logging level
OUTPUT_CHANNELS: int = 1  # Mono audio
OUTPUT_RATE: int = 24000  # 24 kHz sample rate
OUTPUT_CHUNK: int = 8192  # Buffer size: 8192 samples per chunk
OUTPUT_PLAYBACK_DELTA: float = 0.1  # Playback timing delta (presumably seconds -- TODO confirm)

# Legacy aliases (for backward compatibility); each mirrors its INPUT_* value
LOGGING: int = INPUT_LOGGING
CHANNELS: int = INPUT_CHANNELS
RATE: int = INPUT_RATE
CHUNK: int = INPUT_CHUNK

Error Classes

Specific exception classes for audio-related errors.

class DeepgramMicrophoneError(Exception):
    """
    Raised when a microphone operation fails.

    Typical causes include a missing input device, denied permissions,
    a hardware failure, or an invalid configuration.
    """

class DeepgramSpeakerError(Exception):
    """
    Raised when a speaker operation fails.

    Typical causes include a missing output device, an audio format
    problem, a hardware failure, or an invalid configuration.
    """

Usage Examples

Basic Microphone Capture

from deepgram import Microphone, DeepgramMicrophoneError

# Minimal capture lifecycle: create, start, check, and always release.
try:
    # Create microphone with default settings
    microphone = Microphone()

    # Start capturing
    if microphone.start():
        try:
            print("Microphone started successfully")

            # Check if actively capturing
            if microphone.is_active():
                print("Microphone is capturing audio")
        finally:
            # Release the device even if something above raises; otherwise
            # a started microphone would be left open.
            microphone.finish()
        print("Microphone stopped")
    else:
        print("Failed to start microphone")

except DeepgramMicrophoneError as e:
    print(f"Microphone error: {e}")

Custom Microphone Configuration

from deepgram import Microphone

# Custom configuration for specific use case. The values are named once so
# the constructor call and the status output below cannot drift apart.
rate = 22050     # Higher sample rate than the 16000 Hz default
chunk = 4096     # Smaller buffer for lower latency
channels = 2     # Stereo input

microphone = Microphone(
    rate=rate,
    chunk=chunk,
    channels=channels,
    input_device_index=1,  # Specific device
    # Non-default logging level. NOTE(review): if `verbose` follows Python's
    # logging scale, 20 (INFO) is *less* chatty than the default 10 (DEBUG).
    verbose=20,
)

if microphone.start():
    try:
        print("Microphone started with custom settings:")
        print(f"  Rate: {rate} Hz")
        print(f"  Chunk: {chunk} samples")
        print(f"  Channels: {channels}")

        # Use for a period of time
        # ... your application logic ...
    finally:
        # Always release the device, even if the application logic raises
        microphone.finish()

Microphone with Callback

import queue
import threading

from deepgram import Microphone

# Audio data queue for processing
audio_queue = queue.Queue()

# Set at shutdown so the worker thread can leave its loop. A worker thread
# never receives KeyboardInterrupt (only the main thread does), so it needs
# an explicit signal.
stop_event = threading.Event()


def audio_callback(audio_data, frame_count, time_info, status):
    """Queue each captured buffer for the worker thread.

    Args:
        audio_data: Captured audio bytes for this buffer.
        frame_count: Number of frames in the buffer (unused here).
        time_info: Timing information supplied by the caller (unused here).
        status: Status flags; any truthy value is printed.

    Returns:
        tuple: (None, 0), which tells the caller to continue recording.
    """
    if status:
        print(f"Audio callback status: {status}")

    # Queue audio data for processing; keep the callback itself cheap
    audio_queue.put(audio_data)

    return (None, 0)  # Continue recording


# Create microphone with callback
microphone = Microphone(
    callback=audio_callback,
    rate=16000,
    chunk=1024,  # Smaller chunks for more frequent callbacks
)


def process_audio():
    """Consume queued audio until stop_event is set."""
    while not stop_event.is_set():
        try:
            # The timeout bounds how long shutdown can take to be noticed
            audio_data = audio_queue.get(timeout=1.0)
        except queue.Empty:
            continue

        # Process the audio data
        print(f"Processing {len(audio_data)} bytes of audio")
        # Send to Deepgram, save to file, etc.


# Start audio processing thread
processing_thread = threading.Thread(target=process_audio, daemon=True)
processing_thread.start()

# Start microphone
if microphone.start():
    print("Recording with callback... Press Ctrl+C to stop")
    try:
        while microphone.is_active():
            # Keep the main thread alive; reuse one Event instead of
            # allocating a new one on every iteration
            stop_event.wait(0.1)
    except KeyboardInterrupt:
        print("Stopping...")
    finally:
        microphone.finish()

# Shut the worker down whether or not the microphone ever started
stop_event.set()
processing_thread.join(timeout=2.0)

Basic Speaker Playback

import time
import wave

from deepgram import Speaker, DeepgramSpeakerError

try:
    # Create speaker with default settings
    speaker = Speaker()

    # Start playback
    if speaker.start():
        print("Speaker started successfully")
        try:
            # Load raw PCM frames with the wave module. Reading the file with
            # open(...).read() would also pass the WAV header bytes to
            # play(), which expects raw audio.
            # NOTE(review): audio.wav is assumed to match the speaker
            # settings (default 24000 Hz, mono) -- confirm for your file.
            with wave.open("audio.wav", "rb") as wav_file:
                frame_total = wav_file.getnframes()
                duration_seconds = frame_total / wav_file.getframerate()
                audio_data = wav_file.readframes(frame_total)

            # Play the audio
            if speaker.play(audio_data):
                print("Audio queued for playback")
                # play() only reports that the audio was queued, so give the
                # clip time to finish before stopping.
                # NOTE(review): drop this wait if finish() already drains
                # queued audio.
                time.sleep(duration_seconds)
        finally:
            # Always release the output device once it has been started
            speaker.finish()
    else:
        print("Failed to start speaker")

except DeepgramSpeakerError as e:
    print(f"Speaker error: {e}")

Custom Speaker Configuration

import wave

from deepgram import Speaker

# High-quality audio playback configuration
speaker = Speaker(
    rate=48000,  # High sample rate
    chunk=2048,  # Smaller chunks for lower latency
    channels=2,  # Stereo output
    output_device_index=0,  # Device index 0 (not necessarily the system default)
    verbose=10,  # Standard logging
)

if speaker.start():
    print("High-quality speaker started")

    # Play multiple audio clips
    # NOTE(review): the files are assumed to be 48000 Hz stereo to match the
    # speaker configuration above -- confirm for your audio.
    audio_files = ["intro.wav", "content.wav", "outro.wav"]

    try:
        for filename in audio_files:
            # Read only the PCM frames; the raw file contents would include
            # the WAV header, which is not audio.
            with wave.open(filename, "rb") as wav_file:
                audio_data = wav_file.readframes(wav_file.getnframes())

            print(f"Playing {filename}")
            speaker.play(audio_data)

            # Add a pause here (e.g. time.sleep) if the clips should not be
            # queued back-to-back.
    finally:
        # Clean up even if a file is missing or playback raises
        speaker.finish()

Integrated Microphone and Speaker

import queue
import threading

from deepgram import Microphone, Speaker, DeepgramClient

# Audio processing setup
client = DeepgramClient(api_key="your-api-key")
audio_queue = queue.Queue()     # Captured microphone audio awaiting transcription
playback_queue = queue.Queue()  # Synthesized speech audio awaiting playback

# Set at shutdown. Worker threads never receive KeyboardInterrupt (only the
# main thread does), so they poll this event to know when to exit.
stop_event = threading.Event()


def microphone_callback(audio_data, frame_count, time_info, status):
    """Capture audio data by queuing each buffer for process_speech()."""
    audio_queue.put(audio_data)
    return (None, 0)


def process_speech():
    """Turn captured audio into text, then queue a spoken reply.

    Runs until stop_event is set.
    """
    while not stop_event.is_set():
        try:
            # Get audio from microphone; the timeout bounds shutdown latency
            audio_data = audio_queue.get(timeout=1.0)
        except queue.Empty:
            continue

        # Send to Deepgram STT (simplified example)
        # In practice, you'd use WebSocket for real-time
        # NOTE(review): confirm the REST method names used below against the
        # installed SDK version.
        response = client.listen.rest.transcribe(
            {"buffer": audio_data},
            {"model": "nova-2", "interim_results": True}
        )

        text = response.results.channels[0].alternatives[0].transcript
        if not text.strip():
            continue

        print(f"Heard: {text}")

        # Generate response (example)
        response_text = f"You said: {text}"

        # Convert to speech
        tts_response = client.speak.rest.synthesize(
            {"text": response_text},
            {"model": "aura-asteria-en"}
        )

        # Queue for playback
        playback_queue.put(tts_response.content)


def play_responses():
    """Play queued TTS audio until stop_event is set."""
    speaker = Speaker()
    if not speaker.start():
        return

    try:
        while not stop_event.is_set():
            try:
                audio_data = playback_queue.get(timeout=1.0)
            except queue.Empty:
                continue
            speaker.play(audio_data)
    finally:
        # Reached on normal shutdown and on errors, so the output device is
        # always released
        speaker.finish()


# Set up microphone
microphone = Microphone(callback=microphone_callback)

# Start processing threads
speech_thread = threading.Thread(target=process_speech, daemon=True)
playback_thread = threading.Thread(target=play_responses, daemon=True)

speech_thread.start()
playback_thread.start()

try:
    # Start microphone
    if microphone.start():
        print("Voice interaction started. Speak and hear responses...")
        try:
            # Keep the main thread alive; wait on the shared event rather
            # than creating a new Event on every iteration
            while not stop_event.wait(0.1):
                pass
        except KeyboardInterrupt:
            print("Stopping voice interaction...")
        finally:
            microphone.finish()
finally:
    # Signal both workers and give them a moment to release their resources
    stop_event.set()
    speech_thread.join(timeout=5.0)
    playback_thread.join(timeout=5.0)

Device Discovery and Selection

import pyaudio
from deepgram import Microphone, Speaker

def list_audio_devices():
    """List available audio input and output devices.

    Prints, for every device index PyAudio reports, the device name, its
    maximum input and output channel counts, and its default sample rate.
    """
    p = pyaudio.PyAudio()

    try:
        print("Available Audio Devices:")
        print("=" * 50)

        for i in range(p.get_device_count()):
            info = p.get_device_info_by_index(i)
            print(f"Device {i}: {info['name']}")
            print(f"  Max Input Channels: {info['maxInputChannels']}")
            print(f"  Max Output Channels: {info['maxOutputChannels']}")
            print(f"  Default Sample Rate: {info['defaultSampleRate']}")
            print()
    finally:
        # Release PyAudio even if a device query raises part-way through
        p.terminate()

def use_specific_devices():
    """Open a microphone and a speaker on explicitly chosen device indices.

    Lists the available devices first, then starts both devices. Every
    device that was started is finished again, including the case where the
    microphone starts but the speaker does not.
    """
    list_audio_devices()

    # Use specific devices based on discovery
    input_device = 1   # Replace with desired input device index
    output_device = 2  # Replace with desired output device index

    microphone = Microphone(
        input_device_index=input_device,
        rate=16000,
        channels=1
    )

    speaker = Speaker(
        output_device_index=output_device,
        rate=24000,
        channels=1
    )

    print(f"Using input device {input_device} and output device {output_device}")

    # Use the configured devices
    if not microphone.start():
        return

    try:
        # The speaker is only started once the microphone is running
        if speaker.start():
            try:
                print("Both devices started successfully")
                # ... use devices ...
            finally:
                speaker.finish()
    finally:
        # Runs even when the speaker failed to start, so the microphone is
        # never left open
        microphone.finish()

# List the available devices, then start and stop the microphone and speaker
# on the device indices hard-coded in use_specific_devices()
use_specific_devices()

Error Handling and Diagnostics

from deepgram import Microphone, Speaker, DeepgramMicrophoneError, DeepgramSpeakerError


def _check_microphone():
    """Start and stop a microphone and print whether it worked."""
    print("Testing microphone...")
    try:
        microphone = Microphone(
            rate=16000,
            chunk=1024,
            channels=1,
            verbose=20  # Explicit logging level for this diagnostic run
        )

        if microphone.start():
            print("✓ Microphone test passed")
            microphone.finish()
        else:
            print("✗ Microphone failed to start")

    except DeepgramMicrophoneError as e:
        print(f"✗ Microphone error: {e}")
    except Exception as e:
        # Deliberately broad: this is a diagnostic and should report any
        # failure instead of aborting the remaining checks
        print(f"✗ Unexpected microphone error: {e}")


def _check_speaker():
    """Start a speaker, queue a short silent buffer, and print the outcome."""
    print("\nTesting speaker...")
    try:
        speaker = Speaker(
            rate=24000,
            chunk=1024,
            channels=1,
            verbose=20  # Explicit logging level for this diagnostic run
        )

        if speaker.start():
            try:
                print("✓ Speaker test passed")

                # Test with silent audio data
                silent_audio = b'\x00' * 1024  # 1024 bytes of silence
                if speaker.play(silent_audio):
                    print("✓ Audio playback test passed")
                else:
                    print("✗ Audio playback test failed")
            finally:
                # Release the device even if play() raises
                speaker.finish()
        else:
            print("✗ Speaker failed to start")

    except DeepgramSpeakerError as e:
        print(f"✗ Speaker error: {e}")
    except Exception as e:
        # Deliberately broad, for the same reason as in _check_microphone()
        print(f"✗ Unexpected speaker error: {e}")


def _check_pyaudio():
    """Report how many audio devices PyAudio can see, or why it cannot."""
    print("\nTesting PyAudio...")
    try:
        # Imported here rather than at module level so that a missing
        # PyAudio install reaches the installation hint below
        import pyaudio

        p = pyaudio.PyAudio()
        try:
            device_count = p.get_device_count()
            print(f"✓ PyAudio found {device_count} audio devices")
        finally:
            p.terminate()
    except Exception as e:
        print(f"✗ PyAudio error: {e}")
        print("  Try: pip install pyaudio")


def test_audio_system():
    """Test audio system with comprehensive error handling.

    Runs the microphone, speaker, and PyAudio checks in that order. Each
    check prints its own result and handles its own errors, so one failure
    does not stop the others.
    """
    _check_microphone()
    _check_speaker()
    _check_pyaudio()


# Run comprehensive audio test
test_audio_system()

Install with Tessl CLI