0
# Audio Utilities
1
2
Utility classes for audio input/output operations including microphone capture and speaker playback, with configurable audio parameters and error handling. These utilities simplify integration with audio hardware for real-time speech applications.
3
4
## Capabilities
5
6
### Microphone
7
8
Audio input utility for capturing microphone data with configurable parameters and streaming support.
9
10
```python { .api }
11
class Microphone:
    """Audio input utility for capturing microphone data."""

    def __init__(
        self,
        rate: int = INPUT_RATE,
        chunk: int = INPUT_CHUNK,
        channels: int = INPUT_CHANNELS,
        input_device_index: int = None,
        callback: callable = None,
        verbose: int = INPUT_LOGGING,
        **kwargs
    ):
        """
        Initialize microphone capture.

        Args:
            rate: Sample rate in Hz (default: INPUT_RATE, 16000)
            chunk: Buffer size in samples (default: INPUT_CHUNK, 8192)
            channels: Number of audio channels (default: INPUT_CHANNELS, 1)
            input_device_index: Specific input device to use (default: None)
            callback: Callback function for audio data (default: None).
                The usage examples pass a function taking
                (audio_data, frame_count, time_info, status).
            verbose: Logging level (default: INPUT_LOGGING, 10)
            **kwargs: Additional PyAudio parameters
        """

    def start(self) -> bool:
        """
        Start microphone capture.

        Returns:
            bool: True if capture started successfully
        """

    def finish(self) -> bool:
        """
        Stop microphone capture and clean up resources.

        Returns:
            bool: True if capture stopped successfully
        """

    def is_active(self) -> bool:
        """
        Check if microphone is currently capturing.

        Returns:
            bool: True if microphone is active
        """

    def get_stream(self):
        """
        Get the underlying audio stream object.

        Returns:
            PyAudio stream object
        """
66
```
67
68
### Speaker
69
70
Audio output utility for playing audio data with configurable parameters and streaming support.
71
72
```python { .api }
73
class Speaker:
    """Audio output utility for playing audio data."""

    def __init__(
        self,
        rate: int = OUTPUT_RATE,
        chunk: int = OUTPUT_CHUNK,
        channels: int = OUTPUT_CHANNELS,
        output_device_index: int = None,
        verbose: int = OUTPUT_LOGGING,
        **kwargs
    ):
        """
        Initialize speaker playback.

        Args:
            rate: Sample rate in Hz (default: OUTPUT_RATE, 24000)
            chunk: Buffer size in samples (default: OUTPUT_CHUNK, 8192)
            channels: Number of audio channels (default: OUTPUT_CHANNELS, 1)
            output_device_index: Specific output device to use (default: None)
            verbose: Logging level (default: OUTPUT_LOGGING, 10)
            **kwargs: Additional PyAudio parameters
        """

    def start(self) -> bool:
        """
        Start speaker playback.

        Returns:
            bool: True if playback started successfully
        """

    def finish(self) -> bool:
        """
        Stop speaker playback and clean up resources.

        Returns:
            bool: True if playback stopped successfully
        """

    def is_active(self) -> bool:
        """
        Check if speaker is currently playing.

        Returns:
            bool: True if speaker is active
        """

    def play(self, audio_data: bytes) -> bool:
        """
        Play audio data.

        Args:
            audio_data: Raw audio bytes to play

        Returns:
            bool: True if audio was queued successfully
        """

    def get_stream(self):
        """
        Get the underlying audio stream object.

        Returns:
            PyAudio stream object
        """
137
```
138
139
### Audio Constants
140
141
Predefined constants for audio configuration with sensible defaults for speech applications.
142
143
```python { .api }
144
# Microphone/Input Constants
145
INPUT_LOGGING: int = 10 # Logging level
146
INPUT_CHANNELS: int = 1 # Mono audio
147
INPUT_RATE: int = 16000 # 16kHz sample rate
148
INPUT_CHUNK: int = 8192 # 8K samples per chunk
149
150
# Speaker/Output Constants
151
OUTPUT_LOGGING: int = 10 # Logging level
152
OUTPUT_CHANNELS: int = 1 # Mono audio
153
OUTPUT_RATE: int = 24000 # 24kHz sample rate
154
OUTPUT_CHUNK: int = 8192 # 8K samples per chunk
155
OUTPUT_PLAYBACK_DELTA: float = 0.1 # Playback timing delta
156
157
# Legacy aliases (for backward compatibility)
158
LOGGING: int = INPUT_LOGGING
159
CHANNELS: int = INPUT_CHANNELS
160
RATE: int = INPUT_RATE
161
CHUNK: int = INPUT_CHUNK
162
```
163
164
### Error Classes
165
166
Specific exception classes for audio-related errors.
167
168
```python { .api }
169
class DeepgramMicrophoneError(Exception):
    """
    Exception raised for microphone operation errors.

    Covers issues such as device not found, permission denied,
    hardware failures, or configuration problems.
    """


class DeepgramSpeakerError(Exception):
    """
    Exception raised for speaker operation errors.

    Covers issues such as device not found, audio format problems,
    hardware failures, or configuration problems.
    """
184
```
185
186
## Usage Examples
187
188
### Basic Microphone Capture
189
190
```python
191
from deepgram import Microphone, DeepgramMicrophoneError
192
193
try:
    # Create microphone with default settings
    microphone = Microphone()

    # Start capturing
    if microphone.start():
        print("Microphone started successfully")

        try:
            # Check if actively capturing
            if microphone.is_active():
                print("Microphone is capturing audio")
        finally:
            # Stop capturing when done. Running this in `finally` releases
            # the audio device even if the code above raises.
            microphone.finish()
            print("Microphone stopped")
    else:
        print("Failed to start microphone")

except DeepgramMicrophoneError as e:
    print(f"Microphone error: {e}")
213
```
214
215
### Custom Microphone Configuration
216
217
```python
218
from deepgram import Microphone, INPUT_RATE, INPUT_CHUNK, INPUT_CHANNELS
219
220
# Custom configuration for specific use case.
# The settings are named once so the summary printed below cannot drift
# from what is actually passed to Microphone.
rate = 22050    # Higher sample rate than the 16000 Hz default
chunk = 4096    # Smaller buffer than the 8192-sample default, for lower latency
channels = 2    # Stereo input

microphone = Microphone(
    rate=rate,
    chunk=chunk,
    channels=channels,
    input_device_index=1,  # Specific device
    # 20 matches logging.INFO, which is *less* detailed than the default
    # level of 10 (logging.DEBUG); lower numbers log more.
    verbose=20,
)

if microphone.start():
    print("Microphone started with custom settings:")
    print(f" Rate: {rate} Hz")
    print(f" Chunk: {chunk} samples")
    print(f" Channels: {channels}")

    try:
        # Use for a period of time
        ...  # your application logic
    finally:
        # Always release the device, even if the application logic raises
        microphone.finish()
239
```
240
241
### Microphone with Callback
242
243
```python
244
from deepgram import Microphone
245
import queue
246
import threading
247
248
# Audio data queue for processing
audio_queue = queue.Queue()

# Set by the main thread to ask the worker to exit. Ctrl+C raises
# KeyboardInterrupt only in the main thread, so the worker cannot catch it
# itself and needs an explicit signal.
stop_event = threading.Event()


def audio_callback(audio_data, frame_count, time_info, status):
    """Callback function to handle audio data"""
    if status:
        print(f"Audio callback status: {status}")

    # Queue audio data for processing
    audio_queue.put(audio_data)

    return (None, 0)  # Continue recording


# Create microphone with callback
microphone = Microphone(
    callback=audio_callback,
    rate=16000,
    chunk=1024,  # Smaller chunks for more frequent callbacks
)


def process_audio():
    """Process audio data from the queue until stop_event is set"""
    while not stop_event.is_set():
        try:
            # The timeout bounds how long a shutdown request can go unnoticed
            audio_data = audio_queue.get(timeout=1.0)
        except queue.Empty:
            continue

        # Process the audio data
        print(f"Processing {len(audio_data)} bytes of audio")
        # Send to Deepgram, save to file, etc.


# Start audio processing thread
processing_thread = threading.Thread(target=process_audio, daemon=True)
processing_thread.start()

# Start microphone
if microphone.start():
    print("Recording with callback... Press Ctrl+C to stop")
    try:
        while microphone.is_active():
            # Keep the main thread alive (the event is not set yet, so this
            # simply sleeps for 0.1 s without allocating a new Event each time)
            stop_event.wait(0.1)
    except KeyboardInterrupt:
        print("Stopping...")
    finally:
        microphone.finish()

# Ask the worker to exit and wait for it to drain its current item
stop_event.set()
processing_thread.join()
298
```
299
300
### Basic Speaker Playback
301
302
```python
303
import wave

from deepgram import Speaker, DeepgramSpeakerError

try:
    # Create speaker with default settings
    speaker = Speaker()

    # Start playback
    if speaker.start():
        print("Speaker started successfully")

        # Load audio data (example: from file).
        # Read only the PCM frames: the raw bytes of a .wav file begin with
        # a RIFF header, which would otherwise be played as audio samples.
        # The file should match the speaker settings (24000 Hz, mono by
        # default).
        with wave.open("audio.wav", "rb") as wav_file:
            audio_data = wav_file.readframes(wav_file.getnframes())

        # Play the audio
        if speaker.play(audio_data):
            print("Audio queued for playback")

        # Wait for playback to complete or stop manually
        # speaker.finish() when done

    else:
        print("Failed to start speaker")

except DeepgramSpeakerError as e:
    print(f"Speaker error: {e}")
329
```
330
331
### Custom Speaker Configuration
332
333
```python
334
import wave

from deepgram import Speaker

# High-quality audio playback configuration
speaker = Speaker(
    rate=48000,  # High sample rate
    chunk=2048,  # Smaller chunks for lower latency
    channels=2,  # Stereo output
    # Device index 0; this is not necessarily the system default output.
    # See "Device Discovery and Selection" to list the available indices.
    output_device_index=0,
    verbose=10,  # Standard logging
)

if speaker.start():
    print("High-quality speaker started")

    # Play multiple audio clips (each should be 48000 Hz stereo to match
    # the speaker configuration above)
    audio_files = ["intro.wav", "content.wav", "outro.wav"]

    try:
        for filename in audio_files:
            # Read only the PCM frames; the .wav header is not audio data
            with wave.open(filename, "rb") as wav_file:
                audio_data = wav_file.readframes(wav_file.getnframes())

            print(f"Playing {filename}")
            speaker.play(audio_data)

            # Wait between clips if needed
            # time.sleep(0.5)
    finally:
        # Clean up, even if one of the files could not be read
        speaker.finish()
363
```
364
365
### Integrated Microphone and Speaker
366
367
```python
368
from deepgram import Microphone, Speaker, DeepgramClient
369
import threading
370
import queue
371
372
# Audio processing setup
client = DeepgramClient(api_key="your-api-key")
audio_queue = queue.Queue()
text_queue = queue.Queue()  # Holds synthesized audio awaiting playback

# Set by the main thread to stop both workers. Ctrl+C raises
# KeyboardInterrupt only in the main thread, so the workers need an
# explicit shutdown signal instead of their own KeyboardInterrupt handlers.
stop_event = threading.Event()


def microphone_callback(audio_data, frame_count, time_info, status):
    """Capture audio data"""
    audio_queue.put(audio_data)
    return (None, 0)


def process_speech():
    """Process speech-to-text and text-to-speech until stop_event is set"""
    while not stop_event.is_set():
        try:
            # Get audio from microphone
            audio_data = audio_queue.get(timeout=1.0)
        except queue.Empty:
            continue

        # Send to Deepgram STT (simplified example)
        # In practice, you'd use WebSocket for real-time
        response = client.listen.rest.transcribe(
            {"buffer": audio_data},
            {"model": "nova-2", "interim_results": True}
        )

        text = response.results.channels[0].alternatives[0].transcript
        if text.strip():
            print(f"Heard: {text}")

            # Generate response (example)
            response_text = f"You said: {text}"

            # Convert to speech
            tts_response = client.speak.rest.synthesize(
                {"text": response_text},
                {"model": "aura-asteria-en"}
            )

            # Queue for playback
            text_queue.put(tts_response.content)


def play_responses():
    """Play TTS responses until stop_event is set, then release the speaker"""
    speaker = Speaker()
    if not speaker.start():
        return

    try:
        while not stop_event.is_set():
            try:
                audio_data = text_queue.get(timeout=1.0)
            except queue.Empty:
                continue
            speaker.play(audio_data)
    finally:
        # Runs on normal shutdown and on errors alike
        speaker.finish()


# Set up microphone
microphone = Microphone(callback=microphone_callback)

# Start processing threads
speech_thread = threading.Thread(target=process_speech, daemon=True)
playback_thread = threading.Thread(target=play_responses, daemon=True)

speech_thread.start()
playback_thread.start()

# Start microphone
if microphone.start():
    print("Voice interaction started. Speak and hear responses...")
    try:
        while True:
            # The event is only set after this loop exits, so this sleeps
            stop_event.wait(0.1)
    except KeyboardInterrupt:
        print("Stopping voice interaction...")
    finally:
        microphone.finish()

# Signal the workers and give them a bounded time to finish. They are
# daemon threads, so a worker stuck in a network call cannot block exit.
stop_event.set()
speech_thread.join(timeout=5.0)
playback_thread.join(timeout=5.0)
454
```
455
456
### Device Discovery and Selection
457
458
```python
459
import pyaudio
460
from deepgram import Microphone, Speaker
461
462
def list_audio_devices():
    """List available audio input and output devices"""
    p = pyaudio.PyAudio()
    try:
        print("Available Audio Devices:")
        print("=" * 50)

        for i in range(p.get_device_count()):
            info = p.get_device_info_by_index(i)
            print(f"Device {i}: {info['name']}")
            print(f" Max Input Channels: {info['maxInputChannels']}")
            print(f" Max Output Channels: {info['maxOutputChannels']}")
            print(f" Default Sample Rate: {info['defaultSampleRate']}")
            print()
    finally:
        # Release PyAudio even if querying a device raises
        p.terminate()
478
479
def use_specific_devices():
    """Use specific audio devices"""
    list_audio_devices()

    # Use specific devices based on discovery
    input_device = 1   # Replace with desired input device index
    output_device = 2  # Replace with desired output device index

    microphone = Microphone(
        input_device_index=input_device,
        rate=16000,
        channels=1
    )

    speaker = Speaker(
        output_device_index=output_device,
        rate=24000,
        channels=1
    )

    print(f"Using input device {input_device} and output device {output_device}")

    # Use the configured devices. Each device is started and released
    # separately so a speaker failure cannot leave the microphone running.
    if not microphone.start():
        print("Failed to start microphone")
        return

    try:
        if not speaker.start():
            print("Failed to start speaker")
            return

        try:
            print("Both devices started successfully")
            # ... use devices ...
        finally:
            speaker.finish()
    finally:
        microphone.finish()


# Run device discovery
use_specific_devices()
510
```
511
512
### Error Handling and Diagnostics
513
514
```python
515
from deepgram import Microphone, Speaker, DeepgramMicrophoneError, DeepgramSpeakerError
516
import pyaudio
517
518
def test_audio_system():
    """Test audio system with comprehensive error handling"""

    # Test microphone
    print("Testing microphone...")
    try:
        microphone = Microphone(
            rate=16000,
            chunk=1024,
            channels=1,
            # 10 matches logging.DEBUG, the most detailed standard level.
            # Lower numbers log more, so 20 (INFO) would be quieter.
            verbose=10,
        )

        if microphone.start():
            print("✓ Microphone test passed")
            microphone.finish()
        else:
            print("✗ Microphone failed to start")

    except DeepgramMicrophoneError as e:
        print(f"✗ Microphone error: {e}")
    except Exception as e:
        print(f"✗ Unexpected microphone error: {e}")

    # Test speaker
    print("\nTesting speaker...")
    try:
        speaker = Speaker(
            rate=24000,
            chunk=1024,
            channels=1,
            verbose=10,  # logging.DEBUG: detailed logging for debugging
        )

        if speaker.start():
            print("✓ Speaker test passed")

            try:
                # Test with silent audio data
                silent_audio = b'\x00' * 1024  # 1024 bytes of silence
                if speaker.play(silent_audio):
                    print("✓ Audio playback test passed")
                else:
                    print("✗ Audio playback test failed")
            finally:
                # Release the device even if playback raises
                speaker.finish()
        else:
            print("✗ Speaker failed to start")

    except DeepgramSpeakerError as e:
        print(f"✗ Speaker error: {e}")
    except Exception as e:
        print(f"✗ Unexpected speaker error: {e}")

    # Test PyAudio availability
    print("\nTesting PyAudio...")
    try:
        p = pyaudio.PyAudio()
        try:
            device_count = p.get_device_count()
            print(f"✓ PyAudio found {device_count} audio devices")
        finally:
            # Release PyAudio even if the device query raises
            p.terminate()
    except Exception as e:
        print(f"✗ PyAudio error: {e}")
        print(" Try: pip install pyaudio")


# Run comprehensive audio test
test_audio_system()
584
```