# Audio Utilities

Utility classes for audio input/output operations, including microphone capture and speaker playback, with configurable audio parameters and error handling. These utilities simplify integration with audio hardware for real-time speech applications.

## Capabilities

### Microphone

Audio input utility for capturing microphone data with configurable parameters and streaming support.

```python { .api }
class Microphone:
    def __init__(
        self,
        rate: int = INPUT_RATE,
        chunk: int = INPUT_CHUNK,
        channels: int = INPUT_CHANNELS,
        input_device_index: int = None,
        callback: callable = None,
        verbose: int = INPUT_LOGGING,
        **kwargs
    ):
        """
        Initialize microphone capture.

        Args:
            rate: Sample rate in Hz (default: 16000)
            chunk: Buffer size in samples (default: 8192)
            channels: Number of audio channels (default: 1)
            input_device_index: Specific input device to use
            callback: Callback function for audio data
            verbose: Logging level
            **kwargs: Additional PyAudio parameters
        """

    def start(self) -> bool:
        """
        Start microphone capture.

        Returns:
            bool: True if capture started successfully
        """

    def finish(self) -> bool:
        """
        Stop microphone capture and clean up resources.

        Returns:
            bool: True if capture stopped successfully
        """

    def is_active(self) -> bool:
        """
        Check if microphone is currently capturing.

        Returns:
            bool: True if microphone is active
        """

    def get_stream(self):
        """
        Get the underlying audio stream object.

        Returns:
            PyAudio stream object
        """
```
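
A minimal sketch of how `get_stream()` might be used to inspect the underlying stream after capture starts, assuming (as the docstring above states) that it returns a standard PyAudio stream object:

```python
from deepgram import Microphone

microphone = Microphone()
if microphone.start():
    # get_stream() exposes the underlying PyAudio stream for low-level checks
    stream = microphone.get_stream()
    if stream is not None:
        print(f"PyAudio stream active: {stream.is_active()}")
    microphone.finish()
```

Most applications never need the raw stream; the `start()`/`finish()` lifecycle methods are usually sufficient.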

### Speaker

Audio output utility for playing audio data with configurable parameters and streaming support.

```python { .api }
class Speaker:
    def __init__(
        self,
        rate: int = OUTPUT_RATE,
        chunk: int = OUTPUT_CHUNK,
        channels: int = OUTPUT_CHANNELS,
        output_device_index: int = None,
        verbose: int = OUTPUT_LOGGING,
        **kwargs
    ):
        """
        Initialize speaker playback.

        Args:
            rate: Sample rate in Hz (default: 24000)
            chunk: Buffer size in samples (default: 8192)
            channels: Number of audio channels (default: 1)
            output_device_index: Specific output device to use
            verbose: Logging level
            **kwargs: Additional PyAudio parameters
        """

    def start(self) -> bool:
        """
        Start speaker playback.

        Returns:
            bool: True if playback started successfully
        """

    def finish(self) -> bool:
        """
        Stop speaker playback and clean up resources.

        Returns:
            bool: True if playback stopped successfully
        """

    def is_active(self) -> bool:
        """
        Check if speaker is currently playing.

        Returns:
            bool: True if speaker is active
        """

    def play(self, audio_data: bytes) -> bool:
        """
        Play audio data.

        Args:
            audio_data: Raw audio bytes to play

        Returns:
            bool: True if audio was queued successfully
        """

    def get_stream(self):
        """
        Get the underlying audio stream object.

        Returns:
            PyAudio stream object
        """
```

### Audio Constants

Predefined constants for audio configuration with sensible defaults for speech applications.

```python { .api }
# Microphone/Input Constants
INPUT_LOGGING: int = 10     # Logging level
INPUT_CHANNELS: int = 1     # Mono audio
INPUT_RATE: int = 16000     # 16kHz sample rate
INPUT_CHUNK: int = 8192     # 8K samples per chunk

# Speaker/Output Constants
OUTPUT_LOGGING: int = 10    # Logging level
OUTPUT_CHANNELS: int = 1    # Mono audio
OUTPUT_RATE: int = 24000    # 24kHz sample rate
OUTPUT_CHUNK: int = 8192    # 8K samples per chunk
OUTPUT_PLAYBACK_DELTA: float = 0.1  # Playback timing delta

# Legacy aliases (for backward compatibility)
LOGGING: int = INPUT_LOGGING
CHANNELS: int = INPUT_CHANNELS
RATE: int = INPUT_RATE
CHUNK: int = INPUT_CHUNK
```
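
These constants mirror the keyword defaults on `Microphone` and `Speaker`, so passing them explicitly is equivalent to the no-argument constructors. A brief sketch, assuming the `OUTPUT_*` constants are importable from `deepgram` alongside the `INPUT_*` constants used in the configuration example below:

```python
from deepgram import Microphone, Speaker, INPUT_RATE, INPUT_CHUNK, INPUT_CHANNELS, OUTPUT_RATE

# Equivalent to Microphone(): the INPUT_* constants are the constructor defaults
microphone = Microphone(rate=INPUT_RATE, chunk=INPUT_CHUNK, channels=INPUT_CHANNELS)

# Speaker defaults to a higher 24 kHz rate, matching typical TTS output
speaker = Speaker(rate=OUTPUT_RATE)
```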

### Error Classes

Specific exception classes for audio-related errors.

```python { .api }
class DeepgramMicrophoneError(Exception):
    """
    Exception raised for microphone operation errors.

    Covers issues like device not found, permission denied,
    hardware failures, or configuration problems.
    """

class DeepgramSpeakerError(Exception):
    """
    Exception raised for speaker operation errors.

    Covers issues like device not found, audio format problems,
    hardware failures, or configuration problems.
    """
```
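
Both exceptions derive from `Exception`, so they can be caught individually or together; the diagnostics example at the end of this page exercises them in full. A minimal sketch:

```python
from deepgram import Microphone, Speaker, DeepgramMicrophoneError, DeepgramSpeakerError

try:
    microphone = Microphone()
    speaker = Speaker()
    microphone.start()
    speaker.start()
except DeepgramMicrophoneError as e:
    print(f"Microphone problem (device, permissions, or configuration): {e}")
except DeepgramSpeakerError as e:
    print(f"Speaker problem (device, format, or configuration): {e}")
```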

## Usage Examples

### Basic Microphone Capture

```python
from deepgram import Microphone, DeepgramMicrophoneError

try:
    # Create microphone with default settings
    microphone = Microphone()

    # Start capturing
    if microphone.start():
        print("Microphone started successfully")

        # Check if actively capturing
        if microphone.is_active():
            print("Microphone is capturing audio")

        # Stop capturing when done
        microphone.finish()
        print("Microphone stopped")
    else:
        print("Failed to start microphone")

except DeepgramMicrophoneError as e:
    print(f"Microphone error: {e}")
```

### Custom Microphone Configuration

```python
from deepgram import Microphone, INPUT_RATE, INPUT_CHUNK, INPUT_CHANNELS

# Custom configuration for specific use case
microphone = Microphone(
    rate=22050,            # Higher sample rate
    chunk=4096,            # Smaller buffer for lower latency
    channels=2,            # Stereo input
    input_device_index=1,  # Specific device
    verbose=20             # More verbose logging
)

if microphone.start():
    print("Microphone started with custom settings:")
    print("  Rate: 22050 Hz")
    print("  Chunk: 4096 samples")
    print("  Channels: 2")

    # Use for a period of time
    # ... your application logic ...

    microphone.finish()
```

### Microphone with Callback

```python
from deepgram import Microphone
import queue
import threading

# Audio data queue for processing
audio_queue = queue.Queue()

def audio_callback(audio_data, frame_count, time_info, status):
    """Callback function to handle audio data"""
    if status:
        print(f"Audio callback status: {status}")

    # Queue audio data for processing
    audio_queue.put(audio_data)

    return (None, 0)  # Continue recording

# Create microphone with callback
microphone = Microphone(
    callback=audio_callback,
    rate=16000,
    chunk=1024  # Smaller chunks for more frequent callbacks
)

def process_audio():
    """Process audio data from queue"""
    while True:
        try:
            audio_data = audio_queue.get(timeout=1.0)
            # Process the audio data
            print(f"Processing {len(audio_data)} bytes of audio")
            # Send to Deepgram, save to file, etc.

        except queue.Empty:
            continue
        except KeyboardInterrupt:
            break

# Start audio processing thread
processing_thread = threading.Thread(target=process_audio)
processing_thread.daemon = True
processing_thread.start()

# Start microphone
if microphone.start():
    print("Recording with callback... Press Ctrl+C to stop")
    try:
        while microphone.is_active():
            # Keep the main thread alive
            threading.Event().wait(0.1)
    except KeyboardInterrupt:
        print("Stopping...")
    finally:
        microphone.finish()
```

### Basic Speaker Playback

```python
from deepgram import Speaker, DeepgramSpeakerError

try:
    # Create speaker with default settings
    speaker = Speaker()

    # Start playback
    if speaker.start():
        print("Speaker started successfully")

        # Load audio data (example: from file)
        with open("audio.wav", "rb") as f:
            audio_data = f.read()

        # Play the audio
        if speaker.play(audio_data):
            print("Audio queued for playback")

        # Wait for playback to complete or stop manually
        # speaker.finish() when done

    else:
        print("Failed to start speaker")

except DeepgramSpeakerError as e:
    print(f"Speaker error: {e}")
```

### Custom Speaker Configuration

```python
from deepgram import Speaker

# High-quality audio playback configuration
speaker = Speaker(
    rate=48000,             # High sample rate
    chunk=2048,             # Smaller chunks for lower latency
    channels=2,             # Stereo output
    output_device_index=0,  # Default output device
    verbose=10              # Standard logging
)

if speaker.start():
    print("High-quality speaker started")

    # Play multiple audio clips
    audio_files = ["intro.wav", "content.wav", "outro.wav"]

    for filename in audio_files:
        with open(filename, "rb") as f:
            audio_data = f.read()

        print(f"Playing {filename}")
        speaker.play(audio_data)

        # Wait between clips if needed
        # time.sleep(0.5)

    # Clean up
    speaker.finish()
```

### Integrated Microphone and Speaker

```python
from deepgram import Microphone, Speaker, DeepgramClient
import threading
import queue

# Audio processing setup
client = DeepgramClient(api_key="your-api-key")
audio_queue = queue.Queue()
text_queue = queue.Queue()

def microphone_callback(audio_data, frame_count, time_info, status):
    """Capture audio data"""
    audio_queue.put(audio_data)
    return (None, 0)

def process_speech():
    """Process speech-to-text and text-to-speech"""
    while True:
        try:
            # Get audio from microphone
            audio_data = audio_queue.get(timeout=1.0)

            # Send to Deepgram STT (simplified example)
            # In practice, you'd use WebSocket for real-time
            response = client.listen.rest.transcribe(
                {"buffer": audio_data},
                {"model": "nova-2", "interim_results": True}
            )

            text = response.results.channels[0].alternatives[0].transcript
            if text.strip():
                print(f"Heard: {text}")

                # Generate response (example)
                response_text = f"You said: {text}"

                # Convert to speech
                tts_response = client.speak.rest.synthesize(
                    {"text": response_text},
                    {"model": "aura-asteria-en"}
                )

                # Queue for playback
                text_queue.put(tts_response.content)

        except queue.Empty:
            continue
        except KeyboardInterrupt:
            break

def play_responses():
    """Play TTS responses"""
    speaker = Speaker()
    if speaker.start():
        while True:
            try:
                audio_data = text_queue.get(timeout=1.0)
                speaker.play(audio_data)
            except queue.Empty:
                continue
            except KeyboardInterrupt:
                break
        speaker.finish()

# Set up microphone
microphone = Microphone(callback=microphone_callback)

# Start processing threads
speech_thread = threading.Thread(target=process_speech)
playback_thread = threading.Thread(target=play_responses)

speech_thread.daemon = True
playback_thread.daemon = True

speech_thread.start()
playback_thread.start()

# Start microphone
if microphone.start():
    print("Voice interaction started. Speak and hear responses...")
    try:
        while True:
            threading.Event().wait(0.1)
    except KeyboardInterrupt:
        print("Stopping voice interaction...")
    finally:
        microphone.finish()
```

### Device Discovery and Selection

```python
import pyaudio
from deepgram import Microphone, Speaker

def list_audio_devices():
    """List available audio input and output devices"""
    p = pyaudio.PyAudio()

    print("Available Audio Devices:")
    print("=" * 50)

    for i in range(p.get_device_count()):
        info = p.get_device_info_by_index(i)
        print(f"Device {i}: {info['name']}")
        print(f"  Max Input Channels: {info['maxInputChannels']}")
        print(f"  Max Output Channels: {info['maxOutputChannels']}")
        print(f"  Default Sample Rate: {info['defaultSampleRate']}")
        print()

    p.terminate()

def use_specific_devices():
    """Use specific audio devices"""
    list_audio_devices()

    # Use specific devices based on discovery
    input_device = 1   # Replace with desired input device index
    output_device = 2  # Replace with desired output device index

    microphone = Microphone(
        input_device_index=input_device,
        rate=16000,
        channels=1
    )

    speaker = Speaker(
        output_device_index=output_device,
        rate=24000,
        channels=1
    )

    print(f"Using input device {input_device} and output device {output_device}")

    # Use the configured devices
    if microphone.start() and speaker.start():
        print("Both devices started successfully")
        # ... use devices ...
        microphone.finish()
        speaker.finish()

# Run device discovery
use_specific_devices()
```

### Error Handling and Diagnostics

```python
from deepgram import Microphone, Speaker, DeepgramMicrophoneError, DeepgramSpeakerError
import pyaudio

def test_audio_system():
    """Test audio system with comprehensive error handling"""

    # Test microphone
    print("Testing microphone...")
    try:
        microphone = Microphone(
            rate=16000,
            chunk=1024,
            channels=1,
            verbose=20  # Verbose logging for debugging
        )

        if microphone.start():
            print("✓ Microphone test passed")
            microphone.finish()
        else:
            print("✗ Microphone failed to start")

    except DeepgramMicrophoneError as e:
        print(f"✗ Microphone error: {e}")
    except Exception as e:
        print(f"✗ Unexpected microphone error: {e}")

    # Test speaker
    print("\nTesting speaker...")
    try:
        speaker = Speaker(
            rate=24000,
            chunk=1024,
            channels=1,
            verbose=20  # Verbose logging for debugging
        )

        if speaker.start():
            print("✓ Speaker test passed")

            # Test with silent audio data
            silent_audio = b'\x00' * 1024  # 1024 bytes of silence
            if speaker.play(silent_audio):
                print("✓ Audio playback test passed")
            else:
                print("✗ Audio playback test failed")

            speaker.finish()
        else:
            print("✗ Speaker failed to start")

    except DeepgramSpeakerError as e:
        print(f"✗ Speaker error: {e}")
    except Exception as e:
        print(f"✗ Unexpected speaker error: {e}")

    # Test PyAudio availability
    print("\nTesting PyAudio...")
    try:
        p = pyaudio.PyAudio()
        device_count = p.get_device_count()
        print(f"✓ PyAudio found {device_count} audio devices")
        p.terminate()
    except Exception as e:
        print(f"✗ PyAudio error: {e}")
        print("  Try: pip install pyaudio")

# Run comprehensive audio test
test_audio_system()
```