or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

audio-utilities.md, conversational-ai.md, index.md, project-management.md, speech-to-text.md, text-analysis.md, text-to-speech.md

docs/speech-to-text.md

0

# Speech-to-Text

1

2

Comprehensive speech recognition capabilities supporting both batch transcription of prerecorded audio and real-time streaming transcription. The Listen module provides advanced features like speaker diarization, punctuation, profanity filtering, keyword detection, sentiment analysis, and support for multiple languages and audio formats.

3

4

## Capabilities

5

6

### REST Client (Prerecorded Audio)

7

8

Synchronous client for transcribing prerecorded audio files with comprehensive configuration options and detailed transcription results.

9

10

```python { .api }

11

class ListenRESTClient:

12

def transcribe_url(

13

self,

14

source: UrlSource,

15

options: ListenRESTOptions = None,

16

headers: dict = None,

17

timeout = None

18

) -> PrerecordedResponse:

19

"""

20

Transcribe audio from URL.

21

22

Args:

23

source: URL source containing audio to transcribe

24

options: Transcription configuration options

25

headers: Additional HTTP headers

26

timeout: Request timeout

27

28

Returns:

29

PrerecordedResponse: Complete transcription results with metadata

30

"""

31

32

def transcribe_file(

33

self,

34

source: FileSource,

35

options: ListenRESTOptions = None,

36

headers: dict = None,

37

timeout = None

38

) -> PrerecordedResponse:

39

"""

40

Transcribe audio from file.

41

42

Args:

43

source: File source containing audio to transcribe

44

options: Transcription configuration options

45

headers: Additional HTTP headers

46

timeout: Request timeout

47

48

Returns:

49

PrerecordedResponse: Complete transcription results with metadata

50

"""

51

52

def transcribe_url_callback(

53

self,

54

source: UrlSource,

55

callback: str,

56

options: ListenRESTOptions = None,

57

headers: dict = None,

58

timeout = None

59

) -> AsyncPrerecordedResponse:

60

"""

61

Transcribe audio from URL with callback URL for results.

62

63

Args:

64

source: URL source containing audio to transcribe

65

callback: Callback URL to receive transcription results

66

options: Transcription configuration options

67

headers: Additional HTTP headers

68

timeout: Request timeout

69

70

Returns:

71

AsyncPrerecordedResponse: Async response for callback processing

72

"""

73

74

def transcribe_file_callback(

75

self,

76

source: FileSource,

77

callback: str,

78

options: ListenRESTOptions = None,

79

headers: dict = None,

80

timeout = None

81

) -> AsyncPrerecordedResponse:

82

"""

83

Transcribe audio from file with callback URL for results.

84

85

Args:

86

source: File source containing audio to transcribe

87

callback: Callback URL to receive transcription results

88

options: Transcription configuration options

89

headers: Additional HTTP headers

90

timeout: Request timeout

91

92

Returns:

93

AsyncPrerecordedResponse: Async response for callback processing

94

"""

95

96

class AsyncListenRESTClient:

97

async def transcribe_url(

98

self,

99

source: UrlSource,

100

options: ListenRESTOptions = None,

101

headers: dict = None,

102

timeout = None

103

) -> AsyncPrerecordedResponse:

104

"""Async version of transcribe_url method"""

105

106

async def transcribe_file(

107

self,

108

source: FileSource,

109

options: ListenRESTOptions = None,

110

headers: dict = None,

111

timeout = None

112

) -> AsyncPrerecordedResponse:

113

"""Async version of transcribe_file method"""

114

115

async def transcribe_url_callback(

116

self,

117

source: UrlSource,

118

callback: str,

119

options: ListenRESTOptions = None,

120

headers: dict = None,

121

timeout = None

122

) -> AsyncPrerecordedResponse:

123

"""Async version of transcribe_url_callback method"""

124

125

async def transcribe_file_callback(

126

self,

127

source: FileSource,

128

callback: str,

129

options: ListenRESTOptions = None,

130

headers: dict = None,

131

timeout = None

132

) -> AsyncPrerecordedResponse:

133

"""Async version of transcribe_file_callback method"""

134

```

135

136

### WebSocket Client (Real-time Audio)

137

138

Real-time streaming transcription client supporting live audio processing with configurable buffering and result handling.

139

140

```python { .api }

141

class ListenWebSocketClient:

142

def start(self, options: ListenWebSocketOptions) -> bool:

143

"""

144

Start WebSocket connection for real-time transcription.

145

146

Args:

147

options: WebSocket configuration options

148

149

Returns:

150

bool: True if connection started successfully

151

"""

152

153

def send(self, data: bytes) -> bool:

154

"""

155

Send audio data for transcription.

156

157

Args:

158

data: Raw audio bytes

159

160

Returns:

161

bool: True if data sent successfully

162

"""

163

164

def finish(self) -> bool:

165

"""

166

Signal end of audio stream and receive final results.

167

168

Returns:

169

bool: True if stream finished successfully

170

"""

171

172

def close(self) -> bool:

173

"""

174

Close WebSocket connection.

175

176

Returns:

177

bool: True if connection closed successfully

178

"""

179

180

class AsyncListenWebSocketClient:

181

async def start(self, options: ListenWebSocketOptions) -> bool: ...

182

async def send(self, data: bytes) -> bool: ...

183

async def finish(self) -> bool: ...

184

async def close(self) -> bool: ...

185

```

186

187

### Router Access

188

189

Access speech-to-text clients through the main client's listen router.

190

191

```python { .api }

192

class ListenRouter:

193

@property

194

def rest(self) -> ListenRESTClient: ...

195

@property

196

def asyncrest(self) -> AsyncListenRESTClient: ...

197

@property

198

def websocket(self) -> ListenWebSocketClient: ...

199

@property

200

def asyncwebsocket(self) -> AsyncListenWebSocketClient: ...

201

```

202

203

### Options Classes

204

205

#### REST Options

206

207

```python { .api }

208

class ListenRESTOptions:

209

def __init__(self, **kwargs): ...

210

211

# Model and language settings

212

model: str = "nova-2" # AI model for transcription

213

language: str = "en-US" # Language code

214

version: str = None # Model version

215

216

# Audio processing

217

encoding: str = None # Audio encoding format

218

sample_rate: int = None # Audio sample rate

219

channels: int = None # Number of audio channels

220

221

# Transcription features

222

punctuate: bool = True # Add punctuation

223

profanity_filter: bool = False # Filter profanity

224

redact: list = None # Redact sensitive information

225

diarize: bool = False # Speaker diarization

226

diarize_version: str = None # Diarization model version

227

ner: bool = False # Named entity recognition

228

multichannel: bool = False # Process multiple channels separately

229

alternatives: int = 1 # Number of transcript alternatives

230

numerals: bool = False # Convert numbers to numerals

231

smart_format: bool = False # Smart formatting

232

233

# Analysis features

234

summarize: bool | str = False # Generate summary (pass "v2" to select the V2 summarization model)

235

detect_language: bool = False # Auto-detect language

236

paragraphs: bool = False # Paragraph detection

237

utterances: bool = False # Utterance segmentation

238

utt_split: float = None # Utterance split threshold

239

sentiment: bool = False # Sentiment analysis

240

topics: bool = False # Topic detection

241

intents: bool = False # Intent recognition

242

243

# Keywords and search

244

keywords: list = None # Keyword detection

245

keyword_boost: str = None # Keyword boosting

246

search: list = None # Search terms

247

replace: list = None # Text replacement

248

249

# Output formatting

250

filler_words: bool = False # Include filler words

251

dictation: bool = False # Dictation mode

252

measurements: bool = False # Measurement formatting

253

dates: bool = False # Date formatting

254

times: bool = False # Time formatting

255

256

# Callback and metadata

257

callback: str = None # Webhook callback URL

258

callback_method: str = "POST" # Callback HTTP method

259

custom_intent: list = None # Custom intent models

260

custom_intent_mode: str = None # Custom intent processing mode

261

custom_topic: list = None # Custom topic models

262

custom_topic_mode: str = None # Custom topic processing mode

263

264

# Advanced options

265

tag: list = None # Custom tags

266

extra: dict = None # Additional options

267

```

268

269

#### WebSocket Options

270

271

```python { .api }

272

class ListenWebSocketOptions:

273

def __init__(self, **kwargs): ...

274

275

# Model and language settings

276

model: str = "nova-2" # AI model for transcription

277

language: str = "en-US" # Language code

278

version: str = None # Model version

279

280

# Audio settings (required for WebSocket)

281

encoding: str = "linear16" # Audio encoding

282

sample_rate: int = 16000 # Sample rate in Hz

283

channels: int = 1 # Number of channels

284

285

# Real-time processing

286

interim_results: bool = True # Receive interim results

287

endpointing: bool = True # Automatic endpoint detection

288

vad_events: bool = False # Voice activity detection events

289

utterance_end_ms: int = 1000 # Utterance end timeout

290

291

# Transcription features (same as REST)

292

punctuate: bool = True

293

profanity_filter: bool = False

294

redact: list = None

295

diarize: bool = False

296

diarize_version: str = None

297

ner: bool = False

298

alternatives: int = 1

299

numerals: bool = False

300

smart_format: bool = False

301

302

# Analysis features

303

sentiment: bool = False

304

topics: bool = False

305

intents: bool = False

306

307

# Keywords and search

308

keywords: list = None

309

keyword_boost: str = None

310

search: list = None

311

replace: list = None

312

313

# Output options

314

filler_words: bool = False

315

dictation: bool = False

316

measurements: bool = False

317

dates: bool = False

318

times: bool = False

319

320

# Custom models

321

custom_intent: list = None

322

custom_intent_mode: str = None

323

custom_topic: list = None

324

custom_topic_mode: str = None

325

326

# Advanced options

327

tag: list = None

328

extra: dict = None

329

```

330

331

### Source Types

332

333

Input sources for audio data in various formats.

334

335

```python { .api }

336

class PrerecordedSource:

337

"""Base class for prerecorded audio sources"""

338

339

class UrlSource(PrerecordedSource):

340

def __init__(self, url: str):

341

"""

342

Audio from URL.

343

344

Args:

345

url: HTTP/HTTPS URL to audio file

346

"""

347

348

class FileSource(PrerecordedSource):

349

def __init__(self, file: str):

350

"""

351

Audio from local file.

352

353

Args:

354

file: Path to local audio file

355

"""

356

357

class BufferSource(PrerecordedSource):

358

def __init__(self, buffer: bytes):

359

"""

360

Audio from byte buffer.

361

362

Args:

363

buffer: Raw audio bytes

364

"""

365

366

class StreamSource(PrerecordedSource):

367

def __init__(self, stream):

368

"""

369

Audio from stream object.

370

371

Args:

372

stream: File-like stream object

373

"""

374

375

class PreRecordedStreamSource(PrerecordedSource):

376

"""Legacy stream source alias"""

377

378

class ListenRestSource(PrerecordedSource):

379

"""REST-specific source type"""

380

```

381

382

### Response Types

383

384

#### REST Response Types

385

386

```python { .api }

387

class PrerecordedResponse:

388

"""Main prerecorded transcription response"""

389

metadata: ListenRESTMetadata

390

results: ListenRESTResults

391

392

class AsyncPrerecordedResponse(PrerecordedResponse):

393

"""Async prerecorded response"""

394

395

class SyncPrerecordedResponse(PrerecordedResponse):

396

"""Sync prerecorded response"""

397

398

class ListenRESTMetadata:

399

"""REST transcription metadata"""

400

request_id: str

401

transaction_key: str

402

sha256: str

403

created: str

404

duration: float

405

channels: int

406

models: list

407

model_info: dict

408

409

class ListenRESTResults:

410

"""REST transcription results"""

411

channels: list[ListenRESTChannel]

412

utterances: list[Utterance] = None

413

summary: dict = None

414

415

class ListenRESTChannel:

416

"""Channel-specific transcription results"""

417

search: list[Search] = None

418

alternatives: list[ListenRESTAlternative]

419

420

class ListenRESTAlternative:

421

"""Alternative transcription result"""

422

transcript: str

423

confidence: float

424

words: list[ListenRESTWord]

425

paragraphs: Paragraphs = None

426

entities: list[Entity] = None

427

translations: list[Translation] = None

428

summaries: list[Summaries] = None

429

430

class ListenRESTWord:

431

"""Word-level transcription data"""

432

word: str

433

start: float

434

end: float

435

confidence: float

436

punctuated_word: str = None

437

speaker: int = None

438

speaker_confidence: float = None

439

language: str = None

440

```

441

442

#### WebSocket Response Types

443

444

```python { .api }

445

class LiveResultResponse:

446

"""Live transcription result"""

447

channel: ListenWSChannel

448

metadata: ListenWSMetadata

449

type: str

450

451

class ListenWSMetadataResponse:

452

"""WebSocket metadata response"""

453

type: str

454

transaction_key: str

455

request_id: str

456

sha256: str

457

created: str

458

duration: float

459

channels: int

460

461

class SpeechStartedResponse:

462

"""Speech detection event"""

463

type: str

464

timestamp: str

465

466

class UtteranceEndResponse:

467

"""Utterance completion event"""

468

type: str

469

channel: list

470

last_word_end: float

471

472

class ListenWSChannel:

473

"""WebSocket channel data"""

474

alternatives: list[ListenWSAlternative]

475

476

class ListenWSAlternative:

477

"""WebSocket alternative transcript"""

478

transcript: str

479

confidence: float

480

words: list[ListenWSWord]

481

482

class ListenWSWord:

483

"""WebSocket word-level data"""

484

word: str

485

start: float

486

end: float

487

confidence: float

488

punctuated_word: str = None

489

speaker: int = None

490

speaker_confidence: float = None

491

492

class ListenWSMetadata:

493

"""WebSocket connection metadata"""

494

request_id: str

495

model_name: str

496

model_uuid: str

497

```

498

499

#### Common Response Elements

500

501

```python { .api }

502

class Entity:

503

"""Named entity recognition result"""

504

label: str

505

value: str

506

confidence: float

507

start_word: int

508

end_word: int

509

510

class Paragraph:

511

"""Paragraph structure"""

512

sentences: list[Sentence]

513

start: float

514

end: float

515

516

class Paragraphs:

517

"""Collection of paragraphs"""

518

transcript: str

519

paragraphs: list[Paragraph]

520

521

class Sentence:

522

"""Sentence structure"""

523

text: str

524

start: float

525

end: float

526

527

class Utterance:

528

"""Speaker utterance"""

529

start: float

530

end: float

531

confidence: float

532

channel: int

533

transcript: str

534

words: list[ListenRESTWord]

535

speaker: int

536

id: str

537

538

class Translation:

539

"""Translation result"""

540

language: str

541

translation: str

542

543

class Warning:

544

"""Processing warning"""

545

parameter: str

546

type: str

547

message: str

548

549

class Summaries:

550

"""Summary collection"""

551

summary: str

552

start_word: int

553

end_word: int

554

555

class SummaryV1:

556

"""Version 1 summary format"""

557

summary: str

558

559

class SummaryV2:

560

"""Version 2 summary format"""

561

result: str

562

short: str

563

```

564

565

### Events

566

567

```python { .api }

568

class LiveTranscriptionEvents:

569

"""WebSocket event types for real-time transcription"""

570

Open: str = "Open"

571

Close: str = "Close"

572

Transcript: str = "Results"

573

Metadata: str = "Metadata"

574

UtteranceEnd: str = "UtteranceEnd"

575

SpeechStarted: str = "SpeechStarted"

576

Finalize: str = "Finalize"

577

Error: str = "Error"

578

Unhandled: str = "Unhandled"

579

Warning: str = "Warning"

580

```

581

582

## Usage Examples

583

584

### Basic Prerecorded Transcription

585

586

```python

587

from deepgram import DeepgramClient, UrlSource, ListenRESTOptions

588

589

client = DeepgramClient(api_key="your-api-key")

590

591

# Transcribe from URL

592

source = UrlSource("https://example.com/audio.wav")

593

options = ListenRESTOptions(

594

model="nova-2",

595

language="en-US",

596

punctuate=True,

597

diarize=True

598

)

599

600

response = client.listen.rest.transcribe_url(source, options)

601

transcript = response.results.channels[0].alternatives[0].transcript

602

print(transcript)

603

```

604

605

### Real-time Transcription

606

607

```python

608

from deepgram import DeepgramClient, ListenWebSocketOptions, LiveTranscriptionEvents

609

import threading

610

611

client = DeepgramClient(api_key="your-api-key")

612

613

def on_message(self, result, **kwargs):

614

sentence = result.channel.alternatives[0].transcript

615

if sentence:

616

print(f"Transcript: {sentence}")

617

618

def on_error(self, error, **kwargs):

619

print(f"Error: {error}")

620

621

# Configure WebSocket options

622

options = ListenWebSocketOptions(

623

model="nova-2",

624

language="en-US",

625

encoding="linear16",

626

sample_rate=16000,

627

channels=1,

628

interim_results=True

629

)

630

631

# Start connection

632

dg_connection = client.listen.websocket.v("1")

633

dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)

634

dg_connection.on(LiveTranscriptionEvents.Error, on_error)

635

636

if dg_connection.start(options):

637

# Send audio data (typically from microphone)

638

# dg_connection.send(audio_data)

639

640

# When done

641

dg_connection.finish()

642

dg_connection.close()

643

```

644

645

### Advanced Features

646

647

```python

648

from deepgram import DeepgramClient, FileSource, ListenRESTOptions

649

650

client = DeepgramClient(api_key="your-api-key")

651

652

# Advanced transcription with multiple features

653

source = FileSource("meeting.wav")

654

options = ListenRESTOptions(

655

model="nova-2",

656

language="en-US",

657

punctuate=True,

658

diarize=True,

659

diarize_version="2021-07-14.0",

660

ner=True,

661

summarize="v2",

662

topics=True,

663

intents=True,

664

sentiment=True,

665

utterances=True,

666

paragraphs=True,

667

keywords=["project", "deadline", "budget"],

668

search=["important", "action item"]

669

)

670

671

response = client.listen.rest.transcribe_file(source, options)

672

673

# Access different types of results

674

transcript = response.results.channels[0].alternatives[0].transcript

675

utterances = response.results.utterances

676

summary = response.results.summary

677

```