or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced-features.md, index.md, speech-adaptation.md, speech-recognition.md, streaming-recognition.md, types-and-configuration.md

docs/advanced-features.md

# Advanced Features (v2)

Next-generation Speech API (v2) features including batch recognition, recognizer management, enhanced output formatting, and advanced configuration options.

## Version 2 API Import

```python
from google.cloud import speech_v2

# Initialize v2 client
client = speech_v2.SpeechClient()
```

## Capabilities

### Batch Recognition

Process multiple audio files efficiently with batch recognition operations.

```python { .api }
def batch_recognize(
    self,
    request: BatchRecognizeRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """
    Performs batch speech recognition on multiple audio files.

    Parameters:
    - request: Batch recognition request with files and configuration
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    Operation: Long-running operation for batch processing

    Raises:
    google.api_core.exceptions.InvalidArgument: If the request is malformed
    """
```

#### Batch Recognition Usage

```python
from google.cloud import speech_v2

client = speech_v2.SpeechClient()

# Configure batch recognition
request = speech_v2.BatchRecognizeRequest(
    parent="projects/your-project-id/locations/global",
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        features=speech_v2.RecognitionFeatures(
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            enable_speaker_diarization=True,
        ),
    ),
    files=[
        speech_v2.BatchRecognizeFileMetadata(
            uri="gs://your-bucket/audio1.wav",
            output_config=speech_v2.RecognitionOutputConfig(
                gcs_output_config=speech_v2.GcsOutputConfig(
                    uri="gs://your-bucket/output/"
                ),
                output_format_config=speech_v2.OutputFormatConfig(
                    native=speech_v2.NativeOutputFileFormatConfig()
                ),
            ),
        ),
        speech_v2.BatchRecognizeFileMetadata(
            uri="gs://your-bucket/audio2.flac",
        ),
    ],
    recognition_output_config=speech_v2.RecognitionOutputConfig(
        inline_response_config=speech_v2.InlineOutputConfig(),
    ),
)

# Start batch operation
operation = client.batch_recognize(request=request)
print(f"Batch operation: {operation.operation.name}")

# Wait for completion
response = operation.result(timeout=1800)  # 30 minutes
print(f"Processed {len(response.results)} files")
```

### Recognizer Management

Create, manage, and configure persistent recognizers for consistent speech recognition settings.

```python { .api }
def create_recognizer(
    self,
    request: CreateRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """Create a custom recognizer with specific configuration."""

def get_recognizer(
    self,
    request: GetRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Recognizer:
    """Retrieve a recognizer by name."""

def list_recognizers(
    self,
    request: ListRecognizersRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> ListRecognizersResponse:
    """List recognizers in a project."""

def update_recognizer(
    self,
    request: UpdateRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """Update an existing recognizer."""

def delete_recognizer(
    self,
    request: DeleteRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """Delete a recognizer."""

def undelete_recognizer(
    self,
    request: UndeleteRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """
    Undeletes a previously deleted recognizer.

    Parameters:
    - request: Request to undelete a recognizer
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    Operation: Long-running operation for undelete process
    """
```

#### Recognizer Usage

```python
from google.cloud import speech_v2

client = speech_v2.SpeechClient()

# Create a custom recognizer
recognizer_request = speech_v2.CreateRecognizerRequest(
    parent="projects/your-project-id/locations/us-central1",
    recognizer_id="medical-transcription",
    recognizer=speech_v2.Recognizer(
        display_name="Medical Transcription Recognizer",
        model="medical_conversation",
        language_codes=["en-US"],
        default_recognition_config=speech_v2.RecognitionConfig(
            features=speech_v2.RecognitionFeatures(
                enable_automatic_punctuation=True,
                profanity_filter=True,
                enable_speaker_diarization=True,
                diarization_config=speech_v2.SpeakerDiarizationConfig(
                    min_speaker_count=2,
                    max_speaker_count=4,
                ),
            ),
        ),
    ),
)

operation = client.create_recognizer(request=recognizer_request)
recognizer = operation.result()

# Use the recognizer for recognition
recognize_request = speech_v2.RecognizeRequest(
    recognizer=recognizer.name,
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    ),
    content=audio_content,
)

response = client.recognize(request=recognize_request)
```

### Enhanced Output Formatting

Generate output in various formats including VTT and SRT subtitles.

```python { .api }
class OutputFormatConfig:
    """Configuration for output formatting."""
    native: NativeOutputFileFormatConfig
    vtt: VttOutputFileFormatConfig
    srt: SrtOutputFileFormatConfig

class VttOutputFileFormatConfig:
    """Configuration for VTT subtitle format."""

class SrtOutputFileFormatConfig:
    """Configuration for SRT subtitle format."""

class NativeOutputFileFormatConfig:
    """Configuration for native JSON format."""
```

#### Subtitle Generation Usage

```python
from google.cloud import speech_v2

client = speech_v2.SpeechClient()

# Configure for subtitle generation
request = speech_v2.RecognizeRequest(
    recognizer="projects/project/locations/global/recognizers/default",
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        features=speech_v2.RecognitionFeatures(
            enable_word_time_offsets=True,
            enable_automatic_punctuation=True,
        ),
    ),
    content=audio_content,
    output_config=speech_v2.RecognitionOutputConfig(
        output_format_config=speech_v2.OutputFormatConfig(
            # Generate VTT subtitles
            vtt=speech_v2.VttOutputFileFormatConfig()
        ),
        gcs_output_config=speech_v2.GcsOutputConfig(
            uri="gs://your-bucket/subtitles/"
        ),
    ),
)

response = client.recognize(request=request)

# Also generate SRT format
srt_request = speech_v2.RecognizeRequest(
    recognizer="projects/project/locations/global/recognizers/default",
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        features=speech_v2.RecognitionFeatures(
            enable_word_time_offsets=True,
            enable_automatic_punctuation=True,
        ),
    ),
    content=audio_content,
    output_config=speech_v2.RecognitionOutputConfig(
        output_format_config=speech_v2.OutputFormatConfig(
            # Generate SRT subtitles
            srt=speech_v2.SrtOutputFileFormatConfig()
        ),
        gcs_output_config=speech_v2.GcsOutputConfig(
            uri="gs://your-bucket/subtitles/"
        ),
    ),
)

srt_response = client.recognize(request=srt_request)
```

### Configuration Management

Manage project-level configuration settings for speech recognition services.

```python { .api }
def get_config(
    self,
    request: GetConfigRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Config:
    """
    Retrieves the requested configuration.

    Parameters:
    - request: Request to get configuration
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    Config: The requested configuration object
    """

def update_config(
    self,
    request: UpdateConfigRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Config:
    """
    Updates the configuration settings.

    Parameters:
    - request: Request to update configuration with new settings
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    Config: The updated configuration object
    """
```

#### Configuration Management Usage

```python
from google.cloud import speech_v2

client = speech_v2.SpeechClient()

# Get current configuration
get_request = speech_v2.GetConfigRequest(
    name="projects/your-project-id/locations/global/config"
)
config = client.get_config(request=get_request)
print(f"Current config: {config}")

# Update configuration
updated_config = speech_v2.Config(
    name="projects/your-project-id/locations/global/config",
    kms_key_name="projects/your-project-id/locations/us-central1/keyRings/ring/cryptoKeys/key",
    update_time=None,  # Will be set by service
)

update_request = speech_v2.UpdateConfigRequest(
    config=updated_config,
    update_mask={"paths": ["kms_key_name"]},  # Only update encryption key
)

updated_config = client.update_config(request=update_request)
print(f"Updated config: {updated_config}")
```

## V2 Configuration Types

### RecognitionConfig (v2)

```python { .api }
class RecognitionConfig:
    """Enhanced recognition configuration for v2 API."""
    explicit_decoding_config: ExplicitDecodingConfig
    auto_decoding_config: AutoDetectDecodingConfig
    model: str
    language_codes: Sequence[str]
    translation_config: TranslationConfig
    features: RecognitionFeatures
    adaptation: SpeechAdaptation
    transcript_normalization: TranscriptNormalization
```

### RecognitionFeatures

```python { .api }
class RecognitionFeatures:
    """Feature flags for speech recognition."""
    enable_word_time_offsets: bool
    enable_word_confidence: bool
    enable_automatic_punctuation: bool
    enable_spoken_punctuation: bool
    enable_spoken_emojis: bool
    enable_speaker_diarization: bool
    diarization_config: SpeakerDiarizationConfig
    max_alternatives: int
    profanity_filter: bool
```

### AutoDetectDecodingConfig

```python { .api }
class AutoDetectDecodingConfig:
    """Automatic audio format detection."""
    # No configuration needed - automatically detects format
```

### ExplicitDecodingConfig

```python { .api }
class ExplicitDecodingConfig:
    """Explicit audio format specification."""
    encoding: AudioEncoding
    sample_rate_hertz: int
    audio_channel_count: int
```

### Recognizer

```python { .api }
class Recognizer:
    """Persistent recognizer configuration."""
    name: str
    uid: str
    display_name: str
    model: str
    language_codes: Sequence[str]
    default_recognition_config: RecognitionConfig
    annotations: Mapping[str, str]
    state: State
    create_time: Timestamp
    update_time: Timestamp
    delete_time: Timestamp
    expire_time: Timestamp
    etag: str
    reconciling: bool
    kms_key_name: str
    kms_key_version_name: str

class State:
    """Recognizer lifecycle state."""
    STATE_UNSPECIFIED = 0
    ACTIVE = 2
    DELETE_REQUESTED = 3
```

## V2 Request Types

### BatchRecognizeRequest

```python { .api }
class BatchRecognizeRequest:
    """Request for batch recognition."""
    parent: str
    config: RecognitionConfig
    config_mask: FieldMask
    files: Sequence[BatchRecognizeFileMetadata]
    recognition_output_config: RecognitionOutputConfig
    processing_strategy: ProcessingStrategy
```

### BatchRecognizeFileMetadata

```python { .api }
class BatchRecognizeFileMetadata:
    """Metadata for individual file in batch."""
    uri: str
    config: RecognitionConfig
    config_mask: FieldMask
    output_config: RecognitionOutputConfig
```

### RecognitionOutputConfig

```python { .api }
class RecognitionOutputConfig:
    """Configuration for recognition output."""
    gcs_output_config: GcsOutputConfig
    inline_response_config: InlineOutputConfig
    output_format_config: OutputFormatConfig
```

## V2 Response Types

### BatchRecognizeResponse

```python { .api }
class BatchRecognizeResponse:
    """Response from batch recognition."""
    results: Mapping[str, BatchRecognizeFileResult]
    total_billed_duration: Duration
```

### BatchRecognizeFileResult

```python { .api }
class BatchRecognizeFileResult:
    """Result for individual file in batch."""
    uri: str
    error: Status
    metadata: BatchRecognizeTranscriptionMetadata
    transcript: BatchRecognizeResults
```

### BatchRecognizeResults

```python { .api }
class BatchRecognizeResults:
    """Transcription results from batch recognition."""
    results: Sequence[SpeechRecognitionResult]
    metadata: RecognitionResponseMetadata
```

### Config

```python { .api }
class Config:
    """Project-level configuration for Speech services."""
    name: str
    kms_key_name: str
    update_time: Timestamp
```

## V2 Request Types (Configuration Management)

### GetConfigRequest

```python { .api }
class GetConfigRequest:
    """Request to retrieve configuration."""
    name: str  # Format: projects/{project}/locations/{location}/config
```

### UpdateConfigRequest

```python { .api }
class UpdateConfigRequest:
    """Request to update configuration."""
    config: Config
    update_mask: FieldMask
```

### UndeleteRecognizerRequest

```python { .api }
class UndeleteRecognizerRequest:
    """Request to undelete a recognizer."""
    name: str  # Format: projects/{project}/locations/{location}/recognizers/{recognizer}
    validate_only: bool
    etag: str
```

## Advanced Configuration Examples

### Multi-language Recognition

```python
# Configure for automatic language detection
config = speech_v2.RecognitionConfig(
    auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    language_codes=["en-US", "es-ES", "fr-FR"],  # Multiple languages
    features=speech_v2.RecognitionFeatures(
        enable_automatic_punctuation=True,
        max_alternatives=3,  # Multiple transcription alternatives
    ),
)
```

### Translation Integration

```python
# Configure for speech-to-text with translation
config = speech_v2.RecognitionConfig(
    auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    language_codes=["es-ES"],  # Source language
    translation_config=speech_v2.TranslationConfig(
        target_language="en-US"  # Translate to English
    ),
    features=speech_v2.RecognitionFeatures(
        enable_automatic_punctuation=True,
    ),
)
```

### Advanced Diarization

```python
# Enhanced speaker diarization configuration
diarization_config = speech_v2.SpeakerDiarizationConfig(
    min_speaker_count=2,
    max_speaker_count=10,
    speaker_ids=["SPEAKER_1", "SPEAKER_2"],  # Predefined speaker IDs
)

config = speech_v2.RecognitionConfig(
    auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    language_codes=["en-US"],
    features=speech_v2.RecognitionFeatures(
        enable_speaker_diarization=True,
        diarization_config=diarization_config,
        enable_word_time_offsets=True,
    ),
)
```

## Migration from v1 to v2

### Key Changes

```python
# v1 approach
from google.cloud import speech

client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",  # Single language
)

# v2 approach
from google.cloud import speech_v2

client = speech_v2.SpeechClient()
config = speech_v2.RecognitionConfig(
    explicit_decoding_config=speech_v2.ExplicitDecodingConfig(
        encoding=speech_v2.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
    ),
    language_codes=["en-US"],  # Multiple languages supported
    features=speech_v2.RecognitionFeatures(
        enable_automatic_punctuation=True,
    ),
)
```

### Recognition Request Changes

```python
# v1 request
response = client.recognize(config=config, audio=audio)

# v2 request
request = speech_v2.RecognizeRequest(
    recognizer="projects/project/locations/global/recognizers/default",
    config=config,
    content=audio_content,
)
response = client.recognize(request=request)
```