# Advanced Features (v2)

Next-generation Speech API (v2) features including batch recognition, recognizer management, enhanced output formatting, and advanced configuration options.

## Version 2 API Import

```python
from google.cloud import speech_v2

# Initialize v2 client
client = speech_v2.SpeechClient()
```

## Capabilities

### Batch Recognition

Process multiple audio files efficiently with batch recognition operations.

```python { .api }
def batch_recognize(
    self,
    request: BatchRecognizeRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """
    Performs batch speech recognition on multiple audio files.

    Parameters:
    - request: Batch recognition request with files and configuration
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    Operation: Long-running operation for batch processing

    Raises:
    google.api_core.exceptions.InvalidArgument: If the request is malformed
    """
```

#### Batch Recognition Usage

```python
from google.cloud import speech_v2

client = speech_v2.SpeechClient()

# Configure batch recognition
request = speech_v2.BatchRecognizeRequest(
    parent="projects/your-project-id/locations/global",
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        features=speech_v2.RecognitionFeatures(
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            enable_speaker_diarization=True,
        ),
    ),
    files=[
        speech_v2.BatchRecognizeFileMetadata(
            uri="gs://your-bucket/audio1.wav",
            output_config=speech_v2.RecognitionOutputConfig(
                gcs_output_config=speech_v2.GcsOutputConfig(
                    uri="gs://your-bucket/output/"
                ),
                output_format_config=speech_v2.OutputFormatConfig(
                    native=speech_v2.NativeOutputFileFormatConfig()
                ),
            ),
        ),
        speech_v2.BatchRecognizeFileMetadata(
            uri="gs://your-bucket/audio2.flac",
        ),
    ],
    recognition_output_config=speech_v2.RecognitionOutputConfig(
        inline_response_config=speech_v2.InlineOutputConfig(),
    ),
)

# Start batch operation
operation = client.batch_recognize(request=request)
print(f"Batch operation: {operation.operation.name}")

# Wait for completion
response = operation.result(timeout=1800)  # 30 minutes
print(f"Processed {len(response.results)} files")
```

### Recognizer Management

Create, manage, and configure persistent recognizers for consistent speech recognition settings.

```python { .api }
def create_recognizer(
    self,
    request: CreateRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """Create a custom recognizer with specific configuration."""

def get_recognizer(
    self,
    request: GetRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Recognizer:
    """Retrieve a recognizer by name."""

def list_recognizers(
    self,
    request: ListRecognizersRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> ListRecognizersResponse:
    """List recognizers in a project."""

def update_recognizer(
    self,
    request: UpdateRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """Update an existing recognizer."""

def delete_recognizer(
    self,
    request: DeleteRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """Delete a recognizer."""

def undelete_recognizer(
    self,
    request: UndeleteRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """
    Undeletes a previously deleted recognizer.

    Parameters:
    - request: Request to undelete a recognizer
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    Operation: Long-running operation for undelete process
    """
```

#### Recognizer Usage

```python
from google.cloud import speech_v2

client = speech_v2.SpeechClient()

# Create a custom recognizer
recognizer_request = speech_v2.CreateRecognizerRequest(
    parent="projects/your-project-id/locations/us-central1",
    recognizer_id="medical-transcription",
    recognizer=speech_v2.Recognizer(
        display_name="Medical Transcription Recognizer",
        model="medical_conversation",
        language_codes=["en-US"],
        default_recognition_config=speech_v2.RecognitionConfig(
            features=speech_v2.RecognitionFeatures(
                enable_automatic_punctuation=True,
                profanity_filter=True,
                enable_speaker_diarization=True,
                diarization_config=speech_v2.SpeakerDiarizationConfig(
                    min_speaker_count=2,
                    max_speaker_count=4,
                ),
            ),
        ),
    ),
)

operation = client.create_recognizer(request=recognizer_request)
recognizer = operation.result()

# Use the recognizer for recognition
recognize_request = speech_v2.RecognizeRequest(
    recognizer=recognizer.name,
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    ),
    content=audio_content,
)

response = client.recognize(request=recognize_request)
```

### Enhanced Output Formatting

Generate output in various formats including VTT and SRT subtitles.

```python { .api }
class OutputFormatConfig:
    """Configuration for output formatting."""
    native: NativeOutputFileFormatConfig
    vtt: VttOutputFileFormatConfig
    srt: SrtOutputFileFormatConfig

class VttOutputFileFormatConfig:
    """Configuration for VTT subtitle format."""

class SrtOutputFileFormatConfig:
    """Configuration for SRT subtitle format."""

class NativeOutputFileFormatConfig:
    """Configuration for native JSON format."""
```

#### Subtitle Generation Usage

```python
from google.cloud import speech_v2

client = speech_v2.SpeechClient()

# Configure for subtitle generation
request = speech_v2.RecognizeRequest(
    recognizer="projects/project/locations/global/recognizers/default",
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        features=speech_v2.RecognitionFeatures(
            enable_word_time_offsets=True,
            enable_automatic_punctuation=True,
        ),
    ),
    content=audio_content,
    output_config=speech_v2.RecognitionOutputConfig(
        output_format_config=speech_v2.OutputFormatConfig(
            # Generate VTT subtitles
            vtt=speech_v2.VttOutputFileFormatConfig()
        ),
        gcs_output_config=speech_v2.GcsOutputConfig(
            uri="gs://your-bucket/subtitles/"
        ),
    ),
)

response = client.recognize(request=request)

# Also generate SRT format
srt_request = speech_v2.RecognizeRequest(
    recognizer="projects/project/locations/global/recognizers/default",
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        features=speech_v2.RecognitionFeatures(
            enable_word_time_offsets=True,
            enable_automatic_punctuation=True,
        ),
    ),
    content=audio_content,
    output_config=speech_v2.RecognitionOutputConfig(
        output_format_config=speech_v2.OutputFormatConfig(
            # Generate SRT subtitles
            srt=speech_v2.SrtOutputFileFormatConfig()
        ),
        gcs_output_config=speech_v2.GcsOutputConfig(
            uri="gs://your-bucket/subtitles/"
        ),
    ),
)

srt_response = client.recognize(request=srt_request)
```

### Configuration Management

Manage project-level configuration settings for speech recognition services.

```python { .api }
def get_config(
    self,
    request: GetConfigRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Config:
    """
    Retrieves the requested configuration.

    Parameters:
    - request: Request to get configuration
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    Config: The requested configuration object
    """

def update_config(
    self,
    request: UpdateConfigRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Config:
    """
    Updates the configuration settings.

    Parameters:
    - request: Request to update configuration with new settings
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    Config: The updated configuration object
    """
```

#### Configuration Management Usage

```python
from google.cloud import speech_v2

client = speech_v2.SpeechClient()

# Get current configuration
get_request = speech_v2.GetConfigRequest(
    name="projects/your-project-id/locations/global/config"
)
config = client.get_config(request=get_request)
print(f"Current config: {config}")

# Update configuration
updated_config = speech_v2.Config(
    name="projects/your-project-id/locations/global/config",
    kms_key_name="projects/your-project-id/locations/us-central1/keyRings/ring/cryptoKeys/key",
    update_time=None,  # Will be set by service
)

update_request = speech_v2.UpdateConfigRequest(
    config=updated_config,
    update_mask={"paths": ["kms_key_name"]},  # Only update encryption key
)

updated_config = client.update_config(request=update_request)
print(f"Updated config: {updated_config}")
```

## V2 Configuration Types

### RecognitionConfig (v2)

```python { .api }
class RecognitionConfig:
    """Enhanced recognition configuration for v2 API."""
    explicit_decoding_config: ExplicitDecodingConfig
    auto_decoding_config: AutoDetectDecodingConfig
    model: str
    language_codes: Sequence[str]
    translation_config: TranslationConfig
    features: RecognitionFeatures
    adaptation: SpeechAdaptation
    transcript_normalization: TranscriptNormalization
```

### RecognitionFeatures

```python { .api }
class RecognitionFeatures:
    """Feature flags for speech recognition."""
    enable_word_time_offsets: bool
    enable_word_confidence: bool
    enable_automatic_punctuation: bool
    enable_spoken_punctuation: bool
    enable_spoken_emojis: bool
    enable_speaker_diarization: bool
    diarization_config: SpeakerDiarizationConfig
    max_alternatives: int
    profanity_filter: bool
```

### AutoDetectDecodingConfig

```python { .api }
class AutoDetectDecodingConfig:
    """Automatic audio format detection."""
    # No configuration needed - automatically detects format
```

### ExplicitDecodingConfig

```python { .api }
class ExplicitDecodingConfig:
    """Explicit audio format specification."""
    encoding: AudioEncoding
    sample_rate_hertz: int
    audio_channel_count: int
```

### Recognizer

```python { .api }
class Recognizer:
    """Persistent recognizer configuration."""
    name: str
    uid: str
    display_name: str
    model: str
    language_codes: Sequence[str]
    default_recognition_config: RecognitionConfig
    annotations: Mapping[str, str]
    state: State
    create_time: Timestamp
    update_time: Timestamp
    delete_time: Timestamp
    expire_time: Timestamp
    etag: str
    reconciling: bool
    kms_key_name: str
    kms_key_version_name: str

class State:
    """Recognizer lifecycle state."""
    STATE_UNSPECIFIED = 0
    ACTIVE = 2
    DELETE_REQUESTED = 3
```

## V2 Request Types

### BatchRecognizeRequest

```python { .api }
class BatchRecognizeRequest:
    """Request for batch recognition."""
    parent: str
    config: RecognitionConfig
    config_mask: FieldMask
    files: Sequence[BatchRecognizeFileMetadata]
    recognition_output_config: RecognitionOutputConfig
    processing_strategy: ProcessingStrategy
```

### BatchRecognizeFileMetadata

```python { .api }
class BatchRecognizeFileMetadata:
    """Metadata for individual file in batch."""
    uri: str
    config: RecognitionConfig
    config_mask: FieldMask
    output_config: RecognitionOutputConfig
```

### RecognitionOutputConfig

```python { .api }
class RecognitionOutputConfig:
    """Configuration for recognition output."""
    gcs_output_config: GcsOutputConfig
    inline_response_config: InlineOutputConfig
    output_format_config: OutputFormatConfig
```

## V2 Response Types

### BatchRecognizeResponse

```python { .api }
class BatchRecognizeResponse:
    """Response from batch recognition."""
    results: Mapping[str, BatchRecognizeFileResult]
    total_billed_duration: Duration
```

### BatchRecognizeFileResult

```python { .api }
class BatchRecognizeFileResult:
    """Result for individual file in batch."""
    uri: str
    error: Status
    metadata: BatchRecognizeTranscriptionMetadata
    transcript: BatchRecognizeResults
```

### BatchRecognizeResults

```python { .api }
class BatchRecognizeResults:
    """Transcription results from batch recognition."""
    results: Sequence[SpeechRecognitionResult]
    metadata: RecognitionResponseMetadata
```

### Config

```python { .api }
class Config:
    """Project-level configuration for Speech services."""
    name: str
    kms_key_name: str
    update_time: Timestamp
```

## V2 Request Types (Configuration Management)

### GetConfigRequest

```python { .api }
class GetConfigRequest:
    """Request to retrieve configuration."""
    name: str  # Format: projects/{project}/locations/{location}/config
```

### UpdateConfigRequest

```python { .api }
class UpdateConfigRequest:
    """Request to update configuration."""
    config: Config
    update_mask: FieldMask
```

### UndeleteRecognizerRequest

```python { .api }
class UndeleteRecognizerRequest:
    """Request to undelete a recognizer."""
    name: str  # Format: projects/{project}/locations/{location}/recognizers/{recognizer}
    validate_only: bool
    etag: str
```

## Advanced Configuration Examples

### Multi-language Recognition

```python
# Configure for automatic language detection
config = speech_v2.RecognitionConfig(
    auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    language_codes=["en-US", "es-ES", "fr-FR"],  # Multiple languages
    features=speech_v2.RecognitionFeatures(
        enable_automatic_punctuation=True,
        max_alternatives=3,  # Multiple transcription alternatives
    ),
)
```

### Translation Integration

```python
# Configure for speech-to-text with translation
config = speech_v2.RecognitionConfig(
    auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    language_codes=["es-ES"],  # Source language
    translation_config=speech_v2.TranslationConfig(
        target_language="en-US"  # Translate to English
    ),
    features=speech_v2.RecognitionFeatures(
        enable_automatic_punctuation=True,
    ),
)
```

### Advanced Diarization

```python
# Enhanced speaker diarization configuration
diarization_config = speech_v2.SpeakerDiarizationConfig(
    min_speaker_count=2,
    max_speaker_count=10,
    speaker_ids=["SPEAKER_1", "SPEAKER_2"],  # Predefined speaker IDs
)

config = speech_v2.RecognitionConfig(
    auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    language_codes=["en-US"],
    features=speech_v2.RecognitionFeatures(
        enable_speaker_diarization=True,
        diarization_config=diarization_config,
        enable_word_time_offsets=True,
    ),
)
```

## Migration from v1 to v2

### Key Changes

```python
# v1 approach
from google.cloud import speech

client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",  # Single language
)

# v2 approach
from google.cloud import speech_v2

client = speech_v2.SpeechClient()
config = speech_v2.RecognitionConfig(
    explicit_decoding_config=speech_v2.ExplicitDecodingConfig(
        encoding=speech_v2.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
    ),
    language_codes=["en-US"],  # Multiple languages supported
    features=speech_v2.RecognitionFeatures(
        enable_automatic_punctuation=True,
    ),
)
```

### Recognition Request Changes

```python
# v1 request
response = client.recognize(config=config, audio=audio)

# v2 request
request = speech_v2.RecognizeRequest(
    recognizer="projects/project/locations/global/recognizers/default",
    config=config,
    content=audio_content,
)
response = client.recognize(request=request)
```