Google Cloud Speech API client library for speech-to-text conversion with support for real-time streaming, batch processing, and advanced speech recognition models
Next-generation Speech API (v2) features including batch recognition, recognizer management, enhanced output formatting, and advanced configuration options.
from google.cloud import speech_v2
# Initialize v2 client
client = speech_v2.SpeechClient()
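
A minimal synchronous call is the quickest way to verify the client. A sketch, assuming a project ID and a short local audio file; the ad-hoc `recognizers/_` path and the `long` model are assumptions here, not part of the reference below.

# Minimal one-shot recognition (assumes your-project-id and a local audio.wav)
with open("audio.wav", "rb") as f:
    audio_content = f.read()

request = speech_v2.RecognizeRequest(
    recognizer="projects/your-project-id/locations/global/recognizers/_",
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        model="long",  # assumed model name for long-form audio
    ),
    content=audio_content,
)
response = client.recognize(request=request)
for result in response.results:
    print(result.alternatives[0].transcript)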

Process multiple audio files efficiently with batch recognition operations.

def batch_recognize(
    self,
    request: BatchRecognizeRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """
    Performs batch speech recognition on multiple audio files.

    Parameters:
    - request: Batch recognition request with files and configuration
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
        Operation: Long-running operation for batch processing

    Raises:
        google.api_core.exceptions.InvalidArgument: If the request is malformed
    """

from google.cloud import speech_v2
client = speech_v2.SpeechClient()
# Configure batch recognition
request = speech_v2.BatchRecognizeRequest(
    parent="projects/your-project-id/locations/global",
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        features=speech_v2.RecognitionFeatures(
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            enable_speaker_diarization=True,
        ),
    ),
    files=[
        speech_v2.BatchRecognizeFileMetadata(
            uri="gs://your-bucket/audio1.wav",
            output_config=speech_v2.RecognitionOutputConfig(
                gcs_output_config=speech_v2.GcsOutputConfig(
                    uri="gs://your-bucket/output/"
                ),
                output_format_config=speech_v2.OutputFormatConfig(
                    native=speech_v2.NativeOutputFileFormatConfig()
                ),
            ),
        ),
        speech_v2.BatchRecognizeFileMetadata(
            uri="gs://your-bucket/audio2.flac",
        ),
    ],
    recognition_output_config=speech_v2.RecognitionOutputConfig(
        inline_response_config=speech_v2.InlineOutputConfig(),
    ),
)
# Start batch operation
operation = client.batch_recognize(request=request)
print(f"Batch operation: {operation.operation.name}")
# Wait for completion
response = operation.result(timeout=1800) # 30 minutes
print(f"Processed {len(response.results)} files")Create, manage, and configure persistent recognizers for consistent speech recognition settings.

Create, manage, and configure persistent recognizers for consistent speech recognition settings.

def create_recognizer(
    self,
    request: CreateRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """Create a custom recognizer with specific configuration."""

def get_recognizer(
    self,
    request: GetRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Recognizer:
    """Retrieve a recognizer by name."""

def list_recognizers(
    self,
    request: ListRecognizersRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> ListRecognizersResponse:
    """List recognizers in a project."""

def update_recognizer(
    self,
    request: UpdateRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """Update an existing recognizer."""

def delete_recognizer(
    self,
    request: DeleteRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """Delete a recognizer."""

def undelete_recognizer(
    self,
    request: UndeleteRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """
    Undeletes a previously deleted recognizer.

    Parameters:
    - request: Request to undelete a recognizer
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
        Operation: Long-running operation for undelete process
    """

from google.cloud import speech_v2
client = speech_v2.SpeechClient()
# Create a custom recognizer
recognizer_request = speech_v2.CreateRecognizerRequest(
    parent="projects/your-project-id/locations/us-central1",
    recognizer_id="medical-transcription",
    recognizer=speech_v2.Recognizer(
        display_name="Medical Transcription Recognizer",
        model="medical_conversation",
        language_codes=["en-US"],
        default_recognition_config=speech_v2.RecognitionConfig(
            features=speech_v2.RecognitionFeatures(
                enable_automatic_punctuation=True,
                profanity_filter=True,
                enable_speaker_diarization=True,
                diarization_config=speech_v2.SpeakerDiarizationConfig(
                    min_speaker_count=2,
                    max_speaker_count=4,
                ),
            ),
        ),
    ),
)
operation = client.create_recognizer(request=recognizer_request)
recognizer = operation.result()
# Use the recognizer for recognition
recognize_request = speech_v2.RecognizeRequest(
    recognizer=recognizer.name,
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    ),
    content=audio_content,
)
response = client.recognize(request=recognize_request)
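
The other lifecycle methods follow the same request-object pattern. A sketch, assuming the `medical-transcription` recognizer created above; the mutating calls return long-running operations, so each waits on `.result()`:

parent = "projects/your-project-id/locations/us-central1"
name = f"{parent}/recognizers/medical-transcription"

# List recognizers in this project and location
listing = client.list_recognizers(
    request=speech_v2.ListRecognizersRequest(parent=parent)
)
for recognizer in listing.recognizers:
    print(recognizer.name, recognizer.state)

# Update only the display name, scoped by a field mask
client.update_recognizer(
    request=speech_v2.UpdateRecognizerRequest(
        recognizer=speech_v2.Recognizer(name=name, display_name="Updated name"),
        update_mask={"paths": ["display_name"]},
    )
).result()

# Delete, then restore, the recognizer
client.delete_recognizer(
    request=speech_v2.DeleteRecognizerRequest(name=name)
).result()
client.undelete_recognizer(
    request=speech_v2.UndeleteRecognizerRequest(name=name)
).result()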

Generate output in various formats including VTT and SRT subtitles.

class OutputFormatConfig:
    """Configuration for output formatting."""
    native: NativeOutputFileFormatConfig
    vtt: VttOutputFileFormatConfig
    srt: SrtOutputFileFormatConfig

class VttOutputFileFormatConfig:
    """Configuration for VTT subtitle format."""

class SrtOutputFileFormatConfig:
    """Configuration for SRT subtitle format."""

class NativeOutputFileFormatConfig:
    """Configuration for native JSON format."""

from google.cloud import speech_v2
client = speech_v2.SpeechClient()
# Configure for subtitle generation
request = speech_v2.RecognizeRequest(
    recognizer="projects/project/locations/global/recognizers/default",
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        features=speech_v2.RecognitionFeatures(
            enable_word_time_offsets=True,
            enable_automatic_punctuation=True,
        ),
    ),
    content=audio_content,
    output_config=speech_v2.RecognitionOutputConfig(
        output_format_config=speech_v2.OutputFormatConfig(
            # Generate VTT subtitles
            vtt=speech_v2.VttOutputFileFormatConfig()
        ),
        gcs_output_config=speech_v2.GcsOutputConfig(
            uri="gs://your-bucket/subtitles/"
        ),
    ),
)
response = client.recognize(request=request)

# Also generate SRT format
srt_request = speech_v2.RecognizeRequest(
    recognizer="projects/project/locations/global/recognizers/default",
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        features=speech_v2.RecognitionFeatures(
            enable_word_time_offsets=True,
            enable_automatic_punctuation=True,
        ),
    ),
    content=audio_content,
    output_config=speech_v2.RecognitionOutputConfig(
        output_format_config=speech_v2.OutputFormatConfig(
            # Generate SRT subtitles
            srt=speech_v2.SrtOutputFileFormatConfig()
        ),
        gcs_output_config=speech_v2.GcsOutputConfig(
            uri="gs://your-bucket/subtitles/"
        ),
    ),
)
srt_response = client.recognize(request=srt_request)
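
The generated subtitle files land under the `gs://your-bucket/subtitles/` prefix rather than in the response. One way to retrieve them is the separate google-cloud-storage client; a sketch, assuming that package is installed:

from google.cloud import storage  # separate google-cloud-storage package

storage_client = storage.Client()
bucket = storage_client.bucket("your-bucket")
for blob in bucket.list_blobs(prefix="subtitles/"):
    # Save each generated .vtt / .srt file locally
    blob.download_to_filename(blob.name.rsplit("/", 1)[-1])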

Manage project-level configuration settings for speech recognition services.

def get_config(
    self,
    request: GetConfigRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Config:
    """
    Retrieves the requested configuration.

    Parameters:
    - request: Request to get configuration
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
        Config: The requested configuration object
    """

def update_config(
    self,
    request: UpdateConfigRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Config:
    """
    Updates the configuration settings.

    Parameters:
    - request: Request to update configuration with new settings
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
        Config: The updated configuration object
    """

from google.cloud import speech_v2
client = speech_v2.SpeechClient()
# Get current configuration
get_request = speech_v2.GetConfigRequest(
    name="projects/your-project-id/locations/global/config"
)
config = client.get_config(request=get_request)
print(f"Current config: {config}")

# Update configuration
updated_config = speech_v2.Config(
    name="projects/your-project-id/locations/global/config",
    kms_key_name="projects/your-project-id/locations/us-central1/keyRings/ring/cryptoKeys/key",
    update_time=None,  # Will be set by the service
)
update_request = speech_v2.UpdateConfigRequest(
    config=updated_config,
    update_mask={"paths": ["kms_key_name"]},  # Only update the encryption key
)
updated_config = client.update_config(request=update_request)
print(f"Updated config: {updated_config}")
"""Enhanced recognition configuration for v2 API."""
explicit_decoding_config: ExplicitDecodingConfig
auto_decoding_config: AutoDetectDecodingConfig
model: str
language_codes: Sequence[str]
translation_config: TranslationConfig
features: RecognitionFeatures
adaptation: SpeechAdaptation
transcript_normalization: TranscriptNormalizationclass RecognitionFeatures:
"""Feature flags for speech recognition."""
enable_word_time_offsets: bool
enable_word_confidence: bool
enable_automatic_punctuation: bool
enable_spoken_punctuation: bool
enable_spoken_emojis: bool
enable_speaker_diarization: bool
diarization_config: SpeakerDiarizationConfig
max_alternatives: int
profanity_filter: boolclass AutoDetectDecodingConfig:
"""Automatic audio format detection."""
# No configuration needed - automatically detects formatclass ExplicitDecodingConfig:
"""Explicit audio format specification."""
encoding: AudioEncoding
sample_rate_hertz: int
audio_channel_count: intclass Recognizer:
"""Persistent recognizer configuration."""
name: str
uid: str
display_name: str
model: str
language_codes: Sequence[str]
default_recognition_config: RecognitionConfig
annotations: Mapping[str, str]
state: State
create_time: Timestamp
update_time: Timestamp
delete_time: Timestamp
expire_time: Timestamp
etag: str
reconciling: bool
kms_key_name: str
kms_key_version_name: str
class State:
"""Recognizer lifecycle state."""
STATE_UNSPECIFIED = 0
ACTIVE = 2
DELETE_REQUESTED = 3class BatchRecognizeRequest:
"""Request for batch recognition."""
parent: str
config: RecognitionConfig
config_mask: FieldMask
files: Sequence[BatchRecognizeFileMetadata]
recognition_output_config: RecognitionOutputConfig
processing_strategy: ProcessingStrategyclass BatchRecognizeFileMetadata:
"""Metadata for individual file in batch."""
uri: str
config: RecognitionConfig
config_mask: FieldMask
output_config: RecognitionOutputConfigclass RecognitionOutputConfig:
"""Configuration for recognition output."""
gcs_output_config: GcsOutputConfig
inline_response_config: InlineOutputConfig
output_format_config: OutputFormatConfigclass BatchRecognizeResponse:
"""Response from batch recognition."""
results: Mapping[str, BatchRecognizeFileResult]
total_billed_duration: Durationclass BatchRecognizeFileResult:
"""Result for individual file in batch."""
uri: str
error: Status
metadata: BatchRecognizeTranscriptionMetadata
transcript: BatchRecognizeResultsclass BatchRecognizeResults:
"""Transcription results from batch recognition."""
results: Sequence[SpeechRecognitionResult]
metadata: RecognitionResponseMetadataclass Config:
"""Project-level configuration for Speech services."""
name: str
kms_key_name: str
update_time: Timestampclass GetConfigRequest:
"""Request to retrieve configuration."""
name: str # Format: projects/{project}/locations/{location}/configclass UpdateConfigRequest:
"""Request to update configuration."""
config: Config
update_mask: FieldMaskclass UndeleteRecognizerRequest:
"""Request to undelete a recognizer."""
name: str # Format: projects/{project}/locations/{location}/recognizers/{recognizer}
validate_only: bool
etag: str# Configure for automatic language detection
config = speech_v2.RecognitionConfig(
    auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    language_codes=["en-US", "es-ES", "fr-FR"],  # Multiple languages
    features=speech_v2.RecognitionFeatures(
        enable_automatic_punctuation=True,
        max_alternatives=3,  # Multiple transcription alternatives
    ),
)

# Configure for speech-to-text with translation
config = speech_v2.RecognitionConfig(
    auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    language_codes=["es-ES"],  # Source language
    translation_config=speech_v2.TranslationConfig(
        target_language="en-US"  # Translate to English
    ),
    features=speech_v2.RecognitionFeatures(
        enable_automatic_punctuation=True,
    ),
)

# Enhanced speaker diarization configuration
diarization_config = speech_v2.SpeakerDiarizationConfig(
    min_speaker_count=2,
    max_speaker_count=10,
    speaker_ids=["SPEAKER_1", "SPEAKER_2"],  # Predefined speaker IDs
)
config = speech_v2.RecognitionConfig(
    auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    language_codes=["en-US"],
    features=speech_v2.RecognitionFeatures(
        enable_speaker_diarization=True,
        diarization_config=diarization_config,
        enable_word_time_offsets=True,
    ),
)
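
With diarization and word time offsets enabled, each recognized word carries a speaker label. A sketch of reading them from a `recognize` response produced with the config above; the `speaker_label` field on word info is an assumption based on the current v2 protos:

for result in response.results:
    for word in result.alternatives[0].words:
        # speaker_label identifies the speaker the diarizer assigned
        print(f"{word.speaker_label}: {word.word}")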

# v1 approach
from google.cloud import speech

client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",  # Single language
)

# v2 approach
from google.cloud import speech_v2

client = speech_v2.SpeechClient()
config = speech_v2.RecognitionConfig(
    explicit_decoding_config=speech_v2.ExplicitDecodingConfig(
        encoding=speech_v2.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
    ),
    language_codes=["en-US"],  # Multiple languages supported
    features=speech_v2.RecognitionFeatures(
        enable_automatic_punctuation=True,
    ),
)

# v1 request
response = client.recognize(config=config, audio=audio)

# v2 request
request = speech_v2.RecognizeRequest(
    recognizer="projects/project/locations/global/recognizers/default",
    config=config,
    content=audio_content,
)
response = client.recognize(request=request)

Install with Tessl CLI

npx tessl i tessl/pypi-google-cloud-speech