Google Cloud Speech API client library for speech-to-text conversion, with support for real-time streaming, batch processing, and advanced speech recognition models.

Core data types, configuration objects, and enums for speech recognition setup and result processing across all API versions.

Main configuration object for speech recognition requests.
class RecognitionConfig:
    """Configuration for speech recognition requests.

    Mirrors the fields accepted by the Speech API recognize calls; see the
    usage examples below for typical combinations.
    """

    encoding: AudioEncoding  # Audio encoding format (see AudioEncoding enum).
    sample_rate_hertz: int  # Sample rate of the input audio, in hertz.
    audio_channel_count: int  # Number of channels in the input audio.
    enable_separate_recognition_per_channel: bool  # Recognize each channel independently.
    language_code: str  # Primary language of the audio (e.g. "en-US").
    alternative_language_codes: Sequence[str]  # Additional candidate languages.
    max_alternatives: int  # Maximum number of hypotheses returned per result.
    profanity_filter: bool  # Filter profanity from transcripts.
    speech_contexts: Sequence[SpeechContext]  # Phrase hints to bias recognition.
    enable_word_time_offsets: bool  # Include per-word start/end times in results.
    enable_word_confidence: bool  # Include per-word confidence scores.
    enable_automatic_punctuation: bool  # Add punctuation to transcripts automatically.
    enable_spoken_punctuation: bool
    enable_spoken_emojis: bool
    enable_speaker_diarization: bool  # Tag words with speaker numbers.
    diarization_config: SpeakerDiarizationConfig  # Detailed diarization settings.
    metadata: RecognitionMetadata  # Analytics/optimization metadata for the request.
    model: str  # Recognition model to use.
    use_enhanced: bool  # Prefer the enhanced variant of the selected model.
    adaptation: SpeechAdaptation
    transcript_normalization: TranscriptNormalization
    enable_voice_activity_events: bool


# Specifies the audio input for recognition.
class RecognitionAudio:
    """Audio input specification for a recognition request."""

    content: bytes  # Raw audio bytes, inlined in the request.
    uri: str  # Cloud Storage URI (gs://bucket/file).


# Configuration for speaker diarization (identifying different speakers).
class SpeakerDiarizationConfig:
    """Configuration for speaker diarization."""

    enable_speaker_diarization: bool  # Turn diarization on/off.
    min_speaker_count: int  # Minimum number of speakers expected in the audio.
    max_speaker_count: int  # Maximum number of speakers expected in the audio.
    speaker_tag: int  # NOTE(review): presumably output-only — confirm against API docs.


# Provides hints to improve recognition accuracy.
class SpeechContext:
    """Context hints (phrase hints) for speech recognition."""

    phrases: Sequence[str]  # Words/phrases to bias recognition toward.
    boost: float  # Strength of the bias applied to the phrases.
    # NOTE(review): field/heading split reconstructed from garbled source — confirm type name.
    speech_adaptation: SpeechAdaptation


# Metadata about the recognition request for analytics and optimization.
class RecognitionMetadata:
    """Metadata describing a recognition request, used for analytics and optimization."""

    interaction_type: InteractionType  # Kind of user interaction (see InteractionType).
    industry_naics_code_of_audio: int  # NAICS industry code of the audio's domain.
    microphone_distance: MicrophoneDistance  # Distance of the mic from the source.
    original_media_type: OriginalMediaType  # Audio-only vs. video source.
    recording_device_type: RecordingDeviceType  # Category of recording device.
    recording_device_name: str
    original_mime_type: str
    audio_topic: str


# Container for recognition results.
class SpeechRecognitionResult:
    """A single recognition result holding one or more alternative transcripts."""

    alternatives: Sequence[SpeechRecognitionAlternative]  # Ranked hypotheses.
    channel_tag: int  # Audio channel this result came from.
    result_end_time: Duration  # End time of the recognized segment.
    language_code: str  # Language used for this result.


# Individual recognition hypothesis with confidence score.
class SpeechRecognitionAlternative:
    """One recognition hypothesis: a transcript with its confidence score."""

    transcript: str  # Recognized text.
    confidence: float  # Overall confidence for this transcript.
    words: Sequence[WordInfo]  # Per-word details (when word offsets/confidence are enabled).


# Word-level information including timing and confidence.
class WordInfo:
    """Word-level recognition information."""

    start_time: Duration  # Offset at which the word starts in the audio.
    end_time: Duration  # Offset at which the word ends in the audio.
    word: str  # The recognized word.
    confidence: float  # Confidence score for this word.
    speaker_tag: int  # Numeric speaker label when diarization is enabled.
    speaker_label: str  # String speaker label when diarization is enabled.


# Information about applied speech adaptations.
class SpeechAdaptationInfo:
    """Information about how speech adaptation was applied to a request."""

    adaptation_timeout: bool  # True if adaptation timed out (see error-handling example).
    timeout_message: str  # Human-readable description of the timeout.


# Supported audio encoding formats.
class AudioEncoding:
    """Audio encoding formats accepted in RecognitionConfig.encoding."""

    ENCODING_UNSPECIFIED = 0
    LINEAR16 = 1  # 16-bit linear PCM
    FLAC = 2  # FLAC lossless
    MULAW = 3  # 8-bit mu-law
    AMR = 4  # AMR narrowband
    AMR_WB = 5  # AMR wideband
    OGG_OPUS = 6  # Ogg Opus
    SPEEX_WITH_HEADER_BYTE = 7  # Speex with header
    MP3 = 8  # MP3
    WEBM_OPUS = 9  # WebM Opus


# Types of user interactions for recognition optimization.
class InteractionType:
    """Interaction types used to optimize recognition (RecognitionMetadata.interaction_type)."""

    INTERACTION_TYPE_UNSPECIFIED = 0
    DISCUSSION = 1  # Multi-participant discussion
    PRESENTATION = 2  # Single speaker presentation
    PHONE_CALL = 3  # Phone conversation
    VOICEMAIL = 4  # Voicemail message
    PROFESSIONALLY_PRODUCED = 5  # Professional audio content
    VOICE_SEARCH = 6  # Voice search queries
    VOICE_COMMAND = 7  # Voice commands
    DICTATION = 8  # Dictation use case


# Microphone distance from the audio source.
class MicrophoneDistance:
    """Microphone distance categories (RecognitionMetadata.microphone_distance)."""

    MICROPHONE_DISTANCE_UNSPECIFIED = 0
    NEARFIELD = 1  # 0-1 meter from source
    MIDFIELD = 2  # 1-3 meters from source
    FARFIELD = 3  # 3+ meters from source


# Original media type of the audio.
class OriginalMediaType:
    """Original media type categories (RecognitionMetadata.original_media_type)."""

    ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0
    AUDIO = 1  # Audio-only content
    VIDEO = 2  # Video content with audio track


# Type of device used for recording.
class RecordingDeviceType:
    """Recording device types (RecognitionMetadata.recording_device_type)."""

    RECORDING_DEVICE_TYPE_UNSPECIFIED = 0
    SMARTPHONE = 1  # Mobile phone
    PC = 2  # Personal computer
    PHONE_LINE = 3  # Traditional phone line
    VEHICLE = 4  # In-vehicle system
    OTHER_OUTDOOR_DEVICE = 5  # Other outdoor recording
    OTHER_INDOOR_DEVICE = 6  # Other indoor recording


from google.cloud import speech
# Simple configuration for high-quality audio
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    sample_rate_hertz=44100,
    language_code="en-US",
    enable_automatic_punctuation=True,
    enable_word_time_offsets=True,
)

# Audio from file content
with open("audio.flac", "rb") as f:
    audio_content = f.read()
audio = speech.RecognitionAudio(content=audio_content)

from google.cloud import speech
# Comprehensive configuration with all features
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    audio_channel_count=2,
    enable_separate_recognition_per_channel=True,
    language_code="en-US",
    alternative_language_codes=["en-GB", "en-AU"],
    max_alternatives=3,
    profanity_filter=True,
    enable_word_time_offsets=True,
    enable_word_confidence=True,
    enable_automatic_punctuation=True,
    enable_speaker_diarization=True,
    diarization_config=speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        max_speaker_count=6,
    ),
    metadata=speech.RecognitionMetadata(
        interaction_type=speech.RecognitionMetadata.InteractionType.DISCUSSION,
        microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
        original_media_type=speech.RecognitionMetadata.OriginalMediaType.AUDIO,
        recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.SMARTPHONE,
    ),
    speech_contexts=[
        speech.SpeechContext(
            phrases=["technical", "terminology", "API", "cloud computing"],
            boost=10.0,
        )
    ],
    use_enhanced=True,  # Use enhanced model
)

# Cloud Storage audio
audio = speech.RecognitionAudio(
    uri="gs://your-bucket/meeting-recording.wav"
)

# Process comprehensive results
response = client.recognize(config=config, audio=audio)

for i, result in enumerate(response.results):
    print(f"Result {i + 1}:")

    # Process alternatives
    for j, alternative in enumerate(result.alternatives):
        print(f" Alternative {j + 1} (confidence: {alternative.confidence:.2f}):")
        print(f" Transcript: {alternative.transcript}")

        # Process word-level information
        if alternative.words:
            print(" Word details:")
            for word in alternative.words[:5]:  # Show first 5 words
                print(f" '{word.word}': "
                      f"{word.start_time.total_seconds():.1f}s-"
                      f"{word.end_time.total_seconds():.1f}s "
                      f"(confidence: {word.confidence:.2f})")
                if word.speaker_tag:
                    print(f" Speaker: {word.speaker_tag}")

# Access metadata
if response.speech_adaptation_info:
    if response.speech_adaptation_info.adaptation_timeout:
        print("Warning: Speech adaptation timed out")

# Optimal settings for different audio sources
# Phone-line audio (narrowband)
phone_config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.MULAW,
    sample_rate_hertz=8000,
    language_code="en-US",
    metadata=speech.RecognitionMetadata(
        interaction_type=speech.RecognitionMetadata.InteractionType.PHONE_CALL,
        microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
        recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PHONE_LINE,
    ),
)

# High-quality studio recording
studio_config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    sample_rate_hertz=48000,
    language_code="en-US",
    use_enhanced=True,
    metadata=speech.RecognitionMetadata(
        interaction_type=speech.RecognitionMetadata.InteractionType.PROFESSIONALLY_PRODUCED,
        microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
        original_media_type=speech.RecognitionMetadata.OriginalMediaType.AUDIO,
    ),
)

# Mobile app recording
mobile_config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
    enable_automatic_punctuation=True,
    metadata=speech.RecognitionMetadata(
        interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_COMMAND,
        microphone_distance=speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD,
        recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.SMARTPHONE,
    ),
)

# Multi-language support
multilingual_config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",  # Primary language
    alternative_language_codes=[
        "es-ES",  # Spanish
        "fr-FR",  # French
        "de-DE",  # German
    ],
    max_alternatives=2,  # Get alternatives for uncertain regions
)

# Optimized for speed vs accuracy trade-offs
# Speed-first configuration
fast_config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
    max_alternatives=1,  # Single alternative
    enable_word_time_offsets=False,  # Skip word timing
    enable_word_confidence=False,  # Skip word confidence
    # Keep automatic punctuation for readability
    enable_automatic_punctuation=True,
)

# Optimized for maximum accuracy
accurate_config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    sample_rate_hertz=48000,
    language_code="en-US",
    use_enhanced=True,  # Enhanced model
    max_alternatives=3,  # Multiple alternatives
    enable_word_time_offsets=True,  # Word-level timing
    enable_word_confidence=True,  # Word-level confidence
    enable_automatic_punctuation=True,
    enable_speaker_diarization=True,
    diarization_config=speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=1,
        max_speaker_count=10,
    ),
)

from google.protobuf.duration_pb2 import Duration
# Working with Duration objects
for word in alternative.words:
    # Convert to seconds
    start_seconds = word.start_time.total_seconds()
    end_seconds = word.end_time.total_seconds()
    duration = end_seconds - start_seconds
    print(f"Word '{word.word}': {start_seconds:.2f}s - {end_seconds:.2f}s ({duration:.2f}s)")

from google.api_core import exceptions
from google.cloud import speech

try:
    response = client.recognize(config=config, audio=audio)

    # Check for empty results
    if not response.results:
        print("No speech detected in audio")

    # Validate result structure
    for result in response.results:
        if not result.alternatives:
            print("No alternatives found for this result")
            continue
        best_alternative = result.alternatives[0]
        if best_alternative.confidence < 0.5:
            print(f"Low confidence result: {best_alternative.confidence}")
except exceptions.InvalidArgument as e:
    print(f"Invalid configuration: {e}")
except exceptions.OutOfRange as e:
    print(f"Audio too long or other limit exceeded: {e}")
except exceptions.DeadlineExceeded as e:
    print(f"Request timed out: {e}")

# Install with Tessl CLI
npx tessl i tessl/pypi-google-cloud-speech