Google Cloud Speech API client library for speech-to-text conversion with support for real-time streaming, batch processing, and advanced speech recognition models
—
Core speech-to-text functionality providing synchronous and asynchronous recognition modes for converting audio to text with high accuracy and extensive configuration options.
Performs immediate speech recognition on short audio files (typically under 1 minute). Ideal for real-time applications requiring immediate results.
def recognize(
    self,
    config: RecognitionConfig,
    audio: RecognitionAudio,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> RecognizeResponse:
    """
    Performs synchronous speech recognition.

    Suitable for short audio (typically under 1 minute); results are
    returned directly in the response rather than via an operation.

    Parameters:
    - config: Configuration for the recognition request
    - audio: Audio data to be recognized
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    RecognizeResponse containing recognition results

    Raises:
    google.api_core.exceptions.InvalidArgument: If the request is malformed
    google.api_core.exceptions.DeadlineExceeded: If the request times out
    """

from google.cloud import speech
import io
client = speech.SpeechClient()
# Load audio file
with io.open("short_audio.wav", "rb") as audio_file:
content = audio_file.read()
# Configure recognition
audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
language_code="en-US",
enable_automatic_punctuation=True,
enable_word_time_offsets=True,
)
# Perform recognition
response = client.recognize(config=config, audio=audio)
# Process results
for result in response.results:
alternative = result.alternatives[0]
print(f"Transcript: {alternative.transcript}")
print(f"Confidence: {alternative.confidence}")
# Word-level information
for word in alternative.words:
print(f"Word: {word.word}, "
f"Start: {word.start_time.total_seconds()}s, "
f"End: {word.end_time.total_seconds()}s")Performs long-running speech recognition on longer audio files. Returns immediately with an operation object that can be polled for results.
def long_running_recognize(
    self,
    config: RecognitionConfig,
    audio: RecognitionAudio,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """
    Performs asynchronous speech recognition for longer audio files.

    Returns immediately; poll the returned Operation (or call
    ``operation.result()``) to obtain the final results.

    Parameters:
    - config: Configuration for the recognition request
    - audio: Audio data to be recognized (can be a Cloud Storage URI)
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    Operation object that can be polled for results

    Raises:
    google.api_core.exceptions.InvalidArgument: If the request is malformed
    """

from google.cloud import speech
client = speech.SpeechClient()

# Reference long audio via a Cloud Storage URI; inline content is limited
# to short clips, so gs:// URIs are the right choice for long files.
audio = speech.RecognitionAudio(
    uri="gs://your-bucket/long_audio.flac"
)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    sample_rate_hertz=44100,
    language_code="en-US",
    # Speaker diarization is enabled through SpeakerDiarizationConfig;
    # the deprecated top-level enable_speaker_diarization flag is redundant
    # when diarization_config is provided, so it is omitted here.
    diarization_config=speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        max_speaker_count=10,
    ),
)

# Start the long-running operation; this call returns immediately.
operation = client.long_running_recognize(config=config, audio=audio)
print(f"Operation name: {operation.operation.name}")

# Block until the operation completes (up to 10 minutes).
response = operation.result(timeout=600)

# Process results with speaker information: the first alternative of each
# result is the most likely transcript.
for result in response.results:
    alternative = result.alternatives[0]
    print(f"Transcript: {alternative.transcript}")
    # Per-word speaker labels produced by diarization.
    for word in alternative.words:
        print(f"Speaker {word.speaker_tag}: {word.word}")


class RecognizeRequest:
    """Request for synchronous speech recognition."""
    config: RecognitionConfig
    audio: RecognitionAudio


class LongRunningRecognizeRequest:
    """Request for asynchronous speech recognition."""
    config: RecognitionConfig
    audio: RecognitionAudio
    output_config: TranscriptOutputConfig  # Optional output configuration


class RecognizeResponse:
    """Response from synchronous speech recognition."""
    results: Sequence[SpeechRecognitionResult]
    total_billed_time: Duration
    speech_adaptation_info: SpeechAdaptationInfo
    request_id: int


class LongRunningRecognizeResponse:
    """Response from asynchronous speech recognition."""
    results: Sequence[SpeechRecognitionResult]
    total_billed_time: Duration
    speech_adaptation_info: SpeechAdaptationInfo
    request_id: int
    output_config: TranscriptOutputConfig
    output_error: Status


class LongRunningRecognizeMetadata:
    """Metadata for long-running recognition operations."""
    progress_percent: int
    start_time: Timestamp
    last_update_time: Timestamp
    uri: str

# Supported audio encodings
# Encoding must match the actual audio format being submitted.
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    # Other options:
    # - FLAC
    # - MULAW
    # - AMR
    # - AMR_WB
    # - OGG_OPUS
    # - SPEEX_WITH_HEADER_BYTE
    # - MP3
    # - WEBM_OPUS
)

# Language codes
config = speech.RecognitionConfig(
language_code="en-US", # Primary language
alternative_language_codes=["en-GB", "es-ES"], # Alternative languages
)config = speech.RecognitionConfig(
# Automatic punctuation
enable_automatic_punctuation=True,
# Word timing information
enable_word_time_offsets=True,
# Confidence scores
enable_word_confidence=True,
# Speaker diarization
enable_speaker_diarization=True,
diarization_config=speech.SpeakerDiarizationConfig(
enable_speaker_diarization=True,
min_speaker_count=2,
max_speaker_count=6,
),
# Profanity filter
profanity_filter=True,
# Speech contexts for better accuracy
speech_contexts=[
speech.SpeechContext(
phrases=["custom", "terminology", "specific", "words"]
)
],
)Install with Tessl CLI
npx tessl i tessl/pypi-google-cloud-speech