Google Cloud Speech API client library for speech-to-text conversion with support for real-time streaming, batch processing, and advanced speech recognition models
—
Real-time bidirectional streaming speech recognition for live audio processing. Enables continuous recognition with immediate results as audio is streamed to the service.
Performs real-time speech recognition on streaming audio with immediate partial and final results.
def streaming_recognize(
self,
requests: Iterator[StreamingRecognizeRequest],
*,
retry: OptionalRetry = None,
timeout: Optional[float] = None,
metadata: Sequence[Tuple[str, str]] = ()
) -> Iterator[StreamingRecognizeResponse]:
"""
Performs bidirectional streaming speech recognition.
Parameters:
- requests: Iterator of streaming recognition requests
- retry: Retry configuration for failed requests
- timeout: Request timeout in seconds
- metadata: Additional metadata to send with the request
Returns:
Iterator of StreamingRecognizeResponse objects
Raises:
google.api_core.exceptions.InvalidArgument: If the request is malformed
google.api_core.exceptions.OutOfRange: If streaming limits are exceeded
"""Simplified streaming interface provided by the SpeechHelpers mixin class that automatically handles request formatting and configuration injection.
class SpeechHelpers:
    def streaming_recognize(
        self,
        config: StreamingRecognitionConfig,
        requests: Iterator[StreamingRecognizeRequest],
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> Iterator[StreamingRecognizeResponse]:
        """
        Enhanced streaming recognition with automatic request formatting.

        This helper method automatically prepends the configuration to the
        request stream, simplifying the streaming workflow. Because the
        configuration is sent for you, the ``requests`` iterator should
        yield audio-only requests and must not include its own
        ``streaming_config`` request.

        Parameters:
        - config: Streaming recognition configuration (automatically sent first)
        - requests: Iterator of audio-only requests (no config needed)
        - retry: Retry configuration for failed requests
        - timeout: Request timeout in seconds
        - metadata: Additional metadata to send with the request

        Returns:
        Iterator of StreamingRecognizeResponse objects

        Note:
        This method is mixed into SpeechClient via multiple inheritance.
        Available in speech_v1 and speech_v1p1beta1.
        """

from google.cloud import speech
import pyaudio
import threading
# Client shared by the rest of this example
client = speech.SpeechClient()

# Audio recording parameters
RATE = 16000  # sample rate in Hz
CHUNK = RATE // 10  # 100ms chunks

# Configure streaming recognition: build the per-audio settings first,
# then wrap them in the streaming-level configuration.
_recognition_settings = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=RATE,
    language_code="en-US",
)
config = speech.StreamingRecognitionConfig(
    config=_recognition_settings,
    interim_results=True,  # Enable partial results
)
def generate_requests():
    """Yield microphone audio as audio-only streaming requests.

    Opens a mono 16-bit input stream at ``RATE`` Hz and yields one
    ``StreamingRecognizeRequest`` per ``CHUNK`` frames read (100ms chunks).
    The generator runs until it is closed or reading fails; the audio
    stream and the PyAudio interface are always released.
    """
    # Initialize audio
    audio_interface = pyaudio.PyAudio()
    try:
        # Opened inside the outer try so a failure to open the device
        # still terminates the PyAudio interface.
        audio_stream = audio_interface.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=RATE,
            input=True,
            frames_per_buffer=CHUNK,
        )
        try:
            while True:
                data = audio_stream.read(CHUNK)
                yield speech.StreamingRecognizeRequest(audio_content=data)
        finally:
            audio_stream.stop_stream()
            audio_stream.close()
    finally:
        audio_interface.terminate()
# Perform streaming recognition (the helper sends `config` before the audio)
requests = generate_requests()
responses = client.streaming_recognize(config, requests)

# Process results: interim hypotheses are reported as partial transcripts,
# settled ones as final transcripts.
for response in responses:
    for result in response.results:
        if not result.is_final:
            print(f"Partial transcript: {result.alternatives[0].transcript}")
            continue
        print(f"Final transcript: {result.alternatives[0].transcript}")

from google.cloud import speech
client = speech.SpeechClient()

# Advanced streaming configuration
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        # Phrase hints for the recognizer
        speech_contexts=[
            speech.SpeechContext(
                phrases=["technical", "keywords", "domain", "specific"]
            )
        ],
    ),
    interim_results=True,
    single_utterance=False,  # Continue listening after pauses
    # Voice activity events are a streaming-level option: the field belongs
    # to StreamingRecognitionConfig, not to the nested RecognitionConfig.
    enable_voice_activity_events=True,
)
def stream_recognition():
    """Handle streaming recognition with voice activity detection.

    Streams audio-only requests through the ``streaming_recognize`` helper
    (which sends ``config`` first), reports speech-activity events, and
    prints interim and final transcripts.
    """

    def request_generator():
        # Audio-only requests. The configuration is NOT yielded here because
        # the helper prepends it to the stream automatically.
        # (Implementation would include actual audio capture)
        yield from ()

    event_types = speech.StreamingRecognizeResponse.SpeechEventType

    requests = request_generator()
    responses = client.streaming_recognize(config, requests)
    for response in responses:
        # Handle speech event detection
        event = response.speech_event_type
        if event == event_types.SPEECH_ACTIVITY_BEGIN:
            print("Speech activity started")
        elif event == event_types.SPEECH_ACTIVITY_END:
            print("Speech activity ended")

        # Handle recognition results
        for result in response.results:
            transcript = result.alternatives[0].transcript
            if result.is_final:
                print(f"Final: {transcript}")
            else:
                print(f"Interim: {transcript}")
                # Stability is reported for interim results only; final
                # results leave it at the 0.0 default.
                print(f"Stability: {result.stability}")


class StreamingRecognizeRequest:
    """Request for streaming speech recognition."""
    streaming_config: StreamingRecognitionConfig  # First request only
    audio_content: bytes  # Audio data for subsequent requests


class StreamingRecognitionConfig:
    """Configuration for streaming recognition."""
    config: RecognitionConfig
    single_utterance: bool  # Stop after first utterance
    interim_results: bool  # Return partial results
    enable_voice_activity_events: bool  # Detect speech activity


class StreamingRecognizeResponse:
    """Response from streaming speech recognition."""
    error: Status
    results: Sequence[StreamingRecognitionResult]
    speech_event_type: SpeechEventType
    speech_event_offset: Duration
    total_billed_time: Duration
    speech_adaptation_info: SpeechAdaptationInfo
    request_id: int


class StreamingRecognitionResult:
    """Individual recognition result in streaming response."""
    alternatives: Sequence[SpeechRecognitionAlternative]
    is_final: bool  # True for final results
    stability: float  # Stability score (0.0-1.0)
    result_end_time: Duration
    channel_tag: int
    language_code: str


# Streaming session limits
MAX_STREAMING_DURATION = 305  # seconds (5 minutes + 5 seconds)
MAX_AUDIO_DURATION = 300  # seconds of audio content

# Restart streaming session before limits
import time


def long_running_stream(restart_after=MAX_AUDIO_DURATION - 20):
    """Example of handling streaming session limits.

    Runs the (placeholder) streaming loop until ``restart_after`` seconds
    have elapsed, then announces the restart and returns so the caller can
    open a new session.

    Parameters:
    - restart_after: Seconds to stream before restarting. Defaults to 280,
      i.e. 20 seconds short of MAX_AUDIO_DURATION.

    Returns:
    None
    """
    # Monotonic clock: elapsed time must not be affected by wall-clock
    # adjustments while the session is running.
    session_start = time.monotonic()
    while True:
        if time.monotonic() - session_start > restart_after:  # Restart before 5-minute limit
            print("Restarting streaming session...")
            break
        # Continue streaming...


# Optimal audio settings for streaming
# Per-audio settings, built separately from the streaming wrapper
_optimal_recognition = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,  # Recommended for best performance
    audio_channel_count=1,  # Mono audio
    language_code="en-US",
)
config = speech.StreamingRecognitionConfig(
    config=_optimal_recognition,
    interim_results=True,
)

from google.api_core import exceptions
def robust_streaming():
    """Example of robust streaming with error handling.

    Retries up to three times when the streaming limit is exceeded
    (``OutOfRange``), gives up immediately on ``InvalidArgument``, and stops
    once a stream finishes without raising.
    """
    max_retries = 3
    retry_count = 0
    while retry_count < max_retries:
        try:
            # NOTE(review): `requests` comes from module scope; if it is a
            # one-shot generator it must be rebuilt before each retry,
            # otherwise the retried stream carries no audio.
            responses = client.streaming_recognize(config, requests)
            for response in responses:
                if response.error.code != 0:
                    print(f"Recognition error: {response.error.message}")
                    break
                # Process results...
        except exceptions.OutOfRange as e:
            print(f"Streaming limit exceeded: {e}")
            retry_count += 1
        except exceptions.InvalidArgument as e:
            print(f"Invalid request: {e}")
            break  # Don't retry on invalid arguments
        else:
            # The stream ended without raising (normally or after a reported
            # error). Without this exit the loop would restart forever,
            # because retry_count only grows on OutOfRange.
            break


class SpeechEventType:
    """Types of speech events in streaming recognition."""
    SPEECH_EVENT_UNSPECIFIED = 0
    END_OF_SINGLE_UTTERANCE = 1
    SPEECH_ACTIVITY_BEGIN = 2
    SPEECH_ACTIVITY_END = 3


# Enable voice activity events
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    ),
    # Set on the streaming configuration only: the field belongs to
    # StreamingRecognitionConfig, not to the nested RecognitionConfig.
    enable_voice_activity_events=True,
    interim_results=True,
)
# Process voice activity events
for response in client.streaming_recognize(requests):
if response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN:
print("User started speaking")
elif response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END:
print("User stopped speaking")Install with Tessl CLI
npx tessl i tessl/pypi-google-cloud-speech