# Streaming Recognition

Real-time bidirectional streaming speech recognition for live audio processing. Enables continuous recognition with immediate results as audio is streamed to the service.

## Capabilities

### Bidirectional Streaming

Performs real-time speech recognition on streaming audio with immediate partial and final results.

```python { .api }
def streaming_recognize(
    self,
    requests: Iterator[StreamingRecognizeRequest],
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Iterator[StreamingRecognizeResponse]:
    """
    Performs bidirectional streaming speech recognition.

    Parameters:
    - requests: Iterator of streaming recognition requests
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
        Iterator of StreamingRecognizeResponse objects

    Raises:
        google.api_core.exceptions.InvalidArgument: If the request is malformed
        google.api_core.exceptions.OutOfRange: If streaming limits are exceeded
    """
```

### SpeechHelpers Streaming Interface

Simplified streaming interface provided by the SpeechHelpers mixin class that automatically handles request formatting and configuration injection.

```python { .api }
class SpeechHelpers:
    def streaming_recognize(
        self,
        config: StreamingRecognitionConfig,
        requests: Iterator[StreamingRecognizeRequest],
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> Iterator[StreamingRecognizeResponse]:
        """
        Enhanced streaming recognition with automatic request formatting.

        This helper method automatically prepends the configuration to the
        request stream, simplifying the streaming workflow.

        Parameters:
        - config: Streaming recognition configuration (automatically sent first)
        - requests: Iterator of audio-only requests (no config needed)
        - retry: Retry configuration for failed requests
        - timeout: Request timeout in seconds
        - metadata: Additional metadata to send with the request

        Returns:
            Iterator of StreamingRecognizeResponse objects

        Note:
            This method is mixed into SpeechClient via multiple inheritance.
            Available in speech_v1 and speech_v1p1beta1.
        """
```

## Usage Examples

### Basic Streaming Recognition

```python
from google.cloud import speech
import pyaudio

client = speech.SpeechClient()

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms chunks

# Configure streaming recognition
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code="en-US",
    ),
    interim_results=True,  # Enable partial results
)

def generate_requests():
    """Generator function to yield audio chunks."""
    # Initialize audio
    audio_interface = pyaudio.PyAudio()
    audio_stream = audio_interface.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
    )

    try:
        while True:
            data = audio_stream.read(CHUNK)
            yield speech.StreamingRecognizeRequest(audio_content=data)
    finally:
        audio_stream.stop_stream()
        audio_stream.close()
        audio_interface.terminate()

# Perform streaming recognition
requests = generate_requests()
responses = client.streaming_recognize(config, requests)

# Process results
for response in responses:
    for result in response.results:
        if result.is_final:
            print(f"Final transcript: {result.alternatives[0].transcript}")
        else:
            print(f"Partial transcript: {result.alternatives[0].transcript}")
```

### Advanced Streaming with Voice Activity Detection

```python
from google.cloud import speech

client = speech.SpeechClient()

# Advanced streaming configuration
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        speech_contexts=[
            speech.SpeechContext(
                phrases=["technical", "keywords", "domain", "specific"]
            )
        ],
    ),
    interim_results=True,
    single_utterance=False,  # Continue listening after pauses
    # Voice activity events are configured on StreamingRecognitionConfig,
    # not on the inner RecognitionConfig.
    enable_voice_activity_events=True,
)

def stream_recognition():
    """Handle streaming recognition with voice activity detection."""
    def request_generator():
        # Yield audio-only requests; the SpeechHelpers client sends the
        # configuration as the first request automatically.
        # (Implementation would include actual audio capture)
        yield speech.StreamingRecognizeRequest(audio_content=b"")

    requests = request_generator()
    responses = client.streaming_recognize(config, requests)

    for response in responses:
        # Handle speech event detection
        if response.speech_event_type:
            if response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN:
                print("Speech activity started")
            elif response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END:
                print("Speech activity ended")

        # Handle recognition results
        for result in response.results:
            if result.is_final:
                print(f"Final: {result.alternatives[0].transcript}")
            else:
                print(f"Interim: {result.alternatives[0].transcript}")
                # Stability is only meaningful for interim results
                print(f"Stability: {result.stability}")
```

## Request Types

### StreamingRecognizeRequest

```python { .api }
class StreamingRecognizeRequest:
    """Request for streaming speech recognition."""
    streaming_config: StreamingRecognitionConfig  # First request only
    audio_content: bytes  # Audio data for subsequent requests
```

### StreamingRecognitionConfig

```python { .api }
class StreamingRecognitionConfig:
    """Configuration for streaming recognition."""
    config: RecognitionConfig
    single_utterance: bool  # Stop after first utterance
    interim_results: bool  # Return partial results
    enable_voice_activity_events: bool  # Detect speech activity
```

## Response Types

### StreamingRecognizeResponse

```python { .api }
class StreamingRecognizeResponse:
    """Response from streaming speech recognition."""
    error: Status
    results: Sequence[StreamingRecognitionResult]
    speech_event_type: SpeechEventType
    speech_event_offset: Duration
    total_billed_time: Duration
    speech_adaptation_info: SpeechAdaptationInfo
    request_id: int
```

### StreamingRecognitionResult

```python { .api }
class StreamingRecognitionResult:
    """Individual recognition result in streaming response."""
    alternatives: Sequence[SpeechRecognitionAlternative]
    is_final: bool  # True for final results
    stability: float  # Stability score (0.0-1.0)
    result_end_time: Duration
    channel_tag: int
    language_code: str
```

## Streaming Limitations and Best Practices

### Time Limits

```python
# Streaming session limits
MAX_STREAMING_DURATION = 305  # seconds (5 minutes + 5 seconds)
MAX_AUDIO_DURATION = 300  # seconds of audio content

# Restart streaming session before limits
import time

def long_running_stream():
    """Example of handling streaming session limits."""
    session_start = time.time()

    while True:
        if time.time() - session_start > 280:  # Restart before 5-minute limit
            print("Restarting streaming session...")
            break

        # Continue streaming...
```

### Audio Quality Requirements

```python
# Optimal audio settings for streaming
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,  # Recommended for best performance
        audio_channel_count=1,  # Mono audio
        language_code="en-US",
    ),
    interim_results=True,
)
```

### Error Handling in Streaming

```python
from google.api_core import exceptions

def robust_streaming():
    """Example of robust streaming with error handling."""
    max_retries = 3
    retry_count = 0

    while retry_count < max_retries:
        try:
            responses = client.streaming_recognize(config, requests)

            for response in responses:
                if response.error.code != 0:
                    print(f"Recognition error: {response.error.message}")
                    break

                # Process results...

        except exceptions.OutOfRange as e:
            print(f"Streaming limit exceeded: {e}")
            retry_count += 1

        except exceptions.InvalidArgument as e:
            print(f"Invalid request: {e}")
            break  # Don't retry on invalid arguments

        else:
            break  # Stream completed cleanly; stop retrying
```

## Voice Activity Events

```python { .api }
class SpeechEventType:
    """Types of speech events in streaming recognition."""
    SPEECH_EVENT_UNSPECIFIED = 0
    END_OF_SINGLE_UTTERANCE = 1
    SPEECH_ACTIVITY_BEGIN = 2
    SPEECH_ACTIVITY_END = 3
```

### Voice Activity Detection Usage

```python
# Enable voice activity events (a StreamingRecognitionConfig field)
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    ),
    enable_voice_activity_events=True,
    interim_results=True,
)

# Process voice activity events
for response in client.streaming_recognize(config, requests):
    if response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN:
        print("User started speaking")
    elif response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END:
        print("User stopped speaking")
```
```