
# Streaming Recognition

Real-time bidirectional streaming speech recognition for live audio processing. Enables continuous recognition with immediate results as audio is streamed to the service.

## Capabilities

### Bidirectional Streaming

Performs real-time speech recognition on streaming audio with immediate partial and final results.

```python { .api }
def streaming_recognize(
    self,
    requests: Iterator[StreamingRecognizeRequest],
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Iterator[StreamingRecognizeResponse]:
    """
    Performs bidirectional streaming speech recognition.

    Parameters:
    - requests: Iterator of streaming recognition requests
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
        Iterator of StreamingRecognizeResponse objects

    Raises:
        google.api_core.exceptions.InvalidArgument: If the request is malformed
        google.api_core.exceptions.OutOfRange: If streaming limits are exceeded
    """
```

### SpeechHelpers Streaming Interface

Simplified streaming interface provided by the SpeechHelpers mixin class that automatically handles request formatting and configuration injection.

```python { .api }
class SpeechHelpers:
    def streaming_recognize(
        self,
        config: StreamingRecognitionConfig,
        requests: Iterator[StreamingRecognizeRequest],
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> Iterator[StreamingRecognizeResponse]:
        """
        Enhanced streaming recognition with automatic request formatting.

        This helper method automatically prepends the configuration to the
        request stream, simplifying the streaming workflow.

        Parameters:
        - config: Streaming recognition configuration (automatically sent first)
        - requests: Iterator of audio-only requests (no config needed)
        - retry: Retry configuration for failed requests
        - timeout: Request timeout in seconds
        - metadata: Additional metadata to send with the request

        Returns:
            Iterator of StreamingRecognizeResponse objects

        Note:
            This method is mixed into SpeechClient via multiple inheritance.
            Available in speech_v1 and speech_v1p1beta1.
        """
```
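With the helper, the request generator stays audio-only. A minimal sketch of that flow; `audio_chunks` is a hypothetical stand-in for whatever capture source you use:

```python
from google.cloud import speech

client = speech.SpeechClient()

streaming_config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    ),
)

# audio_chunks (hypothetical) yields raw LINEAR16 byte strings
audio_requests = (
    speech.StreamingRecognizeRequest(audio_content=chunk)
    for chunk in audio_chunks
)

# The helper prepends streaming_config as the first request automatically
for response in client.streaming_recognize(streaming_config, audio_requests):
    for result in response.results:
        print(result.alternatives[0].transcript)
```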

## Usage Examples

### Basic Streaming Recognition

```python
from google.cloud import speech
import pyaudio

client = speech.SpeechClient()

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100 ms chunks

# Configure streaming recognition
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code="en-US",
    ),
    interim_results=True,  # Enable partial results
)

def generate_requests():
    """Generator function to yield audio chunks."""
    # Initialize audio capture from the default microphone
    audio_interface = pyaudio.PyAudio()
    audio_stream = audio_interface.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
    )

    try:
        while True:
            data = audio_stream.read(CHUNK)
            yield speech.StreamingRecognizeRequest(audio_content=data)
    finally:
        audio_stream.stop_stream()
        audio_stream.close()
        audio_interface.terminate()

# Perform streaming recognition
requests = generate_requests()
responses = client.streaming_recognize(config, requests)

# Process results
for response in responses:
    for result in response.results:
        if result.is_final:
            print(f"Final transcript: {result.alternatives[0].transcript}")
        else:
            print(f"Partial transcript: {result.alternatives[0].transcript}")
```

### Advanced Streaming with Voice Activity Detection

```python
from google.cloud import speech

client = speech.SpeechClient()

# Advanced streaming configuration
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        speech_contexts=[
            speech.SpeechContext(
                phrases=["technical", "keywords", "domain", "specific"]
            )
        ],
    ),
    interim_results=True,
    single_utterance=False,  # Continue listening after pauses
    enable_voice_activity_events=True,  # Set on the streaming config, not RecognitionConfig
)

def stream_recognition():
    """Handle streaming recognition with voice activity detection."""
    def request_generator():
        # Low-level pattern: the first request carries the configuration
        yield speech.StreamingRecognizeRequest(streaming_config=config)

        # Subsequent requests carry audio data
        # (Implementation would include actual audio capture)

    requests = request_generator()
    responses = client.streaming_recognize(requests)

    for response in responses:
        # Handle speech event detection
        if response.speech_event_type:
            if response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN:
                print("Speech activity started")
            elif response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END:
                print("Speech activity ended")

        # Handle recognition results
        for result in response.results:
            if result.is_final:
                print(f"Final: {result.alternatives[0].transcript}")
            else:
                print(f"Interim: {result.alternatives[0].transcript}")
                print(f"Stability: {result.stability}")  # Only populated for interim results
```

## Request Types

### StreamingRecognizeRequest

```python { .api }
class StreamingRecognizeRequest:
    """Request for streaming speech recognition."""
    streaming_config: StreamingRecognitionConfig  # First request only; mutually exclusive with audio_content
    audio_content: bytes  # Audio data for subsequent requests
```
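With the low-level interface, the caller builds this config-first sequence by hand. A minimal sketch, assuming `streaming_config` and an `audio_chunks` iterable as in the examples above:

```python
def request_stream(streaming_config, audio_chunks):
    """Yield the config-first request sequence the raw API expects."""
    # Exactly one config request, and it must come first
    yield speech.StreamingRecognizeRequest(streaming_config=streaming_config)
    # All subsequent requests carry only audio bytes
    for chunk in audio_chunks:
        yield speech.StreamingRecognizeRequest(audio_content=chunk)
```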

### StreamingRecognitionConfig

```python { .api }
class StreamingRecognitionConfig:
    """Configuration for streaming recognition."""
    config: RecognitionConfig
    single_utterance: bool  # Stop after first utterance
    interim_results: bool  # Return partial results
    enable_voice_activity_events: bool  # Detect speech activity
```
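As an illustration, `single_utterance=True` suits short voice commands: the service emits an `END_OF_SINGLE_UTTERANCE` event and stops processing audio after the first detected utterance. A sketch, assuming `client` and an audio-only iterator like `audio_requests` from the helper example above:

```python
# Voice-command style: the server ends recognition after one utterance
command_config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    ),
    single_utterance=True,
)

for response in client.streaming_recognize(command_config, audio_requests):
    if response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.END_OF_SINGLE_UTTERANCE:
        print("Utterance complete; no further audio will be processed")
    for result in response.results:
        if result.is_final:
            print(f"Command: {result.alternatives[0].transcript}")
```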

## Response Types

### StreamingRecognizeResponse

```python { .api }
class StreamingRecognizeResponse:
    """Response from streaming speech recognition."""
    error: Status
    results: Sequence[StreamingRecognitionResult]
    speech_event_type: SpeechEventType
    speech_event_offset: Duration
    total_billed_time: Duration
    speech_adaptation_info: SpeechAdaptationInfo
    request_id: int
```

### StreamingRecognitionResult

```python { .api }
class StreamingRecognitionResult:
    """Individual recognition result in streaming response."""
    alternatives: Sequence[SpeechRecognitionAlternative]
    is_final: bool  # True for final results
    stability: float  # Stability score (0.0-1.0); provided for interim results only
    result_end_time: Duration
    channel_tag: int
    language_code: str
```
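One common use of these fields is gating what gets displayed: show only reasonably stable interim text, and commit it once `is_final` arrives. A minimal sketch; the threshold and the two UI helpers are hypothetical:

```python
STABILITY_THRESHOLD = 0.8  # Assumed cutoff; tune empirically for your UI

for response in responses:
    for result in response.results:
        transcript = result.alternatives[0].transcript
        if result.is_final:
            commit_line(transcript)       # Hypothetical: append finished text
        elif result.stability >= STABILITY_THRESHOLD:
            update_preview(transcript)    # Hypothetical: refresh the live preview
```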

## Streaming Limitations and Best Practices

### Time Limits

```python
# Streaming session limits
MAX_STREAMING_DURATION = 305  # seconds (5 minutes + 5 seconds)
MAX_AUDIO_DURATION = 300  # seconds of audio content

# Restart streaming session before limits
import time

def long_running_stream():
    """Example of handling streaming session limits."""
    session_start = time.time()

    while True:
        if time.time() - session_start > 280:  # Restart before the 5-minute limit
            print("Restarting streaming session...")
            break

        # Continue streaming...
```
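To transcribe indefinitely, wrap each session in an outer loop and reopen the stream before the limit. A sketch assuming `client`, `config`, and the `generate_requests` helper from the basic example; each attempt gets a fresh, time-bounded generator:

```python
import time

def bounded_requests(deadline_s=280):
    """Yield audio requests until the session deadline approaches."""
    start = time.time()
    for request in generate_requests():
        if time.time() - start > deadline_s:
            return  # End this stream cleanly before the server-side limit
        yield request

while True:
    for response in client.streaming_recognize(config, bounded_requests()):
        for result in response.results:
            if result.is_final:
                print(result.alternatives[0].transcript)
    # The loop immediately opens a fresh streaming session
```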

### Audio Quality Requirements

```python
# Optimal audio settings for streaming
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,  # Recommended for best performance
        audio_channel_count=1,  # Mono audio
        language_code="en-US",
    ),
    interim_results=True,
)
```
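It helps to sanity-check chunk sizes against these settings: LINEAR16 is 2 bytes per sample, so a 100 ms chunk at 16 kHz is 16000 × 0.1 × 2 = 3200 bytes. A small sketch of that arithmetic:

```python
SAMPLE_RATE = 16000    # Hz
BYTES_PER_SAMPLE = 2   # LINEAR16 = 16-bit PCM
CHUNK_SECONDS = 0.1    # 100 ms chunks keep latency low

chunk_bytes = int(SAMPLE_RATE * CHUNK_SECONDS) * BYTES_PER_SAMPLE
print(chunk_bytes)  # 3200 bytes per request at these settings
```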

### Error Handling in Streaming

```python
from google.api_core import exceptions

def robust_streaming():
    """Example of robust streaming with error handling."""
    max_retries = 3
    retry_count = 0

    while retry_count < max_retries:
        try:
            # Recreate the request generator on each attempt; iterators are single-use
            requests = generate_requests()
            responses = client.streaming_recognize(config, requests)

            for response in responses:
                if response.error.code != 0:
                    print(f"Recognition error: {response.error.message}")
                    break

                # Process results...

            break  # Stream completed; stop retrying

        except exceptions.OutOfRange as e:
            print(f"Streaming limit exceeded: {e}")
            retry_count += 1

        except exceptions.InvalidArgument as e:
            print(f"Invalid request: {e}")
            break  # Don't retry on invalid arguments
```
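If transient failures such as `ServiceUnavailable` are a concern, a short exponential backoff between attempts is a common refinement. A sketch under the same assumptions (`client`, `config`, `generate_requests` from the basic example):

```python
import time

from google.api_core import exceptions

def streaming_with_backoff(max_attempts=3):
    """Retry transient streaming failures with exponential backoff."""
    for attempt in range(max_attempts):
        try:
            responses = client.streaming_recognize(config, generate_requests())
            for response in responses:
                for result in response.results:
                    if result.is_final:
                        print(result.alternatives[0].transcript)
            return  # Stream finished without a transient error
        except (exceptions.ServiceUnavailable, exceptions.OutOfRange) as e:
            wait = 2 ** attempt  # 1 s, 2 s, 4 s
            print(f"Transient failure ({e}); retrying in {wait} s")
            time.sleep(wait)
```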

## Voice Activity Events

```python { .api }
class SpeechEventType:
    """Types of speech events in streaming recognition."""
    SPEECH_EVENT_UNSPECIFIED = 0
    END_OF_SINGLE_UTTERANCE = 1
    SPEECH_ACTIVITY_BEGIN = 2
    SPEECH_ACTIVITY_END = 3
```

### Voice Activity Detection Usage

```python
# Enable voice activity events on the streaming config
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    ),
    enable_voice_activity_events=True,
    interim_results=True,
)

# Process voice activity events (requests as in the examples above)
for response in client.streaming_recognize(config, requests):
    if response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN:
        print("User started speaking")
    elif response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END:
        print("User stopped speaking")
```