
# Streaming Recognition

Real-time bidirectional streaming speech recognition for live audio processing. Enables continuous recognition with immediate results as audio is streamed to the service.

## Capabilities

### Bidirectional Streaming

Performs real-time speech recognition on streaming audio with immediate partial and final results.

```python { .api }
def streaming_recognize(
    self,
    requests: Iterator[StreamingRecognizeRequest],
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Iterator[StreamingRecognizeResponse]:
    """
    Performs bidirectional streaming speech recognition.

    Parameters:
    - requests: Iterator of streaming recognition requests
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
        Iterator of StreamingRecognizeResponse objects

    Raises:
        google.api_core.exceptions.InvalidArgument: If the request is malformed
        google.api_core.exceptions.OutOfRange: If streaming limits are exceeded
    """
```

### SpeechHelpers Streaming Interface

Simplified streaming interface provided by the SpeechHelpers mixin class that automatically handles request formatting and configuration injection.

```python { .api }
class SpeechHelpers:
    def streaming_recognize(
        self,
        config: StreamingRecognitionConfig,
        requests: Iterator[StreamingRecognizeRequest],
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> Iterator[StreamingRecognizeResponse]:
        """
        Enhanced streaming recognition with automatic request formatting.

        This helper method automatically prepends the configuration to the
        request stream, simplifying the streaming workflow.

        Parameters:
        - config: Streaming recognition configuration (automatically sent first)
        - requests: Iterator of audio-only requests (no config needed)
        - retry: Retry configuration for failed requests
        - timeout: Request timeout in seconds
        - metadata: Additional metadata to send with the request

        Returns:
            Iterator of StreamingRecognizeResponse objects

        Note:
            This method is mixed into SpeechClient via multiple inheritance.
            Available in speech_v1 and speech_v1p1beta1.
        """
```
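With the helper, the request generator stays audio-only. A minimal sketch of that flow; `audio_chunks` is a hypothetical stand-in for whatever capture source you use:

```python
from google.cloud import speech

client = speech.SpeechClient()

streaming_config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    ),
)

# audio_chunks (hypothetical) yields raw LINEAR16 byte strings
audio_requests = (
    speech.StreamingRecognizeRequest(audio_content=chunk)
    for chunk in audio_chunks
)

# The helper prepends streaming_config as the first request automatically
for response in client.streaming_recognize(streaming_config, audio_requests):
    for result in response.results:
        print(result.alternatives[0].transcript)
```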

## Usage Examples

### Basic Streaming Recognition

```python
from google.cloud import speech
import pyaudio

client = speech.SpeechClient()

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100 ms chunks

# Configure streaming recognition
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code="en-US",
    ),
    interim_results=True,  # Enable partial results
)

def generate_requests():
    """Generator function to yield audio chunks."""
    # Initialize audio capture from the default microphone
    audio_interface = pyaudio.PyAudio()
    audio_stream = audio_interface.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
    )

    try:
        while True:
            data = audio_stream.read(CHUNK)
            yield speech.StreamingRecognizeRequest(audio_content=data)
    finally:
        audio_stream.stop_stream()
        audio_stream.close()
        audio_interface.terminate()

# Perform streaming recognition
requests = generate_requests()
responses = client.streaming_recognize(config, requests)

# Process results
for response in responses:
    for result in response.results:
        if result.is_final:
            print(f"Final transcript: {result.alternatives[0].transcript}")
        else:
            print(f"Partial transcript: {result.alternatives[0].transcript}")
```

### Advanced Streaming with Voice Activity Detection

```python
from google.cloud import speech

client = speech.SpeechClient()

# Advanced streaming configuration
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        speech_contexts=[
            speech.SpeechContext(
                phrases=["technical", "keywords", "domain", "specific"]
            )
        ],
    ),
    interim_results=True,
    single_utterance=False,  # Continue listening after pauses
    enable_voice_activity_events=True,  # Set on the streaming config, not RecognitionConfig
)

def stream_recognition():
    """Handle streaming recognition with voice activity detection."""
    def request_generator():
        # Low-level pattern: the first request carries the configuration
        yield speech.StreamingRecognizeRequest(streaming_config=config)

        # Subsequent requests carry audio data
        # (Implementation would include actual audio capture)

    requests = request_generator()
    responses = client.streaming_recognize(requests)

    for response in responses:
        # Handle speech event detection
        if response.speech_event_type:
            if response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN:
                print("Speech activity started")
            elif response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END:
                print("Speech activity ended")

        # Handle recognition results
        for result in response.results:
            if result.is_final:
                print(f"Final: {result.alternatives[0].transcript}")
            else:
                print(f"Interim: {result.alternatives[0].transcript}")
                print(f"Stability: {result.stability}")  # Only populated for interim results
```

## Request Types

### StreamingRecognizeRequest

```python { .api }
class StreamingRecognizeRequest:
    """Request for streaming speech recognition."""
    streaming_config: StreamingRecognitionConfig  # First request only; mutually exclusive with audio_content
    audio_content: bytes  # Audio data for subsequent requests
```
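With the low-level interface, the caller builds this config-first sequence by hand. A minimal sketch, assuming `streaming_config` and an `audio_chunks` iterable as in the examples above:

```python
def request_stream(streaming_config, audio_chunks):
    """Yield the config-first request sequence the raw API expects."""
    # Exactly one config request, and it must come first
    yield speech.StreamingRecognizeRequest(streaming_config=streaming_config)
    # All subsequent requests carry only audio bytes
    for chunk in audio_chunks:
        yield speech.StreamingRecognizeRequest(audio_content=chunk)
```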

### StreamingRecognitionConfig

```python { .api }
class StreamingRecognitionConfig:
    """Configuration for streaming recognition."""
    config: RecognitionConfig
    single_utterance: bool  # Stop after first utterance
    interim_results: bool  # Return partial results
    enable_voice_activity_events: bool  # Detect speech activity
```
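As an illustration, `single_utterance=True` suits short voice commands: the service emits an `END_OF_SINGLE_UTTERANCE` event and stops processing audio after the first detected utterance. A sketch, assuming `client` and an audio-only iterator like `audio_requests` from the helper example above:

```python
# Voice-command style: the server ends recognition after one utterance
command_config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    ),
    single_utterance=True,
)

for response in client.streaming_recognize(command_config, audio_requests):
    if response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.END_OF_SINGLE_UTTERANCE:
        print("Utterance complete; no further audio will be processed")
    for result in response.results:
        if result.is_final:
            print(f"Command: {result.alternatives[0].transcript}")
```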

## Response Types

### StreamingRecognizeResponse

```python { .api }
class StreamingRecognizeResponse:
    """Response from streaming speech recognition."""
    error: Status
    results: Sequence[StreamingRecognitionResult]
    speech_event_type: SpeechEventType
    speech_event_offset: Duration
    total_billed_time: Duration
    speech_adaptation_info: SpeechAdaptationInfo
    request_id: int
```

### StreamingRecognitionResult

```python { .api }
class StreamingRecognitionResult:
    """Individual recognition result in streaming response."""
    alternatives: Sequence[SpeechRecognitionAlternative]
    is_final: bool  # True for final results
    stability: float  # Stability score (0.0-1.0); provided for interim results only
    result_end_time: Duration
    channel_tag: int
    language_code: str
```
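One common use of these fields is gating what gets displayed: show only reasonably stable interim text, and commit it once `is_final` arrives. A minimal sketch; the threshold and the two UI helpers are hypothetical:

```python
STABILITY_THRESHOLD = 0.8  # Assumed cutoff; tune empirically for your UI

for response in responses:
    for result in response.results:
        transcript = result.alternatives[0].transcript
        if result.is_final:
            commit_line(transcript)       # Hypothetical: append finished text
        elif result.stability >= STABILITY_THRESHOLD:
            update_preview(transcript)    # Hypothetical: refresh the live preview
```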

## Streaming Limitations and Best Practices

### Time Limits

```python
# Streaming session limits
MAX_STREAMING_DURATION = 305  # seconds (5 minutes + 5 seconds)
MAX_AUDIO_DURATION = 300  # seconds of audio content

# Restart streaming session before limits
import time

def long_running_stream():
    """Example of handling streaming session limits."""
    session_start = time.time()

    while True:
        if time.time() - session_start > 280:  # Restart before the 5-minute limit
            print("Restarting streaming session...")
            break

        # Continue streaming...
```
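To transcribe indefinitely, wrap each session in an outer loop and reopen the stream before the limit. A sketch assuming `client`, `config`, and the `generate_requests` helper from the basic example; each attempt gets a fresh, time-bounded generator:

```python
import time

def bounded_requests(deadline_s=280):
    """Yield audio requests until the session deadline approaches."""
    start = time.time()
    for request in generate_requests():
        if time.time() - start > deadline_s:
            return  # End this stream cleanly before the server-side limit
        yield request

while True:
    for response in client.streaming_recognize(config, bounded_requests()):
        for result in response.results:
            if result.is_final:
                print(result.alternatives[0].transcript)
    # The loop immediately opens a fresh streaming session
```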

### Audio Quality Requirements

```python
# Optimal audio settings for streaming
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,  # Recommended for best performance
        audio_channel_count=1,  # Mono audio
        language_code="en-US",
    ),
    interim_results=True,
)
```
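It helps to sanity-check chunk sizes against these settings: LINEAR16 is 2 bytes per sample, so a 100 ms chunk at 16 kHz is 16000 × 0.1 × 2 = 3200 bytes. A small sketch of that arithmetic:

```python
SAMPLE_RATE = 16000    # Hz
BYTES_PER_SAMPLE = 2   # LINEAR16 = 16-bit PCM
CHUNK_SECONDS = 0.1    # 100 ms chunks keep latency low

chunk_bytes = int(SAMPLE_RATE * CHUNK_SECONDS) * BYTES_PER_SAMPLE
print(chunk_bytes)  # 3200 bytes per request at these settings
```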

### Error Handling in Streaming

```python
from google.api_core import exceptions

def robust_streaming():
    """Example of robust streaming with error handling."""
    max_retries = 3
    retry_count = 0

    while retry_count < max_retries:
        try:
            # Recreate the request generator on each attempt; iterators are single-use
            requests = generate_requests()
            responses = client.streaming_recognize(config, requests)

            for response in responses:
                if response.error.code != 0:
                    print(f"Recognition error: {response.error.message}")
                    break

                # Process results...

            break  # Stream completed; stop retrying

        except exceptions.OutOfRange as e:
            print(f"Streaming limit exceeded: {e}")
            retry_count += 1

        except exceptions.InvalidArgument as e:
            print(f"Invalid request: {e}")
            break  # Don't retry on invalid arguments
```
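If transient failures such as `ServiceUnavailable` are a concern, a short exponential backoff between attempts is a common refinement. A sketch under the same assumptions (`client`, `config`, `generate_requests` from the basic example):

```python
import time

from google.api_core import exceptions

def streaming_with_backoff(max_attempts=3):
    """Retry transient streaming failures with exponential backoff."""
    for attempt in range(max_attempts):
        try:
            responses = client.streaming_recognize(config, generate_requests())
            for response in responses:
                for result in response.results:
                    if result.is_final:
                        print(result.alternatives[0].transcript)
            return  # Stream finished without a transient error
        except (exceptions.ServiceUnavailable, exceptions.OutOfRange) as e:
            wait = 2 ** attempt  # 1 s, 2 s, 4 s
            print(f"Transient failure ({e}); retrying in {wait} s")
            time.sleep(wait)
```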

## Voice Activity Events

```python { .api }
class SpeechEventType:
    """Types of speech events in streaming recognition."""
    SPEECH_EVENT_UNSPECIFIED = 0
    END_OF_SINGLE_UTTERANCE = 1
    SPEECH_ACTIVITY_BEGIN = 2
    SPEECH_ACTIVITY_END = 3
```

### Voice Activity Detection Usage

```python
# Enable voice activity events on the streaming config
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    ),
    enable_voice_activity_events=True,
    interim_results=True,
)

# Process voice activity events (requests as in the examples above)
for response in client.streaming_recognize(config, requests):
    if response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN:
        print("User started speaking")
    elif response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END:
        print("User stopped speaking")
```