or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs/

advanced-features.md · index.md · speech-adaptation.md · speech-recognition.md · streaming-recognition.md · types-and-configuration.md

docs/speech-recognition.md

# Speech Recognition

Core speech-to-text functionality providing synchronous and asynchronous recognition modes for converting audio to text with high accuracy and extensive configuration options.

## Capabilities

### Synchronous Recognition

Performs immediate speech recognition on short audio files (typically under 1 minute). Ideal for real-time applications requiring immediate results.

```python { .api }
def recognize(
    self,
    config: RecognitionConfig,
    audio: RecognitionAudio,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> RecognizeResponse:
    """
    Performs synchronous speech recognition.

    Parameters:
    - config: Configuration for the recognition request
    - audio: Audio data to be recognized
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
        RecognizeResponse containing recognition results

    Raises:
        google.api_core.exceptions.InvalidArgument: If the request is malformed
        google.api_core.exceptions.DeadlineExceeded: If the request times out
    """
```

#### Usage Example

```python
from google.cloud import speech
import io

client = speech.SpeechClient()

# Load audio file
with io.open("short_audio.wav", "rb") as audio_file:
    content = audio_file.read()

# Configure recognition
audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
    enable_automatic_punctuation=True,
    enable_word_time_offsets=True,
)

# Perform recognition
response = client.recognize(config=config, audio=audio)

# Process results
for result in response.results:
    alternative = result.alternatives[0]
    print(f"Transcript: {alternative.transcript}")
    print(f"Confidence: {alternative.confidence}")

    # Word-level information
    for word in alternative.words:
        print(f"Word: {word.word}, "
              f"Start: {word.start_time.total_seconds()}s, "
              f"End: {word.end_time.total_seconds()}s")
```

### Asynchronous Recognition

Performs long-running speech recognition on longer audio files. Returns immediately with an operation object that can be polled for results.

```python { .api }
def long_running_recognize(
    self,
    config: RecognitionConfig,
    audio: RecognitionAudio,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """
    Performs asynchronous speech recognition for longer audio files.

    Parameters:
    - config: Configuration for the recognition request
    - audio: Audio data to be recognized (can be Cloud Storage URI)
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
        Operation object that can be polled for results

    Raises:
        google.api_core.exceptions.InvalidArgument: If the request is malformed
    """
```

#### Usage Example

```python
from google.cloud import speech

client = speech.SpeechClient()

# Configure for Cloud Storage audio file
audio = speech.RecognitionAudio(
    uri="gs://your-bucket/long_audio.flac"
)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    sample_rate_hertz=44100,
    language_code="en-US",
    enable_speaker_diarization=True,
    diarization_config=speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        max_speaker_count=10,
    ),
)

# Start long-running operation
operation = client.long_running_recognize(config=config, audio=audio)
print(f"Operation name: {operation.operation.name}")

# Wait for completion (with timeout)
response = operation.result(timeout=600)  # 10 minutes

# Process results with speaker information
for result in response.results:
    alternative = result.alternatives[0]
    print(f"Transcript: {alternative.transcript}")

    # Speaker diarization results
    for word in alternative.words:
        print(f"Speaker {word.speaker_tag}: {word.word}")
```

## Request Types

### RecognizeRequest

```python { .api }
class RecognizeRequest:
    """Request for synchronous speech recognition."""
    config: RecognitionConfig
    audio: RecognitionAudio
```

### LongRunningRecognizeRequest

```python { .api }
class LongRunningRecognizeRequest:
    """Request for asynchronous speech recognition."""
    config: RecognitionConfig
    audio: RecognitionAudio
    output_config: TranscriptOutputConfig  # Optional output configuration
```

## Response Types

### RecognizeResponse

```python { .api }
class RecognizeResponse:
    """Response from synchronous speech recognition."""
    results: Sequence[SpeechRecognitionResult]
    total_billed_time: Duration
    speech_adaptation_info: SpeechAdaptationInfo
    request_id: int
```

### LongRunningRecognizeResponse

```python { .api }
class LongRunningRecognizeResponse:
    """Response from asynchronous speech recognition."""
    results: Sequence[SpeechRecognitionResult]
    total_billed_time: Duration
    speech_adaptation_info: SpeechAdaptationInfo
    request_id: int
    output_config: TranscriptOutputConfig
    output_error: Status
```

### LongRunningRecognizeMetadata

```python { .api }
class LongRunningRecognizeMetadata:
    """Metadata for long-running recognition operations."""
    progress_percent: int
    start_time: Timestamp
    last_update_time: Timestamp
    uri: str
```

## Configuration Options

### Audio Format Support

```python
# Supported audio encodings
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    # Other options:
    # - FLAC
    # - MULAW
    # - AMR
    # - AMR_WB
    # - OGG_OPUS
    # - SPEEX_WITH_HEADER_BYTE
    # - MP3
    # - WEBM_OPUS
)
```

### Language and Regional Support

```python
# Language codes
config = speech.RecognitionConfig(
    language_code="en-US",  # Primary language
    alternative_language_codes=["en-GB", "es-ES"],  # Alternative languages
)
```

### Audio Enhancement Features

```python
config = speech.RecognitionConfig(
    # Automatic punctuation
    enable_automatic_punctuation=True,

    # Word timing information
    enable_word_time_offsets=True,

    # Confidence scores
    enable_word_confidence=True,

    # Speaker diarization
    enable_speaker_diarization=True,
    diarization_config=speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        max_speaker_count=6,
    ),

    # Profanity filter
    profanity_filter=True,

    # Speech contexts for better accuracy
    speech_contexts=[
        speech.SpeechContext(
            phrases=["custom", "terminology", "specific", "words"]
        )
    ],
)