Google Cloud Speech API client library for speech-to-text conversion with support for real-time streaming, batch processing, and advanced speech recognition models
npx @tessl/cli install tessl/pypi-google-cloud-speech@2.33.0

Google Cloud Speech API client library providing advanced speech-to-text conversion capabilities. This package offers real-time streaming recognition, batch processing, and custom speech adaptation, serving as Python's interface to Google's speech recognition technology.
pip install google-cloud-speech

Default import (uses the v1 API):
from google.cloud import speech

Version-specific imports:
from google.cloud import speech_v1 # Stable API
from google.cloud import speech_v1p1beta1 # Beta features
from google.cloud import speech_v2        # Next-generation API

Common client initialization:
from google.cloud import speech
# Initialize the speech client
client = speech.SpeechClient()

Basic transcription of a local audio file:

from google.cloud import speech
import io
# Initialize the client
client = speech.SpeechClient()
# Load audio file
with io.open("audio_file.wav", "rb") as audio_file:
    content = audio_file.read()
# Configure recognition
audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
)
# Perform speech recognition
response = client.recognize(config=config, audio=audio)
# Process results
for result in response.results:
print(f"Transcript: {result.alternatives[0].transcript}")
print(f"Confidence: {result.alternatives[0].confidence}")The Google Cloud Speech API provides three main API versions:
Core speech-to-text functionality supporting synchronous, asynchronous, and streaming recognition modes with extensive configuration options.
class SpeechClient:
    def recognize(
        self,
        config: RecognitionConfig,
        audio: RecognitionAudio,
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> RecognizeResponse: ...

    def long_running_recognize(
        self,
        config: RecognitionConfig,
        audio: RecognitionAudio,
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> Operation: ...

Real-time bidirectional streaming speech recognition for live audio processing with immediate results.
class SpeechClient:
    def streaming_recognize(
        self,
        requests: Iterator[StreamingRecognizeRequest],
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> Iterator[StreamingRecognizeResponse]: ...
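A minimal streaming sketch. It assumes the convenience form of streaming_recognize exposed by the top-level client, which accepts the streaming configuration separately and sends it as the first request; audio_chunks stands in for an iterable of small raw-audio byte strings (for example, microphone buffers):

from google.cloud import speech

client = speech.SpeechClient()

streaming_config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    ),
    interim_results=True,
)

# audio_chunks is a placeholder for an iterable of small raw-audio byte strings
requests = (
    speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in audio_chunks
)

responses = client.streaming_recognize(config=streaming_config, requests=requests)
for response in responses:
    for result in response.results:
        print(f"Transcript: {result.alternatives[0].transcript}")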
Custom speech model adaptation using phrase sets and custom word classes to improve recognition accuracy for domain-specific vocabulary.

class AdaptationClient:
    def create_phrase_set(
        self,
        request: CreatePhraseSetRequest,
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> PhraseSet: ...

    def create_custom_class(
        self,
        request: CreateCustomClassRequest,
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> CustomClass: ...
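A minimal sketch of creating a phrase set with the v1 AdaptationClient; the project ID, phrase set ID, and phrase values are placeholders, and the resource path format is assumed:

from google.cloud import speech_v1

adaptation_client = speech_v1.AdaptationClient()

# Placeholders: substitute your own project ID, phrase set ID, and phrases.
parent = "projects/your-project-id/locations/global"

phrase_set = adaptation_client.create_phrase_set(
    request=speech_v1.CreatePhraseSetRequest(
        parent=parent,
        phrase_set_id="my-domain-terms",
        phrase_set=speech_v1.PhraseSet(
            phrases=[
                speech_v1.PhraseSet.Phrase(value="domain-specific term", boost=10.0),
            ],
        ),
    )
)
print(phrase_set.name)

The created phrase set can then be referenced from a recognition request's adaptation settings to bias recognition toward those phrases.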
Next-generation API features including batch recognition, recognizer management, and enhanced output formatting.

class SpeechClient:  # v2
    def batch_recognize(
        self,
        request: BatchRecognizeRequest,
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> Operation: ...

    def create_recognizer(
        self,
        request: CreateRecognizerRequest,
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> Operation: ...
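A hedged sketch of batch recognition with the v2 client; the project ID, recognizer path, and Cloud Storage URI are placeholders, and the result parsing at the end assumes inline output (the exact field layout may differ between library versions):

from google.cloud import speech_v2

client = speech_v2.SpeechClient()

# Placeholders: substitute your project ID and Cloud Storage URI.
recognizer = "projects/your-project-id/locations/global/recognizers/_"

request = speech_v2.BatchRecognizeRequest(
    recognizer=recognizer,
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        model="long",
    ),
    files=[speech_v2.BatchRecognizeFileMetadata(uri="gs://your-bucket/audio.wav")],
    recognition_output_config=speech_v2.RecognitionOutputConfig(
        inline_response_config=speech_v2.InlineOutputConfig(),
    ),
)

operation = client.batch_recognize(request=request)
response = operation.result(timeout=300)

# With inline output, results are keyed by the input file URI
# (assumes the inline_result.transcript layout used by recent v2 releases).
for uri, file_result in response.results.items():
    for result in file_result.inline_result.transcript.results:
        print(f"{uri}: {result.alternatives[0].transcript}")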
Asynchronous client interfaces for all API versions, enabling non-blocking speech recognition operations in async Python applications.

class SpeechAsyncClient:
    async def recognize(
        self,
        config: RecognitionConfig,
        audio: RecognitionAudio,
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> RecognizeResponse: ...

    async def long_running_recognize(
        self,
        config: RecognitionConfig,
        audio: RecognitionAudio,
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> Operation: ...

class AdaptationAsyncClient:
    async def create_phrase_set(
        self,
        request: CreatePhraseSetRequest,
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> PhraseSet: ...

Core data types, configuration objects, and enums for speech recognition setup and result processing.
class RecognitionConfig:
    encoding: AudioEncoding
    sample_rate_hertz: int
    language_code: str
    enable_automatic_punctuation: bool
    enable_speaker_diarization: bool
    diarization_config: SpeakerDiarizationConfig
    speech_contexts: Sequence[SpeechContext]

class RecognitionAudio:
    content: bytes
    uri: str
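A sketch of how these configuration types compose for a punctuated, speaker-diarized request; the phrase hints are placeholders:

from google.cloud import speech

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
    enable_automatic_punctuation=True,
    diarization_config=speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        max_speaker_count=4,
    ),
    # Phrase hints bias recognition toward expected vocabulary (placeholders here).
    speech_contexts=[speech.SpeechContext(phrases=["order number", "account ID"])],
)

When diarization is enabled, word-level entries in the final result carry speaker tags that can be used to group the transcript by speaker.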
Error handling for failed recognition requests:

from google.api_core import exceptions
from google.cloud import speech
client = speech.SpeechClient()
# config and audio are built as in the basic example above
try:
    response = client.recognize(config=config, audio=audio)
except exceptions.InvalidArgument as e:
    print(f"Invalid request: {e}")
except exceptions.DeadlineExceeded as e:
    print(f"Request timeout: {e}")

Long-running (asynchronous) recognition for longer audio files:

from google.cloud import speech
client = speech.SpeechClient()
# Start a long-running operation (config and audio as in the basic example)
operation = client.long_running_recognize(config=config, audio=audio)
# Wait for completion
response = operation.result(timeout=300)
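For audio stored in Cloud Storage, the same long-running call can reference a URI instead of inline bytes; a minimal sketch in which the bucket path, encoding, and sample rate are placeholders:

from google.cloud import speech

client = speech.SpeechClient()

# Placeholder Cloud Storage URI; FLAC at 44.1 kHz chosen only for illustration.
audio = speech.RecognitionAudio(uri="gs://your-bucket/long_audio.flac")
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    sample_rate_hertz=44100,
    language_code="en-US",
)

operation = client.long_running_recognize(config=config, audio=audio)
response = operation.result(timeout=600)
for result in response.results:
    print(result.alternatives[0].transcript)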
Asynchronous, non-blocking recognition with the async client:

import asyncio
from google.cloud import speech
async def async_speech_recognition():
    # Initialize the async client
    client = speech.SpeechAsyncClient()

    # Configure recognition
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )
    # audio_content holds the raw bytes of the audio file (see the basic example)
    audio = speech.RecognitionAudio(content=audio_content)

    # Perform async recognition
    response = await client.recognize(config=config, audio=audio)

    # Process results
    for result in response.results:
        print(f"Transcript: {result.alternatives[0].transcript}")

    # Close the client
    await client.transport.close()

# Run the async function
asyncio.run(async_speech_recognition())