CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-google-cloud-speech

Google Cloud Speech API client library for speech-to-text conversion with support for real-time streaming, batch processing, and advanced speech recognition models

Pending
Overview
Eval results
Files

docs/streaming-recognition.md

Streaming Recognition

Real-time bidirectional streaming speech recognition for live audio processing. Enables continuous recognition with immediate results as audio is streamed to the service.

Capabilities

Bidirectional Streaming

Performs real-time speech recognition on streaming audio with immediate partial and final results.

def streaming_recognize(
    self,
    requests: Iterator[StreamingRecognizeRequest],
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Iterator[StreamingRecognizeResponse]:
    """
    Performs bidirectional streaming speech recognition.

    The first request in `requests` must carry the streaming
    configuration (`streaming_config`); every subsequent request
    carries `audio_content` only.

    Parameters:
    - requests: Iterator of streaming recognition requests
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    Iterator of StreamingRecognizeResponse objects; interim (partial)
    and final results are interleaved as audio is processed

    Raises:
    google.api_core.exceptions.InvalidArgument: If the request is malformed
    google.api_core.exceptions.OutOfRange: If streaming limits are exceeded
    """

SpeechHelpers Streaming Interface

Simplified streaming interface provided by the SpeechHelpers mixin class that automatically handles request formatting and configuration injection.

class SpeechHelpers:
    """Mixin that gives SpeechClient a simplified streaming interface."""

    def streaming_recognize(
        self,
        config: StreamingRecognitionConfig,
        requests: Iterator[StreamingRecognizeRequest],
        *,
        retry: OptionalRetry = None,
        timeout: Optional[float] = None,
        metadata: Sequence[Tuple[str, str]] = ()
    ) -> Iterator[StreamingRecognizeResponse]:
        """
        Enhanced streaming recognition with automatic request formatting.
        
        This helper method automatically prepends the configuration to the
        request stream, simplifying the streaming workflow: callers pass
        the config separately and yield audio-only requests.

        Parameters:
        - config: Streaming recognition configuration (automatically sent first)
        - requests: Iterator of audio-only requests (no config needed)
        - retry: Retry configuration for failed requests
        - timeout: Request timeout in seconds
        - metadata: Additional metadata to send with the request

        Returns:
        Iterator of StreamingRecognizeResponse objects
        
        Note:
        This method is mixed into SpeechClient via multiple inheritance.
        Available in speech_v1 and speech_v1p1beta1.
        """

Usage Examples

Basic Streaming Recognition

from google.cloud import speech
import pyaudio

client = speech.SpeechClient()

# Audio recording parameters
RATE = 16000  # 16 kHz, matching sample_rate_hertz in the config below
CHUNK = RATE // 10  # 100ms chunks (integer frame count)

# Configure streaming recognition
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code="en-US",
    ),
    interim_results=True,  # Enable partial results
)

def generate_requests():
    """Yield audio-only StreamingRecognizeRequest chunks from the microphone.

    No configuration is included here: the helper-form
    streaming_recognize(config, requests) call below sends the config
    as the first request automatically.
    """
    # Initialize audio
    audio_interface = pyaudio.PyAudio()
    audio_stream = audio_interface.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
    )

    try:
        while True:
            data = audio_stream.read(CHUNK)
            yield speech.StreamingRecognizeRequest(audio_content=data)
    finally:
        # Always release the audio device, even if the consumer stops iterating.
        audio_stream.stop_stream()
        audio_stream.close()
        audio_interface.terminate()

# Perform streaming recognition (helper form: config is sent first automatically)
requests = generate_requests()
responses = client.streaming_recognize(config, requests)

# Process results
for response in responses:
    for result in response.results:
        if result.is_final:
            print(f"Final transcript: {result.alternatives[0].transcript}")
        else:
            print(f"Partial transcript: {result.alternatives[0].transcript}")

Advanced Streaming with Voice Activity Detection

from google.cloud import speech

client = speech.SpeechClient()

# Advanced streaming configuration.
# NOTE: enable_voice_activity_events is a streaming-level option on
# StreamingRecognitionConfig, not a RecognitionConfig field.
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        speech_contexts=[
            speech.SpeechContext(
                phrases=["technical", "keywords", "domain", "specific"]
            )
        ],
    ),
    interim_results=True,
    single_utterance=False,  # Continue listening after pauses
    enable_voice_activity_events=True,
)

def stream_recognition():
    """Handle streaming recognition with voice activity detection."""
    def request_generator():
        # First request carries only the configuration
        yield speech.StreamingRecognizeRequest(streaming_config=config)

        # Subsequent requests carry only audio data
        # (Implementation would include actual audio capture)

    requests = request_generator()
    responses = client.streaming_recognize(requests)

    for response in responses:
        # Handle speech event detection
        if response.speech_event_type:
            if response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN:
                print("Speech activity started")
            elif response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END:
                print("Speech activity ended")

        # Handle recognition results
        for result in response.results:
            if result.is_final:
                print(f"Final: {result.alternatives[0].transcript}")
            else:
                # stability is populated for interim (non-final) results
                print(f"Interim: {result.alternatives[0].transcript}")
                print(f"Stability: {result.stability}")

Request Types

StreamingRecognizeRequest

class StreamingRecognizeRequest:
    """Request for streaming speech recognition.

    The first request in a stream carries streaming_config; every
    subsequent request carries audio_content only.
    """
    streaming_config: StreamingRecognitionConfig  # First request only
    audio_content: bytes  # Audio data for subsequent requests

StreamingRecognitionConfig

class StreamingRecognitionConfig:
    """Configuration for streaming recognition.

    Wraps a full RecognitionConfig and adds streaming-only options.
    """
    config: RecognitionConfig  # Base (non-streaming) recognition settings
    single_utterance: bool  # Stop after first utterance
    interim_results: bool   # Return partial results
    enable_voice_activity_events: bool  # Detect speech activity

Response Types

StreamingRecognizeResponse

class StreamingRecognizeResponse:
    """Response from streaming speech recognition.

    A single stream yields many responses; a response may carry
    recognition results, a speech event, and/or an error.
    """
    error: Status  # Set when recognition failed for this stream
    results: Sequence[StreamingRecognitionResult]  # Interim and/or final results
    speech_event_type: SpeechEventType  # Voice-activity / utterance event marker
    speech_event_offset: Duration  # presumably the event's offset into the audio — confirm
    total_billed_time: Duration  # Cumulative billed audio duration
    speech_adaptation_info: SpeechAdaptationInfo  # Adaptation diagnostics (presumably — confirm)
    request_id: int  # Server-assigned identifier for the request

StreamingRecognitionResult

class StreamingRecognitionResult:
    """Individual recognition result in streaming response."""
    alternatives: Sequence[SpeechRecognitionAlternative]  # Candidate transcripts, best first (presumably)
    is_final: bool        # True for final results
    stability: float      # Stability score (0.0-1.0); NOTE(review): meaningful for interim results — confirm
    result_end_time: Duration  # End of this result relative to the start of the audio
    channel_tag: int      # Audio channel the result came from
    language_code: str    # Language of this result

Streaming Limitations and Best Practices

Time Limits

# Streaming session limits
MAX_STREAMING_DURATION = 305  # seconds (5 minutes + 5 seconds)
MAX_AUDIO_DURATION = 300      # seconds of audio content

# Restart streaming session before limits
import time

def long_running_stream():
    """Example of handling streaming session limits."""
    started_at = time.time()

    while True:
        elapsed = time.time() - started_at
        # Leave headroom before the ~5-minute session limit is reached.
        if elapsed > 280:
            print("Restarting streaming session...")
            break

        # Continue streaming...

Audio Quality Requirements

# Optimal audio settings for streaming
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,  # uncompressed 16-bit PCM
        sample_rate_hertz=16000,  # Recommended for best performance
        audio_channel_count=1,    # Mono audio
        language_code="en-US",
    ),
    interim_results=True,  # Stream partial hypotheses as audio arrives
)

Error Handling in Streaming

from google.api_core import exceptions

def robust_streaming():
    """Example of robust streaming with error handling.

    Retries when the streaming limit is exceeded (OutOfRange), gives up
    immediately on malformed requests (InvalidArgument), and stops once
    a stream completes.

    NOTE(review): `client`, `config`, and `requests` come from the
    surrounding application. If `requests` is a generator it is consumed
    by one attempt — build a fresh request iterator for every retry.
    """
    max_retries = 3
    retry_count = 0

    while retry_count < max_retries:
        try:
            responses = client.streaming_recognize(config, requests)

            for response in responses:
                if response.error.code != 0:
                    print(f"Recognition error: {response.error.message}")
                    break

                # Process results...

            # Stream finished (or aborted on a response-level error);
            # without this return the loop would retry a successful stream.
            return

        except exceptions.OutOfRange as e:
            print(f"Streaming limit exceeded: {e}")
            retry_count += 1

        except exceptions.InvalidArgument as e:
            print(f"Invalid request: {e}")
            break  # Don't retry on invalid arguments

Voice Activity Events

class SpeechEventType:
    """Types of speech events in streaming recognition.

    Surfaced on StreamingRecognizeResponse.speech_event_type.
    """
    SPEECH_EVENT_UNSPECIFIED = 0  # No event for this response
    END_OF_SINGLE_UTTERANCE = 1  # presumably fired when single_utterance=True and speech ends — confirm
    SPEECH_ACTIVITY_BEGIN = 2  # Speech detected (requires enable_voice_activity_events)
    SPEECH_ACTIVITY_END = 3  # Speech stopped (requires enable_voice_activity_events)

Voice Activity Detection Usage

# Enable voice activity events.
# NOTE: enable_voice_activity_events belongs on StreamingRecognitionConfig
# (streaming-level option), not on RecognitionConfig.
config = speech.StreamingRecognitionConfig(
    config=speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    ),
    enable_voice_activity_events=True,
    interim_results=True,
)

# Process voice activity events. `requests` is an iterator of audio-only
# StreamingRecognizeRequest objects; the helper form sends `config` first.
for response in client.streaming_recognize(config, requests):
    if response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN:
        print("User started speaking")
    elif response.speech_event_type == speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END:
        print("User stopped speaking")

Install with Tessl CLI

npx tessl i tessl/pypi-google-cloud-speech

docs

advanced-features.md

index.md

speech-adaptation.md

speech-recognition.md

streaming-recognition.md

types-and-configuration.md

tile.json