CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-google-cloud-speech

Google Cloud Speech API client library for speech-to-text conversion with support for real-time streaming, batch processing, and advanced speech recognition models

Pending
Overview
Eval results
Files

speech-recognition.mddocs/

Speech Recognition

Core speech-to-text functionality providing synchronous and asynchronous recognition modes for converting audio to text with high accuracy and extensive configuration options.

Capabilities

Synchronous Recognition

Performs immediate speech recognition on short audio files (typically under 1 minute). Ideal for real-time applications requiring immediate results.

def recognize(
    self,
    config: RecognitionConfig,
    audio: RecognitionAudio,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> RecognizeResponse:
    """
    Performs synchronous speech recognition.

    Parameters:
    - config: Configuration for the recognition request
    - audio: Audio data to be recognized
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    RecognizeResponse containing recognition results

    Raises:
    google.api_core.exceptions.InvalidArgument: If the request is malformed
    google.api_core.exceptions.DeadlineExceeded: If the request times out
    """

Usage Example

from google.cloud import speech

client = speech.SpeechClient()

# Load audio file
with open("short_audio.wav", "rb") as audio_file:
    content = audio_file.read()

# Configure recognition
audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",
    enable_automatic_punctuation=True,
    enable_word_time_offsets=True,
)

# Perform recognition
response = client.recognize(config=config, audio=audio)

# Process results
for result in response.results:
    alternative = result.alternatives[0]
    print(f"Transcript: {alternative.transcript}")
    print(f"Confidence: {alternative.confidence}")
    
    # Word-level information
    for word in alternative.words:
        print(f"Word: {word.word}, "
              f"Start: {word.start_time.total_seconds()}s, "
              f"End: {word.end_time.total_seconds()}s")

Asynchronous Recognition

Performs long-running speech recognition on longer audio files. Returns immediately with an operation object that can be polled for results.

def long_running_recognize(
    self,
    config: RecognitionConfig,
    audio: RecognitionAudio,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """
    Performs asynchronous speech recognition for longer audio files.

    Parameters:
    - config: Configuration for the recognition request
    - audio: Audio data to be recognized (can be Cloud Storage URI)
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    Operation object that can be polled for results

    Raises:
    google.api_core.exceptions.InvalidArgument: If the request is malformed
    """

Usage Example

from google.cloud import speech

client = speech.SpeechClient()

# Configure for Cloud Storage audio file
audio = speech.RecognitionAudio(
    uri="gs://your-bucket/long_audio.flac"
)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    sample_rate_hertz=44100,
    language_code="en-US",
    # Diarization is configured via diarization_config; the v1
    # RecognitionConfig has no top-level enable_speaker_diarization field.
    diarization_config=speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        max_speaker_count=10,
    ),
)

# Start long-running operation
operation = client.long_running_recognize(config=config, audio=audio)
print(f"Operation name: {operation.operation.name}")

# Wait for completion (with timeout)
response = operation.result(timeout=600)  # 10 minutes

# Process results with speaker information.
# With diarization enabled, the last result in the response contains the
# complete transcript's words annotated with speaker tags.
result = response.results[-1]
alternative = result.alternatives[0]
print(f"Transcript: {alternative.transcript}")

# Speaker diarization results
for word in alternative.words:
    print(f"Speaker {word.speaker_tag}: {word.word}")

Request Types

RecognizeRequest

class RecognizeRequest:
    """Request for synchronous speech recognition."""
    config: RecognitionConfig
    audio: RecognitionAudio

LongRunningRecognizeRequest

class LongRunningRecognizeRequest:
    """Request for asynchronous speech recognition."""
    config: RecognitionConfig
    audio: RecognitionAudio
    output_config: TranscriptOutputConfig  # Optional output configuration

Response Types

RecognizeResponse

class RecognizeResponse:
    """Response from synchronous speech recognition."""
    results: Sequence[SpeechRecognitionResult]
    total_billed_time: Duration
    speech_adaptation_info: SpeechAdaptationInfo
    request_id: int

LongRunningRecognizeResponse

class LongRunningRecognizeResponse:
    """Response from asynchronous speech recognition."""
    results: Sequence[SpeechRecognitionResult]
    total_billed_time: Duration
    speech_adaptation_info: SpeechAdaptationInfo
    request_id: int
    output_config: TranscriptOutputConfig
    output_error: Status

LongRunningRecognizeMetadata

class LongRunningRecognizeMetadata:
    """Metadata for long-running recognition operations."""
    progress_percent: int
    start_time: Timestamp
    last_update_time: Timestamp
    uri: str

Configuration Options

Audio Format Support

# Supported audio encodings
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    # Other options:
    # - FLAC
    # - MULAW
    # - AMR
    # - AMR_WB
    # - OGG_OPUS
    # - SPEEX_WITH_HEADER_BYTE
    # - MP3
    # - WEBM_OPUS
)

Language and Regional Support

# Language codes
config = speech.RecognitionConfig(
    language_code="en-US",  # Primary language
    alternative_language_codes=["en-GB", "es-ES"],  # Alternative languages
)

Audio Enhancement Features

config = speech.RecognitionConfig(
    # Automatic punctuation
    enable_automatic_punctuation=True,
    
    # Word timing information
    enable_word_time_offsets=True,
    
    # Confidence scores
    enable_word_confidence=True,
    
    # Speaker diarization
    enable_speaker_diarization=True,
    diarization_config=speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=True,
        min_speaker_count=2,
        max_speaker_count=6,
    ),
    
    # Profanity filter
    profanity_filter=True,
    
    # Speech contexts for better accuracy
    speech_contexts=[
        speech.SpeechContext(
            phrases=["custom", "terminology", "specific", "words"]
        )
    ],
)

Install with Tessl CLI

npx tessl i tessl/pypi-google-cloud-speech

docs

advanced-features.md

index.md

speech-adaptation.md

speech-recognition.md

streaming-recognition.md

types-and-configuration.md

tile.json