CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-google-cloud-speech

Google Cloud Speech API client library for speech-to-text conversion with support for real-time streaming, batch processing, and advanced speech recognition models

Pending
Overview
Eval results
Files

docs/advanced-features.md

Advanced Features (v2)

Next-generation Speech API (v2) features including batch recognition, recognizer management, enhanced output formatting, and advanced configuration options.

Version 2 API Import

from google.cloud import speech_v2

# Initialize v2 client
client = speech_v2.SpeechClient()

Capabilities

Batch Recognition

Process multiple audio files efficiently with batch recognition operations.

def batch_recognize(
    self,
    request: BatchRecognizeRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """
    Performs batch speech recognition on multiple audio files.

    Parameters:
    - request: Batch recognition request with files and configuration
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request

    Returns:
    Operation: Long-running operation for batch processing

    Raises:
    google.api_core.exceptions.InvalidArgument: If the request is malformed
    """

Batch Recognition Usage

from google.cloud import speech_v2

client = speech_v2.SpeechClient()

# Configure batch recognition
request = speech_v2.BatchRecognizeRequest(
    parent="projects/your-project-id/locations/global",
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        features=speech_v2.RecognitionFeatures(
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            enable_speaker_diarization=True,
        ),
    ),
    files=[
        speech_v2.BatchRecognizeFileMetadata(
            uri="gs://your-bucket/audio1.wav",
            output_config=speech_v2.RecognitionOutputConfig(
                gcs_output_config=speech_v2.GcsOutputConfig(
                    uri="gs://your-bucket/output/"
                ),
                output_format_config=speech_v2.OutputFormatConfig(
                    native=speech_v2.NativeOutputFileFormatConfig()
                ),
            ),
        ),
        speech_v2.BatchRecognizeFileMetadata(
            uri="gs://your-bucket/audio2.flac",
        ),
    ],
    recognition_output_config=speech_v2.RecognitionOutputConfig(
        inline_response_config=speech_v2.InlineOutputConfig(),
    ),
)

# Start batch operation
operation = client.batch_recognize(request=request)
print(f"Batch operation: {operation.operation.name}")

# Wait for completion
response = operation.result(timeout=1800)  # 30 minutes
print(f"Processed {len(response.results)} files")

Recognizer Management

Create, manage, and configure persistent recognizers for consistent speech recognition settings.

def create_recognizer(
    self,
    request: CreateRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """Create a custom recognizer with specific configuration."""

def get_recognizer(
    self,
    request: GetRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Recognizer:
    """Retrieve a recognizer by name."""

def list_recognizers(
    self,
    request: ListRecognizersRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> ListRecognizersResponse:
    """List recognizers in a project."""

def update_recognizer(
    self,
    request: UpdateRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """Update an existing recognizer."""

def delete_recognizer(
    self,
    request: DeleteRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """Delete a recognizer."""

def undelete_recognizer(
    self,
    request: UndeleteRecognizerRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Operation:
    """
    Undeletes a previously deleted recognizer.
    
    Parameters:
    - request: Request to undelete a recognizer
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request
    
    Returns:
    Operation: Long-running operation for undelete process
    """

Recognizer Usage

from google.cloud import speech_v2

client = speech_v2.SpeechClient()

# Create a custom recognizer
recognizer_request = speech_v2.CreateRecognizerRequest(
    parent="projects/your-project-id/locations/us-central1",
    recognizer_id="medical-transcription",
    recognizer=speech_v2.Recognizer(
        display_name="Medical Transcription Recognizer",
        model="medical_conversation",
        language_codes=["en-US"],
        default_recognition_config=speech_v2.RecognitionConfig(
            features=speech_v2.RecognitionFeatures(
                enable_automatic_punctuation=True,
                profanity_filter=True,
                enable_speaker_diarization=True,
                diarization_config=speech_v2.SpeakerDiarizationConfig(
                    min_speaker_count=2,
                    max_speaker_count=4,
                ),
            ),
        ),
    ),
)

operation = client.create_recognizer(request=recognizer_request)
recognizer = operation.result()

# Use the recognizer for recognition
recognize_request = speech_v2.RecognizeRequest(
    recognizer=recognizer.name,
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    ),
    content=audio_content,
)

response = client.recognize(request=recognize_request)

Enhanced Output Formatting

Generate output in various formats including VTT and SRT subtitles.

class OutputFormatConfig:
    """Configuration for output formatting."""
    native: NativeOutputFileFormatConfig
    vtt: VttOutputFileFormatConfig  
    srt: SrtOutputFileFormatConfig

class VttOutputFileFormatConfig:
    """Configuration for VTT subtitle format."""

class SrtOutputFileFormatConfig:
    """Configuration for SRT subtitle format."""

class NativeOutputFileFormatConfig:
    """Configuration for native JSON format."""

Subtitle Generation Usage

from google.cloud import speech_v2

client = speech_v2.SpeechClient()

# Configure for subtitle generation
request = speech_v2.RecognizeRequest(
    recognizer="projects/project/locations/global/recognizers/default",
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        features=speech_v2.RecognitionFeatures(
            enable_word_time_offsets=True,
            enable_automatic_punctuation=True,
        ),
    ),
    content=audio_content,
    output_config=speech_v2.RecognitionOutputConfig(
        output_format_config=speech_v2.OutputFormatConfig(
            # Generate VTT subtitles
            vtt=speech_v2.VttOutputFileFormatConfig()
        ),
        gcs_output_config=speech_v2.GcsOutputConfig(
            uri="gs://your-bucket/subtitles/"
        ),
    ),
)

response = client.recognize(request=request)

# Also generate SRT format
srt_request = speech_v2.RecognizeRequest(
    recognizer="projects/project/locations/global/recognizers/default",
    config=speech_v2.RecognitionConfig(
        auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        features=speech_v2.RecognitionFeatures(
            enable_word_time_offsets=True,
            enable_automatic_punctuation=True,
        ),
    ),
    content=audio_content,
    output_config=speech_v2.RecognitionOutputConfig(
        output_format_config=speech_v2.OutputFormatConfig(
            # Generate SRT subtitles
            srt=speech_v2.SrtOutputFileFormatConfig()
        ),
        gcs_output_config=speech_v2.GcsOutputConfig(
            uri="gs://your-bucket/subtitles/"
        ),
    ),
)

srt_response = client.recognize(request=srt_request)

Configuration Management

Manage project-level configuration settings for speech recognition services.

def get_config(
    self,
    request: GetConfigRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Config:
    """
    Retrieves the requested configuration.
    
    Parameters:
    - request: Request to get configuration
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request
    
    Returns:
    Config: The requested configuration object
    """

def update_config(
    self,
    request: UpdateConfigRequest,
    *,
    retry: OptionalRetry = None,
    timeout: Optional[float] = None,
    metadata: Sequence[Tuple[str, str]] = ()
) -> Config:
    """
    Updates the configuration settings.
    
    Parameters:
    - request: Request to update configuration with new settings
    - retry: Retry configuration for failed requests
    - timeout: Request timeout in seconds
    - metadata: Additional metadata to send with the request
    
    Returns:
    Config: The updated configuration object
    """

Configuration Management Usage

from google.cloud import speech_v2

client = speech_v2.SpeechClient()

# Get current configuration
get_request = speech_v2.GetConfigRequest(
    name="projects/your-project-id/locations/global/config"
)
config = client.get_config(request=get_request)
print(f"Current config: {config}")

# Update configuration
updated_config = speech_v2.Config(
    name="projects/your-project-id/locations/global/config",
    kms_key_name="projects/your-project-id/locations/us-central1/keyRings/ring/cryptoKeys/key",
    update_time=None,  # Will be set by service
)

update_request = speech_v2.UpdateConfigRequest(
    config=updated_config,
    update_mask={"paths": ["kms_key_name"]},  # Only update encryption key
)

updated_config = client.update_config(request=update_request)
print(f"Updated config: {updated_config}")

V2 Configuration Types

RecognitionConfig (v2)

class RecognitionConfig:
    """Enhanced recognition configuration for v2 API."""
    explicit_decoding_config: ExplicitDecodingConfig
    auto_decoding_config: AutoDetectDecodingConfig
    model: str
    language_codes: Sequence[str]
    translation_config: TranslationConfig
    features: RecognitionFeatures
    adaptation: SpeechAdaptation
    transcript_normalization: TranscriptNormalization

RecognitionFeatures

class RecognitionFeatures:
    """Feature flags for speech recognition."""
    enable_word_time_offsets: bool
    enable_word_confidence: bool
    enable_automatic_punctuation: bool
    enable_spoken_punctuation: bool
    enable_spoken_emojis: bool
    enable_speaker_diarization: bool
    diarization_config: SpeakerDiarizationConfig
    max_alternatives: int
    profanity_filter: bool

AutoDetectDecodingConfig

class AutoDetectDecodingConfig:
    """Automatic audio format detection."""
    # No configuration needed - automatically detects format

ExplicitDecodingConfig

class ExplicitDecodingConfig:
    """Explicit audio format specification."""
    encoding: AudioEncoding
    sample_rate_hertz: int
    audio_channel_count: int

Recognizer

class Recognizer:
    """Persistent recognizer configuration."""
    name: str
    uid: str
    display_name: str
    model: str
    language_codes: Sequence[str]
    default_recognition_config: RecognitionConfig
    annotations: Mapping[str, str]
    state: State
    create_time: Timestamp
    update_time: Timestamp
    delete_time: Timestamp
    expire_time: Timestamp
    etag: str
    reconciling: bool
    kms_key_name: str
    kms_key_version_name: str
    
    class State:
        """Recognizer lifecycle state."""
        STATE_UNSPECIFIED = 0
        ACTIVE = 2
        DELETE_REQUESTED = 3

V2 Request Types

BatchRecognizeRequest

class BatchRecognizeRequest:
    """Request for batch recognition."""
    parent: str
    config: RecognitionConfig
    config_mask: FieldMask
    files: Sequence[BatchRecognizeFileMetadata]
    recognition_output_config: RecognitionOutputConfig
    processing_strategy: ProcessingStrategy

BatchRecognizeFileMetadata

class BatchRecognizeFileMetadata:
    """Metadata for individual file in batch."""
    uri: str
    config: RecognitionConfig
    config_mask: FieldMask
    output_config: RecognitionOutputConfig

RecognitionOutputConfig

class RecognitionOutputConfig:
    """Configuration for recognition output."""
    gcs_output_config: GcsOutputConfig
    inline_response_config: InlineOutputConfig
    output_format_config: OutputFormatConfig

V2 Response Types

BatchRecognizeResponse

class BatchRecognizeResponse:
    """Response from batch recognition."""
    results: Mapping[str, BatchRecognizeFileResult]
    total_billed_duration: Duration

BatchRecognizeFileResult

class BatchRecognizeFileResult:
    """Result for individual file in batch."""
    uri: str
    error: Status
    metadata: BatchRecognizeTranscriptionMetadata
    transcript: BatchRecognizeResults

BatchRecognizeResults

class BatchRecognizeResults:
    """Transcription results from batch recognition."""
    results: Sequence[SpeechRecognitionResult]
    metadata: RecognitionResponseMetadata

Config

class Config:
    """Project-level configuration for Speech services."""
    name: str
    kms_key_name: str
    update_time: Timestamp

V2 Request Types (Configuration Management)

GetConfigRequest

class GetConfigRequest:
    """Request to retrieve configuration."""
    name: str  # Format: projects/{project}/locations/{location}/config

UpdateConfigRequest

class UpdateConfigRequest:
    """Request to update configuration."""
    config: Config
    update_mask: FieldMask

UndeleteRecognizerRequest

class UndeleteRecognizerRequest:
    """Request to undelete a recognizer."""
    name: str  # Format: projects/{project}/locations/{location}/recognizers/{recognizer}
    validate_only: bool
    etag: str

Advanced Configuration Examples

Multi-language Recognition

# Configure for automatic language detection
config = speech_v2.RecognitionConfig(
    auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    language_codes=["en-US", "es-ES", "fr-FR"],  # Multiple languages
    features=speech_v2.RecognitionFeatures(
        enable_automatic_punctuation=True,
        max_alternatives=3,  # Multiple transcription alternatives
    ),
)

Translation Integration

# Configure for speech-to-text with translation
config = speech_v2.RecognitionConfig(
    auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    language_codes=["es-ES"],  # Source language
    translation_config=speech_v2.TranslationConfig(
        target_language="en-US"  # Translate to English
    ),
    features=speech_v2.RecognitionFeatures(
        enable_automatic_punctuation=True,
    ),
)

Advanced Diarization

# Enhanced speaker diarization configuration
diarization_config = speech_v2.SpeakerDiarizationConfig(
    min_speaker_count=2,
    max_speaker_count=10,
    speaker_ids=["SPEAKER_1", "SPEAKER_2"],  # Predefined speaker IDs
)

config = speech_v2.RecognitionConfig(
    auto_decoding_config=speech_v2.AutoDetectDecodingConfig(),
    language_codes=["en-US"],
    features=speech_v2.RecognitionFeatures(
        enable_speaker_diarization=True,
        diarization_config=diarization_config,
        enable_word_time_offsets=True,
    ),
)

Migration from v1 to v2

Key Changes

# v1 approach
from google.cloud import speech

client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code="en-US",  # Single language
)

# v2 approach
from google.cloud import speech_v2

client = speech_v2.SpeechClient()
config = speech_v2.RecognitionConfig(
    explicit_decoding_config=speech_v2.ExplicitDecodingConfig(
        encoding=speech_v2.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
    ),
    language_codes=["en-US"],  # Multiple languages supported
    features=speech_v2.RecognitionFeatures(
        enable_automatic_punctuation=True,
    ),
)

Recognition Request Changes

# v1 request
response = client.recognize(config=config, audio=audio)

# v2 request
request = speech_v2.RecognizeRequest(
    recognizer="projects/project/locations/global/recognizers/default",
    config=config,
    content=audio_content,
)
response = client.recognize(request=request)

Install with Tessl CLI

npx tessl i tessl/pypi-google-cloud-speech

docs

advanced-features.md

index.md

speech-adaptation.md

speech-recognition.md

streaming-recognition.md

types-and-configuration.md

tile.json