Speech Models

Interface for text-to-speech models that convert text into spoken audio with support for voice selection, output formats, and audio customization options.

Capabilities

SpeechModelV2 Type

Core type definition for text-to-speech model implementations.

/**
 * Core text-to-speech model type
 */
type SpeechModelV2 = {
  /** API specification version */
  specificationVersion: 'v2';
  /** Provider identifier (e.g., 'openai', 'elevenlabs') */
  provider: string;
  /** Model identifier (e.g., 'tts-1', 'eleven_multilingual_v2') */
  modelId: string;
  
  /** Generate speech audio from text */
  doGenerate(options: SpeechModelV2CallOptions): PromiseLike<SpeechModelV2Result>;
};
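A provider implements this type by supplying the identifying fields plus a doGenerate method. The sketch below is hypothetical: the provider id, model id, and stubbed audio are placeholders, and a real implementation would call the provider's API inside doGenerate.

import type { SpeechModelV2 } from '@ai-sdk/provider';

// Minimal hypothetical implementation of the SpeechModelV2 type.
const mySpeechModel: SpeechModelV2 = {
  specificationVersion: 'v2',
  provider: 'example-provider',   // placeholder provider id
  modelId: 'example-tts-model',   // placeholder model id

  async doGenerate(options) {
    // A real provider would send options.text (and voice, outputFormat, speed, ...)
    // to its API here and return the audio it receives.
    return {
      audio: new Uint8Array(),    // placeholder audio bytes
      warnings: [],
      response: {
        timestamp: new Date(),
        modelId: 'example-tts-model',
      },
    };
  },
};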

Call Options

Configuration options for speech generation calls.

/**
 * Configuration options for speech generation
 */
interface SpeechModelV2CallOptions {
  /** Text to convert to speech */
  text: string;
  /** Voice identifier or name */
  voice?: string;
  /** Audio output format (e.g., 'mp3', 'wav', 'opus') */
  outputFormat?: string;
  /** Additional instructions for speech generation */
  instructions?: string;
  /** Speech speed multiplier (e.g., 0.5 for half speed, 2.0 for double speed) */
  speed?: number;
  /** Language code for the text (e.g., 'en-US', 'es-ES') */
  language?: string;
  /** Provider-specific options */
  providerOptions?: SpeechModelV2ProviderOptions;
  /** Abort signal for cancellation */
  abortSignal?: AbortSignal;
  /** Custom HTTP headers */
  headers?: Record<string, string | undefined>;
}

/**
 * Provider-specific options for speech generation
 */
type SpeechModelV2ProviderOptions = Record<string, Record<string, JSONValue>>;
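Provider options are keyed first by provider name, so options intended for one provider can be ignored by any other. The option names below are hypothetical, for illustration only.

// Outer key: provider name; inner record: that provider's options as JSON values.
const providerOptions: SpeechModelV2ProviderOptions = {
  'example-provider': {
    sampleRate: 24000,   // hypothetical option
    emotion: 'neutral',  // hypothetical option
  },
};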

Generation Results

Response structure containing generated audio and metadata.

/**
 * Result from speech generation
 */
interface SpeechModelV2Result {
  /** Generated audio data as string or bytes */
  audio: string | Uint8Array;
  /** Warnings from the generation */
  warnings: SpeechModelV2CallWarning[];
  /** Request details */
  request?: { body?: unknown };
  /** Response details (required) */
  response: {
    timestamp: Date;
    modelId: string;
    headers?: SharedV2Headers;
    body?: unknown;
  };
  /** Provider-specific metadata */
  providerMetadata?: Record<string, Record<string, JSONValue>>;
}
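Because audio can arrive either as a string or as raw bytes, callers typically normalize it before saving or playing it. A sketch, assuming Node.js and that string audio is base64-encoded (the exact encoding is provider-defined):

import type { SpeechModelV2 } from '@ai-sdk/provider';

// Sketch: normalize the audio union and inspect response metadata.
async function inspectSpeechResult(model: SpeechModelV2): Promise<Uint8Array> {
  const result = await model.doGenerate({ text: 'Result handling example.' });

  // String audio is assumed to be base64-encoded here; check your provider's docs.
  const audioBytes =
    typeof result.audio === 'string'
      ? Buffer.from(result.audio, 'base64')
      : result.audio;

  console.log(result.response.modelId, result.response.timestamp.toISOString());
  console.log(`received ${audioBytes.byteLength} bytes, ${result.warnings.length} warning(s)`);
  return audioBytes;
}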

Warning Types

Warnings that can be returned from speech generation calls.

/**
 * Warning types for speech generation calls
 */
type SpeechModelV2CallWarning =
  | { type: 'unsupported-setting'; setting: keyof SpeechModelV2CallOptions; details?: string }
  | { type: 'other'; message: string };
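On the provider side, warnings let a call succeed while flagging options the implementation cannot honor. A hypothetical sketch of code that would run inside a provider's doGenerate, where options is the incoming SpeechModelV2CallOptions:

// Collect warnings for settings this (hypothetical) provider ignores.
const warnings: SpeechModelV2CallWarning[] = [];

if (options.speed !== undefined) {
  warnings.push({
    type: 'unsupported-setting',
    setting: 'speed',
    details: 'Speed control is not supported; audio is generated at the default rate.',
  });
}

// ...generate the audio, then return it together with `warnings` in the result.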

Usage Examples

import { SpeechModelV2 } from '@ai-sdk/provider';

// Basic text-to-speech
// (assumes `provider` is a provider instance that exposes a speechModel() factory; see provider.md)
const model: SpeechModelV2 = provider.speechModel('tts-1');

const result = await model.doGenerate({
  text: 'Hello, welcome to our AI-powered application. How can I help you today?',
  voice: 'alloy',
  outputFormat: 'mp3'
});

// Save audio to file (Node.js)
import fs from 'fs';

// result.audio may be a string (typically base64-encoded) or raw bytes; normalize to bytes first
const audioBytes =
  typeof result.audio === 'string'
    ? Buffer.from(result.audio, 'base64')
    : result.audio;
fs.writeFileSync('output.mp3', audioBytes);

// Play audio in browser
const audioBlob = new Blob([audioBytes], { type: 'audio/mpeg' });
const audioUrl = URL.createObjectURL(audioBlob);
const audio = new Audio(audioUrl);
audio.play();

// Advanced speech generation with customization
const customSpeech = await model.doGenerate({
  text: 'This is a demonstration of advanced speech synthesis capabilities.',
  voice: 'nova',
  speed: 0.9,
  language: 'en-US',
  instructions: 'Speak in a professional, calm tone with clear pronunciation',
  providerOptions: {
    openai: {
      model: 'tts-1-hd'
    }
  }
});

// Multi-language speech
const spanishSpeech = await model.doGenerate({
  text: 'Hola, bienvenido a nuestra aplicación. ¿Cómo puedo ayudarte hoy?',
  voice: 'alloy',
  language: 'es-ES',
  outputFormat: 'wav'
});

// Handle different output formats
const formats = ['mp3', 'wav', 'opus'];
const audioFiles = await Promise.all(
  formats.map(async format => {
    const result = await model.doGenerate({
      text: 'This is a test of different audio formats.',
      outputFormat: format
    });
    return { format, audio: result.audio };
  })
);

// Check for warnings and handle unsupported features
const warningCheck = await model.doGenerate({
  text: 'Testing speech generation',
  voice: 'custom-voice',
  speed: 3.0, // May be unsupported
  outputFormat: 'flac' // May be unsupported
});

if (warningCheck.warnings.length > 0) {
  warningCheck.warnings.forEach(warning => {
    if (warning.type === 'unsupported-setting') {
      console.warn(`Setting '${warning.setting}' is not supported by this provider`);
    }
  });
}

// Long-form speech with provider-specific latency tuning
// (SpeechModelV2 only defines doGenerate; streaming support, if any, is provider-specific)
const longText = `
  This is a longer piece of text that we want to convert to speech.
  It contains multiple sentences and might benefit from streaming generation
  to reduce latency for the user experience.
`;

const speechResult = await model.doGenerate({
  text: longText,
  voice: 'echo',
  providerOptions: {
    elevenlabs: {
      stability: 0.75,
      similarity_boost: 0.85,
      optimize_streaming_latency: 3
    }
  }
});

// Voice cloning (provider-specific)
const clonedVoice = await model.doGenerate({
  text: 'This text will be spoken in a cloned voice.',
  providerOptions: {
    elevenlabs: {
      voice_id: 'custom-voice-id-here',
      voice_settings: {
        stability: 0.71,
        similarity_boost: 0.5,
        style: 0.0,
        use_speaker_boost: true
      }
    }
  }
});