Transcription Models

Interface for speech-to-text models that convert audio into text transcriptions with support for timestamps, language detection, and detailed segment information.

Capabilities

TranscriptionModelV2 Type

Core type definition for speech-to-text model implementations.

/**
 * Core speech-to-text model type
 */
type TranscriptionModelV2 = {
  /** API specification version */
  specificationVersion: 'v2';
  /** Provider identifier (e.g., 'openai', 'assemblyai') */
  provider: string;
  /** Model identifier (e.g., 'whisper-1', 'best') */
  modelId: string;
  
  /** Transcribe audio to text */
  doGenerate(options: TranscriptionModelV2CallOptions): PromiseLike<TranscriptionModelV2Result>;
};
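
A custom backend can implement this type directly. The following is a minimal sketch, assuming a hypothetical callMyTranscriptionApi helper (not part of @ai-sdk/provider) that returns text and timed segments:

import { TranscriptionModelV2 } from "@ai-sdk/provider";

// Hypothetical backend call; replace with your own API client.
declare function callMyTranscriptionApi(
  audio: Uint8Array | string,
  mediaType: string
): Promise<{
  text: string;
  segments: Array<{ text: string; startSecond: number; endSecond: number }>;
  language: string | undefined;
  durationInSeconds: number | undefined;
}>;

const myTranscriptionModel: TranscriptionModelV2 = {
  specificationVersion: 'v2',
  provider: 'my-provider',
  modelId: 'my-transcriber-1',

  // doGenerate maps the backend response onto the result shape described below
  async doGenerate(options) {
    const apiResult = await callMyTranscriptionApi(options.audio, options.mediaType);
    return {
      text: apiResult.text,
      segments: apiResult.segments,
      language: apiResult.language,
      durationInSeconds: apiResult.durationInSeconds,
      warnings: [],
      response: {
        timestamp: new Date(),
        modelId: 'my-transcriber-1',
      },
    };
  },
};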

Call Options

Configuration options for transcription calls.

/**
 * Configuration options for transcription
 */
interface TranscriptionModelV2CallOptions {
  /** Audio data to transcribe */
  audio: Uint8Array | string;
  /** Media type of the audio (e.g., 'audio/mpeg', 'audio/wav') */
  mediaType: string;
  /** Provider-specific options */
  providerOptions?: TranscriptionModelV2ProviderOptions;
  /** Abort signal for cancellation */
  abortSignal?: AbortSignal;
  /** Custom HTTP headers */
  headers?: Record<string, string | undefined>;
}

/**
 * Provider-specific options for transcription
 */
type TranscriptionModelV2ProviderOptions = Record<string, Record<string, JSONValue>>;
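
For illustration, call options can be assembled as a typed object before invoking doGenerate. This is a sketch; the openai key and its temperature field stand in for whatever provider-specific options your provider documents:

import { TranscriptionModelV2CallOptions } from "@ai-sdk/provider";
import fs from 'fs';

const controller = new AbortController();

const callOptions: TranscriptionModelV2CallOptions = {
  audio: fs.readFileSync('recording.wav'),    // audio bytes as a Uint8Array
  mediaType: 'audio/wav',
  providerOptions: {
    openai: { temperature: 0 },               // example provider-specific option
  },
  abortSignal: controller.signal,             // lets callers cancel the request
  headers: { 'X-Request-Id': 'example-123' }  // extra HTTP headers for the call
};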

Transcription Results

Response structure containing transcribed text and detailed metadata.

/**
 * Result from transcription
 */
interface TranscriptionModelV2Result {
  /** Complete transcribed text */
  text: string;
  /** Detailed segments with timestamps */
  segments: Array<TranscriptionModelV2Segment>;
  /** Detected or specified language code */
  language: string | undefined;
  /** Total audio duration in seconds */
  durationInSeconds: number | undefined;
  /** Warnings from the transcription */
  warnings: TranscriptionModelV2CallWarning[];
  /** Request details */
  request?: { body?: string };
  /** Response details (required) */
  response: {
    timestamp: Date;
    modelId: string;
    headers?: SharedV2Headers;
    body?: unknown;
  };
  /** Provider-specific metadata */
  providerMetadata?: Record<string, Record<string, JSONValue>>;
}

/**
 * Individual transcription segment with timing information
 */
interface TranscriptionModelV2Segment {
  /** Transcribed text for this segment */
  text: string;
  /** Start time in seconds */
  startSecond: number;
  /** End time in seconds */
  endSecond: number;
}
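
As a quick sketch of consuming this shape, the hypothetical helper below summarizes a result using only the fields defined above:

// Hypothetical helper; the parameter mirrors the result fields it uses.
function summarizeTranscription(result: {
  text: string;
  segments: Array<{ text: string; startSecond: number; endSecond: number }>;
  language: string | undefined;
  durationInSeconds: number | undefined;
}): string {
  // Fall back to the last segment's end time when no duration is reported
  const duration = result.durationInSeconds ?? result.segments.at(-1)?.endSecond ?? 0;
  return `${result.segments.length} segments, ~${Math.round(duration)}s, language: ${result.language ?? 'unknown'}`;
}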

Warning Types

Warnings that can be returned from transcription calls.

/**
 * Warning types for transcription calls
 */
type TranscriptionModelV2CallWarning =
  | { type: 'unsupported-setting'; setting: keyof TranscriptionModelV2CallOptions; details?: string }
  | { type: 'other'; message: string };
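
A small sketch of handling both warning variants, switching exhaustively on the type field:

import { TranscriptionModelV2CallWarning } from "@ai-sdk/provider";

// Logs each warning variant returned by a transcription call.
function logTranscriptionWarning(warning: TranscriptionModelV2CallWarning): void {
  switch (warning.type) {
    case 'unsupported-setting':
      console.warn(`Unsupported setting "${String(warning.setting)}"${warning.details ? `: ${warning.details}` : ''}`);
      break;
    case 'other':
      console.warn(`Transcription warning: ${warning.message}`);
      break;
  }
}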

Usage Examples:

import { TranscriptionModelV2 } from "@ai-sdk/provider";
import fs from 'fs';

// Basic audio transcription
// `provider` is assumed to be a ProviderV2 instance from a provider package
const model: TranscriptionModelV2 = provider.transcriptionModel('whisper-1');

// Transcribe from file
const audioData = fs.readFileSync('recording.mp3');
const result = await model.doGenerate({
  audio: audioData,
  mediaType: 'audio/mpeg'
});

console.log('Transcription:', result.text);
console.log('Language:', result.language);
console.log('Duration:', result.durationInSeconds, 'seconds');

// Access detailed segments with timestamps
if (result.segments.length > 0) {
  result.segments.forEach((segment, index) => {
    console.log(`Segment ${index + 1} (${segment.startSecond}s - ${segment.endSecond}s): ${segment.text}`);
  });
}

// Transcribe from a base64-encoded string
const base64Audio = 'UklGRiQAAABXQVZFZm10IBA...'; // Base64-encoded WAV audio (truncated)
const base64Result = await model.doGenerate({
  audio: base64Audio,
  mediaType: 'audio/wav'
});

// Advanced transcription with provider-specific options
const advancedResult = await model.doGenerate({
  audio: audioData,
  mediaType: 'audio/mpeg',
  providerOptions: {
    openai: {
      language: 'en', // Specify source language
      prompt: 'This is a technical discussion about AI and machine learning.',
      temperature: 0.0, // More deterministic output
      response_format: 'verbose_json' // Get detailed response with timestamps
    }
  }
});

// Multi-language transcription
const multilingualAudio = fs.readFileSync('multilingual-recording.mp3');
const multilingualResult = await model.doGenerate({
  audio: multilingualAudio,
  mediaType: 'audio/mpeg',
  providerOptions: {
    openai: {
      language: 'auto', // Auto-detect language
    },
    assemblyai: {
      language_detection: true,
      punctuate: true,
      format_text: true
    }
  }
});

// Handle different audio formats
const formats = [
  { file: 'audio.mp3', type: 'audio/mpeg' },
  { file: 'audio.wav', type: 'audio/wav' },
  { file: 'audio.m4a', type: 'audio/mp4' },
  { file: 'audio.webm', type: 'audio/webm' }
];

const transcriptions = await Promise.all(
  formats.map(async ({ file, type }) => {
    const data = fs.readFileSync(file);
    const result = await model.doGenerate({
      audio: data,
      mediaType: type
    });
    return { file, transcription: result.text };
  })
);

// "Real-time"-style transcription using provider-specific streaming options (if the provider supports them).
// Note: doGenerate expects buffered audio; `liveAudioStream` is assumed to be a
// Uint8Array of captured audio, not a live stream object.
const streamingResult = await model.doGenerate({
  audio: liveAudioStream,
  mediaType: 'audio/wav',
  providerOptions: {
    assemblyai: {
      real_time: true,
      sample_rate: 16000,
      word_boost: ['AI', 'machine learning', 'neural network'],
      boost_param: 'high'
    }
  }
});

// Transcription with speaker diarization
const meetingAudio = fs.readFileSync('meeting-recording.mp3');
const meetingResult = await model.doGenerate({
  audio: meetingAudio,
  mediaType: 'audio/mpeg',
  providerOptions: {
    assemblyai: {
      speaker_labels: true,
      speakers_expected: 3
    }
  }
});

// Inspect provider-reported quality metrics. The top-level keys of
// providerMetadata are provider names; the nested shape is provider-specific,
// so the confidence/words fields below are illustrative.
const openaiMetadata = result.providerMetadata?.openai;
if (openaiMetadata) {
  if (openaiMetadata.confidence !== undefined) {
    console.log('Transcription confidence:', openaiMetadata.confidence);
  }
  const words = openaiMetadata.words;
  if (Array.isArray(words)) {
    // Word-level confidence scores, if the provider returns them
    words.forEach((word: any) => {
      console.log(`"${word.text}" - confidence: ${word.confidence}`);
    });
  }
}

// Handle warnings
if (result.warnings.length > 0) {
  result.warnings.forEach(warning => {
    if (warning.type === 'unsupported-setting') {
      console.warn(`Setting not supported: ${warning.setting}`);
    } else {
      console.warn('Transcription warning:', warning.message);
    }
  });
}

// Convert segments to subtitle format (SRT)
function toSRT(segments: TranscriptionModelV2Segment[]): string {
  return segments.map((segment, index) => {
    const startTime = formatTime(segment.startSecond);
    const endTime = formatTime(segment.endSecond);
    return `${index + 1}\n${startTime} --> ${endTime}\n${segment.text}\n`;
  }).join('\n');
}

function formatTime(seconds: number): string {
  const hours = Math.floor(seconds / 3600);
  const minutes = Math.floor((seconds % 3600) / 60);
  const secs = Math.floor(seconds % 60);
  const ms = Math.floor((seconds % 1) * 1000);
  return `${hours.toString().padStart(2, '0')}:${minutes.toString().padStart(2, '0')}:${secs.toString().padStart(2, '0')},${ms.toString().padStart(3, '0')}`;
}

if (result.segments) {
  const srtContent = toSRT(result.segments);
  fs.writeFileSync('transcription.srt', srtContent);
}