Interface for speech-to-text models that convert audio into text transcriptions with support for timestamps, language detection, and detailed segment information.
Core type definition for speech-to-text model implementations.
/**
 * Core speech-to-text model type
 */
type TranscriptionModelV2 = {
  /** API specification version */
  specificationVersion: 'v2';
  /** Provider identifier (e.g., 'openai', 'assemblyai') */
  provider: string;
  /** Model identifier (e.g., 'whisper-1', 'best') */
  modelId: string;
  /** Transcribe audio to text */
  doGenerate(options: TranscriptionModelV2CallOptions): PromiseLike<TranscriptionModelV2Result>;
};
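As a rough illustration of the provider side, here is a minimal sketch of an object satisfying this type; the endpoint URL, request shape, and response fields are hypothetical placeholders rather than any real provider's API.
// Hypothetical custom implementation of TranscriptionModelV2.
const myTranscriptionModel: TranscriptionModelV2 = {
  specificationVersion: 'v2',
  provider: 'my-provider',
  modelId: 'my-transcription-model',
  async doGenerate(options) {
    // Placeholder HTTP call; a real implementation would also handle
    // base64 string input, errors, and warnings from the service.
    const response = await fetch('https://api.example.com/transcribe', {
      method: 'POST',
      headers: { 'content-type': options.mediaType },
      body: options.audio,
      signal: options.abortSignal
    });
    const data: any = await response.json(); // assumed response shape
    return {
      text: data.text,
      segments: (data.segments ?? []).map((s: any) => ({
        text: s.text,
        startSecond: s.start,
        endSecond: s.end
      })),
      language: data.language,
      durationInSeconds: data.duration,
      warnings: [],
      response: { timestamp: new Date(), modelId: 'my-transcription-model' }
    };
  }
};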
Configuration options for transcription calls.
/**
 * Configuration options for transcription
 */
interface TranscriptionModelV2CallOptions {
  /** Audio data to transcribe */
  audio: Uint8Array | string;
  /** Media type of the audio (e.g., 'audio/mpeg', 'audio/wav') */
  mediaType: string;
  /** Provider-specific options */
  providerOptions?: TranscriptionModelV2ProviderOptions;
  /** Abort signal for cancellation */
  abortSignal?: AbortSignal;
  /** Custom HTTP headers */
  headers?: Record<string, string | undefined>;
}
/**
 * Provider-specific options for transcription
 */
type TranscriptionModelV2ProviderOptions = Record<string, Record<string, JSONValue>>;
Response structure containing transcribed text and detailed metadata.
/**
 * Result from transcription
 */
interface TranscriptionModelV2Result {
  /** Complete transcribed text */
  text: string;
  /** Detailed segments with timestamps */
  segments: Array<TranscriptionModelV2Segment>;
  /** Detected or specified language code */
  language: string | undefined;
  /** Total audio duration in seconds */
  durationInSeconds: number | undefined;
  /** Warnings from the transcription */
  warnings: TranscriptionModelV2CallWarning[];
  /** Request details */
  request?: { body?: string };
  /** Response details (required) */
  response: {
    timestamp: Date;
    modelId: string;
    headers?: SharedV2Headers;
    body?: unknown;
  };
  /** Provider-specific metadata */
  providerMetadata?: Record<string, Record<string, JSONValue>>;
}
/**
 * Individual transcription segment with timing information
 */
interface TranscriptionModelV2Segment {
  /** Transcribed text for this segment */
  text: string;
  /** Start time in seconds */
  startSecond: number;
  /** End time in seconds */
  endSecond: number;
}
Warnings that can be returned from transcription calls.
/**
 * Warning types for transcription calls
 */
type TranscriptionModelV2CallWarning =
  | { type: 'unsupported-setting'; setting: keyof TranscriptionModelV2CallOptions; details?: string }
  | { type: 'other'; message: string };
Usage Examples:
import { TranscriptionModelV2, TranscriptionModelV2Segment } from '@ai-sdk/provider';
import fs from 'fs';
// Basic audio transcription
// (`provider` is assumed to be a provider instance that exposes transcription models)
const model: TranscriptionModelV2 = provider.transcriptionModel('whisper-1');
// Transcribe from file
const audioData = fs.readFileSync('recording.mp3');
const result = await model.doGenerate({
  audio: audioData,
  mediaType: 'audio/mpeg'
});
console.log('Transcription:', result.text);
console.log('Language:', result.language);
console.log('Duration:', result.durationInSeconds, 'seconds');
// Access detailed segments with timestamps
if (result.segments.length > 0) {
  result.segments.forEach((segment, index) => {
    console.log(`Segment ${index + 1} (${segment.startSecond}s - ${segment.endSecond}s): ${segment.text}`);
  });
}
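// Segment timestamps can also drive lookups, e.g. finding the segment that covers
// a given playback position (the `findSegmentAt` helper below is illustrative, not part of the SDK)
function findSegmentAt(
  segments: TranscriptionModelV2Segment[],
  second: number
): TranscriptionModelV2Segment | undefined {
  return segments.find(s => second >= s.startSecond && second < s.endSecond);
}
const current = findSegmentAt(result.segments, 12.5);
if (current) {
  console.log('Currently playing:', current.text);
}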
// Transcribe from base64 string
const base64Audio = 'UklGRiQAAABXQVZFZm10IBA...'; // base64-encoded audio data (no data: URL prefix)
const base64Result = await model.doGenerate({
  audio: base64Audio,
  mediaType: 'audio/wav'
});
// Advanced transcription with provider-specific options
const advancedResult = await model.doGenerate({
  audio: audioData,
  mediaType: 'audio/mpeg',
  providerOptions: {
    openai: {
      language: 'en', // Specify source language
      prompt: 'This is a technical discussion about AI and machine learning.',
      temperature: 0.0, // More deterministic output
      response_format: 'verbose_json' // Get detailed response with timestamps
    }
  }
});
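// The call options also accept an abort signal and custom headers (not used above);
// a sketch with a 30-second timeout (the header name is just a placeholder)
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), 30_000);
try {
  const timedResult = await model.doGenerate({
    audio: audioData,
    mediaType: 'audio/mpeg',
    abortSignal: controller.signal,
    headers: { 'X-Request-Source': 'docs-example' }
  });
  console.log(timedResult.text);
} finally {
  clearTimeout(timeout);
}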
// Multi-language transcription
const multilingualAudio = fs.readFileSync('multilingual-recording.mp3');
const multilingualResult = await model.doGenerate({
  audio: multilingualAudio,
  mediaType: 'audio/mpeg',
  // Options are namespaced per provider; a model only reads its own namespace.
  providerOptions: {
    openai: {
      // OpenAI models auto-detect the language when `language` is omitted.
    },
    assemblyai: {
      language_detection: true,
      punctuate: true,
      format_text: true
    }
  }
});
// Handle different audio formats
const formats = [
  { file: 'audio.mp3', type: 'audio/mpeg' },
  { file: 'audio.wav', type: 'audio/wav' },
  { file: 'audio.m4a', type: 'audio/mp4' },
  { file: 'audio.webm', type: 'audio/webm' }
];
const transcriptions = await Promise.all(
  formats.map(async ({ file, type }) => {
    const data = fs.readFileSync(file);
    const result = await model.doGenerate({
      audio: data,
      mediaType: type
    });
    return { file, transcription: result.text };
  })
);
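// If the media type must be derived from a file name, a small lookup helper works
// (this extension map is a sample, not exhaustive, and not part of the SDK)
const mediaTypesByExtension: Record<string, string> = {
  mp3: 'audio/mpeg',
  wav: 'audio/wav',
  m4a: 'audio/mp4',
  webm: 'audio/webm'
};
function mediaTypeForFile(file: string): string | undefined {
  const extension = file.split('.').pop()?.toLowerCase();
  return extension ? mediaTypesByExtension[extension] : undefined;
}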
// Real-time-oriented provider options (note: doGenerate itself is not streaming;
// audio must still be passed as a complete Uint8Array or base64 string)
const capturedAudio = fs.readFileSync('captured-audio.wav'); // previously captured audio
const streamingResult = await model.doGenerate({
  audio: capturedAudio,
  mediaType: 'audio/wav',
  providerOptions: {
    assemblyai: {
      real_time: true,
      sample_rate: 16000,
      word_boost: ['AI', 'machine learning', 'neural network'],
      boost_param: 'high'
    }
  }
});
// Transcription with speaker diarization
const meetingAudio = fs.readFileSync('meeting-recording.mp3');
const meetingResult = await model.doGenerate({
  audio: meetingAudio,
  mediaType: 'audio/mpeg',
  providerOptions: {
    assemblyai: {
      speaker_labels: true,
      speakers_expected: 3
    }
  }
});
// Check provider-reported quality metrics. providerMetadata is keyed by provider,
// and the available fields (e.g. `confidence`, `words`) depend on that provider.
const metadata = result.providerMetadata?.[model.provider];
if (metadata) {
  if (metadata.confidence !== undefined) {
    console.log('Transcription confidence:', metadata.confidence);
  }
  if (Array.isArray(metadata.words)) {
    // Word-level confidence scores (shape is provider-specific)
    (metadata.words as any[]).forEach(word => {
      console.log(`"${word.text}" - confidence: ${word.confidence}`);
    });
  }
}
// Handle warnings
if (result.warnings.length > 0) {
  result.warnings.forEach(warning => {
    if (warning.type === 'unsupported-setting') {
      console.warn(`Setting not supported: ${warning.setting}`);
    } else {
      console.warn('Transcription warning:', warning.message);
    }
  });
}
// Convert segments to subtitle format (SRT)
function toSRT(segments: TranscriptionModelV2Segment[]): string {
  return segments.map((segment, index) => {
    const startTime = formatTime(segment.startSecond);
    const endTime = formatTime(segment.endSecond);
    return `${index + 1}\n${startTime} --> ${endTime}\n${segment.text}\n`;
  }).join('\n');
}
function formatTime(seconds: number): string {
  const hours = Math.floor(seconds / 3600);
  const minutes = Math.floor((seconds % 3600) / 60);
  const secs = Math.floor(seconds % 60);
  const ms = Math.floor((seconds % 1) * 1000);
  return `${hours.toString().padStart(2, '0')}:${minutes.toString().padStart(2, '0')}:${secs.toString().padStart(2, '0')},${ms.toString().padStart(3, '0')}`;
}
if (result.segments.length > 0) {
  const srtContent = toSRT(result.segments);
  fs.writeFileSync('transcription.srt', srtContent);
}