Audio transcription functionality for converting speech to text using Azure OpenAI's transcription models. Supports various audio formats and provides accurate speech-to-text conversion.
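The examples below use the default `azure` provider instance, which by default reads the resource name and API key from the `AZURE_RESOURCE_NAME` and `AZURE_API_KEY` environment variables. For explicit configuration, a provider instance can be created with `createAzure`; a minimal sketch (the resource name here is a placeholder):

```typescript
import { createAzure } from "@ai-sdk/azure";

// "my-resource" is a placeholder; use your own Azure OpenAI resource name.
const azure = createAzure({
  resourceName: "my-resource",
  apiKey: process.env.AZURE_API_KEY, // or supply the key directly
});

// Models created from this instance behave like those from the default provider.
const transcriptionModel = azure.transcription("whisper-1");
```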
Creates Azure OpenAI transcription models for converting audio files to text.
```typescript
/**
 * Creates an Azure OpenAI model for audio transcription.
 * @param deploymentId - Azure OpenAI transcription deployment name (e.g., "whisper-1")
 * @returns Transcription model instance for converting audio to text
 */
transcription(deploymentId: string): TranscriptionModelV1;
```

Usage Example:

```typescript
import { azure } from "@ai-sdk/azure";
import { experimental_transcribe as transcribe } from "ai";
import { readFileSync } from "fs";
const transcriptionModel = azure.transcription("whisper-1");
// Load audio file
const audioBuffer = readFileSync("./audio/meeting-recording.mp3");
const { text } = await transcribe({
model: transcriptionModel,
audio: audioBuffer,
});
console.log("Transcribed text:", text);import { azure } from "@ai-sdk/azure";
import { transcribe } from "ai";
import { readFileSync, readdirSync } from "fs";
import { join } from "path";
const transcriptionModel = azure.transcription("whisper-1");
async function transcribeDirectory(audioDir: string) {
const audioFiles = readdirSync(audioDir).filter(file =>
file.endsWith('.mp3') || file.endsWith('.wav') || file.endsWith('.m4a')
);
const results = [];
for (const filename of audioFiles) {
try {
const audioBuffer = readFileSync(join(audioDir, filename));
const { text } = await transcribe({
model: transcriptionModel,
audio: audioBuffer,
});
results.push({
filename,
text,
success: true,
});
console.log(`✓ Transcribed ${filename}`);
} catch (error) {
results.push({
filename,
text: null,
success: false,
error: error instanceof Error ? error.message : "Unknown error",
});
console.error(`✗ Failed to transcribe ${filename}:`, error);
}
}
return results;
}
// Transcribe all audio files in a directory
const transcriptions = await transcribeDirectory("./audio-files");
// Save results
transcriptions.forEach(result => {
if (result.success) {
console.log(`${result.filename}: ${result.text}`);
}
});
```

Transcribing a meeting recording and post-processing the text:

```typescript
import { azure } from "@ai-sdk/azure";
import { experimental_transcribe as transcribe } from "ai";
import { readFileSync } from "fs";
const transcriptionModel = azure.transcription("whisper-1");
async function transcribeMeeting(audioFile: string) {
const audioBuffer = readFileSync(audioFile);
const { text } = await transcribe({
model: transcriptionModel,
audio: audioBuffer,
});
// Split into sentences for better readability
const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0);
return {
fullText: text,
sentences: sentences.map(sentence => sentence.trim()),
wordCount: text.split(/\s+/).length,
estimatedDuration: `${Math.ceil(text.split(/\s+/).length / 150)} minutes`, // ~150 words per minute
};
}
const meetingTranscription = await transcribeMeeting("./meeting.mp3");
console.log("Full Transcription:");
console.log(meetingTranscription.fullText);
console.log("\nSentences:");
meetingTranscription.sentences.forEach((sentence, index) => {
console.log(`${index + 1}. ${sentence}`);
});
console.log(`\nWord Count: ${meetingTranscription.wordCount}`);
console.log(`Estimated Duration: ${meetingTranscription.estimatedDuration}`);
```

Transcribing a podcast episode and saving a formatted transcript:

```typescript
import { azure } from "@ai-sdk/azure";
import { experimental_transcribe as transcribe } from "ai";
import { readFileSync, writeFileSync, mkdirSync } from "fs";
import { dirname } from "path";
const transcriptionModel = azure.transcription("whisper-1");
async function transcribePodcast(audioFile: string, outputFile: string) {
console.log("Starting transcription...");
const audioBuffer = readFileSync(audioFile);
const startTime = Date.now();
const { text } = await transcribe({
model: transcriptionModel,
audio: audioBuffer,
});
const endTime = Date.now();
// Format the transcription
const formattedTranscription = `
# Podcast Transcription
Generated: ${new Date().toISOString()}
Processing Time: ${((endTime - startTime) / 1000).toFixed(1)} seconds
## Full Transcript
${text}
---
Transcribed using Azure OpenAI Whisper
`.trim();
// Ensure the output directory exists, then save the transcript
mkdirSync(dirname(outputFile), { recursive: true });
writeFileSync(outputFile, formattedTranscription);
console.log(`Transcription saved to ${outputFile}`);
console.log(`Processing time: ${((endTime - startTime) / 1000).toFixed(1)} seconds`);
return {
text,
processingTimeMs: endTime - startTime,
outputFile,
};
}
// Transcribe podcast episode
const result = await transcribePodcast(
"./podcast-episode-042.mp3",
"./transcripts/episode-042.md"
);
```

Transcription with retry logic and exponential backoff:

```typescript
import { azure } from "@ai-sdk/azure";
import { experimental_transcribe as transcribe } from "ai";
import { readFileSync } from "fs";
const transcriptionModel = azure.transcription("whisper-1");
async function transcribeWithRetry(
audioFile: string,
maxRetries: number = 3
): Promise<{ success: boolean; text?: string; error?: string }> {
let lastError: Error | null = null;
for (let attempt = 1; attempt <= maxRetries; attempt++) {
try {
console.log(`Transcription attempt ${attempt}/${maxRetries}...`);
const audioBuffer = readFileSync(audioFile);
const { text } = await transcribe({
model: transcriptionModel,
audio: audioBuffer,
});
console.log(`✓ Transcription successful on attempt ${attempt}`);
return { success: true, text };
} catch (error) {
lastError = error instanceof Error ? error : new Error("Unknown error");
console.warn(`✗ Attempt ${attempt} failed:`, lastError.message);
if (attempt < maxRetries) {
// Wait before retrying (exponential backoff)
const waitTime = Math.pow(2, attempt) * 1000;
console.log(`Waiting ${waitTime}ms before retry...`);
await new Promise(resolve => setTimeout(resolve, waitTime));
}
}
}
return {
success: false,
error: lastError?.message || "Transcription failed after all retries"
};
}
// Use with retry logic
const result = await transcribeWithRetry("./difficult-audio.mp3");
if (result.success) {
console.log("Transcription:", result.text);
} else {
console.error("Failed to transcribe:", result.error);
}
```

Azure OpenAI's Whisper model supports various audio formats, including mp3, mp4, mpeg, mpga, m4a, wav, and webm.
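The `audio` parameter is not limited to Node `Buffer`s. Assuming the `DataContent | URL` parameter type used by recent AI SDK versions, it also accepts a `Uint8Array`, an `ArrayBuffer`, a base64-encoded string, or a `URL` to a remote file. A brief sketch with placeholder file names:

```typescript
import { azure } from "@ai-sdk/azure";
import { experimental_transcribe as transcribe } from "ai";
import { readFileSync } from "fs";

const model = azure.transcription("whisper-1");

// Binary data read from disk (Buffer, Uint8Array, and ArrayBuffer all work).
const fromFile = await transcribe({
  model,
  audio: readFileSync("./clip.wav"),
});

// A remote audio file referenced by URL.
const fromUrl = await transcribe({
  model,
  audio: new URL("https://example.com/clip.mp3"),
});

console.log(fromFile.text, fromUrl.text);
```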
Transcription models implement the `TranscriptionModelV1` interface and are compatible with the AI SDK transcription function:

- `transcribe()` - converts an audio file to text

The transcription process returns the plain text of the spoken content in the audio file.
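Beyond `text`, the result object in recent AI SDK versions also exposes optional metadata such as the detected language, the audio duration, and timestamped segments. Field availability depends on the model and SDK version, so treat the following as a sketch rather than a guaranteed shape:

```typescript
import { azure } from "@ai-sdk/azure";
import { experimental_transcribe as transcribe } from "ai";
import { readFileSync } from "fs";

const result = await transcribe({
  model: azure.transcription("whisper-1"),
  audio: readFileSync("./audio/meeting-recording.mp3"),
});

console.log(result.text);              // full transcript as a plain string
console.log(result.language);          // detected language code, e.g. "en" (may be undefined)
console.log(result.durationInSeconds); // audio duration, if reported by the model

for (const segment of result.segments) {
  // Each segment carries its text plus start/end timestamps in seconds.
  console.log(`[${segment.startSecond}-${segment.endSecond}] ${segment.text}`);
}
```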