LangChain4j integration for Azure OpenAI providing chat, streaming, embeddings, image generation, audio transcription, and token counting capabilities
Audio transcription model for speech-to-text using Azure-hosted Whisper models. Converts audio files to text with support for various audio formats and output formats.
Note: This feature is marked as @Experimental and may change in future versions.
import dev.langchain4j.model.azure.AzureOpenAiAudioTranscriptionModel;
import dev.langchain4j.model.audio.AudioTranscriptionModel;
import dev.langchain4j.model.audio.AudioTranscriptionRequest;
import dev.langchain4j.model.audio.AudioTranscriptionResponse;
import com.azure.ai.openai.models.AudioTranscriptionFormat;

AzureOpenAiAudioTranscriptionModel model = AzureOpenAiAudioTranscriptionModel.builder()
.endpoint("https://your-resource.openai.azure.com/")
.apiKey("your-api-key")
.deploymentName("whisper")
.serviceVersion("2024-02-15-preview")
.responseFormat(AudioTranscriptionFormat.JSON)
.build();
// Transcribe audio file
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioFile("/path/to/audio.mp3")
.build();
AudioTranscriptionResponse response = model.transcribe(request);
String transcription = response.text();

package dev.langchain4j.model.azure;
/**
* @Experimental - API may change in future versions
* Azure OpenAI audio transcription using Whisper.
* Thread-safe: Yes - instances are immutable and thread-safe.
* File size limit: Maximum 25 MB per audio file.
* Supported formats: MP3, MP4, MPEG, MPGA, M4A, WAV, WEBM.
* Processing time: Varies by audio length (typically 10-50% of audio duration).
*/
@dev.langchain4j.Experimental
class AzureOpenAiAudioTranscriptionModel implements dev.langchain4j.model.audio.AudioTranscriptionModel {
static Builder builder();
/**
* Transcribes audio file to text.
* @param request AudioTranscriptionRequest with file and options
* @return AudioTranscriptionResponse with text and optional timestamps
* @throws IllegalArgumentException if file doesn't exist, > 25MB, or unsupported format
* @throws java.util.concurrent.TimeoutException if transcription exceeds timeout
* @throws RuntimeException for network or API errors
*/
dev.langchain4j.model.audio.AudioTranscriptionResponse
transcribe(dev.langchain4j.model.audio.AudioTranscriptionRequest request);
/**
* Returns model provider.
* @return ModelProvider.AZURE
*/
dev.langchain4j.model.provider.ModelProvider provider();
/**
* Alternative constructor using OpenAI client.
* Recommended: Use builder() instead.
* @param client OpenAIClient instance
* @param deploymentName Whisper deployment name
* @param responseFormat Output format
*/
AzureOpenAiAudioTranscriptionModel(
com.azure.ai.openai.OpenAIClient client,
String deploymentName,
com.azure.ai.openai.models.AudioTranscriptionFormat responseFormat
);
class Builder {
// Mandatory
Builder endpoint(String endpoint);
Builder serviceVersion(String serviceVersion);
Builder deploymentName(String deploymentName);
// Authentication
Builder apiKey(String apiKey);
Builder nonAzureApiKey(String apiKey);
Builder tokenCredential(com.azure.core.credential.TokenCredential credential);
// Transcription configuration
/**
* Output format.
* @param responseFormat JSON, TEXT, SRT, VTT, or VERBOSE_JSON
* @default JSON
*/
Builder responseFormat(com.azure.ai.openai.models.AudioTranscriptionFormat responseFormat);
// HTTP configuration
/**
* @default 120 seconds (longer for audio processing)
*/
Builder timeout(java.time.Duration timeout);
Builder maxRetries(Integer maxRetries);
Builder retryOptions(com.azure.core.http.policy.RetryOptions retryOptions);
Builder proxyOptions(com.azure.core.http.ProxyOptions proxyOptions);
Builder httpClientProvider(com.azure.core.http.HttpClientProvider httpClientProvider);
Builder openAIClient(com.azure.ai.openai.OpenAIClient client);
Builder customHeaders(java.util.Map<String, String> customHeaders);
Builder userAgentSuffix(String userAgentSuffix);
Builder logRequestsAndResponses(Boolean logRequestsAndResponses);
AzureOpenAiAudioTranscriptionModel build();
}
}

package dev.langchain4j.model.audio;
/**
* Audio transcription request.
*/
class AudioTranscriptionRequest {
static Builder builder();
/**
* @return Audio file path
*/
String audioFile();
/**
* @return ISO-639-1 language code or null (auto-detect)
*/
String language();
/**
* @return Prompt for context/style or null
*/
String prompt();
/**
* @return Temperature 0.0-1.0 or null
*/
Double temperature();
class Builder {
/**
* Audio file path (absolute or relative).
* @param audioFile Path to MP3, WAV, M4A, etc. (max 25MB)
* @throws IllegalArgumentException if null
*/
Builder audioFile(String audioFile);
/**
* Audio language (optional, auto-detected if not specified).
* @param language ISO-639-1 code: "en", "es", "fr", etc.
* @default null (auto-detect)
*/
Builder language(String language);
/**
* Prompt to guide transcription style or continue previous segment.
* @param prompt Context string (e.g., "Technical discussion about AI")
* @default null
*/
Builder prompt(String prompt);
/**
* Sampling temperature for randomness.
* @param temperature 0.0 (deterministic) to 1.0 (more random)
* @default 0.0 (deterministic)
* @throws IllegalArgumentException if not in [0.0, 1.0]
*/
Builder temperature(Double temperature);
AudioTranscriptionRequest build();
}
}

package dev.langchain4j.model.audio;
/**
* Audio transcription response.
*/
class AudioTranscriptionResponse {
/**
* Transcribed text (always present).
* @return Transcription string
*/
String text();
/**
* Detected language (if language detection used).
* @return ISO-639-1 code or null
*/
String language();
/**
* Audio duration in seconds (verbose JSON only).
* @return Duration or null
*/
Double duration();
/**
* Segments with timestamps (verbose JSON only).
* @return List of segments or null
*/
java.util.List<TranscriptionSegment> segments();
/**
* Word-level timestamps (verbose JSON only).
* @return List of words or null
*/
java.util.List<TranscriptionWord> words();
}
/**
* Transcription segment with timing.
*/
class TranscriptionSegment {
String text();
double start(); // Start time in seconds
double end(); // End time in seconds
int id(); // Segment ID
}
/**
* Individual word with timestamp.
*/
class TranscriptionWord {
String word();
double start(); // Word start in seconds
double end(); // Word end in seconds
}

package com.azure.ai.openai.models;
/**
* Output format for transcription.
*/
enum AudioTranscriptionFormat {
/** JSON format with basic text and metadata */
JSON,
/** Plain text only (no metadata) */
TEXT,
/** SubRip subtitle format (.srt) with timestamps */
SRT,
/** WebVTT subtitle format (.vtt) with timestamps */
VTT,
/** Verbose JSON with word-level timestamps and segments */
VERBOSE_JSON
}

JSON (default):
{
"text": "Hello, how are you today?"
}

TEXT:
Hello, how are you today?

SRT (subtitles):
1
00:00:00,000 --> 00:00:02,500
Hello, how are you today?

VTT (subtitles):
WEBVTT
00:00:00.000 --> 00:00:02.500
Hello, how are you today?

VERBOSE_JSON:
{
"text": "Hello, how are you today?",
"language": "en",
"duration": 2.5,
"segments": [
{
"id": 0,
"text": "Hello, how are you today?",
"start": 0.0,
"end": 2.5
}
],
"words": [
{"word": "Hello", "start": 0.0, "end": 0.5},
{"word": "how", "start": 0.6, "end": 0.8},
{"word": "are", "start": 0.9, "end": 1.0},
{"word": "you", "start": 1.1, "end": 1.3},
{"word": "today", "start": 1.4, "end": 2.5}
]
}

Supported audio formats:

| Format | Extension | Notes |
|---|---|---|
| MP3 | .mp3 | Most common |
| MP4 | .mp4 | Video with audio |
| MPEG | .mpeg | Audio/video |
| MPGA | .mpga | MPEG audio |
| M4A | .m4a | Apple audio |
| WAV | .wav | Uncompressed |
| WEBM | .webm | Web media |
File size limit: 25 MB maximum
Whisper supports 50+ languages. Common examples (ISO-639-1 codes):
| Code | Language | Code | Language |
|---|---|---|---|
| en | English | es | Spanish |
| fr | French | de | German |
| it | Italian | pt | Portuguese |
| ja | Japanese | ko | Korean |
| zh | Chinese | ru | Russian |
| ar | Arabic | hi | Hindi |
Language detection: If language not specified, Whisper auto-detects.
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioFile("/path/to/meeting.mp3")
.build();
AudioTranscriptionResponse response = model.transcribe(request);
System.out.println("Transcription: " + response.text());

// Transcription with an explicit source language
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioFile("/path/to/spanish_audio.wav")
.language("es") // Spanish
.build();
AudioTranscriptionResponse response = model.transcribe(request);

// Prompt guides transcription style and terminology
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioFile("/path/to/technical_talk.mp3")
.prompt("This is a technical discussion about machine learning and neural networks.")
.build();
AudioTranscriptionResponse response = model.transcribe(request);

// Generating SRT subtitles from a video's audio track
AzureOpenAiAudioTranscriptionModel model =
AzureOpenAiAudioTranscriptionModel.builder()
.endpoint(endpoint)
.apiKey(apiKey)
.deploymentName("whisper")
.serviceVersion("2024-02-15-preview")
.responseFormat(AudioTranscriptionFormat.SRT)
.build();
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioFile("/path/to/video.mp4")
.build();
AudioTranscriptionResponse response = model.transcribe(request);
// Save SRT file
Files.writeString(Path.of("subtitles.srt"), response.text());

// Verbose JSON output with word- and segment-level timestamps
AzureOpenAiAudioTranscriptionModel model =
AzureOpenAiAudioTranscriptionModel.builder()
.endpoint(endpoint)
.apiKey(apiKey)
.deploymentName("whisper")
.responseFormat(AudioTranscriptionFormat.VERBOSE_JSON)
.build();
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioFile("/path/to/audio.wav")
.build();
AudioTranscriptionResponse response = model.transcribe(request);
// Access detailed timing
System.out.printf("Duration: %.2f seconds%n", response.duration());
System.out.println("Language: " + response.language());
// Word-level timing
for (TranscriptionWord word : response.words()) {
System.out.printf("%s [%.2f - %.2f]%n",
word.word(), word.start(), word.end());
}

// Error handling
try {
AudioTranscriptionResponse response = model.transcribe(request);
} catch (IllegalArgumentException e) {
// Invalid input:
// - Audio file doesn't exist
// - File exceeds 25 MB
// - Unsupported audio format
// - Invalid temperature (not in [0.0, 1.0])
System.err.println("Invalid input: " + e.getMessage());
} catch (java.util.concurrent.TimeoutException e) {
// Transcription exceeded timeout (default 120s)
// Long audio files may need longer timeout
System.err.println("Request timed out");
} catch (RuntimeException e) {
// Network, API, or auth error
System.err.println("Error: " + e.getMessage());
}

// Check file size before transcription
File audioFile = new File("/path/to/audio.mp3");
long sizeInMB = audioFile.length() / (1024 * 1024);
if (sizeInMB > 25) {
// Split large file into segments
List<File> segments = splitAudioFile(audioFile, 20); // 20 MB segments
StringBuilder fullTranscription = new StringBuilder();
for (File segment : segments) {
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioFile(segment.getAbsolutePath())
.build();
AudioTranscriptionResponse response = model.transcribe(request);
fullTranscription.append(response.text()).append(" ");
}
}

Better audio quality = better transcription:
Transcription time varies:
Formula: Processing time ≈ 10-50% of audio duration
Implement appropriate timeouts:
// For 30-minute audio: 30 * 60 * 0.5 = 900 seconds max
.timeout(Duration.ofSeconds(900)) // 15-minute timeout

Install with Tessl CLI
npx tessl i tessl/maven-dev-langchain4j--langchain4j-azure-open-ai