LangChain4j OpenAI Integration providing Java access to OpenAI APIs including chat models, embeddings, image generation, audio transcription, and moderation.
Audio transcription models convert spoken audio to written text using OpenAI's Whisper and GPT-4o audio models. Supports multiple audio formats, languages, and optional speaker diarization for identifying different speakers in the audio.
The transcription API is ideal for meeting transcripts, podcast notes, voice memos, and accessibility features. Advanced models provide enhanced accuracy and speaker identification capabilities.
Experimental synchronous audio transcription model that converts audio files to text. Supports various audio formats and optional parameters for language and prompts.
@Experimental
public class OpenAiAudioTranscriptionModel implements AudioTranscriptionModel {
public static Builder builder();
// Core transcription method
public AudioTranscriptionResponse transcribe(AudioTranscriptionRequest audioRequest);
// Model information
public ModelProvider provider();
}

Builder for configuring OpenAiAudioTranscriptionModel instances.
public static class Builder {
// Core configuration
public Builder apiKey(String apiKey);
public Builder baseUrl(String baseUrl);
public Builder organizationId(String organizationId);
public Builder projectId(String projectId);
public Builder modelName(String modelName);
public Builder modelName(OpenAiAudioTranscriptionModelName modelName);
// HTTP configuration
public Builder httpClientProvider(HttpClientBuilder httpClientBuilder);
public Builder timeout(Duration timeout);
public Builder maxRetries(Integer maxRetries);
// Logging
public Builder logRequests(Boolean logRequests);
public Builder logResponses(Boolean logResponses);
public Builder logger(Logger logger);
// Build
public OpenAiAudioTranscriptionModel build();
}import dev.langchain4j.model.openai.OpenAiAudioTranscriptionModel;
import dev.langchain4j.model.openai.OpenAiAudioTranscriptionModelName;
import dev.langchain4j.model.audio.AudioTranscriptionRequest;
import dev.langchain4j.model.audio.AudioTranscriptionResponse;
import java.nio.file.Files;
import java.nio.file.Paths;
// Create transcription model
// Uses whisper-1, the baseline OpenAI transcription model (see model comparison below).
OpenAiAudioTranscriptionModel model = OpenAiAudioTranscriptionModel.builder()
.apiKey(System.getenv("OPENAI_API_KEY"))
.modelName(OpenAiAudioTranscriptionModelName.WHISPER_1)
.build();
// Load audio file
// NOTE(review): Files.readAllBytes throws IOException — the enclosing method must declare or handle it.
byte[] audioData = Files.readAllBytes(Paths.get("meeting.mp3"));
// Create transcription request
// fileName is passed alongside the bytes; presumably its extension drives format detection — confirm against API docs.
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioData(audioData)
.fileName("meeting.mp3")
.build();
// Transcribe audio (synchronous call; blocks until the API responds)
AudioTranscriptionResponse response = model.transcribe(request);
System.out.println("Transcription: " + response.text());import java.time.Duration;
// Create model with advanced settings
// NOTE(review): this snippet also needs java.time.Duration (imported just above)
// plus the imports shown in the first example.
OpenAiAudioTranscriptionModel model = OpenAiAudioTranscriptionModel.builder()
.apiKey(System.getenv("OPENAI_API_KEY"))
.modelName(OpenAiAudioTranscriptionModelName.GPT_4_O_TRANSCRIBE)
.timeout(Duration.ofMinutes(5)) // Long audio files need more time
.maxRetries(3)
.logRequests(true)
.logResponses(true)
.build();
// Load audio
byte[] audioData = Files.readAllBytes(Paths.get("podcast.mp3"));
// Create request with language hint and prompt
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioData(audioData)
.fileName("podcast.mp3")
.language("en") // English
.prompt("This is a podcast about artificial intelligence and machine learning.")
.temperature(0.2) // Lower temperature for more consistent output
.responseFormat("verbose_json") // Get detailed response with timestamps
.build();
AudioTranscriptionResponse response = model.transcribe(request);
System.out.println("Transcription:\n" + response.text());// Use diarization model to identify different speakers
// The diarization model identifies and labels individual speakers in the transcript.
OpenAiAudioTranscriptionModel diarizeModel = OpenAiAudioTranscriptionModel.builder()
.apiKey(System.getenv("OPENAI_API_KEY"))
.modelName(OpenAiAudioTranscriptionModelName.GPT_4_O_TRANSCRIBE_DIARIZE)
.build();
byte[] meetingAudio = Files.readAllBytes(Paths.get("team_meeting.mp3"));
// NOTE(review): verbose_json is requested so segment/speaker metadata is returned — confirm against API docs.
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioData(meetingAudio)
.fileName("team_meeting.mp3")
.responseFormat("verbose_json")
.build();
AudioTranscriptionResponse response = diarizeModel.transcribe(request);
// Response includes speaker labels
System.out.println("Meeting transcript with speakers:");
System.out.println(response.text());// Transcribe audio in different languages
// ISO-639-1 language codes; each index pairs with the audio file at the same index below.
String[] languages = {"es", "fr", "de", "ja", "zh"};
// NOTE(review): these audio files are assumed to exist in the working directory.
String[] audioFiles = {
"spanish.mp3", "french.mp3", "german.mp3",
"japanese.mp3", "chinese.mp3"
};
OpenAiAudioTranscriptionModel model = OpenAiAudioTranscriptionModel.builder()
.apiKey(System.getenv("OPENAI_API_KEY"))
.modelName(OpenAiAudioTranscriptionModelName.WHISPER_1)
.build();
// Passing the known language as a hint improves accuracy versus auto-detection.
for (int i = 0; i < languages.length; i++) {
byte[] audioData = Files.readAllBytes(Paths.get(audioFiles[i]));
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioData(audioData)
.fileName(audioFiles[i])
.language(languages[i])
.build();
AudioTranscriptionResponse response = model.transcribe(request);
System.out.println(languages[i] + ": " + response.text());
}
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Batch-transcribes every supported audio file (.mp3, .wav, .m4a) in a directory,
 * one request per file, with a fixed delay between requests for rate limiting.
 */
public class AudioBatchTranscriber {

    private final OpenAiAudioTranscriptionModel model;

    public AudioBatchTranscriber(String apiKey) {
        this.model = OpenAiAudioTranscriptionModel.builder()
                .apiKey(apiKey)
                .modelName(OpenAiAudioTranscriptionModelName.WHISPER_1)
                .timeout(Duration.ofMinutes(5)) // long audio files need a generous timeout
                .build();
    }

    /**
     * Transcribes every supported audio file directly inside the given directory
     * (non-recursive).
     *
     * @param directoryPath path of the directory to scan
     * @return one transcription per audio file, in directory-listing order
     * @throws IllegalArgumentException if the path is not a readable directory
     * @throws Exception if reading a file or transcribing it fails
     */
    public List<String> transcribeDirectory(String directoryPath) throws Exception {
        File directory = new File(directoryPath);
        File[] audioFiles = directory.listFiles(
                f -> f.getName().endsWith(".mp3") ||
                     f.getName().endsWith(".wav") ||
                     f.getName().endsWith(".m4a")
        );
        // File.listFiles returns null when the path does not exist or is not a
        // directory — fail fast instead of throwing a NullPointerException below.
        if (audioFiles == null) {
            throw new IllegalArgumentException("Not a readable directory: " + directoryPath);
        }
        List<String> transcriptions = new ArrayList<>();
        for (File file : audioFiles) {
            System.out.println("Transcribing: " + file.getName());
            byte[] audioData = Files.readAllBytes(file.toPath());
            AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
                    .audioData(audioData)
                    .fileName(file.getName())
                    .build();
            AudioTranscriptionResponse response = model.transcribe(request);
            transcriptions.add(response.text());
            // Crude client-side rate limiting between consecutive API calls
            Thread.sleep(1000);
        }
        return transcriptions;
    }
}
// Usage
AudioBatchTranscriber transcriber = new AudioBatchTranscriber(apiKey);
List<String> results = transcriber.transcribeDirectory("./audio_files/");
@Experimental
public enum OpenAiAudioTranscriptionModelName {
WHISPER_1("whisper-1"),
GPT_4_O_TRANSCRIBE("gpt-4o-transcribe"),
GPT_4_O_MINI_TRANSCRIBE("gpt-4o-mini-transcribe"),
GPT_4_O_TRANSCRIBE_DIARIZE("gpt-4o-transcribe-diarize");
public String toString();
}

| Model | Accuracy | Speed | Diarization | Cost |
|---|---|---|---|---|
| whisper-1 | Good | Fast | No | Low |
| gpt-4o-transcribe | Excellent | Medium | No | Medium |
| gpt-4o-mini-transcribe | Very Good | Fast | No | Low-Medium |
| gpt-4o-transcribe-diarize | Excellent | Slow | Yes | High |
public class AudioTranscriptionRequest {
public static Builder builder();
public byte[] audioData();
public String fileName();
public String language();
public String prompt();
public Double temperature();
public String responseFormat();
}
public static class Builder {
public Builder audioData(byte[] audioData);
public Builder fileName(String fileName);
public Builder language(String language);
public Builder prompt(String prompt);
public Builder temperature(Double temperature);
public Builder responseFormat(String responseFormat);
public AudioTranscriptionRequest build();
}public class AudioTranscriptionResponse {
public String text();
public String language();
public Double duration();
public List<Segment> segments();
}public interface AudioTranscriptionModel {
AudioTranscriptionResponse transcribe(AudioTranscriptionRequest request);
}

AudioTranscriptionRequest parameters:
- audioData — the raw audio file bytes.
- fileName — the name of the audio file.
- language — the ISO-639-1 language code of the audio.
- prompt — optional text to guide the transcription.
Example prompts:
// Technical content
.prompt("This is a discussion about machine learning, neural networks, and deep learning algorithms.")
// Names and terminology
.prompt("The speakers are Dr. Smith and Prof. Johnson discussing quantum computing.")
// Formatting hints
.prompt("The transcript should include technical terms like API, SDK, and REST.")

Controls randomness in transcription:
Format of the transcription output:
- "json": simple JSON with text only (default)
- "text": plain text
- "srt": SubRip subtitle format
- "verbose_json": detailed JSON with timestamps and metadata
- "vtt": WebVTT subtitle format

Maximum time to wait for transcription:
Number of retry attempts on failure:
| Format | Extension | Notes |
|---|---|---|
| MP3 | .mp3 | Most common, good compression |
| MP4 | .mp4, .m4a | Audio from video files |
| MPEG | .mpeg, .mpga | Standard audio format |
| WAV | .wav | Uncompressed, larger files |
| WebM | .webm | Modern web format |
Optimize File Size:
// Convert large WAV files to MP3 to reduce size and upload time
// Use external tools like ffmpeg before uploading

Split Long Files:
// Transcribes a long recording by splitting it into fixed-size segments and
// feeding each previous transcript back in as context via the prompt.
// NOTE(review): splitAudio and getContextFromPrevious are illustrative helpers not
// defined in this document, and 'model' must be a field of the enclosing class.
public List<AudioTranscriptionResponse> transcribeLongAudio(
byte[] audioData,
int segmentSizeMinutes
) throws Exception {
// Split audio into segments (use audio processing library)
List<byte[]> segments = splitAudio(audioData, segmentSizeMinutes);
List<AudioTranscriptionResponse> transcriptions = new ArrayList<>();
for (int i = 0; i < segments.size(); i++) {
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioData(segments.get(i))
.fileName("segment_" + i + ".mp3")
.prompt(getContextFromPrevious(transcriptions)) // Maintain context
.build();
AudioTranscriptionResponse response = model.transcribe(request);
transcriptions.add(response);
}
return transcriptions;
}Specify Language:
// Better accuracy and faster processing
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioData(audioData)
.fileName("audio.mp3")
.language("en") // Specify when known
.build();

Provide Context:
// Help with technical terms and proper nouns
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioData(audioData)
.fileName("tech_talk.mp3")
.prompt("Discussion about LangChain4j, OpenAI API, and Java programming.")
.build();

Use High-Quality Audio:
Podcasts:
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioData(audioData)
.fileName("podcast.mp3")
.responseFormat("verbose_json") // Get timestamps for chapters
.prompt("This is a podcast with intro music and multiple segments.")
.build();

Meetings:
// Use diarization for speaker identification
OpenAiAudioTranscriptionModel model = OpenAiAudioTranscriptionModel.builder()
.apiKey(apiKey)
.modelName(OpenAiAudioTranscriptionModelName.GPT_4_O_TRANSCRIBE_DIARIZE)
.build();
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioData(audioData)
.fileName("meeting.mp3")
.responseFormat("verbose_json")
.build();

Lectures:
AudioTranscriptionRequest request = AudioTranscriptionRequest.builder()
.audioData(audioData)
.fileName("lecture.mp3")
.prompt("University lecture on quantum physics by Professor Anderson.")
.temperature(0.0) // Maximum accuracy
.build();public class TranscriptionProcessor {
// Collapses runs of whitespace and restores a space after sentence-ending
// punctuation that was fused to the next capitalized word.
public String cleanTranscription(String rawText) {
return rawText
.replaceAll("\\s+", " ") // Normalize whitespace
.replaceAll("([.!?])([A-Z])", "$1 $2") // Add space after punctuation
.trim();
}
// Renders one "HH:MM:SS - HH:MM:SS: text" line per segment.
// Falls back to the plain transcript when the response carries no segment data
// (segments are only present for detailed formats such as verbose_json).
public String addTimestamps(AudioTranscriptionResponse response) {
if (response.segments() == null) {
return response.text();
}
StringBuilder formatted = new StringBuilder();
for (Segment segment : response.segments()) {
formatted.append(formatTime(segment.start()))
.append(" - ")
.append(formatTime(segment.end()))
.append(": ")
.append(segment.text())
.append("\n");
}
return formatted.toString();
}
// Converts fractional seconds to zero-padded HH:MM:SS (sub-second part truncated).
private String formatTime(double seconds) {
int hours = (int) (seconds / 3600);
int minutes = (int) ((seconds % 3600) / 60);
int secs = (int) (seconds % 60);
return String.format("%02d:%02d:%02d", hours, minutes, secs);
}
}Automatically transcribe meetings for notes and action items.
Convert podcasts to text for show notes and SEO.
Transcribe voice recordings into searchable text.
Provide captions and transcripts for audio content.
Extract insights from customer calls or interviews.
Transcribe foreign language audio for study materials.
Convert recordings to text for documentation (review for accuracy).
Charged per second of audio:
Install with Tessl CLI
npx tessl i tessl/maven-dev-langchain4j--langchain4j-open-ai@1.11.0