Spring Boot Starter for OpenAI integration providing auto-configuration for chat completion, embeddings, image generation, audio speech synthesis, audio transcription, and content moderation models. Includes high-level ChatClient API and conversation memory support.
The OpenAiAudioTranscriptionModel transcribes audio files to text using OpenAI's Whisper model with support for multiple languages and output formats.
import org.springframework.ai.openai.OpenAiAudioTranscriptionModel;
import org.springframework.ai.openai.OpenAiAudioTranscriptionOptions;
import org.springframework.ai.openai.api.OpenAiAudioApi;
import org.springframework.ai.model.AudioTranscriptionPrompt;
import org.springframework.ai.model.AudioTranscriptionResponse;
import org.springframework.core.io.Resource;
import org.springframework.core.io.FileSystemResource;package org.springframework.ai.openai;
public class OpenAiAudioTranscriptionModel implements AudioTranscriptionModel {
// Constructors
public OpenAiAudioTranscriptionModel(OpenAiAudioApi audioApi);
public OpenAiAudioTranscriptionModel(OpenAiAudioApi audioApi, OpenAiAudioTranscriptionOptions options);
public OpenAiAudioTranscriptionModel(OpenAiAudioApi audioApi, OpenAiAudioTranscriptionOptions options, RetryTemplate retryTemplate);
// Transcribe audio with options
public AudioTranscriptionResponse call(AudioTranscriptionPrompt transcriptionPrompt);
// Convenience method
public AudioTranscriptionResponse call(Resource audioResource);
}package org.springframework.ai.openai;
/**
 * Per-request options for OpenAI audio transcription: model id, input
 * language hint, guidance prompt, response format, sampling temperature,
 * and timestamp granularity.
 */
public class OpenAiAudioTranscriptionOptions implements AudioTranscriptionOptions {
    /** Entry point for the fluent builder. */
    public static OpenAiAudioTranscriptionOptions.Builder builder();

    // Getters
    public String getModel();
    public String getLanguage();
    public String getPrompt();
    public OpenAiAudioApi.TranscriptResponseFormat getResponseFormat();
    public Float getTemperature();
    // FIX(review): granularity is not a response format. The Spring AI API types
    // this as OpenAiAudioApi.TranscriptionRequest.GranularityType (WORD/SEGMENT),
    // not TranscriptResponseFormat — confirm against the starter version in use.
    public OpenAiAudioApi.TranscriptionRequest.GranularityType getGranularityType();

    // Setters
    public void setModel(String model);
    public void setLanguage(String language);
    public void setPrompt(String prompt);
    public void setResponseFormat(OpenAiAudioApi.TranscriptResponseFormat responseFormat);
    public void setTemperature(Float temperature);
    public void setGranularityType(OpenAiAudioApi.TranscriptionRequest.GranularityType granularityType);
}

import org.springframework.ai.openai.OpenAiAudioTranscriptionModel;
import org.springframework.ai.model.AudioTranscriptionPrompt;
import org.springframework.ai.model.AudioTranscriptionResponse;
import org.springframework.core.io.FileSystemResource;
import org.springframework.core.io.Resource;
import org.springframework.stereotype.Service;
@Service
public class TranscriptionService {

    private final OpenAiAudioTranscriptionModel transcriptionModel;

    /** The model bean is auto-configured by the starter and injected here. */
    public TranscriptionService(OpenAiAudioTranscriptionModel transcriptionModel) {
        this.transcriptionModel = transcriptionModel;
    }

    /**
     * Transcribes the audio file at the given path using the model's default
     * options, via an explicit AudioTranscriptionPrompt.
     */
    public String transcribeAudio(String audioFilePath) {
        Resource audioFile = new FileSystemResource(audioFilePath);
        AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile);
        AudioTranscriptionResponse response = transcriptionModel.call(prompt);
        return response.getResult().getOutput();
    }

    /**
     * Same as {@link #transcribeAudio}, but uses the convenience
     * call(Resource) overload instead of building a prompt.
     */
    public String transcribeAudioDirect(String audioFilePath) {
        Resource audioFile = new FileSystemResource(audioFilePath);
        AudioTranscriptionResponse response = transcriptionModel.call(audioFile);
        return response.getResult().getOutput();
    }
}

import org.springframework.ai.openai.OpenAiAudioTranscriptionOptions;
import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptResponseFormat;
/**
 * Transcribes with explicit per-request options: English language hint,
 * plain-text output, and deterministic (0.0) temperature.
 */
public String transcribeWithOptions(String audioFilePath) {
    Resource audioFile = new FileSystemResource(audioFilePath);
    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
            .language("en")
            .responseFormat(TranscriptResponseFormat.TEXT)
            .temperature(0.0f)
            .build();
    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    return response.getResult().getOutput();
}

/** Transcribes Spanish audio by passing the ISO-639-1 language hint "es". */
public String transcribeSpanish(String audioFilePath) {
    Resource audioFile = new FileSystemResource(audioFilePath);
    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
            .language("es") // Spanish
            .build();
    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    return response.getResult().getOutput();
}

/**
 * Guides transcription style and vocabulary with a context prompt (e.g.
 * domain terminology the model should prefer).
 */
public String transcribeWithGuidance(String audioFilePath, String contextPrompt) {
    Resource audioFile = new FileSystemResource(audioFilePath);
    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
            .prompt(contextPrompt) // Guide transcription with context
            .build();
    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    return response.getResult().getOutput();
}

// Example usage of the guidance prompt.
public void demonstrateGuidance() {
    String audioPath = "meeting.mp3";
    String guidance = "This is a technical discussion about Spring Boot and microservices.";
    String transcript = transcribeWithGuidance(audioPath, guidance);
}

import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptResponseFormat;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
/**
 * Requests JSON output and parses it into a JsonNode. The JSON format wraps
 * the transcript text in a small JSON object.
 */
public JsonNode transcribeAsJson(String audioFilePath) throws Exception {
    Resource audioFile = new FileSystemResource(audioFilePath);
    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
            .responseFormat(TranscriptResponseFormat.JSON)
            .build();
    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    String jsonOutput = response.getResult().getOutput();
    // NOTE(review): ObjectMapper is thread-safe and expensive to build; in
    // production cache it as a static final field instead of per call.
    ObjectMapper mapper = new ObjectMapper();
    return mapper.readTree(jsonOutput);
}

/**
 * Requests verbose JSON output, which additionally includes language,
 * duration, and timestamped segments.
 */
public JsonNode transcribeVerbose(String audioFilePath) throws Exception {
    Resource audioFile = new FileSystemResource(audioFilePath);
    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
            .responseFormat(TranscriptResponseFormat.VERBOSE_JSON)
            .build();
    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    String jsonOutput = response.getResult().getOutput();
    ObjectMapper mapper = new ObjectMapper();
    JsonNode json = mapper.readTree(jsonOutput);
    // Verbose JSON includes: text, language, duration, segments with timestamps
    return json;
}

import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptResponseFormat;
import java.nio.file.Files;
import java.nio.file.Path;
/**
 * Transcribes to SubRip (SRT) subtitle format, writes the result to the
 * given output path, and returns the SRT content.
 */
public String transcribeToSRT(String audioFilePath, String outputSrtPath) throws Exception {
    Resource audioFile = new FileSystemResource(audioFilePath);
    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
            .responseFormat(TranscriptResponseFormat.SRT)
            .build();
    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    String srtContent = response.getResult().getOutput();
    Files.writeString(Path.of(outputSrtPath), srtContent);
    return srtContent;
}

/**
 * Transcribes to WebVTT subtitle format, writes the result to the given
 * output path, and returns the VTT content.
 */
public String transcribeToVTT(String audioFilePath, String outputVttPath) throws Exception {
    Resource audioFile = new FileSystemResource(audioFilePath);
    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
            .responseFormat(TranscriptResponseFormat.VTT)
            .build();
    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    String vttContent = response.getResult().getOutput();
    Files.writeString(Path.of(outputVttPath), vttContent);
    return vttContent;
}

/** Transcribes at temperature 0.0 for maximally repeatable output. */
public String transcribeDeterministic(String audioFilePath) {
    Resource audioFile = new FileSystemResource(audioFilePath);
    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
            .temperature(0.0f) // More deterministic
            .build();
    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    return response.getResult().getOutput();
}

/** Transcribes at temperature 1.0, allowing more varied interpretation. */
public String transcribeCreative(String audioFilePath) {
    Resource audioFile = new FileSystemResource(audioFilePath);
    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
            .temperature(1.0f) // More varied output
            .build();
    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    return response.getResult().getOutput();
}

import org.springframework.core.io.ByteArrayResource;
/**
 * Transcribes in-memory audio bytes. The anonymous ByteArrayResource must
 * override getFilename() because the API infers the audio format from the
 * filename extension.
 */
public String transcribeFromBytes(byte[] audioData, String filename) {
    Resource audioFile = new ByteArrayResource(audioData) {
        @Override
        public String getFilename() {
            return filename; // Must provide filename with extension
        }
    };
    // FIX(review): original used nonexistent TranscriptionPrompt/-Response;
    // the Spring AI types are AudioTranscriptionPrompt/-Response, as used by
    // every other example in this document.
    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    return response.getResult().getOutput();
}

Configure via application.properties:
# API Connection
spring.ai.openai.audio.transcription.api-key=sk-...
spring.ai.openai.audio.transcription.base-url=https://api.openai.com
spring.ai.openai.audio.transcription.project-id=proj_...
spring.ai.openai.audio.transcription.organization-id=org-...
# Model Configuration
spring.ai.openai.audio.transcription.options.model=whisper-1
spring.ai.openai.audio.transcription.options.language=en
spring.ai.openai.audio.transcription.options.response-format=text
spring.ai.openai.audio.transcription.options.temperature=0.7
spring.ai.openai.audio.transcription.options.timestamp-granularities=segment

Supported model: whisper-1 - OpenAI's Whisper model (default, only option).

Specify the input language using ISO-639-1 codes:
en - English, es - Spanish, fr - French, de - German, it - Italian, pt - Portuguese, nl - Dutch, ja - Japanese, ko - Korean, zh - Chinese. Providing a language hint improves accuracy and speed.
Response formats: TEXT - plain text transcript (default); JSON - JSON with transcript text; VERBOSE_JSON - detailed JSON with segments, timestamps, language, and duration; SRT - SubRip subtitle format (with timestamps); VTT - WebVTT subtitle format (with timestamps).

Temperature values: 0.0 (more deterministic), 0.7 (balanced), 1.0 (more varied). A lower temperature yields more conservative transcription; a higher temperature allows more creative interpretation.
Optional text to guide transcription style and vocabulary:
The following audio formats are supported:
File size limit: 25 MB maximum
import org.springframework.ai.openai.api.OpenAiApiException;
/**
 * Transcribes with explicit API-error handling, wrapping failures in a
 * RuntimeException that preserves the cause.
 */
public String safeTranscribe(String audioFilePath) {
    try {
        Resource audioFile = new FileSystemResource(audioFilePath);
        AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile);
        AudioTranscriptionResponse response = transcriptionModel.call(prompt);
        return response.getResult().getOutput();
    } catch (OpenAiApiException e) {
        // Handle API errors (rate limits, invalid audio format, etc.)
        // NOTE(review): confirm OpenAiApiException exists in the starter version
        // in use; newer Spring AI releases surface errors via the
        // NonTransientAiException/TransientAiException hierarchy instead.
        throw new RuntimeException("Transcription failed: " + e.getMessage(), e);
    }
}

package org.springframework.ai.openai;
/** Fluent builder for OpenAiAudioTranscriptionOptions. */
public static class Builder {
    public Builder model(String model);
    public Builder language(String language);
    public Builder prompt(String prompt);
    public Builder responseFormat(OpenAiAudioApi.TranscriptResponseFormat responseFormat);
    public Builder temperature(Float temperature);
    // FIX(review): granularity is not a response format; the Spring AI API types
    // this parameter as OpenAiAudioApi.TranscriptionRequest.GranularityType —
    // confirm against the starter version in use.
    public Builder granularityType(OpenAiAudioApi.TranscriptionRequest.GranularityType granularityType);
    public OpenAiAudioTranscriptionOptions build();
}

package org.springframework.ai.openai.api.OpenAiAudioApi;
/**
 * Wire values for the transcription response_format request parameter.
 * (Listing shows constants with their JSON values; field and constructor
 * are omitted from this reference.)
 */
public enum TranscriptResponseFormat {
    TEXT("text"),
    JSON("json"),
    VERBOSE_JSON("verbose_json"),
    SRT("srt"),
    VTT("vtt");
}

package org.springframework.ai.model;
/**
 * Request object pairing an audio Resource with optional per-request
 * transcription options.
 */
public class AudioTranscriptionPrompt {
    public AudioTranscriptionPrompt(Resource audioResource);
    public AudioTranscriptionPrompt(Resource audioResource, AudioTranscriptionOptions options);
    /** Returns the audio resource to transcribe (the prompt's "instructions"). */
    public Resource getInstructions();
    public AudioTranscriptionOptions getOptions();
}

package org.springframework.ai.model;
/**
 * Response wrapper: getResult() returns the primary transcription,
 * getResults() any alternatives, and getMetadata() response metadata.
 */
public class AudioTranscriptionResponse {
    public AudioTranscription getResult();
    public List<AudioTranscription> getResults();
    public AudioTranscriptionResponseMetadata getMetadata();
}

package org.springframework.ai.model;
/** A single transcription result; getOutput() returns the transcript text. */
public class AudioTranscription {
    public String getOutput();
    public AudioTranscriptionMetadata getMetadata();
}

tessl i tessl/maven-org-springframework-ai--spring-ai-starter-model-openai@1.1.1