OpenAI models support for Spring AI, providing comprehensive integration for chat completion, embeddings, image generation, audio transcription, text-to-speech, and content moderation capabilities within Spring Boot applications.
Convert text to speech and transcribe audio to text using OpenAI's TTS and Whisper models, with support for multiple voices, formats, and languages.
Text-to-speech synthesis using OpenAI's TTS models, converting written text into natural-sounding speech.
/**
* OpenAI text-to-speech model implementation
*/
public class OpenAiAudioSpeechModel implements TextToSpeechModel {
/**
* Simple text-to-speech conversion
* @param text Text to convert to speech
* @return Audio data as byte array
*/
public byte[] call(String text);
/**
* Full text-to-speech with options and metadata
* @param textToSpeechPrompt Prompt containing text and options
* @return TextToSpeechResponse with audio and metadata
*/
public TextToSpeechResponse call(TextToSpeechPrompt textToSpeechPrompt);
/**
* Streaming text-to-speech generation
* @param textToSpeechPrompt Prompt containing text and options
* @return Flux of TextToSpeechResponse chunks
*/
public Flux<TextToSpeechResponse> stream(TextToSpeechPrompt textToSpeechPrompt);
/**
* Get the default options for this model
* @return TextToSpeechOptions configuration
*/
public TextToSpeechOptions getDefaultOptions();
}

Constructors:
// Basic constructor
public OpenAiAudioSpeechModel(OpenAiAudioApi openAiAudioApi);
// With options
public OpenAiAudioSpeechModel(
OpenAiAudioApi openAiAudioApi,
OpenAiAudioSpeechOptions options
);
// Full constructor with retry support
public OpenAiAudioSpeechModel(
OpenAiAudioApi openAiAudioApi,
OpenAiAudioSpeechOptions options,
RetryTemplate retryTemplate
);

Note: OpenAiAudioSpeechModel uses constructor-based initialization only and does not provide a builder pattern. Use the appropriate constructor based on your configuration needs.
Usage Example:
import org.springframework.ai.openai.OpenAiAudioSpeechModel;
import org.springframework.ai.openai.OpenAiAudioSpeechOptions;
import org.springframework.ai.openai.api.OpenAiAudioApi;
import org.springframework.ai.audio.speech.TextToSpeechPrompt;
import java.nio.file.Files;
import java.nio.file.Paths;
// Create API client
var audioApi = OpenAiAudioApi.builder()
.apiKey(System.getenv("OPENAI_API_KEY"))
.build();
// Create speech model with default options
var speechModel = new OpenAiAudioSpeechModel(
audioApi,
OpenAiAudioSpeechOptions.builder()
.model(OpenAiAudioApi.TtsModel.TTS_1_HD.getValue())
.voice("nova")
.speed(1.0)
.build()
);
// Simple text-to-speech
byte[] audioBytes = speechModel.call("Hello, how can I help you today?");
Files.write(Paths.get("speech.mp3"), audioBytes);
// Full text-to-speech with options
var prompt = new TextToSpeechPrompt(
"Welcome to our service. We're glad you're here!",
OpenAiAudioSpeechOptions.builder()
.voice("alloy")
.responseFormat("opus")
.speed(1.1)
.build()
);
var response = speechModel.call(prompt);
Files.write(Paths.get("welcome.opus"), response.getResult().getOutput());

Streaming Audio Generation:
import reactor.core.publisher.Flux;
// Stream audio chunks as they're generated
var prompt = new TextToSpeechPrompt(
"This is a long text that will be streamed as audio chunks...",
OpenAiAudioSpeechOptions.builder()
.model(OpenAiAudioApi.TtsModel.TTS_1.getValue())
.voice("echo")
.responseFormat("pcm")
.build()
);
Flux<byte[]> audioStream = speechModel.stream(prompt)
.map(response -> response.getResult().getOutput());
// Process audio chunks as they arrive
audioStream.subscribe(chunk -> {
// Play or process audio chunk in real-time
processAudioChunk(chunk);
});

Different Voices:
// Try different voices for various use cases
var voices = List.of("alloy", "echo", "fable", "onyx", "nova", "shimmer");
for (String voice : voices) {
var options = OpenAiAudioSpeechOptions.builder()
.voice(voice)
.build();
byte[] audio = speechModel.call(
new TextToSpeechPrompt("Hello, my name is " + voice, options)
).getResult().getOutput();
Files.write(Paths.get("voice_" + voice + ".mp3"), audio);
}

Configuration options for text-to-speech requests.
/**
* Configuration options for OpenAI text-to-speech
*/
public class OpenAiAudioSpeechOptions implements TextToSpeechOptions {
/**
* Create a new builder for speech options
* @return Builder instance
*/
public static Builder builder();
/**
* Create a copy of these options
* @return New OpenAiAudioSpeechOptions with same values
*/
public OpenAiAudioSpeechOptions copy();
/**
* Get the TTS model identifier
* @return Model name
*/
public String getModel();
public void setModel(String model);
/**
* Get the text input to synthesize
* @return Text to convert to speech
*/
public String getInput();
public void setInput(String input);
/**
* Get the voice selection
* @return Voice name (alloy/echo/fable/onyx/nova/shimmer)
*/
public String getVoice();
public void setVoice(String voice);
/**
* Get the audio response format
* @return Format (mp3/opus/aac/flac/wav/pcm)
*/
public String getResponseFormat();
public void setResponseFormat(String responseFormat);
/**
* Get the playback speed
* @return Speed multiplier (0.25 to 4.0)
*/
public Double getSpeed();
public void setSpeed(Double speed);
}

Builder Pattern:
public static class Builder {
public Builder model(String model);
public Builder input(String input);
public Builder voice(String voice);
public Builder responseFormat(String responseFormat);
public Builder speed(Double speed);
public OpenAiAudioSpeechOptions build();
}

Usage Example:
// High-quality speech with custom speed
var hdOptions = OpenAiAudioSpeechOptions.builder()
.model(OpenAiAudioApi.TtsModel.TTS_1_HD.getValue())
.voice("nova")
.responseFormat("wav")
.speed(1.25) // Faster playback
.build();
// Standard quality with different format
var standardOptions = OpenAiAudioSpeechOptions.builder()
.model(OpenAiAudioApi.TtsModel.TTS_1.getValue())
.voice("alloy")
.responseFormat("opus") // Good for streaming
.speed(1.0)
.build();

Audio transcription using the Whisper model, converting spoken audio into written text.
/**
* OpenAI audio transcription model implementation (Whisper)
*/
public class OpenAiAudioTranscriptionModel implements TranscriptionModel {
/**
* Simple audio transcription returning text
* @param audioResource Audio file to transcribe
* @return Transcribed text
*/
public String call(Resource audioResource);
/**
* Full audio transcription with metadata
* @param audioTranscriptionPrompt Prompt containing audio and options
* @return AudioTranscriptionResponse with text and metadata
*/
public AudioTranscriptionResponse call(AudioTranscriptionPrompt audioTranscriptionPrompt);
}

Constructors:
// Basic constructor
public OpenAiAudioTranscriptionModel(OpenAiAudioApi openAiAudioApi);
// With options
public OpenAiAudioTranscriptionModel(
OpenAiAudioApi openAiAudioApi,
OpenAiAudioTranscriptionOptions options
);
// Full constructor with retry support
public OpenAiAudioTranscriptionModel(
OpenAiAudioApi openAiAudioApi,
OpenAiAudioTranscriptionOptions options,
RetryTemplate retryTemplate
);

Note: OpenAiAudioTranscriptionModel uses constructor-based initialization only and does not provide a builder pattern. Use the appropriate constructor based on your configuration needs.
Usage Example:
import org.springframework.ai.openai.OpenAiAudioTranscriptionModel;
import org.springframework.ai.openai.OpenAiAudioTranscriptionOptions;
import org.springframework.ai.openai.api.OpenAiAudioApi;
import org.springframework.ai.audio.transcription.AudioTranscriptionPrompt;
import org.springframework.core.io.FileSystemResource;
// Create API client
var audioApi = OpenAiAudioApi.builder()
.apiKey(System.getenv("OPENAI_API_KEY"))
.build();
// Create transcription model
var transcriptionModel = new OpenAiAudioTranscriptionModel(
audioApi,
OpenAiAudioTranscriptionOptions.builder()
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
.language("en")
.build()
);
// Simple transcription
var audioFile = new FileSystemResource("recording.mp3");
String transcript = transcriptionModel.call(audioFile);
System.out.println("Transcript: " + transcript);
// Full transcription with options
var prompt = new AudioTranscriptionPrompt(
audioFile,
OpenAiAudioTranscriptionOptions.builder()
.responseFormat("verbose_json")
.language("es")
.prompt("This is a business meeting discussion")
.temperature(0.0)
.build()
);
var response = transcriptionModel.call(prompt);
System.out.println("Text: " + response.getResult().getOutput());
System.out.println("Metadata: " + response.getMetadata());

Transcription with Timestamps:
import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionRequest.GranularityType;
// Get word-level timestamps
var options = OpenAiAudioTranscriptionOptions.builder()
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
.responseFormat("verbose_json")
.granularityType(GranularityType.WORD)
.build();
var response = transcriptionModel.call(
new AudioTranscriptionPrompt(audioFile, options)
);
// Access timestamp information from metadata
var metadata = response.getMetadata();
// Process word-level timing information

Different Output Formats:
// Plain text format
var textOptions = OpenAiAudioTranscriptionOptions.builder()
.responseFormat("text")
.build();
// SRT subtitle format
var srtOptions = OpenAiAudioTranscriptionOptions.builder()
.responseFormat("srt")
.build();
// VTT subtitle format
var vttOptions = OpenAiAudioTranscriptionOptions.builder()
.responseFormat("vtt")
.build();
// Verbose JSON with all metadata
var verboseOptions = OpenAiAudioTranscriptionOptions.builder()
.responseFormat("verbose_json")
.build();

Configuration options for audio transcription requests.
/**
* Configuration options for OpenAI audio transcription
*/
public class OpenAiAudioTranscriptionOptions implements AudioTranscriptionOptions {
/**
* Create a new builder for transcription options
* @return Builder instance
*/
public static Builder builder();
/**
* Get the transcription model identifier
* @return Model name
*/
public String getModel();
public void setModel(String model);
/**
* Get the response format
* @return Format (json/text/srt/verbose_json/vtt)
*/
public String getResponseFormat();
public void setResponseFormat(String responseFormat);
/**
* Get the optional transcription prompt/guide
* @return Prompt text to guide transcription style
*/
public String getPrompt();
public void setPrompt(String prompt);
/**
* Get the audio language code
* @return ISO-639-1 language code (e.g., "en", "es", "fr")
*/
public String getLanguage();
public void setLanguage(String language);
/**
* Get the sampling temperature
* @return Temperature value (0.0 to 1.0)
*/
public Double getTemperature();
public void setTemperature(Double temperature);
/**
* Get the timestamp granularity type
* @return Granularity (WORD or SEGMENT)
*/
public GranularityType getGranularityType();
public void setGranularityType(GranularityType granularityType);
}

Builder Pattern:
public static class Builder {
public Builder model(String model);
public Builder responseFormat(String responseFormat);
public Builder prompt(String prompt);
public Builder language(String language);
public Builder temperature(Double temperature);
public Builder granularityType(GranularityType granularityType);
public OpenAiAudioTranscriptionOptions build();
}

Usage Example:
// English transcription with context
var englishOptions = OpenAiAudioTranscriptionOptions.builder()
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
.language("en")
.prompt("This is a technical discussion about machine learning")
.temperature(0.0) // More deterministic
.build();
// Spanish transcription with subtitles
var spanishSubtitles = OpenAiAudioTranscriptionOptions.builder()
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
.language("es")
.responseFormat("srt")
.build();
// Detailed transcription with word timestamps
var detailedOptions = OpenAiAudioTranscriptionOptions.builder()
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
.responseFormat("verbose_json")
.granularityType(GranularityType.WORD)
.build();

Text-to-speech model identifier:
Available voice options for TTS:
Audio format for speech output:
Playback speed multiplier for TTS. Range: 0.25 to 4.0
Audio transcription model identifier:
Format for transcription output:
Optional text to guide transcription:
ISO-639-1 language code for the audio:
Sampling temperature for transcription. Range: 0.0 to 1.0
Timestamp granularity for verbose_json format:
// High-level text-to-speech prompt (from spring-ai-core)
public class TextToSpeechPrompt {
public TextToSpeechPrompt(String instructions);
public TextToSpeechPrompt(String instructions, TextToSpeechOptions options);
public String getInstructions();
public TextToSpeechOptions getOptions();
}
// Low-level speech request
public record SpeechRequest(
String model, // TTS model
String input, // Text to synthesize
Voice voice, // Voice selection
AudioResponseFormat responseFormat, // Audio format
Double speed // Playback speed
) {}
public enum Voice {
ALLOY, ECHO, FABLE, ONYX, NOVA, SHIMMER,
ASH, BALLAD, CORAL, SAGE, VERSE
}
public enum AudioResponseFormat {
MP3, OPUS, AAC, FLAC, WAV, PCM
}

// High-level text-to-speech response (from spring-ai-core)
public interface TextToSpeechResponse {
TextToSpeechResult getResult();
TextToSpeechResponseMetadata getMetadata();
}
public class TextToSpeechResult {
public byte[] getOutput();
}
public class OpenAiAudioSpeechResponseMetadata extends TextToSpeechResponseMetadata {
public static OpenAiAudioSpeechResponseMetadata from(StructuredResponse response);
public static OpenAiAudioSpeechResponseMetadata from(String text);
public RateLimit getRateLimit();
public OpenAiAudioSpeechResponseMetadata withRateLimit(RateLimit rateLimit);
}

// High-level transcription prompt (from spring-ai-core)
public class AudioTranscriptionPrompt {
public AudioTranscriptionPrompt(Resource audioResource);
public AudioTranscriptionPrompt(Resource audioResource, AudioTranscriptionOptions options);
public Resource getInstructions();
public AudioTranscriptionOptions getOptions();
}
// Low-level transcription request
public record TranscriptionRequest(
Resource file, // Audio file
String model, // Transcription model
String language, // Audio language
String prompt, // Transcription guide
TranscriptResponseFormat responseFormat, // Response format
Float temperature, // Sampling temperature
List<GranularityType> timestampGranularities // Timestamp types
) {}
public enum TranscriptResponseFormat {
JSON, TEXT, SRT, VERBOSE_JSON, VTT
}
public enum GranularityType {
WORD, SEGMENT
}

// High-level transcription response (from spring-ai-core)
public interface AudioTranscriptionResponse {
AudioTranscriptionResult getResult();
AudioTranscriptionResponseMetadata getMetadata();
}
public class AudioTranscriptionResult {
public String getOutput();
}
public class OpenAiAudioTranscriptionResponseMetadata extends AudioTranscriptionResponseMetadata {
public static OpenAiAudioTranscriptionResponseMetadata from(StructuredResponse response);
public static OpenAiAudioTranscriptionResponseMetadata from(String text);
public RateLimit getRateLimit();
public OpenAiAudioTranscriptionResponseMetadata withRateLimit(RateLimit rateLimit);
}
// Low-level structured response (for verbose_json format)
public record StructuredResponse(
String task, // "transcribe" or "translate"
String language, // Detected language
Float duration, // Audio duration in seconds
String text, // Transcribed text
List<Word> words, // Word-level timestamps
List<Segment> segments // Segment-level timestamps
) {
public record Word(
String word, // Word text
Float start, // Start time in seconds
Float end // End time in seconds
) {}
public record Segment(
Integer id, // Segment ID
Integer seek, // Seek position
Float start, // Start time in seconds
Float end, // End time in seconds
String text, // Segment text
List<Integer> tokens, // Token IDs
Float temperature, // Temperature used
Float avgLogprob, // Average log probability
Float compressionRatio, // Compression ratio
Float noSpeechProb // No-speech probability
) {}
}

// Translation request (translates audio to English)
public record TranslationRequest(
Resource file, // Audio file
String model, // Translation model
String prompt, // Translation guide
String responseFormat, // Response format
Float temperature // Sampling temperature
) {}

// Generate greeting messages for phone systems
var options = OpenAiAudioSpeechOptions.builder()
.model(OpenAiAudioApi.TtsModel.TTS_1_HD.getValue())
.voice("nova")
.responseFormat("wav")
.build();
byte[] greeting = speechModel.call(
new TextToSpeechPrompt(
"Thank you for calling. Please press 1 for sales, 2 for support.",
options
)
).getResult().getOutput();

// High-quality narration with natural pacing
var options = OpenAiAudioSpeechOptions.builder()
.model(OpenAiAudioApi.TtsModel.TTS_1_HD.getValue())
.voice("fable") // Good for storytelling
.responseFormat("flac") // High quality
.speed(0.95) // Slightly slower for clarity
.build();
byte[] narration = speechModel.call(
new TextToSpeechPrompt(scriptText, options)
).getResult().getOutput();

// Convert article text to audio for accessibility
var options = OpenAiAudioSpeechOptions.builder()
.model(OpenAiAudioApi.TtsModel.TTS_1.getValue())
.voice("alloy")
.responseFormat("mp3")
.speed(1.1) // Slightly faster for efficiency
.build();
byte[] audioArticle = speechModel.call(
new TextToSpeechPrompt(articleContent, options)
).getResult().getOutput();

// Transcribe recorded meeting with context
var options = OpenAiAudioTranscriptionOptions.builder()
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
.language("en")
.prompt("Business meeting discussing Q4 strategy and budget planning")
.responseFormat("verbose_json")
.granularityType(GranularityType.SEGMENT)
.build();
var response = transcriptionModel.call(
new AudioTranscriptionPrompt(meetingRecording, options)
);
String transcript = response.getResult().getOutput();

// Generate SRT subtitles for video
var options = OpenAiAudioTranscriptionOptions.builder()
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
.language("en")
.responseFormat("srt")
.build();
var audioFile = new FileSystemResource("video_audio.mp3");
// Pass the SRT options via the prompt overload; the simple call(Resource)
// overload ignores the options built above.
String subtitles = transcriptionModel.call(
new AudioTranscriptionPrompt(audioFile, options)
).getResult().getOutput();
Files.writeString(Paths.get("subtitles.srt"), subtitles);

// Transcribe voice commands for processing
var options = OpenAiAudioTranscriptionOptions.builder()
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
.language("en")
.responseFormat("text")
.temperature(0.0) // Most deterministic
.prompt("Voice commands: play, pause, stop, next, previous")
.build();
var voiceCommand = new FileSystemResource("voice_input.wav");
// Use the prompt overload so the deterministic text-format options above apply.
String command = transcriptionModel.call(
new AudioTranscriptionPrompt(voiceCommand, options)
).getResult().getOutput();
processCommand(command);

// Transcribe non-English audio
var spanishOptions = OpenAiAudioTranscriptionOptions.builder()
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
.language("es")
.responseFormat("text")
.build();
var japaneseOptions = OpenAiAudioTranscriptionOptions.builder()
.model(OpenAiAudioApi.WhisperModel.WHISPER_1.getValue())
.language("ja")
.responseFormat("text")
.build();

The following audio formats are supported for transcription:
Maximum file size: 25 MB
Text-to-speech supports these output formats:
Install with Tessl CLI
npx tessl i tessl/maven-org-springframework-ai--spring-ai-openai