Core model interfaces and abstractions for the Spring AI framework, providing a portable API for chat, embeddings, images, audio, and tool calling across multiple AI providers.
Speech-to-text capabilities for converting audio files to text, supporting multiple audio formats (WAV, MP3, FLAC, OPUS), language detection, and transcription options like temperature and prompting.
Main interface for speech-to-text transcription.
public interface TranscriptionModel extends Model<AudioTranscriptionPrompt, AudioTranscriptionResponse> {
/**
* Transcribe audio to text based on the given prompt.
*
* @param prompt the audio transcription prompt
* @return the transcription response with text
*/
AudioTranscriptionResponse call(AudioTranscriptionPrompt prompt);
/**
* Transcribe an audio resource with default options.
*
* @param audioResource the audio file resource
* @return the transcribed text
*/
String transcribe(Resource audioResource);
/**
* Transcribe an audio resource with custom options.
*
* @param audioResource the audio file resource
* @param options the transcription options
* @return the transcribed text
*/
String transcribe(Resource audioResource, AudioTranscriptionOptions options);
}Request for transcribing audio to text.
public class AudioTranscriptionPrompt implements ModelRequest<Resource> {
/**
* Construct an AudioTranscriptionPrompt with an audio resource.
*
* @param audioResource the audio file to transcribe
*/
public AudioTranscriptionPrompt(Resource audioResource);
/**
* Construct an AudioTranscriptionPrompt with audio and options.
*
* @param audioResource the audio file to transcribe
* @param options the transcription options
*/
public AudioTranscriptionPrompt(Resource audioResource, AudioTranscriptionOptions options);
/**
* Get the audio resource to transcribe.
*
* @return the audio resource
*/
Resource getInstructions();
/**
* Get the transcription options.
*
* @return the audio transcription options
*/
AudioTranscriptionOptions getOptions();
}Response containing transcribed text and metadata.
public class AudioTranscriptionResponse implements ModelResponse<AudioTranscription> {
/**
* Construct an AudioTranscriptionResponse with transcriptions.
*
* @param transcriptions the list of transcriptions
*/
public AudioTranscriptionResponse(List<AudioTranscription> transcriptions);
/**
* Construct an AudioTranscriptionResponse with transcriptions and metadata.
*
* @param transcriptions the list of transcriptions
* @param metadata the response metadata
*/
public AudioTranscriptionResponse(
List<AudioTranscription> transcriptions,
AudioTranscriptionResponseMetadata metadata
);
/**
* Get the first transcription.
*
* @return the first audio transcription
*/
AudioTranscription getResult();
/**
* Get all transcriptions.
*
* @return list of audio transcriptions
*/
List<AudioTranscription> getResults();
/**
* Get response metadata.
*
* @return the audio transcription response metadata
*/
AudioTranscriptionResponseMetadata getMetadata();
}Single transcription result containing text.
public class AudioTranscription implements ModelResult<String> {
/**
* Construct an AudioTranscription with text.
*
* @param text the transcribed text
*/
public AudioTranscription(String text);
/**
* Get the transcribed text.
*
* @return the text output
*/
String getOutput();
/**
* Get transcription metadata.
*
* @return the audio transcription metadata
*/
AudioTranscriptionMetadata getMetadata();
/**
* Set transcription metadata.
*
* @param transcriptionMetadata the transcription metadata
* @return this AudioTranscription instance
*/
AudioTranscription withTranscriptionMetadata(AudioTranscriptionMetadata transcriptionMetadata);
}Options for configuring audio transcription.
public interface AudioTranscriptionOptions extends ModelOptions {
/**
* Get the model name to use.
*
* @return the model name
*/
String getModel();
}Default implementation of AudioTranscriptionOptions.
public class DefaultAudioTranscriptionOptions implements AudioTranscriptionOptions {
// Default implementation with standard transcription options
}Metadata for individual transcription results.
public interface AudioTranscriptionMetadata extends ResultMetadata {
// Transcription-specific metadata
}Metadata for transcription responses.
public class AudioTranscriptionResponseMetadata extends MutableResponseMetadata {

    // Response-level metadata for transcription

}

import org.springframework.ai.audio.transcription.TranscriptionModel;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.springframework.beans.factory.annotation.Autowired;
@Service
public class TranscriptionService {

    // Constructor injection is preferred over field @Autowired: the dependency
    // stays final and the class is testable without a Spring context.
    private final TranscriptionModel transcriptionModel;

    public TranscriptionService(TranscriptionModel transcriptionModel) {
        this.transcriptionModel = transcriptionModel;
    }

    /**
     * Transcribe a classpath audio file using default options.
     *
     * @param audioPath classpath-relative location of the audio file
     * @return the transcribed text
     */
    public String transcribeAudio(String audioPath) {
        // Load audio resource from the classpath
        Resource audioResource = new ClassPathResource(audioPath);
        // Transcribe with defaults
        return transcriptionModel.transcribe(audioResource);
    }

}

import org.springframework.ai.audio.transcription.*;
// Configure transcription options: English, deterministic output, plain text
AudioTranscriptionOptions options = AudioTranscriptionOptions.builder()
        .language("en")
        .temperature(0.0f)
        .responseFormat("text")
        .build();

// Load the audio file from the classpath
Resource audioResource = new ClassPathResource("audio/speech.mp3");

// Transcribe and print the result
String transcription = transcriptionModel.transcribe(audioResource, options);
System.out.println("Transcription: " + transcription);

import org.springframework.ai.audio.transcription.AudioTranscriptionPrompt;
import org.springframework.ai.audio.transcription.AudioTranscriptionResponse;
// Create options: Spanish audio transcribed by the "whisper-1" model
AudioTranscriptionOptions options = AudioTranscriptionOptions.builder()
        .language("es")
        .model("whisper-1")
        .build();

// Wrap audio and options in a prompt to obtain the full response object
Resource audio = new ClassPathResource("audio/spanish.wav");
AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audio, options);

// call(...) returns the complete response, including metadata
AudioTranscriptionResponse response = transcriptionModel.call(prompt);

// Access the transcribed text of the first result
String text = response.getResult().getOutput();
System.out.println("Spanish transcription: " + text);

// Omit language option for auto-detection
// No language set: the provider auto-detects the spoken language
AudioTranscriptionOptions options = AudioTranscriptionOptions.builder()
        .model("whisper-1")
        .build();

Resource audioResource = new ClassPathResource("audio/unknown-language.mp3");
String transcription = transcriptionModel.transcribe(audioResource, options);

// Use prompt to guide transcription style and vocabulary
// A textual prompt biases the model toward the expected domain vocabulary
AudioTranscriptionOptions options = AudioTranscriptionOptions.builder()
        .prompt("This is a technical discussion about machine learning, "
                + "neural networks, and artificial intelligence.")
        .language("en")
        .build();

Resource audio = new ClassPathResource("audio/tech-talk.wav");
String transcription = transcriptionModel.transcribe(audio, options);

// Get plain text
AudioTranscriptionOptions textOptions = AudioTranscriptionOptions.builder()
        .responseFormat("text")
        .build();

// Get JSON with timestamps
AudioTranscriptionOptions jsonOptions = AudioTranscriptionOptions.builder()
        .responseFormat("json")
        .build();

// Get SRT subtitle format
AudioTranscriptionOptions srtOptions = AudioTranscriptionOptions.builder()
        .responseFormat("srt")
        .build();

// Get VTT subtitle format
AudioTranscriptionOptions vttOptions = AudioTranscriptionOptions.builder()
        .responseFormat("vtt")
        .build();

Resource audio = new ClassPathResource("audio/speech.mp3");
String textTranscription = transcriptionModel.transcribe(audio, textOptions);
String jsonTranscription = transcriptionModel.transcribe(audio, jsonOptions);

// WAV format
Resource wavAudio = new ClassPathResource("audio/speech.wav");
String wavTranscription = transcriptionModel.transcribe(wavAudio);

// MP3 format
Resource mp3Audio = new ClassPathResource("audio/speech.mp3");
String mp3Transcription = transcriptionModel.transcribe(mp3Audio);

// FLAC format
Resource flacAudio = new ClassPathResource("audio/speech.flac");
String flacTranscription = transcriptionModel.transcribe(flacAudio);

// OPUS format
Resource opusAudio = new ClassPathResource("audio/speech.opus");
String opusTranscription = transcriptionModel.transcribe(opusAudio);

// Lower temperature for more consistent output
AudioTranscriptionOptions lowTemp = AudioTranscriptionOptions.builder()
        .temperature(0.0f) // More deterministic
        .build();

// Higher temperature for more varied output
AudioTranscriptionOptions highTemp = AudioTranscriptionOptions.builder()
        .temperature(0.8f) // More creative/varied
        .build();

Resource audio = new ClassPathResource("audio/speech.mp3");
String consistent = transcriptionModel.transcribe(audio, lowTemp);
String varied = transcriptionModel.transcribe(audio, highTemp);

import org.springframework.core.io.FileSystemResource;
import java.io.File;
// Load audio from the local file system rather than the classpath
File audioFile = new File("/path/to/audio/recording.mp3");
Resource audioResource = new FileSystemResource(audioFile);
String transcription = transcriptionModel.transcribe(audioResource);

import org.springframework.core.io.UrlResource;
import java.net.URL;
// Load audio from a remote URL
URL audioUrl = new URL("https://example.com/audio/speech.mp3");
Resource audioResource = new UrlResource(audioUrl);
String transcription = transcriptionModel.transcribe(audioResource);

@Service
public class BatchTranscriptionService {

    private final TranscriptionModel transcriptionModel;

    // Bug fix: the final field was never initialized; without this constructor
    // the class does not compile and Spring has nothing to inject into.
    public BatchTranscriptionService(TranscriptionModel transcriptionModel) {
        this.transcriptionModel = transcriptionModel;
    }

    /**
     * Transcribe each classpath audio file with default options, preserving input order.
     *
     * @param audioPaths classpath-relative audio file locations
     * @return the transcriptions, one per input path
     */
    public List<String> transcribeBatch(List<String> audioPaths) {
        return audioPaths.stream()
                .map(ClassPathResource::new)
                .map(transcriptionModel::transcribe)
                .toList();
    }

}

public String safeTranscribe(String audioPath) {
try {
Resource audioResource = new ClassPathResource(audioPath);
AudioTranscriptionOptions options = AudioTranscriptionOptions.builder()
.language("en")
.temperature(0.0f)
.build();
return transcriptionModel.transcribe(audioResource, options);
} catch (Exception e) {
System.err.println("Transcription failed: " + e.getMessage());
return null;
}
}AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioResource);
AudioTranscriptionResponse response = transcriptionModel.call(prompt);

// Access the first transcription result and its text
AudioTranscription transcription = response.getResult();
String text = transcription.getOutput();

// Access per-result and per-response metadata
AudioTranscriptionMetadata metadata = transcription.getMetadata();
AudioTranscriptionResponseMetadata responseMetadata = response.getMetadata();

@RestController
@RequestMapping("/api/transcription")
public class TranscriptionController {

    private final TranscriptionModel transcriptionModel;

    public TranscriptionController(TranscriptionModel transcriptionModel) {
        this.transcriptionModel = transcriptionModel;
    }

    /**
     * Transcribe an uploaded audio file.
     *
     * @param file the uploaded audio file
     * @param language optional language code; when absent the provider side decides
     * @return the transcription result
     * @throws Exception if the upload cannot be stored or transcription fails
     */
    @PostMapping("/transcribe")
    public TranscriptionResult transcribe(
            @RequestParam("file") MultipartFile file,
            @RequestParam(required = false) String language) throws Exception {
        // Persist the upload to a temp file so it can be wrapped as a Resource
        File tempFile = File.createTempFile("audio", ".tmp");
        file.transferTo(tempFile);
        try {
            Resource audioResource = new FileSystemResource(tempFile);

            // Only set the language when the client supplied one
            AudioTranscriptionOptions.Builder optionsBuilder =
                    AudioTranscriptionOptions.builder();
            if (language != null) {
                optionsBuilder.language(language);
            }

            String transcription = transcriptionModel.transcribe(
                    audioResource, optionsBuilder.build());
            return new TranscriptionResult(transcription);
        } finally {
            // Best-effort cleanup; a failed delete is not fatal here
            tempFile.delete();
        }
    }

    record TranscriptionResult(String text) {}

}

import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
@Configuration
public class TranscriptionConfig {
@Bean
public AudioTranscriptionOptions defaultTranscriptionOptions() {
return AudioTranscriptionOptions.builder()
.model("whisper-1")
.temperature(0.0f)
.responseFormat("text")
.build();
}
}