tessl/maven-org-springframework-ai--spring-ai-starter-model-openai

Spring Boot Starter for OpenAI integration providing auto-configuration for chat completion, embeddings, image generation, audio speech synthesis, audio transcription, and content moderation models. Includes high-level ChatClient API and conversation memory support.

docs/reference/audio-transcription-model.md

Audio Transcription Model (Speech-to-Text)

The OpenAiAudioTranscriptionModel transcribes audio files to text using OpenAI's Whisper model with support for multiple languages and output formats.

Import

import org.springframework.ai.openai.OpenAiAudioTranscriptionModel;
import org.springframework.ai.openai.OpenAiAudioTranscriptionOptions;
import org.springframework.ai.openai.api.OpenAiAudioApi;
import org.springframework.ai.model.AudioTranscriptionPrompt;
import org.springframework.ai.model.AudioTranscriptionResponse;
import org.springframework.core.io.Resource;
import org.springframework.core.io.FileSystemResource;

API

OpenAiAudioTranscriptionModel

package org.springframework.ai.openai;

public class OpenAiAudioTranscriptionModel implements AudioTranscriptionModel {
    // Constructors
    public OpenAiAudioTranscriptionModel(OpenAiAudioApi audioApi);
    public OpenAiAudioTranscriptionModel(OpenAiAudioApi audioApi, OpenAiAudioTranscriptionOptions options);
    public OpenAiAudioTranscriptionModel(OpenAiAudioApi audioApi, OpenAiAudioTranscriptionOptions options, RetryTemplate retryTemplate);

    // Transcribe audio with options
    public AudioTranscriptionResponse call(AudioTranscriptionPrompt transcriptionPrompt);

    // Convenience method
    public AudioTranscriptionResponse call(Resource audioResource);
}
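
If you are not relying on Spring Boot auto-configuration, the model can be wired manually through the constructors above. A minimal sketch, assuming OpenAiAudioApi exposes a builder that accepts an API key (verify the exact factory method for your Spring AI version):

import org.springframework.ai.openai.OpenAiAudioTranscriptionModel;
import org.springframework.ai.openai.OpenAiAudioTranscriptionOptions;
import org.springframework.ai.openai.api.OpenAiAudioApi;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class TranscriptionConfig {

    @Bean
    public OpenAiAudioTranscriptionModel transcriptionModel() {
        // Assumption: OpenAiAudioApi offers a builder taking the API key;
        // adjust to the constructor/factory of your Spring AI version.
        OpenAiAudioApi audioApi = OpenAiAudioApi.builder()
            .apiKey(System.getenv("OPENAI_API_KEY"))
            .build();

        OpenAiAudioTranscriptionOptions defaults = OpenAiAudioTranscriptionOptions.builder()
            .model("whisper-1")
            .responseFormat(OpenAiAudioApi.TranscriptResponseFormat.TEXT)
            .build();

        return new OpenAiAudioTranscriptionModel(audioApi, defaults);
    }
}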

OpenAiAudioTranscriptionOptions

package org.springframework.ai.openai;

public class OpenAiAudioTranscriptionOptions implements AudioTranscriptionOptions {
    // Builder methods
    public static OpenAiAudioTranscriptionOptions.Builder builder();

    // Getters
    public String getModel();
    public String getLanguage();
    public String getPrompt();
    public OpenAiAudioApi.TranscriptResponseFormat getResponseFormat();
    public Float getTemperature();
    public OpenAiAudioApi.TranscriptionRequest.GranularityType getGranularityType();

    // Setters
    public void setModel(String model);
    public void setLanguage(String language);
    public void setPrompt(String prompt);
    public void setResponseFormat(OpenAiAudioApi.TranscriptResponseFormat responseFormat);
    public void setTemperature(Float temperature);
    public void setGranularityType(OpenAiAudioApi.TranscriptionRequest.GranularityType granularityType);
}

Usage Examples

Basic Transcription

import org.springframework.ai.openai.OpenAiAudioTranscriptionModel;
import org.springframework.ai.model.AudioTranscriptionPrompt;
import org.springframework.ai.model.AudioTranscriptionResponse;
import org.springframework.core.io.FileSystemResource;
import org.springframework.core.io.Resource;
import org.springframework.stereotype.Service;

@Service
public class TranscriptionService {

    private final OpenAiAudioTranscriptionModel transcriptionModel;

    public TranscriptionService(OpenAiAudioTranscriptionModel transcriptionModel) {
        this.transcriptionModel = transcriptionModel;
    }

    public String transcribeAudio(String audioFilePath) {
        Resource audioFile = new FileSystemResource(audioFilePath);
        AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile);
        AudioTranscriptionResponse response = transcriptionModel.call(prompt);
        return response.getResult().getOutput();
    }

    // Convenience method - direct Resource usage
    public String transcribeAudioDirect(String audioFilePath) {
        Resource audioFile = new FileSystemResource(audioFilePath);
        AudioTranscriptionResponse response = transcriptionModel.call(audioFile);
        return response.getResult().getOutput();
    }
}

Transcription with Options

import org.springframework.ai.openai.OpenAiAudioTranscriptionOptions;
import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptResponseFormat;

public String transcribeWithOptions(String audioFilePath) {
    Resource audioFile = new FileSystemResource(audioFilePath);

    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
        .language("en")
        .responseFormat(TranscriptResponseFormat.TEXT)
        .temperature(0.0f)
        .build();

    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    return response.getResult().getOutput();
}

Transcription with Language Hint

public String transcribeSpanish(String audioFilePath) {
    Resource audioFile = new FileSystemResource(audioFilePath);

    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
        .language("es") // Spanish
        .build();

    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    return response.getResult().getOutput();
}

Transcription with Prompt Guidance

public String transcribeWithGuidance(String audioFilePath, String contextPrompt) {
    Resource audioFile = new FileSystemResource(audioFilePath);

    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
        .prompt(contextPrompt) // Guide transcription with context
        .build();

    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    return response.getResult().getOutput();
}

// Example usage
public void demonstrateGuidance() {
    String audioPath = "meeting.mp3";
    String guidance = "This is a technical discussion about Spring Boot and microservices.";
    String transcript = transcribeWithGuidance(audioPath, guidance);
}

JSON Output Format

import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptResponseFormat;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;

public JsonNode transcribeAsJson(String audioFilePath) throws Exception {
    Resource audioFile = new FileSystemResource(audioFilePath);

    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
        .responseFormat(TranscriptResponseFormat.JSON)
        .build();

    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);

    String jsonOutput = response.getResult().getOutput();
    ObjectMapper mapper = new ObjectMapper();
    return mapper.readTree(jsonOutput);
}

Verbose JSON with Timestamps

public JsonNode transcribeVerbose(String audioFilePath) throws Exception {
    Resource audioFile = new FileSystemResource(audioFilePath);

    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
        .responseFormat(TranscriptResponseFormat.VERBOSE_JSON)
        .build();

    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);

    String jsonOutput = response.getResult().getOutput();
    ObjectMapper mapper = new ObjectMapper();
    JsonNode json = mapper.readTree(jsonOutput);

    // Verbose JSON includes: text, language, duration, segments with timestamps
    return json;
}

SRT Subtitle Format

import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptResponseFormat;
import java.nio.file.Files;
import java.nio.file.Path;

public String transcribeToSRT(String audioFilePath, String outputSrtPath) throws Exception {
    Resource audioFile = new FileSystemResource(audioFilePath);

    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
        .responseFormat(TranscriptResponseFormat.SRT)
        .build();

    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);

    String srtContent = response.getResult().getOutput();
    Files.writeString(Path.of(outputSrtPath), srtContent);
    return srtContent;
}

VTT Subtitle Format

public String transcribeToVTT(String audioFilePath, String outputVttPath) throws Exception {
    Resource audioFile = new FileSystemResource(audioFilePath);

    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
        .responseFormat(TranscriptResponseFormat.VTT)
        .build();

    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);

    String vttContent = response.getResult().getOutput();
    Files.writeString(Path.of(outputVttPath), vttContent);
    return vttContent;
}

Temperature Control

public String transcribeDeterministic(String audioFilePath) {
    Resource audioFile = new FileSystemResource(audioFilePath);

    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
        .temperature(0.0f) // More deterministic
        .build();

    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    return response.getResult().getOutput();
}

public String transcribeCreative(String audioFilePath) {
    Resource audioFile = new FileSystemResource(audioFilePath);

    OpenAiAudioTranscriptionOptions options = OpenAiAudioTranscriptionOptions.builder()
        .temperature(1.0f) // More varied output
        .build();

    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile, options);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    return response.getResult().getOutput();
}

From Byte Array

import org.springframework.core.io.ByteArrayResource;

public String transcribeFromBytes(byte[] audioData, String filename) {
    Resource audioFile = new ByteArrayResource(audioData) {
        @Override
        public String getFilename() {
            return filename; // Must provide filename with extension
        }
    };

    AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile);
    AudioTranscriptionResponse response = transcriptionModel.call(prompt);
    return response.getResult().getOutput();
}

Configuration

Configure via application.properties:

# API Connection
spring.ai.openai.audio.transcription.api-key=sk-...
spring.ai.openai.audio.transcription.base-url=https://api.openai.com
spring.ai.openai.audio.transcription.project-id=proj_...
spring.ai.openai.audio.transcription.organization-id=org-...

# Model Configuration
spring.ai.openai.audio.transcription.options.model=whisper-1
spring.ai.openai.audio.transcription.options.language=en
spring.ai.openai.audio.transcription.options.response-format=text
spring.ai.openai.audio.transcription.options.temperature=0.7
spring.ai.openai.audio.transcription.options.timestamp-granularities=segment
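
The options.* properties above set application-wide defaults. The same values can be expressed programmatically with the documented builder and overridden per request (the values here are illustrative):

OpenAiAudioTranscriptionOptions defaults = OpenAiAudioTranscriptionOptions.builder()
    .model("whisper-1")
    .language("en")
    .responseFormat(OpenAiAudioApi.TranscriptResponseFormat.TEXT)
    .temperature(0.7f)
    .build();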

Model Options

Available Models

  • whisper-1 - OpenAI's Whisper model (default, only option)

Language Options

Specify input language using ISO-639-1 codes:

  • en - English
  • es - Spanish
  • fr - French
  • de - German
  • it - Italian
  • pt - Portuguese
  • nl - Dutch
  • ja - Japanese
  • ko - Korean
  • zh - Chinese
  • And many more...

Providing a language hint improves accuracy and reduces processing time.

Response Formats

  • TEXT - Plain text transcript (default)
  • JSON - JSON with transcript text
  • VERBOSE_JSON - Detailed JSON with segments, timestamps, language, duration
  • SRT - SubRip subtitle format (with timestamps)
  • VTT - WebVTT subtitle format (with timestamps)

Temperature Range

  • Minimum: 0.0 (more deterministic)
  • Default: 0.7 (balanced)
  • Maximum: 1.0 (more varied)

Lower temperature produces a more conservative transcription; higher temperature allows a more varied interpretation.

Prompt Guidance

Optional text to guide transcription style and vocabulary:

  • Provide context about topic
  • Specify terminology or proper nouns
  • Maintain consistent style across segments
  • Maximum 224 tokens

Supported Audio Formats

The following audio formats are supported:

  • MP3
  • MP4
  • MPEG
  • MPGA
  • M4A
  • WAV
  • WEBM

Maximum file size: 25 MB
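
It can be worth checking these documented constraints before sending a file to the API. A minimal sketch (the validator class name is hypothetical):

import java.io.IOException;
import java.util.Set;
import org.springframework.core.io.Resource;

public final class AudioInputValidator {

    private static final Set<String> SUPPORTED_EXTENSIONS =
        Set.of("mp3", "mp4", "mpeg", "mpga", "m4a", "wav", "webm");
    private static final long MAX_BYTES = 25L * 1024 * 1024; // documented 25 MB limit

    public static void validate(Resource audio) throws IOException {
        String name = audio.getFilename();
        String extension = (name == null) ? ""
            : name.substring(name.lastIndexOf('.') + 1).toLowerCase();
        if (!SUPPORTED_EXTENSIONS.contains(extension)) {
            throw new IllegalArgumentException("Unsupported audio format: " + name);
        }
        if (audio.contentLength() > MAX_BYTES) {
            throw new IllegalArgumentException("Audio file exceeds the 25 MB limit: " + name);
        }
    }
}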

Error Handling

import org.springframework.ai.openai.api.OpenAiApiException;

public String safeTranscribe(String audioFilePath) {
    try {
        Resource audioFile = new FileSystemResource(audioFilePath);
        AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(audioFile);
        AudioTranscriptionResponse response = transcriptionModel.call(prompt);
        return response.getResult().getOutput();
    } catch (OpenAiApiException e) {
        // Handle API errors (rate limits, invalid audio format, etc.)
        throw new RuntimeException("Transcription failed: " + e.getMessage(), e);
    }
}

Types

Builder

// Nested within org.springframework.ai.openai.OpenAiAudioTranscriptionOptions

public static class Builder {
    public Builder model(String model);
    public Builder language(String language);
    public Builder prompt(String prompt);
    public Builder responseFormat(OpenAiAudioApi.TranscriptResponseFormat responseFormat);
    public Builder temperature(Float temperature);
    public Builder granularityType(OpenAiAudioApi.TranscriptionRequest.GranularityType granularityType);
    public OpenAiAudioTranscriptionOptions build();
}

TranscriptResponseFormat

// Nested within org.springframework.ai.openai.api.OpenAiAudioApi

public enum TranscriptResponseFormat {
    TEXT("text"),
    JSON("json"),
    VERBOSE_JSON("verbose_json"),
    SRT("srt"),
    VTT("vtt");
}

AudioTranscriptionPrompt

package org.springframework.ai.model;

public class AudioTranscriptionPrompt {
    public AudioTranscriptionPrompt(Resource audioResource);
    public AudioTranscriptionPrompt(Resource audioResource, AudioTranscriptionOptions options);

    public Resource getInstructions();
    public AudioTranscriptionOptions getOptions();
}

AudioTranscriptionResponse

package org.springframework.ai.model;

public class AudioTranscriptionResponse {
    public AudioTranscription getResult();
    public List<AudioTranscription> getResults();
    public AudioTranscriptionResponseMetadata getMetadata();
}

AudioTranscription

package org.springframework.ai.model;

public class AudioTranscription {
    public String getOutput();
    public AudioTranscriptionMetadata getMetadata();
}
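
A response can be inspected with only the getters listed above. A brief sketch reading every result:

AudioTranscriptionResponse response = transcriptionModel.call(new AudioTranscriptionPrompt(audioFile));

// Each result carries the transcribed text plus per-result metadata
for (AudioTranscription transcription : response.getResults()) {
    System.out.println(transcription.getOutput());
}

// Response-level metadata is also available
AudioTranscriptionResponseMetadata metadata = response.getMetadata();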

Best Practices

  1. Provide language hint: Improves accuracy and reduces processing time
  2. Use prompt for context: Helps with technical terms, proper nouns, acronyms
  3. Choose appropriate format: TEXT for simple needs, VERBOSE_JSON for detailed analysis
  4. Handle large files: Split files longer than 10 minutes for better results (see the splitting sketch after this list)
  5. Use SRT/VTT for subtitles: Includes precise timestamps for video synchronization
  6. Set temperature low: Use 0.0-0.3 for technical/medical content requiring accuracy
  7. Preprocess audio: Clean audio with noise reduction for better results
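
For best practice 4, long recordings can be segmented before transcription. Spring AI does not split audio itself, so the sketch below shells out to ffmpeg (an assumption: ffmpeg must be installed and on the PATH; the segment length and chunk naming are illustrative) and transcribes each chunk in order:

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.springframework.core.io.FileSystemResource;

public List<String> transcribeInChunks(String audioFilePath, File workDir) throws Exception {
    // Assumption: ffmpeg is available on the PATH. Splits into ~10-minute chunks.
    Process split = new ProcessBuilder(
            "ffmpeg", "-i", audioFilePath,
            "-f", "segment", "-segment_time", "600", "-c", "copy",
            new File(workDir, "chunk_%03d.mp3").getAbsolutePath())
        .inheritIO()
        .start();
    if (split.waitFor() != 0) {
        throw new IllegalStateException("ffmpeg failed to split the audio");
    }

    File[] chunks = workDir.listFiles((dir, name) -> name.startsWith("chunk_"));
    if (chunks == null || chunks.length == 0) {
        throw new IllegalStateException("No chunks produced in " + workDir);
    }
    Arrays.sort(chunks);

    List<String> transcripts = new ArrayList<>();
    for (File chunk : chunks) {
        AudioTranscriptionPrompt prompt = new AudioTranscriptionPrompt(new FileSystemResource(chunk));
        transcripts.add(transcriptionModel.call(prompt).getResult().getOutput());
    }
    return transcripts;
}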