CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/maven-org-springframework-ai--spring-ai-model

Core model interfaces and abstractions for the Spring AI framework, providing a portable API for chat, embeddings, images, audio, and tool calling across multiple AI providers.

Overview
Eval results
Files

docs/reference/audio-transcription.md

Audio Transcription

Speech-to-text capabilities for converting audio files to text, supporting multiple audio formats (WAV, MP3, FLAC, OPUS), language detection, and transcription options like temperature and prompting.

Capabilities

TranscriptionModel Interface

Main interface for speech-to-text transcription.

/**
 * Main interface for speech-to-text transcription.
 *
 * <p>Declares one prompt-based entry point that returns the full
 * {@code AudioTranscriptionResponse}, plus two convenience overloads that take
 * the audio resource directly and return only the transcribed text.
 */
public interface TranscriptionModel extends Model<AudioTranscriptionPrompt, AudioTranscriptionResponse> {
    /**
     * Transcribe audio to text based on the given prompt.
     *
     * @param prompt the audio transcription prompt (audio resource plus options)
     * @return the transcription response containing the transcribed text
     */
    AudioTranscriptionResponse call(AudioTranscriptionPrompt prompt);

    /**
     * Transcribe an audio resource with default options.
     *
     * @param audioResource the audio file resource
     * @return the transcribed text
     */
    String transcribe(Resource audioResource);

    /**
     * Transcribe an audio resource with custom options.
     *
     * @param audioResource the audio file resource
     * @param options the transcription options to apply to this request
     * @return the transcribed text
     */
    String transcribe(Resource audioResource, AudioTranscriptionOptions options);
}

AudioTranscriptionPrompt

Request for transcribing audio to text.

/**
 * Request for transcribing audio to text.
 *
 * <p>Signature listing only (bodies omitted). Accessors are declared
 * {@code public}, matching the constructors: a class implementing an interface
 * must expose the interface's methods publicly.
 */
public class AudioTranscriptionPrompt implements ModelRequest<Resource> {
    /**
     * Construct an AudioTranscriptionPrompt with an audio resource.
     *
     * @param audioResource the audio file to transcribe
     */
    public AudioTranscriptionPrompt(Resource audioResource);

    /**
     * Construct an AudioTranscriptionPrompt with audio and options.
     *
     * @param audioResource the audio file to transcribe
     * @param options the transcription options
     */
    public AudioTranscriptionPrompt(Resource audioResource, AudioTranscriptionOptions options);

    /**
     * Get the audio resource to transcribe.
     *
     * @return the audio resource
     */
    public Resource getInstructions();

    /**
     * Get the transcription options.
     *
     * @return the audio transcription options
     */
    public AudioTranscriptionOptions getOptions();
}

AudioTranscriptionResponse

Response containing transcribed text and metadata.

/**
 * Response containing transcribed text and metadata.
 *
 * <p>Signature listing only (bodies omitted). Accessors are declared
 * {@code public}, matching the constructors: a class implementing an interface
 * must expose the interface's methods publicly.
 */
public class AudioTranscriptionResponse implements ModelResponse<AudioTranscription> {
    /**
     * Construct an AudioTranscriptionResponse with transcriptions.
     *
     * @param transcriptions the list of transcriptions
     */
    public AudioTranscriptionResponse(List<AudioTranscription> transcriptions);

    /**
     * Construct an AudioTranscriptionResponse with transcriptions and metadata.
     *
     * @param transcriptions the list of transcriptions
     * @param metadata the response metadata
     */
    public AudioTranscriptionResponse(
        List<AudioTranscription> transcriptions,
        AudioTranscriptionResponseMetadata metadata
    );

    /**
     * Get the first transcription.
     *
     * @return the first audio transcription
     */
    public AudioTranscription getResult();

    /**
     * Get all transcriptions.
     *
     * @return list of audio transcriptions
     */
    public List<AudioTranscription> getResults();

    /**
     * Get response metadata.
     *
     * @return the audio transcription response metadata
     */
    public AudioTranscriptionResponseMetadata getMetadata();
}

AudioTranscription

Single transcription result containing text.

/**
 * Single transcription result containing text.
 *
 * <p>Signature listing only (bodies omitted). Methods are declared
 * {@code public}, matching the constructor: a class implementing an interface
 * must expose the interface's methods publicly.
 */
public class AudioTranscription implements ModelResult<String> {
    /**
     * Construct an AudioTranscription with text.
     *
     * @param text the transcribed text
     */
    public AudioTranscription(String text);

    /**
     * Get the transcribed text.
     *
     * @return the text output
     */
    public String getOutput();

    /**
     * Get transcription metadata.
     *
     * @return the audio transcription metadata
     */
    public AudioTranscriptionMetadata getMetadata();

    /**
     * Set transcription metadata.
     *
     * @param transcriptionMetadata the transcription metadata
     * @return this AudioTranscription instance, so calls can be chained
     */
    public AudioTranscription withTranscriptionMetadata(AudioTranscriptionMetadata transcriptionMetadata);
}

AudioTranscriptionOptions Interface

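Options for configuring audio transcription. As listed here, the interface declares only getModel(); the builder() call and the language, temperature, prompt, and responseFormat settings used in the usage examples below are not part of this declaration.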

/**
 * Options for configuring audio transcription.
 *
 * <p>NOTE(review): the usage examples on this page call
 * {@code AudioTranscriptionOptions.builder()} with language, temperature,
 * prompt and responseFormat settings, none of which are declared in this
 * listing; presumably they come from a provider-specific implementation.
 * TODO confirm.
 */
public interface AudioTranscriptionOptions extends ModelOptions {
    /**
     * Get the model name to use.
     *
     * @return the model name
     */
    String getModel();
}

DefaultAudioTranscriptionOptions

Default implementation of AudioTranscriptionOptions.

/**
 * Default implementation of {@link AudioTranscriptionOptions}.
 */
public class DefaultAudioTranscriptionOptions implements AudioTranscriptionOptions {
    // Members are elided in this listing: default implementation with standard transcription options
}

AudioTranscriptionMetadata Interface

Metadata for individual transcription results.

/**
 * Metadata for individual transcription results.
 */
public interface AudioTranscriptionMetadata extends ResultMetadata {
    // Members are elided in this listing: transcription-specific metadata
}

AudioTranscriptionResponseMetadata

Metadata for transcription responses.

/**
 * Metadata for transcription responses.
 */
public class AudioTranscriptionResponseMetadata extends MutableResponseMetadata {
    // Members are elided in this listing: response-level metadata for transcription
}

Usage Examples

Simple Audio Transcription

import org.springframework.ai.audio.transcription.TranscriptionModel;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.springframework.beans.factory.annotation.Autowired;

/**
 * Service that transcribes a classpath audio resource to text using the
 * injected {@link TranscriptionModel} with its default options.
 */
@Service
public class TranscriptionService {
    private final TranscriptionModel transcriptionModel;

    /**
     * Create the service.
     *
     * <p>Constructor injection keeps the dependency {@code final} and lets the
     * class be instantiated in tests without a Spring container.
     *
     * @param transcriptionModel the model used to perform transcriptions
     */
    @Autowired
    public TranscriptionService(TranscriptionModel transcriptionModel) {
        this.transcriptionModel = transcriptionModel;
    }

    /**
     * Transcribe the audio found at the given classpath location.
     *
     * @param audioPath classpath-relative path of the audio file
     * @return the transcribed text
     */
    public String transcribeAudio(String audioPath) {
        // Load audio resource
        Resource audioResource = new ClassPathResource(audioPath);

        // Transcribe with defaults
        return transcriptionModel.transcribe(audioResource);
    }
}

Transcription with Options

import org.springframework.ai.audio.transcription.*;

// Audio clip to transcribe, loaded from the classpath
Resource speechClip = new ClassPathResource("audio/speech.mp3");

// Request English, temperature 0.0 and the "text" response format.
// NOTE(review): builder() is not declared on the AudioTranscriptionOptions
// interface as listed on this page; confirm which type provides it.
AudioTranscriptionOptions englishTextOptions = AudioTranscriptionOptions.builder()
    .language("en")
    .temperature(0.0f)
    .responseFormat("text")
    .build();

// Transcribe and print the result in one step
System.out.println("Transcription: " + transcriptionModel.transcribe(speechClip, englishTextOptions));

Using AudioTranscriptionPrompt

import org.springframework.ai.audio.transcription.AudioTranscriptionPrompt;
import org.springframework.ai.audio.transcription.AudioTranscriptionResponse;

// Options: Spanish language hint, "whisper-1" model
AudioTranscriptionOptions spanishOptions = AudioTranscriptionOptions.builder()
    .language("es")
    .model("whisper-1")
    .build();

// Bundle the audio and its options into a single prompt
Resource spanishClip = new ClassPathResource("audio/spanish.wav");
AudioTranscriptionPrompt request = new AudioTranscriptionPrompt(spanishClip, spanishOptions);

// call(...) returns the full response object rather than just the text
AudioTranscriptionResponse reply = transcriptionModel.call(request);

// The text lives on the first result of the response
String spanishText = reply.getResult().getOutput();
System.out.println("Spanish transcription: " + spanishText);

Language Auto-Detection

// No language is set on the options, leaving detection to the model
AudioTranscriptionOptions autoDetectOptions = AudioTranscriptionOptions.builder()
    .model("whisper-1")
    .build();

Resource unknownLanguageClip = new ClassPathResource("audio/unknown-language.mp3");
String transcription = transcriptionModel.transcribe(unknownLanguageClip, autoDetectOptions);

Guiding Transcription with Prompts

// Context hint describing the subject matter, used to steer style and vocabulary
String vocabularyHint = "This is a technical discussion about machine learning, " +
        "neural networks, and artificial intelligence.";

AudioTranscriptionOptions guidedOptions = AudioTranscriptionOptions.builder()
    .prompt(vocabularyHint)
    .language("en")
    .build();

Resource techTalk = new ClassPathResource("audio/tech-talk.wav");
String transcription = transcriptionModel.transcribe(techTalk, guidedOptions);

Different Response Formats

// One options object per response format; only the format string differs.

// Plain text
AudioTranscriptionOptions textOptions = AudioTranscriptionOptions.builder()
    .responseFormat("text")
    .build();

// JSON
// NOTE(review): the original comment promised timestamps for "json"; with
// OpenAI-style APIs timestamps presumably require "verbose_json". Confirm.
AudioTranscriptionOptions jsonOptions = AudioTranscriptionOptions.builder()
    .responseFormat("json")
    .build();

// SRT subtitles (built for illustration; not used below)
AudioTranscriptionOptions srtOptions = AudioTranscriptionOptions.builder()
    .responseFormat("srt")
    .build();

// VTT subtitles (built for illustration; not used below)
AudioTranscriptionOptions vttOptions = AudioTranscriptionOptions.builder()
    .responseFormat("vtt")
    .build();

// Transcribe the same clip in two of the formats
Resource speechClip = new ClassPathResource("audio/speech.mp3");
String textTranscription = transcriptionModel.transcribe(speechClip, textOptions);
String jsonTranscription = transcriptionModel.transcribe(speechClip, jsonOptions);

Handling Different Audio Formats

// The same call works for each container format; only the resource changes.
Resource wavAudio = new ClassPathResource("audio/speech.wav");   // WAV
Resource mp3Audio = new ClassPathResource("audio/speech.mp3");   // MP3
Resource flacAudio = new ClassPathResource("audio/speech.flac"); // FLAC
Resource opusAudio = new ClassPathResource("audio/speech.opus"); // OPUS

// Transcribe each clip with default options
String wavTranscription = transcriptionModel.transcribe(wavAudio);
String mp3Transcription = transcriptionModel.transcribe(mp3Audio);
String flacTranscription = transcriptionModel.transcribe(flacAudio);
String opusTranscription = transcriptionModel.transcribe(opusAudio);

Temperature Control

Resource speechClip = new ClassPathResource("audio/speech.mp3");

// Temperature 0.0: more deterministic, consistent output
AudioTranscriptionOptions lowTemp = AudioTranscriptionOptions.builder()
    .temperature(0.0f)
    .build();

// Temperature 0.8: more varied output
AudioTranscriptionOptions highTemp = AudioTranscriptionOptions.builder()
    .temperature(0.8f)
    .build();

// Same clip, transcribed once per setting
String consistent = transcriptionModel.transcribe(speechClip, lowTemp);
String varied = transcriptionModel.transcribe(speechClip, highTemp);

Loading Audio from File System

import org.springframework.core.io.FileSystemResource;
import java.io.File;

// Wrap a file on the local file system as a Spring Resource
Resource recording = new FileSystemResource(new File("/path/to/audio/recording.mp3"));

// Transcribe with default options
String transcription = transcriptionModel.transcribe(recording);

Loading Audio from URL

import org.springframework.core.io.UrlResource;
import java.net.URL;

// Load from URL. UrlResource parses the location string itself, which avoids
// the java.net.URL(String) constructor (deprecated as of JDK 20).
Resource audioResource = new UrlResource("https://example.com/audio/speech.mp3");

// Transcribe with default options
String transcription = transcriptionModel.transcribe(audioResource);

Batch Transcription

/**
 * Service that transcribes several classpath audio resources in one call.
 */
@Service
public class BatchTranscriptionService {
    private final TranscriptionModel transcriptionModel;

    /**
     * Create the service.
     *
     * <p>The constructor is required: without it the {@code final} field is
     * never assigned and the class does not compile.
     *
     * @param transcriptionModel the model used to perform transcriptions
     */
    public BatchTranscriptionService(TranscriptionModel transcriptionModel) {
        this.transcriptionModel = transcriptionModel;
    }

    /**
     * Transcribe each classpath audio path with default options.
     *
     * @param audioPaths classpath-relative paths of the audio files
     * @return the transcribed texts, in the same order as {@code audioPaths}
     */
    public List<String> transcribeBatch(List<String> audioPaths) {
        return audioPaths.stream()
            .map(ClassPathResource::new)
            .map(transcriptionModel::transcribe)
            .toList();
    }
}

Error Handling

/**
 * Transcribe a classpath audio resource, reporting failures instead of
 * propagating them.
 *
 * @param audioPath classpath-relative path of the audio file
 * @return the transcribed text, or {@code null} if any exception was thrown
 */
public String safeTranscribe(String audioPath) {
    String transcribed = null;
    try {
        Resource clip = new ClassPathResource(audioPath);

        // English, temperature 0.0
        AudioTranscriptionOptions englishOptions = AudioTranscriptionOptions.builder()
            .language("en")
            .temperature(0.0f)
            .build();

        transcribed = transcriptionModel.transcribe(clip, englishOptions);
    } catch (Exception e) {
        // Any failure is reported on stderr; the caller sees null
        System.err.println("Transcription failed: " + e.getMessage());
    }
    return transcribed;
}

Accessing Metadata

// Use the prompt-based call to obtain the full response, not just the text
AudioTranscriptionResponse fullResponse =
    transcriptionModel.call(new AudioTranscriptionPrompt(audioResource));

// First result and its text
AudioTranscription firstResult = fullResponse.getResult();
String text = firstResult.getOutput();

// Per-result metadata, then response-level metadata
AudioTranscriptionMetadata metadata = firstResult.getMetadata();
AudioTranscriptionResponseMetadata responseMetadata = fullResponse.getMetadata();

REST API Example

/**
 * REST controller that accepts an uploaded audio file and returns its
 * transcription.
 */
@RestController
@RequestMapping("/api/transcription")
public class TranscriptionController {
    private final TranscriptionModel transcriptionModel;

    /**
     * Create the controller.
     *
     * @param transcriptionModel the model used to perform transcriptions
     */
    public TranscriptionController(TranscriptionModel transcriptionModel) {
        this.transcriptionModel = transcriptionModel;
    }

    /**
     * Transcribe an uploaded audio file.
     *
     * @param file the uploaded audio file
     * @param language optional language hint; omitted from the options when {@code null}
     * @return the transcription wrapped in a {@link TranscriptionResult}
     * @throws Exception if the temp file cannot be created or written, or transcription fails
     */
    @PostMapping("/transcribe")
    public TranscriptionResult transcribe(
        @RequestParam("file") MultipartFile file,
        @RequestParam(required = false) String language
    ) throws Exception {
        // Temp file that backs the Resource handed to the model
        File tempFile = File.createTempFile("audio", ".tmp");

        try {
            // Copy the upload inside the try block: if the copy fails, the
            // finally clause still removes the temp file instead of leaking it
            file.transferTo(tempFile);

            // Create resource
            Resource audioResource = new FileSystemResource(tempFile);

            // Build options
            AudioTranscriptionOptions.Builder optionsBuilder =
                AudioTranscriptionOptions.builder();

            if (language != null) {
                optionsBuilder.language(language);
            }

            // Transcribe
            String transcription = transcriptionModel.transcribe(
                audioResource,
                optionsBuilder.build()
            );

            return new TranscriptionResult(transcription);
        } finally {
            // Best-effort cleanup; runs on success and on every failure path above
            tempFile.delete();
        }
    }

    /** Response body wrapping the transcribed text. */
    record TranscriptionResult(String text) {}
}

Spring Configuration

import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

/**
 * Spring configuration exposing application-wide default transcription options.
 */
@Configuration
public class TranscriptionConfig {

    /**
     * Default options bean: "whisper-1" model, temperature 0.0, "text" response format.
     *
     * @return the default transcription options
     */
    @Bean
    public AudioTranscriptionOptions defaultTranscriptionOptions() {
        AudioTranscriptionOptions defaults = AudioTranscriptionOptions.builder()
            .model("whisper-1")
            .temperature(0.0f)
            .responseFormat("text")
            .build();
        return defaults;
    }
}

Install with Tessl CLI

npx tessl i tessl/maven-org-springframework-ai--spring-ai-model

docs

index.md

tile.json