CtrlK
Community · Documentation · Log in · Get started
Tessl Logo

tessl/maven-com-embabel-agent--embabel-agent-starter

Base starter module for the Embabel Agent Framework providing core dependencies for building agentic flows on the JVM with Spring Boot integration and GOAP-based intelligent path finding.

Overview
Eval results
Files

docs/guides-multimodal.md

Multimodal Content

Step-by-step guide for using multimodal content (text, images, audio, video) in agent actions.

1. Basic Text Content

Java

import com.embabel.agent.api.MultimodalContent;
import com.embabel.agent.api.annotation.Action;

@Agent(description = "Text processing agent")
public class TextAgent {

    /**
     * Wraps the raw input string as a single multimodal text part and asks the
     * LLM to produce a structured {@code Result}.
     */
    @Action(description = "Process text input")
    public Result processText(String input, @Provided Ai ai) {
        List<MultimodalContent> parts = List.of(MultimodalContent.text(input));
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO).createObject(parts);
    }
}

Kotlin

import com.embabel.agent.api.MultimodalContent
import com.embabel.agent.api.annotation.Action

@Agent(description = "Text processor")
class TextAgent {

    /** Sends [input] to the LLM as a single text part and maps the reply to [Result]. */
    @Action(description = "Process text")
    fun processText(input: String, @Provided ai: Ai): Result =
        ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(listOf(MultimodalContent.text(input)))
}

2. Image Content

Java

import com.embabel.agent.api.MultimodalContent;

@Agent(description = "Image analysis agent")
public class ImageAnalysisAgent {

    /** Describes the JPEG at {@code imageUrl}: prompt first, then the image part. */
    @Action(description = "Analyze image")
    public Analysis analyzeImage(String imageUrl, @Provided Ai ai) {
        List<MultimodalContent> parts = List.of(
            MultimodalContent.text("Analyze this image and describe what you see"),
            MultimodalContent.image(imageUrl, "image/jpeg")
        );
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO).createObject(parts);
    }

    /** Compares two JPEGs; text parts interleave with the images to label them. */
    @Action(description = "Compare two images")
    public Comparison compareImages(
        String imageUrl1,
        String imageUrl2,
        @Provided Ai ai
    ) {
        List<MultimodalContent> parts = List.of(
            MultimodalContent.text("Compare these two images:"),
            MultimodalContent.image(imageUrl1, "image/jpeg"),
            MultimodalContent.text("and"),
            MultimodalContent.image(imageUrl2, "image/jpeg")
        );
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO).createObject(parts);
    }
}

Kotlin

import com.embabel.agent.api.MultimodalContent

@Agent(description = "Vision analyzer")
class VisionAgent {

    /** Asks the LLM for a structured [Analysis] of the JPEG at [imageUrl]. */
    @Action(description = "Analyze image")
    fun analyzeImage(imageUrl: String, @Provided ai: Ai): Analysis =
        ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(
                listOf(
                    MultimodalContent.text("Analyze this image:"),
                    MultimodalContent.image(imageUrl, "image/jpeg"),
                )
            )

    /** OCR-style extraction: returns all text the model finds in the PNG at [imageUrl]. */
    @Action(description = "Extract text from image")
    fun extractText(imageUrl: String, @Provided ai: Ai): String =
        ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(
                listOf(
                    MultimodalContent.text("Extract all text from this image:"),
                    MultimodalContent.image(imageUrl, "image/png"),
                )
            )
}

3. Audio Content

Java

import com.embabel.agent.api.MultimodalContent;

@Agent(description = "Audio processing agent")
public class AudioAgent {

    /** Produces a structured transcript of the MP3 at {@code audioUrl}. */
    @Action(description = "Transcribe audio")
    public Transcript transcribeAudio(String audioUrl, @Provided Ai ai) {
        // Prompt first, media part second — the order the model expects.
        List<MultimodalContent> parts = List.of(
            MultimodalContent.text("Transcribe this audio"),
            MultimodalContent.audio(audioUrl, "audio/mpeg")
        );
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO).createObject(parts);
    }

    /** Runs sentiment analysis over a WAV clip. */
    @Action(description = "Analyze audio sentiment")
    public SentimentAnalysis analyzeSentiment(
        String audioUrl,
        @Provided Ai ai
    ) {
        List<MultimodalContent> parts = List.of(
            MultimodalContent.text("Analyze the sentiment of this audio:"),
            MultimodalContent.audio(audioUrl, "audio/wav")
        );
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO).createObject(parts);
    }
}

Kotlin

import com.embabel.agent.api.MultimodalContent

@Agent(description = "Audio analyzer")
class AudioAnalyzerAgent {

    /** Transcribes the MP3 at [audioUrl] into a structured [Transcript]. */
    @Action(description = "Transcribe audio")
    fun transcribe(audioUrl: String, @Provided ai: Ai): Transcript =
        ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(
                listOf(
                    MultimodalContent.text("Transcribe this audio:"),
                    MultimodalContent.audio(audioUrl, "audio/mpeg"),
                )
            )

    /** Condenses a WAV clip into a key-point [Summary]. */
    @Action(description = "Summarize audio content")
    fun summarizeAudio(audioUrl: String, @Provided ai: Ai): Summary =
        ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(
                listOf(
                    MultimodalContent.text("Summarize the key points from this audio:"),
                    MultimodalContent.audio(audioUrl, "audio/wav"),
                )
            )
}

4. Video Content

Java

import com.embabel.agent.api.MultimodalContent;

@Agent(description = "Video processing agent")
public class VideoAgent {

    /** Summarizes the key events in the MP4 at {@code videoUrl}. */
    @Action(description = "Analyze video")
    public VideoAnalysis analyzeVideo(String videoUrl, @Provided Ai ai) {
        List<MultimodalContent> parts = List.of(
            MultimodalContent.text("Analyze this video and describe the key events"),
            MultimodalContent.video(videoUrl, "video/mp4")
        );
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO).createObject(parts);
    }

    /** Asks the model to identify and describe the key frames of the video. */
    @Action(description = "Extract frames from video")
    public List<Frame> extractFrames(String videoUrl, @Provided Ai ai) {
        List<MultimodalContent> parts = List.of(
            MultimodalContent.text("Extract and describe key frames from this video:"),
            MultimodalContent.video(videoUrl, "video/mp4")
        );
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO).createObject(parts);
    }
}

Kotlin

import com.embabel.agent.api.MultimodalContent

@Agent(description = "Video analyzer")
class VideoAnalyzerAgent {

    /** Returns a structured [VideoAnalysis] of the MP4 at [videoUrl]. */
    @Action(description = "Analyze video content")
    fun analyzeVideo(videoUrl: String, @Provided ai: Ai): VideoAnalysis =
        ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(
                listOf(
                    MultimodalContent.text("Analyze this video:"),
                    MultimodalContent.video(videoUrl, "video/mp4"),
                )
            )

    /** Produces a detailed [Summary] of the WebM at [videoUrl]. */
    @Action(description = "Generate video summary")
    fun summarizeVideo(videoUrl: String, @Provided ai: Ai): Summary =
        ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(
                listOf(
                    MultimodalContent.text("Provide a detailed summary of this video:"),
                    MultimodalContent.video(videoUrl, "video/webm"),
                )
            )
}

5. File Content

Java

import com.embabel.agent.api.MultimodalContent;

@Agent(description = "File processor")
public class FileProcessorAgent {

    /** Extracts key information from a PDF referenced by {@code filePath}. */
    @Action(description = "Process PDF document")
    public DocumentAnalysis processPdf(String filePath, @Provided Ai ai) {
        List<MultimodalContent> parts = List.of(
            MultimodalContent.text("Analyze this PDF document and extract key information"),
            MultimodalContent.file(filePath, "application/pdf")
        );
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO).createObject(parts);
    }

    /** Analyzes tabular data in a legacy Excel (.xls) spreadsheet. */
    @Action(description = "Process spreadsheet")
    public DataAnalysis processSpreadsheet(String filePath, @Provided Ai ai) {
        List<MultimodalContent> parts = List.of(
            MultimodalContent.text("Analyze the data in this spreadsheet"),
            MultimodalContent.file(filePath, "application/vnd.ms-excel")
        );
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO).createObject(parts);
    }
}

Kotlin

import com.embabel.agent.api.MultimodalContent

@Agent(description = "Document processor")
class DocumentProcessorAgent {

    /** Pulls structured data out of the PDF at [filePath]. */
    @Action(description = "Process PDF")
    fun processPdf(filePath: String, @Provided ai: Ai): DocumentAnalysis =
        ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(
                listOf(
                    MultimodalContent.text("Extract structured data from this PDF:"),
                    MultimodalContent.file(filePath, "application/pdf"),
                )
            )

    /** Summarizes a .docx Word document at [filePath]. */
    @Action(description = "Process Word document")
    fun processWord(filePath: String, @Provided ai: Ai): Analysis =
        ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(
                listOf(
                    MultimodalContent.text("Summarize this document:"),
                    MultimodalContent.file(filePath, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
                )
            )
}

6. Mixed Multimodal Content

Java

import com.embabel.agent.api.MultimodalContent;

@Agent(description = "Multimodal content processor")
public class MultimodalAgent {

    /** Fuses text, one image, and one audio clip into a single analysis request. */
    @Action(description = "Process mixed content")
    public Report processMultimodal(
        String text,
        String imageUrl,
        String audioUrl,
        @Provided Ai ai
    ) {
        // Order matters: instructions first, then each labeled media part.
        List<MultimodalContent> parts = List.of(
            MultimodalContent.text("Analyze the following content:"),
            MultimodalContent.text("Text: " + text),
            MultimodalContent.image(imageUrl, "image/jpeg"),
            MultimodalContent.text("Audio description:"),
            MultimodalContent.audio(audioUrl, "audio/mpeg"),
            MultimodalContent.text("Provide a comprehensive analysis.")
        );
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO).createObject(parts);
    }

    /** Builds a report request from a variable number of images plus a narration track. */
    @Action(description = "Create multimedia report")
    public MultimediaReport createReport(
        List<String> imagePaths,
        String narrationUrl,
        @Provided Ai ai
    ) {
        List<MultimodalContent> parts = new ArrayList<>();
        parts.add(MultimodalContent.text(
            "Create a multimedia report from these materials:"
        ));
        for (String imagePath : imagePaths) {
            parts.add(MultimodalContent.image(imagePath, "image/png"));
        }
        parts.add(MultimodalContent.text("With narration:"));
        parts.add(MultimodalContent.audio(narrationUrl, "audio/mpeg"));
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO).createObject(parts);
    }
}

Kotlin

import com.embabel.agent.api.MultimodalContent

@Agent(description = "Rich content processor")
class RichContentAgent {

    /** Combines free text, any number of images, and an audio track into one request. */
    @Action(description = "Process mixed media")
    fun processMixedMedia(
        text: String,
        images: List<String>,
        audioUrl: String,
        @Provided ai: Ai
    ): Analysis {
        val parts = mutableListOf(
            MultimodalContent.text("Analyze this mixed media content:"),
            MultimodalContent.text("Context: $text"),
        )
        for (imageUrl in images) {
            parts += MultimodalContent.image(imageUrl, "image/jpeg")
        }
        parts += MultimodalContent.text("Audio commentary:")
        parts += MultimodalContent.audio(audioUrl, "audio/mpeg")

        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(parts)
    }

    /** Emits one text/image/notes triple per slide, in slide order. */
    @Action(description = "Create presentation from materials")
    fun createPresentation(
        slides: List<SlideData>,
        @Provided ai: Ai
    ): Presentation {
        val parts = buildList {
            add(MultimodalContent.text(
                "Create a presentation from these slides:"
            ))
            slides.forEach { slide ->
                add(MultimodalContent.text("Slide ${slide.number}:"))
                add(MultimodalContent.image(slide.imageUrl, "image/png"))
                add(MultimodalContent.text("Notes: ${slide.notes}"))
            }
        }

        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(parts)
    }
}

/** One presentation slide: its ordinal, rendered image URL, and speaker notes. */
data class SlideData(val number: Int, val imageUrl: String, val notes: String)

7. Image MIME Types

Java

@Agent(description = "Image processor with various formats")
public class ImageFormatAgent {

    /** JPEG entry point. */
    @Action(description = "Process JPEG image")
    public Result processJpeg(String url, @Provided Ai ai) {
        return analyze(MultimodalContent.image(url, "image/jpeg"), ai);
    }

    /** PNG entry point. */
    @Action(description = "Process PNG image")
    public Result processPng(String url, @Provided Ai ai) {
        return analyze(MultimodalContent.image(url, "image/png"), ai);
    }

    /** WebP entry point. */
    @Action(description = "Process WebP image")
    public Result processWebP(String url, @Provided Ai ai) {
        return analyze(MultimodalContent.image(url, "image/webp"), ai);
    }

    /** GIF entry point. */
    @Action(description = "Process GIF image")
    public Result processGif(String url, @Provided Ai ai) {
        return analyze(MultimodalContent.image(url, "image/gif"), ai);
    }

    /** Shared analysis path: fixed text prompt followed by the image part. */
    private Result analyze(MultimodalContent image, Ai ai) {
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
                 .createObject(List.of(MultimodalContent.text("Analyze this image:"), image));
    }
}

Kotlin

@Agent(description = "Format-aware image processor")
class ImageFormatAgent {

    /** Maps [format] to its MIME type and asks the LLM to analyze the image at [url]. */
    @Action(description = "Process image with auto-detect format")
    fun processImage(url: String, format: ImageFormat, @Provided ai: Ai): Result {
        // Exhaustive over ImageFormat — the compiler flags any new enum entry.
        val mime = when (format) {
            ImageFormat.JPEG -> "image/jpeg"
            ImageFormat.PNG -> "image/png"
            ImageFormat.WEBP -> "image/webp"
            ImageFormat.GIF -> "image/gif"
            ImageFormat.BMP -> "image/bmp"
            ImageFormat.TIFF -> "image/tiff"
        }
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(
                listOf(
                    MultimodalContent.text("Analyze this $format image:"),
                    MultimodalContent.image(url, mime),
                )
            )
    }
}

/** Image container formats this module can hand to the model. */
enum class ImageFormat {
    JPEG,
    PNG,
    WEBP,
    GIF,
    BMP,
    TIFF,
}

8. Audio MIME Types

Java

@Agent(description = "Audio processor with various formats")
public class AudioFormatAgent {

    /** MP3 entry point (MIME type audio/mpeg). */
    @Action(description = "Process MP3 audio")
    public Transcript processMp3(String url, @Provided Ai ai) {
        return transcribe(MultimodalContent.audio(url, "audio/mpeg"), ai);
    }

    /** WAV entry point. */
    @Action(description = "Process WAV audio")
    public Transcript processWav(String url, @Provided Ai ai) {
        return transcribe(MultimodalContent.audio(url, "audio/wav"), ai);
    }

    /** OGG entry point. */
    @Action(description = "Process OGG audio")
    public Transcript processOgg(String url, @Provided Ai ai) {
        return transcribe(MultimodalContent.audio(url, "audio/ogg"), ai);
    }

    /** FLAC entry point. */
    @Action(description = "Process FLAC audio")
    public Transcript processFlac(String url, @Provided Ai ai) {
        return transcribe(MultimodalContent.audio(url, "audio/flac"), ai);
    }

    /** Shared transcription path: fixed prompt followed by the audio part. */
    private Transcript transcribe(MultimodalContent audio, Ai ai) {
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
                 .createObject(List.of(MultimodalContent.text("Transcribe this audio:"), audio));
    }
}

Kotlin

@Agent(description = "Format-aware audio processor")
class AudioFormatAgent {

    /** Transcribes the clip at [url], choosing the MIME type from [format]. */
    @Action(description = "Process audio with format")
    fun processAudio(url: String, format: AudioFormat, @Provided ai: Ai): Transcript {
        // Exhaustive over AudioFormat; note M4A audio conventionally uses audio/mp4.
        val mime = when (format) {
            AudioFormat.MP3 -> "audio/mpeg"
            AudioFormat.WAV -> "audio/wav"
            AudioFormat.OGG -> "audio/ogg"
            AudioFormat.FLAC -> "audio/flac"
            AudioFormat.AAC -> "audio/aac"
            AudioFormat.M4A -> "audio/mp4"
        }
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(
                listOf(
                    MultimodalContent.text("Transcribe this $format audio:"),
                    MultimodalContent.audio(url, mime),
                )
            )
    }
}

/** Audio container formats this module can hand to the model. */
enum class AudioFormat {
    MP3,
    WAV,
    OGG,
    FLAC,
    AAC,
    M4A,
}

9. Video MIME Types

Java

@Agent(description = "Video processor with various formats")
public class VideoFormatAgent {

    /** MP4 entry point. */
    @Action(description = "Process MP4 video")
    public VideoAnalysis processMp4(String url, @Provided Ai ai) {
        return analyze(MultimodalContent.video(url, "video/mp4"), ai);
    }

    /** WebM entry point. */
    @Action(description = "Process WebM video")
    public VideoAnalysis processWebM(String url, @Provided Ai ai) {
        return analyze(MultimodalContent.video(url, "video/webm"), ai);
    }

    /** MOV entry point (MIME type video/quicktime). */
    @Action(description = "Process MOV video")
    public VideoAnalysis processMov(String url, @Provided Ai ai) {
        return analyze(MultimodalContent.video(url, "video/quicktime"), ai);
    }

    /** AVI entry point (MIME type video/x-msvideo). */
    @Action(description = "Process AVI video")
    public VideoAnalysis processAvi(String url, @Provided Ai ai) {
        return analyze(MultimodalContent.video(url, "video/x-msvideo"), ai);
    }

    /** Shared analysis path: fixed prompt followed by the video part. */
    private VideoAnalysis analyze(MultimodalContent video, Ai ai) {
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
                 .createObject(List.of(MultimodalContent.text("Analyze this video:"), video));
    }
}

Kotlin

@Agent(description = "Format-aware video processor")
class VideoFormatAgent {

    /** Analyzes the video at [url], choosing the MIME type from [format]. */
    @Action(description = "Process video with format")
    fun processVideo(url: String, format: VideoFormat, @Provided ai: Ai): VideoAnalysis {
        // Exhaustive over VideoFormat — the compiler flags any new enum entry.
        val mime = when (format) {
            VideoFormat.MP4 -> "video/mp4"
            VideoFormat.WEBM -> "video/webm"
            VideoFormat.MOV -> "video/quicktime"
            VideoFormat.AVI -> "video/x-msvideo"
            VideoFormat.MKV -> "video/x-matroska"
            VideoFormat.FLV -> "video/x-flv"
        }
        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
            .createObject(
                listOf(
                    MultimodalContent.text("Analyze this $format video:"),
                    MultimodalContent.video(url, mime),
                )
            )
    }
}

/** Video container formats this module can hand to the model. */
enum class VideoFormat {
    MP4,
    WEBM,
    MOV,
    AVI,
    MKV,
    FLV,
}

10. Complete Working Example

Java - Multimedia Content Analysis

import com.embabel.agent.api.*;
import com.embabel.agent.api.annotation.*;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.*;

@Agent(
    description = "Comprehensive multimedia content analyzer",
    planner = PlannerType.GOAP
)
public class MultimediaAnalyzerAgent {

    private final StorageService storage;
    private final TranscriptionService transcription;

    public MultimediaAnalyzerAgent(
        StorageService storage,
        TranscriptionService transcription
    ) {
        this.storage = storage;
        this.transcription = transcription;
    }

    @Action(
        description = "Upload and prepare media files",
        post = {"mediaUploaded"},
        outputBinding = "mediaUrls"
    )
    public MediaUrls uploadMedia(
        List<File> files,
        @Provided ActionContext context
    ) {
        context.updateProgress("Uploading " + files.size() + " files");

        List<String> imageUrls = new ArrayList<>();
        List<String> audioUrls = new ArrayList<>();
        List<String> videoUrls = new ArrayList<>();

        for (File file : files) {
            String url = storage.upload(file);
            String mimeType = Files.probeContentType(file.toPath());

            if (mimeType.startsWith("image/")) {
                imageUrls.add(url);
            } else if (mimeType.startsWith("audio/")) {
                audioUrls.add(url);
            } else if (mimeType.startsWith("video/")) {
                videoUrls.add(url);
            }
        }

        return new MediaUrls(imageUrls, audioUrls, videoUrls);
    }

    @Action(
        description = "Analyze images",
        pre = {"mediaUploaded"},
        post = {"imagesAnalyzed"},
        outputBinding = "imageAnalyses"
    )
    public List<ImageAnalysis> analyzeImages(
        MediaUrls mediaUrls,
        @Provided Ai ai,
        @Provided ActionContext context
    ) {
        context.updateProgress("Analyzing " + mediaUrls.getImageUrls().size() + " images");

        List<ImageAnalysis> analyses = new ArrayList<>();

        for (String imageUrl : mediaUrls.getImageUrls()) {
            List<MultimodalContent> contents = List.of(
                MultimodalContent.text("Analyze this image in detail:"),
                MultimodalContent.image(imageUrl, "image/jpeg")
            );

            ImageAnalysis analysis = ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
                                       .createObject(contents);
            analyses.add(analysis);
        }

        return analyses;
    }

    @Action(
        description = "Transcribe audio files",
        pre = {"mediaUploaded"},
        post = {"audioTranscribed"},
        outputBinding = "transcripts"
    )
    public List<Transcript> transcribeAudio(
        MediaUrls mediaUrls,
        @Provided Ai ai,
        @Provided ActionContext context
    ) {
        context.updateProgress("Transcribing " + mediaUrls.getAudioUrls().size() + " audio files");

        List<Transcript> transcripts = new ArrayList<>();

        for (String audioUrl : mediaUrls.getAudioUrls()) {
            List<MultimodalContent> contents = List.of(
                MultimodalContent.text("Transcribe this audio:"),
                MultimodalContent.audio(audioUrl, "audio/mpeg")
            );

            Transcript transcript = ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
                                       .createObject(contents);
            transcripts.add(transcript);
        }

        return transcripts;
    }

    @Action(
        description = "Analyze video content",
        pre = {"mediaUploaded"},
        post = {"videosAnalyzed"},
        outputBinding = "videoAnalyses"
    )
    public List<VideoAnalysis> analyzeVideos(
        MediaUrls mediaUrls,
        @Provided Ai ai,
        @Provided ActionContext context
    ) {
        context.updateProgress("Analyzing " + mediaUrls.getVideoUrls().size() + " videos");

        List<VideoAnalysis> analyses = new ArrayList<>();

        for (String videoUrl : mediaUrls.getVideoUrls()) {
            List<MultimodalContent> contents = List.of(
                MultimodalContent.text("Analyze this video:"),
                MultimodalContent.video(videoUrl, "video/mp4")
            );

            VideoAnalysis analysis = ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
                                       .createObject(contents);
            analyses.add(analysis);
        }

        return analyses;
    }

    @AchievesGoal(
        description = "Generate comprehensive multimedia report",
        tags = {"multimedia", "analysis", "reporting"},
        export = @Export(remote = true, local = true),
        value = 100.0
    )
    @Action(
        description = "Generate final report",
        pre = {"imagesAnalyzed", "audioTranscribed", "videosAnalyzed"}
    )
    public MultimediaReport generateReport(
        MediaUrls mediaUrls,
        List<ImageAnalysis> imageAnalyses,
        List<Transcript> transcripts,
        List<VideoAnalysis> videoAnalyses,
        @Provided Ai ai,
        @Provided ActionContext context
    ) {
        context.updateProgress("Generating comprehensive report");

        // Build multimodal content for final synthesis
        List<MultimodalContent> contents = new ArrayList<>();

        contents.add(MultimodalContent.text(
            "Create a comprehensive report from the following analyzed content:"
        ));

        // Add image analyses
        contents.add(MultimodalContent.text("Images analyzed:"));
        for (int i = 0; i < imageAnalyses.size(); i++) {
            contents.add(MultimodalContent.text(
                "Image " + (i + 1) + ": " + imageAnalyses.get(i).getSummary()
            ));
            contents.add(MultimodalContent.image(
                mediaUrls.getImageUrls().get(i),
                "image/jpeg"
            ));
        }

        // Add transcripts
        contents.add(MultimodalContent.text("Audio transcripts:"));
        for (int i = 0; i < transcripts.size(); i++) {
            contents.add(MultimodalContent.text(
                "Audio " + (i + 1) + ": " + transcripts.get(i).getText()
            ));
        }

        // Add video analyses
        contents.add(MultimodalContent.text("Video analyses:"));
        for (int i = 0; i < videoAnalyses.size(); i++) {
            contents.add(MultimodalContent.text(
                "Video " + (i + 1) + ": " + videoAnalyses.get(i).getSummary()
            ));
        }

        // Generate final report
        MultimediaReport report = ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
                                     .createObject(contents);

        context.sendMessage(Message.info("Report generated successfully"));

        return report;
    }
}

// Data classes
class MediaUrls {
    private final List<String> imageUrls;
    private final List<String> audioUrls;
    private final List<String> videoUrls;

    public MediaUrls(
        List<String> imageUrls,
        List<String> audioUrls,
        List<String> videoUrls
    ) {
        this.imageUrls = imageUrls;
        this.audioUrls = audioUrls;
        this.videoUrls = videoUrls;
    }

    public List<String> getImageUrls() { return imageUrls; }
    public List<String> getAudioUrls() { return audioUrls; }
    public List<String> getVideoUrls() { return videoUrls; }
}

Kotlin - Social Media Content Moderator

import com.embabel.agent.api.*
import com.embabel.agent.api.annotation.*

/**
 * GOAP-planned moderation pipeline: fetches a post, screens its text, images,
 * and videos in independent actions, then synthesizes an APPROVE/FLAG/REMOVE
 * decision from the three analyses.
 *
 * NOTE(review): moderationRules is injected but never referenced in the code
 * shown here — confirm it is used elsewhere or remove the dependency.
 */
@Agent(
    description = "Social media content moderator with multimodal analysis",
    planner = PlannerType.GOAP
)
class ContentModeratorAgent(
    private val contentStore: ContentStore,
    private val moderationRules: ModerationRules
) {

    /** Loads the post; the result is bound as "content" for downstream actions. */
    @Action(
        description = "Fetch content for moderation",
        post = ["contentFetched"],
        outputBinding = "content"
    )
    fun fetchContent(contentId: String): SocialContent {
        return contentStore.fetch(contentId)
    }

    /** Screens the post's text against the listed violation categories. */
    @Action(
        description = "Analyze text content",
        pre = ["contentFetched"],
        post = ["textAnalyzed"],
        outputBinding = "textAnalysis"
    )
    fun analyzeText(
        content: SocialContent,
        @Provided ai: Ai
    ): TextModerationResult {
        val contents = listOf(
            MultimodalContent.text(
                "Moderate this social media post for: " +
                "hate speech, violence, explicit content, misinformation"
            ),
            MultimodalContent.text("Post text: ${content.text}")
        )

        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
                 .createObject(contents)
    }

    /**
     * Screens all attached images in a single LLM call.
     * Short-circuits to an empty result when the post has no images, so the
     * "imagesAnalyzed" precondition is still satisfied without an LLM round-trip.
     */
    @Action(
        description = "Analyze image content",
        pre = ["contentFetched"],
        post = ["imagesAnalyzed"],
        outputBinding = "imageAnalysis"
    )
    fun analyzeImages(
        content: SocialContent,
        @Provided ai: Ai,
        @Provided context: ActionContext
    ): ImageModerationResult {
        if (content.imageUrls.isEmpty()) {
            return ImageModerationResult.empty()
        }

        context.updateProgress("Analyzing ${content.imageUrls.size} images")

        // NOTE(review): all images are assumed JPEG here — confirm upstream guarantees this.
        val contents = buildList {
            add(MultimodalContent.text(
                "Moderate these images for: " +
                "explicit content, violence, illegal activities, graphic content"
            ))

            content.imageUrls.forEach { url ->
                add(MultimodalContent.image(url, "image/jpeg"))
            }
        }

        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
                 .createObject(contents)
    }

    /** Screens all attached videos; same empty-content short-circuit as images. */
    @Action(
        description = "Analyze video content",
        pre = ["contentFetched"],
        post = ["videosAnalyzed"],
        outputBinding = "videoAnalysis"
    )
    fun analyzeVideos(
        content: SocialContent,
        @Provided ai: Ai,
        @Provided context: ActionContext
    ): VideoModerationResult {
        if (content.videoUrls.isEmpty()) {
            return VideoModerationResult.empty()
        }

        context.updateProgress("Analyzing ${content.videoUrls.size} videos")

        val contents = buildList {
            add(MultimodalContent.text(
                "Moderate these videos for policy violations"
            ))

            content.videoUrls.forEach { url ->
                add(MultimodalContent.video(url, "video/mp4"))
            }
        }

        return ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
                 .createObject(contents)
    }

    /**
     * Goal action: combines the three analyses into a final decision.
     * Runs only after text, image, and video analyses have all completed.
     */
    @AchievesGoal(
        description = "Complete content moderation decision",
        tags = ["moderation", "safety", "compliance"],
        export = Export(remote = true, local = true),
        value = 100.0
    )
    @Action(
        description = "Generate moderation decision",
        pre = ["textAnalyzed", "imagesAnalyzed", "videosAnalyzed"]
    )
    fun generateDecision(
        content: SocialContent,
        textAnalysis: TextModerationResult,
        imageAnalysis: ImageModerationResult,
        videoAnalysis: VideoModerationResult,
        @Provided ai: Ai,
        @Provided context: ActionContext
    ): ModerationDecision {
        context.updateProgress("Generating final moderation decision")

        // Text-only synthesis: the media itself was already judged upstream.
        val contents = listOf(
            MultimodalContent.text(
                "Based on the following moderation analyses, " +
                "generate a final decision (APPROVE, FLAG, REMOVE):"
            ),
            MultimodalContent.text("Text analysis: ${textAnalysis.summary}"),
            MultimodalContent.text("Image analysis: ${imageAnalysis.summary}"),
            MultimodalContent.text("Video analysis: ${videoAnalysis.summary}"),
            MultimodalContent.text("Content ID: ${content.id}")
        )

        // Explicit type argument shown here; elsewhere the examples rely on
        // inference from the declared return type.
        val decision = ai.withLlm(GeminiModels.GEMINI_2_5_PRO)
                         .createObject<ModerationDecision>(contents)

        context.sendMessage(Message.info(
            "Moderation decision: ${decision.action} - ${decision.reason}"
        ))

        return decision
    }
}

// Data classes
/** A social media post plus any attached media URLs, as fetched for moderation. */
data class SocialContent(
    val id: String,
    val text: String,
    // Media lists default to empty so a text-only post needs no extra arguments.
    val imageUrls: List<String> = emptyList(),
    val videoUrls: List<String> = emptyList(),
    val audioUrls: List<String> = emptyList(),
)

/** Moderation findings for a post's text: summary, violation list, and severity. */
data class TextModerationResult(val summary: String, val violations: List<String>, val severity: Severity)

/** Moderation findings for a post's images. */
data class ImageModerationResult(
    val summary: String,
    val violations: List<String>,
    val severity: Severity,
) {
    companion object {
        /** Neutral result used when the post carries no images. */
        fun empty() = ImageModerationResult(summary = "No images", violations = emptyList(), severity = Severity.NONE)
    }
}

/** Moderation findings for a post's videos. */
data class VideoModerationResult(
    val summary: String,
    val violations: List<String>,
    val severity: Severity,
) {
    companion object {
        /** Neutral result used when the post carries no videos. */
        fun empty() = VideoModerationResult(summary = "No videos", violations = emptyList(), severity = Severity.NONE)
    }
}

/** Final verdict on a post: the action to take, the rationale, and model confidence. */
data class ModerationDecision(val action: ModerationAction, val reason: String, val confidence: Double)

/** Possible outcomes of a moderation decision. */
enum class ModerationAction {
    APPROVE,
    FLAG,
    REMOVE,
}

/** Violation severity scale, ordered from harmless to worst. */
enum class Severity {
    NONE,
    LOW,
    MEDIUM,
    HIGH,
    CRITICAL,
}

Key Concepts

MultimodalContent Types

  • Text - Text content (String)
  • Image - Image content (URL + MIME type)
  • Audio - Audio content (URL + MIME type)
  • Video - Video content (URL + MIME type)
  • File - File content (path + MIME type)

Factory Methods

  • MultimodalContent.text(content) - Create text content
  • MultimodalContent.image(url, mimeType) - Create image content
  • MultimodalContent.audio(url, mimeType) - Create audio content
  • MultimodalContent.video(url, mimeType) - Create video content
  • MultimodalContent.file(path, mimeType) - Create file content

Common MIME Types

Images: image/jpeg, image/png, image/gif, image/webp, image/bmp

Audio: audio/mpeg (MP3), audio/wav, audio/ogg, audio/flac, audio/aac

Video: video/mp4, video/webm, video/quicktime (MOV), video/x-msvideo (AVI)

Documents: application/pdf, application/vnd.ms-excel, application/vnd.openxmlformats-officedocument.wordprocessingml.document

Best Practices

  1. Use Appropriate Models - Use multimodal-capable models (Gemini 2.5 Pro, GPT-4 Vision)
  2. Specify MIME Types - Always specify correct MIME types for content
  3. Combine Content Types - Mix text with media for better context
  4. Sequence Matters - Order content logically (context, then media, then questions)
  5. Handle Large Files - Be mindful of file size limits
  6. Validate URLs - Ensure media URLs are accessible
  7. Track Progress - Update progress for long multimodal operations
  8. Error Handling - Handle media loading errors gracefully
  9. Security - Validate and sanitize media URLs
  10. Performance - Cache analyzed media when possible

See Also

  • Defining Actions - Create actions with multimodal content
  • Creating Tools - Create tools that handle multimodal content
  • Goal Achievement - Define goals for multimodal processing
  • Human-in-the-Loop - Combine HITL with multimodal content
tessl i tessl/maven-com-embabel-agent--embabel-agent-starter@0.3.1

docs

api-annotations.md

api-domain-model.md

api-invocation.md

api-tools.md

concepts-actions.md

concepts-agents.md

concepts-goals.md

concepts-invocation.md

concepts-tools.md

guides-creating-agents.md

guides-creating-tools.md

guides-defining-actions.md

guides-goal-achievement.md

guides-human-in-loop.md

guides-multimodal.md

index.md

integration-mcp.md

integration-model-providers.md

integration-spring-boot.md

LlmTool.md

quickstart.md

reference-component-scanning.md

reference-configuration-properties.md

reference-installation.md

reference-logging.md

reference-resilience.md

reference-streaming.md

tile.json