tessl/maven-org-springframework-ai--spring-ai-ollama

Spring Boot-compatible Ollama integration providing ChatModel and EmbeddingModel implementations for running large language models locally with support for streaming, tool calling, model management, and observability.

docs/reference/chat-model.md

Chat Model

The OllamaChatModel class provides the main implementation of Spring AI's ChatModel interface for Ollama, enabling chat completions with support for streaming, tool calling, multimodal inputs, and thinking models.

Core API

package org.springframework.ai.ollama;

public class OllamaChatModel implements ChatModel {
    // Constructors
    public OllamaChatModel(OllamaApi ollamaApi, OllamaChatOptions defaultOptions,
        ToolCallingManager toolCallingManager, ObservationRegistry observationRegistry,
        ModelManagementOptions modelManagementOptions);

    public OllamaChatModel(OllamaApi ollamaApi, OllamaChatOptions defaultOptions,
        ToolCallingManager toolCallingManager, ObservationRegistry observationRegistry,
        ModelManagementOptions modelManagementOptions,
        ToolExecutionEligibilityPredicate toolExecutionEligibilityPredicate,
        RetryTemplate retryTemplate);

    // Factory method
    public static Builder builder();

    // Chat operations
    public ChatResponse call(Prompt prompt);
    public Flux<ChatResponse> stream(Prompt prompt);

    // Configuration
    public ChatOptions getDefaultOptions();
    public void setObservationConvention(ChatModelObservationConvention observationConvention);

    // Metadata conversion (internal utility)
    public static ChatResponseMetadata from(OllamaApi.ChatResponse response, ChatResponse previousChatResponse);
}

Builder API

public static final class Builder {
    public Builder ollamaApi(OllamaApi ollamaApi);
    public Builder defaultOptions(OllamaChatOptions defaultOptions);
    public Builder toolCallingManager(ToolCallingManager toolCallingManager);
    public Builder toolExecutionEligibilityPredicate(ToolExecutionEligibilityPredicate predicate);
    public Builder observationRegistry(ObservationRegistry observationRegistry);
    public Builder modelManagementOptions(ModelManagementOptions modelManagementOptions);
    public Builder retryTemplate(RetryTemplate retryTemplate);
    public OllamaChatModel build();
}

Builder Defaults:

  • defaultOptions: OllamaChatOptions with model set to OllamaModel.MISTRAL.id() ("mistral") if not provided
  • toolCallingManager: Internal DEFAULT_TOOL_CALLING_MANAGER implementation if not provided
  • toolExecutionEligibilityPredicate: DefaultToolExecutionEligibilityPredicate if not provided
  • observationRegistry: ObservationRegistry.NOOP if not provided
  • modelManagementOptions: ModelManagementOptions.defaults() if not provided
  • retryTemplate: RetryUtils.DEFAULT_RETRY_TEMPLATE if not provided

Note: the builder never leaves the model unset; if no default options are supplied, it falls back to MISTRAL.
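
For comparison, a minimal sketch that relies entirely on those defaults; only the API client is supplied:

// Uses the "mistral" model, a no-op observation registry, and the
// default tool-calling, retry, and model-management behavior
OllamaChatModel defaultModel = OllamaChatModel.builder()
    .ollamaApi(api)
    .build();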

Usage

Basic Chat

import org.springframework.ai.ollama.OllamaChatModel;
import org.springframework.ai.ollama.api.OllamaApi;
import org.springframework.ai.ollama.api.OllamaChatOptions;
import org.springframework.ai.ollama.api.OllamaModel;
import org.springframework.ai.chat.prompt.Prompt;
import org.springframework.ai.chat.model.ChatResponse;

// Create API client
OllamaApi api = OllamaApi.builder()
    .baseUrl("http://localhost:11434")
    .build();

// Build chat model with default options
OllamaChatModel chatModel = OllamaChatModel.builder()
    .ollamaApi(api)
    .defaultOptions(OllamaChatOptions.builder()
        .model(OllamaModel.LLAMA3.id())
        .temperature(0.7)
        .topP(0.9)
        .build())
    .build();

// Simple synchronous call
ChatResponse response = chatModel.call(new Prompt("What is Spring AI?"));
String content = response.getResult().getOutput().getText();
System.out.println(content);

Streaming Chat

import reactor.core.publisher.Flux;

// Streaming call
Flux<ChatResponse> stream = chatModel.stream(new Prompt("Tell me a long story"));

// Subscribe to stream
stream.subscribe(chunk -> {
    String content = chunk.getResult().getOutput().getText();
    System.out.print(content);
});

// Or block and collect
String fullResponse = stream
    .map(chunk -> chunk.getResult().getOutput().getText())
    .collect(StringBuilder::new, StringBuilder::append)
    .block()
    .toString();

Runtime Options Override

import org.springframework.ai.chat.prompt.Prompt;

// Override default options for a specific request
OllamaChatOptions requestOptions = OllamaChatOptions.builder()
    .temperature(0.9)
    .numPredict(500) // Ollama's equivalent of a max-output-tokens limit
    .build();

Prompt prompt = new Prompt("Be creative!", requestOptions);
ChatResponse response = chatModel.call(prompt);

Conversation History

import org.springframework.ai.chat.messages.Message;
import org.springframework.ai.chat.messages.SystemMessage;
import org.springframework.ai.chat.messages.UserMessage;
import org.springframework.ai.chat.messages.AssistantMessage;
import java.util.List;

List<Message> messages = List.of(
    new SystemMessage("You are a helpful assistant."),
    new UserMessage("What is the capital of France?"),
    new AssistantMessage("The capital of France is Paris."),
    new UserMessage("What about Germany?")
);

Prompt prompt = new Prompt(messages);
ChatResponse response = chatModel.call(prompt);

Model Management Integration

import org.springframework.ai.ollama.management.ModelManagementOptions;
import org.springframework.ai.ollama.management.PullModelStrategy;
import java.time.Duration;
import java.util.List;

// Configure automatic model management
ModelManagementOptions modelMgmt = ModelManagementOptions.builder()
    .pullModelStrategy(PullModelStrategy.WHEN_MISSING)
    .additionalModels(List.of("llama3", "mistral"))
    .timeout(Duration.ofMinutes(10))
    .maxRetries(3)
    .build();

OllamaChatModel chatModel = OllamaChatModel.builder()
    .ollamaApi(api)
    .defaultOptions(OllamaChatOptions.builder()
        .model("llama3")
        .build())
    .modelManagementOptions(modelMgmt)
    .build();

// Model will be automatically pulled if not available

Observability

import io.micrometer.observation.ObservationRegistry;
import org.springframework.ai.chat.observation.ChatModelObservationConvention;

// Set up observability
ObservationRegistry registry = ObservationRegistry.create();

OllamaChatModel chatModel = OllamaChatModel.builder()
    .ollamaApi(api)
    .defaultOptions(options)
    .observationRegistry(registry)
    .build();

// Optional: custom observation convention
chatModel.setObservationConvention(new CustomChatModelObservationConvention());

// Metrics will be automatically tracked:
// - Token usage (prompt, completion, total)
// - Duration (total, load, prompt eval, generation)
// - Model information
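
These metrics are only recorded if the ObservationRegistry is wired to a Micrometer MeterRegistry. A minimal sketch, assuming micrometer-core is on the classpath:

import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.observation.DefaultMeterObservationHandler;
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;

// Bridge observations to a meter registry so timers and counters are published
MeterRegistry meterRegistry = new SimpleMeterRegistry();
registry.observationConfig()
    .observationHandler(new DefaultMeterObservationHandler(meterRegistry));

// After some chat calls, inspect the recorded meters
meterRegistry.getMeters().forEach(meter -> System.out.println(meter.getId()));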

Retry Configuration

import org.springframework.retry.support.RetryTemplate;
import org.springframework.retry.policy.SimpleRetryPolicy;
import org.springframework.retry.backoff.ExponentialBackOffPolicy;

// Configure custom retry behavior
ExponentialBackOffPolicy backOffPolicy = new ExponentialBackOffPolicy();
backOffPolicy.setInitialInterval(1000);
backOffPolicy.setMultiplier(2.0);
backOffPolicy.setMaxInterval(10000);

SimpleRetryPolicy retryPolicy = new SimpleRetryPolicy();
retryPolicy.setMaxAttempts(3);

RetryTemplate retryTemplate = new RetryTemplate();
retryTemplate.setRetryPolicy(retryPolicy);
retryTemplate.setBackOffPolicy(backOffPolicy);

OllamaChatModel chatModel = OllamaChatModel.builder()
    .ollamaApi(api)
    .defaultOptions(options)
    .retryTemplate(retryTemplate)
    .build();

Return Types

ChatResponse

// From Spring AI Core
public class ChatResponse implements ModelResponse<Generation> {
    public Generation getResult();        // first (or only) generation
    public List<Generation> getResults();
    public ChatResponseMetadata getMetadata();
}

public class Generation {
    public AssistantMessage getOutput();
    public ChatGenerationMetadata getGenerationMetadata();
}

// Metadata includes:
// - Token usage (prompt, completion, total)
// - Model name
// - Finish reason
// - Custom metadata (created-at, durations, eval-count, etc.)
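
The examples in this document use getResult(), which returns the first generation; getResults() exposes the full list. A minimal sketch iterating over it:

for (Generation generation : response.getResults()) {
    System.out.println(generation.getOutput().getText());
    System.out.println(generation.getGenerationMetadata().getFinishReason());
}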

Features

Synchronous and Streaming

  • Synchronous: call(Prompt) returns complete ChatResponse
  • Streaming: stream(Prompt) returns Flux<ChatResponse> for token-by-token streaming

Token-by-Token Streaming

Streaming responses emit multiple ChatResponse chunks as tokens are generated:

Flux<ChatResponse> stream = chatModel.stream(prompt);
stream.subscribe(chunk -> {
    // Each chunk contains one or more tokens
    String content = chunk.getResult().getOutput().getText();
    // Print immediately for real-time output
    System.out.print(content);
});

Tool Calling Support

The chat model supports automatic tool/function calling. Configure tools via OllamaChatOptions:

// Tools are configured in options (see tool-calling.md for details)
OllamaChatOptions options = OllamaChatOptions.builder()
    .model("llama3")
    .toolCallbacks(List.of(weatherTool, calculatorTool))
    .build();
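
A hedged sketch of building such a callback with Spring AI's FunctionToolCallback; the weather function, record types, and description are illustrative assumptions:

import org.springframework.ai.tool.ToolCallback;
import org.springframework.ai.tool.function.FunctionToolCallback;
import java.util.function.Function;

// Hypothetical input/output types for the illustrative weather tool
record WeatherRequest(String city) {}
record WeatherResponse(double temperatureCelsius) {}

// Stubbed lookup standing in for a real weather service
Function<WeatherRequest, WeatherResponse> weatherFunction =
    request -> new WeatherResponse(21.5);

ToolCallback weatherTool = FunctionToolCallback.builder("currentWeather", weatherFunction)
    .description("Returns the current temperature for a city")
    .inputType(WeatherRequest.class)
    .build();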

Multimodal Support

Vision models can process images alongside text:

import org.springframework.ai.chat.messages.UserMessage;
import org.springframework.ai.content.Media;
import org.springframework.core.io.FileSystemResource;
import org.springframework.util.MimeTypeUtils;
import java.util.List;

// Create a user message with an attached image
UserMessage message = UserMessage.builder()
    .text("What's in this image?")
    .media(new Media(MimeTypeUtils.IMAGE_JPEG, new FileSystemResource("image.jpg")))
    .build();

Prompt prompt = new Prompt(List.of(message));
ChatResponse response = chatModel.call(prompt);

Thinking Models

Reasoning models can provide thinking traces:

import org.springframework.ai.ollama.api.ThinkOption;

OllamaChatOptions options = OllamaChatOptions.builder()
    .model(OllamaModel.QWEN3_4B_THINKING.id())
    .thinkOption(ThinkOption.ThinkBoolean.ENABLED)
    .build();

ChatResponse response = chatModel.call(new Prompt("Solve this logic puzzle...", options));

// Thinking trace is in generation metadata
String thinking = response.getResult()
    .getGenerationMetadata()
    .get("thinking");

JSON Output

Request structured JSON output:

OllamaChatOptions options = OllamaChatOptions.builder()
    .model("llama3")
    .format("json")
    .build();

Prompt prompt = new Prompt("List three colors as JSON array", options);
ChatResponse response = chatModel.call(prompt);
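
The returned text can then be bound to Java types; a minimal sketch using Jackson, assuming the model returned a bare JSON array such as ["red","green","blue"]:

import com.fasterxml.jackson.databind.ObjectMapper;

ObjectMapper mapper = new ObjectMapper();
String json = response.getResult().getOutput().getText();
try {
    String[] colors = mapper.readValue(json, String[].class);
    System.out.println(colors.length + " colors parsed");
} catch (com.fasterxml.jackson.core.JsonProcessingException e) {
    // The model produced malformed JSON; handle or re-prompt
}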

Stop Sequences

Control when generation stops:

OllamaChatOptions options = OllamaChatOptions.builder()
    .model("llama3")
    .stop(List.of("\n\n", "END"))
    .build();

Configuration Options

The chat model behavior is controlled by OllamaChatOptions. See Chat Options Documentation for complete details on all available parameters, including the groups below; a combined sketch follows the list:

  • Model selection and format
  • Generation parameters (temperature, top-p, etc.)
  • GPU and memory management
  • Tool calling configuration
  • Thinking/reasoning options
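
A combined sketch touching several of these groups; availability varies by model, and the builder method names below are assumed to follow the corresponding Ollama parameter names:

OllamaChatOptions tuned = OllamaChatOptions.builder()
    .model("llama3")
    .temperature(0.2)      // generation parameters
    .topK(40)
    .topP(0.9)
    .repeatPenalty(1.1)    // discourage repetition
    .numCtx(8192)          // context window size
    .keepAlive("10m")      // keep the model loaded between requests
    .build();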

Metadata and Usage Information

The ChatResponse includes rich metadata:

ChatResponse response = chatModel.call(prompt);

// Usage information
ChatResponseMetadata metadata = response.getMetadata();
Integer promptTokens = metadata.getUsage().getPromptTokens();
Integer completionTokens = metadata.getUsage().getCompletionTokens();
Integer totalTokens = metadata.getUsage().getTotalTokens();

// Model information
String modelName = metadata.getModel();

// Generation metadata
String finishReason = response.getResult().getGenerationMetadata().getFinishReason();

// Custom metadata (Ollama-specific)
Instant createdAt = (Instant) metadata.get("created-at");
Duration totalDuration = (Duration) metadata.get("total-duration");
Duration loadDuration = (Duration) metadata.get("load-duration");
Duration promptEvalDuration = (Duration) metadata.get("prompt-eval-duration");
Duration evalDuration = (Duration) metadata.get("eval-duration");
Integer evalCount = (Integer) metadata.get("eval-count");
Integer promptEvalCount = (Integer) metadata.get("prompt-eval-count");

Metadata Keys:

  • "created-at" - Timestamp when the response was created (Instant)
  • "total-duration" - Total request duration (Duration)
  • "load-duration" - Model load duration (Duration)
  • "prompt-eval-duration" - Prompt evaluation duration (Duration)
  • "eval-duration" - Generation duration (Duration)
  • "eval-count" - Number of tokens generated (Integer)
  • "prompt-eval-count" - Number of tokens in prompt (Integer)

Error Handling

The chat model uses Spring Retry for automatic retry on transient failures. Errors are propagated as exceptions:

import org.springframework.web.client.RestClientException;

try {
    ChatResponse response = chatModel.call(prompt);
} catch (RestClientException e) {
    // Handle connection/HTTP errors from the Ollama server
} catch (Exception e) {
    // Handle other errors
}
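
Streaming calls report failures through the Flux rather than a thrown exception; a minimal sketch of reactive error handling:

Flux<ChatResponse> stream = chatModel.stream(prompt);
stream
    .doOnError(e -> System.err.println("Stream failed: " + e.getMessage()))
    .onErrorResume(e -> Flux.empty()) // degrade to an empty stream
    .subscribe(chunk -> System.out.print(chunk.getResult().getOutput().getText()));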

Thread Safety

OllamaChatModel instances are thread-safe and can be reused across multiple requests. It's recommended to create a single instance and share it.
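
In a Spring application, the natural way to share one instance is a singleton bean; a minimal sketch (the configuration class name is illustrative, and an OllamaApi bean is assumed to be defined elsewhere):

import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
class ChatModelConfig {

    // Singleton by default: one OllamaChatModel shared by all injection points
    @Bean
    OllamaChatModel chatModel(OllamaApi ollamaApi) {
        return OllamaChatModel.builder()
            .ollamaApi(ollamaApi)
            .defaultOptions(OllamaChatOptions.builder()
                .model(OllamaModel.LLAMA3.id())
                .build())
            .build();
    }
}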

Related Documentation

  • Chat Options Documentation - complete reference for OllamaChatOptions parameters
  • tool-calling.md - defining and registering tool callbacks