Spring Boot-compatible Ollama integration providing ChatModel and EmbeddingModel implementations for running large language models locally with support for streaming, tool calling, model management, and observability.
This guide will help you get started with Spring AI Ollama in minutes.
Ollama Server: Install and run Ollama locally

```bash
# Download from https://ollama.ai
# Or use brew on macOS
brew install ollama

# Start Ollama server
ollama serve
```

Java 17+: Spring AI Ollama requires Java 17 or later
Maven/Gradle: Add the dependency to your project

```xml
<dependency>
    <groupId>org.springframework.ai</groupId>
    <artifactId>spring-ai-ollama</artifactId>
    <version>1.1.2</version>
</dependency>
```

```groovy
implementation 'org.springframework.ai:spring-ai-ollama:1.1.2'
```

Before using Ollama, pull a model:
```bash
# Pull Llama 3 (recommended for general use)
ollama pull llama3

# Or pull a smaller model for testing
ollama pull qwen3:0.6b

# For embeddings
ollama pull nomic-embed-text
```
A minimal synchronous chat example:

```java
import org.springframework.ai.ollama.OllamaChatModel;
import org.springframework.ai.ollama.api.OllamaApi;
import org.springframework.ai.ollama.api.OllamaChatOptions;
import org.springframework.ai.ollama.api.OllamaModel;
import org.springframework.ai.chat.prompt.Prompt;
import org.springframework.ai.chat.model.ChatResponse;

public class SimpleChatExample {

    public static void main(String[] args) {
        // 1. Create API client
        OllamaApi api = OllamaApi.builder()
            .baseUrl("http://localhost:11434")
            .build();

        // 2. Build chat model
        OllamaChatModel chatModel = OllamaChatModel.builder()
            .ollamaApi(api)
            .defaultOptions(OllamaChatOptions.builder()
                .model(OllamaModel.LLAMA3.id())
                .temperature(0.7)
                .build())
            .build();

        // 3. Send a message
        ChatResponse response = chatModel.call(new Prompt("What is Spring AI?"));

        // 4. Get the response
        String answer = response.getResult().getOutput().getText();
        System.out.println(answer);
    }
}
```
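Default options can also be overridden per request by passing options on the `Prompt` itself (the two-argument `Prompt(String, ChatOptions)` constructor comes from core Spring AI). A minimal sketch, continuing from the `chatModel` built above:

```java
// Per-request options override the model's defaults for this call only;
// here we lower the temperature for a more deterministic reply.
ChatResponse focused = chatModel.call(new Prompt(
        "Answer in one sentence: what is Ollama?",
        OllamaChatOptions.builder()
                .temperature(0.1)
                .build()));
System.out.println(focused.getResult().getOutput().getText());
```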
Streaming the response with Reactor:

```java
import reactor.core.publisher.Flux;
import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.ai.chat.prompt.Prompt;
import org.springframework.ai.ollama.OllamaChatModel;
import org.springframework.ai.ollama.api.OllamaApi;
import org.springframework.ai.ollama.api.OllamaChatOptions;
import org.springframework.ai.ollama.api.OllamaModel;

public class StreamingChatExample {

    public static void main(String[] args) {
        // Setup (same as above)
        OllamaApi api = OllamaApi.builder().build();
        OllamaChatModel chatModel = OllamaChatModel.builder()
            .ollamaApi(api)
            .defaultOptions(OllamaChatOptions.builder()
                .model(OllamaModel.LLAMA3.id())
                .build())
            .build();

        // Stream the response
        Flux<ChatResponse> stream = chatModel.stream(new Prompt("Tell me a short story"));
        stream.subscribe(
            chunk -> System.out.print(chunk.getResult().getOutput().getText()),
            error -> System.err.println("Error: " + error),
            () -> System.out.println("\n[Complete]")
        );

        // Keep main thread alive for streaming
        try {
            Thread.sleep(10000);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}
```
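If you need the full text after streaming (for tests or logging), the same pipeline can be collapsed into a single string with standard Reactor operators; a sketch reusing the `chatModel` from the example above:

```java
// Collect the streamed chunks into one String. block() is fine for demos
// and tests, but avoid calling it on reactive request-handling threads.
String fullReply = chatModel.stream(new Prompt("Tell me a short story"))
        .map(chunk -> chunk.getResult().getOutput().getText())
        .collect(java.util.stream.Collectors.joining())
        .block();
System.out.println(fullReply);
```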
Generating embeddings:

```java
import org.springframework.ai.ollama.OllamaEmbeddingModel;
import org.springframework.ai.ollama.api.OllamaApi;
import org.springframework.ai.ollama.api.OllamaEmbeddingOptions;
import org.springframework.ai.ollama.api.OllamaModel;
import org.springframework.ai.embedding.EmbeddingRequest;
import org.springframework.ai.embedding.EmbeddingResponse;
import org.springframework.ai.embedding.EmbeddingOptions;
import java.util.List;

public class EmbeddingExample {

    public static void main(String[] args) {
        // 1. Create API client
        OllamaApi api = OllamaApi.builder().build();

        // 2. Build embedding model
        OllamaEmbeddingModel embeddingModel = OllamaEmbeddingModel.builder()
            .ollamaApi(api)
            .defaultOptions(OllamaEmbeddingOptions.builder()
                .model(OllamaModel.NOMIC_EMBED_TEXT.id())
                .build())
            .build();

        // 3. Generate single embedding
        float[] embedding = embeddingModel.embed("Hello, world!");
        System.out.println("Embedding dimension: " + embedding.length);

        // 4. Generate batch embeddings
        List<String> texts = List.of(
            "First document",
            "Second document",
            "Third document"
        );
        EmbeddingRequest request = new EmbeddingRequest(texts, EmbeddingOptions.EMPTY);
        EmbeddingResponse response = embeddingModel.call(request);
        System.out.println("Generated " + response.getResults().size() + " embeddings");
    }
}
```
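Embedding vectors are usually compared with cosine similarity. The helper below is plain Java over the `float[]` vectors returned above, not part of the Spring AI API:

```java
// Cosine similarity: dot(a, b) / (|a| * |b|), in [-1, 1]; higher means more
// semantically similar. Assumes both vectors have the same dimension.
static double cosineSimilarity(float[] a, float[] b) {
    double dot = 0.0, normA = 0.0, normB = 0.0;
    for (int i = 0; i < a.length; i++) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
```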
Configure automatic model pulling:

```java
import org.springframework.ai.ollama.management.ModelManagementOptions;
import org.springframework.ai.ollama.management.PullModelStrategy;
import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.ai.chat.prompt.Prompt;
import org.springframework.ai.ollama.OllamaChatModel;
import org.springframework.ai.ollama.api.OllamaApi;
import org.springframework.ai.ollama.api.OllamaChatOptions;
import java.time.Duration;

public class AutoPullExample {

    public static void main(String[] args) {
        OllamaApi api = OllamaApi.builder().build();

        // Configure auto-pull
        ModelManagementOptions modelMgmt = ModelManagementOptions.builder()
            .pullModelStrategy(PullModelStrategy.WHEN_MISSING)
            .timeout(Duration.ofMinutes(10))
            .build();

        // Model will be pulled automatically if not available
        OllamaChatModel chatModel = OllamaChatModel.builder()
            .ollamaApi(api)
            .defaultOptions(OllamaChatOptions.builder()
                .model("llama3")
                .build())
            .modelManagementOptions(modelMgmt)
            .build();

        // Use the model (will auto-pull if missing)
        ChatResponse response = chatModel.call(new Prompt("Hello!"));
        System.out.println(response.getResult().getOutput().getText());
    }
}
```
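`PullModelStrategy` also offers `ALWAYS` and `NEVER` (to my reading of the core Spring AI model-management API, `NEVER` is the default; verify against your version):

```java
// ALWAYS re-pulls on startup, keeping a moving tag like "llama3" up to date
// at the cost of startup time; NEVER fails fast if the model is absent.
ModelManagementOptions.builder()
        .pullModelStrategy(PullModelStrategy.ALWAYS)
        .timeout(Duration.ofMinutes(10))
        .build();
```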
In a Spring Boot application, wire the models as beans:

```java
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.ai.ollama.OllamaChatModel;
import org.springframework.ai.ollama.OllamaEmbeddingModel;
import org.springframework.ai.ollama.api.OllamaApi;
import org.springframework.ai.ollama.api.OllamaChatOptions;
import org.springframework.ai.ollama.api.OllamaEmbeddingOptions;
import org.springframework.ai.ollama.api.OllamaModel;

@Configuration
public class OllamaConfig {

    @Bean
    public OllamaApi ollamaApi() {
        return OllamaApi.builder()
            .baseUrl("http://localhost:11434")
            .build();
    }

    @Bean
    public OllamaChatModel chatModel(OllamaApi ollamaApi) {
        return OllamaChatModel.builder()
            .ollamaApi(ollamaApi)
            .defaultOptions(OllamaChatOptions.builder()
                .model(OllamaModel.LLAMA3.id())
                .temperature(0.7)
                .build())
            .build();
    }

    @Bean
    public OllamaEmbeddingModel embeddingModel(OllamaApi ollamaApi) {
        return OllamaEmbeddingModel.builder()
            .ollamaApi(ollamaApi)
            .defaultOptions(OllamaEmbeddingOptions.builder()
                .model(OllamaModel.NOMIC_EMBED_TEXT.id())
                .build())
            .build();
    }
}
```
Inject the chat model into a service:

```java
import org.springframework.stereotype.Service;
import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.ai.chat.prompt.Prompt;
import org.springframework.ai.ollama.OllamaChatModel;
import reactor.core.publisher.Flux;

@Service
public class ChatService {

    private final OllamaChatModel chatModel;

    public ChatService(OllamaChatModel chatModel) {
        this.chatModel = chatModel;
    }

    public String chat(String userMessage) {
        ChatResponse response = chatModel.call(new Prompt(userMessage));
        return response.getResult().getOutput().getText();
    }

    public Flux<String> chatStream(String userMessage) {
        return chatModel.stream(new Prompt(userMessage))
            .map(chunk -> chunk.getResult().getOutput().getText());
    }
}
```
And expose it through a REST controller:

```java
import org.springframework.web.bind.annotation.*;
import reactor.core.publisher.Flux;

@RestController
@RequestMapping("/api/chat")
public class ChatController {

    private final ChatService chatService;

    public ChatController(ChatService chatService) {
        this.chatService = chatService;
    }

    @PostMapping
    public String chat(@RequestBody String message) {
        return chatService.chat(message);
    }

    @PostMapping("/stream")
    public Flux<String> chatStream(@RequestBody String message) {
        return chatService.chatStream(message);
    }
}
```
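The feature list at the top mentions tool calling. The sketch below shows the usual shape with Spring AI's tool API; the `@Tool` annotation, `ToolCallbacks.from(...)`, and the `toolCallbacks(...)` builder method are assumptions taken from the core Spring AI model module, so check them against the version on your classpath:

```java
import org.springframework.ai.chat.model.ChatResponse;
import org.springframework.ai.chat.prompt.Prompt;
import org.springframework.ai.ollama.OllamaChatModel;
import org.springframework.ai.ollama.api.OllamaApi;
import org.springframework.ai.ollama.api.OllamaChatOptions;
import org.springframework.ai.support.ToolCallbacks;
import org.springframework.ai.tool.annotation.Tool;

public class ToolCallingExample {

    // A plain object whose annotated methods the model is allowed to call.
    static class DateTimeTools {
        @Tool(description = "Returns the current date and time")
        public String currentDateTime() {
            return java.time.ZonedDateTime.now().toString();
        }
    }

    public static void main(String[] args) {
        OllamaChatModel chatModel = OllamaChatModel.builder()
                .ollamaApi(OllamaApi.builder().build())
                .defaultOptions(OllamaChatOptions.builder()
                        .model("llama3.1") // assumes a tool-capable model tag
                        .build())
                .build();

        // Register the tool for this request; the model decides when to call it.
        ChatResponse response = chatModel.call(new Prompt(
                "What time is it right now?",
                OllamaChatOptions.builder()
                        .toolCallbacks(ToolCallbacks.from(new DateTimeTools()))
                        .build()));

        System.out.println(response.getResult().getOutput().getText());
    }
}
```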
Common problems and fixes:

Problem: Cannot connect to Ollama server

Solution:

```bash
# Check if Ollama is running
curl http://localhost:11434/api/version

# Start Ollama if not running
ollama serve
```

Problem: 404 error when calling the model

Solution:

```bash
# Pull the model first
ollama pull llama3
```

```java
// Or use auto-pull in code
ModelManagementOptions.builder()
        .pullModelStrategy(PullModelStrategy.WHEN_MISSING)
        .build()
```

Problem: Model too large for available RAM

Solution:

```java
// Use a smaller model
.model(OllamaModel.QWEN_3_06B.id()) // 0.6B parameters

// Or use CPU-only mode
OllamaChatOptions.builder()
        .model("llama3")
        .numGPU(0) // CPU only
        .build()
```