Spring Boot-compatible Ollama integration providing ChatModel and EmbeddingModel implementations for running large language models locally with support for streaming, tool calling, model management, and observability.
This document covers edge cases, boundary conditions, and advanced usage scenarios for Spring AI Ollama.
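Most snippets below assume an already configured OllamaApi and OllamaChatModel (and, for the embedding examples, an OllamaEmbeddingModel built against the same OllamaApi). A minimal setup sketch, assuming a local Ollama server on the default port and an already pulled llama3 model, looks like this:
// Minimal sketch: connect to a local Ollama server and build a chat model
// (assumes Ollama is running on localhost:11434 and llama3 has been pulled)
OllamaApi ollamaApi = OllamaApi.builder()
    .baseUrl("http://localhost:11434")
    .build();
OllamaChatModel chatModel = OllamaChatModel.builder()
    .ollamaApi(ollamaApi)
    .defaultOptions(OllamaChatOptions.builder()
        .model(OllamaModel.LLAMA3.id())
        .build())
    .build();
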
// Empty string prompt
try {
ChatResponse response = chatModel.call(new Prompt(""));
// May return empty or error depending on model
} catch (Exception e) {
// Handle error
}
// Null prompt (throws exception)
try {
ChatResponse response = chatModel.call(new Prompt((String) null)); // cast disambiguates the Prompt overloads
} catch (IllegalArgumentException e) {
// Expected: null prompt not allowed
}

// Empty message list
try {
Prompt prompt = new Prompt(List.of());
ChatResponse response = chatModel.call(prompt);
} catch (IllegalArgumentException e) {
// Expected: at least one message required
}

// Null model (uses default)
OllamaChatOptions options = OllamaChatOptions.builder()
.model(null) // Will use default model
.temperature(0.7)
.build();
// Null temperature (uses default)
OllamaChatOptions options = OllamaChatOptions.builder()
.model("llama3")
.temperature(null) // Will use default temperature (0.8)
.build();

// Generate very long prompt
StringBuilder longPrompt = new StringBuilder();
for (int i = 0; i < 10000; i++) {
longPrompt.append("This is a very long text. ");
}
// Option 1: Enable truncation (default)
OllamaChatOptions options = OllamaChatOptions.builder()
.model("llama3")
.truncate(true) // Auto-truncate to context length
.build();
ChatResponse response = chatModel.call(new Prompt(longPrompt.toString(), options));
// Option 2: Increase context window
OllamaChatOptions largeContextOptions = OllamaChatOptions.builder()
.model("llama3")
.numCtx(8192) // Increase from default 2048
.build();
// Option 3: Disable truncation (will error if too long)
OllamaChatOptions strictOptions = OllamaChatOptions.builder()
.model("llama3")
.truncate(false)
.build();
try {
response = chatModel.call(new Prompt(longPrompt.toString(), strictOptions));
} catch (Exception e) {
// Handle context length exceeded error
}

// Trimming conversation history to stay within a rough token budget
public class ConversationManager {
private final int MAX_HISTORY_TOKENS = 3000;
private final List<Message> history = new ArrayList<>();
public void addMessage(Message message) {
history.add(message);
// Estimate tokens (rough: 1 token ≈ 4 characters)
int estimatedTokens = history.stream()
.mapToInt(m -> m.getContent().length() / 4)
.sum();
// Trim if exceeds limit
while (estimatedTokens > MAX_HISTORY_TOKENS && history.size() > 2) {
// Keep system message, remove oldest user/assistant messages
if (!(history.get(1) instanceof SystemMessage)) {
history.remove(1);
estimatedTokens = history.stream()
.mapToInt(m -> m.getContent().length() / 4)
.sum();
} else {
break;
}
}
}
public List<Message> getHistory() {
return new ArrayList<>(history);
}
}

@Service
public class ConcurrentChatService {
// Single shared instance (thread-safe)
private final OllamaChatModel chatModel;
public ConcurrentChatService(OllamaApi ollamaApi) {
this.chatModel = OllamaChatModel.builder()
.ollamaApi(ollamaApi)
.defaultOptions(OllamaChatOptions.builder()
.model(OllamaModel.LLAMA3.id())
.build())
.build();
}
public String chat(String message) {
// Safe to call from multiple threads
ChatResponse response = chatModel.call(new Prompt(message));
return response.getResult().getOutput().getContent();
}
}
// Usage from multiple threads
ExecutorService executor = Executors.newFixedThreadPool(10);
ConcurrentChatService service = new ConcurrentChatService(ollamaApi);
List<Future<String>> futures = new ArrayList<>();
for (int i = 0; i < 100; i++) {
final int index = i;
Future<String> future = executor.submit(() ->
service.chat("Question " + index)
);
futures.add(future);
}
// Wait for all to complete
for (Future<String> future : futures) {
String response = future.get();
}

// Embedding many texts concurrently (parallel stream and reactive variants)
public class ParallelBatchProcessor {
private final OllamaEmbeddingModel embeddingModel;
public List<float[]> processInParallel(List<String> texts, int parallelism) {
return texts.parallelStream()
.map(text -> {
try {
return embeddingModel.embed(text);
} catch (Exception e) {
logger.error("Failed to embed text", e);
return null;
}
})
.filter(Objects::nonNull)
.toList();
}
public Flux<float[]> processReactive(List<String> texts, int concurrency) {
return Flux.fromIterable(texts)
.flatMap(text ->
// Run the blocking embed call on a worker thread so that up to
// 'concurrency' embeddings can be in flight at once
Mono.fromCallable(() -> embeddingModel.embed(text))
.subscribeOn(Schedulers.boundedElastic()),
concurrency
);
}
}

// Collecting a streamed response with an overall timeout
public class RobustStreamingHandler {
public String handleStreamWithTimeout(OllamaChatModel chatModel, String prompt, Duration timeout) {
StringBuilder result = new StringBuilder();
CountDownLatch latch = new CountDownLatch(1);
AtomicReference<Throwable> error = new AtomicReference<>();
Flux<ChatResponse> stream = chatModel.stream(new Prompt(prompt));
stream.subscribe(
chunk -> result.append(chunk.getResult().getOutput().getContent()),
err -> {
error.set(err);
latch.countDown();
},
latch::countDown
);
try {
if (!latch.await(timeout.toMillis(), TimeUnit.MILLISECONDS)) {
return result.toString() + " [TIMEOUT]";
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return result.toString() + " [INTERRUPTED]";
}
if (error.get() != null) {
throw new RuntimeException("Stream error", error.get());
}
return result.toString();
}
}

import org.springframework.ai.ollama.api.OllamaApiHelper;

// Accumulating streaming chunks into a single response
public class StreamMerger {
public OllamaApi.ChatResponse accumulateStream(Flux<OllamaApi.ChatResponse> stream) {
// reduce() without an initial seed: the first chunk acts as the seed and
// later chunks are merged into it (Reactor rejects a null initial value)
return stream.reduce(OllamaApiHelper::merge).block();
}
public void processStreamWithMerging(OllamaApi ollamaApi, ChatRequest request) {
Flux<OllamaApi.ChatResponse> stream = ollamaApi.streamingChat(request);
OllamaApi.ChatResponse accumulated = null;
for (OllamaApi.ChatResponse chunk : stream.toIterable()) {
if (accumulated == null) {
accumulated = chunk;
} else {
accumulated = OllamaApiHelper.merge(accumulated, chunk);
}
// Check if streaming is done
if (OllamaApiHelper.isStreamingDone(chunk)) {
// Final chunk includes complete metadata
System.out.println("Total tokens: " + chunk.evalCount());
break;
}
// Check for tool calls
if (OllamaApiHelper.isStreamingToolCall(chunk)) {
// Handle tool call
}
}
}
}

// Providing a fallback response when tool execution fails
public class RobustToolCalling {
public String chatWithToolErrorHandling(OllamaChatModel chatModel, String message) {
try {
ChatResponse response = chatModel.call(new Prompt(message));
return response.getResult().getOutput().getContent();
} catch (Exception e) {
if (e.getMessage() != null && e.getMessage().contains("tool execution failed")) {
// Tool failed - provide fallback response
return "I encountered an error accessing external tools. " +
"Please try again or rephrase your question.";
}
throw e;
}
}
}

// Manual tool execution with validation (automatic execution disabled)
public class ValidatingToolExecutor {
public ChatResponse executeToolsManually(
OllamaChatModel chatModel,
String message,
Predicate<ToolCall> validator
) {
// Disable auto-execution
OllamaChatOptions options = OllamaChatOptions.builder()
.internalToolExecutionEnabled(false)
.build();
ChatResponse response = chatModel.call(new Prompt(message, options));
AssistantMessage assistantMessage = response.getResult().getOutput();
// Check for tool calls
List<ToolCall> toolCalls = assistantMessage.getToolCalls();
if (toolCalls == null || toolCalls.isEmpty()) {
return response;
}
// Validate and execute tools
List<Message> messages = new ArrayList<>();
messages.add(new UserMessage(message));
messages.add(assistantMessage);
for (ToolCall toolCall : toolCalls) {
if (!validator.test(toolCall)) {
// Tool not allowed
messages.add(new ToolResponseMessage(List.of(new ToolResponseMessage.ToolResponse(
toolCall.id(), toolCall.name(), "{\"error\": \"Tool not allowed\"}"))));
continue;
}
try {
// Execute tool
String result = executeToolSafely(toolCall);
messages.add(new ToolResponseMessage(List.of(new ToolResponseMessage.ToolResponse(
toolCall.id(), toolCall.name(), result))));
} catch (Exception e) {
messages.add(new ToolResponseMessage(List.of(new ToolResponseMessage.ToolResponse(
toolCall.id(), toolCall.name(), "{\"error\": \"" + e.getMessage() + "\"}"))));
}
}
// Continue conversation with tool results
return chatModel.call(new Prompt(messages));
}
private String executeToolSafely(ToolCall toolCall) {
// Execute with timeout and error handling
// ... implementation
return "{}";
}
}

// Reporting progress while pulling a model
public class ModelPullTracker {
private final OllamaApi ollamaApi;
public void pullModelWithProgress(String modelName) {
PullModelRequest request = new PullModelRequest(modelName);
Flux<ProgressResponse> progress = ollamaApi.pullModel(request);
AtomicLong lastCompleted = new AtomicLong(0);
progress.subscribe(
p -> {
if (p.total() != null && p.completed() != null) {
double percent = (p.completed() * 100.0) / p.total();
// Only log on significant progress
if (p.completed() - lastCompleted.get() > p.total() / 20) {
System.out.printf("Progress: %.1f%% - %s%n", percent, p.status());
lastCompleted.set(p.completed());
}
} else {
System.out.println("Status: " + p.status());
}
},
error -> System.err.println("Pull failed: " + error.getMessage()),
() -> System.out.println("Pull complete!")
);
}
}

// Bounding a model pull with a timeout
public class TimeoutAwareModelManager {
public boolean pullModelWithTimeout(
OllamaModelManager manager,
String modelName,
Duration timeout
) {
ExecutorService executor = Executors.newSingleThreadExecutor();
Future<Void> future = executor.submit(() -> {
manager.pullModel(modelName, PullModelStrategy.ALWAYS);
return null;
});
try {
future.get(timeout.toMillis(), TimeUnit.MILLISECONDS);
return true;
} catch (TimeoutException e) {
future.cancel(true);
logger.error("Model pull timed out after {}", timeout);
return false;
} catch (Exception e) {
logger.error("Model pull failed", e);
return false;
} finally {
executor.shutdown();
}
}
}

// Pulling models that are missing or older than 30 days
public class ModelVersionManager {
public void ensureModelVersion(OllamaApi ollamaApi, String modelName) {
// Check if model exists
ListModelResponse response = ollamaApi.listModels();
Optional<Model> existing = response.models().stream()
.filter(m -> m.name().startsWith(modelName))
.findFirst();
if (existing.isPresent()) {
Model model = existing.get();
Instant modifiedAt = model.modifiedAt();
// Check if model is old (e.g., > 30 days)
if (modifiedAt.isBefore(Instant.now().minus(Duration.ofDays(30)))) {
logger.info("Model {} is outdated, pulling latest", modelName);
// Pull latest version
PullModelRequest request = new PullModelRequest(modelName);
ollamaApi.pullModel(request).blockLast();
}
} else {
// Model doesn't exist, pull it
PullModelRequest request = new PullModelRequest(modelName);
ollamaApi.pullModel(request).blockLast();
}
}
}

@Service
public class ResilientChatService {
private final OllamaChatModel primaryModel;
private final OllamaChatModel fallbackModel;
public ResilientChatService(OllamaApi ollamaApi) {
// Primary: Large model
this.primaryModel = OllamaChatModel.builder()
.ollamaApi(ollamaApi)
.defaultOptions(OllamaChatOptions.builder()
.model(OllamaModel.LLAMA3.id())
.build())
.build();
// Fallback: Smaller, faster model
this.fallbackModel = OllamaChatModel.builder()
.ollamaApi(ollamaApi)
.defaultOptions(OllamaChatOptions.builder()
.model(OllamaModel.QWEN_3_06B.id())
.build())
.build();
}
public String chat(String message) {
try {
return chatWithPrimary(message);
} catch (HttpClientErrorException.NotFound e) {
logger.warn("Primary model not found, using fallback");
return chatWithFallback(message);
} catch (HttpServerErrorException e) {
logger.warn("Primary model error, using fallback");
return chatWithFallback(message);
} catch (ResourceAccessException e) {
logger.warn("Network error with primary, using fallback");
return chatWithFallback(message);
}
}
private String chatWithPrimary(String message) {
ChatResponse response = primaryModel.call(new Prompt(message));
return response.getResult().getOutput().getContent();
}
private String chatWithFallback(String message) {
ChatResponse response = fallbackModel.call(new Prompt(message));
return response.getResult().getOutput().getContent();
}
}

// A simple circuit breaker around chat calls
public class CircuitBreakerChatService {
private final OllamaChatModel chatModel;
private final AtomicInteger failureCount = new AtomicInteger(0);
private final AtomicBoolean circuitOpen = new AtomicBoolean(false);
private final int FAILURE_THRESHOLD = 5;
private final Duration RESET_TIMEOUT = Duration.ofMinutes(1);
private Instant lastFailureTime;
public Optional<String> chat(String message) {
// Check circuit breaker
if (circuitOpen.get()) {
if (Duration.between(lastFailureTime, Instant.now()).compareTo(RESET_TIMEOUT) > 0) {
// Try to reset circuit
circuitOpen.set(false);
failureCount.set(0);
} else {
return Optional.empty(); // Circuit open
}
}
try {
ChatResponse response = chatModel.call(new Prompt(message));
// Success - reset failure count
failureCount.set(0);
return Optional.of(response.getResult().getOutput().getContent());
} catch (Exception e) {
// Failure - increment counter
int failures = failureCount.incrementAndGet();
lastFailureTime = Instant.now();
if (failures >= FAILURE_THRESHOLD) {
circuitOpen.set(true);
logger.error("Circuit breaker opened after {} failures", failures);
}
return Optional.empty();
}
}
public boolean isCircuitOpen() {
return circuitOpen.get();
}
}

// Option presets for low-memory (CPU-only) and GPU-optimized setups
public class GPUMemoryManager {
public OllamaChatModel createMemoryEfficientModel(OllamaApi ollamaApi) {
return OllamaChatModel.builder()
.ollamaApi(ollamaApi)
.defaultOptions(OllamaChatOptions.builder()
.model("llama3")
.numGPU(0) // CPU-only
.lowVRAM(true) // Enable low VRAM mode
.numCtx(2048) // Smaller context
.numBatch(256) // Smaller batch size
.keepAlive("1m") // Unload quickly
.build())
.build();
}
public OllamaChatModel createGPUOptimizedModel(OllamaApi ollamaApi) {
return OllamaChatModel.builder()
.ollamaApi(ollamaApi)
.defaultOptions(OllamaChatOptions.builder()
.model("llama3")
.numGPU(-1) // Use all GPU layers
.numCtx(8192) // Large context
.numBatch(1024) // Large batch
.useMLock(true) // Lock in RAM
.keepAlive("30m") // Keep loaded
.build())
.build();
}
}

// Deleting the oldest models to stay under a disk-space budget
public class DiskSpaceManager {
public void cleanupOldModels(OllamaApi ollamaApi, long maxTotalSizeBytes) {
ListModelResponse response = ollamaApi.listModels();
// Sort by last modified (oldest first)
List<Model> sortedModels = response.models().stream()
.sorted(Comparator.comparing(Model::modifiedAt))
.toList();
long totalSize = sortedModels.stream()
.mapToLong(Model::size)
.sum();
// Delete oldest models until under limit
OllamaModelManager manager = new OllamaModelManager(ollamaApi);
for (Model model : sortedModels) {
if (totalSize <= maxTotalSizeBytes) {
break;
}
logger.info("Deleting old model: {} (size: {} MB)",
model.name(), model.size() / (1024 * 1024));
manager.deleteModel(model.name());
totalSize -= model.size();
}
}
}

// Retrying transient connection failures with exponential backoff
public class ExponentialBackoffRetry {
public <T> T executeWithRetry(
Supplier<T> operation,
int maxAttempts,
Duration initialDelay
) {
int attempt = 0;
Duration delay = initialDelay;
while (attempt < maxAttempts) {
try {
return operation.get();
} catch (ResourceAccessException e) {
attempt++;
if (attempt >= maxAttempts) {
throw new RuntimeException("Max retry attempts exceeded", e);
}
logger.warn("Attempt {} failed, retrying in {}", attempt, delay);
try {
Thread.sleep(delay.toMillis());
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
throw new RuntimeException("Interrupted during retry", ie);
}
// Exponential backoff
delay = delay.multipliedBy(2);
}
}
throw new RuntimeException("Should not reach here");
}
// Usage
public String chatWithRetry(OllamaChatModel chatModel, String message) {
return executeWithRetry(
() -> {
ChatResponse response = chatModel.call(new Prompt(message));
return response.getResult().getOutput().getContent();
},
5, // Max 5 attempts
Duration.ofSeconds(1) // Start with 1 second
);
}
}

// Customizing the underlying RestClient and WebClient
public class ConnectionPoolManager {
public OllamaApi createPooledApi(int maxConnections) {
// Customize the RestClient used for blocking calls (note: maxConnections is not
// applied here; an actual connection cap belongs on the underlying HTTP client)
RestClient.Builder restClientBuilder = RestClient.builder()
.requestInterceptor((request, body, execution) -> {
// Ask the server to keep the connection open for reuse
request.getHeaders().add("Connection", "keep-alive");
return execution.execute(request, body);
});
// Customize the WebClient used for streaming (raise the in-memory buffer for large responses)
WebClient.Builder webClientBuilder = WebClient.builder()
.codecs(configurer ->
configurer.defaultCodecs().maxInMemorySize(16 * 1024 * 1024)
);
return OllamaApi.builder()
.baseUrl("http://localhost:11434")
.restClientBuilder(restClientBuilder)
.webClientBuilder(webClientBuilder)
.build();
}
}

// Enforcing a hard timeout on blocking chat calls
public class TimeoutHandler {
public Optional<String> chatWithTimeout(
OllamaChatModel chatModel,
String message,
Duration timeout
) {
ExecutorService executor = Executors.newSingleThreadExecutor();
Future<String> future = executor.submit(() -> {
ChatResponse response = chatModel.call(new Prompt(message));
return response.getResult().getOutput().getContent();
});
try {
String result = future.get(timeout.toMillis(), TimeUnit.MILLISECONDS);
return Optional.of(result);
} catch (TimeoutException e) {
future.cancel(true);
logger.error("Request timed out after {}", timeout);
return Optional.empty();
} catch (Exception e) {
logger.error("Request failed", e);
return Optional.empty();
} finally {
executor.shutdown();
}
}
}

// Returning the partial response when streaming times out
public class StreamingTimeoutHandler {
public String streamWithTimeout(
OllamaChatModel chatModel,
String message,
Duration timeout
) {
StringBuilder result = new StringBuilder();
CountDownLatch latch = new CountDownLatch(1);
AtomicBoolean timedOut = new AtomicBoolean(false);
Flux<ChatResponse> stream = chatModel.stream(new Prompt(message))
.timeout(timeout)
.onErrorResume(TimeoutException.class, e -> {
timedOut.set(true);
return Flux.empty();
});
stream.subscribe(
chunk -> result.append(chunk.getResult().getOutput().getContent()),
error -> latch.countDown(),
latch::countDown
);
try {
latch.await();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
if (timedOut.get()) {
return result.toString() + " [TIMEOUT - Partial response]";
}
return result.toString();
}
}

// Test maximum token generation
OllamaChatOptions options = OllamaChatOptions.builder()
.model("llama3")
.numPredict(-1) // Generate until context fills or stop sequence
.build();
// Or use -2 to fill entire context
OllamaChatOptions fillContextOptions = OllamaChatOptions.builder()
.model("llama3")
.numPredict(-2) // Fill context window
.numCtx(4096)
.build();

// Minimum temperature (deterministic)
OllamaChatOptions deterministicOptions = OllamaChatOptions.builder()
.model("llama3")
.temperature(0.0)
.seed(42) // Fixed seed for reproducibility
.build();
// Maximum temperature (highly creative)
OllamaChatOptions creativeOptions = OllamaChatOptions.builder()
.model("llama3")
.temperature(2.0)
.build();
// Invalid temperature (will use default or error)
try {
OllamaChatOptions invalidOptions = OllamaChatOptions.builder()
.model("llama3")
.temperature(-1.0) // Invalid
.build();
} catch (Exception e) {
// Handle validation error
}

// Minimum context
OllamaChatOptions minContextOptions = OllamaChatOptions.builder()
.model("llama3")
.numCtx(128) // Very small context
.build();
// Maximum context (model-dependent)
OllamaChatOptions maxContextOptions = OllamaChatOptions.builder()
.model(OllamaModel.MISTRAL_NEMO.id())
.numCtx(131072) // 128k tokens
.build();

// Comparing response content, latency, and token usage across models
public class ModelComparator {
public record ComparisonResult(
String modelName,
String response,
Duration responseTime,
Integer tokenCount
) {}
public List<ComparisonResult> compareModels(
OllamaApi ollamaApi,
String prompt,
List<OllamaModel> models
) {
List<ComparisonResult> results = new ArrayList<>();
for (OllamaModel model : models) {
OllamaChatModel chatModel = OllamaChatModel.builder()
.ollamaApi(ollamaApi)
.defaultOptions(OllamaChatOptions.builder()
.model(model.id())
.temperature(0.7)
.build())
.build();
Instant start = Instant.now();
try {
ChatResponse response = chatModel.call(new Prompt(prompt));
Duration responseTime = Duration.between(start, Instant.now());
results.add(new ComparisonResult(
model.id(),
response.getResult().getOutput().getContent(),
responseTime,
response.getMetadata().getUsage().getTotalTokens()
));
} catch (Exception e) {
logger.error("Model {} failed: {}", model.id(), e.getMessage());
}
}
return results;
}
}

// Choosing the best locally available model at runtime
public class DynamicModelSelector {
private final OllamaApi ollamaApi;
public String chatWithBestAvailableModel(String message) {
// Try models in order of preference
List<String> preferredModels = List.of(
"llama3:70b",
"llama3",
"mistral",
"qwen3:0.6b"
);
OllamaModelManager manager = new OllamaModelManager(ollamaApi);
for (String modelName : preferredModels) {
if (manager.isModelAvailable(modelName)) {
OllamaChatModel chatModel = OllamaChatModel.builder()
.ollamaApi(ollamaApi)
.defaultOptions(OllamaChatOptions.builder()
.model(modelName)
.build())
.build();
try {
ChatResponse response = chatModel.call(new Prompt(message));
return response.getResult().getOutput().getContent();
} catch (Exception e) {
logger.warn("Model {} failed, trying next", modelName);
continue;
}
}
}
throw new RuntimeException("No available models");
}
}

For more examples, see:
tessl i tessl/maven-org-springframework-ai--spring-ai-ollama@1.1.1