Spring Boot-compatible Ollama integration providing ChatModel and EmbeddingModel implementations for running large language models locally with support for streaming, tool calling, model management, and observability.
Monitor and measure Ollama model performance with Micrometer integration.
Spring AI Ollama integrates with the Micrometer Observation API to provide metrics, tracing, and monitoring for chat and embedding operations, letting you track token usage, latency, errors, and model performance.
Include Micrometer in your project:
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-core</artifactId>
</dependency>
<!-- For Prometheus -->
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-registry-prometheus</artifactId>
</dependency>

Create an ObservationRegistry, optionally backed by a MeterRegistry:

import io.micrometer.observation.ObservationRegistry;
import io.micrometer.core.instrument.observation.DefaultMeterObservationHandler;
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
// Simple registry for testing
ObservationRegistry observationRegistry = ObservationRegistry.create();
// Or with metrics backend
SimpleMeterRegistry meterRegistry = new SimpleMeterRegistry();
ObservationRegistry observationRegistry = ObservationRegistry.create();
observationRegistry.observationConfig()
.observationHandler(new DefaultMeterObservationHandler(meterRegistry));

Pass the registry to the chat model builder to enable observability:

OllamaChatModel chatModel = OllamaChatModel.builder()
.ollamaApi(ollamaApi)
.defaultOptions(OllamaChatOptions.builder()
.model(OllamaModel.LLAMA3)
.build())
.observationRegistry(observationRegistry) // Enable observability
.build();

The embedding model is configured the same way:

OllamaEmbeddingModel embeddingModel = OllamaEmbeddingModel.builder()
.ollamaApi(ollamaApi)
.defaultOptions(OllamaEmbeddingOptions.builder()
.model(OllamaModel.NOMIC_EMBED_TEXT)
.build())
.observationRegistry(observationRegistry) // Enable observability
.build();

Chat operations record the following metrics:

gen_ai.client.token.usage
- Tags: gen_ai.token.type=input|output
- Description: Number of tokens used
- Type: Counter
gen_ai.client.operation.duration
- Tags: gen_ai.operation.name=chat
- Description: Operation duration in seconds
- Type: Timer
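With a MeterRegistry attached (the meterRegistry from the setup above), these metrics can also be read programmatically. A minimal sketch, assuming the metric and tag names listed above are registered unchanged:

import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.Timer;
import java.util.concurrent.TimeUnit;

// Look up the input-token counter; null until at least one chat call has been observed
Counter inputTokens = meterRegistry.find("gen_ai.client.token.usage")
.tag("gen_ai.token.type", "input")
.counter();

// Look up the chat operation timer
Timer chatTimer = meterRegistry.find("gen_ai.client.operation.duration")
.tag("gen_ai.operation.name", "chat")
.timer();

if (inputTokens != null && chatTimer != null) {
System.out.printf("input tokens=%.0f, mean latency=%.0f ms%n",
inputTokens.count(), chatTimer.mean(TimeUnit.MILLISECONDS));
}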
Ollama provides detailed timing information; access it via the response metadata:
ChatResponse response = chatModel.call(new Prompt("Hello"));
ChatResponseMetadata metadata = response.getMetadata();
Duration totalDuration = metadata.get("total-duration");
Duration loadDuration = metadata.get("load-duration");
Duration promptEvalDuration = metadata.get("prompt-eval-duration");
Duration evalDuration = metadata.get("eval-duration");
Integer promptTokens = metadata.get("prompt-eval-count");
Integer completionTokens = metadata.get("eval-count");

Embedding operations record the following metrics:

gen_ai.client.operation.duration
- Tags: gen_ai.operation.name=embed
- Description: Embedding operation duration
- Type: Timer
gen_ai.client.token.usage
- Tags: gen_ai.token.type=input
- Description: Input tokens processed
- Type: Counter

Implement ChatModelObservationConvention to customize observations:
import org.springframework.ai.chat.observation.ChatModelObservationContext;
import org.springframework.ai.chat.observation.ChatModelObservationConvention;
import io.micrometer.common.KeyValues;
public class CustomChatModelObservationConvention
implements ChatModelObservationConvention {
@Override
public String getName() {
return "ai.ollama.chat";
}
@Override
public String getContextualName(ChatModelObservationContext context) {
return "chat " + context.getRequest().getInstructions().get(0).getText();
}
@Override
public KeyValues getLowCardinalityKeyValues(ChatModelObservationContext context) {
return KeyValues.of(
"ai.provider", "ollama",
"ai.model", context.getRequest().getOptions().getModel(),
"ai.operation", "chat"
);
}
@Override
public KeyValues getHighCardinalityKeyValues(ChatModelObservationContext context) {
return KeyValues.of(
"ai.prompt", context.getRequest().getInstructions().get(0).getText(),
"ai.response.id", context.getResponse().getResult().getOutput().getMetadata().get("id")
);
}
}

Register the custom convention on the chat model:

CustomChatModelObservationConvention customConvention =
new CustomChatModelObservationConvention();
chatModel.setObservationConvention(customConvention);

A complete Spring configuration that wires metrics, the ObservationRegistry, and the chat model:

@Configuration
public class ObservabilityConfig {
@Bean
public MeterRegistry meterRegistry() {
return new PrometheusMeterRegistry(PrometheusConfig.DEFAULT);
}
@Bean
public ObservationRegistry observationRegistry(MeterRegistry meterRegistry) {
ObservationRegistry registry = ObservationRegistry.create();
// Connect to metrics
registry.observationConfig()
.observationHandler(new DefaultMeterObservationHandler(meterRegistry));
// Add tracing (if using distributed tracing)
// registry.observationConfig()
// .observationHandler(new TracingAwareMeterObservationHandler<>(...));
return registry;
}
@Bean
public OllamaChatModel chatModel(
OllamaApi ollamaApi,
ObservationRegistry observationRegistry) {
return OllamaChatModel.builder()
.ollamaApi(ollamaApi)
.defaultOptions(OllamaChatOptions.builder()
.model(OllamaModel.LLAMA3)
.temperature(0.7)
.build())
.observationRegistry(observationRegistry)
.build();
}
}
// Expose Prometheus endpoint
@RestController
public class MetricsController {
private final PrometheusMeterRegistry registry;
public MetricsController(PrometheusMeterRegistry registry) {
this.registry = registry;
}
@GetMapping("/metrics")
public String metrics() {
return registry.scrape();
}
}

You can also register a custom ObservationHandler, for example to log observation lifecycle events:

import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
import java.time.Instant;
public class LoggingObservationHandler implements ObservationHandler<Observation.Context> {
private static final Logger logger = LoggerFactory.getLogger(LoggingObservationHandler.class);
@Override
public boolean supportsContext(Observation.Context context) {
return true;
}
@Override
public void onStart(Observation.Context context) {
// Record the start time on the context so onStop can compute the duration
context.put("start", Instant.now());
logger.info("Observation started: {}", context.getName());
}
@Override
public void onStop(Observation.Context context) {
Instant start = context.get("start");
long durationMs = (start != null) ? Duration.between(start, Instant.now()).toMillis() : -1;
logger.info("Observation stopped: {} (duration: {}ms)", context.getName(), durationMs);
}
@Override
public void onError(Observation.Context context) {
logger.error("Observation error: {}", context.getName(), context.getError());
}
}
// Register handler
observationRegistry.observationConfig()
.observationHandler(new LoggingObservationHandler());

Track cumulative token usage at the application level:

@Service
public class TokenUsageTracker {
private static final Logger logger = LoggerFactory.getLogger(TokenUsageTracker.class);
private final OllamaChatModel chatModel;
private final AtomicLong totalPromptTokens = new AtomicLong(0);
private final AtomicLong totalCompletionTokens = new AtomicLong(0);
public TokenUsageTracker(OllamaChatModel chatModel) {
this.chatModel = chatModel;
}
public String chat(String message) {
ChatResponse response = chatModel.call(new Prompt(message));
// Extract token usage
ChatResponseMetadata metadata = response.getMetadata();
Integer promptTokens = metadata.get("prompt-eval-count");
Integer completionTokens = metadata.get("eval-count");
if (promptTokens != null) {
totalPromptTokens.addAndGet(promptTokens);
}
if (completionTokens != null) {
totalCompletionTokens.addAndGet(completionTokens);
}
logger.info("Request - Prompt tokens: {}, Completion tokens: {}",
promptTokens, completionTokens);
logger.info("Total - Prompt tokens: {}, Completion tokens: {}",
totalPromptTokens.get(), totalCompletionTokens.get());
return response.getResult().getOutput().getText();
}
public record TokenStats(long promptTokens, long completionTokens, long totalTokens) {}
public TokenStats getStats() {
long prompt = totalPromptTokens.get();
long completion = totalCompletionTokens.get();
return new TokenStats(prompt, completion, prompt + completion);
}
}
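If a MeterRegistry is available, the running totals in TokenUsageTracker can also be exposed as gauges. A sketch of an alternative constructor, assuming a MeterRegistry bean is injected and using hypothetical metric names (ollama.tokens.prompt.total, ollama.tokens.completion.total):

import io.micrometer.core.instrument.Gauge;
import io.micrometer.core.instrument.MeterRegistry;
import java.util.concurrent.atomic.AtomicLong;

public TokenUsageTracker(OllamaChatModel chatModel, MeterRegistry meterRegistry) {
this.chatModel = chatModel;
// Gauges read the AtomicLong totals on every scrape, so no extra bookkeeping is needed
Gauge.builder("ollama.tokens.prompt.total", totalPromptTokens, AtomicLong::get)
.description("Cumulative prompt tokens across all chat calls")
.register(meterRegistry);
Gauge.builder("ollama.tokens.completion.total", totalCompletionTokens, AtomicLong::get)
.description("Cumulative completion tokens across all chat calls")
.register(meterRegistry);
}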
Measure per-request latency together with Ollama's own timing breakdown:

@Service
public class PerformanceMonitor {
private static final Logger logger = LoggerFactory.getLogger(PerformanceMonitor.class);
private final OllamaChatModel chatModel;
private final List<Duration> responseTimes = new CopyOnWriteArrayList<>();
public PerformanceMonitor(OllamaChatModel chatModel) {
this.chatModel = chatModel;
}
public String chat(String message) {
Instant start = Instant.now();
ChatResponse response = chatModel.call(new Prompt(message));
Duration responseTime = Duration.between(start, Instant.now());
responseTimes.add(responseTime);
// Get detailed timing from metadata
ChatResponseMetadata metadata = response.getMetadata();
Duration totalDuration = metadata.get("total-duration");
Duration loadDuration = metadata.get("load-duration");
Duration promptEvalDuration = metadata.get("prompt-eval-duration");
Duration evalDuration = metadata.get("eval-duration");
logger.info("Performance breakdown:");
logger.info(" Total: {}", totalDuration);
logger.info(" Load: {}", loadDuration);
logger.info(" Prompt Eval: {}", promptEvalDuration);
logger.info(" Eval: {}", evalDuration);
return response.getResult().getOutput().getText();
}
public record PerformanceStats(
Duration avgResponseTime,
Duration minResponseTime,
Duration maxResponseTime,
long requestCount
) {}
public PerformanceStats getStats() {
if (responseTimes.isEmpty()) {
return new PerformanceStats(Duration.ZERO, Duration.ZERO, Duration.ZERO, 0);
}
Duration avg = Duration.ofNanos(
responseTimes.stream()
.mapToLong(Duration::toNanos)
.sum() / responseTimes.size()
);
Duration min = responseTimes.stream()
.min(Duration::compareTo)
.orElse(Duration.ZERO);
Duration max = responseTimes.stream()
.max(Duration::compareTo)
.orElse(Duration.ZERO);
return new PerformanceStats(avg, min, max, responseTimes.size());
}
}

Track success and error counts to monitor reliability:

@Service
public class ErrorRateMonitor {
private static final Logger logger = LoggerFactory.getLogger(ErrorRateMonitor.class);
private final OllamaChatModel chatModel;
private final AtomicLong successCount = new AtomicLong(0);
private final AtomicLong errorCount = new AtomicLong(0);
public ErrorRateMonitor(OllamaChatModel chatModel) {
this.chatModel = chatModel;
}
public String chat(String message) {
try {
ChatResponse response = chatModel.call(new Prompt(message));
successCount.incrementAndGet();
return response.getResult().getOutput().getText();
} catch (Exception e) {
errorCount.incrementAndGet();
logger.error("Chat error", e);
throw e;
}
}
public record ErrorStats(
long successCount,
long errorCount,
double errorRate
) {}
public ErrorStats getStats() {
long success = successCount.get();
long errors = errorCount.get();
long total = success + errors;
double errorRate = total > 0 ? (errors * 100.0) / total : 0.0;
return new ErrorStats(success, errors, errorRate);
}
}

Use consistent tags for filtering and aggregation:
KeyValues.of(
"ai.provider", "ollama",
"ai.model", modelName,
"ai.operation", operationType,
"environment", environment,
"application", applicationName
);

Keep low-cardinality tags (safe for metrics) separate from high-cardinality values (better suited to traces and logs):

@Override
public KeyValues getLowCardinalityKeyValues(ChatModelObservationContext context) {
return KeyValues.of(
"ai.provider", "ollama",
"ai.model", extractModelName(context),
"ai.operation", "chat"
);
}
@Override
public KeyValues getHighCardinalityKeyValues(ChatModelObservationContext context) {
return KeyValues.of(
"ai.prompt.length", String.valueOf(getPromptLength(context)),
"ai.user.id", extractUserId(context)
);
}
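The helper methods referenced above (extractModelName, getPromptLength, extractUserId) are not provided by Spring AI; they are placeholders for your own logic. One possible sketch, added to the same convention class, with the user id resolution left as a stub:

private String extractModelName(ChatModelObservationContext context) {
// Use the per-request model if set, otherwise a stable fallback value
var options = context.getRequest().getOptions();
return (options != null && options.getModel() != null) ? options.getModel() : "unknown";
}

private long getPromptLength(ChatModelObservationContext context) {
// Total character count across all messages in the prompt
return context.getRequest().getInstructions().stream()
.mapToLong(message -> message.getText() != null ? message.getText().length() : 0)
.sum();
}

private String extractUserId(ChatModelObservationContext context) {
// Hypothetical: resolve the caller from your own request-scoped context
return "anonymous";
}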
For high-traffic applications, consider sampling:

ObservationRegistry registry = ObservationRegistry.create();
// Sample 10% of observations
registry.observationConfig()
.observationPredicate((name, context) -> Math.random() < 0.1);

With a Prometheus registry, the exported metrics look similar to this:

# HELP gen_ai_client_token_usage_total
# TYPE gen_ai_client_token_usage_total counter
gen_ai_client_token_usage_total{ai_model="llama3",ai_operation="chat",ai_provider="ollama",gen_ai_token_type="input"} 1247.0
gen_ai_client_token_usage_total{ai_model="llama3",ai_operation="chat",ai_provider="ollama",gen_ai_token_type="output"} 892.0
# HELP gen_ai_client_operation_duration_seconds
# TYPE gen_ai_client_operation_duration_seconds histogram
gen_ai_client_operation_duration_seconds_bucket{ai_model="llama3",ai_operation="chat",le="0.5"} 12.0
gen_ai_client_operation_duration_seconds_bucket{ai_model="llama3",ai_operation="chat",le="1.0"} 45.0
gen_ai_client_operation_duration_seconds_bucket{ai_model="llama3",ai_operation="chat",le="2.0"} 89.0
gen_ai_client_operation_duration_seconds_sum{ai_model="llama3",ai_operation="chat"} 125.4
gen_ai_client_operation_duration_seconds_count{ai_model="llama3",ai_operation="chat"} 100.0

Useful PromQL queries include the following.

Average response time:
rate(gen_ai_client_operation_duration_seconds_sum{ai_operation="chat"}[5m])
/
rate(gen_ai_client_operation_duration_seconds_count{ai_operation="chat"}[5m])

Token usage rate:
rate(gen_ai_client_token_usage_total{gen_ai_token_type="output"}[5m])

P95 response time:
histogram_quantile(0.95,
rate(gen_ai_client_operation_duration_seconds_bucket{ai_operation="chat"}[5m])
)

For distributed tracing, add a tracing bridge and a span reporter:

<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-tracing-bridge-brave</artifactId>
</dependency>
<dependency>
<groupId>io.zipkin.reporter2</groupId>
<artifactId>zipkin-reporter-brave</artifactId>
</dependency>

Then register a tracing handler on the ObservationRegistry:

@Configuration
public class TracingConfig {
@Bean
public ObservationRegistry observationRegistry(Tracer tracer) {
ObservationRegistry registry = ObservationRegistry.create();
// Add tracing handler
registry.observationConfig()
.observationHandler(new DefaultTracingObservationHandler(tracer));
return registry;
}
}

OllamaChatModel defines the following metadata key constants (from lines 97-112 of OllamaChatModel.java):
// Metadata key constants used by OllamaChatModel
private static final String METADATA_PROMPT_EVAL_COUNT = "prompt-eval-count";
private static final String METADATA_EVAL_COUNT = "eval-count";
private static final String METADATA_CREATED_AT = "created-at";
private static final String METADATA_TOTAL_DURATION = "total-duration";
private static final String METADATA_LOAD_DURATION = "load-duration";
private static final String METADATA_PROMPT_EVAL_DURATION = "prompt-eval-duration";
private static final String METADATA_EVAL_DURATION = "eval-duration";

These constants are used internally to populate the ChatResponseMetadata. When accessing metadata, use the string keys directly:
ChatResponseMetadata metadata = response.getMetadata();
// Timing metadata (as Duration objects)
Duration totalDuration = metadata.get("total-duration");
Duration loadDuration = metadata.get("load-duration");
Duration promptEvalDuration = metadata.get("prompt-eval-duration");
Duration evalDuration = metadata.get("eval-duration");
// Token counts
Integer promptEvalCount = metadata.get("prompt-eval-count");
Integer evalCount = metadata.get("eval-count");
// Other metadata
Instant createdAt = metadata.get("created-at");
Boolean done = metadata.get("done");EmbeddingResponseMetadata metadata = response.getMetadata();
// Timing
Duration totalDuration = metadata.get("total-duration");
Duration loadDuration = metadata.get("load-duration");
// Token count
Integer promptEvalCount = metadata.get("prompt-eval-count");

If no ObservationRegistry is supplied to the builders, they default to ObservationRegistry.NOOP, which disables observations.
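To make that explicit, a minimal sketch that disables observability for a model by passing ObservationRegistry.NOOP, using the same builder shown earlier:

// No metrics, traces, or observation handlers are invoked for this model
OllamaChatModel silentChatModel = OllamaChatModel.builder()
.ollamaApi(ollamaApi)
.defaultOptions(OllamaChatOptions.builder()
.model(OllamaModel.LLAMA3)
.build())
.observationRegistry(ObservationRegistry.NOOP)
.build();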