Spring Boot-compatible Ollama integration providing ChatModel and EmbeddingModel implementations for running large language models locally with support for streaming, tool calling, model management, and observability.
The OllamaEmbeddingModel class provides the implementation of Spring AI's EmbeddingModel interface for Ollama, enabling generation of vector embeddings for text documents.
package org.springframework.ai.ollama;
public class OllamaEmbeddingModel extends AbstractEmbeddingModel {
// Constructor
public OllamaEmbeddingModel(OllamaApi ollamaApi, OllamaEmbeddingOptions defaultOptions,
ObservationRegistry observationRegistry, ModelManagementOptions modelManagementOptions);
// Factory method
public static Builder builder();
// Embedding operations
public float[] embed(Document document);
public EmbeddingResponse call(EmbeddingRequest request);
// Configuration
public void setObservationConvention(EmbeddingModelObservationConvention observationConvention);
}

public static final class Builder {
public Builder ollamaApi(OllamaApi ollamaApi);
public Builder defaultOptions(OllamaEmbeddingOptions defaultOptions);
public Builder observationRegistry(ObservationRegistry observationRegistry);
public Builder modelManagementOptions(ModelManagementOptions modelManagementOptions);
public OllamaEmbeddingModel build();
}

Builder Defaults:
- defaultOptions: OllamaEmbeddingOptions with model set to OllamaModel.MXBAI_EMBED_LARGE.id() ("mxbai-embed-large") if not provided
- observationRegistry: ObservationRegistry.NOOP if not provided
- modelManagementOptions: ModelManagementOptions.defaults() if not provided
Note: The default embedding model is MXBAI_EMBED_LARGE (mxbai-embed-large).
import org.springframework.ai.ollama.OllamaEmbeddingModel;
import org.springframework.ai.ollama.api.OllamaApi;
import org.springframework.ai.ollama.api.OllamaEmbeddingOptions;
import org.springframework.ai.ollama.api.OllamaModel;
// Create API client
OllamaApi api = OllamaApi.builder()
.baseUrl("http://localhost:11434")
.build();
// Build embedding model
OllamaEmbeddingModel embeddingModel = OllamaEmbeddingModel.builder()
.ollamaApi(api)
.defaultOptions(OllamaEmbeddingOptions.builder()
.model(OllamaModel.NOMIC_EMBED_TEXT.id())
.build())
.build();
// Generate single embedding
float[] embedding = embeddingModel.embed("Hello world");
System.out.println("Embedding dimension: " + embedding.length);

import org.springframework.ai.document.Document;
// Create document
Document document = new Document("This is my document content");
// Generate embedding
float[] embedding = embeddingModel.embed(document);

import org.springframework.ai.embedding.EmbeddingRequest;
import org.springframework.ai.embedding.EmbeddingResponse;
import org.springframework.ai.embedding.Embedding;
import java.util.List;
// Multiple texts
List<String> texts = List.of(
"First document",
"Second document",
"Third document"
);
// Create request
EmbeddingRequest request = new EmbeddingRequest(texts, EmbeddingOptions.EMPTY);
// Generate embeddings
EmbeddingResponse response = embeddingModel.call(request);
// Access individual embeddings
for (Embedding emb : response.getResults()) {
float[] vector = emb.getOutput();
int index = emb.getIndex();
System.out.println("Embedding " + index + " dimension: " + vector.length);
}

// Override options for specific request
OllamaEmbeddingOptions requestOptions = OllamaEmbeddingOptions.builder()
.truncate(true)
.keepAlive("10m")
.build();
EmbeddingRequest request = new EmbeddingRequest(texts, requestOptions);
EmbeddingResponse response = embeddingModel.call(request);

import org.springframework.ai.ollama.management.ModelManagementOptions;
import org.springframework.ai.ollama.management.PullModelStrategy;
import java.time.Duration;
// Configure automatic model management
ModelManagementOptions modelMgmt = ModelManagementOptions.builder()
.pullModelStrategy(PullModelStrategy.WHEN_MISSING)
.timeout(Duration.ofMinutes(10))
.build();
OllamaEmbeddingModel embeddingModel = OllamaEmbeddingModel.builder()
.ollamaApi(api)
.defaultOptions(OllamaEmbeddingOptions.builder()
.model("nomic-embed-text")
.build())
.modelManagementOptions(modelMgmt)
.build();
// Model will be automatically pulled if not available

import io.micrometer.observation.ObservationRegistry;
// Set up observability
ObservationRegistry registry = ObservationRegistry.create();
OllamaEmbeddingModel embeddingModel = OllamaEmbeddingModel.builder()
.ollamaApi(api)
.defaultOptions(options)
.observationRegistry(registry)
.build();
// Optional: custom observation convention
embeddingModel.setObservationConvention(new CustomEmbeddingModelObservationConvention());
// Metrics will be automatically tracked:
// - Token usage
// - Duration (total, load)
// - Model information

// From Spring AI Core
public class EmbeddingResponse implements ModelResponse<Embedding> {
public List<Embedding> getResults();
public EmbeddingResponseMetadata getMetadata();
}
public class Embedding {
public float[] getOutput();
public int getIndex();
}
// Metadata includes:
// - Model name
// - Token usage
// - Duration information

Ollama supports several embedding models:
// Using different embedding models
OllamaEmbeddingOptions nomicOptions = OllamaEmbeddingOptions.builder()
.model(OllamaModel.NOMIC_EMBED_TEXT.id())
.build();
OllamaEmbeddingOptions mxbaiOptions = OllamaEmbeddingOptions.builder()
.model(OllamaModel.MXBAI_EMBED_LARGE.id())
    .build();

Control how inputs that exceed the model's context length are handled:
OllamaEmbeddingOptions options = OllamaEmbeddingOptions.builder()
.model("nomic-embed-text")
.truncate(true) // Truncate long inputs (default: true)
.build();
// If truncate=false, an error will be thrown for inputs exceeding context length

Control how long the model stays in memory after use:
OllamaEmbeddingOptions options = OllamaEmbeddingOptions.builder()
.model("nomic-embed-text")
.keepAlive("5m") // Keep model loaded for 5 minutes (default)
// Other values: "10m", "1h", "-1" (indefinite), "0" (unload immediately)
    .build();

Configure GPU usage and memory settings:
OllamaEmbeddingOptions options = OllamaEmbeddingOptions.builder()
.model("nomic-embed-text")
.numGPU(1) // Number of GPU layers
.mainGPU(0) // Primary GPU index
.lowVRAM(false) // Low VRAM mode
.useMMap(true) // Use memory mapping
.useMLock(false) // Lock model in memory
.numThread(8) // Number of threads
    .build();

See Embedding Options Documentation for complete configuration reference.
The EmbeddingResponse includes metadata:
EmbeddingResponse response = embeddingModel.call(request);
// Usage information
EmbeddingResponseMetadata metadata = response.getMetadata();
Integer promptTokens = metadata.getUsage().getPromptTokens();
String modelName = metadata.getModel();
// Each embedding has an index
for (Embedding emb : response.getResults()) {
int index = emb.getIndex(); // Position in input list
float[] vector = emb.getOutput();
}

Once you have embeddings, compute similarity:
// Generate embeddings
float[] embedding1 = embeddingModel.embed("cat");
float[] embedding2 = embeddingModel.embed("kitten");
float[] embedding3 = embeddingModel.embed("car");
// Cosine similarity
double sim12 = cosineSimilarity(embedding1, embedding2); // High similarity
double sim13 = cosineSimilarity(embedding1, embedding3); // Low similarity
// Helper method for cosine similarity
private double cosineSimilarity(float[] a, float[] b) {
double dotProduct = 0.0;
double normA = 0.0;
double normB = 0.0;
for (int i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}

Embeddings are commonly used with vector stores:
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.ai.document.Document;
import java.util.List;
// Assume you have a VectorStore configured with OllamaEmbeddingModel
VectorStore vectorStore = createVectorStore(embeddingModel);
// Add documents
List<Document> documents = List.of(
new Document("Content 1"),
new Document("Content 2")
);
vectorStore.add(documents);
// Search similar documents
List<Document> similar = vectorStore.similaritySearch("query text");

try {
EmbeddingResponse response = embeddingModel.call(request);
} catch (RestClientException e) {
// Handle connection errors
} catch (IllegalArgumentException e) {
// Handle invalid requests (e.g., empty input)
} catch (Exception e) {
// Handle other errors
}

OllamaEmbeddingModel instances are thread-safe and can be reused across multiple requests. It's recommended to create a single instance and share it.
Note that OllamaEmbeddingModel does not support the getDimensions() method from EmbeddingOptions interface (it returns null). Embedding dimensions are determined by the model itself:
OllamaEmbeddingOptions options = embeddingModel.getDefaultOptions();
Integer dims = options.getDimensions(); // Returns null

tessl i tessl/maven-org-springframework-ai--spring-ai-ollama@1.1.1