LangChain4j integration for Chroma embedding store enabling storage, retrieval, and similarity search of vector embeddings with metadata filtering support for both API V1 and V2.
Adding embeddings to the Chroma vector store.
import dev.langchain4j.data.embedding.Embedding;
Embedding embedding = Embedding.from(new float[]{0.1f, 0.2f, 0.3f});
String id = store.add(embedding);
// Returns: auto-generated UUID string

Embedding embedding = Embedding.from(new float[]{0.1f, 0.2f, 0.3f});
store.add("custom-id-123", embedding);
// Returns: void

import dev.langchain4j.data.segment.TextSegment;
Embedding embedding = Embedding.from(new float[]{0.1f, 0.2f, 0.3f});
TextSegment segment = TextSegment.from("This is the document text");
String id = store.add(embedding, segment);
// Returns: auto-generated UUID string

import dev.langchain4j.data.document.Metadata;
Embedding embedding = Embedding.from(new float[]{0.1f, 0.2f, 0.3f});
Metadata metadata = new Metadata()
.put("author", "John Doe")
.put("year", 2024)
.put("category", "technology");
TextSegment segment = TextSegment.from("Document text", metadata);
String id = store.add(embedding, segment);

List<Embedding> embeddings = Arrays.asList(
Embedding.from(new float[]{0.1f, 0.2f, 0.3f}),
Embedding.from(new float[]{0.4f, 0.5f, 0.6f}),
Embedding.from(new float[]{0.7f, 0.8f, 0.9f})
);
List<String> ids = store.addAll(embeddings);
// Returns: list of auto-generated IDs

List<String> ids = Arrays.asList("id1", "id2", "id3");
List<Embedding> embeddings = Arrays.asList(
Embedding.from(new float[]{0.1f, 0.2f, 0.3f}),
Embedding.from(new float[]{0.4f, 0.5f, 0.6f}),
Embedding.from(new float[]{0.7f, 0.8f, 0.9f})
);
store.addAll(ids, embeddings, null);
// Returns: void

List<String> ids = Arrays.asList("doc1", "doc2", "doc3");
List<Embedding> embeddings = Arrays.asList(emb1, emb2, emb3);
List<TextSegment> segments = Arrays.asList(
TextSegment.from("First document", new Metadata().put("index", 1)),
TextSegment.from("Second document", new Metadata().put("index", 2)),
TextSegment.from("Third document", new Metadata().put("index", 3))
);
store.addAll(ids, embeddings, segments);
// Returns: void

List<Embedding> embeddings = Arrays.asList(emb1, emb2, emb3);
List<TextSegment> segments = Arrays.asList(
TextSegment.from("First document"),
TextSegment.from("Second document"),
TextSegment.from("Third document")
);
List<String> ids = store.addAll(embeddings, segments);
// Returns: list of auto-generated IDs

Embedding embedding = Embedding.from(new float[]{0.1f, 0.2f, 0.3f});

List<Float> vector = Arrays.asList(0.1f, 0.2f, 0.3f);
Embedding embedding = Embedding.from(vector);

import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.model.embedding.AllMiniLmL6V2EmbeddingModel;
EmbeddingModel model = new AllMiniLmL6V2EmbeddingModel();
Embedding embedding = model.embed("text to embed").content();

// INEFFICIENT: Multiple HTTP requests
for (Embedding embedding : embeddings) {
store.add(embedding); // N requests
}
// EFFICIENT: Single HTTP request
List<String> ids = store.addAll(embeddings); // 1 request

Batch operations are significantly faster for multiple embeddings because they use a single HTTP request instead of N requests.
Use add(embedding) when:
Use addAll(embeddings) when:
Metadata metadata = new Metadata()
.put("string_field", "value") // String
.put("int_field", 42) // Integer
.put("long_field", 123456789L) // Long
.put("float_field", 3.14f) // Float
.put("double_field", 3.14159) // Double
.put("uuid_field", UUID.randomUUID()); // UUID
// NOT SUPPORTED: Boolean type is not supported by Chroma
// metadata.put("bool_field", true); // Will fail

import dev.langchain4j.model.embedding.EmbeddingModel;
List<String> documents = loadDocuments();
EmbeddingModel model = createEmbeddingModel();
List<Embedding> embeddings = new ArrayList<>();
List<TextSegment> segments = new ArrayList<>();
for (String doc : documents) {
embeddings.add(model.embed(doc).content());
segments.add(TextSegment.from(doc));
}
List<String> ids = store.addAll(embeddings, segments);

record Document(String id, String text, String author, int year, String category) {}
List<Document> documents = loadDocuments();
List<String> ids = new ArrayList<>();
List<Embedding> embeddings = new ArrayList<>();
List<TextSegment> segments = new ArrayList<>();
for (Document doc : documents) {
ids.add(doc.id());
embeddings.add(model.embed(doc.text()).content());
Metadata metadata = new Metadata()
.put("author", doc.author())
.put("year", doc.year())
.put("category", doc.category());
segments.add(TextSegment.from(doc.text(), metadata));
}
store.addAll(ids, embeddings, segments);

For very large datasets, process in chunks to avoid memory issues:
int batchSize = 100;
List<Document> allDocuments = loadLargeDataset();
for (int i = 0; i < allDocuments.size(); i += batchSize) {
int end = Math.min(i + batchSize, allDocuments.size());
List<Document> batch = allDocuments.subList(i, end);
List<Embedding> embeddings = new ArrayList<>();
List<TextSegment> segments = new ArrayList<>();
for (Document doc : batch) {
embeddings.add(model.embed(doc.text()).content());
segments.add(TextSegment.from(doc.text(), doc.metadata()));
}
store.addAll(embeddings, segments);
System.out.println("Indexed batch: " + (i/batchSize + 1));
}

All embeddings in a collection must have the same dimensions:
// First embedding: 3 dimensions
store.add(Embedding.from(new float[]{0.1f, 0.2f, 0.3f}));
// ERROR: Different dimensions (4)
store.add(Embedding.from(new float[]{0.1f, 0.2f, 0.3f, 0.4f}));
// Throws: ChromaException about dimension mismatch

Metadata metadata = new Metadata();
// VALID
metadata.put("name", "value");
metadata.put("count", 42);
metadata.put("score", 3.14);
// INVALID - Boolean not supported
// metadata.put("active", true); // Will cause issues in Chroma

try {
String id = store.add(embedding);
} catch (java.net.http.HttpConnectTimeoutException e) {
System.err.println("Cannot connect to Chroma: " + e.getMessage());
} catch (java.net.http.HttpTimeoutException e) {
System.err.println("Add operation timed out: " + e.getMessage());
}

See: ChromaEmbeddingStore API for complete method signatures.
Related:
Install with Tessl CLI
npx tessl i tessl/maven-dev-langchain4j--langchain4j-chroma@1.11.0