LangChain4j integration for Chroma embedding store enabling storage, retrieval, and similarity search of vector embeddings with metadata filtering support for both API V1 and V2.
Efficient batch processing patterns for ChromaEmbeddingStore.
Single operations:
// N HTTP requests, high overhead
for (Embedding emb : embeddings) {
store.add(emb);
}

Batch operations:
// 1 HTTP request, efficient
List<String> ids = store.addAll(embeddings);

Performance difference: batch operations are typically 10-100x faster than single adds for large datasets.
List<Embedding> embeddings = generateEmbeddings(documents);
List<String> ids = store.addAll(embeddings);

List<Embedding> embeddings = new ArrayList<>();
List<TextSegment> segments = new ArrayList<>();
for (String doc : documents) {
embeddings.add(embeddingModel.embed(doc).content());
segments.add(TextSegment.from(doc));
}
List<String> ids = store.addAll(embeddings, segments);

List<String> ids = new ArrayList<>();
List<Embedding> embeddings = new ArrayList<>();
List<TextSegment> segments = new ArrayList<>();
for (Document doc : documents) {
ids.add(doc.id());
Embedding emb = embeddingModel.embed(doc.text()).content();
embeddings.add(emb);
Metadata meta = new Metadata()
.put("author", doc.author())
.put("year", doc.year());
segments.add(TextSegment.from(doc.text(), meta));
}
store.addAll(ids, embeddings, segments);

// Small batches: Real-time, low latency
int batchSize = 50;
// Medium batches: Balanced
int batchSize = 200;
// Large batches: Maximum throughput
int batchSize = 500;

/**
 * Indexes the given documents in fixed-size chunks, issuing one batched
 * addAll() call per chunk instead of one HTTP request per document.
 *
 * @param documents the raw document texts to embed and store
 * @param batchSize the maximum number of documents sent per batch
 */
public void processInChunks(
    List<String> documents,
    int batchSize
) {
    int total = documents.size();
    for (int offset = 0; offset < total; offset += batchSize) {
        int upper = Math.min(offset + batchSize, total);
        List<String> chunk = documents.subList(offset, upper);

        // Embed every document in the chunk first...
        List<Embedding> chunkEmbeddings = new ArrayList<>();
        for (String doc : chunk) {
            chunkEmbeddings.add(embeddingModel.embed(doc).content());
        }
        // ...then wrap the raw texts, keeping both lists index-aligned.
        List<TextSegment> chunkSegments = new ArrayList<>();
        for (String doc : chunk) {
            chunkSegments.add(TextSegment.from(doc));
        }

        // One HTTP round-trip for the whole chunk.
        store.addAll(chunkEmbeddings, chunkSegments);
        System.out.println("Processed " + upper + " / " + documents.size());
    }
}

/**
 * Processes documents in batches, invoking the supplied callback with a
 * Progress snapshot after every completed batch.
 *
 * @param documents        the documents to process
 * @param batchSize        the number of documents handled per batch
 * @param progressCallback receives cumulative progress after each batch
 */
public void processWithProgress(
    List<String> documents,
    int batchSize,
    Consumer<Progress> progressCallback
) {
    int documentCount = documents.size();
    int processed = 0;
    while (processed < documentCount) {
        int batchEnd = Math.min(processed + batchSize, documentCount);
        processBatch(documents.subList(processed, batchEnd));
        progressCallback.accept(new Progress(batchEnd, documentCount));
        processed = batchEnd;
    }
}
// Immutable snapshot of batch progress: `current` items completed out of `total`.
record Progress(int current, int total) {
// Completion expressed as a percentage in [0, 100].
// NOTE(review): plain division by total — yields NaN when total == 0; confirm callers never pass an empty corpus.
public double percentage() {
return (current * 100.0) / total;
}
}

/**
 * Processes documents in batches, retrying each failed batch with linear
 * backoff before abandoning it and logging it for manual follow-up.
 *
 * <p>Fix: the original called {@code Thread.sleep(...)} without handling the
 * checked {@code InterruptedException}, which does not compile. The sleep is
 * now wrapped in try/catch; on interruption the flag is restored
 * ({@code Thread.currentThread().interrupt()}) and processing stops, per
 * standard Java interruption convention.
 *
 * @param documents  the documents to process
 * @param batchSize  the number of documents handled per batch
 * @param maxRetries the maximum attempts per batch before it is abandoned
 */
public void processWithRetry(
    List<String> documents,
    int batchSize,
    int maxRetries
) {
    for (int i = 0; i < documents.size(); i += batchSize) {
        int end = Math.min(i + batchSize, documents.size());
        List<String> batch = documents.subList(i, end);
        boolean success = false;
        int attempt = 0;
        while (!success && attempt < maxRetries) {
            try {
                processBatch(batch);
                success = true;
            } catch (Exception e) {
                attempt++;
                if (attempt >= maxRetries) {
                    System.err.println("Batch failed after " + maxRetries +
                        " attempts: " + e.getMessage());
                    // Log failed batch for manual processing
                    logFailedBatch(batch, e);
                } else {
                    try {
                        // Linear backoff before the next attempt
                        // (1000L avoids int overflow for large attempt counts).
                        Thread.sleep(1000L * attempt);
                    } catch (InterruptedException ie) {
                        // Restore the interrupt flag and stop retrying:
                        // the caller has asked this thread to shut down.
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            }
        }
    }
}

/**
 * Stores all embeddings in a single batched call, falling back to one-by-one
 * inserts if the batch request fails. Individual failures are recorded as
 * null entries so the returned list stays index-aligned with the input.
 *
 * @param embeddings the embeddings to store
 * @param segments   the text segments paired with each embedding
 * @return the stored ids, with null at positions that could not be added
 */
public List<String> processWithFallback(
    List<Embedding> embeddings,
    List<TextSegment> segments
) {
    try {
        // Try batch first
        return store.addAll(embeddings, segments);
    } catch (Exception e) {
        System.err.println("Batch failed, processing individually");
        // Fallback to individual adds
        int count = embeddings.size();
        List<String> ids = new ArrayList<>(count);
        for (int index = 0; index < count; index++) {
            try {
                ids.add(store.add(embeddings.get(index), segments.get(index)));
            } catch (Exception itemError) {
                System.err.println("Item " + index + " failed: " +
                    itemError.getMessage());
                ids.add(null); // Mark failure
            }
        }
        return ids;
    }
}

/**
 * Splits the documents into batches and processes them concurrently via a
 * parallel IntStream over batch indices.
 *
 * <p>NOTE(review): batches run on the common ForkJoinPool — the store and
 * processBatch must be thread-safe for this to be correct; confirm before use.
 *
 * @param documents the documents to process
 * @param batchSize the number of documents per batch
 */
public void processParallel(
    List<String> documents,
    int batchSize
) {
    int documentCount = documents.size();
    // Ceiling division: number of batches needed to cover all documents.
    int batchCount = (documentCount + batchSize - 1) / batchSize;
    IntStream.range(0, batchCount)
        .parallel()
        .forEach(batch -> {
            int from = batch * batchSize;
            int to = Math.min(from + batchSize, documentCount);
            processBatch(documents.subList(from, to));
        });
}

Warning: Ensure thread-safe access to the store if processing batches in parallel.
/**
 * Streams a file line-by-line and indexes it in fixed-size batches, so the
 * whole file never has to be held in memory at once.
 *
 * @param filePath  path to the input file, one document per line
 * @param batchSize the number of lines accumulated before each batch is sent
 * @throws IOException if the file cannot be opened or read
 */
public void processLargeFile(
    Path filePath,
    int batchSize
) throws IOException {
    // try-with-resources: Files.lines must be closed to release the file handle.
    try (Stream<String> lines = Files.lines(filePath)) {
        List<String> buffer = new ArrayList<>();
        lines.forEach(line -> {
            buffer.add(line);
            if (buffer.size() >= batchSize) {
                // Hand off a defensive copy so the buffer can be reused.
                processBatch(new ArrayList<>(buffer));
                buffer.clear();
            }
        });
        // Process remaining
        if (!buffer.isEmpty()) {
            processBatch(buffer);
        }
    }
}

List<String> idsToRemove = Arrays.asList("id1", "id2", "id3");
store.removeAll(idsToRemove);

Filter filter = metadataKey("status").isEqualTo("outdated");
store.removeAll(filter); // Batch remove on server sidepublic void measureBatchPerformance(
List<String> documents,
int[] batchSizes
) {
for (int batchSize : batchSizes) {
long start = System.currentTimeMillis();
processInChunks(documents, batchSize);
long duration = System.currentTimeMillis() - start;
double throughput = documents.size() / (duration / 1000.0);
System.out.println("Batch size " + batchSize +
": " + duration + "ms" +
", throughput: " + throughput + " docs/sec");
// Clear for next test
store.removeAll();
}
}

See: Add Operations Guide for complete API details.
Install with Tessl CLI
npx tessl i tessl/maven-dev-langchain4j--langchain4j-chroma@1.11.0