LangChain4j integration for Chroma embedding store enabling storage, retrieval, and similarity search of vector embeddings with metadata filtering support for both API V1 and V2.
—
Efficient batch processing patterns for ChromaEmbeddingStore.
Single operations:
// N HTTP requests, high overhead
for (Embedding emb : embeddings) {
store.add(emb);
}

Batch operations:
// 1 HTTP request, efficient
List<String> ids = store.addAll(embeddings);

Performance difference: 10-100x faster for large datasets.
List<Embedding> embeddings = generateEmbeddings(documents);
List<String> ids = store.addAll(embeddings);

List<Embedding> embeddings = new ArrayList<>();
List<TextSegment> segments = new ArrayList<>();
for (String doc : documents) {
embeddings.add(embeddingModel.embed(doc).content());
segments.add(TextSegment.from(doc));
}
List<String> ids = store.addAll(embeddings, segments);

List<String> ids = new ArrayList<>();
List<Embedding> embeddings = new ArrayList<>();
List<TextSegment> segments = new ArrayList<>();
for (Document doc : documents) {
ids.add(doc.id());
Embedding emb = embeddingModel.embed(doc.text()).content();
embeddings.add(emb);
Metadata meta = new Metadata()
.put("author", doc.author())
.put("year", doc.year());
segments.add(TextSegment.from(doc.text(), meta));
}
store.addAll(ids, embeddings, segments);

// Small batches: Real-time, low latency
int batchSize = 50;
// Medium batches: Balanced
int batchSize = 200;
// Large batches: Maximum throughput
int batchSize = 500;

/**
 * Embeds and stores documents in fixed-size chunks so memory stays bounded
 * and each request to the store carries at most {@code batchSize} items.
 *
 * @param documents the texts to embed and store
 * @param batchSize number of documents sent per addAll call
 */
public void processInChunks(
    List<String> documents,
    int batchSize
) {
    int position = 0;
    while (position < documents.size()) {
        int upper = Math.min(position + batchSize, documents.size());
        List<String> slice = documents.subList(position, upper);
        List<Embedding> vectors = new ArrayList<>(slice.size());
        List<TextSegment> pieces = new ArrayList<>(slice.size());
        for (String text : slice) {
            vectors.add(embeddingModel.embed(text).content());
            pieces.add(TextSegment.from(text));
        }
        store.addAll(vectors, pieces);
        System.out.println("Processed " + upper + " / " + documents.size());
        position = upper;
    }
}

/**
 * Processes documents in batches and reports cumulative progress through a
 * callback after each completed batch.
 *
 * @param documents        the texts to process
 * @param batchSize        number of documents handled per batch
 * @param progressCallback invoked with cumulative progress after each batch
 */
public void processWithProgress(
    List<String> documents,
    int batchSize,
    Consumer<Progress> progressCallback
) {
    int total = documents.size();
    int done = 0;
    while (done < total) {
        int upper = Math.min(done + batchSize, total);
        processBatch(documents.subList(done, upper));
        done = upper;
        progressCallback.accept(new Progress(done, total));
    }
}
/**
 * Immutable progress snapshot: {@code current} items completed out of
 * {@code total}.
 */
record Progress(int current, int total) {
    /** Completion as a percentage in [0, 100]. */
    public double percentage() {
        return (current * 100.0) / total;
    }
}

/**
 * Processes documents in batches, retrying each failed batch up to
 * {@code maxRetries} times with linear backoff. Batches that still fail are
 * logged for manual follow-up instead of aborting the whole run.
 *
 * @param documents  the texts to process
 * @param batchSize  number of documents handled per batch
 * @param maxRetries maximum attempts per batch before giving up
 */
public void processWithRetry(
    List<String> documents,
    int batchSize,
    int maxRetries
) {
    for (int i = 0; i < documents.size(); i += batchSize) {
        int end = Math.min(i + batchSize, documents.size());
        List<String> batch = documents.subList(i, end);
        boolean success = false;
        int attempt = 0;
        while (!success && attempt < maxRetries) {
            try {
                processBatch(batch);
                success = true;
            } catch (Exception e) {
                attempt++;
                if (attempt >= maxRetries) {
                    System.err.println("Batch failed after " + maxRetries +
                        " attempts: " + e.getMessage());
                    // Log failed batch for manual processing
                    logFailedBatch(batch, e);
                } else {
                    // Linear backoff before retrying. Thread.sleep throws the
                    // checked InterruptedException, which the original code
                    // neither caught nor declared (it would not compile).
                    // Restore the interrupt flag and stop processing if the
                    // thread is interrupted mid-backoff.
                    try {
                        Thread.sleep(1000L * attempt);
                    } catch (InterruptedException interrupted) {
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            }
        }
    }
}

public List<String> processWithFallback(
List<Embedding> embeddings,
    List<TextSegment> segments
) {
    try {
        // Fast path: one bulk request for the whole batch.
        return store.addAll(embeddings, segments);
    } catch (Exception e) {
        System.err.println("Batch failed, processing individually");
        // Slow path: add items one at a time so a single bad item cannot
        // sink the entire batch. A failed slot is recorded as null.
        List<String> collected = new ArrayList<>(embeddings.size());
        int index = 0;
        while (index < embeddings.size()) {
            try {
                collected.add(store.add(embeddings.get(index), segments.get(index)));
            } catch (Exception itemError) {
                System.err.println("Item " + index + " failed: " +
                    itemError.getMessage());
                collected.add(null); // Mark failure
            }
            index++;
        }
        return collected;
    }
}

public void processParallel(
List<String> documents,
    int batchSize
) {
    // Ceiling division gives the number of batches; the batch indices are
    // then fanned out across the common fork-join pool.
    int batchCount = (documents.size() + batchSize - 1) / batchSize;
    IntStream.range(0, batchCount)
        .parallel()
        .forEach(b -> {
            int from = b * batchSize;
            int to = Math.min(from + batchSize, documents.size());
            processBatch(documents.subList(from, to));
        });
}

Warning: Ensure thread-safe access to store if processing in parallel.
/**
 * Streams a large file line by line and stores it in batches, so the whole
 * file never has to be held in memory at once.
 *
 * @param filePath  file to read, one document per line
 * @param batchSize number of lines accumulated before each batch is flushed
 * @throws IOException if the file cannot be read
 */
public void processLargeFile(
    Path filePath,
    int batchSize
) throws IOException {
    try (Stream<String> lines = Files.lines(filePath)) {
        List<String> buffer = new ArrayList<>();
        lines.forEachOrdered(line -> {
            buffer.add(line);
            if (buffer.size() >= batchSize) {
                // Hand off a copy so the buffer can be reused immediately.
                processBatch(new ArrayList<>(buffer));
                buffer.clear();
            }
        });
        // Flush the final partial batch, if any.
        if (!buffer.isEmpty()) {
            processBatch(buffer);
        }
    }
}

List<String> idsToRemove = Arrays.asList("id1", "id2", "id3");
store.removeAll(idsToRemove);

Filter filter = metadataKey("status").isEqualTo("outdated");
store.removeAll(filter); // Batch remove on server side

/**
 * Benchmarks processInChunks across several batch sizes, printing elapsed
 * time and throughput for each, and clearing the store between runs so
 * every measurement starts from the same state.
 *
 * Note: in the flattened original, the method signature was fused into the
 * trailing line comment above ("// Batch remove on server sidepublic void
 * measureBatchPerformance("), which commented the signature out entirely.
 *
 * @param documents  the corpus used for every benchmark run
 * @param batchSizes the batch sizes to measure
 */
public void measureBatchPerformance(
    List<String> documents,
    int[] batchSizes
) {
    for (int batchSize : batchSizes) {
        long start = System.currentTimeMillis();
        processInChunks(documents, batchSize);
        long duration = System.currentTimeMillis() - start;
        // Clamp to 1 ms: a 0 ms reading on tiny inputs would otherwise
        // produce an Infinity throughput from division by zero.
        double throughput = documents.size() / (Math.max(duration, 1) / 1000.0);
        System.out.println("Batch size " + batchSize +
            ": " + duration + "ms" +
            ", throughput: " + throughput + " docs/sec");
        // Clear for next test
        store.removeAll();
    }
}

See: Add Operations Guide for complete API details.
Install with Tessl CLI
npx tessl i tessl/maven-dev-langchain4j--langchain4j-chroma