LangChain4j integration for Chroma embedding store enabling storage, retrieval, and similarity search of vector embeddings with metadata filtering support for both API V1 and V2.
Comprehensive guide to metadata filtering in ChromaEmbeddingStore.
Filters enable searching and removing embeddings based on metadata conditions. Filters work with both search and remove operations.
import dev.langchain4j.store.embedding.filter.Filter;
import static dev.langchain4j.store.embedding.filter.MetadataFilterBuilder.*;Filter filter = metadataKey("status").isEqualTo("published");EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmbedding)
.maxResults(10)
.filter(filter)
.build();
EmbeddingSearchResult<TextSegment> result = store.search(request);Filter filter = metadataKey("status").isEqualTo("archived");
store.removeAll(filter);// String equality
Filter f1 = metadataKey("author").isEqualTo("John Doe");
// Numeric equality
Filter f2 = metadataKey("year").isEqualTo(2024);
Filter f3 = metadataKey("rating").isEqualTo(4.5);
// UUID equality
UUID id = UUID.fromString("...");
Filter f4 = metadataKey("document_id").isEqualTo(id);Filter f1 = metadataKey("status").isNotEqualTo("draft");
Filter f2 = metadataKey("priority").isNotEqualTo(0);

Important: Comparison operators (greater than, greater than or equal, less than, less than or equal) only work with numeric metadata values.
Filter f1 = metadataKey("year").isGreaterThan(2020);
Filter f2 = metadataKey("rating").isGreaterThan(4.0);
Filter f3 = metadataKey("price").isGreaterThan(99.99);Filter f1 = metadataKey("year").isGreaterThanOrEqualTo(2020);
Filter f2 = metadataKey("score").isGreaterThanOrEqualTo(0.75);Filter f1 = metadataKey("age").isLessThan(30);
Filter f2 = metadataKey("price").isLessThan(100.0);Filter f1 = metadataKey("year").isLessThanOrEqualTo(2024);
Filter f2 = metadataKey("rating").isLessThanOrEqualTo(3.0);// Match any of multiple values
Filter categories = metadataKey("category")
.isIn(Arrays.asList("tech", "science", "engineering"));
Filter authors = metadataKey("author")
.isIn(Arrays.asList("Alice", "Bob", "Charlie"));
Filter years = metadataKey("year")
.isIn(Arrays.asList(2022, 2023, 2024));// Exclude multiple values
Filter excludeStatuses = metadataKey("status")
.isNotIn(Arrays.asList("draft", "archived", "deleted"));

All conditions must match:
Filter combined = metadataKey("status").isEqualTo("published")
.and(metadataKey("year").isGreaterThanOrEqualTo(2020))
.and(metadataKey("author").isEqualTo("John Doe"));

At least one condition must match:
Filter categories = metadataKey("category").isEqualTo("tech")
.or(metadataKey("category").isEqualTo("science"))
.or(metadataKey("category").isEqualTo("math"));

Combine AND and OR conditions by nesting one filter inside another:
// (status = "published") AND ((priority >= 5) OR (urgent = 1))
// Boolean metadata values are not supported, so the "urgent" flag is stored
// and compared as 0/1 rather than true/false.
Filter complex = metadataKey("status").isEqualTo("published")
        .and(
                metadataKey("priority").isGreaterThanOrEqualTo(5)
                        .or(metadataKey("urgent").isEqualTo(1))
        );
// Documents from 2020-2024 (inclusive)
Filter yearRange = metadataKey("year").isGreaterThanOrEqualTo(2020)
.and(metadataKey("year").isLessThanOrEqualTo(2024));// Last 90 days
// The 90L literal forces long arithmetic; the millisecond product exceeds the int range.
long ninetyDaysAgo = System.currentTimeMillis() - (90L * 24 * 60 * 60 * 1000);
Filter recent = metadataKey("timestamp").isGreaterThanOrEqualTo(ninetyDaysAgo);
// Specific date range
// NOTE: the lower bound is inclusive, but the upper bound uses isLessThan, so
// values equal to endDate are not matched. parseDate is not defined in this guide;
// presumably it returns a numeric timestamp in the same unit as the stored
// created_at values - confirm before reusing this snippet.
long startDate = parseDate("2024-01-01");
long endDate = parseDate("2024-12-31");
Filter dateRange = metadataKey("created_at").isGreaterThanOrEqualTo(startDate)
.and(metadataKey("created_at").isLessThan(endDate));// Single category
Filter tech = metadataKey("category").isEqualTo("tech");
// Multiple categories
Filter multiCategory = metadataKey("category")
.isIn(Arrays.asList("tech", "science", "engineering"));
// Exclude categories
Filter excludeCategories = metadataKey("category")
.isNotIn(Arrays.asList("spam", "test", "draft"));// Published only
Filter published = metadataKey("status").isEqualTo("published");
// Not draft
Filter notDraft = metadataKey("status").isNotEqualTo("draft");
// Multiple allowed statuses
Filter activeStatuses = metadataKey("status")
.isIn(Arrays.asList("published", "featured", "promoted"));// High priority (>= 5)
Filter highPriority = metadataKey("priority").isGreaterThanOrEqualTo(5);
// High rating (> 4.0)
Filter highRated = metadataKey("rating").isGreaterThan(4.0);
// Mid-range scores
Filter midScore = metadataKey("score").isGreaterThanOrEqualTo(50)
.and(metadataKey("score").isLessThan(80));// Specific author
Filter author = metadataKey("author").isEqualTo("John Doe");
// Multiple authors
Filter authors = metadataKey("author")
.isIn(Arrays.asList("Alice", "Bob", "Charlie"));
// Exclude specific user
Filter notTestUser = metadataKey("user_id").isNotEqualTo("test-user");// Published recent tech articles by specific authors
Filter complexFilter = metadataKey("status").isEqualTo("published")
.and(metadataKey("category").isEqualTo("tech"))
.and(metadataKey("year").isGreaterThanOrEqualTo(2023))
.and(metadataKey("author").isIn(Arrays.asList("Alice", "Bob")));
// High priority OR recent
Filter urgentOrRecent = metadataKey("priority").isGreaterThanOrEqualTo(8)
        .or(metadataKey("created_at").isGreaterThanOrEqualTo(recentTimestamp));

/**
 * Builds a metadata filter from user-supplied criteria.
 *
 * <p>The {@code status} equality condition is always applied. The year and
 * category conditions are AND-ed on only when the corresponding argument is
 * provided.
 *
 * @param status     value the "status" metadata key must equal
 * @param minYear    inclusive lower bound for the "year" key, or {@code null} to skip it
 * @param categories allowed values for the "category" key; {@code null} or empty skips it
 * @return the combined filter
 */
public Filter buildUserFilter(String status, Integer minYear, List<String> categories) {
    Filter combined = metadataKey("status").isEqualTo(status);
    boolean hasCategories = categories != null && !categories.isEmpty();

    if (minYear != null) {
        Filter fromYear = metadataKey("year").isGreaterThanOrEqualTo(minYear);
        combined = combined.and(fromYear);
    }
    if (hasCategories) {
        Filter inCategories = metadataKey("category").isIn(categories);
        combined = combined.and(inCategories);
    }
    return combined;
}
// Usage
Filter userFilter = buildUserFilter("published", 2020, Arrays.asList("tech", "science"));

/**
 * Runs a similarity search, applying only the metadata conditions whose
 * argument is non-null. All supplied conditions are combined with AND.
 *
 * @param queryEmbedding embedding to search with
 * @param category       required value of the "category" key, or {@code null} to skip it
 * @param minYear        inclusive lower bound for the "year" key, or {@code null} to skip it
 * @param minRating      inclusive lower bound for the "rating" key, or {@code null} to skip it
 * @return the search result containing at most 10 matches
 */
public EmbeddingSearchResult<TextSegment> searchWithOptionalFilters(
        Embedding queryEmbedding,
        String category,
        Integer minYear,
        Double minRating
) {
    // Start with no filter; each supplied argument ANDs one more condition on.
    Filter filter = null;
    if (category != null) {
        filter = metadataKey("category").isEqualTo(category);
    }
    if (minYear != null) {
        Filter yearFilter = metadataKey("year").isGreaterThanOrEqualTo(minYear);
        filter = (filter == null) ? yearFilter : filter.and(yearFilter);
    }
    if (minRating != null) {
        Filter ratingFilter = metadataKey("rating").isGreaterThanOrEqualTo(minRating);
        filter = (filter == null) ? ratingFilter : filter.and(ratingFilter);
    }

    // The request filter is optional, so a null filter (no criteria supplied) is
    // passed through as-is. Keeping the builder in one chain also avoids having
    // to name the nested builder type.
    EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
            .queryEmbedding(queryEmbedding)
            .maxResults(10)
            .filter(filter)
            .build();
    return store.search(request);
}
// Reusable filter components
Filter publishedFilter = metadataKey("status").isEqualTo("published");
Filter recentFilter = metadataKey("year").isGreaterThanOrEqualTo(2023);
Filter techFilter = metadataKey("category").isEqualTo("tech");
// Compose filters
Filter recentPublished = publishedFilter.and(recentFilter);
Filter recentPublishedTech = recentPublished.and(techFilter);
// Use in search
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmbedding)
.filter(recentPublishedTech)
.maxResults(10)
.build();// VALID - numeric values
Filter yearFilter = metadataKey("year").isGreaterThan(2020);
Filter ratingFilter = metadataKey("rating").isGreaterThanOrEqualTo(4.5);
// INVALID - will cause runtime error
// Filter nameFilter = metadataKey("name").isGreaterThan("M"); // ERROR

Solution: Use equality or collection filters for strings:
// Use isEqualTo for strings
Filter nameFilter = metadataKey("name").isEqualTo("Smith");
// Or isIn for multiple string values
Filter namesFilter = metadataKey("name")
.isIn(Arrays.asList("Smith", "Jones", "Brown"));// NOT SUPPORTED - Boolean type
// Metadata meta = new Metadata().put("active", true); // Will fail
// WORKAROUND 1 - Use string
Metadata meta1 = new Metadata().put("active", "true");
Filter f1 = metadataKey("active").isEqualTo("true");
// WORKAROUND 2 - Use 0/1
Metadata meta2 = new Metadata().put("active", 1);
Filter f2 = metadataKey("active").isEqualTo(1);

Chroma doesn't natively support NOT operations. Simple NOT filters are converted to their negated equivalent:
// This NOT filter
Filter f1 = Filter.not(metadataKey("status").isEqualTo("draft"));
// Is converted to
Filter f2 = metadataKey("status").isNotEqualTo("draft");

Complex NOT operations may not work as expected. Prefer positive filtering. Before removing by filter, preview what the filter matches:
Filter filter = metadataKey("status").isEqualTo("outdated");
// Preview with search first
EmbeddingSearchRequest previewRequest = EmbeddingSearchRequest.builder()
.queryEmbedding(anyEmbedding)
.maxResults(1000)
.filter(filter)
.build();
EmbeddingSearchResult<TextSegment> preview = store.search(previewRequest);
System.out.println("Filter matches: " + preview.matches().size() + " documents");
// Confirm before removing
if (shouldProceed()) {
store.removeAll(filter);
}// Test filter with known data
Metadata testMeta = new Metadata()
.put("status", "published")
.put("year", 2024)
.put("category", "tech");
TextSegment testSeg = TextSegment.from("test", testMeta);
Embedding testEmb = Embedding.from(new float[]{1.0f, 0.0f, 0.0f});
String testId = store.add(testEmb, testSeg);
// Test filter
Filter filter = metadataKey("status").isEqualTo("published")
.and(metadataKey("year").isEqualTo(2024));
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(testEmb)
.filter(filter)
.maxResults(10)
.build();
EmbeddingSearchResult<TextSegment> result = store.search(request);
boolean found = result.matches().stream()
.anyMatch(match -> match.embeddingId().equals(testId));
System.out.println("Filter test " + (found ? "passed" : "failed"));
// Cleanup
store.remove(testId);

More selective filters perform better:
// High selectivity - fast
Filter specific = metadataKey("id").isEqualTo("exact-id");
// Medium selectivity
Filter moderate = metadataKey("category").isEqualTo("tech");
// Low selectivity - slower
Filter broad = metadataKey("year").isGreaterThan(2000);

Apply filters in queries, not after retrieval:
// GOOD - filter during search
Filter filter = metadataKey("category").isEqualTo("tech");
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmbedding)
.filter(filter)
.maxResults(10)
.build();
// BAD - filter after retrieval (inefficient)
EmbeddingSearchResult<TextSegment> all = store.search(requestWithoutFilter);
List<EmbeddingMatch<TextSegment>> filtered = all.matches().stream()
.filter(match -> "tech".equals(
match.embedded().metadata().getString("category")
))
.collect(Collectors.toList());

Deeply nested filters may impact performance:
// Simple - fast
Filter simple = metadataKey("status").isEqualTo("published");
// Complex - slower
Filter complex = metadataKey("status").isEqualTo("published")
.and(
metadataKey("category").isIn(manyCategories)
.or(metadataKey("priority").isGreaterThan(5))
)
.and(metadataKey("year").isGreaterThanOrEqualTo(2020));

See:
Install with Tessl CLI
npx tessl i tessl/maven-dev-langchain4j--langchain4j-chroma@1.11.0