LangChain4j integration for Chroma embedding store enabling storage, retrieval, and similarity search of vector embeddings with metadata filtering support for both API V1 and V2.
—
Comprehensive guide to metadata filtering in ChromaEmbeddingStore.
Filters enable searching and removing embeddings based on metadata conditions. Filters work with both search and remove operations.
import dev.langchain4j.store.embedding.filter.Filter;
import static dev.langchain4j.store.embedding.filter.MetadataFilterBuilder.*;Filter filter = metadataKey("status").isEqualTo("published");EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmbedding)
.maxResults(10)
.filter(filter)
.build();
EmbeddingSearchResult<TextSegment> result = store.search(request);Filter filter = metadataKey("status").isEqualTo("archived");
store.removeAll(filter);// String equality
Filter f1 = metadataKey("author").isEqualTo("John Doe");
// Numeric equality
Filter f2 = metadataKey("year").isEqualTo(2024);
Filter f3 = metadataKey("rating").isEqualTo(4.5);
// UUID equality
UUID id = UUID.fromString("...");
Filter f4 = metadataKey("document_id").isEqualTo(id);Filter f1 = metadataKey("status").isNotEqualTo("draft");
Filter f2 = metadataKey("priority").isNotEqualTo(0);

Important: Comparison operators only work with numeric metadata values.
Filter f1 = metadataKey("year").isGreaterThan(2020);
Filter f2 = metadataKey("rating").isGreaterThan(4.0);
Filter f3 = metadataKey("price").isGreaterThan(99.99);Filter f1 = metadataKey("year").isGreaterThanOrEqualTo(2020);
Filter f2 = metadataKey("score").isGreaterThanOrEqualTo(0.75);Filter f1 = metadataKey("age").isLessThan(30);
Filter f2 = metadataKey("price").isLessThan(100.0);Filter f1 = metadataKey("year").isLessThanOrEqualTo(2024);
Filter f2 = metadataKey("rating").isLessThanOrEqualTo(3.0);// Match any of multiple values
Filter categories = metadataKey("category")
.isIn(Arrays.asList("tech", "science", "engineering"));
Filter authors = metadataKey("author")
.isIn(Arrays.asList("Alice", "Bob", "Charlie"));
Filter years = metadataKey("year")
.isIn(Arrays.asList(2022, 2023, 2024));// Exclude multiple values
Filter excludeStatuses = metadataKey("status")
.isNotIn(Arrays.asList("draft", "archived", "deleted"));

All conditions must match:
Filter combined = metadataKey("status").isEqualTo("published")
.and(metadataKey("year").isGreaterThanOrEqualTo(2020))
.and(metadataKey("author").isEqualTo("John Doe"));

At least one condition must match:
Filter categories = metadataKey("category").isEqualTo("tech")
.or(metadataKey("category").isEqualTo("science"))
.or(metadataKey("category").isEqualTo("math"));

Combine AND and OR by nesting one filter expression inside another:
// (status = "published") AND ((priority >= 5) OR (urgent = 1))
// Boolean metadata values are not supported, so the "urgent" flag is stored and matched as 0/1.
Filter complex = metadataKey("status").isEqualTo("published")
        .and(
                metadataKey("priority").isGreaterThanOrEqualTo(5)
                        .or(metadataKey("urgent").isEqualTo(1))
        );

// Documents from 2020-2024 (inclusive)
Filter yearRange = metadataKey("year").isGreaterThanOrEqualTo(2020)
.and(metadataKey("year").isLessThanOrEqualTo(2024));// Last 90 days
long ninetyDaysAgo = System.currentTimeMillis() - (90L * 24 * 60 * 60 * 1000);
Filter recent = metadataKey("timestamp").isGreaterThanOrEqualTo(ninetyDaysAgo);
// Specific date range
long startDate = parseDate("2024-01-01");
long endDate = parseDate("2024-12-31");
Filter dateRange = metadataKey("created_at").isGreaterThanOrEqualTo(startDate)
.and(metadataKey("created_at").isLessThan(endDate));// Single category
Filter tech = metadataKey("category").isEqualTo("tech");
// Multiple categories
Filter multiCategory = metadataKey("category")
.isIn(Arrays.asList("tech", "science", "engineering"));
// Exclude categories
Filter excludeCategories = metadataKey("category")
.isNotIn(Arrays.asList("spam", "test", "draft"));// Published only
Filter published = metadataKey("status").isEqualTo("published");
// Not draft
Filter notDraft = metadataKey("status").isNotEqualTo("draft");
// Multiple allowed statuses
Filter activeStatuses = metadataKey("status")
.isIn(Arrays.asList("published", "featured", "promoted"));// High priority (>= 5)
Filter highPriority = metadataKey("priority").isGreaterThanOrEqualTo(5);
// High rating (> 4.0)
Filter highRated = metadataKey("rating").isGreaterThan(4.0);
// Mid-range scores
Filter midScore = metadataKey("score").isGreaterThanOrEqualTo(50)
.and(metadataKey("score").isLessThan(80));// Specific author
Filter author = metadataKey("author").isEqualTo("John Doe");
// Multiple authors
Filter authors = metadataKey("author")
.isIn(Arrays.asList("Alice", "Bob", "Charlie"));
// Exclude specific user
Filter notTestUser = metadataKey("user_id").isNotEqualTo("test-user");// Published recent tech articles by specific authors
Filter complexFilter = metadataKey("status").isEqualTo("published")
.and(metadataKey("category").isEqualTo("tech"))
.and(metadataKey("year").isGreaterThanOrEqualTo(2023))
.and(metadataKey("author").isIn(Arrays.asList("Alice", "Bob")));
// High priority OR recent
Filter urgentOrRecent = metadataKey("priority").isGreaterThanOrEqualTo(8)
        .or(metadataKey("created_at").isGreaterThanOrEqualTo(recentTimestamp));

/**
 * Builds a filter that always matches on status and, when supplied, also
 * requires a minimum year and membership in one of the given categories.
 *
 * @param status     value the "status" metadata key must equal
 * @param minYear    inclusive lower bound for "year"; skipped when null
 * @param categories allowed "category" values; skipped when null or empty
 * @return the combined filter, with all included conditions AND-ed together
 */
public Filter buildUserFilter(String status, Integer minYear, List<String> categories) {
    Filter result = metadataKey("status").isEqualTo(status);

    boolean hasCategories = categories != null && !categories.isEmpty();

    if (minYear != null) {
        Filter fromYear = metadataKey("year").isGreaterThanOrEqualTo(minYear);
        result = result.and(fromYear);
    }
    if (hasCategories) {
        Filter inCategories = metadataKey("category").isIn(categories);
        result = result.and(inCategories);
    }
    return result;
}
// Usage
Filter userFilter = buildUserFilter("published", 2020, Arrays.asList("tech", "science"));

/**
 * Runs a similarity search constrained by whichever optional metadata
 * conditions are supplied. Every non-null argument contributes one condition,
 * and all contributed conditions are AND-ed together. When no condition is
 * supplied the search runs without a filter.
 *
 * @param queryEmbedding embedding to search with
 * @param category       exact "category" value to match; skipped when null
 * @param minYear        inclusive lower bound for "year"; skipped when null
 * @param minRating      inclusive lower bound for "rating"; skipped when null
 * @return the search result, limited to 10 matches
 */
public EmbeddingSearchResult<TextSegment> searchWithOptionalFilters(
        Embedding queryEmbedding,
        String category,
        Integer minYear,
        Double minRating
) {
    Filter filter = null;
    if (category != null) {
        filter = metadataKey("category").isEqualTo(category);
    }
    if (minYear != null) {
        Filter yearFilter = metadataKey("year").isGreaterThanOrEqualTo(minYear);
        filter = (filter == null) ? yearFilter : filter.and(yearFilter);
    }
    if (minRating != null) {
        Filter ratingFilter = metadataKey("rating").isGreaterThanOrEqualTo(minRating);
        filter = (filter == null) ? ratingFilter : filter.and(ratingFilter);
    }

    // The builder's filter is optional and unset by default, so handing it a
    // null filter is the same as never calling filter(). This keeps the whole
    // request in one chain and avoids naming the nested builder type.
    EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
            .queryEmbedding(queryEmbedding)
            .maxResults(10)
            .filter(filter)
            .build();
    return store.search(request);
}

// Reusable filter components
Filter publishedFilter = metadataKey("status").isEqualTo("published");
Filter recentFilter = metadataKey("year").isGreaterThanOrEqualTo(2023);
Filter techFilter = metadataKey("category").isEqualTo("tech");
// Compose filters
Filter recentPublished = publishedFilter.and(recentFilter);
Filter recentPublishedTech = recentPublished.and(techFilter);
// Use in search
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmbedding)
.filter(recentPublishedTech)
.maxResults(10)
.build();// VALID - numeric values
Filter yearFilter = metadataKey("year").isGreaterThan(2020);
Filter ratingFilter = metadataKey("rating").isGreaterThanOrEqualTo(4.5);
// INVALID - will cause runtime error
// Filter nameFilter = metadataKey("name").isGreaterThan("M"); // ERROR

Solution: Use equality or collection filters for strings:
// Use isEqualTo for strings
Filter nameFilter = metadataKey("name").isEqualTo("Smith");
// Or isIn for multiple string values
Filter namesFilter = metadataKey("name")
.isIn(Arrays.asList("Smith", "Jones", "Brown"));// NOT SUPPORTED - Boolean type
// Metadata meta = new Metadata().put("active", true); // Will fail
// WORKAROUND 1 - Use string
Metadata meta1 = new Metadata().put("active", "true");
Filter f1 = metadataKey("active").isEqualTo("true");
// WORKAROUND 2 - Use 0/1
Metadata meta2 = new Metadata().put("active", 1);
Filter f2 = metadataKey("active").isEqualTo(1);

Chroma doesn't natively support NOT operations. Simple NOT filters are converted to their negated equivalents:
// This NOT filter
Filter f1 = Filter.not(metadataKey("status").isEqualTo("draft"));
// Is converted to
Filter f2 = metadataKey("status").isNotEqualTo("draft");

Complex NOT operations may not work as expected. Prefer positive filtering.
Filter filter = metadataKey("status").isEqualTo("outdated");
// Preview with search first
EmbeddingSearchRequest previewRequest = EmbeddingSearchRequest.builder()
.queryEmbedding(anyEmbedding)
.maxResults(1000)
.filter(filter)
.build();
EmbeddingSearchResult<TextSegment> preview = store.search(previewRequest);
System.out.println("Filter matches: " + preview.matches().size() + " documents");
// Confirm before removing
if (shouldProceed()) {
store.removeAll(filter);
}// Test filter with known data
Metadata testMeta = new Metadata()
.put("status", "published")
.put("year", 2024)
.put("category", "tech");
TextSegment testSeg = TextSegment.from("test", testMeta);
Embedding testEmb = Embedding.from(new float[]{1.0f, 0.0f, 0.0f});
String testId = store.add(testEmb, testSeg);
// Test filter
Filter filter = metadataKey("status").isEqualTo("published")
.and(metadataKey("year").isEqualTo(2024));
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(testEmb)
.filter(filter)
.maxResults(10)
.build();
EmbeddingSearchResult<TextSegment> result = store.search(request);
boolean found = result.matches().stream()
.anyMatch(match -> match.embeddingId().equals(testId));
System.out.println("Filter test " + (found ? "passed" : "failed"));
// Cleanup
store.remove(testId);

More selective filters perform better:
// High selectivity - fast
Filter specific = metadataKey("id").isEqualTo("exact-id");
// Medium selectivity
Filter moderate = metadataKey("category").isEqualTo("tech");
// Low selectivity - slower
Filter broad = metadataKey("year").isGreaterThan(2000);

Apply filters in queries, not after retrieval:
// GOOD - filter during search
Filter filter = metadataKey("category").isEqualTo("tech");
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmbedding)
.filter(filter)
.maxResults(10)
.build();
// BAD - filter after retrieval (inefficient)
EmbeddingSearchResult<TextSegment> all = store.search(requestWithoutFilter);
List<EmbeddingMatch<TextSegment>> filtered = all.matches().stream()
.filter(match -> "tech".equals(
match.embedded().metadata().getString("category")
))
.collect(Collectors.toList());

Deeply nested filters may impact performance:
// Simple - fast
Filter simple = metadataKey("status").isEqualTo("published");
// Complex - slower
Filter complex = metadataKey("status").isEqualTo("published")
.and(
metadataKey("category").isIn(manyCategories)
.or(metadataKey("priority").isGreaterThan(5))
)
.and(metadataKey("year").isGreaterThanOrEqualTo(2020));

See:
Install with Tessl CLI
npx tessl i tessl/maven-dev-langchain4j--langchain4j-chroma