LangChain4j integration for the Chroma embedding store, enabling storage, retrieval, and similarity search of vector embeddings, with metadata filtering support, for both Chroma API V1 and V2.
Implementing semantic search using ChromaEmbeddingStore.
Semantic search finds content by meaning, not just keywords. It uses vector embeddings to capture semantic similarity.
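Traditional keyword search: matches only documents that contain the literal query terms, so a query such as "What is deep learning?" misses a document that talks about "neural networks" without using those words.
Semantic search: compares the embedding of the query with the embeddings of the stored documents, so the same query can still match "Neural networks mimic human brain structure" because the two texts are close in meaning.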
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.model.embedding.AllMiniLmL6V2EmbeddingModel;
// Embedding model used to turn text into vectors.
EmbeddingModel embeddingModel = new AllMiniLmL6V2EmbeddingModel();

// Store backed by the "documents" collection of the Chroma server at http://localhost:8000.
ChromaEmbeddingStore store = ChromaEmbeddingStore.builder()
        .baseUrl("http://localhost:8000")
        .collectionName("documents")
        .build();

// Sample corpus to index.
List<String> documents = Arrays.asList(
        "Artificial intelligence is transforming technology",
        "Machine learning models learn from data patterns",
        "Neural networks mimic human brain structure",
        "Python is a popular programming language",
        "Java is used for enterprise applications"
);

// Wrap every text in a TextSegment first so the whole batch can be embedded at once.
List<TextSegment> segments = documents.stream()
        .map(TextSegment::from)
        .collect(Collectors.toList());

// Embed the batch with a single embedAll call instead of one embed() call per document.
// The resulting list is positionally aligned with `segments`.
List<Embedding> embeddings = embeddingModel.embedAll(segments).content();
store.addAll(embeddings, segments);

// Embed the natural-language question with the same model used for indexing.
String query = "What is deep learning?";
Embedding queryEmbedding = embeddingModel.embed(query).content();

// Ask for at most 3 matches whose score is at least 0.5.
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
        .queryEmbedding(queryEmbedding)
        .maxResults(3)
        .minScore(0.5)
        .build();
EmbeddingSearchResult<TextSegment> results = store.search(request);

// Print each hit as a score line, a text line and a blank separator line.
results.matches().forEach(match -> {
    System.out.println("Score: " + match.score());
    System.out.println("Text: " + match.embedded().text());
    System.out.println();
});

// Simple document shape used by the metadata example: text plus three attributes.
record Document(String text, String category, String author, int year) {}
List<Document> documents = loadDocuments();
List<Embedding> embeddings = new ArrayList<>();
List<TextSegment> segments = new ArrayList<>();
for (Document doc : documents) {
// Create embedding
Embedding emb = embeddingModel.embed(doc.text()).content();
embeddings.add(emb);
// Create segment with metadata
Metadata meta = new Metadata()
.put("category", doc.category())
.put("author", doc.author())
.put("year", doc.year());
segments.add(TextSegment.from(doc.text(), meta));
}
store.addAll(embeddings, segments);

String query = "artificial intelligence applications";
Embedding queryEmb = embeddingModel.embed(query).content();

// Two metadata conditions, combined with AND below:
// category = "tech" and year >= 2020.
Filter isTech = metadataKey("category").isEqualTo("tech");
Filter fromYear2020 = metadataKey("year").isGreaterThanOrEqualTo(2020);
Filter filter = isTech.and(fromYear2020);

// Up to 10 matches scoring at least 0.7, restricted to segments that satisfy the filter.
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
        .queryEmbedding(queryEmb)
        .maxResults(10)
        .minScore(0.7)
        .filter(filter)
        .build();
EmbeddingSearchResult<TextSegment> results = store.search(request);

/**
 * Searches with the original query plus three rephrasings of it and merges the hits.
 *
 * <p>Each variation is embedded and searched separately (at most 5 matches with a
 * score of at least 0.7). The same embedding may be returned by several variations
 * with different scores; only its highest-scoring match is kept.
 *
 * @param originalQuery the user's query text
 * @return the merged matches, one per embedding id, sorted by descending score
 */
public List<EmbeddingMatch<TextSegment>> multiQuerySearch(
        String originalQuery
) {
    // Generate query variations
    List<String> queries = Arrays.asList(
            originalQuery,
            "What is " + originalQuery + "?",
            "Explain " + originalQuery,
            originalQuery + " overview"
    );

    // Collect every hit from every variation, duplicates included.
    List<EmbeddingMatch<TextSegment>> candidates = new ArrayList<>();
    for (String query : queries) {
        Embedding queryEmb = embeddingModel.embed(query).content();
        EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
                .queryEmbedding(queryEmb)
                .maxResults(5)
                .minScore(0.7)
                .build();
        candidates.addAll(store.search(request).matches());
    }

    // Sort before de-duplicating: the first occurrence of an id is then its best score.
    // (De-duplicating first would keep whichever variation happened to find it first.)
    candidates.sort((a, b) -> Double.compare(b.score(), a.score()));

    Set<String> seenIds = new HashSet<>();
    List<EmbeddingMatch<TextSegment>> bestMatches = new ArrayList<>();
    for (EmbeddingMatch<TextSegment> match : candidates) {
        // Set.add returns false when the id was already present.
        if (seenIds.add(match.embeddingId())) {
            bestMatches.add(match);
        }
    }
    return bestMatches;
}

/**
 * Semantic search followed by a case-insensitive keyword post-filter.
 *
 * <p>Fetches up to 20 matches with a score of at least 0.6 and keeps only the segments
 * whose text contains {@code keywordFilter}. Because the keyword filter is applied after
 * the search, fewer than 20 segments (possibly none) may be returned.
 *
 * @param query         text to embed and search for
 * @param keywordFilter substring that must occur in the segment text; {@code null} or
 *                      empty disables the keyword filter
 * @return matching segments in the order returned by the store
 */
public List<TextSegment> hybridSearch(
        String query,
        String keywordFilter
) {
    // Normalise the keyword once, locale-independently, instead of once per segment.
    String keyword = (keywordFilter == null)
            ? ""
            : keywordFilter.toLowerCase(java.util.Locale.ROOT);

    // Semantic search
    Embedding queryEmb = embeddingModel.embed(query).content();
    EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
            .queryEmbedding(queryEmb)
            .maxResults(20)
            .minScore(0.6)
            .build();
    EmbeddingSearchResult<TextSegment> semanticResults = store.search(request);

    // Post-filter by keyword (simple contains check); skip matches without a segment.
    return semanticResults.matches().stream()
            .map(EmbeddingMatch::embedded)
            .filter(seg -> seg != null
                    && seg.text().toLowerCase(java.util.Locale.ROOT).contains(keyword))
            .collect(Collectors.toList());
}

/**
 * Retrieves a wide candidate set and re-orders it with {@code computeRerankedScore}.
 *
 * <p>Up to 50 candidates with a score of at least 0.5 are fetched; the 10 with the
 * highest re-ranked score are returned. The returned matches still carry their original
 * embedding score, not the re-ranked one.
 *
 * @param query text to embed and search for
 * @return at most 10 matches, ordered by descending re-ranked score
 */
public List<EmbeddingMatch<TextSegment>> searchWithReRanking(
        String query
) {
    Embedding queryEmb = embeddingModel.embed(query).content();

    // Get more candidates than will be returned so re-ranking has room to reorder.
    EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
            .queryEmbedding(queryEmb)
            .maxResults(50)
            .minScore(0.5)
            .build();
    EmbeddingSearchResult<TextSegment> results = store.search(request);

    // Pair each match with its re-ranked score so the score is computed exactly once
    // per match rather than on every comparison made by the sort.
    record Scored(EmbeddingMatch<TextSegment> match, double score) {}

    return results.matches().stream()
            .map(match -> new Scored(match, computeRerankedScore(match, query)))
            .sorted((a, b) -> Double.compare(b.score(), a.score()))
            .limit(10)
            .map(Scored::match)
            .collect(Collectors.toList());
}
/**
 * Combines the embedding score with recency and rating metadata:
 * {@code 0.6 * embeddingScore + 0.2 * recency + 0.2 * rating}.
 *
 * <p>A segment without {@code created_at} or {@code rating} metadata contributes 0 for
 * the missing component instead of failing.
 *
 * @param match the match to score; its segment and metadata are read
 * @param query the search query (currently not used by the formula)
 * @return the weighted score
 */
private double computeRerankedScore(
        EmbeddingMatch<TextSegment> match,
        String query
) {
    double embeddingScore = match.score();
    Metadata meta = match.embedded().metadata();

    // Factor in recency. getLong yields a boxed value that is null when the key is
    // absent, so check before it is unboxed.
    Long createdAt = meta.getLong("created_at");
    double recencyScore = (createdAt == null) ? 0.0 : calculateRecency(createdAt);

    // Factor in quality indicators; same null handling as above.
    // NOTE(review): the weights assume "rating" is on a 0..1 scale like the other
    // components - confirm, or normalise it before weighting.
    Double rating = meta.getDouble("rating");
    double qualityScore = (rating == null) ? 0.0 : rating;

    // Weighted combination
    return (embeddingScore * 0.6)
            + (recencyScore * 0.2)
            + (qualityScore * 0.2);
}

/**
 * Runs the same query once per category and groups the hits by category.
 *
 * <p>For every category, up to 5 matches with a score of at least 0.7 are fetched from
 * the segments whose {@code category} metadata equals that category.
 *
 * @param query      text to embed; embedded once and reused for every category
 * @param categories category values to search within
 * @return a map from category to the matches found for it
 */
public Map<String, List<EmbeddingMatch<TextSegment>>> searchByCategory(
        String query,
        List<String> categories
) {
    Map<String, List<EmbeddingMatch<TextSegment>>> grouped = new HashMap<>();
    Embedding queryEmb = embeddingModel.embed(query).content();

    categories.forEach(category -> {
        EmbeddingSearchRequest perCategoryRequest = EmbeddingSearchRequest.builder()
                .queryEmbedding(queryEmb)
                .maxResults(5)
                .minScore(0.7)
                .filter(metadataKey("category").isEqualTo(category))
                .build();
        grouped.put(category, store.search(perCategoryRequest).matches());
    });

    return grouped;
}

/**
 * Searches only segments whose {@code created_at} metadata falls within the last
 * {@code daysBack} days.
 *
 * <p>The cutoff is computed from {@link System#currentTimeMillis()}, so
 * {@code created_at} is compared as epoch milliseconds.
 *
 * @param query    text to embed and search for
 * @param daysBack size of the look-back window, in days
 * @return up to 10 matches with a score of at least 0.7 inside the window
 */
public List<EmbeddingMatch<TextSegment>> searchRecent(
        String query,
        int daysBack
) {
    Embedding queryEmb = embeddingModel.embed(query).content();

    // long arithmetic throughout, so large windows do not overflow int.
    final long millisPerDay = 24L * 60 * 60 * 1000;
    long cutoffTime = System.currentTimeMillis() - (daysBack * millisPerDay);

    EmbeddingSearchRequest recentOnly = EmbeddingSearchRequest.builder()
            .queryEmbedding(queryEmb)
            .maxResults(10)
            .minScore(0.7)
            .filter(metadataKey("created_at").isGreaterThanOrEqualTo(cutoffTime))
            .build();

    EmbeddingSearchResult<TextSegment> found = store.search(recentOnly);
    return found.matches();
}

// Translate each numeric score into a human-readable relevance label.
for (EmbeddingMatch<TextSegment> match : results.matches()) {
    double score = match.score();

    // Bucket by the first decimal digit: 0.93 -> 9, 0.78 -> 7, ...
    int bucket = (int) (score * 10);
    String relevance;
    if (bucket == 10 || bucket == 9) {
        relevance = "Highly relevant";
    } else if (bucket == 8 || bucket == 7) {
        relevance = "Relevant";
    } else if (bucket == 6 || bucket == 5) {
        relevance = "Moderately relevant";
    } else {
        relevance = "Marginally relevant";
    }

    String line = relevance + " (" + score + "): " + match.embedded().text();
    System.out.println(line);
}

/**
 * Searches with a strict score threshold and falls back to a relaxed one when the
 * strict search yields too few hits.
 *
 * <p>The first attempt requires a score of at least 0.8. If it returns fewer than 3
 * matches, the search is repeated with a minimum score of 0.6 and that result is
 * returned instead. Each attempt returns at most 10 matches.
 *
 * @param query text to embed and search for
 * @return the matches of the last attempt that was executed
 */
public List<EmbeddingMatch<TextSegment>> adaptiveSearch(String query) {
    Embedding queryEmb = embeddingModel.embed(query).content();

    // Thresholds in the order they are tried: strict first, relaxed second.
    double[] thresholds = {0.8, 0.6};

    List<EmbeddingMatch<TextSegment>> matches = Collections.emptyList();
    for (double threshold : thresholds) {
        EmbeddingSearchRequest attempt = EmbeddingSearchRequest.builder()
                .queryEmbedding(queryEmb)
                .maxResults(10)
                .minScore(threshold)
                .build();
        matches = store.search(attempt).matches();

        // Enough results at this threshold: no need to relax further.
        if (matches.size() >= 3) {
            break;
        }
    }
    return matches;
}

/**
 * Normalises a query before it is embedded: trims it, lower-cases it and removes a small
 * set of English stop words ("the", "a", "an", "and", "or").
 *
 * <p>The result can be empty, e.g. when the query consists only of stop words.
 *
 * @param query raw user query; {@code null} is treated as an empty query
 * @return the remaining words joined by single spaces
 */
public String preprocessQuery(String query) {
    if (query == null) {
        return "";
    }

    // Remove noise. Locale.ROOT keeps lower-casing independent of the JVM's default
    // locale (e.g. a Turkish locale would map "I" to a dotless "ı").
    String normalized = query.trim().toLowerCase(java.util.Locale.ROOT);

    // Remove stop words (basic example); a Set gives constant-time lookups.
    Set<String> stopWords = Set.of("the", "a", "an", "and", "or");
    return Arrays.stream(normalized.split("\\s+"))
            .filter(word -> !stopWords.contains(word))
            .collect(Collectors.joining(" "));
}

/**
 * Searches with the query expanded by related terms from {@code getRelatedTerms}.
 *
 * <p>When no related terms are known, the query is embedded unchanged.
 *
 * @param query the user's query text
 * @return the matches produced by {@code performSearch} for the expanded query
 */
public List<EmbeddingMatch<TextSegment>> expandedSearch(String query) {
    // Add related terms, but only when there are some: unconditionally appending
    // " " + "" would embed the query with a stray trailing space.
    String relatedTerms = getRelatedTerms(query);
    String expandedQuery = relatedTerms.isEmpty()
            ? query
            : query + " " + relatedTerms;

    Embedding queryEmb = embeddingModel.embed(expandedQuery).content();
    return performSearch(queryEmb);
}
/**
 * Looks up extra search terms for a query.
 *
 * <p>Only an exact, case-sensitive match of the whole query against one of the three
 * known phrases yields related terms; every other query yields an empty string.
 *
 * @param query the query to expand
 * @return space-separated related terms, or {@code ""} when none are known
 */
private String getRelatedTerms(String query) {
    // Simple example - in practice, use thesaurus or LLM
    return switch (query) {
        case "machine learning" -> "ML artificial intelligence AI";
        case "programming" -> "coding software development";
        case "database" -> "data storage persistence";
        default -> "";
    };
}

/**
 * Semantic search with two in-memory caches keyed by the exact query string: one for
 * query embeddings and one for complete result lists.
 *
 * <p>Neither cache is bounded or ever invalidated, so cached results do not reflect
 * segments added to or removed from the store after the first lookup of a query.
 */
public class CachedSemanticSearch {
    private final Map<String, Embedding> queryCache = new ConcurrentHashMap<>();
    private final Map<String, List<EmbeddingMatch<TextSegment>>> resultCache =
            new ConcurrentHashMap<>();

    /**
     * Returns up to 10 matches for the query, served from the cache when available.
     *
     * @param query text to search for; also the cache key
     * @return an unmodifiable list of matches
     */
    public List<EmbeddingMatch<TextSegment>> search(String query) {
        // Check result cache with a single lookup (no containsKey-then-get gap).
        List<EmbeddingMatch<TextSegment>> cached = resultCache.get(query);
        if (cached != null) {
            return cached;
        }

        // Get or compute embedding
        Embedding queryEmb = queryCache.computeIfAbsent(
                query,
                q -> embeddingModel.embed(q).content()
        );

        // Perform search
        EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
                .queryEmbedding(queryEmb)
                .maxResults(10)
                .build();

        // Cache an unmodifiable snapshot: the same list instance is handed to every
        // caller, so a caller must not be able to reorder or trim it for the others.
        List<EmbeddingMatch<TextSegment>> results =
                List.copyOf(store.search(request).matches());
        resultCache.put(query, results);
        return results;
    }
}

/**
 * Searches for the query and, when the results look weak, attaches alternative query
 * suggestions.
 *
 * <p>Up to 10 matches with a score of at least 0.7 are returned. Suggestions are
 * generated when there are no matches at all or when the first match scores below 0.8;
 * otherwise {@code suggestions} is left unset ({@code null}).
 *
 * @param query text to embed and search for
 * @return the matches plus optional suggestions
 */
public SearchResult searchWithSuggestions(String query) {
    final double minScore = 0.7;
    // Must be higher than minScore: every returned match already scores >= minScore,
    // so a lower "low quality" threshold could never trigger on a non-empty result.
    final double confidentScore = 0.8;

    Embedding queryEmb = embeddingModel.embed(query).content();
    EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
            .queryEmbedding(queryEmb)
            .maxResults(10)
            .minScore(minScore)
            .build();
    EmbeddingSearchResult<TextSegment> results = store.search(request);

    SearchResult result = new SearchResult();
    result.matches = results.matches();

    // If low quality results, suggest alternative queries
    if (results.matches().isEmpty()
            || results.matches().get(0).score() < confidentScore) {
        result.suggestions = generateSuggestions(query);
    }
    return result;
}
/**
 * Mutable holder for the outcome of searchWithSuggestions: the matches plus optional
 * alternative queries.
 */
class SearchResult {
// Matches returned by the store for the query.
List<EmbeddingMatch<TextSegment>> matches;
// Alternative queries from generateSuggestions; only assigned when the search found
// nothing or its first match scored low, otherwise left null.
List<String> suggestions;
}

/**
 * Finds up to 5 segments that are semantically similar to an already stored document.
 *
 * <p>The document is located through its {@code id} metadata value; its stored
 * embedding is then used as the query for a second search, from which the document
 * itself is excluded.
 *
 * @param documentId value of the {@code id} metadata entry of the source document
 * @return up to 5 related segments with a score of at least 0.7, or an empty list when
 *         no document with that id exists
 */
public List<TextSegment> findRelated(String documentId) {
    // Get the document. The query embedding is irrelevant here because the metadata
    // filter selects the document; `anyEmbedding` must be supplied by the caller's
    // context and have the collection's dimensionality.
    EmbeddingSearchRequest getDoc = EmbeddingSearchRequest.builder()
            .queryEmbedding(anyEmbedding)
            .filter(metadataKey("id").isEqualTo(documentId))
            .maxResults(1)
            .build();
    List<EmbeddingMatch<TextSegment>> found = store.search(getDoc).matches();
    if (found.isEmpty()) {
        return Collections.emptyList();
    }

    EmbeddingMatch<TextSegment> source = found.get(0);
    // Exclude the source by the embedding id the store reported for it. The metadata
    // "id" used for the lookup is a different field and need not equal the embedding id.
    String sourceEmbeddingId = source.embeddingId();

    // Find similar
    EmbeddingSearchRequest relatedReq = EmbeddingSearchRequest.builder()
            .queryEmbedding(source.embedding())
            .maxResults(6) // Get 6 to exclude the document itself
            .minScore(0.7)
            .build();

    return store.search(relatedReq).matches().stream()
            .filter(match -> !match.embeddingId().equals(sourceEmbeddingId))
            .limit(5)
            .map(EmbeddingMatch::embedded)
            .collect(Collectors.toList());
}
Install with the Tessl CLI:
npx tessl i tessl/maven-dev-langchain4j--langchain4j-chroma@1.11.0