LangChain4j integration for Chroma embedding store enabling storage, retrieval, and similarity search of vector embeddings with metadata filtering support for both API V1 and V2.
—
Implementing semantic search using ChromaEmbeddingStore.
Semantic search finds content by meaning, not just keywords. It uses vector embeddings to capture semantic similarity.
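Traditional keyword search: matches only documents that contain the literal query terms, so a query for "deep learning" misses a document that talks about "neural networks" without using those words.
Semantic search: embeds the query and the documents as vectors and ranks documents by vector similarity, so conceptually related text is found even when it shares no words with the query.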
import dev.langchain4j.model.embedding.AllMiniLmL6V2EmbeddingModel;
import dev.langchain4j.model.embedding.EmbeddingModel;
import java.util.LinkedHashMap;
import java.util.Locale;
// Embedding model and Chroma store (collection "documents" at http://localhost:8000)
// shared by the examples that follow.
EmbeddingModel embeddingModel = new AllMiniLmL6V2EmbeddingModel();
ChromaEmbeddingStore store = ChromaEmbeddingStore.builder()
    .baseUrl("http://localhost:8000")
    .collectionName("documents")
    .build();

// Sample corpus to index.
List<String> documents = Arrays.asList(
    "Artificial intelligence is transforming technology",
    "Machine learning models learn from data patterns",
    "Neural networks mimic human brain structure",
    "Python is a popular programming language",
    "Java is used for enterprise applications"
);

// Wrap the texts in segments first so the whole corpus can be embedded with a single
// batch call instead of one embed() call per document.
List<TextSegment> segments = documents.stream()
    .map(TextSegment::from)
    .collect(Collectors.toList());
List<Embedding> embeddings = embeddingModel.embedAll(segments).content();

// Embeddings and segments are passed as parallel lists (same index = same document).
store.addAll(embeddings, segments);

String query = "What is deep learning?";
Embedding queryEmbedding = embeddingModel.embed(query).content();
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmbedding)
.maxResults(3)
.minScore(0.5)
.build();
EmbeddingSearchResult<TextSegment> results = store.search(request);
for (EmbeddingMatch<TextSegment> match : results.matches()) {
System.out.println("Score: " + match.score());
System.out.println("Text: " + match.embedded().text());
System.out.println();
}

/** Source document together with the attributes that are stored as segment metadata. */
record Document(String text, String category, String author, int year) {}

List<Document> documents = loadDocuments();

// One segment per document, carrying category/author/year as metadata so that
// searches can filter on those keys.
List<TextSegment> segments = documents.stream()
    .map(d -> TextSegment.from(
        d.text(),
        new Metadata()
            .put("category", d.category())
            .put("author", d.author())
            .put("year", d.year())))
    .collect(Collectors.toList());

// One embedding per document text, produced in the same order as the segments.
List<Embedding> embeddings = documents.stream()
    .map(d -> embeddingModel.embed(d.text()).content())
    .collect(Collectors.toList());

store.addAll(embeddings, segments);

String query = "artificial intelligence applications";
Embedding queryEmb = embeddingModel.embed(query).content();
// Filter: category = "tech" AND year >= 2020
Filter filter = metadataKey("category").isEqualTo("tech")
.and(metadataKey("year").isGreaterThanOrEqualTo(2020));
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmb)
.maxResults(10)
.minScore(0.7)
.filter(filter)
.build();
EmbeddingSearchResult<TextSegment> results = store.search(request);

/**
 * Searches with the original query plus a few fixed re-phrasings and merges the hits.
 *
 * <p>Each embedding ID appears once in the result. When several query variations hit
 * the same ID, the match with the highest score is kept, so the final ordering
 * reflects the best score each entry achieved rather than the score of whichever
 * variation happened to find it first.
 *
 * @param originalQuery the user's query text
 * @return de-duplicated matches sorted by descending score
 */
public List<EmbeddingMatch<TextSegment>> multiQuerySearch(
    String originalQuery
) {
    // Generate query variations
    List<String> queries = Arrays.asList(
        originalQuery,
        "What is " + originalQuery + "?",
        "Explain " + originalQuery,
        originalQuery + " overview"
    );

    // Best match seen so far per embedding ID; insertion-ordered so that ties keep
    // a deterministic order after the stable sort below.
    Map<String, EmbeddingMatch<TextSegment>> bestById = new LinkedHashMap<>();

    for (String query : queries) {
        Embedding queryEmb = embeddingModel.embed(query).content();
        EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
            .queryEmbedding(queryEmb)
            .maxResults(5)
            .minScore(0.7)
            .build();

        // Deduplicate, keeping the higher-scoring match for an ID seen more than once
        for (EmbeddingMatch<TextSegment> match : store.search(request).matches()) {
            bestById.merge(
                match.embeddingId(),
                match,
                (kept, candidate) -> candidate.score() > kept.score() ? candidate : kept);
        }
    }

    // Sort by score, best first
    List<EmbeddingMatch<TextSegment>> allResults = new ArrayList<>(bestById.values());
    allResults.sort((a, b) -> Double.compare(b.score(), a.score()));
    return allResults;
}

public List<TextSegment> hybridSearch(
String query,
String keywordFilter
) {
// Semantic search
Embedding queryEmb = embeddingModel.embed(query).content();
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmb)
.maxResults(20)
.minScore(0.6)
.build();
EmbeddingSearchResult<TextSegment> semanticResults = store.search(request);
// Post-filter by keywords (simple contains check)
return semanticResults.matches().stream()
.map(EmbeddingMatch::embedded)
.filter(seg -> seg.text().toLowerCase()
.contains(keywordFilter.toLowerCase()))
.collect(Collectors.toList());
}

/**
 * Retrieves up to 50 candidates by embedding similarity (minimum score 0.5), re-ranks
 * them with {@code computeRerankedScore} and returns the 10 best.
 *
 * <p>The re-ranked score is computed exactly once per candidate and then reused by the
 * sort; computing it inside the comparator would repeat the work on every comparison.
 *
 * @param query the user's query text
 * @return at most 10 matches ordered by descending re-ranked score
 */
public List<EmbeddingMatch<TextSegment>> searchWithReRanking(
    String query
) {
    // Pairs a candidate with its precomputed re-ranked score.
    record Scored(EmbeddingMatch<TextSegment> match, double score) {}

    Embedding queryEmb = embeddingModel.embed(query).content();

    // Get more candidates than are finally returned so re-ranking has room to reorder
    EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
        .queryEmbedding(queryEmb)
        .maxResults(50)
        .minScore(0.5)
        .build();
    EmbeddingSearchResult<TextSegment> results = store.search(request);

    // Re-rank using additional criteria
    return results.matches().stream()
        .map(candidate -> new Scored(candidate, computeRerankedScore(candidate, query)))
        .sorted((a, b) -> Double.compare(b.score(), a.score()))
        .limit(10)
        .map(Scored::match)
        .collect(Collectors.toList());
}
/**
 * Blends embedding similarity with recency and rating metadata into one score:
 * 60% similarity, 20% recency, 20% rating.
 *
 * <p>A segment without {@code created_at} or {@code rating} metadata contributes 0 for
 * that component instead of failing with a {@link NullPointerException} when the
 * missing value is unboxed.
 *
 * @param match candidate returned by the store
 * @param query the user's query text (not used by the current formula)
 * @return weighted combination of the three components
 */
private double computeRerankedScore(
    EmbeddingMatch<TextSegment> match,
    String query
) {
    double embeddingScore = match.score();
    Metadata meta = match.embedded().metadata();

    // Factor in recency; the boxed getter yields null when the key is absent
    Long createdAt = meta.getLong("created_at");
    double recencyScore = createdAt == null ? 0.0 : calculateRecency(createdAt);

    // Factor in quality indicators
    // NOTE(review): assumes "rating" is already on the same 0..1 scale as the other
    // components - confirm against the indexing code.
    Double rating = meta.getDouble("rating");
    double qualityScore = rating == null ? 0.0 : rating;

    // Weighted combination
    return (embeddingScore * 0.6)
        + (recencyScore * 0.2)
        + (qualityScore * 0.2);
}

public Map<String, List<EmbeddingMatch<TextSegment>>> searchByCategory(
String query,
List<String> categories
) {
Map<String, List<EmbeddingMatch<TextSegment>>> resultsByCategory =
new HashMap<>();
Embedding queryEmb = embeddingModel.embed(query).content();
for (String category : categories) {
Filter categoryFilter = metadataKey("category").isEqualTo(category);
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmb)
.maxResults(5)
.minScore(0.7)
.filter(categoryFilter)
.build();
EmbeddingSearchResult<TextSegment> results = store.search(request);
resultsByCategory.put(category, results.matches());
}
return resultsByCategory;
}

/**
 * Semantic search limited to segments whose {@code created_at} metadata is at or after
 * the current time minus {@code daysBack} days.
 *
 * @param query    the user's query text
 * @param daysBack size of the look-back window in days
 * @return matches returned by the store (at most 10, minimum score 0.7)
 */
public List<EmbeddingMatch<TextSegment>> searchRecent(
    String query,
    int daysBack
) {
    Embedding embeddedQuery = embeddingModel.embed(query).content();

    // Long arithmetic throughout, so the day count is widened before multiplying.
    // NOTE(review): presumably created_at is stored as epoch milliseconds - confirm.
    final long millisPerDay = 24L * 60 * 60 * 1000;
    long windowStart = System.currentTimeMillis() - (daysBack * millisPerDay);

    EmbeddingSearchRequest recentOnly = EmbeddingSearchRequest.builder()
        .queryEmbedding(embeddedQuery)
        .maxResults(10)
        .minScore(0.7)
        .filter(metadataKey("created_at").isGreaterThanOrEqualTo(windowStart))
        .build();

    EmbeddingSearchResult<TextSegment> found = store.search(recentOnly);
    return found.matches();
}

// Interpret scores
for (EmbeddingMatch<TextSegment> match : results.matches()) {
double score = match.score();
String relevance = switch ((int)(score * 10)) {
case 10, 9 -> "Highly relevant";
case 8, 7 -> "Relevant";
case 6, 5 -> "Moderately relevant";
default -> "Marginally relevant";
};
System.out.println(relevance + " (" + score + "): " +
match.embedded().text());
}public List<EmbeddingMatch<TextSegment>> adaptiveSearch(String query) {
Embedding queryEmb = embeddingModel.embed(query).content();
// Try with high threshold first
EmbeddingSearchRequest strictRequest = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmb)
.maxResults(10)
.minScore(0.8)
.build();
EmbeddingSearchResult<TextSegment> results = store.search(strictRequest);
// If too few results, lower threshold
if (results.matches().size() < 3) {
EmbeddingSearchRequest relaxedRequest = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmb)
.maxResults(10)
.minScore(0.6)
.build();
results = store.search(relaxedRequest);
}
return results.matches();
}

/** Words removed by {@code preprocessQuery}; a set so each lookup is constant-time. */
private static final Set<String> STOP_WORDS = Set.of("the", "a", "an", "and", "or");

/**
 * Normalizes a query before embedding: trims it, lower-cases it and drops basic
 * English stop words.
 *
 * <p>Lower-casing uses {@link Locale#ROOT} so the result does not depend on the JVM's
 * default locale (for example, "I" must not become a dotless "ı" under a Turkish
 * locale).
 *
 * @param query raw query text; must not be {@code null}
 * @return remaining words joined by single spaces (empty if nothing remains)
 */
public String preprocessQuery(String query) {
    // Remove noise
    String normalized = query.trim().toLowerCase(Locale.ROOT);

    // Remove stop words (basic example)
    return Arrays.stream(normalized.split("\\s+"))
        .filter(word -> !STOP_WORDS.contains(word))
        .collect(Collectors.joining(" "));
}

public List<EmbeddingMatch<TextSegment>> expandedSearch(String query) {
// Add related terms
String expandedQuery = query + " " +
getRelatedTerms(query);
Embedding queryEmb = embeddingModel.embed(expandedQuery).content();
return performSearch(queryEmb);
}
/** Related terms keyed by lower-case query phrase; built once instead of per call. */
private static final Map<String, String> RELATED_TERMS = Map.of(
    "machine learning", "ML artificial intelligence AI",
    "programming", "coding software development",
    "database", "data storage persistence"
);

/**
 * Looks up extra terms to append to a query.
 *
 * <p>The query is trimmed and lower-cased (locale-independently) before the lookup, so
 * inputs such as {@code " Machine Learning"} resolve to the same entry as
 * {@code "machine learning"}.
 *
 * @param query the user's query text; must not be {@code null}
 * @return space-separated related terms, or an empty string when none are known
 */
private String getRelatedTerms(String query) {
    // Simple example - in practice, use thesaurus or LLM
    String key = query.trim().toLowerCase(Locale.ROOT);
    return RELATED_TERMS.getOrDefault(key, "");
}

public class CachedSemanticSearch {
private final Map<String, Embedding> queryCache = new ConcurrentHashMap<>();
private final Map<String, List<EmbeddingMatch<TextSegment>>> resultCache =
new ConcurrentHashMap<>();
public List<EmbeddingMatch<TextSegment>> search(String query) {
// Check result cache
if (resultCache.containsKey(query)) {
return resultCache.get(query);
}
// Get or compute embedding
Embedding queryEmb = queryCache.computeIfAbsent(
query,
q -> embeddingModel.embed(q).content()
);
// Perform search
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmb)
.maxResults(10)
.build();
List<EmbeddingMatch<TextSegment>> results =
store.search(request).matches();
// Cache results
resultCache.put(query, results);
return results;
}
}

/**
 * Runs a semantic search and, when the outcome looks weak, attaches alternative query
 * suggestions to the result.
 *
 * <p>The outcome counts as weak when nothing was found or when the first match scores
 * below the confidence threshold. That threshold is deliberately higher than the
 * request's minimum score: the request already excludes every hit below the minimum,
 * so a threshold at or below it could never trigger.
 *
 * @param query the user's query text
 * @return the matches, plus suggestions when the outcome is weak
 */
public SearchResult searchWithSuggestions(String query) {
    final double minScore = 0.7;        // hits below this are not returned at all
    final double confidentScore = 0.8;  // first hit below this => offer suggestions

    Embedding queryEmb = embeddingModel.embed(query).content();
    EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
        .queryEmbedding(queryEmb)
        .maxResults(10)
        .minScore(minScore)
        .build();
    List<EmbeddingMatch<TextSegment>> matches = store.search(request).matches();

    SearchResult result = new SearchResult();
    result.matches = matches;

    // If low quality results, suggest alternative queries.
    // NOTE(review): relies on the store returning matches best-first - confirm.
    boolean lowQuality = matches.isEmpty() || matches.get(0).score() < confidentScore;
    if (lowQuality) {
        result.suggestions = generateSuggestions(query);
    }
    return result;
}
/**
 * Mutable holder for the outcome of a search: the matches plus optional alternative
 * query suggestions. Fields are package-private and assigned directly by the caller.
 */
class SearchResult {
    /** Matches found for the query. */
    List<EmbeddingMatch<TextSegment>> matches;

    /** Suggested alternative queries; stays {@code null} unless explicitly assigned. */
    List<String> suggestions;
}

public List<TextSegment> findRelated(String documentId) {
// Get the document
EmbeddingSearchRequest getDoc = EmbeddingSearchRequest.builder()
.queryEmbedding(anyEmbedding)
.filter(metadataKey("id").isEqualTo(documentId))
.maxResults(1)
.build();
EmbeddingSearchResult<TextSegment> docResult = store.search(getDoc);
if (docResult.matches().isEmpty()) {
return Collections.emptyList();
}
// Find similar
Embedding docEmb = docResult.matches().get(0).embedding();
EmbeddingSearchRequest relatedReq = EmbeddingSearchRequest.builder()
.queryEmbedding(docEmb)
.maxResults(6) // Get 6 to exclude the document itself
.minScore(0.7)
.build();
return store.search(relatedReq).matches().stream()
.filter(match -> !match.embeddingId().equals(documentId))
.limit(5)
.map(EmbeddingMatch::embedded)
.collect(Collectors.toList());
}

Install with Tessl CLI:
npx tessl i tessl/maven-dev-langchain4j--langchain4j-chroma