LangChain4j integration for Chroma embedding store enabling storage, retrieval, and similarity search of vector embeddings with metadata filtering support for both API V1 and V2.
—
Comprehensive guide to using metadata with ChromaEmbeddingStore.
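Metadata allows attaching key-value pairs to embeddings, for example to record a document's identity, classification, source, timestamps, status, and quality signals, and to filter search results on those fields.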
Metadata metadata = new Metadata()
.put("string_field", "value") // String
.put("int_field", 42) // Integer
.put("long_field", 123456789L) // Long
.put("float_field", 3.14f) // Float
.put("double_field", 3.14159) // Double
.put("uuid_field", UUID.randomUUID()); // UUID

// NOT SUPPORTED - will cause errors
// metadata.put("bool_field", true); // Boolean - NOT supported
// metadata.put("array_field", array); // Arrays - NOT supported
// metadata.put("object_field", object); // Objects - NOT supported

// Workaround for booleans - Option 1: Use string
Metadata meta1 = new Metadata().put("active", "true");
Filter f1 = metadataKey("active").isEqualTo("true");
// Option 2: Use 0/1
Metadata meta2 = new Metadata().put("active", 1);
Filter f2 = metadataKey("active").isEqualTo(1);

Metadata docMetadata = new Metadata()
.put("title", "Document Title")
.put("author", "John Doe")
.put("created_at", System.currentTimeMillis())
.put("modified_at", System.currentTimeMillis())
.put("version", 1)
.put("document_id", UUID.randomUUID());

Metadata categoryMetadata = new Metadata()
.put("category", "technology")
.put("subcategory", "machine-learning")
.put("tags", "ai,ml,embeddings") // Comma-separated
.put("difficulty", "intermediate");

Metadata sourceMetadata = new Metadata()
.put("source", "/docs/guide.pdf")
.put("page", 42)
.put("chapter", 3)
.put("url", "https://example.com/docs/guide.pdf");

long now = System.currentTimeMillis();
long thirtyDaysAgo = now - (30L * 24 * 60 * 60 * 1000);
Metadata timeMetadata = new Metadata()
.put("indexed_at", now)
.put("published_at", thirtyDaysAgo)
.put("year", 2024)
.put("month", 3);

Metadata statusMetadata = new Metadata()
.put("status", "published")
.put("visibility", "public")
.put("archived", 0) // Boolean as int
.put("priority", 5);

Metadata qualityMetadata = new Metadata()
.put("rating", 4.5)
.put("views", 1250)
.put("likes", 87)
.put("confidence", 0.95);

Metadata meta = new Metadata();

Metadata meta = new Metadata()
.put("key1", "value1")
.put("key2", 42)
.put("key3", 3.14);

Map<String, Object> map = new HashMap<>();
map.put("author", "John Doe");
map.put("year", 2024);
map.put("rating", 4.5);
Metadata meta = new Metadata(map);

/**
 * Factory for the metadata attached to indexed documents, so every caller
 * produces the same set of keys.
 */
public class MetadataBuilder {

    /**
     * Builds document metadata and stamps it with the current time.
     *
     * @param author   value stored under "author"
     * @param year     value stored under "year"
     * @param category value stored under "category"
     * @return metadata holding the three arguments plus "indexed_at",
     *         taken from {@link System#currentTimeMillis()}
     */
    public static Metadata forDocument(String author, int year, String category) {
        Metadata documentMetadata = new Metadata();
        documentMetadata.put("author", author);
        documentMetadata.put("year", year);
        documentMetadata.put("category", category);
        // Read the clock last, matching the order in which the fields are written.
        documentMetadata.put("indexed_at", System.currentTimeMillis());
        return documentMetadata;
    }
}
// Usage
Metadata meta = MetadataBuilder.forDocument("John Doe", 2024, "tech");

Metadata meta = segment.metadata();
String author = meta.getString("author");
Integer year = meta.getInteger("year");
Double rating = meta.getDouble("rating");
UUID id = meta.getUUID("document_id");
Long timestamp = meta.getLong("created_at");
Float score = meta.getFloat("score");

// Check existence first
if (meta.containsKey("year")) {
int year = meta.getInteger("year");
}
// Or handle null
Integer year = meta.getInteger("year");
if (year != null) {
// Use year
} else {
// Use default
year = 2024;
}

Map<String, Object> map = meta.toMap();
// Iterate all values
for (Map.Entry<String, Object> entry : map.entrySet()) {
System.out.println(entry.getKey() + ": " + entry.getValue());
}

Filter authorFilter = metadataKey("author").isEqualTo("John Doe");
Filter categoryFilter = metadataKey("category").isEqualTo("tech");

// Range comparisons only work with numeric metadata
Filter yearFilter = metadataKey("year").isGreaterThanOrEqualTo(2020);
Filter ratingFilter = metadataKey("rating").isGreaterThan(4.0);
Filter priorityFilter = metadataKey("priority").isLessThanOrEqualTo(5);

Filter categoriesFilter = metadataKey("category")
.isIn(Arrays.asList("tech", "science", "engineering"));
Filter excludeStatuses = metadataKey("status")
.isNotIn(Arrays.asList("draft", "archived"));

See the Filtering Guide for complete filter documentation.
Consider these questions:
/**
 * Single place that defines which metadata fields a document carries.
 */
public class DocumentMetadataSchema {

    /**
     * Creates metadata that follows the document schema.
     *
     * @param documentId textual UUID, parsed with {@link UUID#fromString(String)}
     * @param source     value stored under "source"
     * @param category   value stored under "category"
     * @param author     value stored under "author"
     * @param year       value stored under "year"
     * @param rating     value stored under "rating"
     * @return metadata with the supplied values plus "indexed_at" (current
     *         epoch milliseconds), "status" fixed to "published" and
     *         "version" fixed to 1
     * @throws IllegalArgumentException if {@code documentId} is not a valid UUID string
     */
    public static Metadata create(String documentId, String source, String category,
            String author, int year, double rating) {
        Metadata schema = new Metadata();

        // Identity
        schema.put("document_id", UUID.fromString(documentId));
        schema.put("source", source);

        // Classification
        schema.put("category", category);
        schema.put("author", author);

        // Temporal
        schema.put("year", year);
        schema.put("indexed_at", System.currentTimeMillis());

        // Quality
        schema.put("rating", rating);

        // Status
        schema.put("status", "published");
        schema.put("version", 1);

        return schema;
    }
}

// GOOD: Consistent naming conventions
.put("created_at", timestamp)
.put("updated_at", timestamp)
.put("indexed_at", timestamp)
// BAD: Inconsistent naming
.put("createdAt", timestamp) // camelCase
.put("updated_time", timestamp) // Different suffix
.put("indexTimestamp", timestamp) // Different formatpublic interface StandardMetadataFields {
String ID = "id";
String SOURCE = "source";
String CATEGORY = "category";
String AUTHOR = "author";
String CREATED_AT = "created_at";
String UPDATED_AT = "updated_at";
String STATUS = "status";
String VERSION = "version";
}
// Usage
Metadata meta = new Metadata()
.put(StandardMetadataFields.ID, id)
.put(StandardMetadataFields.SOURCE, source)
.put(StandardMetadataFields.CATEGORY, category);

// Use multiple fields for hierarchy
Metadata hierarchical = new Metadata()
.put("category_l1", "technology")
.put("category_l2", "artificial-intelligence")
.put("category_l3", "machine-learning");
// Filter at any level
Filter l1Filter = metadataKey("category_l1").isEqualTo("technology");
Filter l2Filter = metadataKey("category_l2").isEqualTo("artificial-intelligence");

Metadata versionedMetadata = new Metadata()
.put("document_id", documentId)
.put("version", 2)
.put("created_at", originalTimestamp)
.put("updated_at", System.currentTimeMillis())
.put("changelog", "Fixed typos, updated examples");
// Query latest version
Filter latestVersion = metadataKey("document_id").isEqualTo(documentId);
// Then sort by version in application code

Since arrays aren't supported, use comma-separated strings:
// Store
Metadata meta = new Metadata()
.put("tags", "ai,ml,embeddings,vector-search");
// Parse when needed
String tags = meta.getString("tags");
List<String> tagList = Arrays.asList(tags.split(","));
// Filter (limited - can only match exact string)
Filter tagFilter = metadataKey("tags").isEqualTo("ai,ml,embeddings,vector-search");
// Better: Use separate fields for important multi-values
Metadata betterMeta = new Metadata()
.put("primary_tag", "ai")
.put("secondary_tag", "ml")
.put("tertiary_tag", "embeddings");

/** Matches a standalone four-digit number from 1900 through 2099; compiled once and reused. */
private static final Pattern YEAR_PATTERN = Pattern.compile("\\b(19|20)\\d{2}\\b");

/**
 * Derives metadata from a document's text.
 *
 * <p>Always writes "source", "indexed_at" (current epoch milliseconds),
 * "word_count" and "length_category" ("short" below 100 words, "medium"
 * below 500, otherwise "long"). Writes "extracted_year" only when the text
 * contains a four-digit number between 1900 and 2099; the first such match
 * is used.
 *
 * @param text   document text to analyse
 * @param source value stored under "source"
 * @return the computed metadata
 */
public Metadata computeMetadata(String text, String source) {
    Metadata meta = new Metadata()
            .put("source", source)
            .put("indexed_at", System.currentTimeMillis());

    // Trim before splitting: split("\\s+") returns one empty token for an empty
    // string and a leading empty token when the text starts with whitespace,
    // so an untrimmed split over-counts by one in both cases.
    String trimmed = text.trim();
    int wordCount = trimmed.isEmpty() ? 0 : trimmed.split("\\s+").length;
    meta.put("word_count", wordCount);

    // Categorize by length
    String lengthCategory = wordCount < 100 ? "short"
            : wordCount < 500 ? "medium" : "long";
    meta.put("length_category", lengthCategory);

    // Extract year from text if possible
    Matcher matcher = YEAR_PATTERN.matcher(text);
    if (matcher.find()) {
        meta.put("extracted_year", Integer.parseInt(matcher.group()));
    }
    return meta;
}

// GOOD: Compact metadata
Metadata compact = new Metadata()
.put("id", id)
.put("category", category)
.put("year", year);
// BAD: Storing large text in metadata
Metadata bloated = new Metadata()
.put("id", id)
.put("full_text", longDocument); // Store in TextSegment instead

Choose metadata fields that you'll filter on:
// Fields you'll filter on
Metadata filterable = new Metadata()
.put("category", "tech") // Will filter
.put("year", 2024) // Will filter
.put("status", "published"); // Will filter
// Metadata that doesn't need filtering can still be stored
filterable.put("notes", "Internal notes"); // Won't filter, just metadata

/**
 * Checks metadata against the schema rules before it is stored.
 */
public class MetadataValidator {

    /**
     * Validates required fields and allowed values.
     *
     * <p>"category" must be present. "year" and "status" are optional and
     * are only checked when a value is present.
     *
     * @param meta metadata to check
     * @throws IllegalArgumentException if "category" is missing, if "year"
     *         is outside 1900..2100, or if "status" is not one of "draft",
     *         "published" or "archived"
     */
    public void validate(Metadata meta) {
        // Required fields
        if (!meta.containsKey("category")) {
            throw new IllegalArgumentException("category is required");
        }

        // Range validation
        Integer year = meta.getInteger("year");
        if (year != null && (year < 1900 || year > 2100)) {
            throw new IllegalArgumentException("invalid year: " + year);
        }

        // Value validation
        String status = meta.getString("status");
        if (status != null
                && !Arrays.asList("draft", "published", "archived").contains(status)) {
            throw new IllegalArgumentException("invalid status: " + status);
        }
    }
}

See:
Install with Tessl CLI
npx tessl i tessl/maven-dev-langchain4j--langchain4j-chroma