RAG (Retrieval-Augmented Generation) framework for the Embabel Agent platform providing content ingestion, chunking, hierarchical navigation, and semantic search capabilities
A comprehensive RAG (Retrieval-Augmented Generation) framework for the Embabel Agent platform on the JVM, providing content ingestion, chunking, hierarchical document navigation, and semantic retrieval capabilities for enhancing agent interactions with domain knowledge.
<dependency>
<groupId>com.embabel.agent</groupId>
<artifactId>embabel-agent-rag-core</artifactId>
<version>0.3.3</version>
</dependency>

// Data models
import com.embabel.agent.rag.model.*
// Search operations
import com.embabel.agent.rag.service.*
// Content ingestion
import com.embabel.agent.rag.ingestion.*
// RAG tools for LLM integration
import com.embabel.agent.rag.tools.*
// Filtering
import com.embabel.agent.rag.filter.*
// Storage
import com.embabel.agent.rag.store.*

Document Hierarchy: NavigableDocument → NavigableContainerSection → LeafSection → Chunk (indexed text segments)
Data Types: Retrievable (Chunks, NamedEntities, Facts), Source (input data), NamedEntity (structured with properties/relationships)
Search: VectorSearch (semantic similarity), TextSearch (Lucene syntax), RegexSearch (patterns), FilteringSearch (with metadata/entity filters)
import com.embabel.agent.rag.ingestion.*
import com.embabel.agent.rag.store.*
import com.embabel.agent.rag.tools.*
// Create content reader
val contentReader: HierarchicalContentReader = // implementation
// Set up chunking repository
val repository: ChunkingContentElementRepository = // implementation
// Parse and ingest document
val document = contentReader.parseUrl("https://example.com/docs")
val chunkIds = repository.writeAndChunkDocument(document)
// Create RAG tools for LLM
val ragTools = ToolishRag(
name = "documentation_search",
description = "Search documentation content",
searchOperations = repository
)
// Perform vector search
val results = repository.vectorSearch(
request = TextSimilaritySearchRequest(
query = "How to configure authentication?",
topK = 5
),
clazz = Chunk::class.java
)

import com.embabel.agent.rag.service.*
import com.embabel.agent.rag.filter.*
val searchOps: FilteringVectorSearch = // implementation
// Search with metadata constraints
val results = searchOps.vectorSearchWithFilter(
request = TextSimilaritySearchRequest(
query = "authentication setup",
topK = 10,
similarityThreshold = 0.75
),
clazz = Chunk::class.java,
metadataFilter = PropertyFilter.eq("category", "security")
.and(PropertyFilter.gte("version", 2.0)),
entityFilter = null
)

import com.embabel.agent.rag.model.*
import com.embabel.agent.rag.service.*
val entityRepo: NamedEntityDataRepository = // implementation
// Create and save entity
val person = SimpleNamedEntityData(
id = "person-123",
name = "Alice Smith",
description = "Software engineer",
properties = mapOf("role" to "engineer", "team" to "platform")
)
entityRepo.save(person)
// Search entities
val results = entityRepo.vectorSearch(
request = TextSimilaritySearchRequest("platform engineer", topK = 10),
metadataFilter = PropertyFilter.eq("team", "platform"),
entityFilter = null
)
// Navigate relationships
val related = entityRepo.findRelated(
source = RetrievableIdentifier.from(person),
relationshipName = "WORKS_WITH",
direction = RelationshipDirection.OUTGOING
)

Request parameters for similarity-based searches.
data class TextSimilaritySearchRequest(
val query: String,
val topK: Int = 10,
val similarityThreshold: ZeroToOne = 0.0
)

Result container for search operations.
data class SimilarityResult<T : Retrievable>(
val content: T,
val score: Double,
val embedding: Embedding? = null
)

Text chunk for indexing and retrieval.
interface Chunk : Source, HierarchicalContentElement {
val text: String // Indexed text
val urtext: String // Raw text for citation
val parentId: String // Non-null parent reference
val pathFromRoot: List<String>?
val uri: String?
fun withText(transformed: String): Chunk
fun withAdditionalMetadata(metadata: Map<String, Any?>): Chunk
companion object {
fun create(
text: String,
parentId: String,
metadata: Map<String, Any?> = emptyMap(),
id: String = UUID.randomUUID().toString(),
urtext: String = text
): Chunk
}
}

Semantic similarity search operations.
interface VectorSearch : TypeRetrievalOperations {
fun <T : Retrievable> vectorSearch(
request: TextSimilaritySearchRequest,
clazz: Class<T>
): List<SimilarityResult<T>>
}

Vector search with metadata and entity filtering.
interface FilteringVectorSearch : VectorSearch {
fun <T : Retrievable> vectorSearchWithFilter(
request: TextSimilaritySearchRequest,
clazz: Class<T>,
metadataFilter: PropertyFilter?,
entityFilter: EntityFilter?
): List<SimilarityResult<T>>
}

Full-text search operations.
interface TextSearch : TypeRetrievalOperations {
val luceneSyntaxNotes: String
fun <T : Retrievable> textSearch(
request: TextSimilaritySearchRequest,
clazz: Class<T>
): List<SimilarityResult<T>>
}

Rich filtering for metadata and properties.
sealed interface PropertyFilter {
operator fun not(): PropertyFilter
infix fun and(other: PropertyFilter): PropertyFilter
infix fun or(other: PropertyFilter): PropertyFilter
companion object {
// Comparison
fun eq(key: String, value: Any): Eq
fun ne(key: String, value: Any): Ne
fun gt(key: String, value: Number): Gt
fun gte(key: String, value: Number): Gte
fun lt(key: String, value: Number): Lt
fun lte(key: String, value: Number): Lte
// Collections
fun `in`(key: String, vararg values: Any): In
fun nin(key: String, vararg values: Any): Nin
// Strings
fun contains(key: String, value: String): Contains
fun startsWith(key: String, value: String): StartsWith
fun endsWith(key: String, value: String): EndsWith
// Logical
fun and(vararg filters: PropertyFilter): And
fun or(vararg filters: PropertyFilter): Or
fun not(filter: PropertyFilter): Not
}
}

Parse content from various sources into NavigableDocument structures.
interface HierarchicalContentReader {
fun parseUrl(url: String): NavigableDocument
fun parseResource(resourcePath: String): NavigableDocument
fun parseFile(file: File, url: String? = null): NavigableDocument
fun parseContent(inputStream: InputStream, uri: String): NavigableDocument
fun parseFromDirectory(
fileTools: FileReadTools,
config: DirectoryParsingConfig
): DirectoryParsingResult
}

Repository with document chunking and lifecycle management.
interface ChunkingContentElementRepository : ContentElementRepository {
val enhancers: List<RetrievableEnhancer>
fun writeAndChunkDocument(root: NavigableDocument): List<String>
fun deleteRootAndDescendants(uri: String): DocumentDeletionResult?
fun findContentRootByUri(uri: String): ContentRoot?
fun existsRootWithUri(uri: String): Boolean
fun <T : Retrievable> enhance(retrievable: T): T
fun onNewRetrievables(retrievables: List<Retrievable>)
}

Storage and retrieval of named entities with relationships.
interface NamedEntityDataRepository :
CoreSearchOperations,
FinderOperations,
FilteringVectorSearch,
FilteringTextSearch,
RelationshipNavigator {
fun save(entity: NamedEntityData): NamedEntityData
fun saveAll(entities: Collection<NamedEntityData>): List<NamedEntityData>
fun findById(id: String): NamedEntityData?
fun findByLabel(label: String): List<NamedEntityData>
fun delete(id: String): Boolean
fun createRelationship(
a: RetrievableIdentifier,
b: RetrievableIdentifier,
relationship: RelationshipData
)
fun findRelated(
source: RetrievableIdentifier,
relationshipName: String,
direction: RelationshipDirection
): List<NamedEntityData>
}

Primary interface for exposing RAG capabilities to LLMs.
data class ToolishRag(
override val name: String,
override val description: String,
val searchOperations: SearchOperations,
val goal: String = DEFAULT_GOAL,
val formatter: RetrievableResultsFormatter = SimpleRetrievableResultsFormatter,
val vectorSearchFor: List<Class<out Retrievable>> = listOf(Chunk::class.java),
val textSearchFor: List<Class<out Retrievable>> = listOf(Chunk::class.java),
val hints: List<PromptContributor> = listOf(),
val listener: ResultsListener? = null,
val metadataFilter: PropertyFilter? = null,
val entityFilter: EntityFilter? = null
) : LlmReference {
fun withSearchFor(
vectorSearchFor: List<Class<out Retrievable>>,
textSearchFor: List<Class<out Retrievable>>
): ToolishRag
fun withHint(hint: PromptContributor): ToolishRag
fun withGoal(goal: String): ToolishRag
fun withListener(listener: ResultsListener): ToolishRag
fun withMetadataFilter(filter: PropertyFilter): ToolishRag
fun withEntityFilter(filter: EntityFilter): ToolishRag
override fun tools(): List<Tool>
override fun notes(): String
}

Convert hierarchical documents into indexed chunks.
interface ContentChunker {
val chunkTransformer: ChunkTransformer
fun chunk(section: NavigableContainerSection): Iterable<Chunk>
data class Config(
val maxChunkSize: Int = 1500,
val overlapSize: Int = 200,
val embeddingBatchSize: Int = 100
)
companion object {
// Standard metadata keys: CHUNK_INDEX, TOTAL_CHUNKS, SEQUENCE_NUMBER,
// ROOT_DOCUMENT_ID, CONTAINER_SECTION_ID, CONTAINER_SECTION_TITLE,
// LEAF_SECTION_ID, LEAF_SECTION_TITLE
operator fun invoke(
config: Config,
chunkTransformer: ChunkTransformer
): InMemoryContentChunker
}
}

See Quick Start Tasks above for complete examples.
Separation of Concerns: Reading is separated from storage, chunking from embedding, and search from filtering.
Interface-Based: Dependencies are expressed through interfaces, enabling testability, multiple implementations, and clear contracts.
Composability: Chain transformers, combine filters, layer policies.
Progressive Disclosure: Start simple (basic search) → add complexity (filters, transformers, policies) as needed.