RAG (Retrieval-Augmented Generation) framework for the Embabel Agent platform, providing content ingestion, chunking, hierarchical navigation, and semantic search capabilities.
This guide walks you through setting up a complete RAG pipeline, from document ingestion to LLM-powered search.
A basic RAG pipeline consists of:
- a content reader that parses documents into a hierarchical structure
- a chunker that splits content into searchable pieces
- a repository that persists chunks with embeddings
- search operations for vector and text retrieval
- a RAG tool that exposes search to an LLM
Add the dependency to your project:
<dependency>
<groupId>com.embabel.agent</groupId>
<artifactId>embabel-agent-rag-core</artifactId>
<version>0.3.3</version>
</dependency>
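If you build with Gradle rather than Maven, the same coordinates translate directly to the Kotlin DSL:
// build.gradle.kts
dependencies {
    implementation("com.embabel.agent:embabel-agent-rag-core:0.3.3")
}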
Create a content reader to parse documents from various sources.
import com.embabel.agent.rag.ingestion.HierarchicalContentReader
import java.io.File
val reader: HierarchicalContentReader = // implementation
// Parse from URL
val doc1 = reader.parseUrl("https://example.com/docs/guide.html")
println("Parsed: ${doc1.title} with ${doc1.children.count()} sections")
// Parse from file
val file = File("/path/to/document.md")
val doc2 = reader.parseFile(file, url = "file:///path/to/document.md")
// Parse from classpath resource
val doc3 = reader.parseResource("docs/readme.md")
import com.embabel.agent.rag.ingestion.*
val config = DirectoryParsingConfig(
includedExtensions = setOf("md", "txt", "html"),
excludedDirectories = setOf("node_modules", ".git", "build"),
relativePath = "docs",
maxFileSize = 5_242_880, // 5 MB
followSymlinks = false,
maxDepth = 10
)
val fileTools: FileReadTools = // implementation
val result = reader.parseFromDirectory(fileTools, config)
println("Files processed: ${result.filesProcessed}")
println("Documents created: ${result.contentRoots.size}")
println("Sections extracted: ${result.totalSectionsExtracted}")
println("Processing time: ${result.processingTime}")
if (!result.success) {
result.errors.forEach { error ->
println("Error: $error")
}
}
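Before persisting anything, it can help to inspect what the directory parse produced; a small sketch, assuming each content root exposes the same title, uri, and children properties as the documents returned by parseUrl:
result.contentRoots.forEach { root ->
    println("Root: ${root.title} (${root.children.count()} sections) from ${root.uri}")
}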
Set up a chunker to split documents into searchable pieces.
import com.embabel.agent.rag.ingestion.ContentChunker
import com.embabel.agent.rag.ingestion.ChunkTransformer
// Configure chunking parameters
val config = ContentChunker.Config(
maxChunkSize = 1500, // Max characters per chunk
overlapSize = 200, // Overlap between chunks
embeddingBatchSize = 100 // Batch size for embeddings
)
// Create chunker with optional transformer
val chunker = ContentChunker(
config = config,
chunkTransformer = ChunkTransformer.NO_OP // No transformation
)
import com.embabel.agent.rag.model.Chunk
// Chunk the parsed document
val chunks = chunker.chunk(document).toList()
println("Created ${chunks.size} chunks")
// Examine chunk metadata
chunks.forEach { chunk ->
val chunkIndex = chunk.metadata[ContentChunker.CHUNK_INDEX]
val totalChunks = chunk.metadata[ContentChunker.TOTAL_CHUNKS]
val sectionTitle = chunk.metadata[ContentChunker.CONTAINER_SECTION_TITLE]
println("Chunk $chunkIndex of $totalChunks from '$sectionTitle'")
println(" Text length: ${chunk.text.length}")
println(" Parent: ${chunk.parentId}")
}
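Overlap and section boundaries affect the final sizes, so it is worth sanity-checking chunk lengths against your configuration; this uses only the chunk text shown above (1500 is the maxChunkSize configured earlier):
// Verify no chunk exceeds the configured maximum and report the average length
val oversized = chunks.filter { it.text.length > 1500 }
if (oversized.isNotEmpty()) {
    println("Warning: ${oversized.size} chunks exceed the configured maxChunkSize")
}
println("Average chunk length: ${chunks.map { it.text.length }.average().toInt()} characters")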
Set up a repository to persist chunks with embeddings.
import com.embabel.agent.rag.store.ChunkingContentElementRepository
val repository: ChunkingContentElementRepository = // implementation
// Provision the repository (create indexes, tables, etc.)
repository.provision()
// Write and chunk document in one operation
val chunkIds = repository.writeAndChunkDocument(document)
println("Stored ${chunkIds.size} chunks")// Find existing content by URI
val existingRoot = repository.findContentRootByUri(document.uri)
if (existingRoot != null) {
println("Document already exists: ${existingRoot.title}")
// Delete old content before re-ingesting
val deletionResult = repository.deleteRootAndDescendants(document.uri)
if (deletionResult != null) {
println("Deleted ${deletionResult.chunksDeleted} chunks")
}
}
// Now write the new version
val chunkIds = repository.writeAndChunkDocument(document)
Perform vector and text searches to find relevant chunks.
import com.embabel.agent.rag.service.*
import com.embabel.agent.rag.model.*
val searchOps: CoreSearchOperations = // implementation
// Vector similarity search
val vectorResults = searchOps.vectorSearch(
request = TextSimilaritySearchRequest(
query = "How to configure authentication?",
topK = 10,
similarityThreshold = 0.7
),
clazz = Chunk::class.java
)
vectorResults.forEach { result ->
println("Score: ${result.score}")
println("Text: ${result.content.text}")
println()
}
// Text search with Lucene syntax
val textResults = searchOps.textSearch(
request = TextSimilaritySearchRequest(
query = "authentication AND (oauth OR jwt)",
topK = 20
),
clazz = Chunk::class.java
)
textResults.forEach { result ->
println("Match: ${result.content.text.take(100)}...")
}
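Vector and text search often surface different chunks, so a common pattern is to run both and de-duplicate by element id before building context; a sketch, assuming both calls return ordinary lists whose elements carry the content id used below:
// Merge both result sets, keeping each chunk once
val combined = (vectorResults.toList() + textResults.toList())
    .distinctBy { it.content.id }
println("Combined results: ${combined.size} unique chunks")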
import com.embabel.agent.rag.service.ResultExpander
val expander: ResultExpander = // implementation
// Get a search result
val firstResult = vectorResults.first()
// Expand to include surrounding chunks
val expandedChunks = expander.expandResult(
id = firstResult.content.id,
method = ResultExpander.Method.SEQUENCE,
elementsToAdd = 2 // Add 2 chunks before and after
)
// Or zoom out to parent section
val parentSection = expander.expandResult(
id = firstResult.content.id,
method = ResultExpander.Method.ZOOM_OUT,
elementsToAdd = 1
)
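Expansion is mainly useful for assembling richer prompt context; as a rough sketch, assuming the expanded elements expose their text the way chunks do, you could stitch them into one passage:
// Join the expanded window into a single passage for the LLM prompt
val contextPassage = expandedChunks.joinToString(separator = "\n\n") { it.text }
println("Context passage length: ${contextPassage.length} characters")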
Create a ToolishRag instance to expose search capabilities to language models.
import com.embabel.agent.rag.tools.ToolishRag
// Create RAG tool for LLM
val ragTool = ToolishRag(
name = "documentation_search",
description = "Search technical documentation and guides",
searchOperations = repository
)
// Get tools to pass to LLM
val tools = ragTool.tools()
println("Exposed ${tools.size} tools to LLM")
// Get usage notes for LLM context
val notes = ragTool.notes()
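How the tools and notes reach the model depends on your agent wiring, which this guide does not prescribe; as a minimal sketch, assuming notes() yields human-readable guidance, you might fold it into a system prompt alongside your own instructions:
// Hypothetical prompt assembly; the actual LLM integration is framework-specific
val systemPrompt = buildString {
    appendLine("You can search the product documentation using the provided tools.")
    appendLine(notes.toString())
}
println(systemPrompt)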
import com.embabel.agent.rag.filter.PropertyFilter
import com.embabel.agent.rag.tools.TryHyDE
// Add filters and hints
val customizedRag = ragTool
// Filter by metadata
.withMetadataFilter(
PropertyFilter.eq("category", "documentation")
.and(PropertyFilter.gte("version", 2.0))
)
// Add HyDE hint for better search
.withHint(
TryHyDE.usingConversationContext()
.withMaxWords(60)
)
// Add custom goal
.withGoal(
"Use this tool to find relevant documentation. " +
"Try both vector and text search for best results."
)
Here's a complete end-to-end example:
import com.embabel.agent.rag.ingestion.*
import com.embabel.agent.rag.store.*
import com.embabel.agent.rag.service.*
import com.embabel.agent.rag.tools.*
import com.embabel.agent.rag.model.*
fun setupRagPipeline(
reader: HierarchicalContentReader,
repository: ChunkingContentElementRepository
): ToolishRag {
// 1. Configure chunking
val chunker = ContentChunker(
config = ContentChunker.Config(
maxChunkSize = 1500,
overlapSize = 200
),
chunkTransformer = ChunkTransformer.NO_OP
)
// 2. Provision storage
repository.provision()
// 3. Parse and ingest documents
val documents = listOf(
"https://example.com/docs/guide.html",
"https://example.com/docs/api.html",
"https://example.com/docs/tutorial.html"
)
documents.forEach { url ->
println("Ingesting: $url")
// Check if already exists
val existing = repository.findContentRootByUri(url)
if (existing != null) {
println(" Already exists, deleting old version")
repository.deleteRootAndDescendants(url)
}
// Parse and store
val document = reader.parseUrl(url)
val chunkIds = repository.writeAndChunkDocument(document)
println(" Created ${chunkIds.size} chunks")
}
// 4. Create RAG tool
return ToolishRag(
name = "documentation_search",
description = "Search product documentation and guides",
searchOperations = repository
).withHint(TryHyDE.usingConversationContext())
}
// Use the pipeline
val ragTool = setupRagPipeline(reader, repository)
// Search
val results = repository.vectorSearch(
request = TextSimilaritySearchRequest(
query = "How do I authenticate API requests?",
topK = 5
),
clazz = Chunk::class.java
)
results.forEach { result ->
println("Score: ${"%.3f".format(result.score)}")
println("Content: ${result.content.text}")
println()
}
Implement a refresh policy to update content intelligently.
import com.embabel.agent.rag.ingestion.ContentRefreshPolicy
val policy: ContentRefreshPolicy = // implementation
// Check if content needs refreshing
val shouldRefresh = policy.shouldReread(
repository = repository,
rootUri = "https://example.com/docs/guide.html"
)
if (shouldRefresh) {
println("Content is stale, refreshing...")
// Ingest if needed
val refreshedDoc = policy.ingestUriIfNeeded(
repository = repository,
hierarchicalContentReader = reader,
rootUri = "https://example.com/docs/guide.html"
)
if (refreshedDoc != null) {
println("Successfully refreshed: ${refreshedDoc.title}")
}
} else {
println("Content is up to date")
}
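The same call scales to several roots; a simple sketch that refreshes a list of tracked URIs using only the ingestUriIfNeeded operation shown above:
val trackedRoots = listOf(
    "https://example.com/docs/guide.html",
    "https://example.com/docs/api.html"
)
trackedRoots.forEach { uri ->
    val refreshed = policy.ingestUriIfNeeded(
        repository = repository,
        hierarchicalContentReader = reader,
        rootUri = uri
    )
    println(if (refreshed != null) "Refreshed: $uri" else "Up to date: $uri")
}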
import com.embabel.agent.rag.ingestion.*
// Create custom transformer
class UppercaseTransformer : ChunkTransformer {
override val name = "uppercase"
override fun transform(
chunk: Chunk,
context: ChunkTransformationContext
): Chunk {
return chunk.withText(chunk.text.uppercase())
}
}
val chunker = ContentChunker(
config = ContentChunker.Config(),
chunkTransformer = UppercaseTransformer()
)
To ingest many documents in one pass, wrap the parse-and-store steps in a batch helper:
fun ingestBatch(
urls: List<String>,
reader: HierarchicalContentReader,
repository: ChunkingContentElementRepository
) {
var successCount = 0
var errorCount = 0
urls.forEachIndexed { index, url ->
println("Processing ${index + 1}/${urls.size}: $url")
try {
// Delete existing
repository.findContentRootByUri(url)?.let {
repository.deleteRootAndDescendants(url)
}
// Parse and store
val document = reader.parseUrl(url)
val chunkIds = repository.writeAndChunkDocument(document)
println(" ✓ Created ${chunkIds.size} chunks")
successCount++
} catch (e: Exception) {
println(" ✗ Error: ${e.message}")
errorCount++
}
}
println("\nBatch complete: $successCount succeeded, $errorCount failed")
}
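Calling the helper with the same reader and repository instances used earlier is then a one-liner:
ingestBatch(
    urls = listOf(
        "https://example.com/docs/guide.html",
        "https://example.com/docs/api.html",
        "https://example.com/docs/tutorial.html"
    ),
    reader = reader,
    repository = repository
)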
import com.embabel.agent.rag.model.Fact
// Search across multiple content types
val ragTool = ToolishRag(
name = "knowledge_base",
description = "Search documentation and facts",
searchOperations = repository
).withSearchFor(
vectorSearchFor = listOf(Chunk::class.java, Fact::class.java),
textSearchFor = listOf(Chunk::class.java, Fact::class.java)
)
// Handle documents with no content
val document = reader.parseUrl(url)
if (document.children.none()) {
println("Warning: Document has no sections")
// Skip or handle appropriately
} else {
repository.writeAndChunkDocument(document)
}
// Adjust chunk size for large documents
val largeDocConfig = ContentChunker.Config(
maxChunkSize = 2000, // Larger chunks
overlapSize = 300, // More overlap
embeddingBatchSize = 50 // Smaller batches for memory
)
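To apply this configuration, construct a chunker with it exactly as before:
val largeDocChunker = ContentChunker(
    config = largeDocConfig,
    chunkTransformer = ChunkTransformer.NO_OP
)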
val results = repository.vectorSearch(
request = TextSimilaritySearchRequest(
query = "rare search term",
topK = 10
),
clazz = Chunk::class.java
)
if (results.isEmpty()) {
println("No results found")
// Try with lower threshold
val relaxedResults = repository.vectorSearch(
request = TextSimilaritySearchRequest(
query = "rare search term",
topK = 10,
similarityThreshold = 0.5 // Lower threshold
),
clazz = Chunk::class.java
)
}
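If even the relaxed vector search returns nothing, falling back to the text search shown earlier can still catch exact keyword matches; this assumes your repository exposes the same textSearch operation it uses for vectorSearch:
// Fallback: keyword search may match rare terms that embeddings miss
val fallbackResults = repository.textSearch(
    request = TextSimilaritySearchRequest(
        query = "rare search term",
        topK = 10
    ),
    clazz = Chunk::class.java
)
fallbackResults.forEach { result ->
    println("Keyword match: ${result.content.text.take(100)}...")
}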