Core classes and interfaces of LangChain4j providing foundational abstractions for LLM interaction, RAG, embeddings, agents, and observability
Package: dev.langchain4j.data.document, dev.langchain4j.data.segment
Thread-Safety: Document and TextSegment are immutable and thread-safe
Use Case: Text processing for RAG (Retrieval Augmented Generation) systems
Documents and text segments represent the core data structures for working with textual content in RAG systems, enabling document ingestion, chunking, and retrieval.
package dev.langchain4j.data.document;
/**
* Represents a document with text content and metadata
* Immutability: Immutable, thread-safe
*/
public class Document {
private final String text;
private final Metadata metadata;
public static Document from(String text) { /* ... */ }
public static Document from(String text, Metadata metadata) { /* ... */ }
public static Document document(String text) { /* ... */ }
public static Document document(String text, Metadata metadata) { /* ... */ }
public String text() { return text; }
public Metadata metadata() { return metadata; }
}

package dev.langchain4j.data.document;
import java.util.HashMap;
import java.util.Map;
/**
* Key-value metadata attached to documents and segments
* Used for filtering, routing, and context tracking
* Immutability: Immutable, thread-safe
*/
public class Metadata {
private final Map<String, Object> map;
public static Metadata from(Map<String, Object> map) { /* ... */ }
public static Metadata metadata(String key, Object value) { /* ... */ }
public Object get(String key) { /* ... */ }
public String getString(String key) { /* ... */ }
public Integer getInteger(String key) { /* ... */ }
public Long getLong(String key) { /* ... */ }
public Double getDouble(String key) { /* ... */ }
public Map<String, Object> toMap() { return new HashMap<>(map); }
public Metadata add(String key, Object value) { /* Returns new instance */ }
public Metadata remove(String key) { /* Returns new instance */ }
}

package dev.langchain4j.data.segment;
import dev.langchain4j.data.document.Metadata;
/**
* A chunk of text, typically derived from a Document
* Used as input to embedding models
* Immutability: Immutable, thread-safe
*/
public class TextSegment {
private final String text;
private final Metadata metadata;
public static TextSegment from(String text) { /* ... */ }
public static TextSegment from(String text, Metadata metadata) { /* ... */ }
public static TextSegment textSegment(String text) { /* ... */ }
public String text() { return text; }
public Metadata metadata() { return metadata; }
}

package dev.langchain4j.data.document;
import dev.langchain4j.data.segment.TextSegment;
import java.util.List;
/**
* Splits documents into smaller segments for embedding
* Thread-Safety: Implementations should be thread-safe
*/
public interface DocumentSplitter {
/**
* Split single document into segments
* @param document Document to split (non-null)
* @return List of text segments
*/
List<TextSegment> split(Document document);
/**
* Split multiple documents into segments
* @param documents Documents to split (non-null)
* @return List of all segments from all documents
*/
List<TextSegment> splitAll(List<Document> documents);
}
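Implementations only need to produce segments from documents. As a hedged illustration (not part of the library; assumes java.util.List and java.util.ArrayList imports), a minimal fixed-size splitter could look like the sketch below. Prefer the built-in DocumentSplitters.recursive(...) in real code.
// Illustrative only: fixed-size chunks, document metadata carried over to each segment
public class FixedSizeSplitter implements DocumentSplitter {
private final int chunkSize;
public FixedSizeSplitter(int chunkSize) { this.chunkSize = chunkSize; }
@Override
public List<TextSegment> split(Document document) {
List<TextSegment> segments = new ArrayList<>();
String text = document.text();
for (int start = 0; start < text.length(); start += chunkSize) {
int end = Math.min(start + chunkSize, text.length());
segments.add(TextSegment.from(text.substring(start, end), document.metadata()));
}
return segments;
}
@Override
public List<TextSegment> splitAll(List<Document> documents) {
List<TextSegment> all = new ArrayList<>();
for (Document document : documents) {
all.addAll(split(document));
}
return all;
}
}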
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.Metadata;
import java.util.Map;
// Simple document
Document doc1 = Document.from("This is document text.");
// Document with metadata
Metadata metadata = Metadata.from(Map.of(
"source", "user-manual.pdf",
"page", 42,
"category", "documentation",
"author", "John Doe",
"timestamp", System.currentTimeMillis()
));
Document doc2 = Document.from("Document content here.", metadata);
// Access content
String text = doc2.text();
String source = doc2.metadata().getString("source");
Integer page = doc2.metadata().getInteger("page");

// Create metadata
Metadata meta = Metadata.from(Map.of(
"category", "technical",
"language", "en"
));
// Add more metadata (returns new instance - immutable)
Metadata extended = meta
.add("version", "1.0")
.add("author", "Alice");
// Query metadata
if ("technical".equals(extended.getString("category"))) { // null-safe comparison
processTechnicalDoc(doc); // processTechnicalDoc and doc are hypothetical placeholders
}
// Convert to map
Map<String, Object> map = meta.toMap();

import java.nio.file.Files;
import java.nio.file.Path;
// Load text file
Path path = Path.of("document.txt");
String content = Files.readString(path);
Metadata metadata = Metadata.from(Map.of(
"source", path.toString(),
"fileName", path.getFileName().toString(),
"fileSize", Files.size(path),
"lastModified", Files.getLastModifiedTime(path).toMillis()
));
Document document = Document.from(content, metadata);
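As an alternative to manual loading, the main langchain4j module ships a FileSystemDocumentLoader that bundles reading and metadata attachment; the sketch below assumes that module is on the classpath (verify the API against your version).
import dev.langchain4j.data.document.loader.FileSystemDocumentLoader;
// Loads the file and attaches basic file metadata automatically
Document loaded = FileSystemDocumentLoader.loadDocument(Path.of("document.txt"));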

import dev.langchain4j.data.document.splitter.DocumentSplitters;
import dev.langchain4j.data.segment.TextSegment;
// Create splitter (DocumentSplitters lives in the main langchain4j module, not langchain4j-core)
// recursive() splits hierarchically (paragraphs, then sentences, then words) to respect the size limit
DocumentSplitter splitter = DocumentSplitters.recursive(
300, // Max segment size in characters
50 // Overlap between segments
);
// Split document into segments
Document document = Document.from(longText, metadata);
List<TextSegment> segments = splitter.split(document);
System.out.println("Split into " + segments.size() + " segments");
// Each segment inherits document metadata
for (TextSegment segment : segments) {
System.out.println("Segment text: " + segment.text());
System.out.println("Source: " + segment.metadata().getString("source"));
}

import dev.langchain4j.data.segment.TextSegment;
// Simple segment
TextSegment segment1 = TextSegment.from("Segment text");
// Segment with metadata
Metadata segmentMeta = Metadata.from(Map.of(
"source", "doc.pdf",
"page", 1,
"chunk_id", "chunk_0"
));
TextSegment segment2 = TextSegment.from("Segment text", segmentMeta);

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.data.embedding.Embedding;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.model.output.Response;
import dev.langchain4j.store.embedding.EmbeddingStore;
import java.util.ArrayList;
import java.util.List;
/**
* Complete RAG ingestion pipeline
*/
public class DocumentIngestionPipeline {
private final DocumentSplitter splitter;
private final EmbeddingModel embeddingModel;
private final EmbeddingStore<TextSegment> embeddingStore;
public DocumentIngestionPipeline(DocumentSplitter splitter, EmbeddingModel embeddingModel, EmbeddingStore<TextSegment> embeddingStore) {
this.splitter = splitter;
this.embeddingModel = embeddingModel;
this.embeddingStore = embeddingStore;
}
public void ingest(Document document) {
// 1. Split document into segments
List<TextSegment> segments = splitter.split(document);
System.out.println("Split into " + segments.size() + " segments");
// 2. Generate embeddings (batch for efficiency)
Response<List<Embedding>> response = embeddingModel.embedAll(segments);
List<Embedding> embeddings = response.content();
// 3. Store segments with embeddings
for (int i = 0; i < segments.size(); i++) {
TextSegment segment = segments.get(i);
Embedding embedding = embeddings.get(i);
// Store returns ID
String id = embeddingStore.add(embedding, segment);
System.out.println("Stored segment with ID: " + id);
}
}
public void ingestBatch(List<Document> documents) {
// Split all documents
List<TextSegment> allSegments = new ArrayList<>();
for (Document doc : documents) {
allSegments.addAll(splitter.split(doc));
}
// Batch embed (much faster)
Response<List<Embedding>> response = embeddingModel.embedAll(allSegments);
List<Embedding> embeddings = response.content();
// Batch store (addAll avoids one store call per segment)
embeddingStore.addAll(embeddings, allSegments);
System.out.println("Ingested " + documents.size() + " documents");
System.out.println("Created " + allSegments.size() + " segments");
}
}
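A minimal wiring sketch for the pipeline above, reusing the imports from earlier examples. InMemoryEmbeddingStore ships with the main langchain4j module; createEmbeddingModel() is a hypothetical placeholder for whichever EmbeddingModel implementation you use.
import dev.langchain4j.store.embedding.inmemory.InMemoryEmbeddingStore;
EmbeddingModel embeddingModel = createEmbeddingModel(); // hypothetical factory
EmbeddingStore<TextSegment> embeddingStore = new InMemoryEmbeddingStore<>();
DocumentIngestionPipeline pipeline = new DocumentIngestionPipeline(
DocumentSplitters.recursive(800, 100),
embeddingModel,
embeddingStore
);
pipeline.ingest(Document.from("Long document text...", Metadata.metadata("source", "example.txt")));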

import dev.langchain4j.store.embedding.EmbeddingSearchRequest;
import dev.langchain4j.store.embedding.EmbeddingSearchResult;
import dev.langchain4j.store.embedding.EmbeddingMatch;
import dev.langchain4j.store.embedding.filter.Filter;
import static dev.langchain4j.store.embedding.filter.MetadataFilterBuilder.metadataKey;
/**
* Retrieve relevant segments for a query
*/
public List<TextSegment> retrieveRelevant(String query, String category) {
// 1. Embed query
Embedding queryEmbedding = embeddingModel.embed(query).content();
// 2. Search with metadata filter
Filter filter = metadataKey("category").isEqualTo(category);
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmbedding)
.maxResults(5)
.minScore(0.7)
.filter(filter)
.build();
EmbeddingSearchResult<TextSegment> result = embeddingStore.search(request);
// 3. Extract segments
List<TextSegment> segments = result.matches().stream()
.map(EmbeddingMatch::embedded)
.collect(Collectors.toList());
return segments;
}

// ✅ GOOD: Rich metadata for filtering and context
Metadata metadata = Metadata.from(Map.of(
"source", "user-manual.pdf",
"title", "Installation Guide",
"page", 15,
"section", "Configuration",
"category", "documentation",
"version", "2.1",
"language", "en",
"lastUpdated", Instant.now().toEpochMilli()
));
// ❌ BAD: Minimal or missing metadata
Metadata bad = Metadata.from(Map.of("file", "doc.pdf"));

// Balance chunk size with context
// Too small: Loses context, more chunks, slower retrieval
// Too large: Less precise retrieval, exceeds embedding limits
// Typical ranges:
// - 200-500 chars: Very granular, good for exact matches
// - 500-1000 chars: Balanced (RECOMMENDED for most cases)
// - 1000-2000 chars: More context, fewer chunks
// - 2000+ chars: Risk exceeding model limits
DocumentSplitter splitter = DocumentSplitters.recursive(
800, // Target chunk size (chars)
100 // Overlap (preserves context across boundaries)
);

public Document preprocess(String rawText, Metadata metadata) {
// Clean text
String cleaned = rawText
.replaceAll("\\s+", " ") // Normalize whitespace
.replaceAll("[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F]", "") // Remove control chars
.trim();
// Remove very short documents
if (cleaned.length() < 50) {
return null; // Skip
}
// Add preprocessing metadata
Metadata extended = metadata
.add("processed_at", System.currentTimeMillis())
.add("original_length", rawText.length())
.add("processed_length", cleaned.length());
return Document.from(cleaned, extended);
}

// Use metadata for efficient filtering
// (metadataKey is the static factory from dev.langchain4j.store.embedding.filter.MetadataFilterBuilder)
Filter techDocsOnly = metadataKey("category").isEqualTo("technical");
Filter recentDocs = metadataKey("timestamp")
.isGreaterThan(Instant.now().minus(30, ChronoUnit.DAYS).toEpochMilli());
Filter combined = Filter.and(techDocsOnly, recentDocs);
// Apply when searching
EmbeddingSearchRequest request = EmbeddingSearchRequest.builder()
.queryEmbedding(queryEmbedding)
.filter(combined)
.build();

// Text files
public Document loadTextFile(Path path) throws IOException {
String content = Files.readString(path);
Metadata metadata = Metadata.from(Map.of(
"source", path.toString(),
"type", "text/plain"
));
return Document.from(content, metadata);
}
// PDF files (require a separate parser library; see the sketch after this block)
// public Document loadPdf(Path path) { ... }
// Web pages (requires separate library)
// public Document loadUrl(String url) { ... }
// Database records
public Document loadFromDatabase(ResultSet rs) throws SQLException {
String content = rs.getString("content");
Metadata metadata = Metadata.from(Map.of(
"id", rs.getLong("id"),
"title", rs.getString("title"),
"created_at", rs.getTimestamp("created_at").getTime(),
"source", "database"
));
return Document.from(content, metadata);
}
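For PDFs, one option is the langchain4j-document-parser-apache-pdfbox module; the names below assume that module is on the classpath and should be verified against your version.
import dev.langchain4j.data.document.loader.FileSystemDocumentLoader;
import dev.langchain4j.data.document.parser.apache.pdfbox.ApachePdfBoxDocumentParser;
// Sketch: parse a PDF into a Document via the PDFBox-based parser
public Document loadPdf(Path path) {
return FileSystemDocumentLoader.loadDocument(path, new ApachePdfBoxDocumentParser());
}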

| Pitfall | Solution |
|---|---|
| Chunks too small | Increase chunk size to 500-1000 chars |
| Chunks too large | Decrease size, may exceed embedding limits |
| No overlap | Add 10-20% overlap to preserve context |
| Missing metadata | Always include source, timestamp, category |
| Not preprocessing | Clean text before splitting |
| Duplicate documents | Use content hashing to detect duplicates |
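
A minimal sketch of the content-hashing deduplication mentioned in the last row, using only JDK classes:
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Base64;
import java.util.HashSet;
import java.util.Set;
// Tracks hashes of already-ingested content
private final Set<String> seenHashes = new HashSet<>();
public boolean isDuplicate(Document document) throws NoSuchAlgorithmException {
MessageDigest digest = MessageDigest.getInstance("SHA-256");
byte[] hash = digest.digest(document.text().getBytes(StandardCharsets.UTF_8));
return !seenHashes.add(Base64.getEncoder().encodeToString(hash)); // add() returns false if already seen
}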
// ✅ GOOD: Batch operations
List<Document> documents = loadAllDocuments();
List<TextSegment> allSegments = splitter.splitAll(documents);
Response<List<Embedding>> embeddings = embeddingModel.embedAll(allSegments);
// ❌ BAD: Individual operations in loop
for (Document doc : documents) {
List<TextSegment> segments = splitter.split(doc);
for (TextSegment segment : segments) {
Embedding emb = embeddingModel.embed(segment).content(); // SLOW!
embeddingStore.add(emb, segment);
}
}

// For large document sets, process in batches
// (partition() is a small helper; a sketch follows this block)
int BATCH_SIZE = 100;
List<List<Document>> batches = partition(allDocuments, BATCH_SIZE);
for (List<Document> batch : batches) {
ingestBatch(batch);
// GC can clean up between batches
}
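partition() above is not a JDK method; a minimal helper (equivalent to Guava's Lists.partition) could be:
// Splits a list into consecutive sublists of at most the given size
static <T> List<List<T>> partition(List<T> list, int size) {
List<List<T>> batches = new ArrayList<>();
for (int i = 0; i < list.size(); i += size) {
batches.add(list.subList(i, Math.min(i + size, list.size())));
}
return batches;
}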

Install with Tessl CLI
npx tessl i tessl/maven-dev-langchain4j--langchain4j-core