Build LLM-powered applications in Java with support for chatbots, agents, RAG, tools, and much more
Loaders, parsers, splitters, and sources for working with documents. Supports loading from file system, classpath, and URLs, with various splitting strategies for creating text segments.
Load documents from the file system.
package dev.langchain4j.data.document.loader;
/**
* DocumentLoader for loading documents from the file system
*/
public class FileSystemDocumentLoader {
/**
* Load a single document from path
* @param filePath Path to the document
* @return Loaded document
*/
public static Document loadDocument(Path filePath);
/**
* Load a single document from string path
* @param filePath String path to the document
* @return Loaded document
*/
public static Document loadDocument(String filePath);
/**
* Load a single document with custom parser
* @param filePath Path to the document
* @param documentParser Parser to use
* @return Loaded document
*/
public static Document loadDocument(Path filePath, DocumentParser documentParser);
/**
* Load a single document with custom parser from string path
* @param filePath String path to the document
* @param documentParser Parser to use
* @return Loaded document
*/
public static Document loadDocument(String filePath, DocumentParser documentParser);
/**
* Load all documents from directory (non-recursive)
* @param directoryPath Path to directory
* @return List of loaded documents
*/
public static List<Document> loadDocuments(Path directoryPath);
/**
* Load all documents from directory with custom parser (non-recursive)
* @param directoryPath Path to directory
* @param documentParser Parser to use for all documents
* @return List of loaded documents
*/
public static List<Document> loadDocuments(Path directoryPath, DocumentParser documentParser);
/**
* Load matching documents from directory (non-recursive)
* @param directoryPath Path to directory
* @param pathMatcher Matcher to filter files
* @return List of loaded documents
*/
public static List<Document> loadDocuments(Path directoryPath, PathMatcher pathMatcher);
/**
* Load documents recursively from directory
* @param directoryPath Path to directory
* @return List of loaded documents
*/
public static List<Document> loadDocumentsRecursively(Path directoryPath);
/**
* Load documents recursively with matcher and parser
* @param directoryPath Path to directory
* @param pathMatcher Matcher to filter files
* @param documentParser Parser to use
* @return List of loaded documents
*/
public static List<Document> loadDocumentsRecursively(
Path directoryPath,
PathMatcher pathMatcher,
DocumentParser documentParser
);
}
Thread Safety: All methods are static and stateless. Safe for concurrent use across threads. However, when loading the same file concurrently, OS-level file locks may apply. The Document objects returned are immutable after construction.
Exception Handling:
NoSuchFileException - File or directory does not exist
AccessDeniedException - Insufficient permissions to read file
IOException - Generic I/O errors (disk full, network mount issues)
OutOfMemoryError - File too large to load into memory
MalformedInputException - Invalid character encoding in file
Related APIs: ClassPathDocumentLoader, UrlDocumentLoader, FileSystemSource, TextDocumentParser
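A minimal defensive-loading sketch (the path is a placeholder; whether the checked I/O exception surfaces directly or wrapped in a RuntimeException can vary by library version, so this inspects the cause):
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.loader.FileSystemDocumentLoader;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
Path path = Path.of("/data/docs/report.txt"); // placeholder path
try {
    Document doc = FileSystemDocumentLoader.loadDocument(path);
    System.out.println("Loaded " + doc.text().length() + " characters");
} catch (RuntimeException e) {
    if (e.getCause() instanceof NoSuchFileException) {
        System.err.println("File not found: " + path); // handle missing file gracefully
    } else {
        throw e;
    }
}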
Load documents from classpath resources.
package dev.langchain4j.data.document.loader;
/**
* DocumentLoader implementation for loading documents using ClassPathSource
*/
public class ClassPathDocumentLoader {
/**
* Load document from classpath
* @param pathOnClasspath Path to resource on classpath
* @return Loaded document
*/
public static Document loadDocument(String pathOnClasspath);
/**
* Load document from classpath with custom classloader
* @param pathOnClasspath Path to resource on classpath
* @param classLoader ClassLoader to use
* @return Loaded document
*/
public static Document loadDocument(String pathOnClasspath, ClassLoader classLoader);
/**
* Load document from classpath with custom parser
* @param pathOnClasspath Path to resource on classpath
* @param documentParser Parser to use
* @return Loaded document
*/
public static Document loadDocument(String pathOnClasspath, DocumentParser documentParser);
/**
* Load document from classpath with parser and classloader
* @param pathOnClasspath Path to resource on classpath
* @param documentParser Parser to use
* @param classLoader ClassLoader to use
* @return Loaded document
*/
public static Document loadDocument(
String pathOnClasspath,
DocumentParser documentParser,
ClassLoader classLoader
);
/**
* Load all documents from directory on classpath (non-recursive)
* @param directoryOnClasspath Path to directory on classpath
* @return List of loaded documents
*/
public static List<Document> loadDocuments(String directoryOnClasspath);
/**
* Load all documents from directory with custom classloader (non-recursive)
* @param directoryOnClasspath Path to directory on classpath
* @param classLoader ClassLoader to use
* @return List of loaded documents
*/
public static List<Document> loadDocuments(String directoryOnClasspath, ClassLoader classLoader);
/**
* Load documents from directory with custom parser (non-recursive)
* @param directoryOnClasspath Path to directory on classpath
* @param documentParser Parser to use
* @return List of loaded documents
*/
public static List<Document> loadDocuments(String directoryOnClasspath, DocumentParser documentParser);
/**
* Load matching documents from directory (non-recursive)
* @param directoryOnClasspath Path to directory on classpath
* @param pathMatcher Matcher to filter files
* @return List of loaded documents
*/
public static List<Document> loadDocuments(String directoryOnClasspath, PathMatcher pathMatcher);
/**
* Load documents recursively from directory
* @param directoryOnClasspath Path to directory on classpath
* @return List of loaded documents
*/
public static List<Document> loadDocumentsRecursively(String directoryOnClasspath);
/**
* Load documents recursively with matcher and parser
* @param directoryOnClasspath Path to directory on classpath
* @param pathMatcher Matcher to filter files
* @param documentParser Parser to use
* @return List of loaded documents
*/
public static List<Document> loadDocumentsRecursively(
String directoryOnClasspath,
PathMatcher pathMatcher,
DocumentParser documentParser
);
}
Thread Safety: All methods are static and thread-safe. ClassLoader instances are typically thread-safe. Safe for concurrent loading of different resources. Loading the same resource concurrently is safe but inefficient.
Exception Handling:
NullPointerException - Resource not found on classpath
IOException - Error reading from JAR file
IllegalArgumentException - Invalid path format
OutOfMemoryError - Resource too large to load
Related APIs: FileSystemDocumentLoader, ClassPathSource, UrlDocumentLoader
Load documents from URLs.
package dev.langchain4j.data.document.loader;
/**
* DocumentLoader for loading documents from URLs
*/
public class UrlDocumentLoader {
/**
* Load document from URL
* @param url URL to load from
* @param documentParser Parser to use
* @return Loaded document
*/
public static Document load(URL url, DocumentParser documentParser);
/**
* Load document from string URL
* @param url String URL to load from
* @param documentParser Parser to use
* @return Loaded document
*/
public static Document load(String url, DocumentParser documentParser);
}
Thread Safety: Static methods are thread-safe. However, the underlying HTTP client uses its default configuration, which may impose connection pool limits. Concurrent loads share the connection pool.
Exception Handling:
MalformedURLException - Invalid URL format
IOException - Network errors, HTTP errors (404, 500)
UnknownHostException - DNS resolution failure
SocketTimeoutException - Connection or read timeout
SSLException - HTTPS certificate validation failure
OutOfMemoryError - Response too large for memory
Related APIs: UrlSource, FileSystemDocumentLoader, ClassPathDocumentLoader
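A minimal sketch of loading over HTTP (the URL is a placeholder; a parser must always be supplied because the loader does not guess the content type):
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.loader.UrlDocumentLoader;
import dev.langchain4j.data.document.parser.TextDocumentParser;
// Placeholder URL pointing at a plain-text resource
Document doc = UrlDocumentLoader.load("https://example.com/notes.txt", new TextDocumentParser());
System.out.println(doc.metadata()); // the source URL is typically recorded in metadata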
Parse plain text documents.
package dev.langchain4j.data.document.parser;
/**
* DocumentParser implementation for parsing plain text documents
*/
public class TextDocumentParser implements DocumentParser {
/**
* Constructor with default UTF-8 charset
*/
public TextDocumentParser();
/**
* Constructor with custom charset
* @param charset Charset to use for reading text
*/
public TextDocumentParser(Charset charset);
/**
* Parse input stream into document
* @param inputStream Input stream to parse
* @return Parsed document
*/
public Document parse(InputStream inputStream);
}
Thread Safety: Instances are stateless and thread-safe. Safe to share a single instance across threads. Parse method is reentrant.
Exception Handling:
IOException - Stream read errors
MalformedInputException - Invalid character encoding
UnmappableCharacterException - Characters not supported in charset
OutOfMemoryError - File too large for available heap
Related APIs: DocumentParser interface, ApachePdfBoxParser, ApacheTikaParser, TextDocumentParser subclasses
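Since parse() operates on any InputStream, the parser can be exercised without touching the file system; a self-contained sketch:
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.parser.TextDocumentParser;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
TextDocumentParser parser = new TextDocumentParser(StandardCharsets.UTF_8);
InputStream in = new ByteArrayInputStream("Héllo, wörld".getBytes(StandardCharsets.UTF_8));
Document doc = parser.parse(in); // decodes the stream using the configured charset
// doc.text() now equals "Héllo, wörld"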
Document source for file system files.
package dev.langchain4j.data.document.source;
/**
* DocumentSource for file system sources
*/
public class FileSystemSource implements DocumentSource {
/**
* Constructor
* @param path Path to file
*/
public FileSystemSource(Path path);
/**
* Create from path
* @param filePath Path to file
* @return FileSystemSource instance
*/
public static FileSystemSource from(Path filePath);
/**
* Create from string path
* @param filePath String path to file
* @return FileSystemSource instance
*/
public static FileSystemSource from(String filePath);
/**
* Create from URI
* @param fileUri URI to file
* @return FileSystemSource instance
*/
public static FileSystemSource from(URI fileUri);
/**
* Create from File
* @param file File object
* @return FileSystemSource instance
*/
public static FileSystemSource from(File file);
/**
* Get input stream
* @return InputStream for reading file
*/
public InputStream inputStream();
/**
* Get metadata
* @return Metadata for the source
*/
public Metadata metadata();
}
Thread Safety: Immutable after construction. Safe to share across threads. Each inputStream() call creates a new FileInputStream, allowing concurrent reads.
Exception Handling:
NoSuchFileException - File does not exist
AccessDeniedException - Insufficient permissions
IOException - Generic I/O errors
FileSystemException - File system specific errors
Related APIs: FileSystemDocumentLoader, UrlSource, ClassPathSource, DocumentSource interface
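The loader classes wire a source and a parser together internally; the same composition can be done by hand when the stream or metadata is needed separately. A sketch (the path is a placeholder, and the snippet is assumed to run inside a method that handles IOException):
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.parser.TextDocumentParser;
import dev.langchain4j.data.document.source.FileSystemSource;
import java.io.InputStream;
FileSystemSource source = FileSystemSource.from("/data/docs/notes.txt"); // placeholder path
try (InputStream in = source.inputStream()) {
    Document doc = new TextDocumentParser().parse(in);
    // File-level metadata (e.g. file name) comes from the source, not from parse()
    System.out.println(source.metadata());
}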
Document source for classpath resources.
package dev.langchain4j.data.document.source;
/**
* DocumentSource specialization that reads from classpath
*/
public class ClassPathSource implements DocumentSource {
/**
* Create from classpath resource
* @param classPathResource Path to resource on classpath
* @return ClassPathSource instance
*/
public static ClassPathSource from(String classPathResource);
/**
* Create with custom classloader
* @param classPathResource Path to resource on classpath
* @param classLoader ClassLoader to use
* @return ClassPathSource instance
*/
public static ClassPathSource from(String classPathResource, ClassLoader classLoader);
/**
* Get the URL
* @return URL of the resource
*/
public URL url();
/**
* Get the classloader
* @return ClassLoader used
*/
public ClassLoader classLoader();
/**
* Check if inside archive (JAR)
* @return true if resource is inside a JAR file
*/
public boolean isInsideArchive();
/**
* Get input stream
* @return InputStream for reading resource
*/
public InputStream inputStream();
/**
* Get metadata
* @return Metadata for the source
*/
public Metadata metadata();
}
Thread Safety: Immutable after construction. Thread-safe for concurrent access. Each inputStream() call creates an independent stream.
Exception Handling:
NullPointerException - Resource not found on classpath
IOException - Error reading from JAR
IllegalArgumentException - Invalid resource path
OutOfMemoryError - Resource too large
Related APIs: ClassPathDocumentLoader, FileSystemSource, UrlSource
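A short sketch of probing a resource before reading it (the resource path is a placeholder):
import dev.langchain4j.data.document.source.ClassPathSource;
ClassPathSource source = ClassPathSource.from("documents/guide.txt"); // placeholder resource
if (source.isInsideArchive()) {
    // Packaged inside a JAR: there is no file-system path, so read only via inputStream()
    System.out.println("JAR resource: " + source.url());
} else {
    System.out.println("Exploded resource: " + source.url());
}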
Document source for URLs.
package dev.langchain4j.data.document.source;
/**
* DocumentSource for URL sources
*/
public class UrlSource implements DocumentSource {
/**
* Constructor
* @param url URL to load from
*/
public UrlSource(URL url);
/**
* Create from string URL
* @param url String URL
* @return UrlSource instance
*/
public static UrlSource from(String url);
/**
* Create from URL
* @param url URL object
* @return UrlSource instance
*/
public static UrlSource from(URL url);
/**
* Create from URI
* @param uri URI object
* @return UrlSource instance
*/
public static UrlSource from(URI uri);
/**
* Get input stream
* @return InputStream for reading from URL
*/
public InputStream inputStream();
/**
* Get metadata
* @return Metadata for the source
*/
public Metadata metadata();
}
Thread Safety: Immutable after construction. Thread-safe for concurrent access. Each inputStream() call makes a new HTTP request.
Exception Handling:
MalformedURLException - Invalid URL format
IOException - Network errors, HTTP errors
UnknownHostException - DNS failure
SSLException - HTTPS certificate errors
SocketTimeoutException - Connection timeout
OutOfMemoryError - Response too large
Related APIs: UrlDocumentLoader, FileSystemSource, ClassPathSource
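Because each inputStream() call issues a new HTTP request, read the stream once and keep the parsed Document; a sketch (placeholder URL, IOException handling assumed in the caller):
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.parser.TextDocumentParser;
import dev.langchain4j.data.document.source.UrlSource;
import java.io.InputStream;
UrlSource source = UrlSource.from("https://example.com/changelog.txt"); // placeholder URL
try (InputStream in = source.inputStream()) { // one HTTP request
    Document doc = new TextDocumentParser().parse(in);
    // Reuse 'doc' rather than calling inputStream() again
}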
Factory methods for recommended document splitters.
package dev.langchain4j.data.document.splitter;
/**
* Utility class providing factory methods for recommended document splitters
*/
public class DocumentSplitters {
/**
* Create recursive splitter with token limits (recommended for generic text)
* Splits by paragraphs, then lines, then sentences, then words, then characters
* @param maxSegmentSizeInTokens Maximum segment size in tokens
* @param maxOverlapSizeInTokens Maximum overlap size in tokens
* @param tokenCountEstimator Token count estimator
* @return Configured document splitter
*/
public static DocumentSplitter recursive(
int maxSegmentSizeInTokens,
int maxOverlapSizeInTokens,
TokenCountEstimator tokenCountEstimator
);
/**
* Create recursive splitter with character limits
* Splits by paragraphs, then lines, then sentences, then words, then characters
* @param maxSegmentSizeInChars Maximum segment size in characters
* @param maxOverlapSizeInChars Maximum overlap size in characters
* @return Configured document splitter
*/
public static DocumentSplitter recursive(
int maxSegmentSizeInChars,
int maxOverlapSizeInChars
);
}
Thread Safety: Factory methods are static and thread-safe. Returned DocumentSplitter instances are stateless and thread-safe. Safe to share a splitter instance across threads.
Exception Handling:
IllegalArgumentException - Invalid parameters (negative sizes, overlap > segment size)
NullPointerException - Null tokenizer for token-based splitting
OutOfMemoryError - Document too large with very small segment size
Related APIs: DocumentByParagraphSplitter, DocumentBySentenceSplitter, HierarchicalDocumentSplitter
Base class for hierarchical document splitters.
package dev.langchain4j.data.document.splitter;
/**
* Base class for hierarchical document splitters
* Provides machinery for sub-splitting documents when a single segment is too long
*/
public abstract class HierarchicalDocumentSplitter implements DocumentSplitter {
/**
* Split document into segments
* @param document Document to split
* @return List of text segments
*/
public List<TextSegment> split(Document document);
/**
* Split text implementation (abstract)
* @param text Text to split
* @return Array of split parts
*/
protected abstract String[] split(String text);
/**
* Get join delimiter (abstract)
* @return Delimiter used to join parts
*/
protected abstract String joinDelimiter();
/**
* Get default sub-splitter (abstract)
* @return Default sub-splitter to use if segment is too large
*/
protected abstract DocumentSplitter defaultSubSplitter();
/**
* Get overlap region at end of segment
* @param segmentText Segment text
* @return Overlap text
*/
protected String overlapFrom(String segmentText);
/**
* Estimate size in tokens or characters
* @param text Text to estimate
* @return Estimated size
*/
protected int estimateSize(String text);
/**
* Create segment with metadata
* @param text Segment text
* @param document Source document
* @param index Segment index
* @return Text segment with metadata
*/
protected static TextSegment createSegment(String text, Document document, int index);
}
Thread Safety: Implementations are stateless and thread-safe if the TokenCountEstimator is thread-safe. Safe to share across threads for splitting different documents concurrently.
Exception Handling:
IllegalArgumentException - Invalid configuration (overlap > segment size)
StackOverflowError - Infinite sub-splitting recursion
OutOfMemoryError - Too many segments generated
Related APIs: DocumentSplitters, DocumentByParagraphSplitter, DocumentBySentenceSplitter
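The abstract methods above are all a custom splitter needs. A sketch of a splitter that breaks on Markdown horizontal rules; it assumes the protected constructor mirrors the (maxSegmentSizeInChars, maxOverlapSizeInChars, subSplitter) shape of the concrete splitters below:
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.splitter.HierarchicalDocumentSplitter;
public class DocumentBySectionSplitter extends HierarchicalDocumentSplitter {
    public DocumentBySectionSplitter(int maxSegmentSizeInChars, int maxOverlapSizeInChars) {
        super(maxSegmentSizeInChars, maxOverlapSizeInChars, null); // assumed protected constructor
    }
    @Override
    protected String[] split(String text) {
        return text.split("\\n---\\n"); // section boundary: a horizontal rule on its own line
    }
    @Override
    protected String joinDelimiter() {
        return "\n---\n"; // restored between sections that fit into one segment
    }
    @Override
    protected DocumentSplitter defaultSubSplitter() {
        return null; // oversized sections are kept whole; plug in a sentence splitter if needed
    }
}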
Split documents by paragraphs.
package dev.langchain4j.data.document.splitter;
/**
* Splits documents into paragraphs and fits as many as possible into a single TextSegment
* Paragraph boundaries detected by double newlines
* Default sub-splitter is DocumentBySentenceSplitter
*/
public class DocumentByParagraphSplitter extends HierarchicalDocumentSplitter {
/**
* Constructor with character limits
* @param maxSegmentSizeInChars Maximum segment size in characters
* @param maxOverlapSizeInChars Maximum overlap size in characters
*/
public DocumentByParagraphSplitter(int maxSegmentSizeInChars, int maxOverlapSizeInChars);
/**
* Constructor with sub-splitter
* @param maxSegmentSizeInChars Maximum segment size in characters
* @param maxOverlapSizeInChars Maximum overlap size in characters
* @param subSplitter Sub-splitter to use for large paragraphs
*/
public DocumentByParagraphSplitter(
int maxSegmentSizeInChars,
int maxOverlapSizeInChars,
DocumentSplitter subSplitter
);
/**
* Constructor with token limits
* @param maxSegmentSizeInTokens Maximum segment size in tokens
* @param maxOverlapSizeInTokens Maximum overlap size in tokens
* @param tokenCountEstimator Token count estimator
*/
public DocumentByParagraphSplitter(
int maxSegmentSizeInTokens,
int maxOverlapSizeInTokens,
TokenCountEstimator tokenCountEstimator
);
/**
* Split text by paragraphs
* @param text Text to split
* @return Array of paragraphs
*/
protected String[] split(String text);
/**
* Get join delimiter
* @return "\n\n" (double newline)
*/
protected String joinDelimiter();
/**
* Get default sub-splitter
* @return DocumentBySentenceSplitter instance
*/
protected DocumentSplitter defaultSubSplitter();
}
Thread Safety: Stateless and thread-safe. Safe to share an instance across threads. The token counter must be thread-safe if used.
Exception Handling:
IllegalArgumentException - Invalid size parameters
NullPointerException - Null text or tokenizer
OutOfMemoryError - Too many small paragraphs in a large document
Related APIs: DocumentBySentenceSplitter, DocumentByLineSplitter, HierarchicalDocumentSplitter
Split documents by lines.
package dev.langchain4j.data.document.splitter;
/**
* Splits documents into lines and fits as many as possible into a single TextSegment
* Line boundaries detected by newline characters
* Default sub-splitter is DocumentBySentenceSplitter
*/
public class DocumentByLineSplitter extends HierarchicalDocumentSplitter {
/**
* Constructor with character limits
* @param maxSegmentSizeInChars Maximum segment size in characters
* @param maxOverlapSizeInChars Maximum overlap size in characters
*/
public DocumentByLineSplitter(int maxSegmentSizeInChars, int maxOverlapSizeInChars);
/**
* Constructor with sub-splitter
* @param maxSegmentSizeInChars Maximum segment size in characters
* @param maxOverlapSizeInChars Maximum overlap size in characters
* @param subSplitter Sub-splitter to use for large lines
*/
public DocumentByLineSplitter(
int maxSegmentSizeInChars,
int maxOverlapSizeInChars,
DocumentSplitter subSplitter
);
/**
* Constructor with token limits
* @param maxSegmentSizeInTokens Maximum segment size in tokens
* @param maxOverlapSizeInTokens Maximum overlap size in tokens
* @param tokenCountEstimator Token count estimator
*/
public DocumentByLineSplitter(
int maxSegmentSizeInTokens,
int maxOverlapSizeInTokens,
TokenCountEstimator tokenCountEstimator
);
/**
* Split text by lines
* @param text Text to split
* @return Array of lines
*/
protected String[] split(String text);
/**
* Get join delimiter
* @return "\n" (newline)
*/
protected String joinDelimiter();
/**
* Get default sub-splitter
* @return DocumentBySentenceSplitter instance
*/
protected DocumentSplitter defaultSubSplitter();
}
Thread Safety: Stateless and thread-safe. Safe for concurrent use. TokenCountEstimator must be thread-safe.
Exception Handling:
IllegalArgumentException - Invalid parameters
NullPointerException - Null input
OutOfMemoryError - Too many lines
Related APIs: DocumentByParagraphSplitter, DocumentBySentenceSplitter, HierarchicalDocumentSplitter
Split documents by sentences.
package dev.langchain4j.data.document.splitter;
/**
* Splits documents into sentences and fits as many as possible into a single TextSegment
* Uses Apache OpenNLP for sentence detection
*/
public class DocumentBySentenceSplitter extends HierarchicalDocumentSplitter {
/**
* Constructor with character limits
* @param maxSegmentSizeInChars Maximum segment size in characters
* @param maxOverlapSizeInChars Maximum overlap size in characters
*/
public DocumentBySentenceSplitter(int maxSegmentSizeInChars, int maxOverlapSizeInChars);
/**
* Constructor with sub-splitter
* @param maxSegmentSizeInChars Maximum segment size in characters
* @param maxOverlapSizeInChars Maximum overlap size in characters
* @param subSplitter Sub-splitter to use for large sentences
*/
public DocumentBySentenceSplitter(
int maxSegmentSizeInChars,
int maxOverlapSizeInChars,
DocumentSplitter subSplitter
);
/**
* Constructor with token limits
* @param maxSegmentSizeInTokens Maximum segment size in tokens
* @param maxOverlapSizeInTokens Maximum overlap size in tokens
* @param tokenCountEstimator Token count estimator
*/
public DocumentBySentenceSplitter(
int maxSegmentSizeInTokens,
int maxOverlapSizeInTokens,
TokenCountEstimator tokenCountEstimator
);
}
Thread Safety: The sentence detector is NOT thread-safe (an OpenNLP limitation). Do NOT share an instance across threads. Create one instance per thread or synchronize access.
Exception Handling:
ClassNotFoundException - OpenNLP dependency missing
IOException - Sentence model file not found
IllegalArgumentException - Invalid parameters
ConcurrentModificationException - Concurrent access (not thread-safe)
Related APIs: DocumentByWordSplitter, DocumentByParagraphSplitter, HierarchicalDocumentSplitter
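Given the thread-safety caveat above, a per-thread instance is a simple way to use this splitter from parallel code; a minimal sketch:
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.splitter.DocumentBySentenceSplitter;
// One splitter per thread: the underlying OpenNLP sentence detector is stateful
ThreadLocal<DocumentSplitter> perThreadSplitter =
        ThreadLocal.withInitial(() -> new DocumentBySentenceSplitter(500, 50));
// In worker code:
// List<TextSegment> segments = perThreadSplitter.get().split(document);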
Split documents by words.
package dev.langchain4j.data.document.splitter;
/**
* Splits documents into words and fits as many as possible into a single TextSegment
*/
public class DocumentByWordSplitter extends HierarchicalDocumentSplitter {
/**
* Constructor with character limits
* @param maxSegmentSizeInChars Maximum segment size in characters
* @param maxOverlapSizeInChars Maximum overlap size in characters
*/
public DocumentByWordSplitter(int maxSegmentSizeInChars, int maxOverlapSizeInChars);
/**
* Constructor with sub-splitter
* @param maxSegmentSizeInChars Maximum segment size in characters
* @param maxOverlapSizeInChars Maximum overlap size in characters
* @param subSplitter Sub-splitter to use for large words
*/
public DocumentByWordSplitter(
int maxSegmentSizeInChars,
int maxOverlapSizeInChars,
DocumentSplitter subSplitter
);
/**
* Constructor with token limits
* @param maxSegmentSizeInTokens Maximum segment size in tokens
* @param maxOverlapSizeInTokens Maximum overlap size in tokens
* @param tokenCountEstimator Token count estimator
*/
public DocumentByWordSplitter(
int maxSegmentSizeInTokens,
int maxOverlapSizeInTokens,
TokenCountEstimator tokenCountEstimator
);
}
Thread Safety: Stateless and thread-safe. Safe for concurrent use across threads.
Exception Handling:
IllegalArgumentException - Invalid parameters
NullPointerException - Null input
Related APIs: DocumentByCharacterSplitter, DocumentBySentenceSplitter, HierarchicalDocumentSplitter
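A minimal sketch of word-based splitting, typically useful when sentence boundaries are unreliable:
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.splitter.DocumentByWordSplitter;
import dev.langchain4j.data.segment.TextSegment;
import java.util.List;
// Packs as many whitespace-separated words as fit into 200-character segments,
// carrying roughly 20 characters' worth of words as overlap
DocumentByWordSplitter splitter = new DocumentByWordSplitter(200, 20);
List<TextSegment> segments = splitter.split(Document.from("a long run of words ..."));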
Split documents by characters.
package dev.langchain4j.data.document.splitter;
/**
* Splits documents into characters and fits as many as possible into a single TextSegment
* Supports character or token-based limits
*/
public class DocumentByCharacterSplitter extends HierarchicalDocumentSplitter {
/**
* Constructor with character limits
* @param maxSegmentSizeInChars Maximum segment size in characters
* @param maxOverlapSizeInChars Maximum overlap size in characters
*/
public DocumentByCharacterSplitter(int maxSegmentSizeInChars, int maxOverlapSizeInChars);
/**
* Constructor with sub-splitter
* @param maxSegmentSizeInChars Maximum segment size in characters
* @param maxOverlapSizeInChars Maximum overlap size in characters
* @param subSplitter Sub-splitter (typically null for character splitter)
*/
public DocumentByCharacterSplitter(
int maxSegmentSizeInChars,
int maxOverlapSizeInChars,
DocumentSplitter subSplitter
);
/**
* Constructor with token limits
* @param maxSegmentSizeInTokens Maximum segment size in tokens
* @param maxOverlapSizeInTokens Maximum overlap size in tokens
* @param tokenCountEstimator Token count estimator
*/
public DocumentByCharacterSplitter(
int maxSegmentSizeInTokens,
int maxOverlapSizeInTokens,
TokenCountEstimator tokenCountEstimator
);
/**
* Full constructor with token limits and sub-splitter
* @param maxSegmentSizeInTokens Maximum segment size in tokens
* @param maxOverlapSizeInTokens Maximum overlap size in tokens
* @param tokenCountEstimator Token count estimator
* @param subSplitter Sub-splitter (typically null)
*/
public DocumentByCharacterSplitter(
int maxSegmentSizeInTokens,
int maxOverlapSizeInTokens,
TokenCountEstimator tokenCountEstimator,
DocumentSplitter subSplitter
);
/**
* Split text implementation
* @param text Text to split
* @return Array of characters as strings
*/
protected String[] split(String text);
/**
* Get join delimiter
* @return "" (empty string)
*/
protected String joinDelimiter();
/**
* Get default sub-splitter
* @return null (no sub-splitter)
*/
protected DocumentSplitter defaultSubSplitter();
}
Thread Safety: Stateless and thread-safe. Safe for concurrent use.
Exception Handling:
IllegalArgumentException - Invalid parameters (overlap >= segment size)
NullPointerException - Null input
Related APIs: DocumentByWordSplitter, HierarchicalDocumentSplitter
Split documents using custom regex pattern.
package dev.langchain4j.data.document.splitter;
/**
* Splits documents using a custom regex pattern
*/
public class DocumentByRegexSplitter extends HierarchicalDocumentSplitter {
/**
* Constructor with character limits
* @param regex Regular expression pattern for splitting
* @param maxSegmentSizeInChars Maximum segment size in characters
* @param maxOverlapSizeInChars Maximum overlap size in characters
*/
public DocumentByRegexSplitter(String regex, int maxSegmentSizeInChars, int maxOverlapSizeInChars);
/**
* Constructor with sub-splitter
* @param regex Regular expression pattern for splitting
* @param maxSegmentSizeInChars Maximum segment size in characters
* @param maxOverlapSizeInChars Maximum overlap size in characters
* @param subSplitter Sub-splitter to use for large segments
*/
public DocumentByRegexSplitter(
String regex,
int maxSegmentSizeInChars,
int maxOverlapSizeInChars,
DocumentSplitter subSplitter
);
/**
* Constructor with token limits
* @param regex Regular expression pattern for splitting
* @param maxSegmentSizeInTokens Maximum segment size in tokens
* @param maxOverlapSizeInTokens Maximum overlap size in tokens
* @param tokenCountEstimator Token count estimator
*/
public DocumentByRegexSplitter(
String regex,
int maxSegmentSizeInTokens,
int maxOverlapSizeInTokens,
TokenCountEstimator tokenCountEstimator
);
}
Thread Safety: The regex Pattern is compiled at construction time, and java.util.regex.Pattern is immutable, so splitting is stateless and thread-safe. Safe for concurrent use.
Exception Handling:
PatternSyntaxException - Invalid regex pattern
IllegalArgumentException - Invalid size parameters
StackOverflowError - Catastrophic regex backtracking
Related APIs: Pattern class, HierarchicalDocumentSplitter, DocumentByLineSplitter
Use Cases by Content Type:
| Content Type | Recommended Splitter | Reasoning |
|---|---|---|
| Documentation, articles | DocumentSplitters.recursive() | Preserves semantic structure (paragraphs > sentences > words) |
| Code files | DocumentByLineSplitter | Code structure aligned with lines |
| Log files | DocumentByRegexSplitter | Custom delimiters (timestamps, log levels) |
| CSV/TSV | DocumentByLineSplitter | Each line is semantic unit |
| Legal documents | DocumentByParagraphSplitter | Paragraph = logical unit |
| Chat transcripts | DocumentByRegexSplitter | Split by speaker or timestamp |
| Markdown | DocumentByParagraphSplitter | Respects document structure |
| JSON/XML | Custom parser + DocumentByLineSplitter | Parse first, then split logical blocks |
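One way to apply this table programmatically is a small dispatch helper; a sketch (the extension mapping is illustrative, not part of the library):
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.splitter.DocumentByLineSplitter;
import dev.langchain4j.data.document.splitter.DocumentByParagraphSplitter;
import dev.langchain4j.data.document.splitter.DocumentByRegexSplitter;
import dev.langchain4j.data.document.splitter.DocumentSplitters;
// Illustrative dispatch following the table above
static DocumentSplitter splitterFor(String fileName) {
    if (fileName.endsWith(".java") || fileName.endsWith(".csv")) {
        return new DocumentByLineSplitter(1000, 0); // line = semantic unit
    }
    if (fileName.endsWith(".log")) {
        return new DocumentByRegexSplitter("\\n\\d{4}-\\d{2}-\\d{2}", 2000, 0); // timestamp boundaries
    }
    if (fileName.endsWith(".md")) {
        return new DocumentByParagraphSplitter(2000, 200); // respects document structure
    }
    return DocumentSplitters.recursive(2000, 200); // generic text default
}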
Custom hierarchy example:
DocumentSplitter customSplitter = new DocumentByRegexSplitter(
"\\n---\\n", // Custom section delimiter
1000,
100,
new DocumentByParagraphSplitter(1000, 100)
);
Parallel processing pattern:
List<Document> documents = loadDocuments();
List<TextSegment> allSegments = documents.parallelStream()
.flatMap(doc -> splitter.split(doc).stream())
.collect(Collectors.toList());
Batching for embedding:
int batchSize = 100;
for (int i = 0; i < segments.size(); i += batchSize) {
List<TextSegment> batch = segments.subList(
i,
Math.min(i + batchSize, segments.size())
);
List<Embedding> embeddings = embeddingModel.embedAll(batch).content();
// Store embeddings
}
Encoding pattern:
// Auto-detect encoding (defaults to UTF-8)
Document doc = FileSystemDocumentLoader.loadDocument("file.txt");
// Explicit encoding
Document doc2 = FileSystemDocumentLoader.loadDocument(
"file.txt",
new TextDocumentParser(StandardCharsets.ISO_8859_1)
);
Markdown pattern:
// Load as text (preserves markdown syntax)
Document doc = FileSystemDocumentLoader.loadDocument("README.md");
// Split by headers (custom regex)
DocumentSplitter splitter = new DocumentByRegexSplitter(
"\\n##? ", // Split on ## or # headers
2000,
200
);
Source code pattern:
// Load source code
Document code = FileSystemDocumentLoader.loadDocument("App.java");
// Split by lines (preserves structure)
DocumentSplitter splitter = new DocumentByLineSplitter(500, 50);
// Or split by functions (custom regex for Java)
DocumentSplitter functionSplitter = new DocumentByRegexSplitter(
"\\n\\s*(public|private|protected)\\s+",
1000,
100
);
CSV pattern:
// Load CSV
Document csv = FileSystemDocumentLoader.loadDocument("data.csv");
// Split by lines (each row is a segment)
DocumentSplitter splitter = new DocumentByLineSplitter(
1000, // Max chars per segment
0 // No overlap for structured data
);
// Filter header if needed
List<TextSegment> segments = splitter.split(csv).stream()
.skip(1) // Skip header row
.collect(Collectors.toList());
Log file pattern:
// Load log file
Document logs = FileSystemDocumentLoader.loadDocument("app.log");
// Split by timestamp pattern
DocumentSplitter splitter = new DocumentByRegexSplitter(
"\\n\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", // ISO timestamp
2000,
0 // No overlap for logs
);
JSON pattern:
// Parse JSON first, then create documents per object (JsonParser/JsonArray here are Gson classes)
String jsonContent = Files.readString(Path.of("data.json"));
JsonArray array = JsonParser.parseString(jsonContent).getAsJsonArray();
List<Document> documents = new ArrayList<>();
for (JsonElement element : array) {
String text = element.toString();
documents.add(Document.from(text));
}
// Split each document
List<TextSegment> segments = documents.stream()
.flatMap(doc -> splitter.split(doc).stream())
.collect(Collectors.toList());
PDF pattern (requires Apache PDFBox):
// Add dependency: dev.langchain4j:langchain4j-document-parser-apache-pdfbox
import dev.langchain4j.data.document.parser.apache.pdfbox.ApachePdfBoxDocumentParser;
Document pdf = FileSystemDocumentLoader.loadDocument(
"document.pdf",
new ApachePdfBoxDocumentParser()
);
// Split with token-based limits (PDFs often verbose)
DocumentSplitter splitter = DocumentSplitters.recursive(500, 50, tokenizer); // 'tokenizer' is a TokenCountEstimator created elsewhere
File filtering pattern:
PathMatcher textFilesOnly = FileSystems.getDefault().getPathMatcher(
"glob:*.{txt,md,java,py,js,json,xml,csv,log}"
);
List<Document> docs = FileSystemDocumentLoader.loadDocuments(
Path.of("/path/to/dir"),
textFilesOnly
);
Testing document loading:
import org.junit.jupiter.api.Test;
import static org.assertj.core.api.Assertions.*;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.loader.FileSystemDocumentLoader;
import dev.langchain4j.data.document.parser.TextDocumentParser;
import java.nio.charset.StandardCharsets;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
class DocumentLoaderTest {
@Test
void testLoadSingleDocument() {
// Given
Path testFile = Path.of("src/test/resources/test.txt");
// When
Document doc = FileSystemDocumentLoader.loadDocument(testFile);
// Then
assertThat(doc.text()).isNotEmpty();
assertThat(doc.metadata().get("file_name")).isEqualTo("test.txt");
}
@Test
void testLoadNonExistentFile() {
// Given
Path nonExistent = Path.of("does-not-exist.txt");
// When/Then
assertThatThrownBy(() -> FileSystemDocumentLoader.loadDocument(nonExistent))
.isInstanceOf(NoSuchFileException.class);
}
@Test
void testLoadWithCustomCharset() {
// Given
Path latin1File = Path.of("src/test/resources/latin1.txt");
TextDocumentParser parser = new TextDocumentParser(StandardCharsets.ISO_8859_1);
// When
Document doc = FileSystemDocumentLoader.loadDocument(latin1File, parser);
// Then
assertThat(doc.text()).contains("café"); // Correctly decoded
}
}
Testing document splitting:
class DocumentSplitterTest {
private DocumentSplitter splitter;
@BeforeEach
void setUp() {
splitter = DocumentSplitters.recursive(100, 10);
}
@Test
void testSplitSmallDocument() {
// Given
Document doc = Document.from("Short text.");
// When
List<TextSegment> segments = splitter.split(doc);
// Then
assertThat(segments).hasSize(1);
assertThat(segments.get(0).text()).isEqualTo("Short text.");
}
@Test
void testSplitLargeDocument() {
// Given
String longText = "A ".repeat(100); // 200 characters
Document doc = Document.from(longText);
// When
List<TextSegment> segments = splitter.split(doc);
// Then
assertThat(segments).hasSizeGreaterThan(1);
assertThat(segments).allMatch(s -> s.text().length() <= 100);
}
@Test
void testOverlapBetweenSegments() {
// Given
String text = "Sentence one. Sentence two. Sentence three. Sentence four.";
Document doc = Document.from(text);
DocumentSplitter splitterWithOverlap = new DocumentBySentenceSplitter(30, 10);
// When
List<TextSegment> segments = splitterWithOverlap.split(doc);
// Then
assertThat(segments.size()).isGreaterThan(1);
// Verify overlap exists
for (int i = 0; i < segments.size() - 1; i++) {
String currentEnd = segments.get(i).text().substring(
Math.max(0, segments.get(i).text().length() - 10)
);
String nextStart = segments.get(i + 1).text().substring(0,
Math.min(10, segments.get(i + 1).text().length())
);
// Some overlap should exist
assertThat(nextStart).containsAnyOf(currentEnd.split(" "));
}
}
@Test
void testMetadataPreserved() {
// Given
Metadata metadata = new Metadata();
metadata.put("source", "test.txt");
Document doc = Document.from("Text content", metadata);
// When
List<TextSegment> segments = splitter.split(doc);
// Then
assertThat(segments).allMatch(s ->
s.metadata().get("source").equals("test.txt")
);
}
}
Testing a RAG pipeline:
class RAGPipelineTest {
private EmbeddingModel embeddingModel;
private EmbeddingStore<TextSegment> embeddingStore;
private DocumentSplitter splitter;
@BeforeEach
void setUp() {
embeddingModel = new AllMiniLmL6V2EmbeddingModel();
embeddingStore = new InMemoryEmbeddingStore<>();
splitter = DocumentSplitters.recursive(300, 30,
new OpenAiTokenizer("gpt-3.5-turbo"));
}
@Test
void testCompleteRAGPipeline() {
// Given: Load and split documents
List<Document> docs = FileSystemDocumentLoader.loadDocuments(
Path.of("src/test/resources/docs")
);
List<TextSegment> segments = docs.stream()
.flatMap(doc -> splitter.split(doc).stream())
.collect(Collectors.toList());
// Index segments
for (TextSegment segment : segments) {
Embedding embedding = embeddingModel.embed(segment).content();
embeddingStore.add(embedding, segment);
}
// When: Search
String query = "What is document processing?";
Embedding queryEmbedding = embeddingModel.embed(query).content();
List<EmbeddingMatch<TextSegment>> matches =
embeddingStore.findRelevant(queryEmbedding, 3);
// Then: Verify results
assertThat(matches).isNotEmpty();
assertThat(matches).hasSizeLessThanOrEqualTo(3);
assertThat(matches.get(0).score()).isGreaterThan(0.5);
assertThat(matches.get(0).embedded().text()).containsIgnoringCase("document");
}
}
Testing error handling:
class ErrorHandlingTest {
@Test
void testLargeFileHandling() throws Exception {
// Given: Simulate large file
Path largeFile = createLargeTestFile(1_000_000_000); // 1GB; createLargeTestFile is a local test helper, not part of the API
// When/Then: Should handle gracefully or throw OOME
assertThatThrownBy(() ->
FileSystemDocumentLoader.loadDocument(largeFile)
).isInstanceOfAny(OutOfMemoryError.class, IOException.class);
// Cleanup
Files.deleteIfExists(largeFile);
}
@Test
void testInvalidEncodingHandling() {
// Given: File with invalid UTF-8
Path invalidFile = Path.of("src/test/resources/invalid-utf8.txt");
// When: Load with UTF-8 parser
Document doc = FileSystemDocumentLoader.loadDocument(invalidFile);
// Then: Should contain replacement characters
assertThat(doc.text()).contains("\uFFFD"); // Replacement character
}
@Test
void testEmptyFileHandling() throws Exception {
// Given: Empty file
Path emptyFile = Files.createTempFile("empty", ".txt");
// When
Document doc = FileSystemDocumentLoader.loadDocument(emptyFile);
// Then
assertThat(doc.text()).isEmpty();
// Cleanup
Files.deleteIfExists(emptyFile);
}
}
Loading from the file system:
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.loader.FileSystemDocumentLoader;
import dev.langchain4j.data.document.parser.TextDocumentParser;
import java.nio.file.Path;
import java.util.List;
// Load single document
Document doc = FileSystemDocumentLoader.loadDocument(Path.of("/path/to/file.txt"));
// Load with custom parser
Document doc2 = FileSystemDocumentLoader.loadDocument(
Path.of("/path/to/file.txt"),
new TextDocumentParser()
);
// Load all documents from directory
List<Document> docs = FileSystemDocumentLoader.loadDocuments(Path.of("/path/to/dir"));
// Load recursively
List<Document> allDocs = FileSystemDocumentLoader.loadDocumentsRecursively(
Path.of("/path/to/dir")
);
Loading from the classpath:
import dev.langchain4j.data.document.Document;
import java.util.List;
import dev.langchain4j.data.document.loader.ClassPathDocumentLoader;
// Load single file from classpath
Document doc = ClassPathDocumentLoader.loadDocument("documents/guide.txt");
// Load all documents from classpath directory
List<Document> docs = ClassPathDocumentLoader.loadDocuments("documents");
// Load recursively
List<Document> allDocs = ClassPathDocumentLoader.loadDocumentsRecursively("documents");
Recommended splitting:
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.splitter.DocumentSplitter;
import dev.langchain4j.data.document.splitter.DocumentSplitters;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.openai.OpenAiTokenizer;
import java.util.List;
// Recommended: recursive splitter with token limits
DocumentSplitter splitter = DocumentSplitters.recursive(
500, // max tokens per segment
50, // overlap tokens
new OpenAiTokenizer("gpt-3.5-turbo") // model name selects the token encoding
);
List<TextSegment> segments = splitter.split(document);
// Simple: recursive splitter with character limits
DocumentSplitter charSplitter = DocumentSplitters.recursive(2000, 200);
List<TextSegment> charSegments = charSplitter.split(document);
Paragraph splitting with a sub-splitter:
import dev.langchain4j.data.document.splitter.DocumentByParagraphSplitter;
import dev.langchain4j.data.document.splitter.DocumentBySentenceSplitter;
// Split by paragraphs with custom sub-splitter
DocumentSplitter splitter = new DocumentByParagraphSplitter(
1000, // max characters per segment
100, // overlap characters
new DocumentBySentenceSplitter(1000, 100) // sub-splitter for large paragraphs
);
List<TextSegment> segments = splitter.split(document);
End-to-end RAG pipeline:
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.loader.FileSystemDocumentLoader;
import dev.langchain4j.data.document.splitter.DocumentSplitters;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.store.embedding.EmbeddingStore;
import dev.langchain4j.store.embedding.inmemory.InMemoryEmbeddingStore;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
// 1. Load documents
List<Document> documents = FileSystemDocumentLoader.loadDocumentsRecursively(
Path.of("/path/to/docs")
);
// 2. Split into segments
DocumentSplitter splitter = DocumentSplitters.recursive(300, 30, tokenizer); // 'tokenizer' is a TokenCountEstimator created elsewhere
List<TextSegment> segments = new ArrayList<>();
for (Document doc : documents) {
segments.addAll(splitter.split(doc));
}
// 3. Embed segments
EmbeddingStore<TextSegment> embeddingStore = new InMemoryEmbeddingStore<>();
for (TextSegment segment : segments) {
Embedding embedding = embeddingModel.embed(segment).content();
embeddingStore.add(embedding, segment);
}
// 4. Use with AI service for RAG
ContentRetriever contentRetriever = EmbeddingStoreContentRetriever.builder()
.embeddingStore(embeddingStore)
.embeddingModel(embeddingModel)
.maxResults(3)
.build();
Assistant assistant = AiServices.builder(Assistant.class)
.chatModel(chatModel)
.contentRetriever(contentRetriever)
.build();
Filtering files with PathMatcher:
import java.nio.file.FileSystems;
import java.nio.file.PathMatcher;
// Create matcher for text files only
PathMatcher textFiles = FileSystems.getDefault().getPathMatcher(
"glob:*.{txt,md,java,py,js}"
);
// Load only matching files
List<Document> docs = FileSystemDocumentLoader.loadDocumentsRecursively(
Path.of("/path/to/code"),
textFiles,
new TextDocumentParser()
);
Parallel loading and splitting:
import java.util.concurrent.ForkJoinPool;
import java.util.stream.Collectors;
// Load documents in parallel
List<Document> documents = FileSystemDocumentLoader.loadDocumentsRecursively(
Path.of("/path/to/docs")
);
// Split in parallel
ForkJoinPool customThreadPool = new ForkJoinPool(4);
List<TextSegment> allSegments = customThreadPool.submit(() ->
documents.parallelStream()
.flatMap(doc -> splitter.split(doc).stream())
.collect(Collectors.toList())
).join();
customThreadPool.shutdown();
Working with custom encodings:
import java.nio.charset.StandardCharsets;
// Latin-1 encoded file
Document latin1Doc = FileSystemDocumentLoader.loadDocument(
Path.of("latin1-file.txt"),
new TextDocumentParser(StandardCharsets.ISO_8859_1)
);
// Windows-1252 encoded file
Document windowsDoc = FileSystemDocumentLoader.loadDocument(
Path.of("windows-file.txt"),
new TextDocumentParser(Charset.forName("Windows-1252"))
);
Splitting logs by regex:
// Split log file by timestamp entries
DocumentSplitter logSplitter = new DocumentByRegexSplitter(
"\\n\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", // ISO timestamp pattern
2000, // max chars per segment
0 // no overlap for logs
);
Document logs = FileSystemDocumentLoader.loadDocument("application.log");
List<TextSegment> logEntries = logSplitter.split(logs);
Batching embedding calls:
// Batch segments for embedding API calls
int batchSize = 100;
List<TextSegment> allSegments = splitter.split(document);
for (int i = 0; i < allSegments.size(); i += batchSize) {
List<TextSegment> batch = allSegments.subList(
i,
Math.min(i + batchSize, allSegments.size())
);
// Embed entire batch in one API call
List<Embedding> embeddings = embeddingModel.embedAll(batch).content();
// Store embeddings
for (int j = 0; j < batch.size(); j++) {
embeddingStore.add(embeddings.get(j), batch.get(j));
}
}
Install with Tessl CLI
npx tessl i tessl/maven-dev-langchain4j--langchain4j@1.11.0