Common classes used across Spring AI, providing document processing, text transformation, embedding utilities, observability support, and tokenization capabilities for AI application development.
Text splitting breaks documents into smaller chunks optimized for AI operations like embeddings and context windows.
The text splitting layer consists of the abstract TextSplitter base class and its token-based TokenTextSplitter implementation.
Text splitters are DocumentTransformers that take documents and produce multiple smaller documents (chunks) from each original document.
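For orientation, a minimal sketch of that contract (using TokenTextSplitter, described later in this section, as the concrete splitter):
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentTransformer;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import java.util.List;
// A splitter is a DocumentTransformer: a list of documents in, a list of chunks out
DocumentTransformer splitter = new TokenTextSplitter();
List<Document> chunks = splitter.apply(List.of(new Document("Long document content...")));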
Base class for all text splitting implementations.
package org.springframework.ai.transformer.splitter;
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentTransformer;
import java.util.List;
abstract class TextSplitter implements DocumentTransformer {
/**
* Apply splitting to list of documents.
* @param documents documents to split
* @return split documents (chunks)
*/
List<Document> apply(List<Document> documents);
/**
* Split list of documents (convenience method).
* Same as apply().
* @param documents documents to split
* @return split documents (chunks)
*/
List<Document> split(List<Document> documents);
/**
* Split single document into chunks.
* @param document document to split
* @return list of document chunks
*/
List<Document> split(Document document);
/**
* Check if content formatter is copied to chunks.
* @return true if formatter is copied
*/
boolean isCopyContentFormatter();
/**
* Set whether to copy content formatter to chunks.
* @param copyContentFormatter true to copy formatter
*/
void setCopyContentFormatter(boolean copyContentFormatter);
/**
* Split text into string chunks (implementation method).
* Subclasses must implement this method.
* @param text text to split
* @return list of text chunks
*/
protected abstract List<String> splitText(String text);
}

import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TextSplitter;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import java.util.List;
// Create text splitter
TextSplitter splitter = new TokenTextSplitter();
// Split single document
Document longDoc = new Document("Very long document content that needs to be split...");
List<Document> chunks = splitter.split(longDoc);
System.out.println("Original: 1 document");
System.out.println("After splitting: " + chunks.size() + " chunks");
// Split multiple documents
List<Document> documents = List.of(
new Document("First long document..."),
new Document("Second long document..."),
new Document("Third long document...")
);
List<Document> allChunks = splitter.apply(documents);
// Process chunks
for (Document chunk : allChunks) {
System.out.println("Chunk ID: " + chunk.getId());
System.out.println("Content: " + chunk.getText());
System.out.println("Metadata: " + chunk.getMetadata());
}

import org.springframework.ai.document.Document;
import org.springframework.ai.document.DefaultContentFormatter;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import java.util.List;
// Create document with custom formatter
DefaultContentFormatter customFormatter = DefaultContentFormatter.builder()
.withMetadataTemplate("[%s]=%s")
.build();
Document doc = Document.builder()
.text("Long content...")
.metadata("source", "manual")
.build();
doc.setContentFormatter(customFormatter);
// Create splitter
TokenTextSplitter splitter = new TokenTextSplitter();
// By default, copyContentFormatter is true
splitter.setCopyContentFormatter(true);
// Split - chunks inherit the formatter
List<Document> chunks = splitter.split(doc);
// All chunks have the custom formatter
for (Document chunk : chunks) {
// Uses inherited custom formatter
String formatted = chunk.getFormattedContent();
}
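The protected splitText method is the only hook a custom splitter has to provide; the base class turns the returned strings back into Document chunks. A minimal sketch of a paragraph-based splitter (the ParagraphTextSplitter name and blank-line rule are illustrative, not part of Spring AI):
import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TextSplitter;
import java.util.Arrays;
import java.util.List;
// Hypothetical splitter producing one chunk per blank-line-separated paragraph
class ParagraphTextSplitter extends TextSplitter {
    @Override
    protected List<String> splitText(String text) {
        return Arrays.stream(text.split("\\n\\s*\\n"))
                .map(String::trim)
                .filter(paragraph -> !paragraph.isEmpty())
                .toList();
    }
}
// The inherited split()/apply() methods handle Document creation and metadata copying
TextSplitter paragraphSplitter = new ParagraphTextSplitter();
List<Document> paragraphChunks = paragraphSplitter.split(
        new Document("First paragraph.\n\nSecond paragraph."));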
Splits text into chunks based on token count using the JTokkit tokenizer.

package org.springframework.ai.transformer.splitter;
import java.util.List;
class TokenTextSplitter extends TextSplitter {
/**
* Create with default settings.
* Default chunk size: 800 tokens
* Default encoding: CL100K_BASE (GPT-3.5/GPT-4)
*/
TokenTextSplitter();
/**
* Create with separator retention control.
* @param keepSeparator true to keep separators in chunks
*/
TokenTextSplitter(boolean keepSeparator);
/**
* Create with full configuration.
* @param chunkSize target chunk size in tokens
* @param minChunkSizeChars minimum chunk size in characters
* @param minChunkLengthToEmbed minimum chunk length to include
* @param maxNumChunks maximum number of chunks (0 = unlimited)
* @param keepSeparator true to keep separators in chunks
*/
TokenTextSplitter(int chunkSize, int minChunkSizeChars, int minChunkLengthToEmbed,
int maxNumChunks, boolean keepSeparator);
/**
* Create builder for configuration.
* @return builder instance
*/
static Builder builder();
/**
* Split text into string chunks.
* @param text text to split
* @return list of text chunks
*/
protected List<String> splitText(String text);
/**
* Split text with specific chunk size.
* @param text text to split
* @param chunkSize chunk size in tokens
* @return list of text chunks
*/
List<String> doSplit(String text, int chunkSize);
}

class TokenTextSplitter.Builder {
/**
* Set chunk size in tokens.
* Default: 800
* @param chunkSize target chunk size
* @return this builder
*/
Builder withChunkSize(int chunkSize);
/**
* Set minimum chunk size in characters.
* Chunks smaller than this are discarded.
* Default: 350
* @param minChunkSizeChars minimum size
* @return this builder
*/
Builder withMinChunkSizeChars(int minChunkSizeChars);
/**
* Set minimum chunk length to embed.
* Chunks shorter than this are not embedded.
* Default: 5
* @param minChunkLengthToEmbed minimum length
* @return this builder
*/
Builder withMinChunkLengthToEmbed(int minChunkLengthToEmbed);
/**
* Set maximum number of chunks.
* Set to 0 for unlimited.
* Default: 0 (unlimited)
* Note: This is a suggestion rather than a hard limit. The actual number
* of chunks may exceed this value depending on the text structure and
* splitting algorithm behavior.
* @param maxNumChunks maximum chunks (suggestion)
* @return this builder
*/
Builder withMaxNumChunks(int maxNumChunks);
/**
* Set whether to keep separators in chunks.
* Default: true
* @param keepSeparator true to keep separators
* @return this builder
*/
Builder withKeepSeparator(boolean keepSeparator);
/**
* Build the TokenTextSplitter.
* @return configured splitter
*/
TokenTextSplitter build();
}

import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import java.util.List;
import java.util.Map;
// Default configuration (800 tokens per chunk)
TokenTextSplitter defaultSplitter = new TokenTextSplitter();
Document doc = new Document("Long document content...");
List<Document> chunks = defaultSplitter.split(doc);
// Custom chunk size
TokenTextSplitter customSplitter = TokenTextSplitter.builder()
.withChunkSize(500) // 500 tokens per chunk
.build();
List<Document> smallerChunks = customSplitter.split(doc);
// Full configuration
TokenTextSplitter advancedSplitter = TokenTextSplitter.builder()
.withChunkSize(1000) // Target 1000 tokens per chunk
.withMinChunkSizeChars(200) // Discard chunks < 200 chars
.withMinChunkLengthToEmbed(10) // Don't embed chunks < 10 chars
.withMaxNumChunks(50) // Maximum 50 chunks per document
.withKeepSeparator(true) // Keep separators in chunks
.build();
List<Document> advancedChunks = advancedSplitter.split(doc);
// Split with metadata preservation
Document docWithMetadata = Document.builder()
.text("Long content...")
.metadata("source", "user-manual")
.metadata("chapter", "3")
.metadata("page", 42)
.build();
TokenTextSplitter splitter = TokenTextSplitter.builder()
.withChunkSize(500)
.build();
List<Document> chunksWithMetadata = splitter.split(docWithMetadata);
// Each chunk inherits original metadata
for (int i = 0; i < chunksWithMetadata.size(); i++) {
Document chunk = chunksWithMetadata.get(i);
System.out.println("Chunk " + i);
System.out.println("Text: " + chunk.getText());
System.out.println("Source: " + chunk.getMetadata().get("source"));
System.out.println("Chapter: " + chunk.getMetadata().get("chapter"));
}

TokenTextSplitter uses the CL100K_BASE encoding (used by GPT-3.5-turbo and GPT-4) by default through the JTokkit tokenizer, which ensures accurate token counting for OpenAI models.
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.document.Document;
import java.util.List;
// CL100K_BASE encoding (default)
TokenTextSplitter splitter = new TokenTextSplitter();
// Example: This text is approximately 50 tokens
String text = "Artificial Intelligence has transformed many industries. " +
"Machine learning models can now understand natural language, " +
"generate images, and even write code. The future of AI is bright.";
Document doc = new Document(text);
List<Document> chunks = splitter.split(doc);
System.out.println("Number of chunks: " + chunks.size());
// With 800 token default, this stays as 1 chunk
// Split into smaller 25-token chunks
TokenTextSplitter smallChunkSplitter = TokenTextSplitter.builder()
.withChunkSize(25)
.build();
List<Document> smallChunks = smallChunkSplitter.split(doc);
System.out.println("Small chunks: " + smallChunks.size());
// Results in ~2 chunks
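To check how many tokens each chunk actually contains, the counts can be measured with this module's JTokkitTokenCountEstimator, which by default counts tokens with the same CL100K_BASE encoding (a small follow-up sketch using the chunks from the example above):
import org.springframework.ai.tokenizer.JTokkitTokenCountEstimator;
import org.springframework.ai.tokenizer.TokenCountEstimator;
// Measure the token count of each produced chunk
TokenCountEstimator estimator = new JTokkitTokenCountEstimator();
for (Document chunk : smallChunks) {
    System.out.println("Chunk tokens: " + estimator.estimate(chunk.getText()));
}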
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.TextReader;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.core.io.ClassPathResource;
import java.util.List;
// Read large document
DocumentReader reader = new TextReader(new ClassPathResource("knowledge-base.txt"));
List<Document> documents = reader.get();
// Split for embedding (typical embedding models have 512-8192 token limits)
TokenTextSplitter embeddingSplitter = TokenTextSplitter.builder()
.withChunkSize(512) // Fit within embedding model limit
.withMinChunkSizeChars(100) // Filter out tiny chunks
.withMinChunkLengthToEmbed(20) // Must have substantial content
.build();
List<Document> chunks = embeddingSplitter.apply(documents);
System.out.println("Original documents: " + documents.size());
System.out.println("Chunks for embedding: " + chunks.size());
// Add chunk metadata
for (int i = 0; i < chunks.size(); i++) {
Document chunk = chunks.get(i);
chunk.getMetadata().put("chunk_index", i);
chunk.getMetadata().put("total_chunks", chunks.size());
}
// Now ready for embedding and vector store ingestion

import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import java.util.List;
// Create splitter matching model's context window
// Example: GPT-3.5-turbo has 4096 token context window
// Reserve tokens for prompt + response
int contextWindow = 4096;
int promptTokens = 500;
int responseTokens = 500;
int documentTokenBudget = contextWindow - promptTokens - responseTokens;
TokenTextSplitter contextSplitter = TokenTextSplitter.builder()
.withChunkSize(documentTokenBudget) // ~3000 tokens
.build();
Document largeDoc = new Document("Very large document...");
List<Document> contextChunks = contextSplitter.split(largeDoc);
// Use first chunk that fits in context
Document firstChunk = contextChunks.get(0);
// Send to LLM with prompt
// Or process all chunks sequentially
for (Document chunk : contextChunks) {
String prompt = "Analyze this content: " + chunk.getText();
// Send prompt to LLM
}

import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import java.util.ArrayList;
import java.util.List;
/**
* Create overlapping chunks to maintain context across boundaries.
*/
class OverlappingChunker {
private final TokenTextSplitter splitter;
private final int overlapTokens;
public OverlappingChunker(int chunkSize, int overlapTokens) {
this.splitter = TokenTextSplitter.builder()
.withChunkSize(chunkSize)
.build();
this.overlapTokens = overlapTokens;
}
public List<Document> splitWithOverlap(Document document) {
// Get non-overlapping chunks first
List<Document> baseChunks = splitter.split(document);
if (baseChunks.size() <= 1) {
return baseChunks;
}
List<Document> overlappingChunks = new ArrayList<>();
for (int i = 0; i < baseChunks.size(); i++) {
String currentText = baseChunks.get(i).getText();
// Add overlap from previous chunk
if (i > 0) {
String prevText = baseChunks.get(i - 1).getText();
String[] prevWords = prevText.split("\\s+");
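// Note: words are used here as a rough proxy for tokens when building the overlap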
int overlapWords = Math.min(overlapTokens, prevWords.length);
String overlap = String.join(" ",
List.of(prevWords).subList(prevWords.length - overlapWords, prevWords.length));
currentText = overlap + " " + currentText;
}
Document chunk = Document.builder()
.text(currentText)
.metadata(baseChunks.get(i).getMetadata())
.metadata("chunk_index", i)
.metadata("has_overlap", i > 0)
.build();
overlappingChunks.add(chunk);
}
return overlappingChunks;
}
}
// Usage
Document doc = new Document("Long document with important context across sections...");
OverlappingChunker chunker = new OverlappingChunker(
500, // 500 token chunks
50 // 50 token overlap
);
List<Document> overlappingChunks = chunker.splitWithOverlap(doc);

import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.JsonReader;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.core.io.ClassPathResource;
import java.util.List;
// Read multiple documents
DocumentReader reader = new JsonReader(
new ClassPathResource("documents.json"),
"title", "content", "author"
);
List<Document> documents = reader.get();
System.out.println("Loaded " + documents.size() + " documents");
// Split all documents
TokenTextSplitter splitter = TokenTextSplitter.builder()
.withChunkSize(600)
.withMinChunkSizeChars(150)
.build();
List<Document> allChunks = splitter.apply(documents);
System.out.println("Created " + allChunks.size() + " chunks");
// Calculate statistics
int totalOriginalDocs = documents.size();
int totalChunks = allChunks.size();
double avgChunksPerDoc = (double) totalChunks / totalOriginalDocs;
System.out.println("Average chunks per document: " + avgChunksPerDoc);import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.tokenizer.JTokkitTokenCountEstimator;
import org.springframework.ai.tokenizer.TokenCountEstimator;
import java.util.List;
/**
* Adaptively choose chunk size based on document length.
*/
class AdaptiveChunker {
private final TokenCountEstimator estimator = new JTokkitTokenCountEstimator();
public List<Document> splitAdaptively(Document document) {
int tokenCount = estimator.estimate(document.getText());
TokenTextSplitter splitter;
if (tokenCount < 500) {
// Small document - no splitting
return List.of(document);
} else if (tokenCount < 2000) {
// Medium document - 400 token chunks
splitter = TokenTextSplitter.builder()
.withChunkSize(400)
.build();
} else if (tokenCount < 10000) {
// Large document - 800 token chunks
splitter = TokenTextSplitter.builder()
.withChunkSize(800)
.build();
} else {
// Very large document - 1200 token chunks
splitter = TokenTextSplitter.builder()
.withChunkSize(1200)
.build();
}
return splitter.split(document);
}
}
// Usage
AdaptiveChunker adaptiveChunker = new AdaptiveChunker();
Document shortDoc = new Document("Short content");
List<Document> shortChunks = adaptiveChunker.splitAdaptively(shortDoc);
// No splitting
Document longDoc = new Document("Very long content...".repeat(1000));
List<Document> longChunks = adaptiveChunker.splitAdaptively(longDoc);
// Split with appropriate chunk size

import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import java.util.ArrayList;
import java.util.List;
/**
* Split by sections first, then apply token-based splitting if needed.
*/
class SemanticSplitter {
private final TokenTextSplitter tokenSplitter;
public SemanticSplitter(int maxTokensPerChunk) {
this.tokenSplitter = TokenTextSplitter.builder()
.withChunkSize(maxTokensPerChunk)
.build();
}
public List<Document> splitBySections(Document document) {
String text = document.getText();
// Split by markdown headers or section markers
String[] sections = text.split("(?m)^#{1,3}\\s+");
List<Document> chunks = new ArrayList<>();
for (int i = 0; i < sections.length; i++) {
String section = sections[i].trim();
if (section.isEmpty()) continue;
Document sectionDoc = Document.builder()
.text(section)
.metadata(document.getMetadata())
.metadata("section_index", i)
.build();
// If section is too large, split it further
List<Document> sectionChunks = tokenSplitter.split(sectionDoc);
chunks.addAll(sectionChunks);
}
return chunks;
}
}
// Usage
String documentText = """
# Introduction
This is the introduction section with important context.
### Background
Detailed background information goes here.
### Methodology
Our approach involves several steps...
""";
Document doc = new Document(documentText);
SemanticSplitter semanticSplitter = new SemanticSplitter(500);
List<Document> semanticChunks = semanticSplitter.splitBySections(doc);
// Each major section is preserved, but split if too long

import org.springframework.ai.document.Document;
import org.springframework.ai.document.DefaultContentFormatter;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.transformer.ContentFormatTransformer;
import java.util.List;
// Split documents
TokenTextSplitter splitter = TokenTextSplitter.builder()
.withChunkSize(500)
.build();
// Format chunks
DefaultContentFormatter formatter = DefaultContentFormatter.builder()
.withExcludedEmbedMetadataKeys("internal_id")
.build();
ContentFormatTransformer formatTransformer = new ContentFormatTransformer(formatter);
// Combined pipeline
List<Document> documents = List.of(new Document("Long content..."));
List<Document> processedChunks = formatTransformer.apply(
splitter.apply(documents)
);
// Or use function composition
var pipeline = splitter.andThen(formatTransformer);
List<Document> result = pipeline.apply(documents);

Thread Safety:
TokenTextSplitter: Thread-safe, can be reused across threads
TextSplitter: Abstract class, thread-safety depends on implementation
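As noted above, one configured TokenTextSplitter instance can be shared by concurrent callers; a minimal sketch of parallel splitting (the parallel stream is illustrative, not a Spring AI requirement):
import org.springframework.ai.document.Document;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import java.util.List;
// One splitter instance shared across worker threads
TokenTextSplitter sharedSplitter = TokenTextSplitter.builder()
        .withChunkSize(500)
        .build();
List<Document> inputs = List.of(
        new Document("First long document..."),
        new Document("Second long document..."));
// Each document is split independently, so parallel execution is safe
List<Document> chunks = inputs.parallelStream()
        .map(sharedSplitter::split)
        .flatMap(List::stream)
        .toList();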
Performance:

Memory Characteristics:
Common Exceptions:
IllegalArgumentException: If chunkSize <= 0, minChunkSizeChars < 0, or other invalid parameters
NullPointerException: If document or text is null
RuntimeException: Token encoding errors (rare)

Edge Cases:
// Empty document content is rejected
try {
Document empty = new Document("");
List<Document> chunks = splitter.split(empty); // IllegalArgumentException for empty content
} catch (IllegalArgumentException e) {
// Handle empty document
}
// Document smaller than chunk size
Document small = new Document("Short text");
TokenTextSplitter splitter = TokenTextSplitter.builder()
.withChunkSize(1000)
.build();
List<Document> chunks = splitter.split(small); // Returns a single chunk containing the full text
// Very large chunk size
TokenTextSplitter splitter = TokenTextSplitter.builder()
.withChunkSize(100000)
.build();
// All documents become single chunks
// Minimum chunk size filtering
TokenTextSplitter splitter = TokenTextSplitter.builder()
.withChunkSize(100)
.withMinChunkSizeChars(50)
.build();
// Chunks with <50 characters are discarded
// MaxNumChunks is a suggestion, not a hard limit
TokenTextSplitter splitter = TokenTextSplitter.builder()
.withChunkSize(100)
.withMaxNumChunks(5)
.build();
// May produce more than 5 chunks depending on text structure

Install with Tessl CLI
npx tessl i tessl/maven-org-springframework-ai--spring-ai-commons