Common classes used across Spring AI, providing document processing, text transformation, embedding utilities, observability support, and tokenization capabilities for AI application development.
This guide will help you get started with Spring AI Commons in minutes.
Add the dependency to your Maven pom.xml:
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-commons</artifactId>
<version>1.1.2</version>
</dependency>

Requirements: Java 17 or higher
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.TextReader;
import org.springframework.core.io.ClassPathResource;
import java.util.List;
// Read from a text file
// TextReader resolves the classpath resource and exposes its contents as Document objects.
TextReader reader = new TextReader(new ClassPathResource("knowledge-base.txt"));
// get() performs the actual read; each Document carries text plus metadata.
List<Document> documents = reader.get();
System.out.println("Loaded " + documents.size() + " documents");

import org.springframework.ai.document.Document;
// Create a document manually
// The builder accepts the text body plus any number of key/value metadata pairs.
Document doc = Document.builder()
.text("Spring AI Commons provides foundational abstractions for AI development")
.metadata("source", "documentation")
.metadata("category", "overview")
.build();
System.out.println("Document ID: " + doc.getId());

import org.springframework.ai.transformer.splitter.TokenTextSplitter;
// Create a splitter for embedding-sized chunks
TokenTextSplitter splitter = TokenTextSplitter.builder()
.withChunkSize(800) // 800 tokens per chunk
.withMinChunkSizeChars(100)
.build();
// Split documents
// apply() takes a list of Documents and returns the chunked Documents.
List<Document> chunks = splitter.apply(documents);
System.out.println("Created " + chunks.size() + " chunks");

import org.springframework.ai.tokenizer.JTokkitTokenCountEstimator;
import com.knuddels.jtokkit.api.EncodingType;
// Create token estimator
// JTokkit-backed; the encoding should match the target model's tokenizer.
JTokkitTokenCountEstimator estimator = new JTokkitTokenCountEstimator(
EncodingType.CL100K_BASE // For GPT-3.5/GPT-4
);
// Count tokens in a document
// estimate() returns the estimated token count for the given text.
int tokenCount = estimator.estimate(doc.getText());
System.out.println("Token count: " + tokenCount);

import org.springframework.ai.embedding.TokenCountBatchingStrategy;
// Create batching strategy
// Groups documents so each batch stays within the given token budget.
TokenCountBatchingStrategy batchingStrategy = new TokenCountBatchingStrategy(
EncodingType.CL100K_BASE,
8191, // OpenAI embedding limit
0.1 // 10% reserve
);
// Batch chunks for efficient embedding
// batch() returns one inner list per request to send to the embedding API.
List<List<Document>> batches = batchingStrategy.batch(chunks);
System.out.println("Created " + batches.size() + " batches for embedding");

import org.springframework.ai.document.MetadataMode;
// Format document for embedding (excludes certain metadata)
// MetadataMode controls which metadata keys are rendered into the formatted text.
String embedContent = doc.getFormattedContent(MetadataMode.EMBED);
// Format document for LLM inference (excludes different metadata)
String inferenceContent = doc.getFormattedContent(MetadataMode.INFERENCE);
// Get just the text (no metadata)
String textOnly = doc.getFormattedContent(MetadataMode.NONE);

import org.springframework.ai.document.Document;
import org.springframework.ai.document.MetadataMode;
import org.springframework.ai.reader.TextReader;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.embedding.TokenCountBatchingStrategy;
import org.springframework.ai.tokenizer.JTokkitTokenCountEstimator;
import org.springframework.core.io.ClassPathResource;
import com.knuddels.jtokkit.api.EncodingType;
import java.util.List;
/**
 * End-to-end example: read, split, batch, and prepare documents for embedding.
 */
public class RAGPipeline {

    public void processDocuments() {
        // 1. Read documents
        TextReader reader = new TextReader(new ClassPathResource("knowledge-base.txt"));
        List<Document> documents = reader.get();

        // 2. Split into chunks sized for the embedding model
        TokenTextSplitter splitter = TokenTextSplitter.builder()
                .withChunkSize(500)
                .build();
        List<Document> chunks = splitter.apply(documents);

        // 3. Batch for embedding — keeps each request under the token limit
        TokenCountBatchingStrategy batchingStrategy = new TokenCountBatchingStrategy(
                EncodingType.CL100K_BASE,
                8191,
                0.1);
        List<List<Document>> batches = batchingStrategy.batch(chunks);

        // 4. Process each batch
        for (List<Document> batch : batches) {
            for (Document document : batch) {
                String content = document.getFormattedContent(MetadataMode.EMBED);
                // Send to embedding API
                // Store in vector database
            }
        }

        System.out.println("Processed " + chunks.size() + " chunks in " + batches.size() + " batches");
    }
}

import org.springframework.ai.reader.JsonReader;
// Build documents from JSON, extracting text from the named keys of each entry.
JsonReader jsonReader = new JsonReader(
        new ClassPathResource("data.json"),
        "title", "content" // Keys to use for document text
);
List<Document> docs = jsonReader.get();

// Attach arbitrary key/value metadata while building a document.
Document doc = Document.builder()
        .text("Content")
        .metadata("author", "John Doe")
        .metadata("timestamp", System.currentTimeMillis())
        .metadata("version", "1.0")
        .build();

import org.springframework.ai.document.id.JdkSha256HexIdGenerator;
// Content-derived IDs: the SHA-256 generator hashes the content,
// so identical content always yields the same document ID.
Document doc = Document.builder()
        .idGenerator(new JdkSha256HexIdGenerator())
        .text("Same content always gets same ID")
        .build();

// ❌ This throws IllegalArgumentException
Document doc = new Document("");

// ✅ Validate before creating
if (text != null && !text.isEmpty()) {
    Document doc = new Document(text);
}

// A missing resource surfaces as an unchecked exception at read time.
try {
    TextReader reader = new TextReader(new ClassPathResource("missing.txt"));
    List<Document> docs = reader.get();
} catch (RuntimeException e) {
    System.err.println("Failed to read file: " + e.getMessage());
}

// Check token count before sending to API
// No-arg constructor uses the estimator's default encoding.
JTokkitTokenCountEstimator estimator = new JTokkitTokenCountEstimator();
int tokens = estimator.estimate(text);
if (tokens > 8191) {
    // Too large for a single embedding request — split the text first
    TokenTextSplitter splitter = TokenTextSplitter.builder()
            .withChunkSize(8000)
            .build();
    // NOTE(review): confirm split(Document) is the intended entry point here;
    // earlier examples use apply(List<Document>) — verify against the splitter API.
    List<Document> chunks = splitter.split(new Document(text));
}

Install with Tessl CLI
npx tessl i tessl/maven-org-springframework-ai--spring-ai-commons@1.1.0