Common classes used across Spring AI, providing document processing, text transformation, embedding utilities, observability support, and tokenization capabilities for AI application development.
Evaluation provides a framework for assessing the relevance and correctness of AI-generated responses.
The evaluation layer consists of the Evaluator functional interface together with the EvaluationRequest and EvaluationResponse value objects.
This framework enables systematic evaluation of AI outputs for quality assurance, testing, and continuous improvement.
Functional interface for evaluating relevance and correctness of responses.
Functional interface for evaluating relevance and correctness of responses.
package org.springframework.ai.evaluation;
@FunctionalInterface
public interface Evaluator {

    /**
     * Evaluate a response for relevance and correctness.
     * @param evaluationRequest request containing context and response
     * @return evaluation result
     */
    EvaluationResponse evaluate(EvaluationRequest evaluationRequest);

    /**
     * Extract supporting data from the evaluation request.
     * Default implementation concatenates the text of each context document,
     * separated by newlines.
     * NOTE(review): assumes getDataList() is non-null — a request built without
     * context documents may return null here; confirm against callers.
     * @param evaluationRequest request to extract data from
     * @return supporting data string
     */
    default String doGetSupportingData(EvaluationRequest evaluationRequest) {
        return evaluationRequest.getDataList().stream()
            .map(doc -> doc.getText())
            .collect(java.util.stream.Collectors.joining("\n"));
    }

}
import org.springframework.ai.evaluation.Evaluator;
import org.springframework.ai.evaluation.EvaluationRequest;
import org.springframework.ai.evaluation.EvaluationResponse;
import org.springframework.ai.document.Document;
import java.util.List;
import java.util.Map;
// Implement a custom evaluator as a lambda (Evaluator is a functional interface)
Evaluator simpleEvaluator = request -> {
    String userText = request.getUserText();
    String response = request.getResponseContent();
    // Naive heuristic: pass when the response mentions the first word of the
    // query. Split on runs of whitespace (not a single space) and guard the
    // empty-query case so queryWords[0] is never a blank token.
    String[] queryWords = userText.toLowerCase().split("\\s+");
    boolean containsAnswer = queryWords.length > 0
            && !queryWords[0].isEmpty()
            && response.toLowerCase().contains(queryWords[0]);
    return new EvaluationResponse(
        containsAnswer,
        containsAnswer ? 1.0f : 0.0f,
        containsAnswer ? "Response appears relevant" : "Response may not address query",
        Map.of("query_keyword_present", containsAnswer)
    );
};
// Create evaluation request
EvaluationRequest request = new EvaluationRequest(
    "What is Spring AI?",
    "Spring AI is a framework for building AI-powered applications in Java."
);
// Evaluate
EvaluationResponse response = simpleEvaluator.evaluate(request);
System.out.println("Pass: " + response.isPass());
System.out.println("Score: " + response.getScore());
System.out.println("Feedback: " + response.getFeedback());

Request object containing user text, context documents, and model response.
package org.springframework.ai.evaluation;
import org.springframework.ai.document.Document;
import java.util.List;
/**
 * Request object carrying the inputs an Evaluator needs: the user text,
 * optional context documents, and the AI-generated response.
 * Immutable value object; thread-safe.
 */
public class EvaluationRequest {

    /**
     * Create request with user text and response.
     * @param userText original user query or input
     * @param responseContent AI-generated response
     */
    public EvaluationRequest(String userText, String responseContent);

    /**
     * Create request with context documents and response.
     * @param dataList context documents used for generation
     * @param responseContent AI-generated response
     */
    public EvaluationRequest(List<Document> dataList, String responseContent);

    /**
     * Create request with user text, context, and response.
     * @param userText original user query or input
     * @param dataList context documents used for generation
     * @param responseContent AI-generated response
     */
    public EvaluationRequest(String userText, List<Document> dataList, String responseContent);

    /**
     * Get user text/query.
     * @return user text
     */
    public String getUserText();

    /**
     * Get context documents.
     * @return list of documents
     */
    public List<Document> getDataList();

    /**
     * Get AI-generated response.
     * @return response content
     */
    public String getResponseContent();

}
import org.springframework.ai.evaluation.EvaluationRequest;
import org.springframework.ai.document.Document;
import java.util.List;
// Simple evaluation request (query + response, no retrieval context)
EvaluationRequest simpleRequest = new EvaluationRequest(
    "What are the benefits of Spring AI?",
    "Spring AI provides abstraction, integration flexibility, and Spring ecosystem compatibility."
);
// Request with context documents (no explicit user query)
List<Document> contextDocs = List.of(
    new Document("Spring AI simplifies AI integration in Java applications."),
    new Document("It supports multiple AI providers including OpenAI and Anthropic.")
);
EvaluationRequest contextRequest = new EvaluationRequest(
    contextDocs,
    "Spring AI is a framework that simplifies AI integration."
);
// Complete request (query + context + response)
EvaluationRequest fullRequest = new EvaluationRequest(
    "How do I configure Spring AI?",
    List.of(
        new Document("Configuration is done through application.properties"),
        new Document("Set spring.ai.openai.api-key property")
    ),
    "To configure Spring AI, add your API key to application.properties using the spring.ai.openai.api-key property."
);
// Access request data via the accessors
String query = fullRequest.getUserText();
List<Document> docs = fullRequest.getDataList();
String response = fullRequest.getResponseContent();
System.out.println("Query: " + query);
System.out.println("Context documents: " + docs.size());
System.out.println("Response: " + response);

Response object containing evaluation results.
package org.springframework.ai.evaluation;
import java.util.Map;
/**
 * Response object containing evaluation results: pass/fail, a numeric score,
 * human-readable feedback, and arbitrary metadata.
 * Immutable value object; thread-safe.
 */
public class EvaluationResponse {

    /**
     * Create response with all fields.
     * @param pass true if evaluation passes
     * @param score numeric score (0.0 to 1.0 typical)
     * @param feedback human-readable feedback
     * @param metadata additional evaluation metadata
     */
    public EvaluationResponse(boolean pass, float score, String feedback, Map<String, Object> metadata);

    /**
     * Create response without explicit score.
     * Score defaults to 1.0 if pass, 0.0 otherwise.
     * @param pass true if evaluation passes
     * @param feedback human-readable feedback
     * @param metadata additional evaluation metadata
     */
    public EvaluationResponse(boolean pass, String feedback, Map<String, Object> metadata);

    /**
     * Check if evaluation passed.
     * @return true if passed
     */
    public boolean isPass();

    /**
     * Get numeric score.
     * @return score (typically 0.0 to 1.0)
     */
    public float getScore();

    /**
     * Get feedback message.
     * @return feedback string
     */
    public String getFeedback();

    /**
     * Get evaluation metadata.
     * @return metadata map
     */
    public Map<String, Object> getMetadata();

}
import org.springframework.ai.evaluation.EvaluationResponse;
import java.util.Map;
// Create response with explicit score
EvaluationResponse scoredResponse = new EvaluationResponse(
    true,
    0.95f,
    "Response is highly relevant and accurate",
    Map.of(
        "relevance_score", 0.96,
        "accuracy_score", 0.94,
        "completeness_score", 0.95
    )
);
// Create response with pass/fail only (score defaults from the pass flag)
EvaluationResponse passFailResponse = new EvaluationResponse(
    false,
    "Response does not address the user's question",
    Map.of(
        // List is not imported by this snippet, so use the fully-qualified name
        "missing_elements", java.util.List.of("configuration", "setup steps")
    )
);
// Access response data
if (scoredResponse.isPass()) {
    System.out.println("Evaluation passed!");
    System.out.println("Score: " + scoredResponse.getScore());
    System.out.println("Feedback: " + scoredResponse.getFeedback());
    Map<String, Object> metadata = scoredResponse.getMetadata();
    System.out.println("Relevance: " + metadata.get("relevance_score"));
}
import org.springframework.ai.evaluation.Evaluator;
import org.springframework.ai.evaluation.EvaluationRequest;
import org.springframework.ai.evaluation.EvaluationResponse;
import org.springframework.ai.document.Document;
import java.util.List;
import java.util.Map;
/**
 * Evaluate RAG responses for faithfulness to source documents.
 *
 * Passes when the response is grounded in the retrieved context, contains no
 * detected hallucinations, and its keyword relevance exceeds a threshold.
 */
public class RAGResponseEvaluator implements Evaluator {

    /** Minimum keyword-overlap relevance required to pass. */
    private static final float RELEVANCE_THRESHOLD = 0.7f;

    @Override
    public EvaluationResponse evaluate(EvaluationRequest request) {
        String response = request.getResponseContent();
        // Concatenated text of all context documents (Evaluator default helper)
        String contextText = doGetSupportingData(request);
        // Check if response is grounded in context
        boolean isGrounded = checkGrounding(response, contextText);
        // Check for hallucinations (statements not in context)
        boolean hasHallucinations = detectHallucinations(response, contextText);
        // Calculate relevance to user query
        float relevanceScore = calculateRelevance(request.getUserText(), response);
        boolean pass = isGrounded && !hasHallucinations && relevanceScore > RELEVANCE_THRESHOLD;
        return new EvaluationResponse(
            pass,
            relevanceScore,
            pass ? "Response is faithful to context" : "Response may contain unsupported claims",
            Map.of(
                "grounded", isGrounded,
                "has_hallucinations", hasHallucinations,
                "relevance_score", relevanceScore
            )
        );
    }

    /**
     * A response is grounded when at least 70% of its sentences appear
     * verbatim (case-insensitive) in the context text.
     */
    private boolean checkGrounding(String response, String context) {
        String[] responseSentences = response.split("\\. ");
        // Hoisted out of the loop: the original lower-cased the whole context
        // once per sentence.
        String lowerContext = context.toLowerCase();
        int groundedSentences = 0;
        for (String sentence : responseSentences) {
            if (lowerContext.contains(sentence.toLowerCase())) {
                groundedSentences++;
            }
        }
        return groundedSentences >= responseSentences.length * 0.7;
    }

    /**
     * Check for specific claims that don't appear in context.
     * Simplified placeholder: never reports a hallucination.
     */
    private boolean detectHallucinations(String response, String context) {
        return false;
    }

    /**
     * Relevance as the fraction of query words that occur anywhere in the
     * response (simple case-insensitive keyword overlap).
     */
    private float calculateRelevance(String query, String response) {
        String[] queryWords = query.toLowerCase().split("\\s+");
        String responseLower = response.toLowerCase();
        long matchingWords = java.util.Arrays.stream(queryWords)
            .filter(responseLower::contains)
            .count();
        return (float) matchingWords / queryWords.length;
    }

}
// Usage: evaluate a response against its retrieval context
RAGResponseEvaluator evaluator = new RAGResponseEvaluator();
EvaluationRequest request = new EvaluationRequest(
    "What is Spring AI?",
    List.of(
        new Document("Spring AI is a framework for AI integration in Java."),
        new Document("It provides abstractions for multiple AI providers.")
    ),
    "Spring AI is a framework for AI integration in Java, providing abstractions for multiple AI providers."
);
EvaluationResponse result = evaluator.evaluate(request);
System.out.println("Pass: " + result.isPass());
System.out.println("Feedback: " + result.getFeedback());
import org.springframework.ai.evaluation.Evaluator;
import org.springframework.ai.evaluation.EvaluationRequest;
import org.springframework.ai.evaluation.EvaluationResponse;
import java.util.HashMap;
import java.util.Map;
/**
 * Evaluate responses against multiple weighted criteria (relevance, accuracy,
 * completeness, clarity), passing when the weighted score meets a threshold.
 */
public class MultiCriteriaEvaluator implements Evaluator {

    /** Per-criterion weights; missing criteria fall back to built-in defaults. */
    private final Map<String, Float> criteriaWeights;

    /** Minimum weighted score required for the evaluation to pass. */
    private final float passingThreshold;

    /**
     * @param criteriaWeights weights keyed by "relevance", "accuracy",
     *     "completeness", "clarity" (must be non-null, no null entries)
     * @param passingThreshold weighted score at or above which evaluation passes
     */
    public MultiCriteriaEvaluator(Map<String, Float> criteriaWeights, float passingThreshold) {
        // Defensive immutable copy so later mutation of the caller's map
        // cannot silently change the weights.
        this.criteriaWeights = Map.copyOf(criteriaWeights);
        this.passingThreshold = passingThreshold;
    }

    @Override
    public EvaluationResponse evaluate(EvaluationRequest request) {
        Map<String, Object> scores = new HashMap<>();
        // Evaluate each criterion independently
        float relevanceScore = evaluateRelevance(request);
        float accuracyScore = evaluateAccuracy(request);
        float completenessScore = evaluateCompleteness(request);
        float clarityScore = evaluateClarity(request);
        scores.put("relevance", relevanceScore);
        scores.put("accuracy", accuracyScore);
        scores.put("completeness", completenessScore);
        scores.put("clarity", clarityScore);
        // Weighted sum; unspecified weights default to 0.3/0.3/0.2/0.2
        float weightedScore =
            relevanceScore * criteriaWeights.getOrDefault("relevance", 0.3f) +
            accuracyScore * criteriaWeights.getOrDefault("accuracy", 0.3f) +
            completenessScore * criteriaWeights.getOrDefault("completeness", 0.2f) +
            clarityScore * criteriaWeights.getOrDefault("clarity", 0.2f);
        boolean pass = weightedScore >= passingThreshold;
        String feedback = generateFeedback(scores, weightedScore, pass);
        return new EvaluationResponse(pass, weightedScore, feedback, scores);
    }

    // Placeholder scorers — replace with real per-criterion scoring logic.

    private float evaluateRelevance(EvaluationRequest request) {
        return 0.85f; // Placeholder
    }

    private float evaluateAccuracy(EvaluationRequest request) {
        return 0.90f; // Placeholder
    }

    private float evaluateCompleteness(EvaluationRequest request) {
        return 0.80f; // Placeholder
    }

    private float evaluateClarity(EvaluationRequest request) {
        return 0.88f; // Placeholder
    }

    /** Build a human-readable summary, flagging criteria scoring below 0.7. */
    private String generateFeedback(Map<String, Object> scores,
                                    float weightedScore,
                                    boolean pass) {
        StringBuilder feedback = new StringBuilder();
        feedback.append(String.format("Overall score: %.2f ", weightedScore));
        feedback.append(pass ? "(PASS) - " : "(FAIL) - ");
        // Identify areas for improvement
        scores.entrySet().stream()
            .filter(e -> (float) e.getValue() < 0.7f)
            .forEach(e -> feedback.append(e.getKey()).append(" needs improvement; "));
        return feedback.toString();
    }

}
// Usage: weight relevance and accuracy more heavily than the rest
MultiCriteriaEvaluator evaluator = new MultiCriteriaEvaluator(
    Map.of(
        "relevance", 0.35f,
        "accuracy", 0.35f,
        "completeness", 0.15f,
        "clarity", 0.15f
    ),
    0.75f // 75% passing threshold
);
EvaluationRequest request = new EvaluationRequest(
    "How do I use Spring AI?",
    "Spring AI can be configured through application.properties and used via dependency injection."
);
EvaluationResponse result = evaluator.evaluate(request);
System.out.println(result.getFeedback());
System.out.println("Score breakdown: " + result.getMetadata());
import org.springframework.ai.evaluation.Evaluator;
import org.springframework.ai.evaluation.EvaluationRequest;
import org.springframework.ai.evaluation.EvaluationResponse;
import java.util.ArrayList;
import java.util.List;
/**
* Automated testing framework for AI responses.
*/
class AIResponseTester {
private final Evaluator evaluator;
private final List<TestCase> testCases;
public AIResponseTester(Evaluator evaluator) {
this.evaluator = evaluator;
this.testCases = new ArrayList<>();
}
public void addTestCase(String query, String expectedResponse, List<Document> context) {
testCases.add(new TestCase(query, expectedResponse, context));
}
public TestResults runTests() {
List<TestResult> results = new ArrayList<>();
for (int i = 0; i < testCases.size(); i++) {
TestCase testCase = testCases.get(i);
// Get actual response from AI system
String actualResponse = generateResponse(testCase.query(), testCase.context());
// Evaluate
EvaluationRequest request = new EvaluationRequest(
testCase.query(),
testCase.context(),
actualResponse
);
EvaluationResponse evaluation = evaluator.evaluate(request);
results.add(new TestResult(
i + 1,
testCase.query(),
actualResponse,
evaluation
));
}
return new TestResults(results);
}
private String generateResponse(String query, List<Document> context) {
// Call your AI system here
return "AI generated response"; // Placeholder
}
record TestCase(String query, String expectedResponse, List<Document> context) {}
record TestResult(int testNumber, String query, String actualResponse,
EvaluationResponse evaluation) {
public boolean passed() {
return evaluation.isPass();
}
}
record TestResults(List<TestResult> results) {
public int totalTests() {
return results.size();
}
public int passedTests() {
return (int) results.stream().filter(TestResult::passed).count();
}
public int failedTests() {
return totalTests() - passedTests();
}
public double passRate() {
return (double) passedTests() / totalTests() * 100;
}
public String summary() {
return String.format("""
Test Results:
- Total: %d
- Passed: %d
- Failed: %d
- Pass Rate: %.1f%%
""",
totalTests(), passedTests(), failedTests(), passRate()
);
}
public void printDetailedResults() {
System.out.println(summary());
System.out.println("\nDetailed Results:");
for (TestResult result : results) {
System.out.printf("Test #%d: %s%n", result.testNumber(),
result.passed() ? "PASS" : "FAIL");
System.out.println(" Query: " + result.query());
System.out.println(" Score: " + result.evaluation().getScore());
System.out.println(" Feedback: " + result.evaluation().getFeedback());
System.out.println();
}
}
}
}
// Usage
// NOTE(review): Document requires "import org.springframework.ai.document.Document;"
// which this snippet's import block omits.
Evaluator evaluator = new RAGResponseEvaluator();
AIResponseTester tester = new AIResponseTester(evaluator);
// Add test cases (query, expected response, retrieval context)
tester.addTestCase(
    "What is Spring AI?",
    "Spring AI is a framework...",
    List.of(new Document("Spring AI documentation..."))
);
tester.addTestCase(
    "How to configure OpenAI?",
    "Set the API key...",
    List.of(new Document("Configuration guide..."))
);
// Run the suite and print per-test results
AIResponseTester.TestResults results = tester.runTests();
results.printDetailedResults();
import org.springframework.ai.evaluation.Evaluator;
import org.springframework.ai.evaluation.EvaluationRequest;
import org.springframework.ai.evaluation.EvaluationResponse;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
/**
 * Continuous evaluation pipeline for monitoring AI quality over time.
 * Records every evaluation, alerts when a score drops below a threshold,
 * and computes rolling quality metrics.
 */
public class ContinuousEvaluationPipeline {

    private final Evaluator evaluator;
    // NOTE(review): unbounded in-memory history — consider capping or persisting
    // for long-running processes.
    private final List<EvaluationRecord> history;
    private final double alertThreshold;

    /**
     * @param evaluator evaluator applied to every recorded request
     * @param alertThreshold score below which an alert is emitted
     */
    public ContinuousEvaluationPipeline(Evaluator evaluator, double alertThreshold) {
        this.evaluator = evaluator;
        this.history = new ArrayList<>();
        this.alertThreshold = alertThreshold;
    }

    /** Evaluate a request, record the result, and alert on low quality. */
    public void evaluateAndRecord(EvaluationRequest request) {
        EvaluationResponse response = evaluator.evaluate(request);
        EvaluationRecord record = new EvaluationRecord(
            LocalDateTime.now(),
            request,
            response
        );
        history.add(record);
        // Check for quality degradation
        if (response.getScore() < alertThreshold) {
            alertLowQuality(record);
        }
    }

    /**
     * Compute metrics over the most recent {@code lastN} evaluations.
     * Returns all-zero metrics when the history is empty instead of
     * dividing by zero.
     */
    public QualityMetrics calculateMetrics(int lastN) {
        List<EvaluationRecord> recentRecords = history.subList(
            Math.max(0, history.size() - lastN),
            history.size()
        );
        if (recentRecords.isEmpty()) {
            // Original code divided by zero here, producing a NaN pass rate
            return new QualityMetrics(0, 0.0, 0.0, 0.0, 0.0);
        }
        double avgScore = recentRecords.stream()
            .mapToDouble(r -> r.response().getScore())
            .average()
            .orElse(0.0);
        long passCount = recentRecords.stream()
            .filter(r -> r.response().isPass())
            .count();
        double passRate = (double) passCount / recentRecords.size();
        double minScore = recentRecords.stream()
            .mapToDouble(r -> r.response().getScore())
            .min()
            .orElse(0.0);
        double maxScore = recentRecords.stream()
            .mapToDouble(r -> r.response().getScore())
            .max()
            .orElse(0.0);
        return new QualityMetrics(recentRecords.size(), avgScore, passRate, minScore, maxScore);
    }

    /** Emit a low-quality alert to stderr. */
    private void alertLowQuality(EvaluationRecord record) {
        System.err.printf("""
            ALERT: Low quality response detected
            Time: %s
            Score: %.2f
            Feedback: %s
            """,
            record.timestamp(),
            record.response().getScore(),
            record.response().getFeedback()
        );
    }

    /** One evaluated request with its result and timestamp. */
    record EvaluationRecord(
        LocalDateTime timestamp,
        EvaluationRequest request,
        EvaluationResponse response
    ) {}

    /** Aggregated quality statistics over a window of evaluations. */
    record QualityMetrics(
        int evaluationCount,
        double averageScore,
        double passRate,
        double minScore,
        double maxScore
    ) {
        public String formatReport() {
            return String.format("""
                Quality Metrics (last %d evaluations):
                - Average Score: %.2f
                - Pass Rate: %.1f%%
                - Score Range: %.2f - %.2f
                """,
                evaluationCount,
                averageScore,
                passRate * 100,
                minScore,
                maxScore
            );
        }
    }

}
// Usage
Evaluator evaluator = new MultiCriteriaEvaluator(
    // Map is not imported by this snippet, so use the fully-qualified name
    java.util.Map.of("relevance", 0.5f, "accuracy", 0.5f),
    0.7f
);
ContinuousEvaluationPipeline pipeline = new ContinuousEvaluationPipeline(
    evaluator,
    0.6 // Alert if score drops below 0.6
);
// Continuously evaluate responses as they are produced
EvaluationRequest request1 = new EvaluationRequest(
    "Query 1",
    "Response 1"
);
pipeline.evaluateAndRecord(request1);
EvaluationRequest request2 = new EvaluationRequest(
    "Query 2",
    "Response 2"
);
pipeline.evaluateAndRecord(request2);
// Get rolling quality metrics
ContinuousEvaluationPipeline.QualityMetrics metrics =
    pipeline.calculateMetrics(100); // Last 100 evaluations
System.out.println(metrics.formatReport());
import org.springframework.ai.evaluation.Evaluator;
import org.springframework.ai.evaluation.EvaluationRequest;
import org.springframework.ai.evaluation.EvaluationResponse;
import org.springframework.ai.document.Document;
import java.util.Map;
/**
 * Use an LLM to evaluate responses (LLM-as-Judge pattern).
 * The judge LLM is prompted to rate the response; its reply is parsed for a
 * numeric score and textual feedback.
 */
public class LLMAsJudgeEvaluator implements Evaluator {

    /** Matches "SCORE: <number>" anywhere in the judge reply. */
    private static final java.util.regex.Pattern SCORE_PATTERN =
        java.util.regex.Pattern.compile("SCORE:\\s*([0-9]*\\.?[0-9]+)");

    /** Matches "FEEDBACK: <text>" (rest of reply, including newlines). */
    private static final java.util.regex.Pattern FEEDBACK_PATTERN =
        java.util.regex.Pattern.compile("FEEDBACK:\\s*(.*)", java.util.regex.Pattern.DOTALL);

    private final String judgePromptTemplate = """
        You are evaluating an AI response for quality and relevance.
        User Query: %s
        Context Documents:
        %s
        AI Response:
        %s
        Evaluate the response on:
        1. Relevance to the query
        2. Accuracy based on context
        3. Completeness
        Provide a score from 0.0 to 1.0 and brief feedback.
        Format: SCORE: <score>\\nFEEDBACK: <feedback>
        """;

    @Override
    public EvaluationResponse evaluate(EvaluationRequest request) {
        String context = doGetSupportingData(request);
        String judgePrompt = String.format(
            judgePromptTemplate,
            request.getUserText(),
            context,
            request.getResponseContent()
        );
        // Call LLM to judge (placeholder)
        String judgeResponse = callJudgeLLM(judgePrompt);
        // Parse score and feedback out of the judge's reply
        float score = parseScore(judgeResponse);
        String feedback = parseFeedback(judgeResponse);
        return new EvaluationResponse(
            score >= 0.7f,
            score,
            feedback,
            Map.of("judge_response", judgeResponse)
        );
    }

    /** Call the judge LLM. Placeholder implementation. */
    private String callJudgeLLM(String prompt) {
        return "SCORE: 0.85\\nFEEDBACK: Response is relevant and accurate";
    }

    /**
     * Extract the numeric score from the judge reply.
     * Original code returned a hard-coded 0.85 regardless of the reply.
     * @return parsed score, or 0.0 when the reply has no SCORE field
     */
    private float parseScore(String response) {
        var matcher = SCORE_PATTERN.matcher(response);
        return matcher.find() ? Float.parseFloat(matcher.group(1)) : 0.0f;
    }

    /**
     * Extract the feedback text from the judge reply.
     * Original code returned a hard-coded string regardless of the reply.
     * @return parsed feedback, or a fixed message when no FEEDBACK field exists
     */
    private String parseFeedback(String response) {
        var matcher = FEEDBACK_PATTERN.matcher(response);
        return matcher.find() ? matcher.group(1).strip() : "No feedback provided";
    }

}
Thread Safety:
Evaluator implementations: thread-safety depends on the implementation (stateless implementations are thread-safe).
EvaluationRequest and EvaluationResponse: immutable value objects, thread-safe.

Performance:
Common Exceptions:
NullPointerException: if required fields (userText, responseContent) are null.
IllegalArgumentException: if evaluation criteria are invalid.
RuntimeException: LLM API errors or network failures (for LLM-based evaluators).

Edge Cases:
// Empty response — evaluators should handle empty responses gracefully
EvaluationRequest emptyResponseRequest = new EvaluationRequest("query", "");

// Null data list — dataList is null; the evaluator should handle it or throw
EvaluationRequest nullDataRequest = new EvaluationRequest("query", null, "response");

// Empty context documents — the evaluator should handle the lack of context
EvaluationRequest emptyContextRequest = new EvaluationRequest(
    "query",
    List.of(), // Empty list
    "response"
);

// Score edge values ("metadata" stands for any Map<String, Object>)
new EvaluationResponse(true, 0.0f, "feedback", metadata); // Valid: minimum score
new EvaluationResponse(true, 1.0f, "feedback", metadata); // Valid: maximum score
new EvaluationResponse(true, -0.5f, "feedback", metadata); // Valid but unusual: negative score

Install with Tessl CLI
npx tessl i tessl/maven-org-springframework-ai--spring-ai-commons@1.1.0