Quarkus extension for integrating Anthropic Claude LLM models into Quarkus applications via LangChain4j
The quarkus-langchain4j-anthropic extension provides advanced capabilities including extended thinking mode, prompt caching, streaming responses, and tool/function calling.
Claude's extended thinking mode enables the model to perform internal reasoning before generating a response, improving performance on complex tasks.
Configuration interface for thinking mode:
package io.quarkiverse.langchain4j.anthropic.runtime.config;

import java.util.Optional;

/**
 * Configuration group for Claude's extended thinking mode.
 * Nested inside {@code ChatModelConfig} (referred to in docs as
 * {@code ChatModelConfig.ThinkingConfig}; a dotted name is not valid
 * as a Java declaration).
 */
@ConfigGroup
interface ThinkingConfig {

    /**
     * The thinking type to enable Claude's reasoning process.
     * Set to "enabled" to activate extended thinking.
     */
    Optional<String> type();

    /**
     * The token budget for the model's thinking process.
     * This allocates tokens specifically for internal reasoning
     * before generating the final response.
     * Typical values: 5000-15000 tokens
     */
    Optional<Integer> budgetTokens();

    /**
     * Whether thinking results should be returned in the response.
     * When true, the thinking content is included in the AiMessage.
     * Default: false
     */
    Optional<Boolean> returnThinking();

    /**
     * Whether previously stored thinking should be sent in follow-up requests.
     * Maintains reasoning context across multiple interactions.
     * Default: true
     */
    Optional<Boolean> sendThinking();

    /**
     * Enable interleaved thinking for Claude 4 models.
     * Allows reasoning between tool calls.
     * Requires Claude 4 model (e.g., claude-opus-4-20250514)
     * and thinking.type: enabled.
     * Default: false
     */
    Optional<Boolean> interleaved();
}
# Enable thinking
quarkus.langchain4j.anthropic.chat-model.thinking.type=enabled
# Allocate 10000 tokens for thinking
quarkus.langchain4j.anthropic.chat-model.thinking.budget-tokens=10000
# Return thinking content in response
quarkus.langchain4j.anthropic.chat-model.thinking.return-thinking=true
# Send thinking in follow-up messages
quarkus.langchain4j.anthropic.chat-model.thinking.send-thinking=true
# Enable interleaved thinking for Claude 4
quarkus.langchain4j.anthropic.chat-model.thinking.interleaved=true
import jakarta.inject.Inject;
import dev.langchain4j.model.chat.ChatModel;
import dev.langchain4j.model.chat.response.ChatResponse;
import dev.langchain4j.data.message.UserMessage;
import jakarta.enterprise.context.ApplicationScoped;

/**
 * Demonstrates extended thinking: with thinking enabled, Claude reasons
 * internally before producing the final answer.
 */
@ApplicationScoped
public class ComplexReasoningService {

    @Inject
    ChatModel chatModel;

    /**
     * Sends a problem statement to the model and returns its answer.
     * Logs the model's thinking content when it is present.
     *
     * @param problem the problem statement sent as the user message
     * @return the model's final answer text
     */
    public String solveComplexProblem(String problem) {
        // With thinking enabled, Claude will reason internally
        // before generating the final answer
        ChatResponse response = chatModel.chat(UserMessage.from(problem));

        // thinking() is only populated when return-thinking=true — guard before logging
        String thinking = response.aiMessage().thinking();
        if (thinking != null) {
            System.out.println("Thinking: " + thinking);
        }
        return response.aiMessage().text();
    }
}
Adjust budgetTokens based on problem complexity.
Prompt caching reduces costs and latency by caching portions of prompts that are reused across requests.
# Cache system messages
quarkus.langchain4j.anthropic.chat-model.cache-system-messages=true
# Cache tool definitions
quarkus.langchain4j.anthropic.chat-model.cache-tools=true
package io.quarkiverse.langchain4j.anthropic.runtime.config;
/**
 * Prompt-caching configuration for the Anthropic chat model.
 */
@ConfigGroup
interface ChatModelConfig {

    /**
     * Cache system messages to reduce costs for repeated prompts.
     * Requires minimum 1024 tokens (Claude Opus/Sonnet) or
     * 2048-4096 tokens (Haiku).
     * Supported models: Claude Opus 4.1, Sonnet 4.5, Haiku 4.5, and later.
     * Default: false
     */
    Boolean cacheSystemMessages();

    /**
     * Cache tool definitions to reduce costs.
     * Requires minimum 1024 tokens (Claude Opus/Sonnet) or
     * 2048-4096 tokens (Haiku).
     * Supported models: Claude Opus 4.1, Sonnet 4.5, Haiku 4.5, and later.
     * Default: false
     */
    Boolean cacheTools();
}
Streaming enables real-time response generation, providing partial updates as tokens are generated.
Interface for handling streaming responses:
package dev.langchain4j.model.chat.response;

/**
 * Handler for streaming chat responses.
 */
interface StreamingChatResponseHandler {

    /**
     * Called for each partial text response.
     *
     * @param response Partial text response
     * @param context Context including stream handle
     */
    void onPartialResponse(
            PartialResponse response,
            PartialResponseContext context
    );

    /**
     * Called for each partial thinking token (if thinking enabled and returned).
     *
     * @param thinking Partial thinking content
     * @param context Context including stream handle
     */
    void onPartialThinking(
            PartialThinking thinking,
            PartialThinkingContext context
    );

    /**
     * Called for each partial tool call during streaming.
     *
     * @param toolCall Partial tool call information
     * @param context Context including stream handle
     */
    void onPartialToolCall(
            PartialToolCall toolCall,
            PartialToolCallContext context
    );

    /**
     * Called when a tool call is complete.
     *
     * @param toolCall Complete tool call with all arguments
     */
    void onCompleteToolCall(CompleteToolCall toolCall);

    /**
     * Called when streaming completes successfully.
     *
     * @param response Complete chat response
     */
    void onCompleteResponse(ChatResponse response);

    /**
     * Called when an error occurs during streaming.
     *
     * @param error The error that occurred
     */
    void onError(Throwable error);
}
/**
* Partial text response
*/
class PartialResponse {
public String text();
}
/**
* Context for partial responses
*/
class PartialResponseContext {
public StreamingHandle streamingHandle();
}
/**
* Partial thinking content
*/
class PartialThinking {
public String thinking();
}
/**
* Context for partial thinking
*/
class PartialThinkingContext {
public StreamingHandle streamingHandle();
}
/**
* Context for partial tool calls
*/
class PartialToolCallContext {
public StreamingHandle streamingHandle();
}
/**
* Partial tool call
*/
class PartialToolCall {
public int index();
public String id();
public String name();
public String partialArguments();
}
/**
* Complete tool call
*/
class CompleteToolCall {
public int index();
public dev.langchain4j.agent.tool.ToolExecutionRequest toolExecutionRequest();
}
/**
* Handle to control streaming
*/
interface StreamingHandle {
void cancel();
boolean isCancelled();
}import jakarta.inject.Inject;
import dev.langchain4j.model.chat.StreamingChatModel;
import dev.langchain4j.model.chat.response.*;
import jakarta.enterprise.context.ApplicationScoped;

/**
 * Streams model output to stdout token by token.
 */
@ApplicationScoped
public class StreamingService {

    @Inject
    StreamingChatModel streamingChatModel;

    /**
     * Streams the response for the given prompt, printing each partial token
     * as it arrives and a token-usage summary on completion.
     *
     * @param prompt the user prompt to send
     */
    public void streamResponse(String prompt) {
        streamingChatModel.chat(prompt, new StreamingChatResponseHandler() {
            @Override
            public void onPartialResponse(PartialResponse response,
                    PartialResponseContext context) {
                // Print each token as it arrives
                System.out.print(response.text());
            }

            @Override
            public void onCompleteResponse(ChatResponse response) {
                System.out.println("\n--- Complete ---");
                System.out.println("Total tokens: " +
                        response.tokenUsage().totalTokenCount());
            }

            @Override
            public void onError(Throwable error) {
                System.err.println("Error: " + error.getMessage());
            }
        });
    }
}
streamingChatModel.chat(complexProblem, new StreamingChatResponseHandler() {
@Override
public void onPartialThinking(PartialThinking thinking,
        PartialThinkingContext context) {
    // Invoked as thinking tokens arrive (requires thinking.return-thinking=true);
    // the thinking parameter carries the partial thinking content
    System.out.print("[Thinking received] ");
}

@Override
public void onPartialResponse(PartialResponse response,
        PartialResponseContext context) {
    // Stream response tokens
    System.out.print(response.text());
}

@Override
public void onCompleteResponse(ChatResponse response) {
    // Access the complete thinking and response once streaming finishes
    String fullThinking = response.aiMessage().thinking();
    String fullResponse = response.aiMessage().text();
    System.out.println("\n--- Analysis Complete ---");
}

@Override
public void onError(Throwable error) {
    error.printStackTrace();
}
});
streamingChatModel.chat(prompt, new StreamingChatResponseHandler() {
// Handler state for early cancellation
private int tokenCount = 0;
private static final int MAX_TOKENS = 100;
private boolean cancelled = false;

@Override
public void onPartialResponse(PartialResponse response,
        PartialResponseContext context) {
    System.out.print(response.text());
    // NOTE(review): counts partial-response chunks, which may not map 1:1 to tokens
    tokenCount++;
    // Cancel the stream once the budget is exhausted
    if (tokenCount >= MAX_TOKENS) {
        context.streamingHandle().cancel();
        cancelled = true;
        System.out.println("\n[Stream cancelled]");
    }
}

@Override
public void onCompleteResponse(ChatResponse response) {
    // Only report natural completion; cancellation is reported above
    if (!cancelled) {
        System.out.println("\n[Stream completed]");
    }
}

@Override
public void onError(Throwable error) {
    error.printStackTrace();
}
});
Always implement onError for robust applications.
The extension supports declarative tool definitions for function calling.
import dev.langchain4j.service.SystemMessage;
import dev.langchain4j.service.UserMessage;
import dev.langchain4j.agent.tool.Tool;
import io.quarkiverse.langchain4j.RegisterAiService;
import jakarta.enterprise.context.ApplicationScoped;
// Define tools
@ApplicationScoped
public class WeatherTools {
/**
 * Current-conditions tool; invoked by the model when it decides the
 * tool is needed to answer a weather question.
 */
@Tool("Get current weather for a location")
public String getWeather(String location) {
// Implementation stub — returns a canned response for demonstration
return "Sunny, 72°F in " + location;
}
/**
 * Forecast tool; the model supplies both the location and the number
 * of days from the user's request.
 */
@Tool("Get weather forecast for next N days")
public String getForecast(String location, int days) {
// Implementation stub — returns a canned response for demonstration
return days + "-day forecast for " + location;
}
}
// AI Service with tools
/**
 * AI service interface; the registered WeatherTools methods are made
 * available to the model as callable tools.
 */
@RegisterAiService(tools = WeatherTools.class)
public interface WeatherAssistant {
@SystemMessage("You are a weather assistant. Use the provided tools to answer questions.")
String chat(String message);
}
// Usage
/**
 * Delegates weather queries to the AI service; Claude invokes the
 * registered tools automatically as needed.
 */
@ApplicationScoped
public class WeatherService {

    @Inject
    WeatherAssistant assistant;

    /**
     * Answers a natural-language weather question.
     *
     * @param query the user's question
     * @return the assistant's reply
     */
    public String getWeatherInfo(String query) {
        return assistant.chat(query);
    }
}
streamingChatModel.chat(request, new StreamingChatResponseHandler() {
@Override
public void onPartialToolCall(PartialToolCall toolCall,
        PartialToolCallContext context) {
    // Tool name is known early; arguments stream in incrementally
    System.out.println("Calling tool: " + toolCall.name());
    System.out.println("Arguments (partial): " + toolCall.partialArguments());
}

@Override
public void onCompleteToolCall(CompleteToolCall toolCall) {
    // Full argument payload is available once the tool call completes
    var request = toolCall.toolExecutionRequest();
    System.out.println("Tool: " + request.name());
    System.out.println("Arguments: " + request.arguments());
    System.out.println("ID: " + request.id());
}

@Override
public void onPartialResponse(PartialResponse response,
        PartialResponseContext context) {
    System.out.print(response.text());
}

@Override
public void onCompleteResponse(ChatResponse response) {
    // NOTE(review): requires java.util.List and
    // dev.langchain4j.agent.tool.ToolExecutionRequest imports in a real file
    List<ToolExecutionRequest> toolRequests =
            response.aiMessage().toolExecutionRequests();
    if (!toolRequests.isEmpty()) {
        System.out.println("Tools to execute:");
        for (var tool : toolRequests) {
            System.out.println("- " + tool.name());
        }
    }
}

@Override
public void onError(Throwable error) {
    error.printStackTrace();
}
});
Enable tool caching for applications with many tool definitions:
# Cache tool definitions across requests
quarkus.langchain4j.anthropic.chat-model.cache-tools=true
This reduces costs when using the same tools across multiple requests.
Access additional response metadata through attributes:
import dev.langchain4j.model.chat.response.ChatResponse;
import dev.langchain4j.data.message.AiMessage;
import java.util.List;
import java.util.Map;

ChatResponse response = chatModel.chat(request);
AiMessage aiMessage = response.aiMessage();

// Get thinking signature (if thinking enabled)
Map<String, Object> attributes = aiMessage.attributes();
if (attributes != null) {
    String thinkingSignature = (String) attributes.get("thinking_signature");
    // NOTE(review): unchecked cast — attribute values are Object; verify the
    // runtime type before relying on List<String>
    List<String> redactedThinkings = (List<String>) attributes.get("redacted_thinking");
    if (thinkingSignature != null) {
        System.out.println("Thinking signature: " + thinkingSignature);
    }
}
Monitor token usage for cost management:
import dev.langchain4j.model.chat.response.ChatResponse;
import dev.langchain4j.model.output.TokenUsage;

ChatResponse response = chatModel.chat(request);
TokenUsage usage = response.tokenUsage();
System.out.println("Input tokens: " + usage.inputTokenCount());
System.out.println("Output tokens: " + usage.outputTokenCount());
System.out.println("Total tokens: " + usage.totalTokenCount());

// Calculate approximate cost (example rates — check current Anthropic pricing)
double inputCost = usage.inputTokenCount() * 0.00003; // $0.03 per 1K tokens
double outputCost = usage.outputTokenCount() * 0.00015; // $0.15 per 1K tokens
double totalCost = inputCost + outputCost;
System.out.printf("Estimated cost: $%.4f%n", totalCost);
Define custom stop sequences to control response termination:
# Stop generation when these sequences are encountered
quarkus.langchain4j.anthropic.chat-model.stop-sequences=STOP,END,###Configuration API:
package io.quarkiverse.langchain4j.anthropic.runtime.config;

import java.util.List;
import java.util.Optional;

@ConfigGroup
interface ChatModelConfig {

    /**
     * Custom text sequences that will cause the model to stop generating.
     * When the model generates any of these sequences, it will stop
     * and return the response up to that point.
     */
    Optional<List<String>> stopSequences();
}
Usage:
// When configured with stop-sequences=STOP,END
// The model will stop generating when it outputs "STOP" or "END"
String response = chatModel.chat("List items, then output STOP");
// Response will end at "STOP"Maintain context across multiple turns:
import dev.langchain4j.data.message.*;
import dev.langchain4j.model.chat.response.ChatResponse;
import java.util.ArrayList;
import java.util.List;

/**
 * Multi-turn conversation: the full message history (including prior AI
 * messages) is resent on every turn so the model keeps context.
 */
@ApplicationScoped
public class ConversationService {

    @Inject
    ChatModel chatModel;

    public String conversation() {
        List<ChatMessage> messages = new ArrayList<>();

        // Turn 1
        messages.add(UserMessage.from("What is the capital of France?"));
        ChatResponse response1 = chatModel.chat(messages.toArray(new ChatMessage[0]));
        messages.add(response1.aiMessage());
        System.out.println("AI: " + response1.aiMessage().text());

        // Turn 2 (with context)
        messages.add(UserMessage.from("What is its population?"));
        ChatResponse response2 = chatModel.chat(messages.toArray(new ChatMessage[0]));
        messages.add(response2.aiMessage());
        System.out.println("AI: " + response2.aiMessage().text());

        // Turn 3 (with full context including thinking if enabled)
        messages.add(UserMessage.from("Compare it to London."));
        ChatResponse response3 = chatModel.chat(messages.toArray(new ChatMessage[0]));
        return response3.aiMessage().text();
    }
}
Claude 4 models support interleaved thinking, allowing reasoning between tool calls:
# Requires Claude 4 model
quarkus.langchain4j.anthropic.chat-model.model-name=claude-opus-4-20250514
# Enable thinking
quarkus.langchain4j.anthropic.chat-model.thinking.type=enabled
quarkus.langchain4j.anthropic.chat-model.thinking.budget-tokens=10000
# Enable interleaved thinking
quarkus.langchain4j.anthropic.chat-model.thinking.interleaved=trueThis allows Claude to:
import jakarta.inject.Inject;
import jakarta.enterprise.context.ApplicationScoped;
import dev.langchain4j.model.chat.ChatModel;
import dev.langchain4j.model.chat.StreamingChatModel;
import dev.langchain4j.model.chat.response.*;
import dev.langchain4j.data.message.*;
import io.quarkiverse.langchain4j.ModelName;
import java.util.ArrayList;
import java.util.List;

/**
 * Combines extended thinking, prompt caching, streaming and cancellation,
 * all bound to the named "advanced" Anthropic model configuration.
 */
@ApplicationScoped
public class AdvancedService {

    @Inject
    @ModelName("advanced")
    ChatModel chatModel;

    @Inject
    @ModelName("advanced")
    StreamingChatModel streamingChatModel;

    /**
     * Complex reasoning with thinking, caching, and token tracking.
     *
     * @param problem problem statement sent as the user message
     * @return the model's final answer text
     */
    public String solveComplexProblem(String problem) {
        List<ChatMessage> messages = new ArrayList<>();

        // System message (will be cached if cache-system-messages=true)
        String systemPrompt = """
You are an expert problem solver with advanced reasoning capabilities.
Break down complex problems step by step.
Use your thinking process to explore different approaches.
""";
        messages.add(SystemMessage.from(systemPrompt));

        // User message
        messages.add(UserMessage.from(problem));

        // Get response with thinking
        ChatResponse response = chatModel.chat(messages.toArray(new ChatMessage[0]));

        // Access thinking and response (thinking shown for illustration;
        // it is null unless return-thinking=true)
        String thinking = response.aiMessage().thinking();
        String answer = response.aiMessage().text();

        // Track token usage
        var usage = response.tokenUsage();
        System.out.println("Tokens: " + usage.totalTokenCount());
        return answer;
    }

    /**
     * Streaming with thinking output and mid-stream cancellation.
     *
     * @param prompt the user prompt to stream a response for
     */
    public void streamWithControl(String prompt) {
        streamingChatModel.chat(prompt, new StreamingChatResponseHandler() {
            private StringBuilder thinkingBuffer = new StringBuilder();
            private StringBuilder responseBuffer = new StringBuilder();

            @Override
            public void onPartialThinking(PartialThinking partial,
                    PartialThinkingContext context) {
                thinkingBuffer.append(partial.thinking());
                System.out.print("[Thinking] " + partial.thinking());
                // Cancel if thinking is too verbose
                if (thinkingBuffer.length() > 5000) {
                    context.streamingHandle().cancel();
                }
            }

            @Override
            public void onPartialResponse(PartialResponse partial,
                    PartialResponseContext context) {
                responseBuffer.append(partial.text());
                System.out.print(partial.text());
            }

            @Override
            public void onPartialToolCall(PartialToolCall toolCall,
                    PartialToolCallContext context) {
                System.out.println("\n[Tool call] " + toolCall.name());
            }

            @Override
            public void onCompleteToolCall(CompleteToolCall toolCall) {
                System.out.println("[Tool ready] " + toolCall.toolExecutionRequest().name());
            }

            @Override
            public void onCompleteResponse(ChatResponse response) {
                System.out.println("\n--- Complete ---");
                System.out.println("Thinking length: " + thinkingBuffer.length());
                System.out.println("Response length: " + responseBuffer.length());
                System.out.println("Tokens used: " +
                        response.tokenUsage().totalTokenCount());
            }

            @Override
            public void onError(Throwable error) {
                System.err.println("Error: " + error.getMessage());
            }
        });
    }
}
# Advanced model configuration
quarkus.langchain4j.anthropic.advanced.api-key=sk-ant-...
quarkus.langchain4j.anthropic.advanced.chat-model.model-name=claude-opus-4-20250514
quarkus.langchain4j.anthropic.advanced.chat-model.max-tokens=4096
quarkus.langchain4j.anthropic.advanced.chat-model.temperature=0.7
# Extended thinking with interleaving
quarkus.langchain4j.anthropic.advanced.chat-model.thinking.type=enabled
quarkus.langchain4j.anthropic.advanced.chat-model.thinking.budget-tokens=15000
quarkus.langchain4j.anthropic.advanced.chat-model.thinking.return-thinking=true
quarkus.langchain4j.anthropic.advanced.chat-model.thinking.interleaved=true
# Prompt caching for efficiency
quarkus.langchain4j.anthropic.advanced.chat-model.cache-system-messages=true
quarkus.langchain4j.anthropic.advanced.chat-model.cache-tools=true
# Debugging
quarkus.langchain4j.anthropic.advanced.log-requests=true
Install with Tessl CLI
npx tessl i tessl/maven-io-quarkiverse-langchain4j--quarkus-langchain4j-anthropic@1.7.0