Quarkus extension for integrating IBM watsonx.ai foundation models with LangChain4j. Provides chat models, generation models, streaming models, embedding models, and scoring models for IBM watsonx.ai. Includes comprehensive configuration options, support for tool/function calling, text extraction from documents in Cloud Object Storage, and experimental built-in services for Google search, weather, and web crawling. Designed for enterprise Java applications using the Quarkus framework with built-in dependency injection and native compilation support.
Legacy text generation models using the Watsonx generation API without tool/function calling support. These models provide backward compatibility with older Watsonx generation endpoints and offer different parameter controls compared to chat models. Both synchronous and streaming variants are available.
Execute synchronous text generation with legacy generation parameters and decoding methods.
public class WatsonxGenerationModel implements dev.langchain4j.model.chat.ChatModel {
public static Builder builder();
public ChatResponse doChat(ChatRequest chatRequest);
public List<ChatModelListener> listeners();
public ChatRequestParameters defaultRequestParameters();
public Set<Capability> supportedCapabilities();
public WatsonxRestApi getClient();
public String getModelId();
public String getProjectId();
public String getSpaceId();
public String getVersion();
}

Supported Capabilities:
Example Usage:
import io.quarkiverse.langchain4j.watsonx.WatsonxGenerationModel;
import dev.langchain4j.data.message.UserMessage;
import dev.langchain4j.model.chat.response.ChatResponse;
import java.net.URL;
WatsonxGenerationModel model = WatsonxGenerationModel.builder()
.modelId("meta-llama/llama-4-maverick-17b-128e-instruct-fp8")
.url(new URL("https://us-south.ml.cloud.ibm.com"))
.projectId("your-project-id")
.tokenGenerator(tokenGenerator)
.decodingMethod("sample")
.temperature(0.7)
.maxNewTokens(500)
.topK(50)
.topP(0.9)
.repetitionPenalty(1.1)
.build();
ChatResponse response = model.chat(UserMessage.from("Write a short poem about nature"));
String generatedText = response.aiMessage().text();

Multi-prompt Generation:
import dev.langchain4j.data.message.ChatMessage;
import java.util.List;
// Multiple prompts are joined using the promptJoiner
WatsonxGenerationModel model = WatsonxGenerationModel.builder()
.promptJoiner("\n---\n")
.build();
List<ChatMessage> messages = List.of(
UserMessage.from("First prompt"),
UserMessage.from("Second prompt")
);
ChatResponse response = model.chat(messages);
// Prompts are joined: "First prompt\n---\nSecond prompt"

Stream text generation responses in real-time for improved user experience.
public class WatsonxGenerationStreamingModel implements dev.langchain4j.model.chat.StreamingChatModel {
public static Builder builder();
public void doChat(ChatRequest chatRequest, StreamingChatResponseHandler handler);
public List<ChatModelListener> listeners();
public ChatRequestParameters defaultRequestParameters();
public Set<Capability> supportedCapabilities();
public WatsonxRestApi getClient();
public String getModelId();
public String getProjectId();
public String getSpaceId();
public String getVersion();
}

Example Usage:
import io.quarkiverse.langchain4j.watsonx.WatsonxGenerationStreamingModel;
import dev.langchain4j.model.chat.request.ChatRequest;
import dev.langchain4j.model.chat.response.StreamingChatResponseHandler;
WatsonxGenerationStreamingModel streamingModel = WatsonxGenerationStreamingModel.builder()
.modelId("meta-llama/llama-4-maverick-17b-128e-instruct-fp8")
.url(new URL("https://us-south.ml.cloud.ibm.com"))
.projectId("your-project-id")
.tokenGenerator(tokenGenerator)
.temperature(0.8)
.maxNewTokens(1000)
.build();
streamingModel.chat(
ChatRequest.builder()
.messages(List.of(UserMessage.from("Tell me a story")))
.build(),
new StreamingChatResponseHandler() {
@Override
public void onPartialResponse(String partialResponse) {
System.out.print(partialResponse);
}
@Override
public void onComplete(ChatResponse response) {
System.out.println("\nGeneration complete!");
System.out.println("Tokens: " + response.tokenUsage().totalTokens());
System.out.println("Stop reason: " + response.finishReason());
}
@Override
public void onError(Throwable error) {
System.err.println("Error: " + error.getMessage());
}
}
);

Configure generation models with legacy Watsonx generation parameters.
public static class Builder extends Watsonx.Builder<WatsonxGenerationModel, Builder> {
// Inherited base parameters
public Builder modelId(String modelId);
public Builder version(String version);
public Builder spaceId(String spaceId);
public Builder projectId(String projectId);
public Builder url(URL url);
public Builder timeout(Duration timeout);
public Builder tokenGenerator(TokenGenerator tokenGenerator);
public Builder logRequests(boolean logRequests);
public Builder logResponses(boolean logResponses);
public Builder logCurl(boolean logCurl);
public Builder listeners(List<ChatModelListener> listeners);
// Generation-specific parameters
public Builder decodingMethod(String decodingMethod);
public Builder decayFactor(Double decayFactor);
public Builder startIndex(Integer startIndex);
public Builder minNewTokens(Integer minNewTokens);
public Builder maxNewTokens(Integer maxNewTokens);
public Builder temperature(Double temperature);
public Builder topK(Integer topK);
public Builder topP(Double topP);
public Builder randomSeed(Integer randomSeed);
public Builder repetitionPenalty(Double repetitionPenalty);
public Builder stopSequences(List<String> stopSequences);
public Builder truncateInputTokens(Integer truncateInputTokens);
public Builder includeStopSequence(Boolean includeStopSequence);
public Builder promptJoiner(String promptJoiner);
public WatsonxGenerationModel build();
}

Parameter Details:
modelId (String, required): Watsonx model identifier
version (String): API version
spaceId (String): Deployment space ID (mutually exclusive with projectId)
projectId (String): Project ID (mutually exclusive with spaceId)
url (URL, required): Watsonx API base URL
timeout (Duration): Request timeout
tokenGenerator (TokenGenerator, required): Handles IBM Cloud IAM authentication
logRequests (boolean): Enable request body logging
logResponses (boolean): Enable response body logging
logCurl (boolean): Log requests as cURL commands
listeners (List<ChatModelListener>): Request/response event listeners
decodingMethod (String): Decoding strategy for token selection
decayFactor (Double): Length penalty decay factor
startIndex (Integer): Length penalty start index
minNewTokens (Integer): Minimum number of tokens to generate
maxNewTokens (Integer): Maximum number of tokens to generate
temperature (Double): Sampling temperature
topK (Integer): Top-K sampling parameter
topP (Double): Nucleus sampling parameter
randomSeed (Integer): Random seed for reproducibility
repetitionPenalty (Double): Penalty for repeated tokens
stopSequences (List<String>): Stop sequences (max 6)
truncateInputTokens (Integer): Truncate input if exceeds limit
includeStopSequence (Boolean): Include stop sequence in output
promptJoiner (String): String to join multiple prompts
Complete Builder Example:
WatsonxGenerationModel model = WatsonxGenerationModel.builder()
.modelId("meta-llama/llama-4-maverick-17b-128e-instruct-fp8")
.url(new URL("https://us-south.ml.cloud.ibm.com"))
.projectId("abc123")
.tokenGenerator(tokenGenerator)
.decodingMethod("sample")
.temperature(0.7)
.topK(50)
.topP(0.9)
.minNewTokens(50)
.maxNewTokens(500)
.randomSeed(42)
.repetitionPenalty(1.2)
.stopSequences(List.of("\n\n", "END", "---"))
.includeStopSequence(false)
.truncateInputTokens(2048)
.decayFactor(1.5)
.startIndex(10)
.promptJoiner("\n---\n")
.logRequests(true)
.build();

Control generation length with decay-based penalties.
Length Penalty Configuration:
// Using builder methods
WatsonxGenerationModel model = WatsonxGenerationModel.builder()
.decayFactor(1.5) // Penalty strength
.startIndex(10) // Start applying penalty at token 10
.build();
// Length penalty is applied during generation:
// penalty = decayFactor ^ (current_position - startIndex)
// For positions >= startIndex, token scores are divided by penalty

Length Penalty Example:
// Encourage longer responses
WatsonxGenerationModel longModel = WatsonxGenerationModel.builder()
.decayFactor(1.2) // Gentle penalty
.startIndex(100) // Start penalty after 100 tokens
.maxNewTokens(500)
.build();
// Encourage shorter responses
WatsonxGenerationModel shortModel = WatsonxGenerationModel.builder()
.decayFactor(2.0) // Strong penalty
.startIndex(20) // Start penalty early
.maxNewTokens(100)
.build();

Control how tokens are selected during generation.
Greedy Decoding:
// Always select most probable token
// Deterministic and focused
WatsonxGenerationModel greedyModel = WatsonxGenerationModel.builder()
.decodingMethod("greedy")
.maxNewTokens(200)
.build();
// Temperature, topK, topP are ignored in greedy mode

Sample Decoding:
// Sample from probability distribution
// More creative and varied
WatsonxGenerationModel sampleModel = WatsonxGenerationModel.builder()
.decodingMethod("sample")
.temperature(0.7) // Controls randomness
.topK(50) // Limit to top 50 tokens
.topP(0.9) // Nucleus sampling
.randomSeed(42) // For reproducibility
.build();

Use Quarkus CDI for automatic model creation and injection.
Configuration:
quarkus.langchain4j.watsonx.base-url=https://us-south.ml.cloud.ibm.com
quarkus.langchain4j.watsonx.api-key=your-api-key
quarkus.langchain4j.watsonx.project-id=your-project-id
# Generation model configuration
quarkus.langchain4j.watsonx.generation-model.model-name=meta-llama/llama-4-maverick-17b-128e-instruct-fp8
quarkus.langchain4j.watsonx.generation-model.decoding-method=sample
quarkus.langchain4j.watsonx.generation-model.temperature=0.7
quarkus.langchain4j.watsonx.generation-model.max-new-tokens=500
quarkus.langchain4j.watsonx.generation-model.min-new-tokens=50
quarkus.langchain4j.watsonx.generation-model.top-k=50
quarkus.langchain4j.watsonx.generation-model.top-p=0.9
quarkus.langchain4j.watsonx.generation-model.repetition-penalty=1.2
quarkus.langchain4j.watsonx.generation-model.random-seed=42
quarkus.langchain4j.watsonx.generation-model.stop-sequences=\n\n,END
quarkus.langchain4j.watsonx.generation-model.truncate-input-tokens=2048
quarkus.langchain4j.watsonx.generation-model.include-stop-sequence=false
quarkus.langchain4j.watsonx.generation-model.prompt-joiner=\n---\n
quarkus.langchain4j.watsonx.generation-model.length-penalty.decay-factor=1.5
quarkus.langchain4j.watsonx.generation-model.length-penalty.start-index=10

Injection:
import jakarta.inject.Inject;
import dev.langchain4j.model.chat.ChatModel;
import dev.langchain4j.model.chat.StreamingChatModel;
import dev.langchain4j.data.message.UserMessage;
@ApplicationScoped
public class GenerationService {
@Inject
ChatModel generationModel; // Injects WatsonxGenerationModel
@Inject
StreamingChatModel streamingGenerationModel; // Injects WatsonxGenerationStreamingModel
public String generate(String prompt) {
ChatResponse response = generationModel.chat(UserMessage.from(prompt));
return response.aiMessage().text();
}
}

Configure multiple generation models with different settings.
Configuration:
# Default generation model
quarkus.langchain4j.watsonx.generation-model.temperature=0.7
quarkus.langchain4j.watsonx.generation-model.max-new-tokens=500
# Creative generation model
quarkus.langchain4j.watsonx.creative.generation-model.model-name=meta-llama/llama-3-70b-instruct
quarkus.langchain4j.watsonx.creative.generation-model.temperature=1.5
quarkus.langchain4j.watsonx.creative.generation-model.max-new-tokens=1000
quarkus.langchain4j.watsonx.creative.generation-model.top-k=100
# Factual generation model
quarkus.langchain4j.watsonx.factual.generation-model.temperature=0.1
quarkus.langchain4j.watsonx.factual.generation-model.decoding-method=greedy
quarkus.langchain4j.watsonx.factual.generation-model.max-new-tokens=200

Injection:
import jakarta.enterprise.inject.Named;
@ApplicationScoped
public class MultiModelService {
@Inject
@Named("creative")
ChatModel creativeModel;
@Inject
@Named("factual")
ChatModel factualModel;
public String generateCreative(String prompt) {
return creativeModel.chat(UserMessage.from(prompt)).aiMessage().text();
}
public String generateFactual(String query) {
return factualModel.chat(UserMessage.from(query)).aiMessage().text();
}
}

public class WatsonxGenerationRequestParameters extends DefaultChatRequestParameters {
public static Builder builder();
// Generation-specific methods
public String decodingMethod();
public LengthPenalty lengthPenalty();
public Integer minNewTokens();
public Integer randomSeed();
public Duration timeLimit();
public Double repetitionPenalty();
public Integer truncateInputTokens();
public Boolean includeStopSequence();
// Override parameters
public ChatRequestParameters overrideWith(ChatRequestParameters that);
}

See Request Parameters for complete details.
public record LengthPenalty(Double decayFactor, Integer startIndex) {
// Decay factor: > 1.0
// Start index: >= 0
}

Internal response beans:
public record TextGenerationResponse(
String modelId,
List<Result> results
) {
public record Result(
String generatedText,
Integer generatedTokenCount,
Integer inputTokenCount,
String stopReason
) {}
}

Stop Reasons:
Use generation models when:
Use chat models when:
| Parameter | Generation Model | Chat Model |
|---|---|---|
| Decoding method | greedy, sample | Not exposed |
| Temperature | Yes (sample only) | Yes |
| Top-K sampling | Yes (sample only) | No |
| Top-P sampling | Yes | Yes |
| Frequency penalty | No | Yes |
| Presence penalty | No | Yes |
| Repetition penalty | Yes | No |
| Length penalty | Yes (decay-based) | No |
| Min tokens | Yes | No |
| Max tokens | Yes (maxNewTokens) | Yes (maxTokens) |
| Stop sequences | Yes (max 6) | Yes (max 4) |
| Tool calling | No | Yes |
| JSON schema | No | Yes |
| Random seed | Yes (randomSeed) | Yes (seed) |
// Generation model (legacy)
WatsonxGenerationModel genModel = WatsonxGenerationModel.builder()
.decodingMethod("sample")
.temperature(0.7)
.topK(50)
.topP(0.9)
.maxNewTokens(500)
.repetitionPenalty(1.2)
.build();
// Equivalent chat model (modern)
WatsonxChatModel chatModel = WatsonxChatModel.builder()
.temperature(0.7)
.topP(0.9)
.maxTokens(500)
.frequencyPenalty(0.5) // Approximate repetitionPenalty
.build();
// Note: topK not available in chat models

// Balanced sampling for general text generation
WatsonxGenerationModel balanced = WatsonxGenerationModel.builder()
.decodingMethod("sample")
.temperature(0.7)
.topK(50)
.topP(0.9)
.build();
// Creative/diverse generation
WatsonxGenerationModel creative = WatsonxGenerationModel.builder()
.decodingMethod("sample")
.temperature(1.2)
.topK(100)
.topP(0.95)
.build();
// Focused/deterministic generation
WatsonxGenerationModel focused = WatsonxGenerationModel.builder()
.decodingMethod("greedy")
.build();

// Reduce repetition with repetition penalty
WatsonxGenerationModel model = WatsonxGenerationModel.builder()
.repetitionPenalty(1.3) // 1.0 = no penalty, 2.0 = maximum penalty
.build();
// Combine with stop sequences for clean outputs
WatsonxGenerationModel model2 = WatsonxGenerationModel.builder()
.repetitionPenalty(1.2)
.stopSequences(List.of("\n\n", "END"))
.includeStopSequence(false)
.build();

// Ensure minimum response length
WatsonxGenerationModel minLength = WatsonxGenerationModel.builder()
.minNewTokens(100)
.maxNewTokens(500)
.build();
// Handle long inputs with truncation
WatsonxGenerationModel truncated = WatsonxGenerationModel.builder()
.truncateInputTokens(2048) // Truncate from left if input > 2048 tokens
.maxNewTokens(500)
.build();

// Same seed + parameters = same output
WatsonxGenerationModel reproducible = WatsonxGenerationModel.builder()
.decodingMethod("sample")
.randomSeed(42)
.temperature(0.7)
.build();
ChatResponse response1 = reproducible.chat(UserMessage.from("Hello"));
ChatResponse response2 = reproducible.chat(UserMessage.from("Hello"));
// response1 and response2 will be identical

Install with Tessl CLI
npx tessl i tessl/maven-io-quarkiverse-langchain4j--quarkus-langchain4j-watsonx@1.7.0