LangChain4j OpenAI Integration providing Java access to OpenAI APIs including chat models, embeddings, image generation, audio transcription, and moderation.
Token management provides tools for estimating token usage and costs before making API calls, and tracking detailed token consumption after responses. This enables budget control, cost optimization, and usage monitoring for OpenAI models.
The integration uses the jtokkit library for accurate token counting that matches OpenAI's tokenization, ensuring estimates align with actual costs. Advanced token usage tracking includes cached tokens and reasoning tokens for specialized models.
Estimates token counts for text, messages, and conversations before sending requests to the API. Useful for validating input lengths and estimating costs.
// API outline: method signatures only (bodies omitted in this reference).
// Counts come from jtokkit encodings matched to the model name, so they are
// estimates aligned with -- but not guaranteed identical to -- billed usage.
public class OpenAiTokenCountEstimator implements TokenCountEstimator {
// Constructors for different model types
public OpenAiTokenCountEstimator(String modelName);
public OpenAiTokenCountEstimator(OpenAiChatModelName modelName);
public OpenAiTokenCountEstimator(OpenAiEmbeddingModelName modelName);
public OpenAiTokenCountEstimator(OpenAiLanguageModelName modelName);
// Token estimation methods
public int estimateTokenCountInText(String text);
public int estimateTokenCountInMessage(ChatMessage message);
public int estimateTokenCountInMessages(Iterable<ChatMessage> messages);
public int estimateTokenCountInToolSpecifications(Iterable<ToolSpecification> toolSpecifications);
public int estimateTokenCountInToolExecutionRequests(Iterable<ToolExecutionRequest> toolExecutionRequests);
public int estimateTokenCountInForcedToolExecutionRequest(String toolName);
// Token encoding/decoding
public List<Integer> encode(String text);
public List<Integer> encode(String text, int maxTokensToEncode);
public String decode(List<Integer> tokens);
}import dev.langchain4j.model.openai.OpenAiTokenCountEstimator;
import dev.langchain4j.model.openai.OpenAiChatModelName;
// Create token estimator for GPT-4o
OpenAiTokenCountEstimator estimator = new OpenAiTokenCountEstimator(
OpenAiChatModelName.GPT_4_O
);
// Estimate tokens in text
String text = "Hello, how are you today? I need help with my code.";
int tokenCount = estimator.estimateTokenCountInText(text);
System.out.println("Text uses approximately " + tokenCount + " tokens");
// Estimate cost
// NOTE(review): hard-coded price snapshot -- verify against current OpenAI
// pricing before using this for real cost reporting.
double costPer1kTokens = 0.005; // GPT-4o input cost
double estimatedCost = (tokenCount / 1000.0) * costPer1kTokens;
System.out.println("Estimated cost: $" + estimatedCost);import dev.langchain4j.data.message.UserMessage;
import dev.langchain4j.data.message.SystemMessage;
import dev.langchain4j.data.message.AiMessage;
import java.util.List;
OpenAiTokenCountEstimator estimator = new OpenAiTokenCountEstimator(
OpenAiChatModelName.GPT_4_O
);
// Estimate single message
UserMessage userMsg = UserMessage.from("What is the capital of France?");
int msgTokens = estimator.estimateTokenCountInMessage(userMsg);
System.out.println("Message tokens: " + msgTokens);
// Estimate conversation
// NOTE(review): ChatMessage is used below but its import is not shown in
// this snippet.
List<ChatMessage> conversation = List.of(
SystemMessage.from("You are a helpful AI assistant."),
UserMessage.from("What is machine learning?"),
AiMessage.from("Machine learning is a subset of artificial intelligence..."),
UserMessage.from("Can you give me an example?")
);
int conversationTokens = estimator.estimateTokenCountInMessages(conversation);
System.out.println("Conversation tokens: " + conversationTokens);public class TokenValidator {
// Validates and trims chat histories against a fixed token budget.
private final OpenAiTokenCountEstimator estimator;
private final int maxTokens;
// Builds a validator whose estimates use the tokenizer for the given model.
public TokenValidator(OpenAiChatModelName modelName, int maxTokens) {
this.estimator = new OpenAiTokenCountEstimator(modelName);
this.maxTokens = maxTokens;
}
// Returns a valid result carrying the estimated count, or an invalid one
// describing by how much the conversation overshoots the budget.
public ValidationResult validateMessages(List<ChatMessage> messages) {
final int tokenCount = estimator.estimateTokenCountInMessages(messages);
return tokenCount > maxTokens
? ValidationResult.invalid(
"Messages exceed maximum tokens: " + tokenCount + " > " + maxTokens
)
: ValidationResult.valid(tokenCount);
}
// Drops the message at index 1 repeatedly until the estimate fits
// targetTokens, keeping index 0 in place. Assumes the system message leads
// the list -- TODO confirm with callers.
public List<ChatMessage> truncateMessages(
List<ChatMessage> messages,
int targetTokens
) {
int estimate = estimator.estimateTokenCountInMessages(messages);
if (estimate <= targetTokens) {
return messages;
}
final List<ChatMessage> pruned = new ArrayList<>(messages);
while (estimate > targetTokens && pruned.size() > 1) {
pruned.remove(1);
estimate = estimator.estimateTokenCountInMessages(pruned);
}
return pruned;
}
}
// Usage
TokenValidator validator = new TokenValidator(
OpenAiChatModelName.GPT_4_O,
4000
);
ValidationResult result = validator.validateMessages(messages);
if (!result.isValid()) {
System.err.println(result.getError());
// Truncate to 3500 rather than 4000 to leave headroom below the limit.
messages = validator.truncateMessages(messages, 3500);
}OpenAiTokenCountEstimator estimator = new OpenAiTokenCountEstimator(
OpenAiChatModelName.GPT_4_O
);
String text = "Artificial intelligence and machine learning";
// Encode text to tokens
List<Integer> tokens = estimator.encode(text);
System.out.println("Token IDs: " + tokens);
System.out.println("Token count: " + tokens.size());
// Decode tokens back to text
String decoded = estimator.decode(tokens);
System.out.println("Decoded: " + decoded);
// Encode with limit
List<Integer> limitedTokens = estimator.encode(text, 5);
System.out.println("First 5 tokens: " + limitedTokens);
// NOTE(review): decoding a truncated token list may cut mid-word, since
// token boundaries do not align with word boundaries -- confirm acceptable.
String partialDecoded = estimator.decode(limitedTokens);
System.out.println("Partial text: " + partialDecoded);Detailed token usage information returned in API responses, including input tokens, output tokens, and special token types like cached and reasoning tokens.
// API outline: signatures only. Extends the core TokenUsage with OpenAI-
// specific detail breakdowns (cached prompt tokens, reasoning tokens).
public class OpenAiTokenUsage extends TokenUsage {
public static Builder builder();
// Standard token counts
// NOTE(review): returns are boxed Integer -- presumably null when the API
// omits a field; callers should null-check before arithmetic.
public Integer inputTokenCount();
public Integer outputTokenCount();
public Integer totalTokenCount();
// Detailed breakdowns
public InputTokensDetails inputTokensDetails();
public OutputTokensDetails outputTokensDetails();
// Combine usage from multiple requests
public OpenAiTokenUsage add(TokenUsage other);
}public static class InputTokensDetails {
public static Builder builder();
// Prompt tokens served from the prompt cache (see prompt-caching section).
public Integer cachedTokens();
}public static class OutputTokensDetails {
public static Builder builder();
// Reasoning tokens emitted by reasoning models (o1/o3).
public Integer reasoningTokens();
}import dev.langchain4j.model.openai.OpenAiChatModel;
import dev.langchain4j.model.openai.OpenAiTokenUsage;
import dev.langchain4j.model.output.Response;
OpenAiChatModel model = OpenAiChatModel.builder()
.apiKey(apiKey)
.modelName(OpenAiChatModelName.GPT_4_O)
.build();
Response<AiMessage> response = model.generate("Explain quantum computing");
// Get token usage from response
// NOTE(review): the cast assumes an OpenAI model produced the response;
// a plain TokenUsage here would throw ClassCastException.
OpenAiTokenUsage usage = (OpenAiTokenUsage) response.tokenUsage();
System.out.println("Input tokens: " + usage.inputTokenCount());
System.out.println("Output tokens: " + usage.outputTokenCount());
System.out.println("Total tokens: " + usage.totalTokenCount());
// Check for cached tokens (prompt caching)
if (usage.inputTokensDetails() != null) {
Integer cachedTokens = usage.inputTokensDetails().cachedTokens();
if (cachedTokens != null && cachedTokens > 0) {
System.out.println("Cached tokens: " + cachedTokens);
System.out.println("Tokens processed: " + (usage.inputTokenCount() - cachedTokens));
}
}
// Check for reasoning tokens (o1/o3 models)
if (usage.outputTokensDetails() != null) {
Integer reasoningTokens = usage.outputTokensDetails().reasoningTokens();
if (reasoningTokens != null && reasoningTokens > 0) {
System.out.println("Reasoning tokens: " + reasoningTokens);
System.out.println("Output tokens (excluding reasoning): " +
(usage.outputTokenCount() - reasoningTokens));
}
}public class CostCalculator {
// Maps model identifiers to per-1k-token input/output prices (USD) and
// converts an OpenAiTokenUsage into a dollar amount.
private final Map<String, ModelPricing> pricing;
public CostCalculator() {
this.pricing = Map.of(
"gpt-4o", new ModelPricing(0.005, 0.015),
"gpt-4o-mini", new ModelPricing(0.00015, 0.0006),
"gpt-4", new ModelPricing(0.03, 0.06),
"o3-mini", new ModelPricing(0.01, 0.02)
);
}
// Computes input + output cost for one request; cached prompt tokens are
// rebated at 50% of the input rate. NOTE(review): the price table is a
// hard-coded snapshot -- verify against current pricing before billing.
// Throws IllegalArgumentException for a model name without a price entry.
public double calculateCost(String modelName, OpenAiTokenUsage usage) {
final ModelPricing rates = pricing.get(modelName);
if (rates == null) {
throw new IllegalArgumentException("Unknown model: " + modelName);
}
double inputCost = (usage.inputTokenCount() / 1000.0) * rates.inputPrice;
final double outputCost = (usage.outputTokenCount() / 1000.0) * rates.outputPrice;
// Apply the 50% discount for tokens served from the prompt cache.
if (usage.inputTokensDetails() != null &&
usage.inputTokensDetails().cachedTokens() != null) {
final int cached = usage.inputTokensDetails().cachedTokens();
inputCost -= (cached / 1000.0) * rates.inputPrice * 0.5; // 50% discount
}
return inputCost + outputCost;
}
// Immutable per-1k-token price pair for a single model.
private static class ModelPricing {
final double inputPrice; // per 1k tokens
final double outputPrice; // per 1k tokens
ModelPricing(double inputPrice, double outputPrice) {
this.inputPrice = inputPrice;
this.outputPrice = outputPrice;
}
}
}
// Usage
CostCalculator calculator = new CostCalculator();
// `usage` is the OpenAiTokenUsage obtained from a prior model response.
double cost = calculator.calculateCost("gpt-4o", usage);
System.out.printf("Request cost: $%.6f%n", cost);public class UsageAccumulator {
// Running total of token usage across multiple requests.
private OpenAiTokenUsage totalUsage;
public UsageAccumulator() {
this.totalUsage = zeroUsage();
}
// Folds one response's usage into the running total; null is ignored.
// Fix: OpenAiTokenUsage.add(TokenUsage) accepts any TokenUsage, so the old
// `instanceof OpenAiTokenUsage` filter silently dropped token counts
// reported as plain TokenUsage.
public void addUsage(TokenUsage usage) {
if (usage != null) {
totalUsage = totalUsage.add(usage);
}
}
public OpenAiTokenUsage getTotalUsage() {
return totalUsage;
}
// Clears the accumulated counts back to zero.
public void reset() {
totalUsage = zeroUsage();
}
public void printSummary() {
System.out.println("=== Token Usage Summary ===");
System.out.println("Total input tokens: " + totalUsage.inputTokenCount());
System.out.println("Total output tokens: " + totalUsage.outputTokenCount());
System.out.println("Total tokens: " + totalUsage.totalTokenCount());
}
// All-zero usage shared by the constructor and reset().
private static OpenAiTokenUsage zeroUsage() {
return OpenAiTokenUsage.builder()
.inputTokenCount(0)
.outputTokenCount(0)
.totalTokenCount(0)
.build();
}
}
// Usage in conversation
UsageAccumulator accumulator = new UsageAccumulator();
for (String userInput : userInputs) {
Response<AiMessage> response = model.generate(userInput);
accumulator.addUsage(response.tokenUsage());
}
accumulator.printSummary();public interface TokenCountEstimator {
// Core estimation contract implemented by OpenAiTokenCountEstimator above.
int estimateTokenCountInText(String text);
int estimateTokenCountInMessage(ChatMessage message);
int estimateTokenCountInMessages(Iterable<ChatMessage> messages);
}public class TokenUsage {
// Base usage type; OpenAiTokenUsage extends it with detail breakdowns.
public Integer inputTokenCount();
public Integer outputTokenCount();
public Integer totalTokenCount();
}// Always validate before expensive operations
// Rejects over-long inputs before paying for a model call.
// NOTE(review): builds a fresh estimator on every invocation -- hoisting it
// to a field would avoid repeated setup; `model` is assumed to be an
// enclosing-scope OpenAiChatModel field not shown in this snippet.
public Response<AiMessage> safeGenerate(
List<ChatMessage> messages,
int maxInputTokens
) {
OpenAiTokenCountEstimator estimator = new OpenAiTokenCountEstimator(
OpenAiChatModelName.GPT_4_O
);
int estimatedTokens = estimator.estimateTokenCountInMessages(messages);
if (estimatedTokens > maxInputTokens) {
throw new IllegalArgumentException(
"Input too long: " + estimatedTokens + " tokens"
);
}
return model.generate(messages);
}public class BudgetManager {
// Tracks remaining spend across threads; every read and write of
// remainingBudget is guarded by `lock`.
private final CostCalculator calculator;
private double remainingBudget;
private final Object lock = new Object();
public BudgetManager(double initialBudget) {
this.calculator = new CostCalculator();
this.remainingBudget = initialBudget;
}
// Returns whether the estimated cost fits the current budget.
// NOTE(review): check-then-act snapshot -- another thread may spend between
// canAfford() and recordUsage(); acceptable for soft budgeting.
public boolean canAfford(String modelName, OpenAiTokenUsage estimatedUsage) {
double estimatedCost = calculator.calculateCost(modelName, estimatedUsage);
synchronized (lock) {
return remainingBudget >= estimatedCost;
}
}
// Deducts the actual cost of a completed request. Fix: the post-deduction
// balance is captured inside the lock; the original re-read the field
// outside the synchronized block, an unsynchronized (racy) read of a
// lock-guarded field.
public void recordUsage(String modelName, OpenAiTokenUsage usage) {
double cost = calculator.calculateCost(modelName, usage);
double balanceAfter;
synchronized (lock) {
remainingBudget -= cost;
balanceAfter = remainingBudget;
}
if (balanceAfter < 0) {
System.err.println("WARNING: Budget exceeded!");
}
}
public double getRemainingBudget() {
synchronized (lock) {
return remainingBudget;
}
}
}public class ContextWindowManager {
// Trims a conversation so it fits the model's context window while
// reserving room for the model's reply.
private final OpenAiTokenCountEstimator estimator;
private final int maxContextTokens;
public ContextWindowManager(OpenAiChatModelName modelName, int maxContextTokens) {
this.estimator = new OpenAiTokenCountEstimator(modelName);
this.maxContextTokens = maxContextTokens;
}
// Returns the input list unchanged when it already fits; otherwise keeps
// messages.get(0) (assumed to be the system message -- TODO confirm) plus
// as many of the most recent messages as the budget allows, preserving
// chronological order.
public List<ChatMessage> fitToContext(
List<ChatMessage> messages,
int reserveForResponse
) {
final int budget = maxContextTokens - reserveForResponse;
if (estimator.estimateTokenCountInMessages(messages) <= budget) {
return messages;
}
final ChatMessage head = messages.get(0);
final List<ChatMessage> kept = new ArrayList<>();
kept.add(head);
int spare = budget - estimator.estimateTokenCountInMessage(head);
// Walk backwards from the newest message, inserting each accepted one
// just after the head so original ordering is preserved; stop at the
// first message that no longer fits.
int i = messages.size() - 1;
while (i > 0) {
final ChatMessage candidate = messages.get(i);
final int cost = estimator.estimateTokenCountInMessage(candidate);
if (cost > spare) {
break;
}
kept.add(1, candidate);
spare -= cost;
i--;
}
return kept;
}
}public class ConversationSummarizer {
// Compresses long conversations by replacing the older half with an
// LLM-generated summary once the token estimate crosses summaryThreshold.
private final OpenAiChatModel model;
private final OpenAiTokenCountEstimator estimator;
private final int summaryThreshold;
public ConversationSummarizer(
OpenAiChatModel model,
int summaryThreshold
) {
this.model = model;
// NOTE(review): estimator is hard-coded to GPT_4_O even though `model`
// may use a different tokenizer -- counts could drift; confirm intent.
this.estimator = new OpenAiTokenCountEstimator(OpenAiChatModelName.GPT_4_O);
this.summaryThreshold = summaryThreshold;
}
public List<ChatMessage> manageConversation(List<ChatMessage> messages) {
int tokenCount = estimator.estimateTokenCountInMessages(messages);
if (tokenCount < summaryThreshold) {
return messages;
}
// Summarize older messages
// NOTE(review): toSummarize includes messages.get(0), so the system
// message is both summarized here and re-added below -- verify intended.
List<ChatMessage> toSummarize = messages.subList(0, messages.size() / 2);
String summary = summarizeMessages(toSummarize);
// Keep system message, add summary, keep recent messages
List<ChatMessage> compressed = new ArrayList<>();
compressed.add(messages.get(0)); // System message
compressed.add(SystemMessage.from("Previous conversation summary: " + summary));
compressed.addAll(messages.subList(messages.size() / 2, messages.size()));
return compressed;
}
// Builds the summarization prompt and asks the model for a summary.
// NOTE(review): messagesToText(...) is not defined in this snippet --
// presumably a helper omitted from the example.
private String summarizeMessages(List<ChatMessage> messages) {
String prompt = "Summarize the following conversation concisely:\n\n" +
messagesToText(messages);
Response<AiMessage> response = model.generate(prompt);
return response.content().text();
}
}public class TokenUsageMonitor {
// Per-user token accounting with simple anomaly alerts. The map itself is
// concurrent; per-user counters synchronize on their UsageStats instance.
private final Map<String, UsageStats> stats = new ConcurrentHashMap<>();
public void recordRequest(String userId, OpenAiTokenUsage usage) {
// Reuse the value returned by computeIfAbsent instead of a redundant
// second stats.get(userId) lookup.
UsageStats userStats = stats.computeIfAbsent(userId, k -> new UsageStats());
userStats.addUsage(usage);
// Check for anomalies
if (userStats.getTotalTokens() > 1_000_000) {
alert("User " + userId + " exceeded 1M tokens");
}
if (userStats.getRequestCount() > 10000) {
alert("User " + userId + " made 10k+ requests");
}
}
public UsageReport generateReport() {
return new UsageReport(stats);
}
private void alert(String message) {
System.err.println("ALERT: " + message);
// Send to monitoring system
}
// Mutable counters for one user. Fix: the getters are now synchronized --
// previously writes were synchronized but reads were not, so callers could
// observe stale (or, for the long fields, torn) values.
private static class UsageStats {
private int requestCount = 0;
private long totalInputTokens = 0;
private long totalOutputTokens = 0;
public synchronized void addUsage(OpenAiTokenUsage usage) {
requestCount++;
// NOTE(review): inputTokenCount()/outputTokenCount() return Integer and
// are assumed non-null here -- confirm the client always populates them.
totalInputTokens += usage.inputTokenCount();
totalOutputTokens += usage.outputTokenCount();
}
public synchronized int getRequestCount() { return requestCount; }
public synchronized long getTotalTokens() { return totalInputTokens + totalOutputTokens; }
}
}Calculate expected costs before making API calls.
Ensure prompts fit within model context windows.
Track and limit spending on API calls.
Monitor token consumption patterns and trends.
Summarize or truncate conversations that exceed token limits.
Implement token-based rate limiting for users.
Forecast infrastructure needs based on usage patterns.
For models with prompt caching, the `cachedTokens` count appears in the response metadata.

Install with the Tessl CLI:
npx tessl i tessl/maven-dev-langchain4j--langchain4j-open-ai