CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/maven-dev-langchain4j--langchain4j-azure-open-ai

LangChain4j integration for Azure OpenAI providing chat, streaming, embeddings, image generation, audio transcription, and token counting capabilities

Overview
Eval results
Files

docs/token-counting.md

Token Counting

The token count estimator provides token count estimates for cost calculation and request validation before making API calls. Supports chat, embedding, and language models.

Imports

import dev.langchain4j.model.azure.AzureOpenAiTokenCountEstimator;
import dev.langchain4j.model.azure.AzureOpenAiChatModelName;
import dev.langchain4j.model.azure.AzureOpenAiEmbeddingModelName;
import dev.langchain4j.model.azure.AzureOpenAiLanguageModelName;
import dev.langchain4j.data.message.ChatMessage;
import dev.langchain4j.data.message.UserMessage;
import dev.langchain4j.data.message.AiMessage;
import dev.langchain4j.data.message.SystemMessage;

Basic Usage

// Create estimator for specific model
AzureOpenAiTokenCountEstimator estimator = new AzureOpenAiTokenCountEstimator(
    AzureOpenAiChatModelName.GPT_4
);

// Estimate tokens in text
String text = "Hello, how are you today?";
int tokenCount = estimator.estimateTokenCountInText(text);
System.out.println("Token count: " + tokenCount);  // e.g., 7

// Estimate tokens in a message (includes formatting overhead)
ChatMessage message = UserMessage.from("What is the capital of France?");
int messageTokens = estimator.estimateTokenCountInMessage(message);  // e.g., 11

// Estimate tokens in message list (includes conversation structure overhead)
List<ChatMessage> messages = List.of(
    SystemMessage.from("You are a helpful assistant."),
    UserMessage.from("What is AI?"),
    AiMessage.from("AI stands for Artificial Intelligence...")
);
int totalTokens = estimator.estimateTokenCountInMessages(messages);

API

package dev.langchain4j.model.azure;

/**
 * Token count estimator for Azure OpenAI models.
 * Uses same tokenization algorithm as target model (cl100k_base for most models).
 * Thread-safe: Yes - instances are immutable and thread-safe.
 * Accuracy: Typically ±1-2 tokens for text, ±2-5 for messages, ±5-10 for message lists.
 * Performance: Fast, local tokenization (no API calls).
 */
class AzureOpenAiTokenCountEstimator implements dev.langchain4j.model.TokenCountEstimator {
    /**
     * Creates estimator for chat model.
     * @param modelName Chat model enum
     * @throws IllegalArgumentException if modelName is null
     */
    AzureOpenAiTokenCountEstimator(AzureOpenAiChatModelName modelName);

    /**
     * Creates estimator for embedding model.
     * @param modelName Embedding model enum
     * @throws IllegalArgumentException if modelName is null
     */
    AzureOpenAiTokenCountEstimator(AzureOpenAiEmbeddingModelName modelName);

    /**
     * Creates estimator for language model.
     * @param modelName Language model enum
     * @throws IllegalArgumentException if modelName is null
     */
    AzureOpenAiTokenCountEstimator(AzureOpenAiLanguageModelName modelName);

    /**
     * Creates estimator for custom model name.
     * Falls back to cl100k_base encoding if model unknown.
     * @param modelName Model name string
     * @throws IllegalArgumentException if modelName is null or empty
     */
    AzureOpenAiTokenCountEstimator(String modelName);

    /**
     * Estimates tokens in plain text.
     * @param text Text to estimate (can be empty, not null)
     * @return Token count (0 for empty string)
     * @throws NullPointerException if text is null
     */
    int estimateTokenCountInText(String text);

    /**
     * Estimates tokens in single message including formatting overhead.
     * Includes tokens for role, name, and message structure.
     * @param message ChatMessage (UserMessage, AiMessage, SystemMessage, etc.)
     * @return Token count including ~3-5 tokens overhead
     * @throws NullPointerException if message is null
     */
    int estimateTokenCountInMessage(dev.langchain4j.data.message.ChatMessage message);

    /**
     * Estimates tokens in message list including conversation structure.
     * Includes tokens for message formatting + conversation framing.
     * @param messages Iterable of messages (can be empty, not null)
     * @return Total token count including all overhead (~3 tokens base + per-message overhead)
     * @throws NullPointerException if messages is null or contains null elements
     */
    int estimateTokenCountInMessages(Iterable<dev.langchain4j.data.message.ChatMessage> messages);
}

Usage Examples

Pre-flight Token Check

AzureOpenAiTokenCountEstimator estimator = new AzureOpenAiTokenCountEstimator(
    AzureOpenAiChatModelName.GPT_3_5_TURBO
);

List<ChatMessage> messages = buildConversation();
int estimatedTokens = estimator.estimateTokenCountInMessages(messages);

// GPT-3.5-turbo: 4096 token limit, reserve 1000 for response
int maxInputTokens = 4096 - 1000;
if (estimatedTokens > maxInputTokens) {
    // Trim conversation or reject request
    messages = trimConversation(messages, maxInputTokens);
}

Response<AiMessage> response = chatModel.generate(messages);

Cost Estimation

AzureOpenAiTokenCountEstimator estimator = new AzureOpenAiTokenCountEstimator(
    AzureOpenAiChatModelName.GPT_4
);

List<ChatMessage> messages = getConversation();
int inputTokens = estimator.estimateTokenCountInMessages(messages);

// Estimate output tokens (conservative: 1.5x input for conversations)
int estimatedOutputTokens = (int) (inputTokens * 1.5);
int totalTokens = inputTokens + estimatedOutputTokens;

// Calculate cost (example GPT-4 rates)
double inputCost = inputTokens * 0.00003;  // $0.03 per 1K input tokens
double outputCost = estimatedOutputTokens * 0.00006;  // $0.06 per 1K output tokens
double totalCost = inputCost + outputCost;

System.out.printf("Estimated cost: $%.4f%n", totalCost);

// Implement budget check
if (totalCost > userBudget) {
    throw new IllegalStateException("Request exceeds user budget");
}

Conversation History Management

/**
 * Keeps a chat history within a token budget by evicting the oldest
 * user/assistant exchanges while preserving the leading system message.
 * Uses AzureOpenAiTokenCountEstimator for fast, local token estimates.
 * Not thread-safe; confine to a single conversation thread.
 */
class ConversationManager {
    // Budget below the model's context window, leaving headroom for the response.
    private final int maxTokens = 8000;
    private final AzureOpenAiTokenCountEstimator estimator;
    private final List<ChatMessage> history = new ArrayList<>();

    ConversationManager(String modelName) {
        estimator = new AzureOpenAiTokenCountEstimator(modelName);
    }

    /** Appends a message and evicts old exchanges if the budget is exceeded. */
    void addMessage(ChatMessage message) {
        history.add(message);
        trimIfNeeded();
    }

    /** Drops the oldest exchange pairs until the estimate fits the budget. */
    private void trimIfNeeded() {
        // Index 0 is assumed to hold the system message, so eviction starts at 1;
        // after the first remove, the paired reply shifts down into index 1.
        while (history.size() > 2
                && estimator.estimateTokenCountInMessages(history) > maxTokens) {
            history.remove(1);      // oldest user message
            if (history.size() > 1) {
                history.remove(1);  // its paired assistant reply
            }
        }
    }

    /** @return a defensive copy of the current history */
    List<ChatMessage> getHistory() {
        return new ArrayList<>(history);
    }
}

Embedding Token Estimation

AzureOpenAiTokenCountEstimator estimator = new AzureOpenAiTokenCountEstimator(
    AzureOpenAiEmbeddingModelName.TEXT_EMBEDDING_ADA_002
);

List<String> documents = loadDocuments();
int totalTokens = 0;
List<String> validDocs = new ArrayList<>();

for (String doc : documents) {
    int tokens = estimator.estimateTokenCountInText(doc);
    totalTokens += tokens;

    if (tokens > 8191) {
        // Document exceeds embedding model limit
        System.err.printf("Document too large: %d tokens (max 8191)%n", tokens);
        // Split or skip
    } else {
        validDocs.add(doc);
    }
}

System.out.printf("Total tokens across %d documents: %d%n",
    validDocs.size(), totalTokens);

Batch Request Planning

AzureOpenAiTokenCountEstimator estimator = new AzureOpenAiTokenCountEstimator(
    AzureOpenAiEmbeddingModelName.TEXT_EMBEDDING_3_SMALL
);

List<String> texts = getAllTexts();
List<List<String>> batches = new ArrayList<>();
List<String> currentBatch = new ArrayList<>();
int currentBatchTokens = 0;
int maxBatchTokens = 8000;  // Safety margin under the 8191-token model limit
int maxBatchSize = 16;      // API limit on inputs per request

for (String text : texts) {
    int textTokens = estimator.estimateTokenCountInText(text);

    // Close the current batch when adding this text would exceed a limit.
    // The isEmpty() guard prevents flushing an EMPTY batch when a single
    // text alone exceeds maxBatchTokens (first iteration or right after a
    // flush) — otherwise an empty list would be handed to embedBatch below.
    if (!currentBatch.isEmpty() &&
        (currentBatchTokens + textTokens > maxBatchTokens ||
         currentBatch.size() >= maxBatchSize)) {
        batches.add(currentBatch);
        currentBatch = new ArrayList<>();
        currentBatchTokens = 0;
    }

    currentBatch.add(text);
    currentBatchTokens += textTokens;
}

// Add the final partially-filled batch
if (!currentBatch.isEmpty()) {
    batches.add(currentBatch);
}

// Process batches
for (List<String> batch : batches) {
    embedBatch(batch);
}

Token Counting Accuracy

Estimation accuracy:

  • Text tokenization: Very accurate (±1-2 tokens, ~0.1% error)
  • Single messages: Accurate (±2-5 tokens, ~1-2% error)
  • Message lists: Good approximation (±5-10 tokens, ~2-5% error)

Why estimates may differ from actual:

  • Message formatting varies slightly by model
  • Conversation structure overhead can vary
  • Special tokens (BOS, EOS) may be added
  • Tool/function call formatting adds tokens

Best practice: Always leave safety margin

int estimated = estimator.estimateTokenCountInMessages(messages);
int safeLimit = maxTokens - 100;  // 100-token safety margin
if (estimated > safeLimit) {
    trimMessages();
}

Model Token Limits

Chat Models

| Model             | Context Window | Notes                |
|-------------------|----------------|----------------------|
| GPT-3.5-turbo     | 4,096          | Input + output total |
| GPT-3.5-turbo-16k | 16,384         | Input + output total |
| GPT-4             | 8,192          | Input + output total |
| GPT-4-32k         | 32,768         | Input + output total |
| GPT-4-turbo       | 128,000        | Input + output total |
| GPT-4o            | 128,000        | Input + output total |

Embedding Models

| Model                  | Max Tokens Per Input |
|------------------------|----------------------|
| text-embedding-ada-002 | 8,191                |
| text-embedding-3-small | 8,191                |
| text-embedding-3-large | 8,191                |

Language Models

| Model                  | Context Window |
|------------------------|----------------|
| gpt-3.5-turbo-instruct | 4,096          |

Best Practices

Always Leave Margin

// Reserve tokens for response
int maxContextTokens = 4096;
int maxInputTokens = maxContextTokens - 1000;  // Reserve 1000 for response

if (estimator.estimateTokenCountInMessages(messages) > maxInputTokens) {
    messages = trimMessages(messages);
}

Cache Estimators

// GOOD: Create once, reuse
private static final AzureOpenAiTokenCountEstimator ESTIMATOR =
    new AzureOpenAiTokenCountEstimator(AzureOpenAiChatModelName.GPT_4);

// BAD: Creating repeatedly
for (String text : texts) {
    AzureOpenAiTokenCountEstimator estimator =
        new AzureOpenAiTokenCountEstimator("gpt-4");  // Wasteful!
    int tokens = estimator.estimateTokenCountInText(text);
}

Validate Before Requests

int tokens = estimator.estimateTokenCountInMessages(messages);
if (tokens > modelLimit) {
    throw new IllegalArgumentException(
        String.format("Message too long: %d tokens (limit: %d)",
            tokens, modelLimit)
    );
}

Implement Token Budgets

/**
 * Tracks a spendable token allowance for cost control.
 * Not thread-safe; synchronize externally if shared across threads.
 */
class TokenBudget {
    private int remaining;  // tokens still available to spend

    /** Starts a budget with {@code maxTokens} available. */
    TokenBudget(int maxTokens) {
        remaining = maxTokens;
    }

    /** @return true when at least {@code tokens} are still available */
    boolean canAfford(int tokens) {
        return tokens <= remaining;
    }

    /**
     * Deducts {@code tokens} from the budget.
     * @throws IllegalStateException when the budget cannot cover the request
     */
    void spend(int tokens) {
        if (!canAfford(tokens)) {
            throw new IllegalStateException("Insufficient token budget");
        }
        remaining -= tokens;
    }

    /** @return tokens left in the budget */
    int getRemaining() {
        return remaining;
    }

    /** Restores the budget to {@code maxTokens}. */
    void reset(int maxTokens) {
        remaining = maxTokens;
    }
}

Error Handling

try {
    int tokens = estimator.estimateTokenCountInText(text);
} catch (NullPointerException e) {
    // Text or message was null
    System.err.println("Input cannot be null");
} catch (IllegalArgumentException e) {
    // Invalid model name in constructor
    System.err.println("Invalid model: " + e.getMessage());
}

Install with Tessl CLI

npx tessl i tessl/maven-dev-langchain4j--langchain4j-azure-open-ai@1.11.0

docs

audio-transcription.md

chat-models.md

configuration.md

embedding-model.md

image-model.md

index.md

language-models.md

token-counting.md

tile.json