CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/maven-dev-langchain4j--langchain4j-bedrock

AWS Bedrock integration for LangChain4j enabling Java applications to interact with various LLM providers through a unified interface

Overview
Eval results
Files

docs/guides/embedding-use-cases.md

Embedding Use Cases

Embeddings are dense vector representations of text that capture semantic meaning. This guide covers common use cases beyond basic semantic search.

Text Classification

Use embeddings to classify text into categories by comparing query embeddings against labeled examples.

// All imports required by this example. The original listing omitted
// EmbeddingMatch and the java.util collections, so the snippet could not
// be copied and compiled as-is.
import dev.langchain4j.model.bedrock.BedrockCohereEmbeddingModel;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.data.embedding.Embedding;
import dev.langchain4j.store.embedding.EmbeddingMatch;
import dev.langchain4j.store.embedding.EmbeddingStore;
import dev.langchain4j.store.embedding.inmemory.InMemoryEmbeddingStore;
import software.amazon.awssdk.regions.Region;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Create embedding model
BedrockCohereEmbeddingModel model = BedrockCohereEmbeddingModel.builder()
    .region(Region.US_EAST_1)
    .model(BedrockCohereEmbeddingModel.Model.COHERE_EMBED_ENGLISH_V3)
    .build();

// A labeled training example: a text segment plus its category label.
// Fields are final — an example never changes after construction.
class LabeledExample {
    final TextSegment text;
    final String label;

    LabeledExample(String text, String label) {
        this.text = TextSegment.from(text);
        this.label = label;
    }
}

List<LabeledExample> trainingData = List.of(
    new LabeledExample("Your order has shipped", "shipping"),
    new LabeledExample("Package delivered successfully", "shipping"),
    new LabeledExample("I need a refund", "billing"),
    new LabeledExample("Charge on my card", "billing"),
    new LabeledExample("How do I reset my password?", "support"),
    new LabeledExample("Cannot log into my account", "support")
);

// Build embedding store: each vector is stored with its labeled example
EmbeddingStore<LabeledExample> store = new InMemoryEmbeddingStore<>();
for (LabeledExample example : trainingData) {
    Embedding embedding = model.embed(example.text).content();
    store.add(embedding, example);
}

// Classify new text by embedding it and retrieving the 3 nearest neighbors
String newText = "Where is my package?";
Embedding queryEmbedding = model.embed(newText).content();
List<EmbeddingMatch<LabeledExample>> matches = store.findRelevant(queryEmbedding, 3);

// Predict the label by majority vote over the nearest neighbors
Map<String, Integer> labelCounts = new HashMap<>();
for (EmbeddingMatch<LabeledExample> match : matches) {
    String label = match.embedded().label;
    labelCounts.merge(label, 1, Integer::sum);
}

String predictedLabel = labelCounts.entrySet().stream()
    .max(Map.Entry.comparingByValue())
    .map(Map.Entry::getKey)
    .orElse("unknown");

System.out.println("Predicted category: " + predictedLabel);

Document Clustering

Group similar documents together using embedding vectors and clustering algorithms.

import dev.langchain4j.model.bedrock.BedrockTitanEmbeddingModel;

// Titan Embed Text v2 producing compact (512-d) normalized vectors
BedrockTitanEmbeddingModel model = BedrockTitanEmbeddingModel.builder()
    .region(Region.US_EAST_1)
    .model(BedrockTitanEmbeddingModel.Types.TitanEmbedTextV2.getValue())
    .dimensions(512)
    .normalize(true)
    .build();

// Documents to group into clusters
List<TextSegment> documents = List.of(
    TextSegment.from("Machine learning fundamentals"),
    TextSegment.from("Deep learning neural networks"),
    TextSegment.from("Italian pasta recipes"),
    TextSegment.from("French cooking techniques"),
    TextSegment.from("AI model training"),
    TextSegment.from("Spanish cuisine guide")
);

// Embed every document and collect the raw vectors
List<float[]> vectors = documents.stream()
    .map(doc -> model.embed(doc).content().vector())
    .toList();

// Simple k-means clustering (example)
int k = 2; // number of clusters
List<List<Integer>> clusters = performKMeansClustering(vectors, k);

// Print each cluster together with its member documents
int clusterNumber = 0;
for (List<Integer> cluster : clusters) {
    clusterNumber++;
    System.out.println("Cluster " + clusterNumber + ":");
    for (int docIndex : cluster) {
        System.out.println("  - " + documents.get(docIndex).text());
    }
}

Similarity Scoring

Calculate similarity scores between texts for duplicate detection, paraphrase identification, or content recommendation.

import dev.langchain4j.model.bedrock.BedrockCohereEmbeddingModel;

// Cohere Embed English v3 model used to vectorize both texts
BedrockCohereEmbeddingModel model = BedrockCohereEmbeddingModel.builder()
    .region(Region.US_EAST_1)
    .model(BedrockCohereEmbeddingModel.Model.COHERE_EMBED_ENGLISH_V3)
    .build();

// Two paraphrased sentences whose closeness we want to quantify
String text1 = "The quick brown fox jumps over the lazy dog";
String text2 = "A fast brown fox leaps over a sleepy dog";

// Embed both texts and compare their raw vectors with cosine similarity
float[] vector1 = model.embed(text1).content().vector();
float[] vector2 = model.embed(text2).content().vector();

double similarity = cosineSimilarity(vector1, vector2);
System.out.println("Similarity score: " + similarity);

// Helper method for cosine similarity
/**
 * Computes the cosine similarity of two equal-length embedding vectors.
 *
 * @param vectorA first embedding vector
 * @param vectorB second embedding vector, must have the same length as vectorA
 * @return similarity in [-1, 1]; 0.0 when either vector is all zeros
 * @throws IllegalArgumentException if the vectors have different dimensions
 */
private static double cosineSimilarity(float[] vectorA, float[] vectorB) {
    // Guard against mismatched dimensions: the original looped over
    // vectorA.length only, risking ArrayIndexOutOfBoundsException (or a
    // silently truncated comparison) when the vectors differ in size.
    if (vectorA.length != vectorB.length) {
        throw new IllegalArgumentException(
            "Vector dimensions differ: " + vectorA.length + " vs " + vectorB.length);
    }

    double dotProduct = 0.0;
    double normA = 0.0;
    double normB = 0.0;

    for (int i = 0; i < vectorA.length; i++) {
        dotProduct += vectorA[i] * vectorB[i];
        normA += vectorA[i] * vectorA[i];
        normB += vectorB[i] * vectorB[i];
    }

    // A zero vector has no direction; define its similarity as 0.0 instead
    // of returning NaN from the 0/0 division the original performed.
    double denominator = Math.sqrt(normA) * Math.sqrt(normB);
    return denominator == 0.0 ? 0.0 : dotProduct / denominator;
}

Recommendation System

Build content recommendations by finding items similar to user preferences.

// An item paired with its similarity score to the user profile.
// (The original example referenced ScoredItem without defining it.)
record ScoredItem(String item, double score) {}

// Element-wise mean of a list of equal-length vectors; collapses a user's
// interaction history into one profile embedding.
// (The original example called averageEmbeddings without defining it.)
static float[] averageEmbeddings(List<float[]> embeddings) {
    int dimensions = embeddings.get(0).length;
    float[] average = new float[dimensions];
    for (float[] vector : embeddings) {
        for (int i = 0; i < dimensions; i++) {
            average[i] += vector[i];
        }
    }
    for (int i = 0; i < dimensions; i++) {
        average[i] /= embeddings.size();
    }
    return average;
}

// Create user profile from interaction history
List<String> userHistory = List.of(
    "Introduction to Python programming",
    "Advanced Python techniques",
    "Python data structures"
);

// Embed every history item
List<float[]> historyEmbeddings = new ArrayList<>();
for (String item : userHistory) {
    Embedding emb = model.embed(item).content();
    historyEmbeddings.add(emb.vector());
}

// Average embeddings to create the user profile vector
float[] userProfile = averageEmbeddings(historyEmbeddings);

// Candidate catalog to score against the profile
List<String> candidateItems = List.of(
    "Java programming basics",
    "Python machine learning",
    "JavaScript web development",
    "Python Django framework"
);

// Score every candidate by cosine similarity to the user profile
List<ScoredItem> recommendations = new ArrayList<>();
for (String item : candidateItems) {
    Embedding itemEmb = model.embed(item).content();
    double score = cosineSimilarity(userProfile, itemEmb.vector());
    recommendations.add(new ScoredItem(item, score));
}

// Sort by similarity score, highest first
recommendations.sort((a, b) -> Double.compare(b.score(), a.score()));

// Show the top three recommendations
System.out.println("Recommended items:");
for (ScoredItem rec : recommendations.subList(0, Math.min(3, recommendations.size()))) {
    System.out.println(rec.item() + " (score: " + rec.score() + ")");
}

Duplicate Detection

Identify near-duplicate content using embedding similarity thresholds.

// Define similarity threshold
double DUPLICATE_THRESHOLD = 0.95;

List<String> documents = List.of(
    "AWS Bedrock provides foundation models",
    "Amazon Bedrock offers foundational AI models",
    "Cloud computing with AWS services",
    "Bedrock gives access to foundation models"
);

// Find duplicates
List<float[]> embeddings = documents.stream()
    .map(doc -> model.embed(doc).content().vector())
    .toList();

System.out.println("Potential duplicates:");
for (int i = 0; i < documents.size(); i++) {
    for (int j = i + 1; j < documents.size(); j++) {
        double similarity = cosineSimilarity(embeddings.get(i), embeddings.get(j));
        if (similarity > DUPLICATE_THRESHOLD) {
            System.out.println("Documents " + i + " and " + j + ":");
            System.out.println("  - " + documents.get(i));
            System.out.println("  - " + documents.get(j));
            System.out.println("  Similarity: " + similarity);
        }
    }
}

Best Practices

Model Selection

  • Cohere Embed English v3: Best for English text, supports search optimization with input types
  • Titan Embed Text v2: Configurable dimensions, good for multilingual content
  • Use InputType.SEARCH_DOCUMENT for indexing and InputType.SEARCH_QUERY for queries (Cohere)

Performance Optimization

  • Batch embeddings with embedAll() for better throughput
  • Cache embeddings for frequently accessed content
  • Use normalized vectors for faster cosine similarity calculations
  • Consider dimension reduction (e.g., 512 dimensions) for large-scale applications

Quality Considerations

  • Clean and preprocess text before embedding (remove HTML, normalize whitespace)
  • Use domain-specific fine-tuning when available for specialized applications
  • Validate similarity thresholds with ground truth data
  • Monitor embedding quality with periodic evaluation on test sets

Related Documentation:

  • Semantic Search Guide for retrieval patterns
  • Embedding Models API for configuration options
  • Quick Start: Embeddings for basic examples

Install with Tessl CLI

npx tessl i tessl/maven-dev-langchain4j--langchain4j-bedrock@1.11.0

docs

index.md

README.md

tile.json