Universal Sentence Encoder for generating text embeddings using TensorFlow.js
npx @tessl/cli install tessl/npm-tensorflow-models--universal-sentence-encoder@1.3.0

The Universal Sentence Encoder provides TensorFlow.js implementations for converting text into high-dimensional embeddings. It includes both the standard USE model, which generates 512-dimensional embeddings for general text similarity and clustering tasks, and the USE QnA model, which creates 100-dimensional embeddings specifically optimized for question-answering applications.
npm install @tensorflow/tfjs @tensorflow-models/universal-sentence-encoder

For ES modules:

import * as use from '@tensorflow-models/universal-sentence-encoder';

For CommonJS:
const use = require('@tensorflow-models/universal-sentence-encoder');

Usage example:

import * as use from '@tensorflow-models/universal-sentence-encoder';
// Load the model
const model = await use.load();
// Embed sentences
const sentences = [
'Hello.',
'How are you?'
];
const embeddings = await model.embed(sentences);
// embeddings is a 2D tensor with shape [2, 512]
embeddings.print();

Universal Sentence Encoder is built around several key components:
Core Universal Sentence Encoder functionality for generating 512-dimensional embeddings from text. Ideal for semantic similarity, clustering, and general NLP tasks.
function load(config?: LoadConfig): Promise<UniversalSentenceEncoder>;
interface LoadConfig {
modelUrl?: string;
vocabUrl?: string;
}
class UniversalSentenceEncoder {
embed(inputs: string[] | string): Promise<tf.Tensor2D>;
}

Specialized Universal Sentence Encoder for question-answering applications, generating 100-dimensional embeddings optimized for matching questions with answers.
function loadQnA(): Promise<UniversalSentenceEncoderQnA>;
class UniversalSentenceEncoderQnA {
embed(input: ModelInput): ModelOutput;
}
interface ModelInput {
queries: string[];
responses: string[];
contexts?: string[];
}
interface ModelOutput {
queryEmbedding: tf.Tensor;
responseEmbedding: tf.Tensor;
}

Independent tokenizer functionality using the SentencePiece algorithm for converting text into token sequences. It can be used separately from the embedding models.
function loadTokenizer(pathToVocabulary?: string): Promise<Tokenizer>;
function loadVocabulary(pathToVocabulary: string): Promise<Vocabulary>;
function stringToChars(input: string): string[];
class Tokenizer {
constructor(vocabulary: Vocabulary, reservedSymbolsCount?: number);
encode(input: string): number[];
}
class Trie {
constructor();
insert(word: string, score: number, index: number): void;
commonPrefixSearch(symbols: string[]): Array<[string[], number, number]>;
}

// TensorFlow.js tensors
import * as tf from '@tensorflow/tfjs-core';
// Core types
type Vocabulary = Array<[string, number]>;
// Version information
const version: string;