CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/npm-xenova--transformers

State-of-the-art Machine Learning for the web that runs Transformers directly in browsers with no server needed.

Overview
Eval results
Files

docs/utilities.md

Utilities

Comprehensive utility functions and classes for tensor operations, image processing, audio processing, and mathematical computations that support the core ML functionality in Transformers.js.

Capabilities

Tensor Operations

The Tensor class and related functions provide N-dimensional array operations optimized for machine learning tasks.

Tensor Class

Core tensor class providing multidimensional array functionality with ML-optimized operations.

/**
 * N-dimensional tensor class for machine learning operations
 */
class Tensor {
  /** Tensor dimensions */
  dims: number[];
  /** Data type of tensor elements */
  type: string;
  /** Raw tensor data */
  data: TypedArray | any[];
  /** Total number of elements */
  size: number;
  
  /**
   * Create a new tensor
   * @param type - Data type ('float32', 'int64', etc.)
   * @param data - Tensor data as typed array
   * @param dims - Tensor dimensions
   */
  constructor(type: string, data: TypedArray | any[], dims: number[]);
  
  /**
   * Get tensor item by index
   * @param index - Linear index into tensor
   * @returns Tensor value or sub-tensor
   */
  _getitem(index: number): number | Tensor;
  
  /**
   * Compute mean along specified dimensions
   * @param dim - Dimension(s) to reduce (null for all)
   * @param keepdim - Whether to keep reduced dimensions
   * @returns Tensor with mean values
   */
  mean(dim?: number | number[] | null, keepdim?: boolean): Tensor;
  
  /**
   * Permute tensor dimensions
   * @param dims - New dimension order
   * @returns Tensor with permuted dimensions
   */
  permute(dims: number[]): Tensor;
  
  /**
   * Remove dimensions of size 1
   * @param dim - Specific dimension to squeeze (optional)
   * @returns Tensor with squeezed dimensions
   */
  squeeze(dim?: number): Tensor;
  
  /**
   * Add dimension of size 1
   * @param dim - Position to insert new dimension
   * @returns Tensor with added dimension
   */
  unsqueeze(dim: number): Tensor;
  
  /**
   * Convert tensor to different data type
   * @param type - Target data type
   * @returns Tensor with converted type
   */
  to(type: string): Tensor;
}

Tensor Manipulation Functions

/**
 * Rearrange tensor dimensions
 * @param tensor - Input tensor
 * @param axes - New axis order
 * @returns Tensor with rearranged dimensions
 */
function permute(tensor: Tensor, axes: number[]): Tensor;

/**
 * Resize tensor using interpolation
 * @param input - Input tensor
 * @param size - Target size [height, width]
 * @param mode - Interpolation mode ('bilinear', 'nearest')
 * @param align_corners - Whether to align corners
 * @returns Resized tensor
 */
function interpolate(
  input: Tensor,
  size: [number, number],
  mode?: string,
  align_corners?: boolean
): Tensor;

/**
 * Apply mean pooling to embeddings using attention mask
 * @param last_hidden_state - Model hidden states
 * @param attention_mask - Attention mask tensor
 * @returns Mean-pooled embeddings
 */
function mean_pooling(
  last_hidden_state: Tensor,
  attention_mask: Tensor
): Tensor;

/**
 * Apply layer normalization
 * @param input - Input tensor
 * @param normalized_shape - Shape for normalization
 * @param options - Normalization parameters (weight, bias, eps)
 * @returns Normalized tensor
 */
function layer_norm(
  input: Tensor,
  normalized_shape: number[],
  options?: {
    weight?: Tensor;
    bias?: Tensor;
    eps?: number;
  }
): Tensor;

/**
 * Concatenate tensors along specified dimension
 * @param tensors - Array of tensors to concatenate
 * @param dim - Dimension to concatenate along (default: 0)
 * @returns Concatenated tensor
 */
function cat(tensors: Tensor[], dim?: number): Tensor;

/**
 * Stack tensors along new dimension
 * @param tensors - Array of tensors to stack
 * @param dim - Dimension to insert for stacking (default: 0)
 * @returns Stacked tensor
 */
function stack(tensors: Tensor[], dim?: number): Tensor;

/**
 * Compute standard deviation and mean
 * @param input - Input tensor
 * @param dim - Dimension to reduce over
 * @param correction - Bessel's correction (default: 1)
 * @param keepdim - Keep reduced dimensions
 * @returns Object with std and mean tensors
 */
function std_mean(
  input: Tensor,
  dim?: number | null,
  correction?: number,
  keepdim?: boolean
): { std: Tensor; mean: Tensor };

/**
 * Compute mean along dimensions
 * @param input - Input tensor
 * @param dim - Dimension to reduce over
 * @param keepdim - Keep reduced dimensions
 * @returns Mean tensor
 */
function mean(
  input: Tensor,
  dim?: number | null,
  keepdim?: boolean
): Tensor;

/**
 * Create tensor filled with ones
 * @param size - Tensor dimensions
 * @returns Tensor filled with ones
 */
function ones(size: number[]): Tensor;

/**
 * Create tensor of ones with same shape as input
 * @param tensor - Reference tensor for shape
 * @returns Tensor of ones with matching shape
 */
function ones_like(tensor: Tensor): Tensor;

/**
 * Quantize embedding tensor for reduced memory usage
 * @param tensor - Input embedding tensor
 * @param precision - Quantization precision ('binary', 'ubinary')
 * @returns Quantized tensor
 */
function quantize_embeddings(tensor: Tensor, precision: string): Tensor;

/**
 * Dynamic time warping algorithm for sequence alignment
 * @param matrix - Distance matrix
 * @returns DTW distance and alignment path
 */
function dynamicTimeWarping(matrix: number[][]): {
  distance: number;
  matrix: number[][];
};

Audio Processing

Audio processing utilities for speech and audio analysis tasks.

/**
 * Read audio file from URL or file path
 * @param url - Audio file URL or path
 * @param sampling_rate - Target sampling rate (default: 16000)
 * @returns Promise resolving to Float32Array audio data
 */
async function read_audio(
  url: string | URL,
  sampling_rate?: number
): Promise<Float32Array>;

/**
 * Generate Hanning window function
 * @param M - Window length
 * @returns Hanning window coefficients
 */
function hanning(M: number): Float64Array;

/**
 * Create mel-scale filter bank for spectrogram analysis
 * @param num_frequency_bins - Number of frequency bins
 * @param num_mel_filters - Number of mel filters
 * @param min_frequency - Minimum frequency
 * @param max_frequency - Maximum frequency
 * @param sampling_rate - Audio sampling rate
 * @param norm - Normalization method (optional)
 * @param mel_scale - Mel scale type (optional)
 * @param triangularize_in_mel_space - Whether to triangularize in mel space
 * @returns Mel filter bank matrix
 */
function mel_filter_bank(
  num_frequency_bins: number,
  num_mel_filters: number,
  min_frequency: number,
  max_frequency: number,
  sampling_rate: number,
  norm?: string | null,
  mel_scale?: string,
  triangularize_in_mel_space?: boolean
): number[][];

/**
 * Compute spectrogram using Short-Time Fourier Transform
 * @param waveform - Input audio waveform
 * @param window - Window function
 * @param frame_length - Length of each frame
 * @param hop_length - Number of samples between frames
 * @param options - Additional STFT options
 * @returns Complex spectrogram tensor
 */
function spectrogram(
  waveform: Float32Array | Float64Array,
  window: Float64Array,
  frame_length: number,
  hop_length: number,
  options?: {
    fft_length?: number;
    power?: number;
    center?: boolean;
    pad_mode?: string;
    normalized?: boolean;
  }
): { data: Float32Array; dims: number[] };

/**
 * Generate window function for audio processing
 * @param window_length - Length of the window
 * @param name - Window type ('hann', 'hamming', 'blackman', etc.)
 * @param options - Additional window options
 * @returns Window function coefficients
 */
function window_function(
  window_length: number,
  name: string,
  options?: {
    symmetric?: boolean;
    dtype?: string;
  }
): Float64Array;

Mathematical Operations

Core mathematical functions and classes for ML computations.

FFT Class

Fast Fourier Transform implementation for frequency domain analysis.

/**
 * Fast Fourier Transform implementation
 */
class FFT {
  /** FFT length */
  readonly fft_length: number;
  
  /**
   * Create FFT instance
   * @param fft_length - Transform length (must be power of 2)
   */
  constructor(fft_length: number);
  
  /**
   * Compute real-valued FFT
   * @param out - Output buffer for complex results
   * @param input - Real input signal
   */
  realTransform(out: Float32Array, input: Float32Array): void;
  
  /**
   * Compute complex FFT
   * @param out - Output buffer for complex results
   * @param input - Complex input signal
   */
  transform(out: Float32Array, input: Float32Array): void;
}

Mathematical Utility Functions

/**
 * Apply softmax activation function
 * @param arr - Input array
 * @returns Softmax probabilities
 */
function softmax(arr: number[]): number[];

/**
 * Apply log softmax activation function
 * @param arr - Input array
 * @returns Log softmax values
 */
function log_softmax(arr: number[]): number[];

/**
 * Compute dot product of two arrays
 * @param arr1 - First array
 * @param arr2 - Second array
 * @returns Dot product result
 */
function dot(arr1: number[], arr2: number[]): number;

/**
 * Compute cosine similarity between two vectors
 * @param arr1 - First vector
 * @param arr2 - Second vector
 * @returns Cosine similarity (-1 to 1)
 */
function cos_sim(arr1: number[], arr2: number[]): number;

/**
 * Compute magnitude (L2 norm) of a vector
 * @param arr - Input vector
 * @returns Vector magnitude
 */
function magnitude(arr: number[]): number;

/**
 * Find the minimum value of an array and the index at which it occurs
 * @param arr - Input array (must not be empty)
 * @returns Tuple of the form [min_value, min_index]
 */
function min(arr: number[]): [number, number];

/**
 * Find the maximum value of an array and the index at which it occurs
 * @param arr - Input array (must not be empty)
 * @returns Tuple of the form [max_value, max_index]
 */
function max(arr: number[]): [number, number];

/**
 * Get the top k items from an array of scores, sorted by descending score
 * @param items - Array of numeric scores; each score's position is its index
 * @param top_k - Number of top items to return (default: 0, which returns all items)
 * @returns Array of [index, score] pairs sorted by descending score
 */
function getTopItems(
  items: number[],
  top_k?: number
): Array<[number, number]>;

/**
 * Apply median filter to data
 * @param data - Input data array
 * @param windowSize - Filter window size
 * @returns Filtered data
 */
function medianFilter(data: number[], windowSize: number): number[];

/**
 * Round number to specified decimal places
 * @param num - Number to round
 * @param decimals - Number of decimal places
 * @returns Rounded number
 */
function round(num: number, decimals: number): number;

/**
 * Apply banker's rounding (round half to even)
 * @param x - Number to round
 * @returns Rounded number
 */
function bankers_round(x: number): number;

Image Processing

The RawImage class provides comprehensive image manipulation capabilities optimized for ML preprocessing.

RawImage Class

/**
 * Image processing class for ML preprocessing
 */
class RawImage {
  /** Image pixel data */
  data: Uint8ClampedArray;
  /** Image width in pixels */
  width: number;
  /** Image height in pixels */
  height: number;
  /** Number of color channels (1-4) */
  channels: number;
  
  /**
   * Create new RawImage instance
   * @param data - Pixel data array
   * @param width - Image width
   * @param height - Image height
   * @param channels - Number of channels (1=grayscale, 3=RGB, 4=RGBA)
   */
  constructor(
    data: Uint8ClampedArray,
    width: number,
    height: number,
    channels: number
  );
  
  /**
   * Get image dimensions
   * @returns [width, height] tuple
   */
  get size(): [number, number];
  
  /**
   * Load image from URL, file path, or buffer
   * @param input - Image source (URL, path, or buffer)
   * @returns Promise resolving to RawImage instance
   */
  static async read(input: string | URL | Buffer): Promise<RawImage>;
  
  /**
   * Load image from URL
   * @param url - Image URL
   * @returns Promise resolving to RawImage instance
   */
  static async fromURL(url: string | URL): Promise<RawImage>;
  
  /**
   * Create blank image filled with color
   * @param width - Image width
   * @param height - Image height
   * @param channels - Number of channels
   * @param color - Fill color (default: black)
   * @returns New RawImage instance
   */
  static zeros(
    width: number,
    height: number,
    channels: number,
    color?: number
  ): RawImage;
  
  /**
   * Resize image to new dimensions
   * @param width - Target width
   * @param height - Target height
   * @param options - Resize options (resample method)
   * @returns Resized RawImage
   */
  resize(
    width: number,
    height: number,
    options?: { resample?: number }
  ): RawImage;
  
  /**
   * Crop rectangular region from image
   * @param left - Left coordinate
   * @param top - Top coordinate
   * @param width - Crop width
   * @param height - Crop height
   * @returns Cropped RawImage
   */
  crop(left: number, top: number, width: number, height: number): RawImage;
  
  /**
   * Convert between color spaces/channel counts
   * @param channels - Target number of channels
   * @returns Converted RawImage
   */
  convert(channels: number): RawImage;
  
  /**
   * Flip image horizontally
   * @returns Horizontally flipped RawImage
   */
  flip(): RawImage;
  
  /**
   * Apply center crop to make image square
   * @param crop_size - Size of square crop
   * @returns Center-cropped RawImage
   */
  center_crop(crop_size: number): RawImage;
  
  /**
   * Convert image to tensor format for ML models
   * @param channel_format - Channel ordering ('CHW' or 'HWC')
   * @returns Image tensor
   */
  toTensor(channel_format?: string): Tensor;
  
  /**
   * Save image to file (Node.js only)
   * @param path - Output file path
   */
  save(path: string): Promise<void>;
  
  /**
   * Clone the image
   * @returns New RawImage instance with same data
   */
  clone(): RawImage;
}

Audio Processing

Audio utility functions for loading and preprocessing audio data for speech recognition and audio classification tasks.

/**
 * Load and decode an audio file
 * @param url - Audio file URL or path
 * @param sampling_rate - Target sampling rate (default: 16000)
 * @returns Promise resolving to the decoded audio samples as a Float32Array
 */
async function read_audio(
  url: string | URL,
  sampling_rate?: number
): Promise<Float32Array>;

/**
 * Generate Hanning window for audio processing
 * @param M - Window length
 * @returns Hanning window coefficients
 */
function hanning(M: number): Float64Array;

/**
 * Create mel-scale filter bank for audio feature extraction
 * @param num_frequency_bins - Number of frequency bins
 * @param num_mel_filters - Number of mel filters
 * @param min_frequency - Minimum frequency
 * @param max_frequency - Maximum frequency
 * @param sampling_rate - Audio sampling rate
 * @param norm - Normalization method
 * @param mel_scale - Mel scale type
 * @returns Mel filter bank matrix
 */
function mel_filter_bank(
  num_frequency_bins: number,
  num_mel_filters: number,
  min_frequency: number,
  max_frequency: number,
  sampling_rate: number,
  norm?: string,
  mel_scale?: string
): number[][];

/**
 * Compute spectrogram from audio signal
 * @param waveform - Audio waveform data
 * @param window - Window function coefficients
 * @param frame_length - Frame length for STFT
 * @param hop_length - Hop length between frames
 * @param options - Additional spectrogram options
 * @returns Spectrogram as a flat Float32Array (`data`) together with its shape (`dims`)
 */
function spectrogram(
  waveform: Float32Array | Float64Array,
  window: Float32Array | Float64Array,
  frame_length: number,
  hop_length: number,
  options?: {
    fft_length?: number;
    power?: number;
    center?: boolean;
    pad_mode?: string;
    onesided?: boolean;
  }
): { data: Float32Array; dims: number[] };

/**
 * Generate window function for audio processing
 * @param window_length - Length of window
 * @param name - Window type ('hann', 'hamming', etc.)
 * @param options - Window parameters
 * @returns Window function coefficients
 */
function window_function(
  window_length: number,
  name: string,
  options?: {
    periodic?: boolean;
    beta?: number;
    dtype?: string;
  }
): Float64Array;

Mathematical Functions

Core mathematical operations for machine learning computations.

/**
 * Apply softmax function to array
 * @param arr - Input array
 * @returns Softmax-normalized array of the same length as the input
 */
function softmax(arr: number[]): number[];

/**
 * Apply log softmax function to array
 * @param arr - Input array
 * @returns Log softmax values, one per input element
 */
function log_softmax(arr: number[]): number[];

/**
 * Compute dot product of two arrays
 * @param arr1 - First array
 * @param arr2 - Second array
 * @returns Dot product result
 */
function dot(arr1: number[], arr2: number[]): number;

/**
 * Get top-k items from an array of scores
 * @param items - Array of numeric scores; each score's position is its index
 * @param top_k - Number of top items (0 for all)
 * @returns [index, score] pairs sorted by descending score
 */
function getTopItems(
  items: number[],
  top_k?: number
): Array<[number, number]>;

/**
 * Compute cosine similarity between two vectors
 * @param arr1 - First vector
 * @param arr2 - Second vector
 * @returns Cosine similarity score
 */
function cos_sim(arr1: number[], arr2: number[]): number;

/**
 * Compute vector magnitude (L2 norm)
 * @param arr - Input vector
 * @returns Vector magnitude
 */
function magnitude(arr: number[]): number;

/**
 * Find minimum value in array, along with its position
 * @param arr - Input array (must not be empty)
 * @returns Tuple of the form [min_value, min_index]
 */
function min(arr: number[]): [number, number];

/**
 * Find maximum value in array, along with its position
 * @param arr - Input array (must not be empty)
 * @returns Tuple of the form [max_value, max_index]
 */
function max(arr: number[]): [number, number];

/**
 * Apply median filter to data
 * @param data - Input data array
 * @param windowSize - Size of median filter window
 * @returns Filtered data
 */
function medianFilter(data: number[], windowSize: number): number[];

/**
 * Round number to specified decimal places
 * @param num - Number to round
 * @param decimals - Number of decimal places
 * @returns Rounded number
 */
function round(num: number, decimals: number): number;

/**
 * Interpolate array data to new dimensions
 * @param input - Input data array
 * @param input_shape - Input dimensions [channels, height, width]
 * @param output_shape - Output dimensions [height, width]
 * @param mode - Interpolation mode ('bilinear', 'nearest')
 * @param align_corners - Whether to align corners
 * @returns Interpolated data array
 */
function interpolate_data(
  input: number[],
  input_shape: [number, number, number],
  output_shape: [number, number],
  mode?: string,
  align_corners?: boolean
): number[];

/**
 * Permute array data dimensions
 * @param array - Input data array
 * @param dims - Original dimensions
 * @param axes - New axis order
 * @returns Permuted data array
 */
function permute_data(
  array: number[],
  dims: number[],
  axes: number[]
): number[];

FFT Class

Fast Fourier Transform implementation for frequency domain analysis.

/**
 * Fast Fourier Transform implementation
 */
class FFT {
  /**
   * Create FFT instance
   * @param fft_length - Length of FFT
   */
  constructor(fft_length: number);
  
  /**
   * Compute forward FFT
   * @param signal - Input signal (real or complex)
   * @returns FFT coefficients
   */
  forward(signal: number[] | Complex[]): Complex[];
  
  /**
   * Compute inverse FFT
   * @param spectrum - Frequency domain coefficients
   * @returns Time domain signal
   */
  inverse(spectrum: Complex[]): Complex[];
}

interface Complex {
  real: number;
  imag: number;
}

Usage Examples

Basic Tensor Operations

import { Tensor, cat, stack, mean_pooling } from "@xenova/transformers";

// Create tensors
const tensor1 = new Tensor("float32", new Float32Array([1, 2, 3, 4]), [2, 2]);
const tensor2 = new Tensor("float32", new Float32Array([5, 6, 7, 8]), [2, 2]);

// Concatenate tensors
const concatenated = cat([tensor1, tensor2], 0); // Shape: [4, 2]

// Stack tensors
const stacked = stack([tensor1, tensor2], 0); // Shape: [2, 2, 2]

// Compute mean
const mean_tensor = tensor1.mean(); // Scalar mean
const row_means = tensor1.mean(1); // Row-wise means

Image Processing

import { RawImage } from "@xenova/transformers";

// Load image from URL
const image = await RawImage.fromURL("https://example.com/image.jpg");

// Resize and crop
const resized = image.resize(224, 224);
const cropped = resized.center_crop(224);

// Convert to tensor for model input
const tensor = cropped.toTensor("CHW"); // Channel-Height-Width format

// Create blank image
const blank = RawImage.zeros(100, 100, 3); // 100x100 RGB image

Audio Processing

import { read_audio, spectrogram, hanning } from "@xenova/transformers";

// Load audio file; read_audio resolves directly to the Float32Array of samples,
// so the result must not be destructured
const audio = await read_audio("audio.wav", 16000);

// Create window function
const window = hanning(512);

// Compute spectrogram
const spec = spectrogram(audio, window, 512, 256, {
  fft_length: 512,
  power: 2.0,
});

Mathematical Operations

import { softmax, cos_sim, getTopItems } from "@xenova/transformers";

// Apply softmax
const logits = [2.0, 1.0, 0.1];
const probabilities = softmax(logits);

// Compute cosine similarity
const vec1 = [1, 2, 3];
const vec2 = [4, 5, 6];
const similarity = cos_sim(vec1, vec2);

// Get top-k results: getTopItems takes raw numeric scores and returns
// [index, score] pairs sorted by descending score, so labels are kept
// in a parallel array and looked up by index afterwards
const labels = ["cat", "dog", "bird"];
const scores = [0.9, 0.8, 0.3];
const top2 = getTopItems(scores, 2); // [[0, 0.9], [1, 0.8]]
const top2Labelled = top2.map(([index, score]) => ({
  label: labels[index],
  score,
}));

Types

/**
 * Union of the built-in JavaScript typed-array classes accepted as raw data.
 *
 * There is no `Int64Array` in JavaScript; 64-bit integer data is held in
 * `BigInt64Array` / `BigUint64Array`. `Uint8ClampedArray` is included because
 * `RawImage.data` uses it.
 */
type TypedArray =
  | Float32Array
  | Float64Array
  | Int8Array
  | Uint8Array
  | Uint8ClampedArray
  | Int16Array
  | Uint16Array
  | Int32Array
  | Uint32Array
  | BigInt64Array
  | BigUint64Array;

interface Complex {
  real: number;
  imag: number;
}

Install with Tessl CLI

npx tessl i tessl/npm-xenova--transformers

docs

index.md

models-tokenizers.md

pipelines.md

processors.md

utilities.md

tile.json