CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/npm-xenova--transformers

State-of-the-art Machine Learning for the web that runs Transformers directly in browsers with no server needed.

Overview
Eval results
Files

docs/processors.md

Processors

Processors are used to prepare non-textual inputs (image or audio) for machine learning models. They handle the preprocessing steps required to convert raw data into the format expected by specific model architectures.

Capabilities

Auto Processor

Automatically selects and loads the appropriate processor implementation based on the model configuration.

/**
 * Automatically instantiate a processor based on the model type
 * @param pretrained_model_name_or_path - Model identifier or path
 * @param options - Configuration options for processor loading
 * @returns Promise resolving to the appropriate processor instance
 */
class AutoProcessor {
  static async from_pretrained(
    pretrained_model_name_or_path: string,
    options?: ProcessorOptions
  ): Promise<Processor>;
}

interface ProcessorOptions {
  /** Use quantized version of the model (default: true) */
  quantized?: boolean;
  /** Callback to track model download progress */
  progress_callback?: (progress: any) => void;
  /** Custom model configuration */
  config?: any;
  /** Directory to cache downloaded models */
  cache_dir?: string;
  /** Only use local files, don't download from remote */
  local_files_only?: boolean;
  /** Model revision/branch to use (default: 'main') */
  revision?: string;
}

Usage Example:

import { AutoProcessor, read_audio } from "@xenova/transformers";

// Load a Whisper audio processor
const processor = await AutoProcessor.from_pretrained("Xenova/whisper-tiny.en");

// Process audio for speech recognition
const audio = await read_audio("audio.wav", 16000);
const { input_features } = await processor(audio);

Base Processor Class

Base class for all processor implementations providing common functionality.

/**
 * Base processor class for preprocessing inputs
 */
class Processor {
  /** Processor configuration */
  config: any;
  
  /**
   * Process inputs based on processor type
   * @param input - Input data (image, audio, etc.)
   * @param options - Processing options
   * @returns Processed data ready for model input
   */
  (input: any, options?: any): Promise<any>;
  
  /**
   * Clean up processor resources
   */
  dispose(): Promise<void>;
}

Image Processors

Processors specialized for handling image inputs, including resizing, normalization, and format conversion.

/**
 * Image processor for vision models
 */
class ImageProcessor extends Processor {
  /**
   * Process image inputs for vision models
   * @param images - Input images (RawImage, URL, Buffer, etc.)
   * @param options - Image processing options
   * @returns Processed image features
   */
  (images: ImageInput | ImageInput[], options?: ImageProcessorOptions): Promise<{
    pixel_values: Tensor;
  }>;
}

interface ImageProcessorOptions {
  /** Target image size for resizing */
  size?: { height: number; width: number };
  /** Whether to normalize pixel values */
  do_normalize?: boolean;
  /** Mean values for normalization */
  image_mean?: number[];
  /** Standard deviation values for normalization */
  image_std?: number[];
  /** Whether to resize images */
  do_resize?: boolean;
  /** Resizing algorithm */
  resample?: number;
  /** Whether to center crop images */
  do_center_crop?: boolean;
  /** Crop size for center cropping */
  crop_size?: { height: number; width: number };
}

type ImageInput = string | URL | RawImage | Buffer;

Audio Processors

Processors for handling audio inputs, including feature extraction and spectrogram generation.

/**
 * Audio processor for speech and audio models
 */
class AudioProcessor extends Processor {
  /**
   * Process audio inputs for speech/audio models
   * @param audio - Input audio data
   * @param options - Audio processing options
   * @returns Processed audio features
   */
  (audio: AudioInput, options?: AudioProcessorOptions): Promise<{
    input_features?: Tensor;
    input_values?: Tensor;
  }>;
}

interface AudioProcessorOptions {
  /** Target sampling rate */
  sampling_rate?: number;
  /** Whether to normalize audio */
  do_normalize?: boolean;
  /** Whether to pad audio to fixed length */
  do_pad?: boolean;
  /** Maximum audio length */
  max_length?: number;
}

type AudioInput = Float32Array | number[] | string | URL;

Whisper Processor

Specialized processor for Whisper speech recognition models.

/**
 * Whisper processor for automatic speech recognition
 */
class WhisperProcessor extends AudioProcessor {
  /**
   * Process audio for Whisper models
   * @param audio - Input audio (16kHz sampling rate expected)
   * @param options - Whisper-specific processing options
   * @returns Log mel-spectrogram features
   */
  (audio: AudioInput, options?: WhisperProcessorOptions): Promise<{
    input_features: Tensor;
  }>;
}

interface WhisperProcessorOptions extends AudioProcessorOptions {
  /** Size of the FFT window in samples (default: 400) */
  n_fft?: number;
  /** Hop length for STFT (default: 160) */
  hop_length?: number;
  /** Number of samples per chunk (default: 480000) */
  chunk_length?: number;
}

CLIP Processor

Multimodal processor for CLIP vision-language models that handles both images and text.

/**
 * CLIP processor for vision-language models
 */
class CLIPProcessor extends Processor {
  /** Image processor component */
  image_processor: ImageProcessor;
  /** Text tokenizer component */
  tokenizer: PreTrainedTokenizer;
  
  /**
   * Process text and/or images for CLIP models
   * @param options - Input data and processing options
   * @returns Processed features for both modalities
   */
  (options: CLIPProcessorInput): Promise<CLIPProcessorOutput>;
}

interface CLIPProcessorInput {
  /** Input text (optional) */
  text?: string | string[];
  /** Input images (optional) */
  images?: ImageInput | ImageInput[];
  /** Text processing options */
  text_options?: any;
  /** Image processing options */
  image_options?: ImageProcessorOptions;
}

interface CLIPProcessorOutput {
  /** Text input IDs (if text provided) */
  input_ids?: Tensor;
  /** Text attention mask (if text provided) */
  attention_mask?: Tensor;
  /** Image pixel values (if images provided) */
  pixel_values?: Tensor;
}

Utility Functions

Helper functions for data preprocessing and conversion.

/**
 * Convert bounding boxes from center format to corners format
 * @param coords - Center coordinates [center_x, center_y, width, height]
 * @returns Corner coordinates [top_left_x, top_left_y, bottom_right_x, bottom_right_y]
 */
function center_to_corners_format(coords: [number, number, number, number]): [number, number, number, number];

/**
 * Post-process object detection outputs
 * @param outputs - Raw model outputs
 * @param threshold - Score threshold for filtering (default: 0.5)
 * @param target_sizes - Original image sizes for coordinate scaling
 * @param is_zero_shot - Whether this is zero-shot detection
 * @returns Processed detection results
 */
function post_process_object_detection(
  outputs: { logits: Tensor; pred_boxes: Tensor },
  threshold?: number,
  target_sizes?: number[][],
  is_zero_shot?: boolean
): Array<{
  boxes: number[][];
  classes: number[];
  scores: number[];
}>;

Types

interface Processor {
  config: any;
  (input: any, options?: any): Promise<any>;
  dispose(): Promise<void>;
}

interface ProcessorComponents {
  image_processor?: ImageProcessor;
  tokenizer?: PreTrainedTokenizer;
  feature_extractor?: AudioProcessor;
}

Install with Tessl CLI

npx tessl i tessl/npm-xenova--transformers

docs

index.md

models-tokenizers.md

pipelines.md

processors.md

utilities.md

tile.json