0
# Utilities
1
2
Comprehensive utility functions and classes for tensor operations, image processing, audio processing, and mathematical computations that support the core ML functionality in Transformers.js.
3
4
## Capabilities
5
6
### Tensor Operations
7
8
The Tensor class and related functions provide N-dimensional array operations optimized for machine learning tasks.
9
10
#### Tensor Class
11
12
Core tensor class providing multidimensional array functionality with ML-optimized operations.
13
14
```javascript { .api }
15
/**
16
* N-dimensional tensor class for machine learning operations
17
*/
18
class Tensor {
19
/** Tensor dimensions */
20
dims: number[];
21
/** Data type of tensor elements */
22
type: string;
23
/** Raw tensor data */
24
data: TypedArray | any[];
25
/** Total number of elements */
26
size: number;
27
28
/**
29
* Create a new tensor
30
* @param type - Data type ('float32', 'int64', etc.)
31
* @param data - Tensor data as typed array
32
* @param dims - Tensor dimensions
33
*/
34
constructor(type: string, data: TypedArray | any[], dims: number[]);
35
36
/**
37
* Get tensor item by index
38
* @param index - Index along the first dimension of the tensor
39
* @returns Tensor value or sub-tensor
40
*/
41
_getitem(index: number): number | Tensor;
42
43
/**
44
* Compute mean along specified dimensions
45
* @param dim - Dimension(s) to reduce (null for all)
46
* @param keepdim - Whether to keep reduced dimensions
47
* @returns Tensor with mean values
48
*/
49
mean(dim?: number | number[] | null, keepdim?: boolean): Tensor;
50
51
/**
52
* Permute tensor dimensions
53
* @param dims - New dimension order
54
* @returns Tensor with permuted dimensions
55
*/
56
permute(dims: number[]): Tensor;
57
58
/**
59
* Remove dimensions of size 1
60
* @param dim - Specific dimension to squeeze (optional)
61
* @returns Tensor with squeezed dimensions
62
*/
63
squeeze(dim?: number): Tensor;
64
65
/**
66
* Add dimension of size 1
67
* @param dim - Position to insert new dimension
68
* @returns Tensor with added dimension
69
*/
70
unsqueeze(dim: number): Tensor;
71
72
/**
73
* Convert tensor to different data type
74
* @param type - Target data type
75
* @returns Tensor with converted type
76
*/
77
to(type: string): Tensor;
78
}
79
```
80
81
#### Tensor Manipulation Functions
82
83
```javascript { .api }
84
/**
85
* Rearrange tensor dimensions
86
* @param tensor - Input tensor
87
* @param axes - New axis order
88
* @returns Tensor with rearranged dimensions
89
*/
90
function permute(tensor: Tensor, axes: number[]): Tensor;
91
92
/**
93
* Resize tensor using interpolation
94
* @param input - Input tensor
95
* @param size - Target size [height, width]
96
* @param mode - Interpolation mode ('bilinear', 'nearest')
97
* @param align_corners - Whether to align corners
98
* @returns Resized tensor
99
*/
100
function interpolate(
101
input: Tensor,
102
size: [number, number],
103
mode?: string,
104
align_corners?: boolean
105
): Tensor;
106
107
/**
108
* Apply mean pooling to embeddings using attention mask
109
* @param last_hidden_state - Model hidden states
110
* @param attention_mask - Attention mask tensor
111
* @returns Mean-pooled embeddings
112
*/
113
function mean_pooling(
114
last_hidden_state: Tensor,
115
attention_mask: Tensor
116
): Tensor;
117
118
/**
119
* Apply layer normalization
120
* @param input - Input tensor
121
* @param normalized_shape - Shape for normalization
122
* @param options - Normalization parameters (weight, bias, eps)
123
* @returns Normalized tensor
124
*/
125
function layer_norm(
126
input: Tensor,
127
normalized_shape: number[],
128
options?: {
129
weight?: Tensor;
130
bias?: Tensor;
131
eps?: number;
132
}
133
): Tensor;
134
135
/**
136
* Concatenate tensors along specified dimension
137
* @param tensors - Array of tensors to concatenate
138
* @param dim - Dimension to concatenate along (default: 0)
139
* @returns Concatenated tensor
140
*/
141
function cat(tensors: Tensor[], dim?: number): Tensor;
142
143
/**
144
* Stack tensors along new dimension
145
* @param tensors - Array of tensors to stack
146
* @param dim - Dimension to insert for stacking (default: 0)
147
* @returns Stacked tensor
148
*/
149
function stack(tensors: Tensor[], dim?: number): Tensor;
150
151
/**
152
* Compute standard deviation and mean
153
* @param input - Input tensor
154
* @param dim - Dimension to reduce over
155
* @param correction - Bessel's correction (default: 1)
156
* @param keepdim - Keep reduced dimensions
157
* @returns Object with std and mean tensors
158
*/
159
function std_mean(
160
input: Tensor,
161
dim?: number | null,
162
correction?: number,
163
keepdim?: boolean
164
): { std: Tensor; mean: Tensor };
165
166
/**
167
* Compute mean along dimensions
168
* @param input - Input tensor
169
* @param dim - Dimension to reduce over
170
* @param keepdim - Keep reduced dimensions
171
* @returns Mean tensor
172
*/
173
function mean(
174
input: Tensor,
175
dim?: number | null,
176
keepdim?: boolean
177
): Tensor;
178
179
/**
180
* Create tensor filled with ones
181
* @param size - Tensor dimensions
182
* @returns Tensor filled with ones
183
*/
184
function ones(size: number[]): Tensor;
185
186
/**
187
* Create tensor of ones with same shape as input
188
* @param tensor - Reference tensor for shape
189
* @returns Tensor of ones with matching shape
190
*/
191
function ones_like(tensor: Tensor): Tensor;
192
193
/**
194
* Quantize embedding tensor for reduced memory usage
195
* @param tensor - Input embedding tensor
196
* @param precision - Quantization precision ('binary', 'ubinary')
197
* @returns Quantized tensor
198
*/
199
function quantize_embeddings(tensor: Tensor, precision: string): Tensor;
200
201
/**
202
* Dynamic time warping algorithm for sequence alignment
203
* @param matrix - Distance matrix
204
* @returns DTW distance and alignment path
205
*/
206
function dynamicTimeWarping(matrix: number[][]): {
207
distance: number;
208
matrix: number[][];
209
};
210
```
211
212
### Audio Processing
213
214
Audio processing utilities for speech and audio analysis tasks.
215
216
```javascript { .api }
217
/**
218
* Read audio file from URL or file path
219
* @param url - Audio file URL or path
220
* @param sampling_rate - Target sampling rate (default: 16000)
221
* @returns Promise resolving to Float32Array audio data
222
*/
223
async function read_audio(
224
url: string | URL,
225
sampling_rate?: number
226
): Promise<Float32Array>;
227
228
/**
229
* Generate Hanning window function
230
* @param M - Window length
231
* @returns Hanning window coefficients
232
*/
233
function hanning(M: number): Float64Array;
234
235
/**
236
* Create mel-scale filter bank for spectrogram analysis
237
* @param num_frequency_bins - Number of frequency bins
238
* @param num_mel_filters - Number of mel filters
239
* @param min_frequency - Minimum frequency
240
* @param max_frequency - Maximum frequency
241
* @param sampling_rate - Audio sampling rate
242
* @param norm - Normalization method (optional)
243
* @param mel_scale - Mel scale type (optional)
244
* @param triangularize_in_mel_space - Whether to triangularize in mel space
245
* @returns Mel filter bank matrix
246
*/
247
function mel_filter_bank(
248
num_frequency_bins: number,
249
num_mel_filters: number,
250
min_frequency: number,
251
max_frequency: number,
252
sampling_rate: number,
253
norm?: string | null,
254
mel_scale?: string,
255
triangularize_in_mel_space?: boolean
256
): number[][];
257
258
/**
259
* Compute spectrogram using Short-Time Fourier Transform
260
* @param waveform - Input audio waveform
261
* @param window - Window function
262
* @param frame_length - Length of each frame
263
* @param hop_length - Number of samples between frames
264
* @param options - Additional STFT options
265
* @returns Spectrogram values as a flat Float32Array (`data`) together with their shape (`dims`)
266
*/
267
function spectrogram(
268
waveform: Float32Array | Float64Array,
269
window: Float64Array,
270
frame_length: number,
271
hop_length: number,
272
options?: {
273
fft_length?: number;
274
power?: number;
275
center?: boolean;
276
pad_mode?: string;
277
normalized?: boolean;
278
}
279
): { data: Float32Array; dims: number[] };
280
281
/**
282
* Generate window function for audio processing
283
* @param window_length - Length of the window
284
* @param name - Window type ('hann', 'hamming', 'blackman', etc.)
285
* @param options - Additional window options
286
* @returns Window function coefficients
287
*/
288
function window_function(
289
window_length: number,
290
name: string,
291
options?: {
292
symmetric?: boolean;
293
dtype?: string;
294
}
295
): Float64Array;
296
```
297
298
### Mathematical Operations
299
300
Core mathematical functions and classes for ML computations.
301
302
#### FFT Class
303
304
Fast Fourier Transform implementation for frequency domain analysis.
305
306
```javascript { .api }
307
/**
308
* Fast Fourier Transform implementation
309
*/
310
class FFT {
311
/** FFT length */
312
readonly fft_length: number;
313
314
/**
315
* Create FFT instance
316
* @param fft_length - Transform length (must be power of 2)
317
*/
318
constructor(fft_length: number);
319
320
/**
321
* Compute real-valued FFT
322
* @param out - Output buffer for complex results
323
* @param input - Real input signal
324
*/
325
realTransform(out: Float32Array, input: Float32Array): void;
326
327
/**
328
* Compute complex FFT
329
* @param out - Output buffer for complex results
330
* @param input - Complex input signal
331
*/
332
transform(out: Float32Array, input: Float32Array): void;
333
}
334
```
335
336
#### Mathematical Utility Functions
337
338
```javascript { .api }
339
/**
340
* Apply softmax activation function
341
* @param arr - Input array
342
* @returns Softmax probabilities
343
*/
344
function softmax(arr: number[]): number[];
345
346
/**
347
* Apply log softmax activation function
348
* @param arr - Input array
349
* @returns Log softmax values
350
*/
351
function log_softmax(arr: number[]): number[];
352
353
/**
354
* Compute dot product of two arrays
355
* @param arr1 - First array
356
* @param arr2 - Second array
357
* @returns Dot product result
358
*/
359
function dot(arr1: number[], arr2: number[]): number;
360
361
/**
362
* Compute cosine similarity between two vectors
363
* @param arr1 - First vector
364
* @param arr2 - Second vector
365
* @returns Cosine similarity (-1 to 1)
366
*/
367
function cos_sim(arr1: number[], arr2: number[]): number;
368
369
/**
370
* Compute magnitude (L2 norm) of a vector
371
* @param arr - Input vector
372
* @returns Vector magnitude
373
*/
374
function magnitude(arr: number[]): number;
375
376
/**
377
* Find minimum value and index
378
* @param arr - Input array
379
* @returns Object with min value and index
380
*/
381
function min(arr: number[]): { min_val: number; min_idx: number };
382
383
/**
384
* Find maximum value and index
385
* @param arr - Input array
386
* @returns Object with max value and index
387
*/
388
function max(arr: number[]): { max_val: number; max_idx: number };
389
390
/**
391
* Get top k items from array
392
* @param items - Array of { score, index } objects
393
* @param top_k - Number of top items to return (default: 0, which returns all items)
394
* @returns Top k items sorted by score
395
*/
396
function getTopItems(
397
items: Array<{ score: number; index: number }>,
398
top_k?: number
399
): Array<{ score: number; index: number }>;
400
401
/**
402
* Apply median filter to data
403
* @param data - Input data array
404
* @param windowSize - Filter window size
405
* @returns Filtered data
406
*/
407
function medianFilter(data: number[], windowSize: number): number[];
408
409
/**
410
* Round number to specified decimal places
411
* @param num - Number to round
412
* @param decimals - Number of decimal places
413
* @returns Rounded number
414
*/
415
function round(num: number, decimals: number): number;
416
417
/**
418
* Apply banker's rounding (round half to even)
419
* @param x - Number to round
420
* @returns Rounded number
421
*/
422
function bankers_round(x: number): number;
423
```
424
425
### Image Processing
426
427
The RawImage class provides comprehensive image manipulation capabilities optimized for ML preprocessing.
428
429
#### RawImage Class
430
431
```javascript { .api }
432
/**
433
* Image processing class for ML preprocessing
434
*/
435
class RawImage {
436
/** Image pixel data */
437
data: Uint8ClampedArray;
438
/** Image width in pixels */
439
width: number;
440
/** Image height in pixels */
441
height: number;
442
/** Number of color channels (1-4) */
443
channels: number;
444
445
/**
446
* Create new RawImage instance
447
* @param data - Pixel data array
448
* @param width - Image width
449
* @param height - Image height
450
* @param channels - Number of channels (1=grayscale, 3=RGB, 4=RGBA)
451
*/
452
constructor(
453
data: Uint8ClampedArray,
454
width: number,
455
height: number,
456
channels: number
457
);
458
459
/**
460
* Get image dimensions
461
* @returns [width, height] tuple
462
*/
463
get size(): [number, number];
464
465
/**
466
* Load image from URL, file path, or buffer
467
* @param input - Image source (URL, path, or buffer)
468
* @returns Promise resolving to RawImage instance
469
*/
470
static async read(input: string | URL | Buffer): Promise<RawImage>;
471
472
/**
473
* Load image from URL
474
* @param url - Image URL
475
* @returns Promise resolving to RawImage instance
476
*/
477
static async fromURL(url: string | URL): Promise<RawImage>;
478
479
/**
480
* Create blank image filled with color
481
* @param width - Image width
482
* @param height - Image height
483
* @param channels - Number of channels
484
* @param color - Fill color (default: black)
485
* @returns New RawImage instance
486
*/
487
static zeros(
488
width: number,
489
height: number,
490
channels: number,
491
color?: number
492
): RawImage;
493
494
/**
495
* Resize image to new dimensions
496
* @param width - Target width
497
* @param height - Target height
498
* @param options - Resize options (resample method)
499
* @returns Resized RawImage
500
*/
501
resize(
502
width: number,
503
height: number,
504
options?: { resample?: number }
505
): RawImage;
506
507
/**
508
* Crop rectangular region from image
509
* @param left - Left coordinate
510
* @param top - Top coordinate
511
* @param width - Crop width
512
* @param height - Crop height
513
* @returns Cropped RawImage
514
*/
515
crop(left: number, top: number, width: number, height: number): RawImage;
516
517
/**
518
* Convert between color spaces/channel counts
519
* @param channels - Target number of channels
520
* @returns Converted RawImage
521
*/
522
convert(channels: number): RawImage;
523
524
/**
525
* Flip image horizontally
526
* @returns Horizontally flipped RawImage
527
*/
528
flip(): RawImage;
529
530
/**
531
* Apply center crop to make image square
532
* @param crop_size - Size of square crop
533
* @returns Center-cropped RawImage
534
*/
535
center_crop(crop_size: number): RawImage;
536
537
/**
538
* Convert image to tensor format for ML models
539
* @param channel_format - Channel ordering ('CHW' or 'HWC')
540
* @returns Image tensor
541
*/
542
toTensor(channel_format?: string): Tensor;
543
544
/**
545
* Save image to file (Node.js only)
546
* @param path - Output file path
547
*/
548
save(path: string): Promise<void>;
549
550
/**
551
* Clone the image
552
* @returns New RawImage instance with same data
553
*/
554
clone(): RawImage;
555
}
556
```
557
558
### Audio Processing
559
560
Audio utility functions for loading and preprocessing audio data for speech recognition and audio classification tasks.
561
562
```javascript { .api }
563
/**
564
* Load and preprocess audio file
565
* @param url - Audio file URL or path
566
* @param sampling_rate - Target sampling rate (default: 16000)
567
* @returns Promise resolving to the decoded audio samples at the target sampling rate
568
*/
569
async function read_audio(
570
url: string,
571
sampling_rate?: number
572
): Promise<{
573
audio: Float32Array;
574
sampling_rate: number;
575
}>;
576
577
/**
578
* Generate Hanning window for audio processing
579
* @param M - Window length
580
* @returns Hanning window coefficients
581
*/
582
function hanning(M: number): Float64Array;
583
584
/**
585
* Create mel-scale filter bank for audio feature extraction
586
* @param num_frequency_bins - Number of frequency bins
587
* @param num_mel_filters - Number of mel filters
588
* @param min_frequency - Minimum frequency
589
* @param max_frequency - Maximum frequency
590
* @param sampling_rate - Audio sampling rate
591
* @param norm - Normalization method
592
* @param mel_scale - Mel scale type
593
* @returns Mel filter bank matrix
594
*/
595
function mel_filter_bank(
596
num_frequency_bins: number,
597
num_mel_filters: number,
598
min_frequency: number,
599
max_frequency: number,
600
sampling_rate: number,
601
norm?: string,
602
mel_scale?: string
603
): number[][];
604
605
/**
606
* Compute spectrogram from audio signal
607
* @param waveform - Audio waveform data
608
* @param window - Window function coefficients
609
* @param frame_length - Frame length for STFT
610
* @param hop_length - Hop length between frames
611
* @param options - Additional spectrogram options
612
* @returns Spectrogram tensor
613
*/
614
function spectrogram(
615
waveform: Float32Array | Float64Array,
616
window: Float32Array | Float64Array,
617
frame_length: number,
618
hop_length: number,
619
options?: {
620
fft_length?: number;
621
power?: number;
622
center?: boolean;
623
pad_mode?: string;
624
onesided?: boolean;
625
}
626
): Tensor;
627
628
/**
629
* Generate window function for audio processing
630
* @param window_length - Length of window
631
* @param name - Window type ('hann', 'hamming', etc.)
632
* @param options - Window parameters
633
* @returns Window function coefficients
634
*/
635
function window_function(
636
window_length: number,
637
name: string,
638
options?: {
639
periodic?: boolean;
640
beta?: number;
641
dtype?: string;
642
}
643
): Float64Array;
644
```
645
646
### Mathematical Functions
647
648
Core mathematical operations for machine learning computations.
649
650
```javascript { .api }
651
/**
652
* Apply softmax function to array
653
* @param arr - Input array
654
* @returns Softmax-normalized array
655
*/
656
function softmax(arr: number[]): Float32Array;
657
658
/**
659
* Apply log softmax function to array
660
* @param arr - Input array
661
* @returns Log softmax values
662
*/
663
function log_softmax(arr: number[]): Float32Array;
664
665
/**
666
* Compute dot product of two arrays
667
* @param arr1 - First array
668
* @param arr2 - Second array
669
* @returns Dot product result
670
*/
671
function dot(arr1: number[], arr2: number[]): number;
672
673
/**
674
* Get top-k items from array
675
* @param items - Input array with scores
676
* @param top_k - Number of top items (0 for all)
677
* @returns Sorted top-k items
678
*/
679
function getTopItems(
680
items: Array<{ score: number; [key: string]: any }>,
681
top_k?: number
682
): Array<{ score: number; [key: string]: any }>;
683
684
/**
685
* Compute cosine similarity between two vectors
686
* @param arr1 - First vector
687
* @param arr2 - Second vector
688
* @returns Cosine similarity score
689
*/
690
function cos_sim(arr1: number[], arr2: number[]): number;
691
692
/**
693
* Compute vector magnitude (L2 norm)
694
* @param arr - Input vector
695
* @returns Vector magnitude
696
*/
697
function magnitude(arr: number[]): number;
698
699
/**
700
* Find minimum value in array
701
* @param arr - Input array
702
* @returns Minimum value
703
*/
704
function min(arr: number[]): number;
705
706
/**
707
* Find maximum value in array
708
* @param arr - Input array
709
* @returns Maximum value
710
*/
711
function max(arr: number[]): number;
712
713
/**
714
* Apply median filter to data
715
* @param data - Input data array
716
* @param windowSize - Size of median filter window
717
* @returns Filtered data
718
*/
719
function medianFilter(data: number[], windowSize: number): number[];
720
721
/**
722
* Round number to specified decimal places
723
* @param num - Number to round
724
* @param decimals - Number of decimal places
725
* @returns Rounded number
726
*/
727
function round(num: number, decimals: number): number;
728
729
/**
730
* Interpolate array data to new dimensions
731
* @param input - Input data array
732
* @param input_shape - Input dimensions [channels, height, width]
733
* @param output_shape - Output dimensions [height, width]
734
* @param mode - Interpolation mode ('bilinear', 'nearest')
735
* @param align_corners - Whether to align corners
736
* @returns Interpolated data array
737
*/
738
function interpolate_data(
739
input: number[],
740
input_shape: [number, number, number],
741
output_shape: [number, number],
742
mode?: string,
743
align_corners?: boolean
744
): number[];
745
746
/**
747
* Permute array data dimensions
748
* @param array - Input data array
749
* @param dims - Original dimensions
750
* @param axes - New axis order
751
* @returns Permuted data array
752
*/
753
function permute_data(
754
array: number[],
755
dims: number[],
756
axes: number[]
757
): number[];
758
```
759
760
#### FFT Class
761
762
Fast Fourier Transform implementation for frequency domain analysis.
763
764
```javascript { .api }
765
/**
766
* Fast Fourier Transform implementation
767
*/
768
class FFT {
769
/**
770
* Create FFT instance
771
* @param fft_length - Length of FFT
772
*/
773
constructor(fft_length: number);
774
775
/**
776
* Compute forward FFT
777
* @param signal - Input signal (real or complex)
778
* @returns FFT coefficients
779
*/
780
forward(signal: number[] | Complex[]): Complex[];
781
782
/**
783
* Compute inverse FFT
784
* @param spectrum - Frequency domain coefficients
785
* @returns Time domain signal
786
*/
787
inverse(spectrum: Complex[]): Complex[];
788
}
789
790
interface Complex {
791
real: number;
792
imag: number;
793
}
794
```
795
796
## Usage Examples
797
798
### Basic Tensor Operations
799
800
```javascript
801
import { Tensor, cat, stack, mean_pooling } from "@xenova/transformers";
802
803
// Create tensors
804
const tensor1 = new Tensor("float32", new Float32Array([1, 2, 3, 4]), [2, 2]);
805
const tensor2 = new Tensor("float32", new Float32Array([5, 6, 7, 8]), [2, 2]);
806
807
// Concatenate tensors
808
const concatenated = cat([tensor1, tensor2], 0); // Shape: [4, 2]
809
810
// Stack tensors
811
const stacked = stack([tensor1, tensor2], 0); // Shape: [2, 2, 2]
812
813
// Compute mean
814
const mean_tensor = tensor1.mean(); // Scalar mean
815
const row_means = tensor1.mean(1); // Row-wise means
816
```
817
818
### Image Processing
819
820
```javascript
821
import { RawImage } from "@xenova/transformers";
822
823
// Load image from URL
824
const image = await RawImage.fromURL("https://example.com/image.jpg");
825
826
// Resize and crop
827
const resized = image.resize(224, 224);
828
const cropped = resized.center_crop(224);
829
830
// Convert to tensor for model input
831
const tensor = cropped.toTensor("CHW"); // Channel-Height-Width format
832
833
// Create blank image
834
const blank = RawImage.zeros(100, 100, 3); // 100x100 RGB image
835
```
836
837
### Audio Processing
838
839
```javascript
840
import { read_audio, spectrogram, hanning } from "@xenova/transformers";
841
842
// Load audio file
843
const { audio, sampling_rate } = await read_audio("audio.wav", 16000);
844
845
// Create window function
846
const window = hanning(512);
847
848
// Compute spectrogram
849
const spec = spectrogram(audio, window, 512, 256, {
850
fft_length: 512,
851
power: 2.0,
852
});
853
```
854
855
### Mathematical Operations
856
857
```javascript
858
import { softmax, cos_sim, getTopItems } from "@xenova/transformers";
859
860
// Apply softmax
861
const logits = [2.0, 1.0, 0.1];
862
const probabilities = softmax(logits);
863
864
// Compute cosine similarity
865
const vec1 = [1, 2, 3];
866
const vec2 = [4, 5, 6];
867
const similarity = cos_sim(vec1, vec2);
868
869
// Get top-k results
870
const scores = [
871
{ label: "cat", score: 0.9 },
872
{ label: "dog", score: 0.8 },
873
{ label: "bird", score: 0.3 },
874
];
875
const top2 = getTopItems(scores, 2); // Top 2 results
876
```
877
878
## Types
879
880
```javascript { .api }
881
type TypedArray =
882
| Float32Array
883
| Float64Array
884
| Int8Array
885
| Uint8Array
886
| Int16Array
887
| Uint16Array
888
| Int32Array
889
| Uint32Array
890
| Uint8ClampedArray
891
| BigInt64Array
892
| BigUint64Array;
893
894
interface Complex {
895
real: number;
896
imag: number;
897
}
898
```