# Processors

Processors are used to prepare non-textual inputs (image or audio) for machine learning models. They handle the preprocessing steps required to convert raw data into the format expected by specific model architectures.

## Capabilities

### Auto Processor

Automatically selects and loads the appropriate processor implementation based on the model configuration.

```javascript { .api }
/**
 * Automatically instantiate a processor based on the model type
 * @param pretrained_model_name_or_path - Model identifier or path
 * @param options - Configuration options for processor loading
 * @returns Promise resolving to the appropriate processor instance
 */
class AutoProcessor {
  static async from_pretrained(
    pretrained_model_name_or_path: string,
    options?: ProcessorOptions
  ): Promise<Processor>;
}

interface ProcessorOptions {
  /** Use quantized version of the model (default: true) */
  quantized?: boolean;
  /** Callback to track model download progress */
  progress_callback?: (progress: any) => void;
  /** Custom model configuration */
  config?: any;
  /** Directory to cache downloaded models */
  cache_dir?: string;
  /** Only use local files, don't download from remote */
  local_files_only?: boolean;
  /** Model revision/branch to use (default: 'main') */
  revision?: string;
}
```

**Usage Example:**

```javascript
import { AutoProcessor, read_audio } from "@xenova/transformers";

// Load a Whisper audio processor
const processor = await AutoProcessor.from_pretrained("openai/whisper-tiny.en");

// Process audio for speech recognition
const audio = await read_audio("audio.wav", 16000);
const { input_features } = await processor(audio);
```

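The loading options can be combined as needed. The following sketch pins a revision and tracks download progress; the `./models` cache directory is an arbitrary example path, not a library default:

```javascript
import { AutoProcessor } from "@xenova/transformers";

// Pin a revision and observe download progress events.
// "./models" is an illustrative cache location.
const processor = await AutoProcessor.from_pretrained("openai/whisper-tiny.en", {
  quantized: true,
  revision: "main",
  cache_dir: "./models",
  progress_callback: (progress) => console.log(progress),
});
```

Setting `local_files_only: true` instead makes loading fail fast when the files are not already cached, which is useful in offline environments.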
### Base Processor Class

Base class for all processor implementations providing common functionality.

```javascript { .api }
/**
 * Base processor class for preprocessing inputs
 */
class Processor {
  /** Processor configuration */
  config: any;

  /**
   * Process inputs based on processor type
   * @param input - Input data (image, audio, etc.)
   * @param options - Processing options
   * @returns Processed data ready for model input
   */
  (input: any, options?: any): Promise<any>;

  /**
   * Clean up processor resources
   */
  dispose(): Promise<void>;
}
```

### Image Processors

Processors specialized for handling image inputs, including resizing, normalization, and format conversion.

```javascript { .api }
/**
 * Image processor for vision models
 */
class ImageProcessor extends Processor {
  /**
   * Process image inputs for vision models
   * @param images - Input images (RawImage, URL, Buffer, etc.)
   * @param options - Image processing options
   * @returns Processed image features
   */
  (images: ImageInput | ImageInput[], options?: ImageProcessorOptions): Promise<{
    pixel_values: Tensor;
  }>;
}

interface ImageProcessorOptions {
  /** Target image size for resizing */
  size?: { height: number; width: number };
  /** Whether to normalize pixel values */
  do_normalize?: boolean;
  /** Mean values for normalization */
  image_mean?: number[];
  /** Standard deviation values for normalization */
  image_std?: number[];
  /** Whether to resize images */
  do_resize?: boolean;
  /** Resampling algorithm used when resizing */
  resample?: number;
  /** Whether to center crop images */
  do_center_crop?: boolean;
  /** Crop size for center cropping */
  crop_size?: { height: number; width: number };
}

type ImageInput = string | URL | RawImage | Buffer;
```

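As a usage sketch, an image processor can be loaded through `AutoProcessor` and applied to an image. The model ID assumes a ViT conversion is available on the Hub, and the image URL is a placeholder:

```javascript
import { AutoProcessor, RawImage } from "@xenova/transformers";

// Load the image processor associated with a vision model
const processor = await AutoProcessor.from_pretrained("Xenova/vit-base-patch16-224");

// Images may be passed as URLs, file paths, or RawImage instances
const image = await RawImage.read("https://example.com/cat.jpg");
const { pixel_values } = await processor(image); // e.g. a [1, 3, 224, 224] Tensor
```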
### Audio Processors

Processors for handling audio inputs, including feature extraction and spectrogram generation.

```javascript { .api }
/**
 * Audio processor for speech and audio models
 */
class AudioProcessor extends Processor {
  /**
   * Process audio inputs for speech/audio models
   * @param audio - Input audio data
   * @param options - Audio processing options
   * @returns Processed audio features
   */
  (audio: AudioInput, options?: AudioProcessorOptions): Promise<{
    input_features?: Tensor;
    input_values?: Tensor;
  }>;
}

interface AudioProcessorOptions {
  /** Target sampling rate */
  sampling_rate?: number;
  /** Whether to normalize audio */
  do_normalize?: boolean;
  /** Whether to pad audio to fixed length */
  do_pad?: boolean;
  /** Maximum audio length */
  max_length?: number;
}

type AudioInput = Float32Array | number[] | string | URL;
```

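A sketch for a waveform-based model such as Wav2Vec2, which returns raw-waveform `input_values` rather than spectrogram `input_features` (the model ID is illustrative):

```javascript
import { AutoProcessor, read_audio } from "@xenova/transformers";

// Wav2Vec2-style models consume raw waveforms as `input_values`
const processor = await AutoProcessor.from_pretrained("Xenova/wav2vec2-base-960h");

// Resample the file to the model's expected sampling rate
const audio = await read_audio("speech.wav", 16000);
const { input_values } = await processor(audio);
```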
### Whisper Processor

Specialized processor for Whisper speech recognition models.

```javascript { .api }
/**
 * Whisper processor for automatic speech recognition
 */
class WhisperProcessor extends AudioProcessor {
  /**
   * Process audio for Whisper models
   * @param audio - Input audio (16kHz sampling rate expected)
   * @param options - Whisper-specific processing options
   * @returns Log mel-spectrogram features
   */
  (audio: AudioInput, options?: WhisperProcessorOptions): Promise<{
    input_features: Tensor;
  }>;
}

interface WhisperProcessorOptions extends AudioProcessorOptions {
  /** Size of the FFT window (default: 400) */
  n_fft?: number;
  /** Hop length for STFT (default: 160) */
  hop_length?: number;
  /** Number of samples per chunk (default: 480000, i.e. 30 s at 16 kHz) */
  chunk_length?: number;
}
```

### CLIP Processor

Multimodal processor for CLIP vision-language models that handles both images and text.

```javascript { .api }
/**
 * CLIP processor for vision-language models
 */
class CLIPProcessor extends Processor {
  /** Image processor component */
  image_processor: ImageProcessor;
  /** Text tokenizer component */
  tokenizer: PreTrainedTokenizer;

  /**
   * Process text and/or images for CLIP models
   * @param options - Input data and processing options
   * @returns Processed features for both modalities
   */
  (options: CLIPProcessorInput): Promise<CLIPProcessorOutput>;
}

interface CLIPProcessorInput {
  /** Input text (optional) */
  text?: string | string[];
  /** Input images (optional) */
  images?: ImageInput | ImageInput[];
  /** Text processing options */
  text_options?: any;
  /** Image processing options */
  image_options?: ImageProcessorOptions;
}

interface CLIPProcessorOutput {
  /** Text input IDs (if text provided) */
  input_ids?: Tensor;
  /** Text attention mask (if text provided) */
  attention_mask?: Tensor;
  /** Image pixel values (if images provided) */
  pixel_values?: Tensor;
}
```

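Following the call signature above, a sketch of joint text-and-image processing (the model ID and image URL are illustrative placeholders):

```javascript
import { AutoProcessor, RawImage } from "@xenova/transformers";

const processor = await AutoProcessor.from_pretrained("Xenova/clip-vit-base-patch16");

const image = await RawImage.read("https://example.com/photo.jpg");

// Both modalities are processed in one call; outputs for a modality
// are present only when that modality was provided
const { input_ids, attention_mask, pixel_values } = await processor({
  text: ["a photo of a cat", "a photo of a dog"],
  images: image,
});
```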
### Utility Functions

Helper functions for data preprocessing and conversion.

```javascript { .api }
/**
 * Convert bounding boxes from center format to corners format
 * @param coords - Center coordinates [center_x, center_y, width, height]
 * @returns Corner coordinates [top_left_x, top_left_y, bottom_right_x, bottom_right_y]
 */
function center_to_corners_format(coords: [number, number, number, number]): [number, number, number, number];

/**
 * Post-process object detection outputs
 * @param outputs - Raw model outputs
 * @param threshold - Score threshold for filtering (default: 0.5)
 * @param target_sizes - Original image sizes for coordinate scaling
 * @param is_zero_shot - Whether this is zero-shot detection
 * @returns Processed detection results
 */
function post_process_object_detection(
  outputs: { logits: Tensor; pred_boxes: Tensor },
  threshold?: number,
  target_sizes?: number[][],
  is_zero_shot?: boolean
): Array<{
  boxes: number[][];
  classes: number[];
  scores: number[];
}>;
```

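The center-to-corners conversion is simple arithmetic; a standalone sketch (independent of the library, with an illustrative function name) makes the mapping explicit:

```javascript
// Convert [center_x, center_y, width, height] into
// [top_left_x, top_left_y, bottom_right_x, bottom_right_y]
function centerToCorners([cx, cy, w, h]) {
  return [cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2];
}

console.log(centerToCorners([0.5, 0.5, 0.2, 0.2])); // [0.4, 0.4, 0.6, 0.6]
```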
## Types

```javascript { .api }
interface Processor {
  config: any;
  (input: any, options?: any): Promise<any>;
  dispose(): Promise<void>;
}

interface ProcessorComponents {
  image_processor?: ImageProcessor;
  tokenizer?: PreTrainedTokenizer;
  feature_extractor?: AudioProcessor;
}
```