# Processors

Processors are used to prepare non-textual inputs (image or audio) for machine learning models. They handle the preprocessing steps required to convert raw data into the format expected by specific model architectures.

## Capabilities

### Auto Processor

Automatically selects and loads the appropriate processor implementation based on the model configuration.

```javascript { .api }
/**
 * Automatically instantiate a processor based on the model type
 * @param pretrained_model_name_or_path - Model identifier or path
 * @param options - Configuration options for processor loading
 * @returns Promise resolving to the appropriate processor instance
 */
class AutoProcessor {
  static async from_pretrained(
    pretrained_model_name_or_path: string,
    options?: ProcessorOptions
  ): Promise<Processor>;
}

interface ProcessorOptions {
  /** Use quantized version of the model (default: true) */
  quantized?: boolean;
  /** Callback to track model download progress */
  progress_callback?: (progress: any) => void;
  /** Custom model configuration */
  config?: any;
  /** Directory to cache downloaded models */
  cache_dir?: string;
  /** Only use local files, don't download from remote */
  local_files_only?: boolean;
  /** Model revision/branch to use (default: 'main') */
  revision?: string;
}
```

**Usage Example:**

```javascript
import { AutoProcessor, read_audio } from "@xenova/transformers";

// Load a Whisper audio processor
const processor = await AutoProcessor.from_pretrained("openai/whisper-tiny.en");

// Process audio for speech recognition
const audio = await read_audio("audio.wav", 16000);
const { input_features } = await processor(audio);
```

### Base Processor Class

Base class for all processor implementations providing common functionality.

```javascript { .api }
/**
 * Base processor class for preprocessing inputs
 */
class Processor {
  /** Processor configuration */
  config: any;

  /**
   * Process inputs based on processor type
   * @param input - Input data (image, audio, etc.)
   * @param options - Processing options
   * @returns Processed data ready for model input
   */
  (input: any, options?: any): Promise<any>;

  /**
   * Clean up processor resources
   */
  dispose(): Promise<void>;
}
```

### Image Processors

Processors specialized for handling image inputs, including resizing, normalization, and format conversion.

```javascript { .api }
/**
 * Image processor for vision models
 */
class ImageProcessor extends Processor {
  /**
   * Process image inputs for vision models
   * @param images - Input images (RawImage, URL, Buffer, etc.)
   * @param options - Image processing options
   * @returns Processed image features
   */
  (images: ImageInput | ImageInput[], options?: ImageProcessorOptions): Promise<{
    pixel_values: Tensor;
  }>;
}

interface ImageProcessorOptions {
  /** Target image size for resizing */
  size?: { height: number; width: number };
  /** Whether to normalize pixel values */
  do_normalize?: boolean;
  /** Mean values for normalization */
  image_mean?: number[];
  /** Standard deviation values for normalization */
  image_std?: number[];
  /** Whether to resize images */
  do_resize?: boolean;
  /** Resizing algorithm */
  resample?: number;
  /** Whether to center crop images */
  do_center_crop?: boolean;
  /** Crop size for center cropping */
  crop_size?: { height: number; width: number };
}

type ImageInput = string | URL | RawImage | Buffer;
```

### Audio Processors

Processors for handling audio inputs, including feature extraction and spectrogram generation.

```javascript { .api }
/**
 * Audio processor for speech and audio models
 */
class AudioProcessor extends Processor {
  /**
   * Process audio inputs for speech/audio models
   * @param audio - Input audio data
   * @param options - Audio processing options
   * @returns Processed audio features
   */
  (audio: AudioInput, options?: AudioProcessorOptions): Promise<{
    input_features?: Tensor;
    input_values?: Tensor;
  }>;
}

interface AudioProcessorOptions {
  /** Target sampling rate */
  sampling_rate?: number;
  /** Whether to normalize audio */
  do_normalize?: boolean;
  /** Whether to pad audio to fixed length */
  do_pad?: boolean;
  /** Maximum audio length */
  max_length?: number;
}

type AudioInput = Float32Array | number[] | string | URL;
```

### Whisper Processor

Specialized processor for Whisper speech recognition models.

```javascript { .api }
/**
 * Whisper processor for automatic speech recognition
 */
class WhisperProcessor extends AudioProcessor {
  /**
   * Process audio for Whisper models
   * @param audio - Input audio (16kHz sampling rate expected)
   * @param options - Whisper-specific processing options
   * @returns Log mel-spectrogram features
   */
  (audio: AudioInput, options?: WhisperProcessorOptions): Promise<{
    input_features: Tensor;
  }>;
}

interface WhisperProcessorOptions extends AudioProcessorOptions {
  /** Number of mel filter banks (default: 80) */
  n_fft?: number;
  /** Hop length for STFT (default: 160) */
  hop_length?: number;
  /** Number of samples per chunk (default: 480000) */
  chunk_length?: number;
}
```

### CLIP Processor

Multimodal processor for CLIP vision-language models that handles both images and text.

```javascript { .api }
/**
 * CLIP processor for vision-language models
 */
class CLIPProcessor extends Processor {
  /** Image processor component */
  image_processor: ImageProcessor;
  /** Text tokenizer component */
  tokenizer: PreTrainedTokenizer;

  /**
   * Process text and/or images for CLIP models
   * @param options - Input data and processing options
   * @returns Processed features for both modalities
   */
  (options: CLIPProcessorInput): Promise<CLIPProcessorOutput>;
}

interface CLIPProcessorInput {
  /** Input text (optional) */
  text?: string | string[];
  /** Input images (optional) */
  images?: ImageInput | ImageInput[];
  /** Text processing options */
  text_options?: any;
  /** Image processing options */
  image_options?: ImageProcessorOptions;
}

interface CLIPProcessorOutput {
  /** Text input IDs (if text provided) */
  input_ids?: Tensor;
  /** Text attention mask (if text provided) */
  attention_mask?: Tensor;
  /** Image pixel values (if images provided) */
  pixel_values?: Tensor;
}
```

### Utility Functions

Helper functions for data preprocessing and conversion.

```javascript { .api }
/**
 * Convert bounding boxes from center format to corners format
 * @param coords - Center coordinates [center_x, center_y, width, height]
 * @returns Corner coordinates [top_left_x, top_left_y, bottom_right_x, bottom_right_y]
 */
function center_to_corners_format(coords: [number, number, number, number]): [number, number, number, number];

/**
 * Post-process object detection outputs
 * @param outputs - Raw model outputs
 * @param threshold - Score threshold for filtering (default: 0.5)
 * @param target_sizes - Original image sizes for coordinate scaling
 * @param is_zero_shot - Whether this is zero-shot detection
 * @returns Processed detection results
 */
function post_process_object_detection(
  outputs: { logits: Tensor; pred_boxes: Tensor },
  threshold?: number,
  target_sizes?: number[][],
  is_zero_shot?: boolean
): Array<{
  boxes: number[][];
  classes: number[];
  scores: number[];
}>;
```

## Types

```javascript { .api }
interface Processor {
  config: any;
  (input: any, options?: any): Promise<any>;
  dispose(): Promise<void>;
}

interface ProcessorComponents {
  image_processor?: ImageProcessor;
  tokenizer?: PreTrainedTokenizer;
  feature_extractor?: AudioProcessor;
}
```