0
# Feature Extraction
1
2
Feature extraction provides audio and image preprocessing for speech, vision, and multimodal models through a consistent interface across modalities. It handles format conversion, normalization, resizing, and model-specific preprocessing requirements.
3
4
## Capabilities
5
6
### Auto Feature Extractors
7
8
Automatic selection of the appropriate feature extractor or image processor class based on the model's configuration.
9
10
```python { .api }
11
class AutoFeatureExtractor:
12
@classmethod
13
def from_pretrained(
14
cls,
15
pretrained_model_name_or_path: Union[str, os.PathLike],
16
cache_dir: Union[str, os.PathLike] = None,
17
force_download: bool = False,
18
local_files_only: bool = False,
19
token: Union[str, bool] = None,
20
revision: str = "main",
21
**kwargs
22
):
23
"""
24
Load a feature extractor, automatically detecting its type.
25
26
Args:
27
pretrained_model_name_or_path: Model name or path
28
cache_dir: Custom cache directory
29
force_download: Force fresh download
30
local_files_only: Only use local files
31
token: Authentication token
32
revision: Model revision/branch
33
34
Returns:
35
Appropriate feature extractor instance
36
"""
37
38
class AutoImageProcessor:
39
@classmethod
40
def from_pretrained(
41
cls,
42
pretrained_model_name_or_path: Union[str, os.PathLike],
43
**kwargs
44
):
45
"""Load an image processor, automatically detecting its type."""
46
```
47
48
### Base Feature Extraction Classes
49
50
Foundation classes for all feature extractors.
51
52
```python { .api }
53
class FeatureExtractionMixin:
54
"""Base class for all feature extractors."""
55
56
def __init__(self, **kwargs):
"""Initialize the feature extractor; extra keyword arguments are stored as attributes."""
57
58
def __call__(self, *args, **kwargs):
59
"""Main preprocessing method."""
60
61
@classmethod
62
def from_pretrained(
63
cls,
64
pretrained_model_name_or_path: Union[str, os.PathLike],
65
**kwargs
66
) -> "FeatureExtractionMixin":
67
"""Load feature extractor from pretrained model."""
68
69
def save_pretrained(
70
self,
71
save_directory: Union[str, os.PathLike],
72
push_to_hub: bool = False,
73
**kwargs
74
) -> None:
75
"""Save feature extractor to directory."""
76
77
def to_dict(self) -> Dict[str, Any]:
78
"""Convert to dictionary representation."""
79
80
class ImageProcessingMixin:
81
"""Base class for image processors."""
82
83
def __call__(
84
self,
85
images: Union["PIL.Image.Image", np.ndarray, torch.Tensor, List],
86
return_tensors: Optional[Union[str, TensorType]] = None,
87
**kwargs
88
) -> BatchFeature:
89
"""
90
Process images for model input.
91
92
Args:
93
images: Input image(s) in various formats
94
return_tensors: Format of returned tensors
95
**kwargs: Additional processing parameters
96
97
Returns:
98
Processed image features
99
"""
100
101
def preprocess(self, images, **kwargs) -> BatchFeature:
102
"""Preprocess an image or batch of images; `__call__` delegates to this method."""
103
```
104
105
### Audio Feature Extractors
106
107
Preprocessing for audio and speech models.
108
109
```python { .api }
110
class Wav2Vec2FeatureExtractor(FeatureExtractionMixin):
111
def __init__(
112
self,
113
feature_size: int = 1,
114
sampling_rate: int = 16000,
115
padding_value: float = 0.0,
116
do_normalize: bool = True,
117
return_attention_mask: bool = False,
118
**kwargs
119
):
120
"""
121
Wav2Vec2 audio feature extractor.
122
123
Args:
124
feature_size: Feature dimension
125
sampling_rate: Expected sampling rate
126
padding_value: Value for padding
127
do_normalize: Normalize audio values
128
return_attention_mask: Return attention mask
129
"""
130
131
def __call__(
132
self,
133
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
134
padding: Union[bool, str] = False,
135
max_length: Optional[int] = None,
136
truncation: bool = False,
137
pad_to_multiple_of: Optional[int] = None,
138
return_tensors: Optional[Union[str, TensorType]] = None,
139
sampling_rate: Optional[int] = None,
140
return_attention_mask: Optional[bool] = None,
141
**kwargs
142
) -> BatchFeature:
143
"""
144
Process raw audio for Wav2Vec2 models.
145
146
Args:
147
raw_speech: Raw audio waveform(s)
148
padding: Padding strategy
149
max_length: Maximum sequence length
150
truncation: Enable truncation
151
pad_to_multiple_of: Pad to multiple of this value
152
return_tensors: Format of returned tensors
153
sampling_rate: Sampling rate of input audio
154
return_attention_mask: Return attention mask
155
156
Returns:
157
Processed audio features
158
"""
159
160
class WhisperFeatureExtractor(FeatureExtractionMixin):
161
def __init__(
162
self,
163
feature_size: int = 80,
164
sampling_rate: int = 16000,
165
hop_length: int = 160,
166
chunk_length: int = 30,
167
n_fft: int = 400,
168
**kwargs
169
):
170
"""
171
Whisper mel-spectrogram feature extractor.
172
173
Args:
174
feature_size: Number of mel filters
175
sampling_rate: Audio sampling rate
176
hop_length: Hop length for STFT
177
chunk_length: Audio chunk length in seconds
178
n_fft: FFT window size
179
"""
180
181
def __call__(
182
self,
183
raw_speech: Union[np.ndarray, List[float], List[np.ndarray]],
184
truncation: bool = True,
185
pad_to_multiple_of: Optional[int] = None,
186
return_tensors: Optional[Union[str, TensorType]] = None,
187
return_attention_mask: Optional[bool] = None,
188
sampling_rate: Optional[int] = None,
189
**kwargs
190
) -> BatchFeature:
191
"""Process raw audio for Whisper models."""
192
```
193
194
### Image Processors
195
196
Preprocessing for computer vision models.
197
198
```python { .api }
199
class ViTImageProcessor(ImageProcessingMixin):
200
def __init__(
201
self,
202
do_resize: bool = True,
203
size: Dict[str, int] = None,
204
resample: "PIL.Image.Resampling" = None,
205
do_rescale: bool = True,
206
rescale_factor: Union[int, float] = 1/255,
207
do_normalize: bool = True,
208
image_mean: Optional[Union[float, List[float]]] = None,
209
image_std: Optional[Union[float, List[float]]] = None,
210
do_convert_rgb: bool = True,
211
**kwargs
212
):
213
"""
214
Vision Transformer image processor.
215
216
Args:
217
do_resize: Whether to resize images
218
size: Target size dictionary
219
resample: Resampling method
220
do_rescale: Whether to rescale pixel values
221
rescale_factor: Rescaling factor
222
do_normalize: Whether to normalize
223
image_mean: Mean for normalization
224
image_std: Standard deviation for normalization
225
do_convert_rgb: Convert to RGB format
226
"""
227
228
def __call__(
229
self,
230
images: Union["PIL.Image.Image", np.ndarray, torch.Tensor, List],
231
return_tensors: Optional[Union[str, TensorType]] = None,
232
**kwargs
233
) -> BatchFeature:
234
"""Process images for Vision Transformer models."""
235
236
class ConvNextImageProcessor(ImageProcessingMixin):
237
def __init__(
238
self,
239
do_resize: bool = True,
240
size: Dict[str, int] = None,
241
crop_pct: float = 0.875,
242
resample: "PIL.Image.Resampling" = None,
243
do_rescale: bool = True,
244
rescale_factor: Union[int, float] = 1/255,
245
do_normalize: bool = True,
246
image_mean: Optional[Union[float, List[float]]] = None,
247
image_std: Optional[Union[float, List[float]]] = None,
248
**kwargs
249
):
250
"""ConvNeXT image processor with crop percentage."""
251
252
class DetrImageProcessor(ImageProcessingMixin):
253
def __init__(
254
self,
255
format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
256
do_resize: bool = True,
257
size: Dict[str, int] = None,
258
resample: "PIL.Image.Resampling" = None,
259
do_rescale: bool = True,
260
rescale_factor: Union[int, float] = 1/255,
261
do_normalize: bool = True,
262
image_mean: Optional[Union[float, List[float]]] = None,
263
image_std: Optional[Union[float, List[float]]] = None,
264
do_convert_annotations: Optional[bool] = None,
265
**kwargs
266
):
267
"""
268
DETR image processor for object detection.
269
270
Args:
271
format: Annotation format (`coco_detection` or `coco_panoptic`)
272
do_convert_annotations: Convert annotation format
273
"""
274
275
def __call__(
276
self,
277
images: Union["PIL.Image.Image", np.ndarray, torch.Tensor, List],
278
annotations: Optional[Union[Dict, List[Dict]]] = None,
279
return_tensors: Optional[Union[str, TensorType]] = None,
280
**kwargs
281
) -> BatchFeature:
282
"""
283
Process images and annotations for DETR models.
284
285
Args:
286
images: Input images
287
annotations: COCO-style annotation dict(s), one per image, in the layout selected by `format`
288
return_tensors: Format of returned tensors
289
290
Returns:
291
Processed features with images and annotations
292
"""
293
```
294
295
### Batch Feature Container
296
297
Container for processed features with convenient access methods.
298
299
```python { .api }
300
class BatchFeature:
301
"""Container for batch of processed features."""
302
303
def __init__(
304
self,
305
data: Optional[Dict[str, Any]] = None,
306
tensor_type: Union[None, str, TensorType] = None
307
):
"""Create a batch from a dict of features, optionally converting the values to `tensor_type`."""
308
309
def __getitem__(self, item: Union[str, int]) -> Any:
310
"""Access feature data by string key (integer indexing is not supported and raises `KeyError`)."""
311
312
def __setitem__(self, key: str, value: Any) -> None:
313
"""Set feature data value."""
314
315
def keys(self) -> List[str]:
316
"""Get all available keys."""
317
318
def values(self) -> List[Any]:
319
"""Get all values."""
320
321
def items(self) -> List[Tuple[str, Any]]:
322
"""Get key-value pairs."""
323
324
def to(self, device: Union[str, torch.device, int]) -> "BatchFeature":
325
"""Move tensors to specified device."""
326
327
def convert_to_tensors(
328
self,
329
tensor_type: Optional[Union[str, TensorType]] = None
330
) -> "BatchFeature":
331
"""Convert to specified tensor format."""
332
333
@property
334
def pixel_values(self) -> Optional[torch.Tensor]:
335
"""Processed image pixel values."""
336
337
@property
338
def input_features(self) -> Optional[torch.Tensor]:
339
"""Processed audio input features."""
340
```
341
342
### Audio Processing Utilities
343
344
Helper functions for audio preprocessing.
345
346
```python { .api }
347
def is_speech_available() -> bool:
348
"""Check if speech processing libraries are available."""
349
350
def load_audio(
351
audio: Union[str, np.ndarray],
352
sampling_rate: int = 16000
353
) -> np.ndarray:
354
"""Load and resample audio file."""
355
356
def mel_filter_bank(
357
num_frequency_bins: int,
358
num_mel_filters: int,
359
min_frequency: float,
360
max_frequency: float,
361
sampling_rate: int,
362
norm: Optional[str] = None,
363
mel_scale: str = "htk"
364
) -> np.ndarray:
365
"""Create mel filter bank matrix."""
366
367
def spectrogram(
368
waveform: np.ndarray,
369
window: np.ndarray,
370
frame_length: int,
371
hop_length: int,
372
fft_length: Optional[int] = None,
373
power: Optional[float] = 1.0,
374
center: bool = True,
375
pad_mode: str = "reflect"
376
) -> np.ndarray:
377
"""Compute spectrogram of audio waveform."""
378
```
379
380
### Image Processing Utilities
381
382
Helper functions for image preprocessing.
383
384
```python { .api }
385
def is_vision_available() -> bool:
386
"""Check if vision processing libraries are available."""
387
388
def load_image(
389
image: Union[str, "PIL.Image.Image", np.ndarray, torch.Tensor]
390
) -> "PIL.Image.Image":
391
"""Load image from various input formats."""
392
393
def resize(
394
image: "PIL.Image.Image",
395
size: Tuple[int, int],
396
resample: "PIL.Image.Resampling" = None,
397
reducing_gap: Optional[int] = None
398
) -> "PIL.Image.Image":
399
"""Resize image to target size."""
400
401
def center_crop(
402
image: "PIL.Image.Image",
403
size: Tuple[int, int]
404
) -> "PIL.Image.Image":
405
"""Center crop image to target size."""
406
407
def normalize(
408
image: np.ndarray,
409
mean: Union[float, List[float]],
410
std: Union[float, List[float]]
411
) -> np.ndarray:
412
"""Normalize image with mean and standard deviation."""
413
414
def rescale(
415
image: np.ndarray,
416
scale: float
417
) -> np.ndarray:
418
"""Rescale image pixel values."""
419
420
def to_channel_dimension_format(
421
image: np.ndarray,
422
channel_dim: Union[ChannelDimension, str]
423
) -> np.ndarray:
424
"""Convert image to specified channel dimension format."""
425
```
426
427
## Feature Extraction Examples
428
429
Common preprocessing patterns for different modalities:
430
431
```python
432
from transformers import AutoFeatureExtractor, AutoImageProcessor
433
import numpy as np
434
from PIL import Image
435
436
# Audio processing
437
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
438
439
# Process single audio file
440
audio_array = np.random.randn(16000) # 1 second at 16kHz
441
inputs = feature_extractor(audio_array, sampling_rate=16000, return_tensors="pt")
442
443
# Process batch of audio files
444
audio_batch = [np.random.randn(16000), np.random.randn(24000)]
445
inputs = feature_extractor(
446
audio_batch,
447
sampling_rate=16000,
448
padding=True,
449
return_tensors="pt"
450
)
451
452
# Image processing
453
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
454
455
# Process single image
456
image = Image.open("example.jpg")
457
inputs = image_processor(image, return_tensors="pt")
458
459
# Process batch of images
460
images = [Image.open(f"image_{i}.jpg") for i in range(3)]
461
inputs = image_processor(images, return_tensors="pt")
462
463
# Object detection with annotations
464
from transformers import DetrImageProcessor
465
466
image_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
467
468
# With bounding box annotations
469
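# COCO detection format: one dict per image; each bbox is [x, y, width, height]
annotations = [{
    "image_id": 0,
    "annotations": [
        {"bbox": [100, 100, 100, 100], "category_id": 1, "area": 10000, "iscrowd": 0},
        {"bbox": [300, 300, 100, 100], "category_id": 2, "area": 10000, "iscrowd": 0},
    ],
}]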
475
476
inputs = image_processor(
477
images=image,
478
annotations=annotations,
479
return_tensors="pt"
480
)
481
482
# Access processed features
483
pixel_values = inputs.pixel_values # Processed image tensors
484
labels = inputs.labels if hasattr(inputs, 'labels') else None # Processed annotations
485
```
486
487
## Supported Input Formats
488
489
The feature extraction system supports the following input and output formats:
490
491
**Audio Formats:**
492
- NumPy arrays (raw waveforms)
493
- Lists of floats (raw audio samples)
494
- Audio file paths, once decoded to a waveform with `load_audio` (feature extractors themselves accept arrays, not paths)
495
- Batch processing with automatic padding
496
497
**Image Formats:**
498
- PIL Images
499
- NumPy arrays in (H, W, C) or (C, H, W) layout
500
- PyTorch tensors
501
- TensorFlow tensors
502
- File paths, once loaded with `load_image` (image processors themselves accept image objects and arrays, not paths)
503
- Batch processing with consistent preprocessing
504
505
**Output Formats:**
506
- PyTorch tensors (`return_tensors="pt"`)
507
- TensorFlow tensors (`return_tensors="tf"`)
508
- NumPy arrays (`return_tensors="np"`)
509
- JAX arrays (`return_tensors="jax"`)