State-of-the-art machine learning for JAX, PyTorch and TensorFlow.
Audio and image preprocessing capabilities for multimodal models, providing consistent interfaces across modalities. The feature extraction system handles format conversion, normalization, resizing, and model-specific preprocessing requirements.
It also provides automatic selection of the appropriate feature extractor based on model configurations.
class AutoFeatureExtractor:
    """Automatically select and load the appropriate feature extractor.

    Inspects the model configuration referenced by
    ``pretrained_model_name_or_path`` and instantiates the matching
    feature extractor class.
    """

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        cache_dir: Union[str, os.PathLike] = None,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Union[str, bool] = None,
        revision: str = "main",
        **kwargs,
    ):
        """
        Load a feature extractor, automatically detecting its type.

        Args:
            pretrained_model_name_or_path: Model name or local path.
            cache_dir: Custom cache directory.
            force_download: Force a fresh download even if cached.
            local_files_only: Only use local files (no network access).
            token: Authentication token for private models.
            revision: Model revision/branch to load.

        Returns:
            Appropriate feature extractor instance.
        """
class AutoImageProcessor:
    """Automatically select and load the appropriate image processor."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ):
        """Load an image processor, automatically detecting its type.

        Args:
            pretrained_model_name_or_path: Model name or local path.

        Returns:
            Appropriate image processor instance.
        """


# Foundation classes for all feature extractors.
class FeatureExtractionMixin:
    """Base class for all feature extractors."""

    def __init__(self, **kwargs):
        """Initialize the extractor with configuration keyword arguments."""

    def __call__(self, *args, **kwargs):
        """Main preprocessing entry point."""

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs,
    ) -> "FeatureExtractionMixin":
        """Load a feature extractor from a pretrained model.

        Args:
            pretrained_model_name_or_path: Model name or local path.

        Returns:
            A configured feature extractor instance.
        """

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        push_to_hub: bool = False,
        **kwargs,
    ) -> None:
        """Save the feature extractor configuration to a directory.

        Args:
            save_directory: Target directory to write the configuration to.
            push_to_hub: Whether to also push the saved files to the Hub.
        """

    def to_dict(self) -> Dict[str, Any]:
        """Return a dictionary representation of this extractor."""
class ImageProcessingMixin:
    """Base class for image processors."""

    def __call__(
        self,
        images: Union["PIL.Image.Image", np.ndarray, torch.Tensor, List],
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Process images for model input.

        Args:
            images: Input image(s) in various formats.
            return_tensors: Format of returned tensors.
            **kwargs: Additional processing parameters.

        Returns:
            Processed image features.
        """

    def preprocess(self, images, **kwargs) -> BatchFeature:
        """Alias for ``__call__``."""


# Preprocessing for audio and speech models.
class Wav2Vec2FeatureExtractor(FeatureExtractionMixin):
    """Raw-waveform feature extractor for Wav2Vec2 audio models."""

    def __init__(
        self,
        feature_size: int = 1,
        sampling_rate: int = 16000,
        padding_value: float = 0.0,
        do_normalize: bool = True,
        return_attention_mask: bool = False,
        **kwargs,
    ):
        """
        Wav2Vec2 audio feature extractor.

        Args:
            feature_size: Feature dimension.
            sampling_rate: Expected sampling rate.
            padding_value: Value used for padding.
            do_normalize: Whether to normalize audio values.
            return_attention_mask: Whether to return an attention mask.
        """

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Union[bool, str] = False,
        max_length: Optional[int] = None,
        truncation: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Process raw audio for Wav2Vec2 models.

        Args:
            raw_speech: Raw audio waveform(s).
            padding: Padding strategy.
            max_length: Maximum sequence length.
            truncation: Enable truncation.
            pad_to_multiple_of: Pad length to a multiple of this value.
            return_tensors: Format of returned tensors.
            sampling_rate: Sampling rate of the input audio.
            return_attention_mask: Whether to return an attention mask.

        Returns:
            Processed audio features.
        """
class WhisperFeatureExtractor(FeatureExtractionMixin):
    """Mel-spectrogram feature extractor for Whisper models."""

    def __init__(
        self,
        feature_size: int = 80,
        sampling_rate: int = 16000,
        hop_length: int = 160,
        chunk_length: int = 30,
        n_fft: int = 400,
        **kwargs,
    ):
        """
        Whisper mel-spectrogram feature extractor.

        Args:
            feature_size: Number of mel filters.
            sampling_rate: Audio sampling rate.
            hop_length: Hop length for the STFT.
            chunk_length: Audio chunk length in seconds.
            n_fft: FFT window size.
        """

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray]],
        truncation: bool = True,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_attention_mask: Optional[bool] = None,
        sampling_rate: Optional[int] = None,
        **kwargs,
    ) -> BatchFeature:
        """Process raw audio into log-mel input features for Whisper models.

        Args:
            raw_speech: Raw audio waveform(s).
            truncation: Enable truncation (defaults to True for Whisper).
            pad_to_multiple_of: Pad length to a multiple of this value.
            return_tensors: Format of returned tensors.
            return_attention_mask: Whether to return an attention mask.
            sampling_rate: Sampling rate of the input audio.

        Returns:
            Processed audio features.
        """


# Preprocessing for computer vision models.
class ViTImageProcessor(ImageProcessingMixin):
    """Image processor for Vision Transformer (ViT) models."""

    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: "PIL.Image.Resampling" = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1/255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        **kwargs,
    ):
        """
        Vision Transformer image processor.

        Args:
            do_resize: Whether to resize images.
            size: Target size dictionary.
            resample: Resampling method.
            do_rescale: Whether to rescale pixel values.
            rescale_factor: Rescaling factor.
            do_normalize: Whether to normalize.
            image_mean: Mean for normalization.
            image_std: Standard deviation for normalization.
            do_convert_rgb: Whether to convert images to RGB format.
        """

    def __call__(
        self,
        images: Union["PIL.Image.Image", np.ndarray, torch.Tensor, List],
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """Process images for Vision Transformer models.

        Args:
            images: Input image(s) in various formats.
            return_tensors: Format of returned tensors.

        Returns:
            Processed image features.
        """
class ConvNextImageProcessor(ImageProcessingMixin):
    """ConvNeXT image processor with crop-percentage resizing."""

    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        crop_pct: float = 0.875,
        resample: "PIL.Image.Resampling" = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1/255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs,
    ):
        """ConvNeXT image processor.

        Args:
            do_resize: Whether to resize images.
            size: Target size dictionary.
            crop_pct: Crop percentage applied when resizing.
            resample: Resampling method.
            do_rescale: Whether to rescale pixel values.
            rescale_factor: Rescaling factor.
            do_normalize: Whether to normalize.
            image_mean: Mean for normalization.
            image_std: Standard deviation for normalization.
        """
class DetrImageProcessor(ImageProcessingMixin):
    """DETR image processor for object detection."""

    def __init__(
        self,
        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: "PIL.Image.Resampling" = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1/255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_annotations: Optional[bool] = None,
        **kwargs,
    ):
        """
        DETR image processor for object detection.

        Args:
            format: Annotation format (COCO, Pascal VOC, etc.).
            do_convert_annotations: Whether to convert the annotation format.
            do_resize: Whether to resize images.
            size: Target size dictionary.
            resample: Resampling method.
            do_rescale: Whether to rescale pixel values.
            rescale_factor: Rescaling factor.
            do_normalize: Whether to normalize.
            image_mean: Mean for normalization.
            image_std: Standard deviation for normalization.
        """

    def __call__(
        self,
        images: Union["PIL.Image.Image", np.ndarray, torch.Tensor, List],
        annotations: Optional[Union[Dict, List[Dict]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Process images and annotations for DETR models.

        Args:
            images: Input images.
            annotations: Bounding box annotations.
            return_tensors: Format of returned tensors.

        Returns:
            Processed features with images and annotations.
        """


# Container for processed features with convenient access methods.
class BatchFeature:
    """Dict-like container for a batch of processed features."""

    def __init__(
        self,
        data: Optional[Dict[str, Any]] = None,
        tensor_type: Union[None, str, TensorType] = None,
    ):
        """Initialize with feature data and an optional target tensor type."""

    def __getitem__(self, item: Union[str, int]) -> Any:
        """Access feature data by key or index."""

    def __setitem__(self, key: str, value: Any) -> None:
        """Set a feature data value."""

    def keys(self) -> List[str]:
        """Return all available keys."""

    def values(self) -> List[Any]:
        """Return all values."""

    def items(self) -> List[Tuple[str, Any]]:
        """Return key-value pairs."""

    def to(self, device: Union[str, torch.device, int]) -> "BatchFeature":
        """Move contained tensors to the specified device."""

    def convert_to_tensors(
        self,
        tensor_type: Optional[Union[str, TensorType]] = None,
    ) -> "BatchFeature":
        """Convert contained data to the specified tensor format."""

    @property
    def pixel_values(self) -> Optional[torch.Tensor]:
        """Processed image pixel values."""

    @property
    def input_features(self) -> Optional[torch.Tensor]:
        """Processed audio input features."""


# Helper functions for audio preprocessing.
def is_speech_available() -> bool:
    """Return whether speech-processing libraries are available."""
def load_audio(
    audio: Union[str, np.ndarray],
    sampling_rate: int = 16000,
) -> np.ndarray:
    """Load an audio file (or pass through an array) and resample it.

    Args:
        audio: Path to an audio file or an already-loaded waveform array.
        sampling_rate: Target sampling rate for resampling.

    Returns:
        The audio waveform as a NumPy array.
    """
def mel_filter_bank(
    num_frequency_bins: int,
    num_mel_filters: int,
    min_frequency: float,
    max_frequency: float,
    sampling_rate: int,
    norm: Optional[str] = None,
    mel_scale: str = "htk",
) -> np.ndarray:
    """Create a mel filter bank matrix.

    Args:
        num_frequency_bins: Number of FFT frequency bins.
        num_mel_filters: Number of mel filters to produce.
        min_frequency: Lowest frequency covered, in Hz.
        max_frequency: Highest frequency covered, in Hz.
        sampling_rate: Sampling rate of the audio.
        norm: Optional filter normalization mode.
        mel_scale: Mel scale variant (e.g. "htk").

    Returns:
        The filter bank as a NumPy matrix.
    """
def spectrogram(
    waveform: np.ndarray,
    window: np.ndarray,
    frame_length: int,
    hop_length: int,
    fft_length: Optional[int] = None,
    power: Optional[float] = 1.0,
    center: bool = True,
    pad_mode: str = "reflect",
) -> np.ndarray:
    """Compute the spectrogram of an audio waveform.

    Args:
        waveform: Input audio waveform.
        window: Window function applied to each frame.
        frame_length: Length of each analysis frame.
        hop_length: Step between successive frames.
        fft_length: FFT size (defaults to the frame length).
        power: Exponent applied to the magnitude (1.0 = magnitude).
        center: Whether to pad so frames are centered on samples.
        pad_mode: Padding mode used when centering.

    Returns:
        The spectrogram as a NumPy array.
    """


# Helper functions for image preprocessing.
def is_vision_available() -> bool:
    """Return whether vision-processing libraries are available."""
def load_image(
    image: Union[str, "PIL.Image.Image", np.ndarray, torch.Tensor],
) -> "PIL.Image.Image":
    """Load an image from various input formats into a PIL image.

    Args:
        image: File path, PIL image, NumPy array, or torch tensor.

    Returns:
        The image as a PIL Image.
    """
def resize(
    image: "PIL.Image.Image",
    size: Tuple[int, int],
    resample: "PIL.Image.Resampling" = None,
    reducing_gap: Optional[int] = None,
) -> "PIL.Image.Image":
    """Resize an image to the target size.

    Args:
        image: Input PIL image.
        size: Target (width, height).
        resample: Resampling method.
        reducing_gap: Optional optimization for large downscales.

    Returns:
        The resized PIL image.
    """
def center_crop(
    image: "PIL.Image.Image",
    size: Tuple[int, int],
) -> "PIL.Image.Image":
    """Center-crop an image to the target size.

    Args:
        image: Input PIL image.
        size: Target (width, height).

    Returns:
        The cropped PIL image.
    """
def normalize(
    image: np.ndarray,
    mean: Union[float, List[float]],
    std: Union[float, List[float]],
) -> np.ndarray:
    """Normalize an image with the given mean and standard deviation.

    Args:
        image: Image pixel array.
        mean: Per-channel or scalar mean.
        std: Per-channel or scalar standard deviation.

    Returns:
        The normalized image array.
    """
def rescale(
    image: np.ndarray,
    scale: float,
) -> np.ndarray:
    """Rescale image pixel values by a scalar factor.

    Args:
        image: Image pixel array.
        scale: Multiplicative rescaling factor (e.g. 1/255).

    Returns:
        The rescaled image array.
    """
def to_channel_dimension_format(
    image: np.ndarray,
    channel_dim: Union[ChannelDimension, str],
) -> np.ndarray:
    """Convert an image array to the specified channel-dimension format.

    Args:
        image: Image pixel array.
        channel_dim: Target channel layout (e.g. channels-first/last).

    Returns:
        The converted image array.
    """


# Common preprocessing patterns for different modalities:
# Usage examples: audio, image, and object-detection preprocessing.
from transformers import AutoFeatureExtractor, AutoImageProcessor
import numpy as np
from PIL import Image
# Audio processing
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
# Process single audio file
audio_array = np.random.randn(16000)  # 1 second at 16kHz
inputs = feature_extractor(audio_array, sampling_rate=16000, return_tensors="pt")
# Process batch of audio files (different lengths, so padding=True)
audio_batch = [np.random.randn(16000), np.random.randn(24000)]
inputs = feature_extractor(
audio_batch,
sampling_rate=16000,
padding=True,
return_tensors="pt"
)
# Image processing
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
# Process single image
image = Image.open("example.jpg")
inputs = image_processor(image, return_tensors="pt")
# Process batch of images
images = [Image.open(f"image_{i}.jpg") for i in range(3)]
inputs = image_processor(images, return_tensors="pt")
# Object detection with annotations
from transformers import DetrImageProcessor
image_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
# With bounding box annotations (COCO-style fields)
annotations = [{
"boxes": [[100, 100, 200, 200], [300, 300, 400, 400]],
"labels": [1, 2],
"area": [10000, 10000],
"iscrowd": [0, 0]
}]
inputs = image_processor(
images=image,
annotations=annotations,
return_tensors="pt"
)
# Access processed features
pixel_values = inputs.pixel_values  # Processed image tensors
labels = inputs.labels if hasattr(inputs, 'labels') else None  # Processed annotations
# The feature extraction system supports various input formats:
Audio Formats: raw waveforms as NumPy arrays or Python lists of floats, at the model's expected sampling rate.
Image Formats: PIL images, NumPy arrays, or PyTorch tensors, singly or in batches.
Output Formats: PyTorch (return_tensors="pt"), TensorFlow (return_tensors="tf"), NumPy (return_tensors="np"), or JAX (return_tensors="jax").

Install with the Tessl CLI:
npx tessl i tessl/pypi-transformers