CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-transformers

State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow

Overview
Eval results
Files

docs/feature-extraction.md

Feature Extraction

Audio and image preprocessing capabilities for multimodal models, providing consistent interfaces for different modalities. The feature extraction system handles format conversion, normalization, resizing, and model-specific preprocessing requirements.

Capabilities

Auto Feature Extractors

Automatic selection of appropriate feature extractors based on model configurations.

class AutoFeatureExtractor:
    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        cache_dir: Union[str, os.PathLike] = None,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Union[str, bool] = None,
        revision: str = "main",
        **kwargs
    ):
        """
        Load feature extractor automatically detecting the type.
        
        Args:
            pretrained_model_name_or_path: Model name or path
            cache_dir: Custom cache directory
            force_download: Force fresh download
            local_files_only: Only use local files
            token: Authentication token
            revision: Model revision/branch
        
        Returns:
            Appropriate feature extractor instance
        """

class AutoImageProcessor:
    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs
    ):
        """Load image processor automatically detecting the type."""

Base Feature Extraction Classes

Foundation classes for all feature extractors.

class FeatureExtractionMixin:
    """Base class for all feature extractors."""
    
    def __init__(self, **kwargs):
        """Initialize the feature extractor with configuration keyword arguments."""
    
    def __call__(self, *args, **kwargs):
        """Main preprocessing method."""
    
    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        **kwargs
    ) -> "FeatureExtractionMixin":
        """Load feature extractor from pretrained model."""
    
    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        push_to_hub: bool = False,
        **kwargs
    ) -> None:
        """Save feature extractor to directory."""
    
    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary representation."""

class ImageProcessingMixin:
    """Base class for image processors."""
    
    def __call__(
        self,
        images: Union["PIL.Image.Image", np.ndarray, torch.Tensor, List],
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Process images for model input.
        
        Args:
            images: Input image(s) in various formats
            return_tensors: Format of returned tensors
            **kwargs: Additional processing parameters
        
        Returns:
            Processed image features
        """
    
    def preprocess(self, images, **kwargs) -> BatchFeature:
        """Alias for __call__."""

Audio Feature Extractors

Preprocessing for audio and speech models.

class Wav2Vec2FeatureExtractor(FeatureExtractionMixin):
    def __init__(
        self,
        feature_size: int = 1,
        sampling_rate: int = 16000,
        padding_value: float = 0.0,
        do_normalize: bool = True,
        return_attention_mask: bool = False,
        **kwargs
    ):
        """
        Wav2Vec2 audio feature extractor.
        
        Args:
            feature_size: Feature dimension
            sampling_rate: Expected sampling rate
            padding_value: Value for padding
            do_normalize: Normalize audio values
            return_attention_mask: Return attention mask
        """
    
    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Union[bool, str] = False,
        max_length: Optional[int] = None,
        truncation: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Process raw audio for Wav2Vec2 models.
        
        Args:
            raw_speech: Raw audio waveform(s)
            padding: Padding strategy
            max_length: Maximum sequence length
            truncation: Enable truncation
            pad_to_multiple_of: Pad to multiple of this value
            return_tensors: Format of returned tensors
            sampling_rate: Sampling rate of input audio
            return_attention_mask: Return attention mask
        
        Returns:
            Processed audio features
        """

class WhisperFeatureExtractor(FeatureExtractionMixin):
    def __init__(
        self,
        feature_size: int = 80,
        sampling_rate: int = 16000,
        hop_length: int = 160,
        chunk_length: int = 30,
        n_fft: int = 400,
        **kwargs
    ):
        """
        Whisper mel-spectrogram feature extractor.
        
        Args:
            feature_size: Number of mel filters
            sampling_rate: Audio sampling rate
            hop_length: Hop length for STFT
            chunk_length: Audio chunk length in seconds
            n_fft: FFT window size
        """
    
    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray]],
        truncation: bool = True,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_attention_mask: Optional[bool] = None,
        sampling_rate: Optional[int] = None,
        **kwargs
    ) -> BatchFeature:
        """Process raw audio for Whisper models."""

Image Processors

Preprocessing for computer vision models.

class ViTImageProcessor(ImageProcessingMixin):
    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: "PIL.Image.Resampling" = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1/255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        **kwargs
    ):
        """
        Vision Transformer image processor.
        
        Args:
            do_resize: Whether to resize images
            size: Target size dictionary
            resample: Resampling method
            do_rescale: Whether to rescale pixel values
            rescale_factor: Rescaling factor
            do_normalize: Whether to normalize
            image_mean: Mean for normalization
            image_std: Standard deviation for normalization
            do_convert_rgb: Convert to RGB format
        """
    
    def __call__(
        self,
        images: Union["PIL.Image.Image", np.ndarray, torch.Tensor, List],
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs
    ) -> BatchFeature:
        """Process images for Vision Transformer models."""

class ConvNextImageProcessor(ImageProcessingMixin):
    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        crop_pct: float = 0.875,
        resample: "PIL.Image.Resampling" = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1/255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs
    ):
        """ConvNeXT image processor with crop percentage."""

class DetrImageProcessor(ImageProcessingMixin):
    def __init__(
        self,
        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: "PIL.Image.Resampling" = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1/255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_annotations: Optional[bool] = None,
        **kwargs
    ):
        """
        DETR image processor for object detection.
        
        Args:
            format: Annotation format (COCO, PASCAL VOC, etc.)
            do_convert_annotations: Convert annotation format
        """
    
    def __call__(
        self,
        images: Union["PIL.Image.Image", np.ndarray, torch.Tensor, List],
        annotations: Optional[Union[Dict, List[Dict]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Process images and annotations for DETR models.
        
        Args:
            images: Input images
            annotations: Bounding box annotations
            return_tensors: Format of returned tensors
        
        Returns:
            Processed features with images and annotations
        """

Batch Feature Container

Container for processed features with convenient access methods.

class BatchFeature:
    """Container for batch of processed features."""
    
    def __init__(
        self,
        data: Optional[Dict[str, Any]] = None,
        tensor_type: Union[None, str, TensorType] = None
    ):
        """Initialize with a dict of feature data and an optional target tensor type."""
    
    def __getitem__(self, item: Union[str, int]) -> Any:
        """Access feature data by key or index."""
    
    def __setitem__(self, key: str, value: Any) -> None:
        """Set feature data value."""
    
    def keys(self) -> List[str]:
        """Get all available keys."""
    
    def values(self) -> List[Any]:
        """Get all values."""
    
    def items(self) -> List[Tuple[str, Any]]:
        """Get key-value pairs."""
    
    def to(self, device: Union[str, torch.device, int]) -> "BatchFeature":
        """Move tensors to specified device."""
    
    def convert_to_tensors(
        self,
        tensor_type: Optional[Union[str, TensorType]] = None
    ) -> "BatchFeature":
        """Convert to specified tensor format."""
    
    @property
    def pixel_values(self) -> Optional[torch.Tensor]:
        """Processed image pixel values."""
    
    @property
    def input_features(self) -> Optional[torch.Tensor]:
        """Processed audio input features."""

Audio Processing Utilities

Helper functions for audio preprocessing.

def is_speech_available() -> bool:
    """Check if speech processing libraries are available."""

def load_audio(
    audio: Union[str, np.ndarray],
    sampling_rate: int = 16000
) -> np.ndarray:
    """Load and resample audio file."""

def mel_filter_bank(
    num_frequency_bins: int,
    num_mel_filters: int,
    min_frequency: float,
    max_frequency: float,
    sampling_rate: int,
    norm: Optional[str] = None,
    mel_scale: str = "htk"
) -> np.ndarray:
    """Create mel filter bank matrix."""

def spectrogram(
    waveform: np.ndarray,
    window: np.ndarray,
    frame_length: int,
    hop_length: int,
    fft_length: Optional[int] = None,
    power: Optional[float] = 1.0,
    center: bool = True,
    pad_mode: str = "reflect"
) -> np.ndarray:
    """Compute spectrogram of audio waveform."""

Image Processing Utilities

Helper functions for image preprocessing.

def is_vision_available() -> bool:
    """Check if vision processing libraries are available."""

def load_image(
    image: Union[str, "PIL.Image.Image", np.ndarray, torch.Tensor]
) -> "PIL.Image.Image":
    """Load image from various input formats."""

def resize(
    image: "PIL.Image.Image",
    size: Tuple[int, int],
    resample: "PIL.Image.Resampling" = None,
    reducing_gap: Optional[int] = None
) -> "PIL.Image.Image":
    """Resize image to target size."""

def center_crop(
    image: "PIL.Image.Image",
    size: Tuple[int, int]
) -> "PIL.Image.Image":
    """Center crop image to target size."""

def normalize(
    image: np.ndarray,
    mean: Union[float, List[float]],
    std: Union[float, List[float]]
) -> np.ndarray:
    """Normalize image with mean and standard deviation."""

def rescale(
    image: np.ndarray,
    scale: float
) -> np.ndarray:
    """Rescale image pixel values."""

def to_channel_dimension_format(
    image: np.ndarray,
    channel_dim: Union[ChannelDimension, str]
) -> np.ndarray:
    """Convert image to specified channel dimension format."""

Feature Extraction Examples

Common preprocessing patterns for different modalities:

from transformers import AutoFeatureExtractor, AutoImageProcessor
import numpy as np
from PIL import Image

# Audio processing
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

# Process single audio file
audio_array = np.random.randn(16000)  # 1 second at 16kHz
inputs = feature_extractor(audio_array, sampling_rate=16000, return_tensors="pt")

# Process batch of audio files
audio_batch = [np.random.randn(16000), np.random.randn(24000)]
inputs = feature_extractor(
    audio_batch,
    sampling_rate=16000,
    padding=True,
    return_tensors="pt"
)

# Image processing
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

# Process single image
image = Image.open("example.jpg")
inputs = image_processor(image, return_tensors="pt")

# Process batch of images
images = [Image.open(f"image_{i}.jpg") for i in range(3)]
inputs = image_processor(images, return_tensors="pt")

# Object detection with annotations
from transformers import DetrImageProcessor

image_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")

# With bounding box annotations
annotations = [{
    "boxes": [[100, 100, 200, 200], [300, 300, 400, 400]],
    "labels": [1, 2],
    "area": [10000, 10000],
    "iscrowd": [0, 0]
}]

inputs = image_processor(
    images=image,
    annotations=annotations,
    return_tensors="pt"
)

# Access processed features
pixel_values = inputs.pixel_values  # Processed image tensors
labels = inputs.labels if hasattr(inputs, 'labels') else None  # Processed annotations

Supported Input Formats

The feature extraction system supports various input formats:

Audio Formats:

  • NumPy arrays (raw waveforms)
  • Lists of floats (raw audio samples)
  • Audio file paths (automatically loaded)
  • Batch processing with automatic padding

Image Formats:

  • PIL Images
  • NumPy arrays (H, W, C) format
  • PyTorch tensors
  • TensorFlow tensors
  • File paths (automatically loaded)
  • Batch processing with consistent preprocessing

Output Formats:

  • PyTorch tensors (return_tensors="pt")
  • TensorFlow tensors (return_tensors="tf")
  • NumPy arrays (return_tensors="np")
  • JAX arrays (return_tensors="jax")

Install with Tessl CLI

npx tessl i tessl/pypi-transformers

docs

feature-extraction.md

generation.md

index.md

models.md

optimization.md

pipelines.md

tokenization.md

training.md

tile.json