CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-keras-nightly

Multi-backend deep learning framework providing a unified API for building and training neural networks across JAX, TensorFlow, PyTorch, and OpenVINO backends

Pending
Overview
Eval results
Files

docs/preprocessing.md

Data Processing

Comprehensive data preprocessing utilities for images, text, audio, and numerical data with built-in augmentation capabilities, dataset creation functions, and feature preprocessing layers.

Capabilities

Dataset Creation

Functions for creating datasets from various data sources and formats.

def image_dataset_from_directory(directory, labels='inferred', label_mode='int',
                                class_names=None, color_mode='rgb', batch_size=32,
                                image_size=(256, 256), shuffle=True, seed=None,
                                validation_split=None, subset=None, **kwargs):
    """
    Create image dataset from directory structure.

    With labels='inferred', each subdirectory of `directory` is treated as
    one class.

    Args:
        directory (str): Path to directory containing subdirectories of images
        labels (str): How to generate labels ('inferred' or None)
        label_mode (str): Type of labels ('int', 'categorical', 'binary', None)
        class_names (list, optional): Explicit list of class names
        color_mode (str): Image color mode ('grayscale', 'rgb', 'rgba')
        batch_size (int): Batch size
        image_size (tuple): Target image size as (height, width)
        shuffle (bool): Whether to shuffle data
        seed (int, optional): Random seed (use the same seed on the
            'training' and 'validation' calls to get disjoint splits)
        validation_split (float, optional): Fraction for validation
        subset (str, optional): Subset to return ('training' or 'validation')
        **kwargs: Additional options forwarded to the underlying implementation

    Returns:
        Dataset: Configured image dataset
    """

def text_dataset_from_directory(directory, labels='inferred', label_mode='int',
                                class_names=None, batch_size=32, max_length=None,
                                shuffle=True, seed=None, validation_split=None,
                                subset=None, **kwargs):
    """
    Create a text dataset from a directory of text files.

    Mirrors image_dataset_from_directory: with labels='inferred', each
    subdirectory is one class.

    Args:
        directory (str): Path to directory containing subdirectories of text files
        labels (str): How to generate labels ('inferred' or None)
        label_mode (str): Type of labels ('int', 'categorical', 'binary', None)
        class_names (list, optional): Explicit list of class names
        batch_size (int): Batch size
        max_length (int, optional): Maximum sample length (presumably samples
            longer than this are truncated — verify against upstream docs)
        shuffle (bool): Whether to shuffle data
        seed (int, optional): Random seed
        validation_split (float, optional): Fraction for validation
        subset (str, optional): Subset to return ('training' or 'validation')
        **kwargs: Additional options forwarded to the underlying implementation

    Returns:
        Dataset: Dataset yielding batches of (text, label) pairs.
    """
    ...

def timeseries_dataset_from_array(data, targets, sequence_length, sequence_stride=1,
                                 sampling_rate=1, batch_size=128, shuffle=False,
                                 seed=None, start_index=None, end_index=None):
    """
    Create a dataset of sliding windows over a timeseries array.

    Args:
        data: Array of consecutive data points (timesteps)
        targets: Targets corresponding to each window (or None)
        sequence_length (int): Length of each output window
        sequence_stride (int): Offset between the starts of successive windows
        sampling_rate (int): Step between timesteps within one window
        batch_size (int): Batch size
        shuffle (bool): Whether to shuffle the windows
        seed (int, optional): Random seed
        start_index (int, optional): First index of `data` to use
        end_index (int, optional): Last index of `data` to use

    Returns:
        Dataset: Dataset of (window, target) batches.
    """
    ...

def audio_dataset_from_directory(directory, labels='inferred', label_mode='int',
                                class_names=None, batch_size=32, sampling_rate=16000,
                                output_sequence_length=16000, **kwargs):
    """
    Create an audio dataset from a directory of audio files.

    Mirrors image_dataset_from_directory: with labels='inferred', each
    subdirectory is one class.

    Args:
        directory (str): Path to directory containing subdirectories of audio files
        labels (str): How to generate labels ('inferred' or None)
        label_mode (str): Type of labels ('int', 'categorical', 'binary', None)
        class_names (list, optional): Explicit list of class names
        batch_size (int): Batch size
        sampling_rate (int): Audio sampling rate in Hz
        output_sequence_length (int): Fixed number of samples per clip
        **kwargs: Additional options forwarded to the underlying implementation

    Returns:
        Dataset: Dataset yielding batches of (audio, label) pairs.
    """
    ...

Text Processing Layers

Preprocessing layers for text and sequence data including vectorization and encoding.

class TextVectorization:
    """
    Text vectorization layer for converting text to sequences.

    Args:
        max_tokens (int, optional): Maximum vocabulary size
        standardize (str or callable): Text standardization ('lower_and_strip_punctuation', 'lower', 'strip_punctuation', or callable)
        split (str or callable): Text splitting strategy ('whitespace' or callable)
        ngrams (int, optional): N-gram size
        output_mode (str): Output format ('int', 'multi_hot', 'count', 'tf_idf')
        output_sequence_length (int, optional): Output sequence length
        pad_to_max_tokens (bool): Whether to pad to max_tokens
        vocabulary (list, optional): Pre-existing vocabulary
        idf_weights (array, optional): IDF weights for tf-idf mode
        sparse (bool): Whether to return sparse tensors
        ragged (bool): Whether to return ragged tensors

    Note: pad_to_max_tokens, vocabulary, idf_weights, sparse and ragged are
    accepted via **kwargs rather than as named parameters of __init__.
    """
    def __init__(self, max_tokens=None, standardize='lower_and_strip_punctuation',
                 split='whitespace', ngrams=None, output_mode='int',
                 output_sequence_length=None, **kwargs): ...

    def adapt(self, data, batch_size=None, steps=None):
        """Build the vocabulary from a data source (call before applying the layer)."""
        ...

    def get_vocabulary(self):
        """Return the current vocabulary as a list of tokens."""
        ...

    def set_vocabulary(self, vocabulary, idf_weights=None):
        """Set the vocabulary directly; idf_weights apply in 'tf_idf' mode."""
        ...

class StringLookup:
    """
    String to integer lookup layer.

    Args:
        max_tokens (int, optional): Maximum vocabulary size
        num_oov_indices (int): Number of out-of-vocabulary indices
        mask_token (str, optional): Token to use for masking
        oov_token (str): Token to use for out-of-vocabulary
        vocabulary (list, optional): Pre-existing vocabulary
        idf_weights (array, optional): IDF weights
        invert (bool): Whether to invert the lookup (indices back to strings)
        output_mode (str): Output format ('int', 'multi_hot', 'count', 'one_hot', 'tf_idf')
        sparse (bool): Whether to return sparse tensors
        pad_to_max_tokens (bool): Whether to pad to max_tokens

    Note: idf_weights, invert, output_mode, sparse and pad_to_max_tokens are
    accepted via **kwargs rather than as named parameters of __init__.
    """
    def __init__(self, max_tokens=None, num_oov_indices=1, mask_token=None,
                 oov_token='[UNK]', vocabulary=None, **kwargs): ...

class IntegerLookup:
    """
    Integer to integer lookup layer.

    Integer counterpart of StringLookup: maps arbitrary integer tokens to a
    contiguous index range.

    Args:
        max_tokens (int, optional): Maximum vocabulary size
        num_oov_indices (int): Number of out-of-vocabulary indices
        mask_token (int, optional): Token to use for masking
        oov_token (int): Token to use for out-of-vocabulary (default -1)
        vocabulary (list, optional): Pre-existing vocabulary
        **kwargs: Additional options (e.g. output_mode), mirroring StringLookup
    """
    def __init__(self, max_tokens=None, num_oov_indices=1, mask_token=None,
                 oov_token=-1, vocabulary=None, **kwargs): ...

class CategoryEncoding:
    """
    Categorical encoding layer.

    Encodes integer category indices into dense (or sparse) vectors of
    length num_tokens.

    Args:
        num_tokens (int, optional): Total number of tokens
        output_mode (str): Output format ('multi_hot', 'one_hot', 'count');
            'count' presumably emits per-token occurrence counts rather than
            binary indicators — verify against upstream docs
        sparse (bool): Whether to return sparse tensors
    """
    def __init__(self, num_tokens=None, output_mode='multi_hot', sparse=False, **kwargs): ...

Image Processing Layers

Preprocessing layers for image data including resizing, augmentation, and transformations.

class Resizing:
    """
    Resize images to target size.

    Args:
        height (int): Target height
        width (int): Target width
        interpolation (str): Interpolation method ('bilinear', 'nearest', 'bicubic', 'area', 'lanczos3', 'lanczos5', 'gaussian', 'mitchellcubic')
        crop_to_aspect_ratio (bool): Whether to crop to maintain aspect ratio
            (avoids distortion instead of stretching)
        **kwargs: Standard layer keyword arguments
    """
    def __init__(self, height, width, interpolation='bilinear', crop_to_aspect_ratio=False, **kwargs): ...

class CenterCrop:
    """
    Crop images to specified size from center.

    Args:
        height (int): Target height
        width (int): Target width
        **kwargs: Standard layer keyword arguments

    NOTE(review): behavior for inputs smaller than the target size is not
    documented here — verify against upstream docs before relying on it.
    """
    def __init__(self, height, width, **kwargs): ...

class Rescaling:
    """
    Rescale pixel values.

    Computes output = input * scale + offset, e.g. Rescaling(1./255) maps
    [0, 255] pixel values to [0, 1].

    Args:
        scale (float): Scaling factor
        offset (float): Offset value
    """
    def __init__(self, scale, offset=0.0, **kwargs): ...

# Data augmentation layers (random transforms are active during training)
class RandomFlip:
    """
    Random image flipping.

    By default flips both horizontally and vertically.

    Args:
        mode (str): Flip mode ('horizontal', 'vertical', 'horizontal_and_vertical')
        seed (int, optional): Random seed
    """
    def __init__(self, mode='horizontal_and_vertical', seed=None, **kwargs): ...

class RandomRotation:
    """
    Random image rotation.

    Args:
        factor (float or tuple): Rotation factor as fraction of 2π; a scalar
            presumably yields a symmetric range (-factor, factor) — verify
            against upstream docs
        fill_mode (str): Fill mode for transformed pixels outside the input
        interpolation (str): Interpolation method
        seed (int, optional): Random seed
        fill_value (float): Fill value for constant fill mode
    """
    def __init__(self, factor, fill_mode='reflect', interpolation='bilinear',
                 seed=None, fill_value=0.0, **kwargs): ...

class RandomZoom:
    """
    Random image zooming.

    Args:
        height_factor (float or tuple): Zoom range along height
        width_factor (float or tuple, optional): Zoom range along width;
            when None the width behavior presumably follows height_factor —
            verify against upstream docs
        fill_mode (str): Fill mode for pixels outside the input
        interpolation (str): Interpolation method
        seed (int, optional): Random seed
        fill_value (float): Fill value for constant fill mode
    """
    def __init__(self, height_factor, width_factor=None, fill_mode='reflect',
                 interpolation='bilinear', seed=None, fill_value=0.0, **kwargs): ...

class RandomTranslation:
    """
    Random image translation.

    Args:
        height_factor (float or tuple): Vertical shift range
        width_factor (float or tuple): Horizontal shift range
        fill_mode (str): Fill mode for pixels shifted in from outside
        interpolation (str): Interpolation method
        seed (int, optional): Random seed
        fill_value (float): Fill value for constant fill mode
    """
    def __init__(self, height_factor, width_factor, fill_mode='reflect',
                 interpolation='bilinear', seed=None, fill_value=0.0, **kwargs): ...

class RandomCrop:
    """
    Random image cropping.

    Args:
        height (int): Crop height
        width (int): Crop width
        seed (int, optional): Random seed
    """
    def __init__(self, height, width, seed=None, **kwargs): ...

class RandomBrightness:
    """
    Random brightness adjustment.

    Args:
        factor (float or tuple): Brightness adjustment range
        value_range (tuple): Expected pixel value range of inputs
            (default (0, 255) — pass (0, 1) for already-rescaled images)
        seed (int, optional): Random seed
    """
    def __init__(self, factor, value_range=(0, 255), seed=None, **kwargs): ...

class RandomContrast:
    """
    Random contrast adjustment.

    Args:
        factor (float or tuple): Contrast adjustment range
        seed (int, optional): Random seed
    """
    def __init__(self, factor, seed=None, **kwargs): ...

Numerical Processing Layers

Preprocessing layers for numerical data including normalization and discretization.

class Normalization:
    """
    Feature normalization layer.

    Standardizes inputs using a mean and variance that are either supplied
    directly or computed from data via adapt().

    Args:
        axis (int): Axis to normalize along
        mean (array, optional): Pre-computed mean
        variance (array, optional): Pre-computed variance
        invert (bool): Whether to invert normalization (undo standardization)
    """
    def __init__(self, axis=-1, mean=None, variance=None, invert=False, **kwargs): ...

    def adapt(self, data, batch_size=None, steps=None):
        """Compute the mean and variance statistics from sample data."""
        ...

class Discretization:
    """
    Value discretization layer.

    Buckets continuous values into discrete bins, using either explicit
    bin_boundaries or num_bins boundaries learned via adapt().

    Args:
        bin_boundaries (array, optional): Bin boundary values
        num_bins (int, optional): Number of bins
        epsilon (float): Small value for bin boundary adjustment
        output_mode (str): Output format ('int', 'one_hot', 'multi_hot', 'count')
        sparse (bool): Whether to return sparse tensors
    """
    def __init__(self, bin_boundaries=None, num_bins=None, epsilon=0.01,
                 output_mode='int', sparse=False, **kwargs): ...

    def adapt(self, data, batch_size=None, steps=None):
        """Estimate bin boundaries from sample data (used when bin_boundaries
        is not supplied)."""
        ...

Audio Processing Layers

Specialized layers for audio signal processing.

class MelSpectrogram:
    """
    Mel-frequency spectrogram layer.

    Args:
        fft_length (int): FFT length
        sequence_stride (int): Hop length between frames
        sequence_length (int): Window length
        window (str): Window function
        sampling_rate (int): Audio sampling rate
        num_mel_bins (int): Number of mel frequency bins
        min_freq (float): Minimum frequency
        max_freq (float): Maximum frequency
        power_to_db (bool): Whether to convert power to decibels
        top_db (float): Dynamic range for dB conversion
        mag_exp (float): Magnitude exponent

    Note: min_freq, max_freq, power_to_db, top_db and mag_exp are accepted
    via **kwargs rather than as named parameters of __init__.
    """
    def __init__(self, fft_length=2048, sequence_stride=512, sequence_length=None,
                 window='hann', sampling_rate=16000, num_mel_bins=128, **kwargs): ...

class STFTSpectrogram:
    """
    Short-time Fourier transform spectrogram layer.

    Args:
        fft_length (int): FFT length
        sequence_stride (int): Hop length between frames
        sequence_length (int, optional): Window length; presumably defaults
            from fft_length when None — verify against upstream docs
        window (str): Window function (e.g. 'hann')
    """
    def __init__(self, fft_length=2048, sequence_stride=512, sequence_length=None,
                 window='hann', **kwargs): ...

Utility Functions

Additional preprocessing utilities and helper functions.

def split_dataset(dataset, left_size=None, right_size=None, shuffle=False, seed=None):
    """
    Split dataset into two parts.

    Args:
        dataset: Dataset to split
        left_size (float or int, optional): Size of left split; presumably a
            float means a fraction and an int an absolute sample count —
            verify against upstream docs
        right_size (float or int, optional): Size of right split (same semantics)
        shuffle (bool): Whether to shuffle before splitting
        seed (int, optional): Random seed

    Returns:
        tuple: (left_dataset, right_dataset)
    """

def to_categorical(y, num_classes=None, dtype='float32'):
    """
    Convert integer labels to categorical (one-hot) encoding.

    Args:
        y (array): Integer labels, any shape; a trailing length-1 axis
            is squeezed so (n, 1) labels behave like (n,)
        num_classes (int, optional): Total number of classes; inferred
            as max(y) + 1 when omitted
        dtype (str): Output data type

    Returns:
        array: One-hot encoded labels with shape y.shape + (num_classes,)
    """
    import numpy as np

    y = np.asarray(y, dtype='int64')
    input_shape = y.shape
    # Squeeze a trailing length-1 axis so column-vector labels one-hot
    # encode the same way as flat labels.
    if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
        input_shape = tuple(input_shape[:-1])
    flat = y.reshape(-1)
    if num_classes is None:
        num_classes = int(flat.max()) + 1 if flat.size else 0
    one_hot = np.zeros((flat.shape[0], num_classes), dtype=dtype)
    if flat.size:
        one_hot[np.arange(flat.shape[0]), flat] = 1
    return one_hot.reshape(input_shape + (num_classes,))

def normalize(x, axis=-1, order=2):
    """
    Normalize an array so that slices along `axis` have unit norm.

    Args:
        x (array): Input array
        axis (int): Normalization axis
        order (int): Norm order (2 = Euclidean)

    Returns:
        array: Array with each slice divided by its norm; all-zero slices
        are returned unchanged (their norm is treated as 1 to avoid 0/0).
    """
    import numpy as np

    x = np.asarray(x, dtype=float)
    norms = np.atleast_1d(np.linalg.norm(x, order, axis))
    norms[norms == 0] = 1  # leave all-zero slices as-is instead of dividing by 0
    return x / np.expand_dims(norms, axis)

def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre',
                 truncating='pre', value=0.0):
    """
    Pad sequences to same length.

    Args:
        sequences (list): List of sequences (lists/arrays of scalars)
        maxlen (int, optional): Common length; defaults to the longest
            sequence's length
        dtype (str): Output data type
        padding (str): Padding strategy ('pre' or 'post')
        truncating (str): Truncation strategy ('pre' or 'post')
        value (float): Padding value

    Returns:
        array: Padded sequences of shape (len(sequences), maxlen)

    Raises:
        ValueError: If `padding` or `truncating` is not 'pre' or 'post'.
    """
    import numpy as np

    if maxlen is None:
        maxlen = max((len(s) for s in sequences), default=0)
    out = np.full((len(sequences), maxlen), value, dtype=dtype)
    for row, seq in enumerate(sequences):
        if maxlen == 0 or not len(seq):
            continue  # nothing to copy for this row
        if truncating == 'pre':
            trimmed = seq[-maxlen:]  # keep the tail
        elif truncating == 'post':
            trimmed = seq[:maxlen]  # keep the head
        else:
            raise ValueError(f"Unknown truncating type: {truncating!r}")
        trimmed = np.asarray(trimmed, dtype=dtype)
        if padding == 'pre':
            out[row, -len(trimmed):] = trimmed  # right-align, pad at the front
        elif padding == 'post':
            out[row, :len(trimmed)] = trimmed  # left-align, pad at the back
        else:
            raise ValueError(f"Unknown padding type: {padding!r}")
    return out

Usage Examples

Image Data Pipeline

import keras
from keras import layers

# Create dataset from directory.
# NOTE: using validation_split + subset with the SAME seed on both calls is
# what keeps the training/validation splits disjoint and reproducible.
train_dataset = keras.utils.image_dataset_from_directory(
    'path/to/train',
    validation_split=0.2,
    subset='training',
    seed=123,
    image_size=(224, 224),
    batch_size=32
)

val_dataset = keras.utils.image_dataset_from_directory(
    'path/to/train',
    validation_split=0.2,
    subset='validation',
    seed=123,
    image_size=(224, 224),
    batch_size=32
)

# Build preprocessing pipeline of random augmentation layers
data_augmentation = keras.Sequential([
    layers.RandomFlip('horizontal'),
    layers.RandomRotation(0.2),
    layers.RandomZoom(0.2),
    layers.RandomBrightness(0.2),
    layers.RandomContrast(0.2)
])

# Apply augmentation to the training set only (validation stays untouched);
# training=True keeps the random layers active inside map
train_dataset = train_dataset.map(lambda x, y: (data_augmentation(x, training=True), y))

# Rescale pixel values from [0, 255] to [0, 1] on both splits
normalization = layers.Rescaling(1./255)
train_dataset = train_dataset.map(lambda x, y: (normalization(x), y))
val_dataset = val_dataset.map(lambda x, y: (normalization(x), y))

Text Data Pipeline

import keras
from keras import layers

# Create text dataset from a directory of text files (one subdir per class)
train_dataset = keras.utils.text_dataset_from_directory(
    'path/to/text_data',
    batch_size=32,
    validation_split=0.2,
    subset='training',
    seed=123
)

# Text vectorization: lowercase + strip punctuation, cap the vocabulary at
# 10000 tokens, and emit fixed-length sequences of 100 token ids
vectorize_layer = layers.TextVectorization(
    max_tokens=10000,
    output_sequence_length=100,
    standardize='lower_and_strip_punctuation'
)

# Adapt to training data: the vocabulary is built from the text alone,
# so labels are stripped from the (text, label) pairs first
text_only_dataset = train_dataset.map(lambda x, y: x)
vectorize_layer.adapt(text_only_dataset)

# Apply vectorization to every (text, label) batch
train_dataset = train_dataset.map(lambda x, y: (vectorize_layer(x), y))

Install with Tessl CLI

npx tessl i tessl/pypi-keras-nightly

docs

activations.md

applications.md

backend-config.md

core-framework.md

index.md

initializers.md

layers.md

losses-metrics.md

operations.md

optimizers.md

preprocessing.md

regularizers.md

training-callbacks.md

tile.json