CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-keras

Multi-backend deep learning framework that provides a unified, high-level API for building and training neural networks across JAX, TensorFlow, PyTorch, and OpenVINO backends.

Pending
Overview
Eval results
Files

docs/data-utils.md

Data Processing and Utilities

Built-in datasets, data preprocessing utilities, image processing functions, and various helper utilities for machine learning workflows. These tools simplify data preparation and provide ready-to-use datasets for experimentation.

Capabilities

Built-in Datasets

Standard datasets commonly used for machine learning research and experimentation, pre-loaded and ready to use.

# MNIST handwritten digits dataset
def load_data():
    """
    Load the MNIST handwritten-digits dataset.

    The data is downloaded on first use and cached locally
    (presumably under ~/.keras/datasets via get_file — confirm).

    Returns:
    Tuple of ((x_train, y_train), (x_test, y_test))
    - x_train, x_test: uint8 arrays of grayscale image data with shape (num_samples, 28, 28)
    - y_train, y_test: uint8 arrays of digit labels (0-9) with shape (num_samples,)
    """

# Fashion-MNIST dataset (available as keras.datasets.fashion_mnist.load_data())
# CIFAR-10 dataset (available as keras.datasets.cifar10.load_data())
# CIFAR-100 dataset (available as keras.datasets.cifar100.load_data())
# IMDB movie reviews dataset (available as keras.datasets.imdb.load_data())
# Reuters newswire dataset (available as keras.datasets.reuters.load_data())
# Boston housing dataset (available as keras.datasets.boston_housing.load_data())
# California housing dataset (available as keras.datasets.california_housing.load_data())

Image Processing Utilities

Functions for loading, saving, and manipulating images for machine learning workflows.

def load_img(path, color_mode='rgb', target_size=None, interpolation='nearest', 
             keep_aspect_ratio=False):
    """
    Load an image file into PIL format.
    
    Parameters:
    - path: Path to the image file on disk
    - color_mode: One of 'grayscale', 'rgb', 'rgba'
    - target_size: Optional (height, width) tuple to resize to; when None
      the image keeps its original size
    - interpolation: Interpolation method used when resizing
      (e.g. 'nearest', 'bilinear')
    - keep_aspect_ratio: Whether to keep the aspect ratio when resizing
      (presumably by center-cropping before resize — confirm)
    
    Returns:
    PIL Image instance
    """

def save_img(path, x, data_format=None, file_format=None, scale=True, **kwargs):
    """
    Save an image array to disk.
    
    Parameters:
    - path: Path to save image
    - x: Image array data (numpy array or tensor)
    - data_format: Image data format ('channels_first' or 'channels_last')
    - file_format: Image file format ('png', 'jpeg', etc.);
      presumably inferred from the path extension when None — confirm
    - scale: Whether to rescale pixel values to [0, 255] before saving
    - kwargs: Extra options forwarded to the underlying image writer
    """

def img_to_array(img, data_format=None, dtype=None):
    """
    Convert a PIL Image instance to a numpy array.
    
    Parameters:
    - img: PIL Image instance
    - data_format: Image data format ('channels_first' or 'channels_last');
      falls back to the global Keras setting when None
    - dtype: Data type for output array (presumably the backend float
      dtype, float32, when None — confirm)
    
    Returns:
    3D numpy array representation of the image
    """

def array_to_img(x, data_format=None, scale=True, dtype=None):
    """
    Convert a numpy array to a PIL Image.
    
    Parameters:
    - x: Input array (3D image tensor)
    - data_format: Image data format ('channels_first' or 'channels_last')
    - scale: Whether to rescale values into the [0, 255] range before
      converting
    - dtype: Data type used for the intermediate array
    
    Returns:
    PIL Image instance
    """

Data Transformation Utilities

Functions for common data preprocessing tasks including categorical encoding, normalization, and sequence processing.

def to_categorical(y, num_classes=None, dtype='float32'):
    """
    Convert a class vector (integers) to a categorical (one-hot) matrix.

    Parameters:
    - y: Array-like of integer class labels to convert
    - num_classes: Total number of classes; inferred as max(y) + 1 when None
    - dtype: Data type for output matrix

    Returns:
    Binary matrix representation of input as a numpy array: an input of
    shape (...,) yields output of shape (..., num_classes). A trailing
    axis of length 1 (e.g. shape (n, 1)) is squeezed first, matching the
    Keras behavior for column vectors.
    """
    import numpy as np

    y = np.asarray(y, dtype='int64')
    input_shape = y.shape
    # Keras squeezes a trailing length-1 axis so column vectors work too.
    if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
        input_shape = tuple(input_shape[:-1])
    flat = y.reshape(-1)
    if num_classes is None:
        num_classes = int(np.max(flat)) + 1
    one_hot = np.zeros((flat.shape[0], num_classes), dtype=dtype)
    one_hot[np.arange(flat.shape[0]), flat] = 1
    return one_hot.reshape(input_shape + (num_classes,))

def normalize(x, axis=-1, order=2):
    """
    Normalize an array along the specified axis.

    Parameters:
    - x: Array-like to normalize
    - axis: Axis along which to normalize
    - order: Normalization order (1 for L1, 2 for L2)

    Returns:
    Normalized numpy array. Slices whose norm is zero are left as zeros
    (their norm is treated as 1 to avoid division by zero).
    """
    import numpy as np

    x = np.asarray(x)
    norms = np.atleast_1d(np.linalg.norm(x, order, axis))
    norms[norms == 0] = 1  # all-zero slices stay zero instead of becoming NaN
    return x / np.expand_dims(norms, axis)

def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre',
                  truncating='pre', value=0.0):
    """
    Pad a list of variable-length sequences to a common length.

    Parameters:
    - sequences: List of sequences (each a list/array of scalars)
    - maxlen: Target length; defaults to the longest sequence's length
    - dtype: Data type for the output array
    - padding: 'pre' or 'post' — side on which padding is added
    - truncating: 'pre' or 'post' — side from which values are dropped
      when a sequence is longer than maxlen
    - value: Padding value

    Returns:
    2D numpy array with shape (len(sequences), maxlen)

    Raises:
    ValueError: if padding or truncating is not 'pre' or 'post'
    """
    import numpy as np

    if padding not in ('pre', 'post'):
        raise ValueError(f"`padding` should be 'pre' or 'post', got: {padding}")
    if truncating not in ('pre', 'post'):
        raise ValueError(f"`truncating` should be 'pre' or 'post', got: {truncating}")

    lengths = [len(seq) for seq in sequences]
    if maxlen is None:
        maxlen = max(lengths) if lengths else 0

    out = np.full((len(sequences), maxlen), value, dtype=dtype)
    for row, seq in enumerate(sequences):
        if not len(seq) or maxlen == 0:
            continue  # nothing to copy; row stays all-padding
        if truncating == 'pre':
            trimmed = seq[-maxlen:]
        else:
            trimmed = seq[:maxlen]
        trimmed = np.asarray(trimmed, dtype=dtype)
        if padding == 'pre':
            out[row, maxlen - len(trimmed):] = trimmed
        else:
            out[row, :len(trimmed)] = trimmed
    return out

Dataset Creation Utilities

Functions for creating tf.data.Dataset objects from directories and arrays for efficient data loading.

def image_dataset_from_directory(directory, labels='inferred', label_mode='int',
                                class_names=None, color_mode='rgb', batch_size=32,
                                image_size=(256, 256), shuffle=True, seed=None,
                                validation_split=None, subset=None, interpolation='bilinear',
                                follow_links=False, crop_to_aspect_ratio=False):
    """
    Generate a tf.data.Dataset from an image directory.

    With labels='inferred', the directory is expected to contain one
    subdirectory per class, each holding that class's image files.
    
    Parameters:
    - directory: Path to directory containing subdirectories of images
    - labels: 'inferred' (from directory structure) or explicit list of labels
    - label_mode: 'int', 'categorical', 'binary', or None (no labels)
    - class_names: List of class names (overrides inferred names and
      controls label ordering)
    - color_mode: 'grayscale', 'rgb', or 'rgba'
    - batch_size: Batch size
    - image_size: (height, width) to resize images to
    - shuffle: Whether to shuffle data
    - seed: Random seed for shuffling/splitting; use the same seed for
      the 'training' and 'validation' subsets so the splits are disjoint
    - validation_split: Fraction of data reserved for validation
    - subset: 'training' or 'validation' (required when validation_split is set)
    - interpolation: Interpolation method for resizing
    - follow_links: Whether to follow symlinks when scanning the directory
    - crop_to_aspect_ratio: Whether to crop so resizing preserves aspect ratio
    
    Returns:
    tf.data.Dataset object yielding (images, labels) batches
    """

def text_dataset_from_directory(directory, labels='inferred', label_mode='int',
                               class_names=None, batch_size=32, max_length=None,
                               shuffle=True, seed=None, validation_split=None,
                               subset=None, follow_links=False):
    """
    Generate a tf.data.Dataset from a directory of text files.

    With labels='inferred', the directory is expected to contain one
    subdirectory per class, each holding that class's text files.
    
    Parameters:
    - directory: Path to directory containing text files
    - labels: 'inferred' or explicit list of labels
    - label_mode: 'int', 'categorical', 'binary', or None (no labels)
    - class_names: List of class names (controls label ordering)
    - batch_size: Batch size
    - max_length: Maximum text length; longer texts are truncated
      (presumably in characters — confirm against Keras docs)
    - shuffle: Whether to shuffle data
    - seed: Random seed for shuffling/splitting
    - validation_split: Fraction of data reserved for validation
    - subset: 'training' or 'validation' (required when validation_split is set)
    - follow_links: Whether to follow symlinks when scanning the directory
    
    Returns:
    tf.data.Dataset object yielding (texts, labels) batches
    """

def timeseries_dataset_from_array(data, targets, sequence_length, sequence_stride=1,
                                 sampling_rate=1, batch_size=128, shuffle=False,
                                 seed=None, start_index=None, end_index=None):
    """
    Create a tf.data.Dataset of sliding windows over a time series array.
    
    Parameters:
    - data: Array of consecutive data points (axis 0 is the time axis)
    - targets: Array of targets aligned with data (presumably None is
      accepted to yield input windows only — confirm)
    - sequence_length: Length of each output window
    - sequence_stride: Step between the starts of successive windows
    - sampling_rate: Step between points sampled within a window
    - batch_size: Batch size
    - shuffle: Whether to shuffle the windows
    - seed: Random seed used when shuffle is True
    - start_index: Start index for data
    - end_index: End index for data
      (boundary inclusivity per Keras docs — confirm)
    
    Returns:
    tf.data.Dataset object yielding (inputs, targets) tuples
    """

Data Utilities

Utility classes and functions for advanced data handling including custom datasets, feature engineering, and data packing.

class Sequence:
    """
    Base class for fitting a model to a sequence of data batches.

    Subclasses must implement __getitem__ and __len__; on_epoch_end is an
    optional hook. NOTE(review): Keras documents Sequence as safe for
    multiprocess data loading, unlike plain generators — confirm for the
    active backend.
    """
    
    def __init__(self):
        """Initialize sequence."""
        
    def __getitem__(self, index):
        """
        Get batch at index.
        
        Parameters:
        - index: Batch index (0 <= index < len(self))
        
        Returns:
        Batch data (typically an (inputs, targets) pair)
        """
    
    def __len__(self):
        """
        Number of batches in the sequence.
        
        Returns:
        Number of batches
        """
    
    def on_epoch_end(self):
        """Hook called at the end of every epoch (e.g. to reshuffle data)."""

class FeatureSpace:
    """
    Utility for declarative feature preprocessing and engineering: maps
    named raw features through per-feature preprocessing layers and
    combines the results.
    """
    
    def __init__(self, features, output_mode='concat'):
        """
        Initialize feature space.
        
        Parameters:
        - features: Dict mapping feature names to preprocessing layers
        - output_mode: 'concat' (single concatenated tensor) or 'dict'
          (per-feature outputs keyed by feature name)
        """
    
    def adapt(self, dataset):
        """
        Fit the preprocessing state (vocabularies, statistics) on a dataset.
        Call this before applying the feature space to data.
        
        Parameters:
        - dataset: Dataset of raw feature dicts to adapt to
        """
    
    def __call__(self, data):
        """
        Apply feature preprocessing.
        
        Parameters:
        - data: Dict of raw input features
        
        Returns:
        Preprocessed features (format controlled by output_mode)
        """

def pack_x_y_sample_weight(x, y=None, sample_weight=None):
    """
    Pack user-provided data into a tuple, matching the Keras convention.

    Parameters:
    - x: Input data
    - y: Target data
    - sample_weight: Sample weights

    Returns:
    - x itself when only x is given (wrapped as (x,) if x is already a
      tuple/list, to avoid ambiguity when unpacking)
    - (x, y) when y is given
    - (x, y, sample_weight) when all three are given
    """
    if y is None:
        # A bare tuple/list x would be misread as (x, y) on unpacking,
        # so wrap it; any other x is returned as-is.
        if not isinstance(x, (tuple, list)):
            return x
        return (x,)
    if sample_weight is None:
        return (x, y)
    return (x, y, sample_weight)

def unpack_x_y_sample_weight(data):
    """
    Unpack user-provided data into an (x, y, sample_weight) triple.

    Parameters:
    - data: Either a bare x, or a tuple/list of the form (x,), (x, y),
      or (x, y, sample_weight)

    Returns:
    Tuple of (x, y, sample_weight), with missing entries set to None

    Raises:
    ValueError: if data is a tuple/list with more than three elements
    """
    if isinstance(data, list):
        data = tuple(data)
    if not isinstance(data, tuple):
        return (data, None, None)
    if len(data) == 1:
        return (data[0], None, None)
    if len(data) == 2:
        return (data[0], data[1], None)
    if len(data) == 3:
        return (data[0], data[1], data[2])
    raise ValueError(
        "Data is expected to be in format `x`, `(x,)`, `(x, y)`, "
        f"or `(x, y, sample_weight)`, found: {data}"
    )

def split_dataset(dataset, left_size=None, right_size=None, shuffle=False, seed=None):
    """
    Split a dataset into two datasets.
    
    Parameters:
    - dataset: Dataset to split
    - left_size: Size of left split — presumably a float in [0, 1] is a
      fraction and an int a sample count, inferred from right_size when
      None (confirm against Keras docs)
    - right_size: Size of right split (same conventions as left_size)
    - shuffle: Whether to shuffle before splitting
    - seed: Random seed used when shuffle is True
    
    Returns:
    Tuple of (left_dataset, right_dataset)
    """

File and Download Utilities

Functions for downloading files and managing data assets.

def get_file(fname=None, origin=None, untar=False, md5_hash=None, file_hash=None,
             cache_subdir='datasets', hash_algorithm='auto', extract=False,
             archive_format='auto', cache_dir=None):
    """
    Download a file from a URL if it is not already in the local cache.
    
    Parameters:
    - fname: Name to store the file under (if different from the origin name)
    - origin: Original URL of file
    - untar: Whether to untar file after download (legacy; prefer extract)
    - md5_hash: MD5 hash for verification (deprecated; prefer file_hash)
    - file_hash: Expected hash of the file after download
    - cache_subdir: Subdirectory under the cache directory
    - hash_algorithm: Hash algorithm ('md5', 'sha256', 'auto')
    - extract: Whether to extract the archive after download
    - archive_format: Archive format ('auto', 'tar', 'zip')
    - cache_dir: Location to store cached files (presumably defaults to
      ~/.keras — confirm)
    
    Returns:
    Path to the downloaded (and possibly extracted) file
    """

Configuration and Random Utilities

Utilities for setting random seeds and managing global configuration.

def set_random_seed(seed=None):
    """
    Set random seeds for reproducible results.

    NOTE(review): in Keras this seeds the Python `random` module, NumPy,
    and the active backend's RNG in one call — confirm exact scope per
    backend.
    
    Parameters:
    - seed: Random seed value (integer)
    """

class Config:
    """Global configuration utility for toggling named feature flags."""
    
    def enable(self, feature):
        """Turn the named configuration feature on."""
        
    def disable(self, feature):
        """Turn the named configuration feature off."""
        
    def is_enabled(self, feature):
        """Return whether the named feature is currently enabled."""

class Progbar:
    """Console progress bar utility for training loops."""
    
    def __init__(self, target, width=30, verbose=1, interval=0.05,
                 stateful_metrics=None, unit_name='step'):
        """
        Initialize progress bar.
        
        Parameters:
        - target: Total number of steps expected
        - width: Progress bar width in characters
        - verbose: Verbosity mode (0 = silent, 1 = animated bar)
        - interval: Minimum visual update interval in seconds
        - stateful_metrics: Iterable of metric names that should be
          displayed as-is rather than averaged over steps
        - unit_name: Display name for step units (e.g. 'step', 'sample')
        """
    
    def update(self, current, values=None, finalize=None):
        """
        Update progress bar to a given step.
        
        Parameters:
        - current: Current step index
        - values: List of (name, value) tuples for metrics to display
        - finalize: Whether to finalize the bar (presumably defaults to
          current >= target when None — confirm)
        """

Usage Examples

Loading and Preprocessing Images

import keras
from keras.utils import load_img, img_to_array, to_categorical
import numpy as np

# Load and preprocess single image
img_path = 'cat.jpg'
img = load_img(img_path, target_size=(224, 224))
img_array = img_to_array(img)
# Add a leading batch dimension: (224, 224, 3) -> (1, 224, 224, 3)
img_array = np.expand_dims(img_array, axis=0)
# In-place division works because img_to_array yields a float array
img_array /= 255.0  # Normalize to [0, 1]

# Convert labels to categorical
labels = [0, 1, 2, 1, 0]  # Class indices
categorical_labels = to_categorical(labels, num_classes=3)
print(categorical_labels)
# Expected output — one one-hot row per label:
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]
#  [1. 0. 0.]]

Creating Datasets from Directories

import keras

# Create image dataset from directory structure
# (expects one subdirectory per class under train_data/)
train_dataset = keras.utils.image_dataset_from_directory(
    'path/to/train_data/',
    labels='inferred',
    label_mode='categorical',
    color_mode='rgb',
    batch_size=32,
    image_size=(224, 224),
    shuffle=True,
    validation_split=0.2,
    subset='training',
    seed=123
)

# Same seed as above so the training/validation splits are disjoint
val_dataset = keras.utils.image_dataset_from_directory(
    'path/to/train_data/',
    labels='inferred',
    label_mode='categorical',
    color_mode='rgb',
    batch_size=32,
    image_size=(224, 224),
    shuffle=True,
    validation_split=0.2,
    subset='validation',
    seed=123
)

# Use datasets for training
# model.fit(train_dataset, validation_data=val_dataset, epochs=10)

Working with Built-in Datasets

import keras
from keras.datasets import mnist, cifar10
from keras.utils import to_categorical
# NOTE(review): cifar10 is imported but unused in this example

# Load MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Preprocess data: scale uint8 pixels into [0, 1]
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
# Add the trailing channel axis conv layers expect: (n, 28, 28) -> (n, 28, 28, 1)
x_train = x_train.reshape(x_train.shape[0], 28, 28, 1)
x_test = x_test.reshape(x_test.shape[0], 28, 28, 1)

# Convert integer labels (0-9) to one-hot vectors
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

print(f"Training data shape: {x_train.shape}")
print(f"Training labels shape: {y_train.shape}")

Custom Data Sequence

import keras
import numpy as np

class CustomDataSequence(keras.utils.Sequence):
    """Minimal keras.utils.Sequence yielding (batch_x, batch_y) pairs."""

    def __init__(self, x_data, y_data, batch_size):
        self.x_data = x_data
        self.y_data = y_data
        self.batch_size = batch_size
        # Index array reshuffled after each epoch (see on_epoch_end)
        self.indices = np.arange(len(self.x_data))
    
    def __len__(self):
        # Integer division: a trailing partial batch is dropped
        return len(self.x_data) // self.batch_size
    
    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        batch_x = self.x_data[batch_indices]
        batch_y = self.y_data[batch_indices]
        return batch_x, batch_y
    
    def on_epoch_end(self):
        # Shuffle so each epoch sees batches in a different order
        np.random.shuffle(self.indices)

# Use custom sequence
# train_sequence = CustomDataSequence(x_train, y_train, batch_size=32)
# model.fit(train_sequence, epochs=10)

Feature Engineering with FeatureSpace

import keras
from keras import layers
from keras.utils import FeatureSpace

# Define feature preprocessing: one preprocessing layer per raw feature
feature_space = FeatureSpace(
    features={
        'age': layers.Normalization(),
        'category': layers.StringLookup(output_mode='one_hot'),
        'price': layers.Discretization(num_bins=10),
    },
    output_mode='concat'
)

# Adapt (fit) the preprocessing state on training data before first use
# feature_space.adapt(train_dataset)

# Apply preprocessing
# processed_features = feature_space(raw_features)

Creating Time Series Dataset

import keras
import numpy as np

# Generate sample time series data: targets are the same sine wave
# shifted one step (0.1 rad) into the future
data = np.sin(np.arange(1000) * 0.1)
targets = np.sin(np.arange(1000) * 0.1 + 0.1)

# Create dataset of 10-step windows for next-value prediction
dataset = keras.utils.timeseries_dataset_from_array(
    data=data,
    targets=targets,
    sequence_length=10,
    batch_size=32,
    shuffle=True
)

# Use for training RNN models
# model.fit(dataset, epochs=10)

Install with Tessl CLI

npx tessl i tessl/pypi-keras

docs

activations.md

applications.md

data-utils.md

index.md

initializers.md

layers.md

models.md

operations.md

random.md

regularizers.md

saving.md

training.md

tile.json