CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-keras-preprocessing

Easy data preprocessing and data augmentation for deep learning models

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/sequence-processing.md

Sequence Processing

Sequence padding, temporal data generation, and utilities for preparing sequential data for neural networks. These tools handle variable-length sequences and time series data preprocessing for recurrent neural networks and sequence models.

Capabilities

Sequence Padding

Utilities for converting variable-length sequences to fixed-length arrays suitable for batch processing in neural networks.

def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', 
                  truncating='pre', value=0.):
    """
    Pad sequences to the same length.
    
    Transforms a list of num_samples sequences (lists of integers) into a 2D 
    numpy array of shape (num_samples, num_timesteps). Sequences shorter than 
    num_timesteps are padded with value. Sequences longer than num_timesteps 
    are truncated.
    
    Parameters:
    - sequences (list): List of lists, where each element is a sequence
    - maxlen (int, optional): Maximum length of all sequences. If None, uses length of longest sequence
    - dtype (str): Type of the output sequences ('int32', 'float32', etc.)
    - padding (str): 'pre' or 'post' - pad either before or after each sequence
    - truncating (str): 'pre' or 'post' - remove values from sequences larger than maxlen
    - value (float or str): Padding value
    
    Returns:
    - numpy.ndarray: Array with shape (len(sequences), maxlen)
    
    Raises:
    - ValueError: If `sequences` (or any element of it) is not iterable, or if
      `padding`/`truncating` is not 'pre' or 'post'
    """
    import numpy as np  # local import keeps this snippet self-contained

    if not hasattr(sequences, '__len__'):
        raise ValueError('`sequences` must be iterable.')
    lengths = []
    for seq in sequences:
        if not hasattr(seq, '__len__'):
            raise ValueError('`sequences` must be a list of iterables. '
                             'Found non-iterable: ' + str(seq))
        lengths.append(len(seq))

    if maxlen is None:
        # Default to the longest sequence; 0 when `sequences` is empty.
        maxlen = max(lengths, default=0)

    # Start from an all-padding array and copy each (truncated) sequence in.
    x = np.full((len(sequences), maxlen), value, dtype=dtype)
    for idx, seq in enumerate(sequences):
        if not len(seq):
            continue  # empty sequence stays all-padding
        if truncating == 'pre':
            trunc = seq[-maxlen:]   # keep the last maxlen timesteps
        elif truncating == 'post':
            trunc = seq[:maxlen]    # keep the first maxlen timesteps
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)
        trunc = np.asarray(trunc, dtype=dtype)
        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x

Time Series Generation

Generator class for creating batches of temporal data from continuous sequences.

class TimeseriesGenerator:
    """
    Utility class for generating batches of temporal data.
    
    Creates overlapping sequences from continuous time series data for training
    sequence models. Handles sampling, stride, shuffling, and batch generation.
    """
    
    def __init__(self, data, targets, length, sampling_rate=1, stride=1, 
                 start_index=0, end_index=None, shuffle=False, reverse=False, 
                 batch_size=128):
        """
        Initialize timeseries generator.
        
        Parameters:
        - data (numpy.ndarray): Time series data
        - targets (numpy.ndarray): Target values corresponding to data
        - length (int): Length of input sequences
        - sampling_rate (int): Period between successive individual timesteps
        - stride (int): Period between successive sequences
        - start_index (int): Data points earlier than start_index will not be used
        - end_index (int, optional): Data points later than end_index will not be used
        - shuffle (bool): Whether to shuffle the rows at each epoch
        - reverse (bool): Whether to reverse the temporal order of sequences
        - batch_size (int): Number of timeseries samples in each batch
        
        Raises:
        - ValueError: If data and targets differ in length, or if no full
          window of `length` samples fits between start_index and end_index
        """
        if len(data) != len(targets):
            raise ValueError('Data and targets have to be of same length. '
                             'Data length is {} while target length is {}'
                             .format(len(data), len(targets)))
        self.data = data
        self.targets = targets
        self.length = length
        self.sampling_rate = sampling_rate
        self.stride = stride
        # First index at which a full window of `length` past samples exists.
        self.start_index = start_index + length
        if end_index is None:
            end_index = len(data) - 1
        self.end_index = end_index
        self.shuffle = shuffle
        self.reverse = reverse
        self.batch_size = batch_size
        if self.start_index > self.end_index:
            raise ValueError('`start_index+length=%i > end_index=%i` is '
                             'disallowed, as no part of the sequence would be '
                             'left to be used as current step.'
                             % (self.start_index, self.end_index))
    
    def __len__(self):
        """
        Return number of batches in the generator.
        
        Returns:
        - int: Number of batches (ceiling of usable rows / batch rows)
        """
        return (self.end_index - self.start_index +
                self.batch_size * self.stride) // (self.batch_size * self.stride)
    
    def __getitem__(self, index):
        """
        Get batch at specified index.
        
        Parameters:
        - index (int): Batch index
        
        Returns:
        - tuple: (samples, targets) - batch of sequences and corresponding targets
        """
        import numpy as np  # local import keeps this snippet self-contained
        if self.shuffle:
            # Random rows each call; `index` is ignored in shuffle mode.
            rows = np.random.randint(
                self.start_index, self.end_index + 1, size=self.batch_size)
        else:
            i = self.start_index + self.batch_size * self.stride * index
            rows = np.arange(i, min(i + self.batch_size * self.stride,
                                    self.end_index + 1), self.stride)
        # Each sample is the `length`-step window ending just before `row`,
        # subsampled every `sampling_rate` steps; the target is at `row`.
        samples = np.array([self.data[row - self.length:row:self.sampling_rate]
                            for row in rows])
        targets = np.array([self.targets[row] for row in rows])
        if self.reverse:
            return samples[:, ::-1, ...], targets
        return samples, targets
    
    def get_config(self):
        """
        Return generator configuration as dictionary.
        
        Returns:
        - dict: Configuration dictionary ('data' and 'targets' are stored as
          JSON strings so the whole dict is JSON-serializable)
        
        Raises:
        - TypeError: If data or targets cannot be serialized to JSON
        """
        import json
        import numpy as np
        data = self.data
        if type(self.data).__module__ == np.__name__:
            data = self.data.tolist()
        try:
            json_data = json.dumps(data)
        except TypeError:
            raise TypeError('Data not JSON Serializable:', data)
        targets = self.targets
        if type(self.targets).__module__ == np.__name__:
            targets = self.targets.tolist()
        try:
            json_targets = json.dumps(targets)
        except TypeError:
            raise TypeError('Targets not JSON Serializable:', targets)
        return {
            'data': json_data,
            'targets': json_targets,
            'length': self.length,
            'sampling_rate': self.sampling_rate,
            'stride': self.stride,
            # Undo the `+ length` applied in __init__ so the config
            # round-trips through the constructor unchanged.
            'start_index': self.start_index - self.length,
            'end_index': self.end_index,
            'shuffle': self.shuffle,
            'reverse': self.reverse,
            'batch_size': self.batch_size,
        }
    
    def to_json(self, **kwargs):
        """
        Return JSON string containing generator configuration.
        
        Parameters:
        - **kwargs: Extra keyword arguments forwarded to json.dumps
        
        Returns:
        - str: JSON string of generator configuration
        """
        import json
        config = self.get_config()
        timeseries_generator_config = {
            'class_name': self.__class__.__name__,
            'config': config,
        }
        return json.dumps(timeseries_generator_config, **kwargs)

Skipgram Generation

Utilities for generating skipgram word pairs for word2vec training.

def skipgrams(sequence, vocabulary_size, window_size=4, negative_samples=1., 
              shuffle=True, categorical=False, sampling_table=None, seed=None):
    """
    Generate skipgram word pairs for word2vec training.
    
    Creates (word, context) pairs and (word, random_word) negative samples
    from a sequence of word indexes. Index 0 is treated as a non-word
    (padding) and never appears in a couple.
    
    Parameters:
    - sequence (list): Sequence of word indexes
    - vocabulary_size (int): Size of vocabulary
    - window_size (int): Maximum distance between current and predicted word
    - negative_samples (float): Ratio of negative samples to positive samples
    - shuffle (bool): Whether to shuffle word couples before returning
    - categorical (bool): Whether to return categorical labels
    - sampling_table (numpy.ndarray, optional): Probability table for sampling
    - seed (int, optional): Random seed
    
    Returns:
    - tuple: (couples, labels) where couples is list of word pairs and labels 
             indicates positive (1) or negative (0) samples
    """
    import random  # local import keeps this snippet self-contained

    couples = []
    labels = []
    for i, wi in enumerate(sequence):
        if not wi:
            continue  # skip padding index 0
        if sampling_table is not None:
            # Probabilistically drop frequent words.
            if sampling_table[wi] < random.random():
                continue
        window_start = max(0, i - window_size)
        window_end = min(len(sequence), i + window_size + 1)
        for j in range(window_start, window_end):
            if j == i:
                continue  # a word is not its own context
            wj = sequence[j]
            if not wj:
                continue
            couples.append([wi, wj])
            labels.append([0, 1] if categorical else 1)

    if negative_samples > 0:
        num_negative_samples = int(len(labels) * negative_samples)
        # Pair shuffled target words with random vocabulary indexes (>= 1).
        words = [c[0] for c in couples]
        random.shuffle(words)
        couples += [[words[i % len(words)],
                     random.randint(1, vocabulary_size - 1)]
                    for i in range(num_negative_samples)]
        if categorical:
            labels += [[1, 0]] * num_negative_samples
        else:
            labels += [0] * num_negative_samples

    if shuffle:
        if seed is None:
            seed = random.randint(0, 10 ** 6)
        # Seed twice with the same value so couples and labels are
        # shuffled into the same permutation.
        random.seed(seed)
        random.shuffle(couples)
        random.seed(seed)
        random.shuffle(labels)

    return couples, labels

def make_sampling_table(size, sampling_factor=1e-5):
    """
    Generate word rank-based probabilistic sampling table for skipgrams.
    
    Creates sampling probabilities based on word frequency ranks, used to
    downsample frequent words in skipgram generation. Word frequencies are
    assumed to follow Zipf's law; the expected frequency of the word of
    rank k is approximated via the harmonic-number expansion
    1/(k * (log(k) + gamma) + 1/2 - 1/(12k)).
    
    Parameters:
    - size (int): Size of vocabulary
    - sampling_factor (float): Factor for downsampling frequent words
    
    Returns:
    - numpy.ndarray: Sampling probabilities for each word rank
    """
    import numpy as np  # local import keeps this snippet self-contained

    if size <= 0:
        return np.array([], dtype=np.float64)
    gamma = 0.577  # Euler-Mascheroni constant
    rank = np.arange(size)
    rank[0] = 1  # rank 0 is the reserved non-word index; avoid log(0)
    # Inverse of the Zipf-approximated frequency for each rank.
    inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1. / (12. * rank)
    f = sampling_factor * inv_fq
    # Keep-probability is min(1, sqrt(f)); rare words are always kept.
    return np.minimum(1., f / np.sqrt(f))

Serialization

def timeseries_generator_from_json(json_string):
    """
    Parse JSON timeseries generator configuration and return generator instance.
    
    Inverse of TimeseriesGenerator.to_json: expects a JSON object with a
    'config' entry whose 'data' and 'targets' values are themselves JSON
    strings (as produced by get_config).
    
    Parameters:
    - json_string (str): JSON string containing generator configuration
    
    Returns:
    - TimeseriesGenerator: Generator instance with loaded configuration
    """
    import json  # local import keeps this snippet self-contained

    full_config = json.loads(json_string)
    config = full_config['config']
    # data/targets were double-encoded by get_config; decode them back.
    data = json.loads(config.pop('data'))
    targets = json.loads(config.pop('targets'))
    return TimeseriesGenerator(data, targets, **config)

Usage Examples

Basic Sequence Padding

from keras_preprocessing.sequence import pad_sequences

# Three sequences with lengths 3, 5 and 2
sequences = [[1, 2, 3], [1, 2, 3, 4, 5], [1, 2]]

# Default behaviour: zeros are inserted *before* each sequence (pre-padding)
padded = pad_sequences(sequences, maxlen=5)
print(padded)
# [[0 0 1 2 3]
#  [1 2 3 4 5]
#  [0 0 0 1 2]]

# Zeros appended after each sequence instead
padded_post = pad_sequences(sequences, maxlen=5, padding='post')
print(padded_post)
# [[1 2 3 0 0]
#  [1 2 3 4 5]
#  [1 2 0 0 0]]

# Sequences longer than maxlen are cut; 'post' drops the trailing values
long_sequences = [[1, 2, 3, 4, 5, 6, 7]]
truncated = pad_sequences(long_sequences, maxlen=5, truncating='post')
print(truncated)  # [[1 2 3 4 5]]

Time Series Data Generation

import numpy as np
from keras_preprocessing.sequence import TimeseriesGenerator

# Toy series: the integers 0..49
data = np.arange(50)
targets = data  # autoregression: predict the series from itself

# Sliding windows of 10 steps, served 6 windows per batch
generator = TimeseriesGenerator(
    data=data,
    targets=targets,
    length=10,
    batch_size=6,
    sampling_rate=1,
    stride=1,
)

print(f"Number of batches: {len(generator)}")  # 7

# Inspect the first batch
batch_x, batch_y = generator[0]
print(f"Batch shape: {batch_x.shape}")  # (6, 10)
print(f"Target shape: {batch_y.shape}")  # (6,)

# The window data[0:10] is paired with target[10]
print(f"First sequence: {batch_x[0]} -> {batch_y[0]}")
# [0 1 2 3 4 5 6 7 8 9] -> 10

Skipgram Generation for Word2Vec

from keras_preprocessing.sequence import skipgrams, make_sampling_table

# A short sequence of word indexes over a 10-word vocabulary
sequence = [1, 2, 3, 4, 5, 2, 6, 7, 8, 9]
vocabulary_size = 10

# Build (target, context) pairs plus an equal number of negative samples
couples, labels = skipgrams(
    sequence=sequence,
    vocabulary_size=vocabulary_size,
    window_size=2,
    negative_samples=1.0,
)

print(f"Generated {len(couples)} word pairs")
print(f"Positive samples: {sum(labels)}")
print(f"Negative samples: {len(labels) - sum(labels)}")

# Show the first few pairs with their label type
for idx in range(5):
    word, context = couples[idx]
    label_type = "positive" if labels[idx] == 1 else "negative"
    print(f"({word}, {context}) - {label_type}")

# Table of keep-probabilities used to downsample very frequent words
sampling_table = make_sampling_table(vocabulary_size)
print(f"Sampling probabilities: {sampling_table[:5]}")

Advanced Time Series with Custom Parameters

# A 3-feature series of 100 timesteps with scalar regression targets
data = np.random.randn(100, 3)
targets = np.random.randn(100, 1)

# Windows of length 15 that skip every other timestep and start
# 3 steps apart, shuffled into batches of 4
generator = TimeseriesGenerator(
    data=data,
    targets=targets,
    length=15,
    sampling_rate=2,  # use every 2nd timestep
    stride=3,         # move 3 steps between sequences
    batch_size=4,
    shuffle=True,
    reverse=False,
)

# Round-trip the generator settings for serialization
config = generator.get_config()
json_config = generator.to_json()

Install with Tessl CLI

npx tessl i tessl/pypi-keras-preprocessing

docs

image-processing.md

index.md

sequence-processing.md

text-processing.md

tile.json