Easy data preprocessing and data augmentation for deep learning models
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Sequence padding, temporal data generation, and utilities for preparing sequential data for neural networks. These tools handle variable-length sequences and time series data preprocessing for recurrent neural networks and sequence models.
Utilities for converting variable-length sequences to fixed-length arrays suitable for batch processing in neural networks.
def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre',
truncating='pre', value=0.):
"""
Pad sequences to the same length.
Transforms a list of num_samples sequences (lists of integers) into a 2D
numpy array of shape (num_samples, num_timesteps). Sequences shorter than
num_timesteps are padded with value. Sequences longer than num_timesteps
are truncated.
Parameters:
- sequences (list): List of lists, where each element is a sequence
- maxlen (int, optional): Maximum length of all sequences. If None, uses length of longest sequence
- dtype (str): Type of the output sequences ('int32', 'float32', etc.)
- padding (str): 'pre' or 'post' - pad either before or after each sequence
- truncating (str): 'pre' or 'post' - remove values from sequences larger than maxlen
- value (float or str): Padding value
Returns:
- numpy.ndarray: Array with shape (len(sequences), maxlen)
"""Generator class for creating batches of temporal data from continuous sequences.
class TimeseriesGenerator:
"""
Utility class for generating batches of temporal data.
Creates overlapping sequences from continuous time series data for training
sequence models. Handles sampling, stride, shuffling, and batch generation.
"""
def __init__(self, data, targets, length, sampling_rate=1, stride=1,
start_index=0, end_index=None, shuffle=False, reverse=False,
batch_size=128):
"""
Initialize timeseries generator.
Parameters:
- data (numpy.ndarray): Time series data
- targets (numpy.ndarray): Target values corresponding to data
- length (int): Length of input sequences
- sampling_rate (int): Period between successive individual timesteps
- stride (int): Period between successive sequences
- start_index (int): Data points earlier than start_index will not be used
- end_index (int, optional): Data points later than end_index will not be used
- shuffle (bool): Whether to shuffle the rows at each epoch
- reverse (bool): Whether to reverse the temporal order of sequences
- batch_size (int): Number of timeseries samples in each batch
"""
def __len__(self):
"""
Return number of batches in the generator.
Returns:
- int: Number of batches
"""
def __getitem__(self, index):
"""
Get batch at specified index.
Parameters:
- index (int): Batch index
Returns:
- tuple: (samples, targets) - batch of sequences and corresponding targets
"""
def get_config(self):
"""
Return generator configuration as dictionary.
Returns:
- dict: Configuration dictionary
"""
def to_json(self, **kwargs):
"""
Return JSON string containing generator configuration.
Returns:
- str: JSON string of generator configuration
"""Utilities for generating skipgram word pairs for word2vec training.
def skipgrams(sequence, vocabulary_size, window_size=4, negative_samples=1.,
              shuffle=True, categorical=False, sampling_table=None, seed=None):
    """
    Build (target, context) training pairs for word2vec-style skipgram models.

    Walks a sequence of word indexes and pairs each word with the words
    appearing within window_size positions of it (positive examples). For
    each positive pair, additional pairs matching the word with a randomly
    drawn vocabulary index are produced as negative examples, in the ratio
    given by negative_samples.
    Parameters:
    - sequence (list): Word indexes to draw pairs from
    - vocabulary_size (int): Number of distinct word indexes in the vocabulary
    - window_size (int): Largest allowed distance between a target word and
      its context word
    - negative_samples (float): Number of negative pairs to generate per
      positive pair, expressed as a ratio
    - shuffle (bool): If True, shuffle the generated pairs before returning
    - categorical (bool): If True, return categorical (one-hot style) labels
      instead of scalar 0/1 labels
    - sampling_table (numpy.ndarray, optional): Per-rank keep probabilities
      used to downsample frequent words
    - seed (int, optional): Seed for the random number generator
    Returns:
    - tuple: (couples, labels); couples is the list of word pairs and each
      label marks its pair as positive (1) or negative (0)
    """
def make_sampling_table(size, sampling_factor=1e-5):
"""
Generate word rank-based probabilistic sampling table for skipgrams.
Creates sampling probabilities based on word frequency ranks, used to
downsample frequent words in skipgram generation.
Parameters:
- size (int): Size of vocabulary
- sampling_factor (float): Factor for downsampling frequent words
Returns:
- numpy.ndarray: Sampling probabilities for each word rank
"""def timeseries_generator_from_json(json_string):
"""
Parse JSON timeseries generator configuration and return generator instance.
Parameters:
- json_string (str): JSON string containing generator configuration
Returns:
- TimeseriesGenerator: Generator instance with loaded configuration
"""from keras_preprocessing.sequence import pad_sequences
# Variable length sequences
sequences = [
[1, 2, 3],
[1, 2, 3, 4, 5],
[1, 2]
]
# Pad to same length (default: pre-padding with zeros)
padded = pad_sequences(sequences, maxlen=5)
print(padded)
# [[0 0 1 2 3]
# [1 2 3 4 5]
# [0 0 0 1 2]]
# Post-padding
padded_post = pad_sequences(sequences, maxlen=5, padding='post')
print(padded_post)
# [[1 2 3 0 0]
# [1 2 3 4 5]
# [1 2 0 0 0]]
# Truncation
long_sequences = [[1, 2, 3, 4, 5, 6, 7]]
truncated = pad_sequences(long_sequences, maxlen=5, truncating='post')
print(truncated)  # [[1 2 3 4 5]]
import numpy as np
from keras_preprocessing.sequence import TimeseriesGenerator
# Create sample time series data
data = np.array([i for i in range(50)]) # [0, 1, 2, ..., 49]
targets = data # For autoregression, targets can be same as data
# Create generator for sequences of length 10
generator = TimeseriesGenerator(
data=data,
targets=targets,
length=10,
batch_size=6,
sampling_rate=1,
stride=1
)
print(f"Number of batches: {len(generator)}") # 7
# Get first batch
batch_x, batch_y = generator[0]
print(f"Batch shape: {batch_x.shape}") # (6, 10)
print(f"Target shape: {batch_y.shape}") # (6,)
# First sequence: data[0:10] -> target[10]
print(f"First sequence: {batch_x[0]} -> {batch_y[0]}")
# [0 1 2 3 4 5 6 7 8 9] -> 10
from keras_preprocessing.sequence import skipgrams, make_sampling_table
# Sample word sequence
sequence = [1, 2, 3, 4, 5, 2, 6, 7, 8, 9]
vocabulary_size = 10
# Generate skipgrams
couples, labels = skipgrams(
sequence=sequence,
vocabulary_size=vocabulary_size,
window_size=2,
negative_samples=1.0
)
print(f"Generated {len(couples)} word pairs")
print(f"Positive samples: {sum(labels)}")
print(f"Negative samples: {len(labels) - sum(labels)}")
# Example couples and labels
for i in range(5):
word, context = couples[i]
label_type = "positive" if labels[i] == 1 else "negative"
print(f"({word}, {context}) - {label_type}")
# Create sampling table for frequent word downsampling
sampling_table = make_sampling_table(vocabulary_size)
print(f"Sampling probabilities: {sampling_table[:5]}")# Multi-feature time series
data = np.random.randn(100, 3) # 100 timesteps, 3 features
targets = np.random.randn(100, 1) # Regression targets
# Generator with stride and sampling
generator = TimeseriesGenerator(
data=data,
targets=targets,
length=15,
sampling_rate=2, # Use every 2nd timestep
stride=3, # Move 3 steps between sequences
batch_size=4,
shuffle=True,
reverse=False
)
# Get configuration for serialization
config = generator.get_config()
json_config = generator.to_json()
Install with Tessl CLI
npx tessl i tessl/pypi-keras-preprocessing