or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

image-processing.md index.md sequence-processing.md text-processing.md

docs/sequence-processing.md

0

# Sequence Processing

1

2

Sequence padding, temporal data generation, and utilities for preparing sequential data for neural networks. These tools handle variable-length sequences and time series data preprocessing for recurrent neural networks and sequence models.

3

4

## Capabilities

5

6

### Sequence Padding

7

8

Utilities for converting variable-length sequences to fixed-length arrays suitable for batch processing in neural networks.

9

10

```python { .api }

11

def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre',

12

truncating='pre', value=0.):

13

"""

14

Pad sequences to the same length.

15

16

Transforms a list of num_samples sequences (lists of integers) into a 2D

17

numpy array of shape (num_samples, num_timesteps). Sequences shorter than

18

num_timesteps are padded with value. Sequences longer than num_timesteps

19

are truncated.

20

21

Parameters:

22

- sequences (list): List of lists, where each element is a sequence

23

- maxlen (int, optional): Maximum length of all sequences. If None, uses length of longest sequence

24

- dtype (str): Type of the output sequences ('int32', 'float32', etc.)

25

- padding (str): 'pre' or 'post' - pad either before or after each sequence

26

- truncating (str): 'pre' or 'post' - remove values from sequences longer than maxlen, either at the beginning ('pre') or at the end ('post') of each sequence

27

- value (float or str): Padding value

28

29

Returns:

30

- numpy.ndarray: Array with shape (len(sequences), maxlen)

31

"""

32

```

33

34

### Time Series Generation

35

36

Generator class for creating batches of temporal data from continuous sequences.

37

38

```python { .api }

39

class TimeseriesGenerator:

40

"""

41

Utility class for generating batches of temporal data.

42

43

Creates overlapping sequences from continuous time series data for training

44

sequence models. Handles sampling, stride, shuffling, and batch generation.

45

"""

46

47

def __init__(self, data, targets, length, sampling_rate=1, stride=1,

48

start_index=0, end_index=None, shuffle=False, reverse=False,

49

batch_size=128):

50

"""

51

Initialize timeseries generator.

52

53

Parameters:

54

- data (numpy.ndarray): Time series data

55

- targets (numpy.ndarray): Target values corresponding to data

56

- length (int): Length of input sequences

57

- sampling_rate (int): Period between successive individual timesteps within each sequence

58

- stride (int): Period between successive sequences

59

- start_index (int): Data points earlier than start_index will not be used

60

- end_index (int, optional): Data points later than end_index will not be used

61

- shuffle (bool): Whether to draw output samples in random order instead of chronological order

62

- reverse (bool): Whether the timesteps in each output sample are returned in reverse chronological order

63

- batch_size (int): Number of timeseries samples in each batch

64

"""

65

66

def __len__(self):

67

"""

68

Return number of batches in the generator.

69

70

Returns:

71

- int: Number of batches

72

"""

73

74

def __getitem__(self, index):

75

"""

76

Get batch at specified index.

77

78

Parameters:

79

- index (int): Batch index

80

81

Returns:

82

- tuple: (samples, targets) - batch of sequences and corresponding targets

83

"""

84

85

def get_config(self):

86

"""

87

Return generator configuration as dictionary.

88

89

Returns:

90

- dict: Configuration dictionary

91

"""

92

93

def to_json(self, **kwargs):

94

"""

95

Return JSON string containing generator configuration.

96

97

Returns:

98

- str: JSON string of generator configuration

99

"""

100

```

101

102

### Skipgram Generation

103

104

Utilities for generating skipgram word pairs for word2vec training.

105

106

```python { .api }

107

def skipgrams(sequence, vocabulary_size, window_size=4, negative_samples=1.,

108

shuffle=True, categorical=False, sampling_table=None, seed=None):

109

"""

110

Generate skipgram word pairs for word2vec training.

111

112

Creates (word, context) pairs and (word, random_word) negative samples

113

from a sequence of word indexes.

114

115

Parameters:

116

- sequence (list): Sequence of word indexes

117

- vocabulary_size (int): Size of vocabulary

118

- window_size (int): Maximum distance between current and predicted word

119

- negative_samples (float): Ratio of negative samples to positive samples

120

- shuffle (bool): Whether to shuffle word couples before returning

121

- categorical (bool): Whether to return categorical labels

122

- sampling_table (numpy.ndarray, optional): Probability table for sampling

123

- seed (int, optional): Random seed

124

125

Returns:

126

- tuple: (couples, labels) where couples is a list of word pairs and labels

127

indicate whether each pair is a positive (1) or negative (0) sample

128

"""

129

130

def make_sampling_table(size, sampling_factor=1e-5):

131

"""

132

Generate word rank-based probabilistic sampling table for skipgrams.

133

134

Creates sampling probabilities based on word frequency ranks, used to

135

downsample frequent words in skipgram generation.

136

137

Parameters:

138

- size (int): Size of vocabulary

139

- sampling_factor (float): Factor for downsampling frequent words

140

141

Returns:

142

- numpy.ndarray: Sampling probabilities for each word rank

143

"""

144

```

145

146

### Serialization

147

148

```python { .api }

149

def timeseries_generator_from_json(json_string):

150

"""

151

Parse JSON timeseries generator configuration and return generator instance.

152

153

Parameters:

154

- json_string (str): JSON string containing generator configuration

155

156

Returns:

157

- TimeseriesGenerator: Generator instance with loaded configuration

158

"""

159

```

160

161

## Usage Examples

162

163

### Basic Sequence Padding

164

165

```python

166

from keras_preprocessing.sequence import pad_sequences

167

168

# Variable length sequences

169

sequences = [

170

[1, 2, 3],

171

[1, 2, 3, 4, 5],

172

[1, 2]

173

]

174

175

# Pad to same length (default: pre-padding with zeros)

176

padded = pad_sequences(sequences, maxlen=5)

177

print(padded)

178

# [[0 0 1 2 3]

179

# [1 2 3 4 5]

180

# [0 0 0 1 2]]

181

182

# Post-padding

183

padded_post = pad_sequences(sequences, maxlen=5, padding='post')

184

print(padded_post)

185

# [[1 2 3 0 0]

186

# [1 2 3 4 5]

187

# [1 2 0 0 0]]

188

189

# Truncation

190

long_sequences = [[1, 2, 3, 4, 5, 6, 7]]

191

truncated = pad_sequences(long_sequences, maxlen=5, truncating='post')

192

print(truncated) # [[1 2 3 4 5]]

193

```

194

195

### Time Series Data Generation

196

197

```python

198

import numpy as np

199

from keras_preprocessing.sequence import TimeseriesGenerator

200

201

# Create sample time series data

202

data = np.array([i for i in range(50)]) # [0, 1, 2, ..., 49]

203

targets = data # For autoregression, targets can be the same as data

204

205

# Create generator for sequences of length 10

206

generator = TimeseriesGenerator(

207

data=data,

208

targets=targets,

209

length=10,

210

batch_size=6,

211

sampling_rate=1,

212

stride=1

213

)

214

215

print(f"Number of batches: {len(generator)}") # 7

216

217

# Get first batch

218

batch_x, batch_y = generator[0]

219

print(f"Batch shape: {batch_x.shape}") # (6, 10)

220

print(f"Target shape: {batch_y.shape}") # (6,)

221

222

# First sequence: data[0:10] -> target[10]

223

print(f"First sequence: {batch_x[0]} -> {batch_y[0]}")

224

# [0 1 2 3 4 5 6 7 8 9] -> 10

225

```

226

227

### Skipgram Generation for Word2Vec

228

229

```python

230

from keras_preprocessing.sequence import skipgrams, make_sampling_table

231

232

# Sample word sequence

233

sequence = [1, 2, 3, 4, 5, 2, 6, 7, 8, 9]

234

vocabulary_size = 10

235

236

# Generate skipgrams

237

couples, labels = skipgrams(

238

sequence=sequence,

239

vocabulary_size=vocabulary_size,

240

window_size=2,

241

negative_samples=1.0

242

)

243

244

print(f"Generated {len(couples)} word pairs")

245

print(f"Positive samples: {sum(labels)}")

246

print(f"Negative samples: {len(labels) - sum(labels)}")

247

248

# Example couples and labels

249

for i in range(5):

250

word, context = couples[i]

251

label_type = "positive" if labels[i] == 1 else "negative"

252

print(f"({word}, {context}) - {label_type}")

253

254

# Create sampling table for frequent word downsampling

255

sampling_table = make_sampling_table(vocabulary_size)

256

print(f"Sampling probabilities: {sampling_table[:5]}")

257

```

258

259

### Advanced Time Series with Custom Parameters

260

261

```python

262

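# Multi-feature time series (assumes `np` and `TimeseriesGenerator` are imported as in the Time Series Data Generation example)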

263

data = np.random.randn(100, 3) # 100 timesteps, 3 features

264

targets = np.random.randn(100, 1) # Regression targets

265

266

# Generator with stride and sampling

267

generator = TimeseriesGenerator(

268

data=data,

269

targets=targets,

270

length=15,

271

sampling_rate=2, # Use every 2nd timestep

272

stride=3, # Move 3 steps between sequences

273

batch_size=4,

274

shuffle=True,

275

reverse=False

276

)

277

278

# Get configuration for serialization

279

config = generator.get_config()

280

json_config = generator.to_json()

281

```