# Arrays and Homogeneous Data

PyTables provides several array classes optimized for different use cases with homogeneous data storage. These include standard arrays for fixed-size datasets, chunked arrays for large data with compression, enlargeable arrays for growing datasets, and variable-length arrays for irregular data structures.

## Capabilities

### Standard Arrays

Fixed-size arrays for storing homogeneous data with direct NumPy integration and memory-mapped access.
```python { .api }
class Array:
    def __init__(self, parentnode, name, obj=None, title="", byteorder=None, **kwargs):
        """
        Array constructor (typically called via File.create_array).

        Parameters:
        - parentnode (Group): Parent group
        - name (str): Array name
        - obj (array-like): Initial data or shape specification
        - title (str): Descriptive title
        - byteorder (str): Byte order ('little', 'big', 'native')
        """

    def read(self, start=None, stop=None, step=None, out=None):
        """
        Read array data with optional slicing.

        Parameters:
        - start (int or tuple): Starting indices for each dimension
        - stop (int or tuple): Stopping indices for each dimension
        - step (int or tuple): Step sizes for each dimension
        - out (ndarray): Pre-allocated output array

        Returns:
        ndarray: Array data with requested slice
        """

    def __getitem__(self, key):
        """
        Array-style indexing and slicing.

        Parameters:
        - key (int, slice, tuple): Index specification

        Returns:
        ndarray or scalar: Selected data
        """

    def __setitem__(self, key, value):
        """
        Array-style assignment with indexing and slicing.

        Parameters:
        - key (int, slice, tuple): Index specification
        - value (scalar or array-like): Data to assign
        """

    def iterrows(self, start=None, stop=None, step=None):
        """
        Iterate over array rows.

        Parameters:
        - start (int): Starting row index
        - stop (int): Stopping row index
        - step (int): Step size

        Yields:
        ndarray: Each row as a 1D array
        """
```

### Chunked Arrays (CArray)

Arrays stored in chunks for efficient compression and partial I/O operations on large datasets.

```python { .api }
class CArray:
    def __init__(self, parentnode, name, atom, shape, title="", filters=None, chunkshape=None, byteorder=None, **kwargs):
        """
        Chunked array constructor (typically called via File.create_carray).

        Parameters:
        - parentnode (Group): Parent group
        - name (str): Array name
        - atom (Atom): Data type specification
        - shape (tuple): Array dimensions
        - title (str): Descriptive title
        - filters (Filters): Compression options
        - chunkshape (tuple): Chunk dimensions for optimization
        - byteorder (str): Byte order specification
        """

    def read(self, start=None, stop=None, step=None, out=None):
        """
        Read chunked array data with chunk-aware optimization.

        Parameters:
        - start (int or tuple): Starting indices
        - stop (int or tuple): Stopping indices
        - step (int or tuple): Step sizes
        - out (ndarray): Pre-allocated output array

        Returns:
        ndarray: Requested data with chunk-optimized access
        """

    def __getitem__(self, key):
        """Chunk-optimized array indexing."""

    def __setitem__(self, key, value):
        """Chunk-optimized array assignment."""
```

### Enlargeable Arrays (EArray)

Arrays that can grow along one dimension, ideal for streaming data or incremental data collection.

```python { .api }
class EArray:
    def __init__(self, parentnode, name, atom, shape, title="", filters=None, expectedrows=1000, chunkshape=None, byteorder=None, **kwargs):
        """
        Enlargeable array constructor (typically called via File.create_earray).

        Parameters:
        - parentnode (Group): Parent group
        - name (str): Array name
        - atom (Atom): Data type specification
        - shape (tuple): Initial shape (first dimension can be 0 for empty)
        - title (str): Descriptive title
        - filters (Filters): Compression options
        - expectedrows (int): Expected final size for optimization
        - chunkshape (tuple): Chunk dimensions
        - byteorder (str): Byte order specification
        """

    def append(self, sequence):
        """
        Append data to the enlargeable dimension.

        Parameters:
        - sequence (array-like): Data to append along first dimension
        """

    def read(self, start=None, stop=None, step=None, out=None):
        """
        Read data with support for the enlargeable dimension.

        Parameters:
        - start (int or tuple): Starting indices
        - stop (int or tuple): Stopping indices
        - step (int or tuple): Step sizes
        - out (ndarray): Pre-allocated output array

        Returns:
        ndarray: Requested data
        """

    def truncate(self, size):
        """
        Truncate array to specified size along enlargeable dimension.

        Parameters:
        - size (int): New size for first dimension
        """
```

### Variable-Length Arrays (VLArray)

Arrays where each row can have different lengths, suitable for irregular data structures.

```python { .api }
class VLArray:
    def __init__(self, parentnode, name, atom, title="", filters=None, expectedrows=1000, chunkshape=None, byteorder=None, **kwargs):
        """
        Variable-length array constructor (typically called via File.create_vlarray).

        Parameters:
        - parentnode (Group): Parent group
        - name (str): Array name
        - atom (Atom): Data type for individual elements
        - title (str): Descriptive title
        - filters (Filters): Compression options
        - expectedrows (int): Expected number of rows
        - chunkshape (int): Rows per chunk
        - byteorder (str): Byte order specification
        """

    def append(self, sequence):
        """
        Append a new variable-length row.

        Parameters:
        - sequence (array-like): Data for the new row (can be any length)
        """

    def read(self, start=None, stop=None, step=None):
        """
        Read variable-length rows.

        Parameters:
        - start (int): Starting row index
        - stop (int): Stopping row index
        - step (int): Step size

        Returns:
        list: List of arrays, one per row
        """

    def __getitem__(self, key):
        """
        Access individual rows or slices.

        Parameters:
        - key (int or slice): Row selection

        Returns:
        ndarray or list: Single row array or list of row arrays
        """

    def __setitem__(self, key, value):
        """
        Set individual rows.

        Parameters:
        - key (int): Row index
        - value (array-like): New row data
        """

    def get_row_size(self, row):
        """
        Get the length of a specific row.

        Parameters:
        - row (int): Row index

        Returns:
        int: Number of elements in the specified row
        """

    def iterrows(self, start=None, stop=None, step=None):
        """
        Iterate over variable-length rows.

        Parameters:
        - start (int): Starting row index
        - stop (int): Stopping row index
        - step (int): Step size

        Yields:
        ndarray: Each row as a 1D array
        """
```

## Common Array Properties

```python { .api }
# Properties available on all array types
class ArrayBase:
    @property
    def shape(self):
        """Tuple describing array dimensions."""

    @property
    def size(self):
        """Total number of elements in the array."""

    @property
    def ndim(self):
        """Number of array dimensions."""

    @property
    def dtype(self):
        """NumPy data type of array elements."""

    @property
    def atom(self):
        """Atom object describing element type."""

    @property
    def size_in_memory(self):
        """Estimated memory usage of array data."""

    @property
    def size_on_disk(self):
        """Actual disk space used by the array."""

    @property
    def chunkshape(self):
        """Chunk dimensions (for chunked arrays)."""

    @property
    def filters(self):
        """Applied compression filters."""
```

## Usage Examples

### Standard Arrays

```python
import tables as tb
import numpy as np

with tb.open_file("arrays.h5", "w") as h5file:
    # Create arrays from existing data
    data_2d = np.random.random((100, 50))
    array_2d = h5file.create_array("/", "data_2d", data_2d, "2D Random Data")

    # Create a zero-filled array with a specified shape and type
    empty_array = h5file.create_array("/", "empty", np.zeros((10, 20)), "Empty Array")

    # Access data
    subset = array_2d[10:20, 5:15]  # Slice operation
    single_value = array_2d[0, 0]  # Single element

    # Modify data
    array_2d[0:5, 0:5] = np.ones((5, 5))
```

### Chunked Arrays for Large Data

```python
import tables as tb
import numpy as np

with tb.open_file("large_data.h5", "w") as h5file:
    # Create large chunked array with compression
    filters = tb.Filters(complevel=6, complib='blosc')
    large_array = h5file.create_carray("/", "large_data",
                                       tb.Float64Atom(),
                                       shape=(10000, 10000),
                                       filters=filters,
                                       chunkshape=(100, 100))

    # Fill array in chunks to manage memory
    for i in range(0, 10000, 100):
        for j in range(0, 10000, 100):
            chunk_data = np.random.random((100, 100))
            large_array[i:i+100, j:j+100] = chunk_data

    # Efficient partial reads
    corner = large_array[0:500, 0:500]
```

### Enlargeable Arrays for Streaming Data

```python
import tables as tb
import numpy as np

with tb.open_file("streaming.h5", "w") as h5file:
    # Create enlargeable array starting with zero rows
    earray = h5file.create_earray("/", "stream_data",
                                  tb.Float32Atom(),
                                  shape=(0, 10),  # 0 rows initially, 10 columns
                                  expectedrows=100000)

    # Simulate streaming data arrival
    for batch in range(100):
        # Generate batch of new data (varying size)
        batch_size = np.random.randint(50, 200)
        new_data = np.random.random((batch_size, 10))

        # Append to array
        earray.append(new_data)

    print(f"Final array shape: {earray.shape}")

    # Read recent data
    recent_data = earray[-1000:]  # Last 1000 rows
```

### Variable-Length Arrays for Irregular Data

```python
import tables as tb
import numpy as np

with tb.open_file("irregular.h5", "w") as h5file:
    # Create VLArray for storing sequences of different lengths
    vlarray = h5file.create_vlarray("/", "sequences",
                                    tb.Int32Atom(),
                                    "Variable Length Sequences")

    # Add sequences of different lengths
    sequences = [
        [1, 2, 3],
        [10, 20, 30, 40, 50],
        [100],
        [7, 8, 9, 10, 11, 12, 13, 14, 15]
    ]

    for seq in sequences:
        vlarray.append(seq)

    # Access individual sequences
    first_seq = vlarray[0]  # numpy array: [1, 2, 3]
    all_seqs = vlarray.read()  # List of numpy arrays

    # Get sequence lengths
    lengths = [vlarray.get_row_size(i) for i in range(len(vlarray))]
    print(f"Sequence lengths: {lengths}")

    # Iterate over sequences
    for i, seq in enumerate(vlarray):
        print(f"Sequence {i}: {seq}")
```