PyTables: hierarchical datasets for Python, built on the HDF5 library for managing extremely large amounts of data.
PyTables provides several array classes optimized for different use cases with homogeneous data storage. These include standard arrays for fixed-size datasets, chunked arrays for large data with compression, enlargeable arrays for growing datasets, and variable-length arrays for irregular data structures.
Fixed-size arrays for storing homogeneous data with direct NumPy integration and memory-mapped access.
class Array:
    """Fixed-size homogeneous array with direct NumPy-style access."""

    def __init__(self, parentnode, name, obj=None, title="", byteorder=None, **kwargs):
        """
        Create a new Array (normally invoked through File.create_array).

        Parameters:
        - parentnode (Group): Group that will contain this array
        - name (str): Name of the array node
        - obj (array-like): Initial data or shape specification
        - title (str): Human-readable description
        - byteorder (str): One of 'little', 'big', or 'native'
        """

    def read(self, start=None, stop=None, step=None, out=None):
        """
        Read array data, optionally restricted to a slice.

        Parameters:
        - start (int or tuple): Starting index for each dimension
        - stop (int or tuple): Stopping index for each dimension
        - step (int or tuple): Stride for each dimension
        - out (ndarray): Optional pre-allocated destination buffer

        Returns:
        ndarray: The data selected by the given slice.
        """

    def __getitem__(self, key):
        """
        NumPy-style indexing and slicing.

        Parameters:
        - key (int, slice, tuple): Selection to read

        Returns:
        ndarray or scalar: The selected data.
        """

    def __setitem__(self, key, value):
        """
        NumPy-style assignment through indexing and slicing.

        Parameters:
        - key (int, slice, tuple): Selection to overwrite
        - value (scalar or array-like): Replacement data
        """

    def iterrows(self, start=None, stop=None, step=None):
        """
        Iterate over rows along the first dimension.

        Parameters:
        - start (int): First row index
        - stop (int): Stopping row index
        - step (int): Row stride

        Yields:
        ndarray: One row at a time, as a 1D array.
        """


# Arrays stored in chunks for efficient compression and partial I/O
# operations on large datasets.
class CArray:
    """Chunked array: data stored in fixed-size chunks for compression and partial I/O."""

    def __init__(self, parentnode, name, atom, shape, title="", filters=None, chunkshape=None, byteorder=None, **kwargs):
        """
        Create a chunked array (normally invoked through File.create_carray).

        Parameters:
        - parentnode (Group): Group that will contain this array
        - name (str): Name of the array node
        - atom (Atom): Element data-type specification
        - shape (tuple): Full array dimensions
        - title (str): Human-readable description
        - filters (Filters): Compression options
        - chunkshape (tuple): Chunk dimensions used for storage layout
        - byteorder (str): Byte-order specification
        """

    def read(self, start=None, stop=None, step=None, out=None):
        """
        Read data, using chunk-aware access under the hood.

        Parameters:
        - start (int or tuple): Starting indices
        - stop (int or tuple): Stopping indices
        - step (int or tuple): Strides
        - out (ndarray): Optional pre-allocated destination buffer

        Returns:
        ndarray: The requested region.
        """

    def __getitem__(self, key):
        """Chunk-optimized array indexing."""

    def __setitem__(self, key, value):
        """Chunk-optimized array assignment."""


# Arrays that can grow along one dimension, ideal for streaming data
# or incremental data collection.
class EArray:
    """Enlargeable array: grows along its first dimension via append()."""

    def __init__(self, parentnode, name, atom, shape, title="", filters=None, expectedrows=1000, chunkshape=None, byteorder=None, **kwargs):
        """
        Create an enlargeable array (normally invoked through File.create_earray).

        Parameters:
        - parentnode (Group): Group that will contain this array
        - name (str): Name of the array node
        - atom (Atom): Element data-type specification
        - shape (tuple): Initial shape (first dimension may be 0 for empty)
        - title (str): Human-readable description
        - filters (Filters): Compression options
        - expectedrows (int): Expected final size, used for storage tuning
        - chunkshape (tuple): Chunk dimensions
        - byteorder (str): Byte-order specification
        """

    def append(self, sequence):
        """
        Append data along the enlargeable (first) dimension.

        Parameters:
        - sequence (array-like): Rows to append
        """

    def read(self, start=None, stop=None, step=None, out=None):
        """
        Read data, including rows added through append().

        Parameters:
        - start (int or tuple): Starting indices
        - stop (int or tuple): Stopping indices
        - step (int or tuple): Strides
        - out (ndarray): Optional pre-allocated destination buffer

        Returns:
        ndarray: The requested data.
        """

    def truncate(self, size):
        """
        Shrink the array along the enlargeable dimension.

        Parameters:
        - size (int): New length of the first dimension
        """


# Arrays where each row can have different lengths, suitable for
# irregular data structures.
class VLArray:
    """Variable-length array: each row may hold a different number of elements."""

    def __init__(self, parentnode, name, atom, title="", filters=None, expectedrows=1000, chunkshape=None, byteorder=None, **kwargs):
        """
        Create a variable-length array (normally invoked through File.create_vlarray).

        Parameters:
        - parentnode (Group): Group that will contain this array
        - name (str): Name of the array node
        - atom (Atom): Data type of the individual elements
        - title (str): Human-readable description
        - filters (Filters): Compression options
        - expectedrows (int): Expected number of rows, used for tuning
        - chunkshape (int): Rows per chunk
        - byteorder (str): Byte-order specification
        """

    def append(self, sequence):
        """
        Append one new row of arbitrary length.

        Parameters:
        - sequence (array-like): Data for the new row (any length)
        """

    def read(self, start=None, stop=None, step=None):
        """
        Read a range of variable-length rows.

        Parameters:
        - start (int): First row index
        - stop (int): Stopping row index
        - step (int): Row stride

        Returns:
        list: One array per row.
        """

    def __getitem__(self, key):
        """
        Access a single row or a slice of rows.

        Parameters:
        - key (int or slice): Row selection

        Returns:
        ndarray or list: A single row array, or a list of row arrays.
        """

    def __setitem__(self, key, value):
        """
        Replace a single row.

        Parameters:
        - key (int): Row index
        - value (array-like): New row data
        """

    def get_row_size(self, row):
        """
        Report the length of one row.

        Parameters:
        - row (int): Row index

        Returns:
        int: Number of elements stored in that row.
        """

    def iterrows(self, start=None, stop=None, step=None):
        """
        Iterate over variable-length rows.

        Parameters:
        - start (int): First row index
        - stop (int): Stopping row index
        - step (int): Row stride

        Yields:
        ndarray: Each row as a 1D array.
        """


# Properties available on all array types
class ArrayBase:
    """Read-only properties shared by every array type."""

    @property
    def shape(self):
        """Tuple describing the array dimensions."""

    @property
    def size(self):
        """Total number of elements in the array."""

    @property
    def ndim(self):
        """Number of array dimensions."""

    @property
    def dtype(self):
        """NumPy data type of the array elements."""

    @property
    def atom(self):
        """Atom object describing the element type."""

    @property
    def size_in_memory(self):
        """Estimated memory usage of the array data."""

    @property
    def size_on_disk(self):
        """Actual disk space consumed by the array."""

    @property
    def chunkshape(self):
        """Chunk dimensions (for chunked array types)."""

    @property
    def filters(self):
        """Compression filters applied to the array."""


# The usage examples below rely on PyTables and NumPy.
import tables as tb
import tables as tb
import numpy as np

# Example: creating and manipulating fixed-size arrays.
with tb.open_file("arrays.h5", "w") as h5file:
    # Build an array node from existing in-memory data
    data_2d = np.random.random((100, 50))
    array_2d = h5file.create_array("/", "data_2d", data_2d, "2D Random Data")

    # Build an array node pre-filled with zeros
    empty_array = h5file.create_array("/", "empty", np.zeros((10, 20)), "Empty Array")

    # Read data back with NumPy-style indexing
    subset = array_2d[10:20, 5:15]   # rectangular slice
    single_value = array_2d[0, 0]    # single element

    # Overwrite a region in place
    array_2d[0:5, 0:5] = np.ones((5, 5))

# (imports for the next example)
import tables as tb
import tables as tb
import numpy as np

# Example: large chunked array with compression.
with tb.open_file("large_data.h5", "w") as h5file:
    # Chunked array compressed with blosc
    filters = tb.Filters(complevel=6, complib='blosc')
    large_array = h5file.create_carray("/", "large_data",
                                       tb.Float64Atom(),
                                       shape=(10000, 10000),
                                       filters=filters,
                                       chunkshape=(100, 100))

    # Fill in chunk-sized pieces so memory use stays bounded
    for row in range(0, 10000, 100):
        for col in range(0, 10000, 100):
            large_array[row:row + 100, col:col + 100] = np.random.random((100, 100))

    # A partial read only touches the chunks it needs
    corner = large_array[0:500, 0:500]

# (imports for the next example)
import tables as tb
import tables as tb
import numpy as np

# Example: enlargeable array for streaming/incremental data.
with tb.open_file("streaming.h5", "w") as h5file:
    # Start with zero rows; the array grows along axis 0
    earray = h5file.create_earray("/", "stream_data",
                                  tb.Float32Atom(),
                                  shape=(0, 10),  # 0 rows initially, 10 columns
                                  expectedrows=100000)

    # Simulate batches of streaming data arriving
    for _ in range(100):
        # Each incoming batch has a different number of rows
        batch_size = np.random.randint(50, 200)
        earray.append(np.random.random((batch_size, 10)))

    print(f"Final array shape: {earray.shape}")

    # Read the most recent rows
    recent_data = earray[-1000:]  # Last 1000 rows

# (imports for the next example)
import tables as tb
import tables as tb
import numpy as np

# Example: variable-length array for irregular sequences.
with tb.open_file("irregular.h5", "w") as h5file:
    # Each row of a VLArray may hold a different number of elements
    vlarray = h5file.create_vlarray("/", "sequences",
                                    tb.Int32Atom(),
                                    "Variable Length Sequences")

    # Store sequences of assorted lengths
    for seq in ([1, 2, 3],
                [10, 20, 30, 40, 50],
                [100],
                [7, 8, 9, 10, 11, 12, 13, 14, 15]):
        vlarray.append(seq)

    # Random access and bulk reads
    first_seq = vlarray[0]      # numpy array: [1, 2, 3]
    all_seqs = vlarray.read()   # List of numpy arrays

    # Query per-row lengths
    lengths = [vlarray.get_row_size(i) for i in range(len(vlarray))]
    print(f"Sequence lengths: {lengths}")

    # Iterate row by row
    for i, seq in enumerate(vlarray):
        print(f"Sequence {i}: {seq}")

# Install with Tessl CLI
npx tessl i tessl/pypi-tables