PyTables: hierarchical datasets for Python, built on the HDF5 library for managing extremely large amounts of data.
PyTables provides several array classes optimized for different use cases with homogeneous data storage. These include standard arrays for fixed-size datasets, chunked arrays for large data with compression, enlargeable arrays for growing datasets, and variable-length arrays for irregular data structures.
Fixed-size arrays for storing homogeneous data with direct NumPy integration and memory-mapped access.
class Array:
    """Fixed-size homogeneous array with direct NumPy-style access."""

    def __init__(self, parentnode, name, obj=None, title="", byteorder=None, **kwargs):
        """
        Create a new Array (normally invoked through File.create_array).

        Parameters:
        - parentnode (Group): Group that will contain this array
        - name (str): Name of the array node
        - obj (array-like): Initial data or shape specification
        - title (str): Human-readable description
        - byteorder (str): One of 'little', 'big', or 'native'
        """

    def read(self, start=None, stop=None, step=None, out=None):
        """
        Read array data, optionally restricted to a slice.

        Parameters:
        - start (int or tuple): Starting index for each dimension
        - stop (int or tuple): Stopping index for each dimension
        - step (int or tuple): Stride for each dimension
        - out (ndarray): Optional pre-allocated destination buffer

        Returns:
        ndarray: The data selected by the given slice.
        """

    def __getitem__(self, key):
        """
        NumPy-style indexing and slicing.

        Parameters:
        - key (int, slice, tuple): Selection to read

        Returns:
        ndarray or scalar: The selected data.
        """

    def __setitem__(self, key, value):
        """
        NumPy-style assignment through indexing and slicing.

        Parameters:
        - key (int, slice, tuple): Selection to overwrite
        - value (scalar or array-like): Replacement data
        """

    def iterrows(self, start=None, stop=None, step=None):
        """
        Iterate over rows along the first dimension.

        Parameters:
        - start (int): First row index
        - stop (int): Stopping row index
        - step (int): Row stride

        Yields:
        ndarray: One row at a time, as a 1D array.
        """


# Arrays stored in chunks for efficient compression and partial I/O
# operations on large datasets.
class CArray:
    """Chunked array: data stored in fixed-size chunks for compression and partial I/O."""

    def __init__(self, parentnode, name, atom, shape, title="", filters=None, chunkshape=None, byteorder=None, **kwargs):
        """
        Create a chunked array (normally invoked through File.create_carray).

        Parameters:
        - parentnode (Group): Group that will contain this array
        - name (str): Name of the array node
        - atom (Atom): Element data-type specification
        - shape (tuple): Full array dimensions
        - title (str): Human-readable description
        - filters (Filters): Compression options
        - chunkshape (tuple): Chunk dimensions used for storage layout
        - byteorder (str): Byte-order specification
        """

    def read(self, start=None, stop=None, step=None, out=None):
        """
        Read data, using chunk-aware access under the hood.

        Parameters:
        - start (int or tuple): Starting indices
        - stop (int or tuple): Stopping indices
        - step (int or tuple): Strides
        - out (ndarray): Optional pre-allocated destination buffer

        Returns:
        ndarray: The requested region.
        """

    def __getitem__(self, key):
        """Chunk-optimized array indexing."""

    def __setitem__(self, key, value):
        """Chunk-optimized array assignment."""


# Arrays that can grow along one dimension, ideal for streaming data
# or incremental data collection.
class EArray:
    """Enlargeable array: grows along its first dimension via append()."""

    def __init__(self, parentnode, name, atom, shape, title="", filters=None, expectedrows=1000, chunkshape=None, byteorder=None, **kwargs):
        """
        Create an enlargeable array (normally invoked through File.create_earray).

        Parameters:
        - parentnode (Group): Group that will contain this array
        - name (str): Name of the array node
        - atom (Atom): Element data-type specification
        - shape (tuple): Initial shape (first dimension may be 0 for empty)
        - title (str): Human-readable description
        - filters (Filters): Compression options
        - expectedrows (int): Expected final size, used for storage tuning
        - chunkshape (tuple): Chunk dimensions
        - byteorder (str): Byte-order specification
        """

    def append(self, sequence):
        """
        Append data along the enlargeable (first) dimension.

        Parameters:
        - sequence (array-like): Rows to append
        """

    def read(self, start=None, stop=None, step=None, out=None):
        """
        Read data, including rows added through append().

        Parameters:
        - start (int or tuple): Starting indices
        - stop (int or tuple): Stopping indices
        - step (int or tuple): Strides
        - out (ndarray): Optional pre-allocated destination buffer

        Returns:
        ndarray: The requested data.
        """

    def truncate(self, size):
        """
        Shrink the array along the enlargeable dimension.

        Parameters:
        - size (int): New length of the first dimension
        """


# Arrays where each row can have different lengths, suitable for
# irregular data structures.
class VLArray:
    """Variable-length array: each row may hold a different number of elements."""

    def __init__(self, parentnode, name, atom, title="", filters=None, expectedrows=1000, chunkshape=None, byteorder=None, **kwargs):
        """
        Create a variable-length array (normally invoked through File.create_vlarray).

        Parameters:
        - parentnode (Group): Group that will contain this array
        - name (str): Name of the array node
        - atom (Atom): Data type of the individual elements
        - title (str): Human-readable description
        - filters (Filters): Compression options
        - expectedrows (int): Expected number of rows, used for tuning
        - chunkshape (int): Rows per chunk
        - byteorder (str): Byte-order specification
        """

    def append(self, sequence):
        """
        Append one new row of arbitrary length.

        Parameters:
        - sequence (array-like): Data for the new row (any length)
        """

    def read(self, start=None, stop=None, step=None):
        """
        Read a range of variable-length rows.

        Parameters:
        - start (int): First row index
        - stop (int): Stopping row index
        - step (int): Row stride

        Returns:
        list: One array per row.
        """

    def __getitem__(self, key):
        """
        Access a single row or a slice of rows.

        Parameters:
        - key (int or slice): Row selection

        Returns:
        ndarray or list: A single row array, or a list of row arrays.
        """

    def __setitem__(self, key, value):
        """
        Replace a single row.

        Parameters:
        - key (int): Row index
        - value (array-like): New row data
        """

    def get_row_size(self, row):
        """
        Report the length of one row.

        Parameters:
        - row (int): Row index

        Returns:
        int: Number of elements stored in that row.
        """

    def iterrows(self, start=None, stop=None, step=None):
        """
        Iterate over variable-length rows.

        Parameters:
        - start (int): First row index
        - stop (int): Stopping row index
        - step (int): Row stride

        Yields:
        ndarray: Each row as a 1D array.
        """


# Properties available on all array types
class ArrayBase:
    """Read-only properties shared by every array type."""

    @property
    def shape(self):
        """Tuple describing the array dimensions."""

    @property
    def size(self):
        """Total number of elements in the array."""

    @property
    def ndim(self):
        """Number of array dimensions."""

    @property
    def dtype(self):
        """NumPy data type of the array elements."""

    @property
    def atom(self):
        """Atom object describing the element type."""

    @property
    def size_in_memory(self):
        """Estimated memory usage of the array data."""

    @property
    def size_on_disk(self):
        """Actual disk space consumed by the array."""

    @property
    def chunkshape(self):
        """Chunk dimensions (for chunked array types)."""

    @property
    def filters(self):
        """Compression filters applied to the array."""


# The usage examples below rely on PyTables and NumPy.
import tables as tb
import tables as tb
import numpy as np

# Example: creating and manipulating fixed-size arrays.
with tb.open_file("arrays.h5", "w") as h5file:
    # Build an array node from existing in-memory data
    data_2d = np.random.random((100, 50))
    array_2d = h5file.create_array("/", "data_2d", data_2d, "2D Random Data")

    # Build an array node pre-filled with zeros
    empty_array = h5file.create_array("/", "empty", np.zeros((10, 20)), "Empty Array")

    # Read data back with NumPy-style indexing
    subset = array_2d[10:20, 5:15]   # rectangular slice
    single_value = array_2d[0, 0]    # single element

    # Overwrite a region in place
    array_2d[0:5, 0:5] = np.ones((5, 5))

# (imports for the next example)
import tables as tb
import tables as tb
import numpy as np

# Example: large chunked array with compression.
with tb.open_file("large_data.h5", "w") as h5file:
    # Chunked array compressed with blosc
    filters = tb.Filters(complevel=6, complib='blosc')
    large_array = h5file.create_carray("/", "large_data",
                                       tb.Float64Atom(),
                                       shape=(10000, 10000),
                                       filters=filters,
                                       chunkshape=(100, 100))

    # Fill in chunk-sized pieces so memory use stays bounded
    for row in range(0, 10000, 100):
        for col in range(0, 10000, 100):
            large_array[row:row + 100, col:col + 100] = np.random.random((100, 100))

    # A partial read only touches the chunks it needs
    corner = large_array[0:500, 0:500]

# (imports for the next example)
import tables as tb
import tables as tb
import numpy as np

# Example: enlargeable array for streaming/incremental data.
with tb.open_file("streaming.h5", "w") as h5file:
    # Start with zero rows; the array grows along axis 0
    earray = h5file.create_earray("/", "stream_data",
                                  tb.Float32Atom(),
                                  shape=(0, 10),  # 0 rows initially, 10 columns
                                  expectedrows=100000)

    # Simulate batches of streaming data arriving
    for _ in range(100):
        # Each incoming batch has a different number of rows
        batch_size = np.random.randint(50, 200)
        earray.append(np.random.random((batch_size, 10)))

    print(f"Final array shape: {earray.shape}")

    # Read the most recent rows
    recent_data = earray[-1000:]  # Last 1000 rows

# (imports for the next example)
import tables as tb
import tables as tb
import numpy as np

# Example: variable-length array for irregular sequences.
with tb.open_file("irregular.h5", "w") as h5file:
    # Each row of a VLArray may hold a different number of elements
    vlarray = h5file.create_vlarray("/", "sequences",
                                    tb.Int32Atom(),
                                    "Variable Length Sequences")

    # Store sequences of assorted lengths
    for seq in ([1, 2, 3],
                [10, 20, 30, 40, 50],
                [100],
                [7, 8, 9, 10, 11, 12, 13, 14, 15]):
        vlarray.append(seq)

    # Random access and bulk reads
    first_seq = vlarray[0]      # numpy array: [1, 2, 3]
    all_seqs = vlarray.read()   # List of numpy arrays

    # Query per-row lengths
    lengths = [vlarray.get_row_size(i) for i in range(len(vlarray))]
    print(f"Sequence lengths: {lengths}")

    # Iterate row by row
    for i, seq in enumerate(vlarray):
        print(f"Sequence {i}: {seq}")

# Install with Tessl CLI
npx tessl i tessl/pypi-tables