Hierarchical datasets for Python: PyTables uses the HDF5 library to manage extremely large amounts of data.
PyTables' table system provides efficient storage and retrieval of structured, record-oriented data with column-oriented access, conditional querying, indexing capabilities, and in-place modification. Tables are ideal for datasets where each record has the same structure but individual fields need to be accessed independently.
Tables store structured data using column definitions that specify data types, shapes, and constraints for each field.
class Table:
    def __init__(self, parentnode, name, description, title="", filters=None,
                 expectedrows=10000, **kwargs):
        """Create a Table node (normally done through ``File.create_table``).

        Args:
            parentnode (Group): Group that will contain this table.
            name (str): Name of the table node.
            description (Description): Definition of the column layout.
            title (str): Human-readable title for the table.
            filters (Filters): Compression and filtering settings, if any.
            expectedrows (int): Row-count hint used to optimize storage.
        """

    @property
    def description(self):
        """Description object detailing the table's column structure."""

    @property
    def colnames(self):
        """Names of the table's columns, as a list."""

    @property
    def coldtypes(self):
        """Dict mapping each column name to its NumPy data type."""

    @property
    def colindexed(self):
        """Dict mapping each column name to whether it carries an index."""

# Comprehensive data reading with slicing, field selection, and conditional filtering.
class Table:
    def read(self, start=None, stop=None, step=None, field=None, out=None):
        """Fetch table rows as a structured array.

        Args:
            start (int): First row to read.
            stop (int): Row to stop before (exclusive).
            step (int): Stride between selected rows.
            field (str): Restrict the read to this single column.
            out (array): Optional pre-allocated destination array.

        Returns:
            ndarray: Structured array holding the requested data.
        """

    def read_where(self, condition, condvars=None, field=None, start=None,
                   stop=None, step=None, out=None):
        """Fetch only the rows for which ``condition`` holds.

        Args:
            condition (str): Conditional expression string.
            condvars (dict): Extra variables referenced by the condition.
            field (str): Restrict the read to this single column.
            start (int): First row of the search range.
            stop (int): End of the search range (exclusive).
            step (int): Stride within the search range.
            out (array): Optional pre-allocated destination array.

        Returns:
            ndarray: The rows that satisfy the condition.
        """

    def __getitem__(self, key):
        """NumPy-style indexing into the table.

        Args:
            key (int, slice, or tuple): Row selection specification.

        Returns:
            ndarray or scalar: The selected data.
        """

# Iterator-based access for memory-efficient processing of large datasets.
class Table:
    def where(self, condition, condvars=None, start=None, stop=None, step=None):
        """Iterate over the rows that satisfy ``condition``.

        Args:
            condition (str): Conditional expression string.
            condvars (dict): Extra variables used when evaluating the condition.
            start (int): First row of the search range.
            stop (int): End of the search range (exclusive).
            step (int): Stride within the search range.

        Yields:
            Row: Each row object matching the condition.
        """

    def iread(self, start=None, stop=None, step=None):
        """Iterate over table rows in the given range.

        NOTE(review): PyTables' public row iterator is named ``iterrows``;
        confirm that ``iread`` actually exists in the targeted release.

        Args:
            start (int): First row to yield.
            stop (int): Row to stop before (exclusive).
            step (int): Stride between rows.

        Yields:
            Row: Each row in the specified range.
        """

    def iterrows(self, start=None, stop=None, step=None):
        """Iterate over table rows in the given range (alias for ``iread``).

        Args:
            start (int): First row to yield.
            stop (int): Row to stop before (exclusive).
            step (int): Stride between rows.

        Yields:
            Row: Each row in the specified range.
        """

# In-place data modification including appending new records, modifying existing data, and row removal.
class Table:
    def append(self, rows):
        """Add new rows to the end of the table.

        Args:
            rows (array-like): Structured data to append.
        """

    def modify_column(self, start=None, stop=None, step=None, column=None,
                      value=None):
        """Overwrite values in one column.

        Args:
            start (int): First row to modify.
            stop (int): Row to stop before (exclusive).
            step (int): Stride between modified rows.
            column (str): Name of the column to modify.
            value (scalar or array): Replacement values.
        """

    def modify_columns(self, start=None, stop=None, step=None, columns=None):
        """Overwrite values in several columns at once.

        Args:
            start (int): First row to modify.
            stop (int): Row to stop before (exclusive).
            step (int): Stride between modified rows.
            columns (dict): Mapping of column names to replacement values.
        """

    def remove_rows(self, start, stop=None):
        """Delete rows from the table.

        Args:
            start (int): First row to remove.
            stop (int): Row to stop before (exclusive); None removes one row.
        """

    def truncate(self, size):
        """Shrink the table to the given number of rows.

        Args:
            size (int): New table length, in rows.
        """

# Individual column access through the cols attribute provides column-specific operations.
class Cols:
    def __getitem__(self, name):
        """Look up a column accessor by name.

        Args:
            name (str): Column name.

        Returns:
            Column: Accessor object for that column.
        """

    def __setitem__(self, name, value):
        """Replace an entire column's values.

        Args:
            name (str): Column name.
            value (array-like): New data for the column.
        """

    def _f_col(self, name):
        """Return the Column object for a specific column.

        Args:
            name (str): Column name.

        Returns:
            Column: Accessor object for that column.
        """
class Column:
    def __getitem__(self, key):
        """Read column values, with slicing support.

        Args:
            key (int, slice): Row selection.

        Returns:
            ndarray or scalar: The selected column values.
        """

    def __setitem__(self, key, value):
        """Write column values, with slicing support.

        Args:
            key (int, slice): Row selection.
            value (scalar or array): Replacement values.
        """

    def create_index(self, optlevel=6, kind="medium", filters=None,
                     tmp_dir=None):
        """Build an index on this column to speed up queries.

        Args:
            optlevel (int): Optimization level (0-9).
            kind (str): Index type ("ultralight", "light", "medium", "full").
            filters (Filters): Compression settings for the index.
            tmp_dir (str): Directory for temporary files during creation.
        """

    def remove_index(self):
        """Drop this column's index."""

    def reindex(self):
        """Rebuild this column's index from scratch."""

# Individual row manipulation through Row objects.
class Row:
    def __getitem__(self, name):
        """Read one field of the current row.

        Args:
            name (str): Field name.

        Returns:
            any: The field's value.
        """

    def __setitem__(self, name, value):
        """Write one field of the current row.

        Args:
            name (str): Field name.
            value (any): New value for the field.
        """

    def append(self):
        """Push this row's current values onto the end of the table."""

    def update(self):
        """Write this row's current values back into the table."""

    @property
    def table(self):
        """Reference to the parent table."""
"""Reference to the parent table."""class Table:
@property
def cols(self):
"""Cols accessor for column-oriented operations."""
@property
def row(self):
"""Row accessor for record-oriented operations."""
@property
def nrows(self):
"""Number of rows in the table."""
@property
def shape(self):
"""Shape of the table as (nrows,)."""
@property
def size_in_memory(self):
"""Estimated memory usage of table data."""
@property
def size_on_disk(self):
"""Actual disk space used by the table."""import tables as tb
import numpy as np
# Define table structure
class Particle(tb.IsDescription):
name = tb.StringCol(16) # 16-character string
idnumber = tb.Int64Col() # Signed 64-bit integer
ADCcount = tb.UInt16Col() # Unsigned 16-bit integer
TDCcount = tb.UInt8Col() # Unsigned 8-bit integer
energy = tb.Float32Col() # 32-bit float
timestamp = tb.Time64Col() # Timestamp
# Create file and table
with tb.open_file("particles.h5", "w") as h5file:
table = h5file.create_table("/", "detector", Particle, "Particle Data")
# Append data using Row interface
particle = table.row
for i in range(1000):
particle['name'] = f'Particle_{i:04d}'
particle['idnumber'] = i
particle['ADCcount'] = np.random.randint(0, 65536)
particle['TDCcount'] = np.random.randint(0, 256)
particle['energy'] = np.random.exponential(10.0)
particle['timestamp'] = i * 0.1
particle.append()
table.flush()
# Query high-energy particles
high_energy = [row for row in table.where('energy > 20.0')]
print(f"Found {len(high_energy)} high-energy particles")
# Read specific columns
energies = table.read(field='energy')
timestamps = table.read(field='timestamp')
# Column-based operations
table.cols.energy[0:10] = np.random.random(10) * 100
# Create index for faster queries
table.cols.energy.create_index()# Complex conditional queries
with tb.open_file("particles.h5", "r") as h5file:
    table = h5file.root.detector

    # Combine several predicates in a single in-kernel query
    results = table.read_where('(energy > 15.0) & (TDCcount < 100)')

    # Supply external variables to the expression through condvars
    min_energy = 10.0
    max_time = 50.0
    condition = '(energy > min_energy) & (timestamp < max_time)'
    results = table.read_where(
        condition, {'min_energy': min_energy, 'max_time': max_time})

    # Stream matching rows one at a time to keep memory use bounded
    for row in table.where('energy > 30.0'):
        print(f"High energy particle: {row['name']}, energy: {row['energy']}")

# Install with Tessl CLI
npx tessl i tessl/pypi-tables