Hierarchical datasets for Python: PyTables uses the HDF5 library to manage extremely large amounts of data.
PyTables' table system provides efficient storage and retrieval of structured, record-oriented data with column-oriented access, conditional querying, indexing capabilities, and in-place modification. Tables are ideal for datasets where each record has the same structure but individual fields need to be accessed independently.
Tables store structured data using column definitions that specify data types, shapes, and constraints for each field.
class Table:
    def __init__(self, parentnode, name, description, title="", filters=None,
                 expectedrows=10000, **kwargs):
        """Create a Table node (normally done through ``File.create_table``).

        Args:
            parentnode (Group): Group that will contain this table.
            name (str): Name of the table node.
            description (Description): Definition of the column layout.
            title (str): Human-readable title for the table.
            filters (Filters): Compression and filtering settings, if any.
            expectedrows (int): Row-count hint used to optimize storage.
        """

    @property
    def description(self):
        """Description object detailing the table's column structure."""

    @property
    def colnames(self):
        """Names of the table's columns, as a list."""

    @property
    def coldtypes(self):
        """Dict mapping each column name to its NumPy data type."""

    @property
    def colindexed(self):
        """Dict mapping each column name to whether it carries an index."""

# Comprehensive data reading with slicing, field selection, and conditional filtering.
class Table:
    def read(self, start=None, stop=None, step=None, field=None, out=None):
        """Fetch table rows as a structured array.

        Args:
            start (int): First row to read.
            stop (int): Row to stop before (exclusive).
            step (int): Stride between selected rows.
            field (str): Restrict the read to this single column.
            out (array): Optional pre-allocated destination array.

        Returns:
            ndarray: Structured array holding the requested data.
        """

    def read_where(self, condition, condvars=None, field=None, start=None,
                   stop=None, step=None, out=None):
        """Fetch only the rows for which ``condition`` holds.

        Args:
            condition (str): Conditional expression string.
            condvars (dict): Extra variables referenced by the condition.
            field (str): Restrict the read to this single column.
            start (int): First row of the search range.
            stop (int): End of the search range (exclusive).
            step (int): Stride within the search range.
            out (array): Optional pre-allocated destination array.

        Returns:
            ndarray: The rows that satisfy the condition.
        """

    def __getitem__(self, key):
        """NumPy-style indexing into the table.

        Args:
            key (int, slice, or tuple): Row selection specification.

        Returns:
            ndarray or scalar: The selected data.
        """

# Iterator-based access for memory-efficient processing of large datasets.
class Table:
    def where(self, condition, condvars=None, start=None, stop=None, step=None):
        """Iterate over the rows that satisfy ``condition``.

        Args:
            condition (str): Conditional expression string.
            condvars (dict): Extra variables used when evaluating the condition.
            start (int): First row of the search range.
            stop (int): End of the search range (exclusive).
            step (int): Stride within the search range.

        Yields:
            Row: Each row object matching the condition.
        """

    def iread(self, start=None, stop=None, step=None):
        """Iterate over table rows in the given range.

        NOTE(review): PyTables' public row iterator is named ``iterrows``;
        confirm that ``iread`` actually exists in the targeted release.

        Args:
            start (int): First row to yield.
            stop (int): Row to stop before (exclusive).
            step (int): Stride between rows.

        Yields:
            Row: Each row in the specified range.
        """

    def iterrows(self, start=None, stop=None, step=None):
        """Iterate over table rows in the given range (alias for ``iread``).

        Args:
            start (int): First row to yield.
            stop (int): Row to stop before (exclusive).
            step (int): Stride between rows.

        Yields:
            Row: Each row in the specified range.
        """

# In-place data modification including appending new records, modifying existing data, and row removal.
class Table:
    def append(self, rows):
        """Add new rows to the end of the table.

        Args:
            rows (array-like): Structured data to append.
        """

    def modify_column(self, start=None, stop=None, step=None, column=None,
                      value=None):
        """Overwrite values in one column.

        Args:
            start (int): First row to modify.
            stop (int): Row to stop before (exclusive).
            step (int): Stride between modified rows.
            column (str): Name of the column to modify.
            value (scalar or array): Replacement values.
        """

    def modify_columns(self, start=None, stop=None, step=None, columns=None):
        """Overwrite values in several columns at once.

        Args:
            start (int): First row to modify.
            stop (int): Row to stop before (exclusive).
            step (int): Stride between modified rows.
            columns (dict): Mapping of column names to replacement values.
        """

    def remove_rows(self, start, stop=None):
        """Delete rows from the table.

        Args:
            start (int): First row to remove.
            stop (int): Row to stop before (exclusive); None removes one row.
        """

    def truncate(self, size):
        """Shrink the table to the given number of rows.

        Args:
            size (int): New table length, in rows.
        """

# Individual column access through the cols attribute provides column-specific operations.
class Cols:
    def __getitem__(self, name):
        """Look up a column accessor by name.

        Args:
            name (str): Column name.

        Returns:
            Column: Accessor object for that column.
        """

    def __setitem__(self, name, value):
        """Replace an entire column's values.

        Args:
            name (str): Column name.
            value (array-like): New data for the column.
        """

    def _f_col(self, name):
        """Return the Column object for a specific column.

        Args:
            name (str): Column name.

        Returns:
            Column: Accessor object for that column.
        """
class Column:
    def __getitem__(self, key):
        """Read column values, with slicing support.

        Args:
            key (int, slice): Row selection.

        Returns:
            ndarray or scalar: The selected column values.
        """

    def __setitem__(self, key, value):
        """Write column values, with slicing support.

        Args:
            key (int, slice): Row selection.
            value (scalar or array): Replacement values.
        """

    def create_index(self, optlevel=6, kind="medium", filters=None,
                     tmp_dir=None):
        """Build an index on this column to speed up queries.

        Args:
            optlevel (int): Optimization level (0-9).
            kind (str): Index type ("ultralight", "light", "medium", "full").
            filters (Filters): Compression settings for the index.
            tmp_dir (str): Directory for temporary files during creation.
        """

    def remove_index(self):
        """Drop this column's index."""

    def reindex(self):
        """Rebuild this column's index from scratch."""

# Individual row manipulation through Row objects.
class Row:
    def __getitem__(self, name):
        """Read one field of the current row.

        Args:
            name (str): Field name.

        Returns:
            any: The field's value.
        """

    def __setitem__(self, name, value):
        """Write one field of the current row.

        Args:
            name (str): Field name.
            value (any): New value for the field.
        """

    def append(self):
        """Push this row's current values onto the end of the table."""

    def update(self):
        """Write this row's current values back into the table."""

    @property
    def table(self):
        """Reference to the parent table."""
"""Reference to the parent table."""class Table:
@property
def cols(self):
"""Cols accessor for column-oriented operations."""
@property
def row(self):
"""Row accessor for record-oriented operations."""
@property
def nrows(self):
"""Number of rows in the table."""
@property
def shape(self):
"""Shape of the table as (nrows,)."""
@property
def size_in_memory(self):
"""Estimated memory usage of table data."""
@property
def size_on_disk(self):
"""Actual disk space used by the table."""import tables as tb
import numpy as np
# Define table structure
class Particle(tb.IsDescription):
name = tb.StringCol(16) # 16-character string
idnumber = tb.Int64Col() # Signed 64-bit integer
ADCcount = tb.UInt16Col() # Unsigned 16-bit integer
TDCcount = tb.UInt8Col() # Unsigned 8-bit integer
energy = tb.Float32Col() # 32-bit float
timestamp = tb.Time64Col() # Timestamp
# Create file and table
with tb.open_file("particles.h5", "w") as h5file:
table = h5file.create_table("/", "detector", Particle, "Particle Data")
# Append data using Row interface
particle = table.row
for i in range(1000):
particle['name'] = f'Particle_{i:04d}'
particle['idnumber'] = i
particle['ADCcount'] = np.random.randint(0, 65536)
particle['TDCcount'] = np.random.randint(0, 256)
particle['energy'] = np.random.exponential(10.0)
particle['timestamp'] = i * 0.1
particle.append()
table.flush()
# Query high-energy particles
high_energy = [row for row in table.where('energy > 20.0')]
print(f"Found {len(high_energy)} high-energy particles")
# Read specific columns
energies = table.read(field='energy')
timestamps = table.read(field='timestamp')
# Column-based operations
table.cols.energy[0:10] = np.random.random(10) * 100
# Create index for faster queries
table.cols.energy.create_index()# Complex conditional queries
with tb.open_file("particles.h5", "r") as h5file:
    table = h5file.root.detector

    # Combine several predicates in a single in-kernel query
    results = table.read_where('(energy > 15.0) & (TDCcount < 100)')

    # Supply external variables to the expression through condvars
    min_energy = 10.0
    max_time = 50.0
    condition = '(energy > min_energy) & (timestamp < max_time)'
    results = table.read_where(
        condition, {'min_energy': min_energy, 'max_time': max_time})

    # Stream matching rows one at a time to keep memory use bounded
    for row in table.where('energy > 30.0'):
        print(f"High energy particle: {row['name']}, energy: {row['energy']}")

# Install with Tessl CLI
npx tessl i tessl/pypi-tables