PyTables: hierarchical datasets for Python, built on the HDF5 library, for managing extremely large amounts of data.
PyTables provides a comprehensive type system with Atom types for defining individual data elements and Column types for table structure definitions. This system supports all NumPy data types plus specialized types for time, strings, and complex data structures, enabling precise control over data storage and memory usage.
Base classes for defining table structures with strongly-typed column definitions.
class IsDescription:
    """Base class for user-defined table descriptions.

    Inherit from this class and declare column objects as class
    attributes to define a table structure.
    """
class Description:
"""
Runtime table description created from dictionaries or existing tables.
"""
def __init__(self, description, validate=True):
"""
Create description from dictionary or class.
Parameters:
- description (dict or class): Column definitions
- validate (bool): Validate column definitions
"""
@classmethod
def from_dtype(cls, dtype, ptparams=None):
"""
Create description from NumPy dtype.
Parameters:
- dtype (numpy.dtype): NumPy structured dtype
- ptparams (dict): PyTables-specific parameters
Returns:
Description: Table description object
"""Atom types define the data type and storage characteristics for individual elements.
class Atom:
    """Base class for all atom types."""

    def __init__(self, type, shape=(), dflt=None):
        """Base atom constructor.

        Parameters:
        - type (str): Type identifier
        - shape (tuple): Element shape for multidimensional atoms
        - dflt (any): Default value
        """

    @property
    def type(self):
        """String identifier for the atom type."""

    @property
    def shape(self):
        """Shape tuple for multidimensional atoms."""

    @property
    def size(self):
        """Size in bytes of a single element."""
# String Atoms
class StringAtom(Atom):
    """Fixed-length string atom."""

    def __init__(self, itemsize, shape=(), dflt=b''):
        """
        Parameters:
        - itemsize (int): Maximum string length in bytes
        - shape (tuple): Shape for arrays of strings
        - dflt (bytes): Default value
        """


class VLStringAtom(Atom):
    """Variable-length string atom (raw bytes)."""

    def __init__(self, dflt=b''):
        """
        Parameters:
        - dflt (bytes): Default value
        """


class VLUnicodeAtom(Atom):
    """Variable-length Unicode string atom."""

    def __init__(self, dflt=''):
        """
        Parameters:
        - dflt (str): Default value
        """
# Boolean Atoms
class BoolAtom(Atom):
    """Boolean atom (True/False)."""

    def __init__(self, shape=(), dflt=False):
        """
        Parameters:
        - shape (tuple): Shape for arrays of booleans
        - dflt (bool): Default value
        """
# Integer Atoms
class IntAtom(Atom):
    """Generic signed integer atom (platform-dependent size)."""

    def __init__(self, shape=(), dflt=0):
        """
        Parameters:
        - shape (tuple): Shape for arrays of integers
        - dflt (int): Default value
        """


class UIntAtom(Atom):
    """Generic unsigned integer atom (platform-dependent size)."""

    def __init__(self, shape=(), dflt=0): ...


class Int8Atom(Atom):
    """8-bit signed integer atom (-128 to 127)."""

    def __init__(self, shape=(), dflt=0): ...


class UInt8Atom(Atom):
    """8-bit unsigned integer atom (0 to 255)."""

    def __init__(self, shape=(), dflt=0): ...


class Int16Atom(Atom):
    """16-bit signed integer atom (-32768 to 32767)."""

    def __init__(self, shape=(), dflt=0): ...


class UInt16Atom(Atom):
    """16-bit unsigned integer atom (0 to 65535)."""

    def __init__(self, shape=(), dflt=0): ...


class Int32Atom(Atom):
    """32-bit signed integer atom."""

    def __init__(self, shape=(), dflt=0): ...


class UInt32Atom(Atom):
    """32-bit unsigned integer atom."""

    def __init__(self, shape=(), dflt=0): ...


class Int64Atom(Atom):
    """64-bit signed integer atom."""

    def __init__(self, shape=(), dflt=0): ...


class UInt64Atom(Atom):
    """64-bit unsigned integer atom."""

    def __init__(self, shape=(), dflt=0): ...
# Floating Point Atoms
class FloatAtom(Atom):
    """Generic floating point atom (platform-dependent precision)."""

    def __init__(self, shape=(), dflt=0.0): ...


class Float32Atom(Atom):
    """32-bit floating point atom (IEEE 754 single precision)."""

    def __init__(self, shape=(), dflt=0.0): ...


class Float64Atom(Atom):
    """64-bit floating point atom (IEEE 754 double precision)."""

    def __init__(self, shape=(), dflt=0.0): ...


class Float16Atom(Atom):
    """16-bit floating point atom (IEEE 754 half precision)."""

    def __init__(self, shape=(), dflt=0.0):
        """
        Note: Available when NumPy supports float16 type
        """


class Float96Atom(Atom):
    """96-bit extended precision floating point atom."""

    def __init__(self, shape=(), dflt=0.0):
        """
        Note: Platform-dependent availability
        """


class Float128Atom(Atom):
    """128-bit quadruple precision floating point atom."""

    def __init__(self, shape=(), dflt=0.0):
        """
        Note: Platform-dependent availability
        """
# Complex Number Atoms
class ComplexAtom(Atom):
    """Generic complex number atom (platform-dependent precision)."""

    def __init__(self, shape=(), dflt=0.0+0j): ...


class Complex32Atom(Atom):
    """32-bit complex atom (two 16-bit floats)."""

    def __init__(self, shape=(), dflt=0.0+0j): ...


class Complex64Atom(Atom):
    """64-bit complex atom (two 32-bit floats)."""

    def __init__(self, shape=(), dflt=0.0+0j): ...


class Complex128Atom(Atom):
    """128-bit complex atom (two 64-bit floats)."""

    def __init__(self, shape=(), dflt=0.0+0j): ...


class Complex192Atom(Atom):
    """192-bit complex atom (two 96-bit floats)."""

    def __init__(self, shape=(), dflt=0.0+0j):
        """
        Note: Platform-dependent availability
        """


class Complex256Atom(Atom):
    """256-bit complex atom (two 128-bit floats)."""

    def __init__(self, shape=(), dflt=0.0+0j):
        """
        Note: Platform-dependent availability
        """
# Time Atoms
class TimeAtom(Atom):
    """Generic time atom (platform-dependent precision)."""

    def __init__(self, shape=(), dflt=0.0): ...


class Time32Atom(Atom):
    """32-bit time atom (integer seconds since epoch)."""

    def __init__(self, shape=(), dflt=0.0): ...


class Time64Atom(Atom):
    """64-bit time atom (floating-point seconds since epoch, sub-second precision)."""

    def __init__(self, shape=(), dflt=0.0): ...
# Special Atoms
class EnumAtom(Atom):
"""Enumerated type atom with named values."""
def __init__(self, enum, dflt, base=None, shape=()):
"""
Parameters:
- enum (Enum): Enumeration definition
- dflt (any): Default enumeration value
- base (Atom): Base atom type for storage
- shape (tuple): Shape for arrays of enums
"""
class PseudoAtom(Atom):
"""Pseudo-atom for complex data types."""
def __init__(self, kind, shape=(), dflt=None): ...
class ObjectAtom(Atom):
"""Object atom for Python object storage (with pickle)."""
def __init__(self, shape=(), dflt=None): ...Column types are used in table descriptions to define the structure and data types for table columns.
class Col:
    """Base class for all column types."""

    def __init__(self, type=None, itemsize=None, shape=(), dflt=None, pos=None):
        """Base column constructor.

        Parameters:
        - type (str): Column type identifier
        - itemsize (int): Size specification for variable types
        - shape (tuple): Shape for multidimensional columns
        - dflt (any): Default value
        - pos (int): Column position in table
        """
# String Columns
class StringCol(Col):
    """Fixed-length string column."""

    def __init__(self, itemsize, shape=(), dflt=b'', pos=None):
        """
        Parameters:
        - itemsize (int): Maximum string length
        - shape (tuple): Shape for string arrays
        - dflt (bytes): Default value
        - pos (int): Column position
        """
# Boolean Columns
class BoolCol(Col):
    """Boolean column."""

    def __init__(self, shape=(), dflt=False, pos=None): ...
# Integer Columns
class IntCol(Col):
    """Generic signed integer column."""

    def __init__(self, shape=(), dflt=0, pos=None): ...


class UIntCol(Col):
    """Generic unsigned integer column."""

    def __init__(self, shape=(), dflt=0, pos=None): ...


class Int8Col(Col):
    """8-bit signed integer column."""

    def __init__(self, shape=(), dflt=0, pos=None): ...


class UInt8Col(Col):
    """8-bit unsigned integer column."""

    def __init__(self, shape=(), dflt=0, pos=None): ...


class Int16Col(Col):
    """16-bit signed integer column."""

    def __init__(self, shape=(), dflt=0, pos=None): ...


class UInt16Col(Col):
    """16-bit unsigned integer column."""

    def __init__(self, shape=(), dflt=0, pos=None): ...


class Int32Col(Col):
    """32-bit signed integer column."""

    def __init__(self, shape=(), dflt=0, pos=None): ...


class UInt32Col(Col):
    """32-bit unsigned integer column."""

    def __init__(self, shape=(), dflt=0, pos=None): ...


class Int64Col(Col):
    """64-bit signed integer column."""

    def __init__(self, shape=(), dflt=0, pos=None): ...


class UInt64Col(Col):
    """64-bit unsigned integer column."""

    def __init__(self, shape=(), dflt=0, pos=None): ...
# Floating Point Columns
class FloatCol(Col):
    """Generic floating point column."""

    def __init__(self, shape=(), dflt=0.0, pos=None): ...


class Float32Col(Col):
    """32-bit floating point column."""

    def __init__(self, shape=(), dflt=0.0, pos=None): ...


class Float64Col(Col):
    """64-bit floating point column."""

    def __init__(self, shape=(), dflt=0.0, pos=None): ...


class Float16Col(Col):
    """16-bit floating point column."""

    def __init__(self, shape=(), dflt=0.0, pos=None):
        """
        Note: Available when NumPy supports float16 type
        """


class Float96Col(Col):
    """96-bit extended precision floating point column."""

    def __init__(self, shape=(), dflt=0.0, pos=None):
        """
        Note: Platform-dependent availability
        """


class Float128Col(Col):
    """128-bit quadruple precision floating point column."""

    def __init__(self, shape=(), dflt=0.0, pos=None):
        """
        Note: Platform-dependent availability
        """
# Complex Number Columns
class ComplexCol(Col):
    """Generic complex number column."""

    def __init__(self, shape=(), dflt=0.0+0j, pos=None): ...


class Complex32Col(Col):
    """32-bit complex column."""

    def __init__(self, shape=(), dflt=0.0+0j, pos=None): ...


class Complex64Col(Col):
    """64-bit complex column."""

    def __init__(self, shape=(), dflt=0.0+0j, pos=None): ...


class Complex128Col(Col):
    """128-bit complex column."""

    def __init__(self, shape=(), dflt=0.0+0j, pos=None): ...


class Complex192Col(Col):
    """192-bit complex column (two 96-bit floats)."""

    def __init__(self, shape=(), dflt=0.0+0j, pos=None):
        """
        Note: Platform-dependent availability
        """


class Complex256Col(Col):
    """256-bit complex column (two 128-bit floats)."""

    def __init__(self, shape=(), dflt=0.0+0j, pos=None):
        """
        Note: Platform-dependent availability
        """
# Time Columns
class TimeCol(Col):
    """Generic time column."""

    def __init__(self, shape=(), dflt=0.0, pos=None): ...


class Time32Col(Col):
    """32-bit time column."""

    def __init__(self, shape=(), dflt=0.0, pos=None): ...


class Time64Col(Col):
    """64-bit time column."""

    def __init__(self, shape=(), dflt=0.0, pos=None): ...
# Special Columns
class EnumCol(Col):
    """Enumerated type column."""

    def __init__(self, enum, dflt, base=None, shape=(), pos=None):
        """
        Parameters:
        - enum (Enum): Enumeration definition
        - dflt (any): Default enumeration value
        - base (Col): Base column type for storage
        - shape (tuple): Shape for enum arrays
        - pos (int): Column position
        """


def split_type(type):
    """Split a type specification into components.

    Parameters:
    - type (str): Type specification string

    Returns:
    tuple: (kind, itemsize) components of the type
    """


import tables as tb
# Method 1: Class-based description

# The enumeration is defined at module level: IsDescription treats every
# public class attribute as a column definition, so a tb.Enum placed inside
# the class body would be rejected as an invalid column.
Status = tb.Enum(['active', 'paused', 'stopped'])


class Experiment(tb.IsDescription):
    # Basic types
    run_id = tb.Int64Col()                    # 64-bit integer
    timestamp = tb.Time64Col()                # Float seconds since epoch
    temperature = tb.Float32Col()             # 32-bit float
    active = tb.BoolCol()                     # Boolean

    # String types
    name = tb.StringCol(50)                   # Fixed-length string (50 bytes)
    notes = tb.StringCol(200, dflt=b'')       # With default value

    # Array types
    coordinates = tb.Float64Col(shape=(3,))   # 3D position vector
    measurements = tb.Int16Col(shape=(10,))   # Array of 10 measurements

    # Complex types
    signal = tb.Complex64Col()                # Complex number

    # Enumerated type, stored on disk as an 8-bit unsigned integer
    status = tb.EnumCol(Status, 'active', base='uint8')


# Method 2: Dictionary-based description
experiment_desc = {
    'run_id': tb.Int64Col(),
    'timestamp': tb.Time64Col(),
    'temperature': tb.Float32Col(),
    'name': tb.StringCol(50),
    'coordinates': tb.Float64Col(shape=(3,)),
    'measurements': tb.Int16Col(shape=(10,)),
}

# Create table with either approach
with tb.open_file("experiment.h5", "w") as h5file:
    table1 = h5file.create_table("/", "exp_class", Experiment)
    table2 = h5file.create_table("/", "exp_dict", experiment_desc)

import tables as tb
import numpy as np

with tb.open_file("atoms.h5", "w") as h5file:
    # Create arrays with specific atom types

    # String array
    string_atom = tb.StringAtom(20)  # 20-byte strings
    string_array = h5file.create_carray(
        "/", "strings", string_atom,
        shape=(100,), filters=tb.Filters(complevel=1))

    # Time series data
    time_atom = tb.Time64Atom()  # Float seconds since epoch
    time_array = h5file.create_earray(
        "/", "timestamps", time_atom,
        shape=(0,), expectedrows=100000)

    # Complex signal data
    complex_atom = tb.Complex128Atom()
    signal_array = h5file.create_carray(
        "/", "signal", complex_atom,
        shape=(1000, 1000))

    # Multidimensional atoms: each of the 100 elements is a 3-component
    # vector, so the data read back has shape (100, 3).  Compound (structured)
    # NumPy arrays cannot be stored with create_array, so the atom is used
    # directly.
    vector_atom = tb.Float32Atom(shape=(3,))  # 3D vectors
    vector_array = h5file.create_carray(
        "/", "vectors", vector_atom, shape=(100,))
    vector_array[:] = np.zeros((100, 3), dtype=np.float32)

import tables as tb
from enum import Enum
# Custom enumeration
class Priority(Enum):
LOW = 1
MEDIUM = 2
HIGH = 3
CRITICAL = 4
# Table with mixed advanced types
class TaskDescription(tb.IsDescription):
task_id = tb.UInt32Col()
created = tb.Time64Col()
# Variable-length strings (stored as objects)
title = tb.StringCol(100)
description = tb.StringCol(500, dflt=b'No description')
# Custom enumeration
priority = tb.EnumCol(Priority, Priority.MEDIUM, base=tb.UInt8Col())
# Multi-dimensional data
progress_history = tb.Float32Col(shape=(10,)) # Last 10 progress values
# Complex metadata (stored as pickled objects)
metadata = tb.ObjectAtom()
with tb.open_file("tasks.h5", "w") as h5file:
table = h5file.create_table("/", "tasks", TaskDescription)
# Add sample data
row = table.row
row['task_id'] = 1
row['created'] = 1640995200000000 # Timestamp in microseconds
row['title'] = b'Implement feature X'
row['priority'] = Priority.HIGH
row['progress_history'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
row['metadata'] = {'tags': ['urgent', 'backend'], 'assignee': 'developer'}
row.append()
table.flush()Install with Tessl CLI
npx tessl i tessl/pypi-tables