Hierarchical datasets for Python with HDF5 library for managing extremely large amounts of data

npx @tessl/cli install tessl/pypi-tables@3.10.0

A comprehensive Python library for managing hierarchical datasets, designed to efficiently cope with extremely large amounts of data. PyTables is built on top of the HDF5 library and NumPy, featuring an object-oriented interface combined with Cython-generated C extensions for performance-critical operations. It provides fast interactive data storage and retrieval capabilities with advanced compression, indexing, and querying features optimized for scientific computing and data analysis workflows.
Installation:

pip install tables

Basic import:

import tables

Common patterns for file operations:

import tables as tb

For specific functionality:

from tables import open_file, File, Group, Table, Array
from tables import StringCol, IntCol, FloatCol  # Column types
from tables import Filters  # Compression

import tables as tb
# Example: build an HDF5 file containing a group, a structured table, and an array.
import tables as tb
import numpy as np

# Open/create an HDF5 file
h5file = tb.open_file("example.h5", mode="w", title="Example File")

# Create a group for organization
group = h5file.create_group("/", "detector", "Detector Information")


# Create a table with structured data; each class attribute declares one column.
class Particle(tb.IsDescription):
    name = tb.StringCol(16)      # 16-character string
    idnumber = tb.Int64Col()     # signed 64-bit integer
    ADCcount = tb.UInt16Col()    # unsigned 16-bit integer
    TDCcount = tb.UInt8Col()     # unsigned 8-bit integer
    energy = tb.Float32Col()     # 32-bit floating point
    timestamp = tb.Time64Col()   # timestamp


table = h5file.create_table(group, 'readout', Particle, "Readout example")

# Add data to the table; Table.row is a reusable row buffer.
particle = table.row
for i in range(10):
    particle['name'] = f'Particle: {i:6d}'
    particle['TDCcount'] = i % 256  # keep within UInt8 range
    # randint's upper bound is exclusive, so 65536 covers the full UInt16 range
    particle['ADCcount'] = np.random.randint(0, 65536)
    particle['energy'] = np.random.random()
    particle['timestamp'] = i * 1.0
    particle.append()
table.flush()  # push buffered rows to disk

# Create arrays for homogeneous data
array_c = h5file.create_array(group, 'array_c', np.arange(100), "Array C")

# Query data with an in-kernel condition string
results = [row for row in table.where('TDCcount > 5')]

# Close file
h5file.close()

PyTables implements a hierarchical tree structure similar to a filesystem:
The design emphasizes memory efficiency, disk optimization, and seamless integration with NumPy arrays while providing ACID transaction capabilities through undo/redo mechanisms.
Core file management including opening, creating, copying, and validating PyTables/HDF5 files with comprehensive mode control and optimization options.
# Core file-management API (stub signatures; see PyTables top-level docs).
def open_file(filename, mode="r", title="", root_uep="/", filters=None, **kwargs): ...  # opens/creates a file and returns a File handle (per docs)
def copy_file(srcfilename, dstfilename, overwrite=False, **kwargs): ...  # copy an entire PyTables file to a new location
def is_hdf5_file(filename): ...  # True if the file is a valid HDF5 file
def is_pytables_file(filename): ...

Group-based hierarchical organization for structuring datasets in tree-like namespaces with directory-style navigation and node management.
class Group:
    """Hierarchical container node with directory-style navigation (stub listing)."""

    def _f_walknodes(self, classname=None): ...   # iterate descendant nodes, optionally filtered by class name
    def _f_list_nodes(self, classname=None): ...  # list immediate child nodes
    def __contains__(self, name): ...             # `name in group` membership test
    def __getitem__(self, name): ...

Table-based structured data storage with column-oriented access, conditional querying, indexing, and modification capabilities for record-based datasets.
class Table:
    """Structured, record-oriented dataset with conditional querying (stub listing)."""

    def read(self, start=None, stop=None, step=None, field=None, out=None): ...
    def read_where(self, condition, condvars=None, **kwargs): ...  # read only rows matching the condition string
    def where(self, condition, condvars=None, start=None, stop=None): ...  # iterate rows matching the condition
    def append(self, rows): ...
    def modify_column(self, start=None, stop=None, step=None, column=None, value=None): ...

Array-based homogeneous data storage including standard arrays, chunked arrays, enlargeable arrays, and variable-length arrays with NumPy integration.
class Array:
    """Homogeneous dataset with NumPy-style indexed access (stub listing)."""

    def read(self, start=None, stop=None, step=None, out=None): ...
    def __getitem__(self, key): ...
    def __setitem__(self, key, value): ...


class EArray:
    """Enlargeable array: appendable along one dimension (stub listing)."""

    def append(self, sequence): ...
    def read(self, start=None, stop=None, step=None, out=None): ...

Comprehensive type system with Atom types for individual data elements and Column types for table structure definitions, supporting all NumPy data types plus specialized types.
# Type-system stubs: IsDescription declares table schemas; Atom types describe
# individual data elements; Col types declare table columns.
class IsDescription: ...
# Atom types
class StringAtom: ...
class IntAtom: ...
class FloatAtom: ...
class TimeAtom: ...
# Column types
class StringCol: ...
class IntCol: ...
class FloatCol: ...
class TimeCol: ...

Advanced compression and filtering system supporting multiple algorithms (zlib, blosc, blosc2, bzip2, lzo) with configurable parameters for optimal storage and I/O performance.
class Filters:
    """Compression and filter settings applied to datasets (stub listing)."""

    def __init__(self, complevel=0, complib="zlib", shuffle=True, bitshuffle=False, fletcher32=False): ...


def set_blosc_max_threads(nthreads): ...  # module-level Blosc thread-count tuning helper
def set_blosc2_max_threads(nthreads): ...

Expression-based querying system with compiled expressions, B-tree indexing, and conditional iteration for efficient data retrieval from large datasets.
class Expr:
    """Compiled expression evaluator over columns/arrays (stub listing)."""

    def eval(self): ...
    def append(self, expr): ...


# Table methods (listed standalone here; defined on Table in the real API)
def create_index(self, **kwargs): ...
def remove_index(self): ...
    def reindex(self): ...

Complete undo/redo transaction system with marks, rollback capabilities, and ACID-compliant operations for data integrity and collaborative workflows.
class File:
    """Undo/redo transaction API of the File class (stub listing)."""

    def enable_undo(self, filters=None): ...
    def disable_undo(self): ...
    def mark(self, name=None): ...   # set a named mark to roll back to
    def undo(self, mark=None): ...
    def redo(self, mark=None): ...

class File:
"""Main PyTables file interface."""
# File lifecycle and node-creation methods (stub signatures).
def __init__(self, filename, mode="r", title="", root_uep="/", filters=None, **kwargs): ...
def close(self): ...
# NOTE(review): presumably flushes pending buffers without closing — confirm against PyTables docs
def flush(self): ...
def create_group(self, where, name, title="", filters=None, createparents=False): ...
def create_table(self, where, name, description, title="", filters=None, expectedrows=10000, createparents=False, sample=None, byteorder=None, **kwargs): ...
def create_array(self, where, name, object, title="", byteorder=None, createparents=False, sample=None): ...
class Node:
    """Base class for all PyTables nodes (stub listing)."""

    def _f_close(self): ...
    def _f_copy(self, newparent=None, newname=None, overwrite=False, recursive=False, createparents=False, **kwargs): ...
    def _f_move(self, newparent=None, newname=None, overwrite=False, createparents=False): ...
    def _f_remove(self): ...
    def _f_rename(self, newname): ...
class IsDescription:
    """Base class for declaring table record descriptions."""
    pass
class UnImplemented(Leaf):
    """
    Represents datasets not supported by PyTables in generic HDF5 files.

    Used when PyTables encounters HDF5 datasets with unsupported datatype
    or dataspace combinations. Allows access to metadata and attributes
    but not the actual data.
    """
class Unknown(Leaf):
    """
    Represents unknown node types in HDF5 files.

    Used as a fallback for HDF5 nodes that cannot be classified
    into any supported PyTables category.
    """
from typing import Any

# Alias for a mapping of filter/compression property names to their values.
# (Was dict[str, any] — builtin any() is a function; typing.Any is the intended type.)
FilterProperties = dict[str, Any]

# Module metadata attributes (values assigned elsewhere at runtime).
__version__: str       # PyTables version string
hdf5_version: str      # underlying HDF5 library version string
class Enum:
    """
    Enumerated type for defining named value sets.

    Variables of this type take one of a predefined set of named values;
    each member pairs a name with a concrete value.
    """

    def __init__(self, enum_values):
        """
        Create an enumeration from a sequence or a mapping.

        Parameters:
        - enum_values: sequence of names, or mapping of names to values
        """


# Core Exceptions
class HDF5ExtError(Exception):
    """Errors reported by the underlying HDF5 library."""

class ClosedNodeError(ValueError):
    """Operation attempted on a closed node."""

class ClosedFileError(ValueError):
    """Operation attempted on a closed file."""

class FileModeError(ValueError):
    """Operation invalid for the file's open mode."""

class NodeError(AttributeError):
    """General node-related errors."""

class NoSuchNodeError(LookupError):
    """Access to a non-existent node."""

# Specialized Exceptions
class UndoRedoError(Exception):
    """Undo/redo system errors."""

class FlavorError(TypeError):
    """Data flavor conversion errors."""

class ChunkError(ValueError):
    """Chunking-related errors."""

class NotChunkedError(ChunkError):
    """Operation requires a chunked layout."""

# Warning Classes
class NaturalNameWarning(UserWarning):
    """Natural naming convention warnings."""

class PerformanceWarning(UserWarning):
    """Performance-related warnings."""
class DataTypeWarning(UserWarning):
"""Data type compatibility warnings."""def test():
"""Run the PyTables test suite."""
def print_versions():
    """Print version information for PyTables and its dependencies."""


def silence_hdf5_messages():
    """Suppress HDF5 diagnostic messages."""


def restrict_flavors(keep=None):
    """
    Restrict the available NumPy data flavors.

    Parameters:
    - keep (list): flavors to keep available
    """


def get_pytables_version():
    """
    Get the PyTables version string.

    Returns:
        str: PyTables version

    Note: Deprecated, use tables.__version__ instead
    """
def get_hdf5_version():
    """
    Get HDF5 library version string.

    Returns:
        str: HDF5 version

    Note: Deprecated, use tables.hdf5_version instead
    """

PyTables provides several command-line utilities for file management and inspection:
These tools are available after installing PyTables and can be run directly from the command line.