Hierarchical datasets for Python, built on the HDF5 library, for managing extremely large amounts of data.
PyTables provides powerful querying capabilities through expression-based conditional access and column indexing for fast data retrieval. The query system supports complex conditions with optimized evaluation and memory-efficient iteration over large datasets.
class Expr:
    """Compiled expression for fast evaluation."""

    def __init__(self, expr, uservars=None, **kwargs):
        """
        Compile an expression string for fast evaluation.

        Parameters:
        - expr (str): Expression string
        - uservars (dict): Variables for expression evaluation
        - **kwargs: Additional keyword options (not described in this listing)
        """

    def eval(self):
        """
        Evaluate expression and return results.

        Returns:
        ndarray: Array with evaluation results (boolean only when the
            expression is a comparison/condition)
        """

    def append(self, expr):
        """
        Append additional expression.

        Parameters:
        - expr (str): Expression to append

        NOTE(review): `append` does not appear in the upstream `tables.Expr`
        reference -- confirm it exists before relying on it.
        """


class Column:
    """Column of a table, with methods for managing its index."""

    def create_index(
        self, optlevel=6, kind="medium", filters=None, tmp_dir=None
    ):
        """
        Create index for fast querying.

        Parameters:
        - optlevel (int): Optimization level (0-9)
        - kind (str): Index type ("ultralight", "light", "medium", "full")
        - filters (Filters): Compression for index data
        - tmp_dir (str): Temporary directory for index creation
        """

    def remove_index(self):
        """Remove existing index from column."""

    def reindex(self):
        """Recreate index with current optimization settings."""

    def reindex_dirty(self):
        """Reindex if column data has been modified since last index update."""


class Table:
    """Table with condition-based row selection."""

    def read_where(self, condition, condvars=None, **kwargs):
        """
        Read rows matching condition.

        Parameters:
        - condition (str): Query condition
        - condvars (dict): Variables for condition
        - **kwargs: Additional query options (not described in this listing;
          presumably range selectors such as start/stop/step -- confirm)

        Returns:
        ndarray: Matching rows
        """

    def where(self, condition, condvars=None, **kwargs):
        """
        Iterate over rows matching condition.

        Parameters:
        - condition (str): Query condition
        - condvars (dict): Variables for condition
        - **kwargs: Additional query options (not described in this listing;
          presumably range selectors such as start/stop/step -- confirm)

        Yields:
        Row: Each matching row
        """

    def iread_where(self, condition, condvars=None, **kwargs):
        """
        Iterate over matching rows (alternative interface).

        Parameters:
        - condition (str): Query condition
        - condvars (dict): Variables for condition
        - **kwargs: Additional query options (not described in this listing)

        Yields:
        Row: Each matching row

        NOTE(review): `iread_where` does not appear in the upstream
        `tables.Table` reference -- confirm it exists before relying on it.
        """


# Usage example: build, index and query a table.
import tables as tb
import numpy as np

# Everything below runs inside the `with` block: the table and its query
# results are only usable while the file is open.
with tb.open_file("indexed_data.h5", "w") as h5file:
    # Create table with sample data
    class Measurement(tb.IsDescription):
        # Row schema: one sensor reading per row.
        sensor_id = tb.Int32Col()
        timestamp = tb.Time64Col()
        value = tb.Float64Col()
        quality = tb.StringCol(10)

    table = h5file.create_table("/", "measurements", Measurement)

    # Fill with sample data
    row = table.row
    for i in range(10000):
        row['sensor_id'] = np.random.randint(1, 100)
        row['timestamp'] = i * 1.0
        row['value'] = np.random.normal(25.0, 5.0)
        # Roughly 90% of the readings are tagged as good.
        row['quality'] = b'good' if np.random.random() > 0.1 else b'bad'
        row.append()
    # Flush the appended rows before indexing and querying them.
    table.flush()

    # Create indexes for faster queries
    table.cols.sensor_id.create_index()
    table.cols.timestamp.create_index()
    table.cols.value.create_index()

    # Simple queries
    high_values = table.read_where('value > 30.0')
    sensor_data = table.read_where('sensor_id == 42')

    # Complex queries with multiple conditions
    recent_good = table.read_where(
        '(timestamp > 5000.0) & (quality == b"good")'
    )

    # Queries with external variables
    min_val = 20.0
    max_val = 35.0
    filtered = table.read_where(
        '(value >= min_val) & (value <= max_val)',
        {'min_val': min_val, 'max_val': max_val},
    )

    # Memory-efficient iteration
    # A separate loop name keeps the `row` write buffer above distinct from
    # the rows being read back.
    for match in table.where('(sensor_id < 10) & (value > 25.0)'):
        print(f"Sensor {match['sensor_id']}: {match['value']}")

# Install with Tessl CLI
npx tessl i tessl/pypi-tables