tessl/pypi-tables

Hierarchical datasets for Python, built on the HDF5 library, for managing extremely large amounts of data

Overview

Eval results

Files

Querying and Indexing

Name: tessl/pypi-tables
Author: tessl

PyTables provides powerful querying capabilities through expression-based conditional access and column indexing (its OPSI index engine) for fast data retrieval. The query system supports complex conditions with optimized evaluation and memory-efficient iteration over large datasets.

Capabilities

Expression-Based Querying

class Expr:
    """Compiled expression object (API stub: signatures and docstrings only)."""

    def __init__(self, expr, uservars=None, **kwargs):
        """
        Compiled expression for fast evaluation.
        
        Parameters:
        - expr (str): Expression string
        - uservars (dict): Variables for expression evaluation; defaults to
          None. NOTE(review): what happens when it is omitted is not shown
          here -- confirm against the PyTables reference.
        - **kwargs: Additional keyword arguments, not described on this page.
          NOTE(review): presumably forwarded to the underlying expression
          engine -- confirm.
        """
        
    def eval(self):
        """
        Evaluate expression and return results.
        
        Returns:
        ndarray: Array with evaluation results.
        NOTE(review): this page originally described the result as a Boolean
        array; the dtype presumably follows the expression (Boolean only for
        comparison expressions) -- confirm against the PyTables reference.
        """
        
    def append(self, expr):
        """
        Append additional expression.
        
        Parameters:
        - expr (str): Expression to append

        NOTE(review): this method could not be matched to the upstream
        `tables.Expr` API -- verify that it exists before relying on it.
        """

Table Indexing

class Column:
    """Table column handle, reached as `table.cols.<name>` (API stub only)."""

    def create_index(self, optlevel=6, kind="medium", filters=None, tmp_dir=None):
        """
        Create index for fast querying.
        
        Parameters:
        - optlevel (int): Optimization level (0-9); defaults to 6
        - kind (str): Index type ("ultralight", "light", "medium", "full");
          defaults to "medium"
        - filters (Filters): Compression for index data; defaults to None
        - tmp_dir (str): Temporary directory for index creation; defaults
          to None

        NOTE(review): the return value is not documented on this page --
        confirm against the PyTables reference.
        """
        
    def remove_index(self):
        """Remove existing index from column."""
        
    def reindex(self):
        """Recreate index with current optimization settings."""
        
    def reindex_dirty(self):
        """Reindex if column data has been modified since last index update."""

Query Methods

class Table:
    """Table query methods (API stub: signatures and docstrings only)."""

    def read_where(self, condition, condvars=None, **kwargs):
        """
        Read rows matching condition.
        
        Parameters:
        - condition (str): Query condition, e.g. 'value > 30.0'
        - condvars (dict): Variables for condition (maps names used in the
          condition string to values); defaults to None
        - **kwargs: Additional keyword arguments, not described on this page.
          NOTE(review): presumably `field`, `start`, `stop`, `step` as in
          upstream PyTables -- confirm.
        
        Returns:
        ndarray: Matching rows
        """
        
    def where(self, condition, condvars=None, **kwargs):
        """
        Iterate over rows matching condition.
        
        Parameters:
        - condition (str): Query condition
        - condvars (dict): Variables for condition; defaults to None
        - **kwargs: Additional keyword arguments, not described on this page.
          NOTE(review): presumably `start`, `stop`, `step` as in upstream
          PyTables -- confirm.
        
        Yields:
        Row: Each matching row
        """
        
    def iread_where(self, condition, condvars=None, **kwargs):
        """
        Iterate over matching rows (alternative interface).
        
        Parameters:
        - condition (str): Query condition
        - condvars (dict): Variables for condition; defaults to None
        - **kwargs: Additional keyword arguments, not described on this page.
        
        Yields:
        Row: Each matching row

        NOTE(review): this method could not be matched to the upstream
        `tables.Table` API (which offers `where`, `read_where`,
        `get_where_list`, `append_where`) -- verify that it exists before
        relying on it.
        """

Usage Examples

import tables as tb
import numpy as np

# Build a small table of sensor readings, index it, then query it several ways.
with tb.open_file("indexed_data.h5", "w") as h5file:

    # Row layout for a single sensor reading.
    class Measurement(tb.IsDescription):
        sensor_id = tb.Int32Col()
        timestamp = tb.Time64Col()
        value = tb.Float64Col()
        quality = tb.StringCol(10)

    table = h5file.create_table("/", "measurements", Measurement)

    # Populate 10000 readings through the table's row buffer.
    record = table.row
    for tick in range(10000):
        record['sensor_id'] = np.random.randint(1, 100)
        record['timestamp'] = float(tick)
        record['value'] = np.random.normal(25.0, 5.0)
        # Roughly one reading in ten is flagged as bad.
        if np.random.random() > 0.1:
            record['quality'] = b'good'
        else:
            record['quality'] = b'bad'
        record.append()
    table.flush()

    # Index the columns that the queries below filter on.
    columns = table.cols
    for indexed_column in (columns.sensor_id, columns.timestamp, columns.value):
        indexed_column.create_index()

    # Single-condition queries.
    high_values = table.read_where('value > 30.0')
    sensor_data = table.read_where('sensor_id == 42')

    # Several conditions combined with `&`.
    recent_good = table.read_where('(timestamp > 5000.0) & (quality == b"good")')

    # Names inside the condition string are resolved through condvars.
    min_val = 20.0
    max_val = 35.0
    bounds = {'min_val': min_val, 'max_val': max_val}
    filtered = table.read_where(
        '(value >= min_val) & (value <= max_val)',
        condvars=bounds,
    )

    # Iterate over matches one at a time instead of loading them all at once.
    matches = table.where('(sensor_id < 10) & (value > 25.0)')
    for row in matches:
        print(f"Sensor {row['sensor_id']}: {row['value']}")

Install with Tessl CLI