
tessl/pypi-vaex-hdf5

HDF5 file support for vaex DataFrame library with memory-mapped access and specialized format readers


HDF5 Dataset Reading

Memory-mapped reading of HDF5 files with support for multiple formats and zero-copy access patterns. The dataset readers provide efficient access to large datasets without loading entire files into memory.

Capabilities

Standard HDF5 Dataset Reading

The main class for reading HDF5 files in vaex format with memory mapping for optimal performance.

class Hdf5MemoryMapped(DatasetMemoryMapped):
    """
    Implements the vaex hdf5 file format with memory mapping support.
    
    Provides zero-copy access to HDF5 datasets through memory mapping,
    supporting both read and write operations with automatic format detection.
    """
    def __init__(self, path, write=False, fs_options={}, fs=None, nommap=None, group=None, _fingerprint=None):
        """
        Initialize HDF5 memory-mapped dataset.
        
        Parameters:
        - path: Path to HDF5 file
        - write: Enable write mode (default: False)
        - fs_options: Filesystem options for remote storage
        - fs: Filesystem implementation (for remote storage)
        - nommap: Force disable memory mapping
        - group: HDF5 group path to read from
        - _fingerprint: Cached fingerprint for testing
        """

Class Methods

@classmethod
def create(cls, path, N, column_names, dtypes=None, write=True):
    """
    Create a new empty HDF5 file with specified columns.
    
    Parameters:
    - path: Output file path
    - N: Number of rows to allocate
    - column_names: List of column names
    - dtypes: List of numpy dtypes (default: float64 for all)
    - write: Enable write mode
    
    Returns:
    Hdf5MemoryMapped instance of the created file
    
    Raises:
    ValueError: If N is 0 (cannot export empty table)
    """

@classmethod
def quick_test(cls, path, fs_options={}, fs=None):
    """
    Quick test if file has HDF5 extension.
    
    Parameters:
    - path: File path to test
    - fs_options: Filesystem options
    - fs: Filesystem implementation
    
    Returns:
    bool: True if path ends with .hdf5 or .h5
    """
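Since quick_test only inspects the file extension, its behavior can be sketched in plain Python; looks_like_hdf5 is a hypothetical name used purely for illustration:

```python
from pathlib import Path

def looks_like_hdf5(path):
    # Extension-only check, mirroring quick_test: no file I/O is performed
    return Path(path).suffix in ('.hdf5', '.h5')

looks_like_hdf5('data.hdf5')  # True
looks_like_hdf5('data.csv')   # False
```

Because no file is opened, this check is cheap but can misclassify files whose extension does not match their contents; can_open performs the deeper format check.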

@classmethod
def can_open(cls, path, fs_options={}, fs=None, group=None, **kwargs):
    """
    Check if file can be opened as vaex HDF5 format.
    
    Parameters:
    - path: File path to check
    - fs_options: Filesystem options
    - fs: Filesystem implementation  
    - group: Specific HDF5 group to check
    
    Returns:
    bool: True if file can be opened
    """

@classmethod
def get_options(cls, path):
    """Get available options for opening file."""
    
@classmethod  
def option_to_args(cls, option):
    """Convert option to constructor arguments."""

Instance Methods

def write_meta(self):
    """
    Write metadata (units, descriptions, UCDs) as HDF5 attributes.
    
    UCDs, descriptions and units are written as attributes in the HDF5 file
    itself, rather than in a separate file as the default Dataset.write_meta() does.
    """

def close(self):
    """Close the HDF5 file and clean up resources."""

AMUSE Format Support

Reader for HDF5 files created by the AMUSE astrophysics framework.

class AmuseHdf5MemoryMapped(Hdf5MemoryMapped):
    """
    Implements reading Amuse HDF5 files from amusecode.org.
    
    AMUSE (Astrophysical Multipurpose Software Environment) creates HDF5 files
    with specific structure containing particle data and metadata.
    """
    def __init__(self, path, write=False, fs_options={}, fs=None):
        """
        Initialize AMUSE HDF5 dataset reader.
        
        Parameters:
        - path: Path to AMUSE HDF5 file  
        - write: Enable write mode (default: False)
        - fs_options: Filesystem options
        - fs: Filesystem implementation
        """

    @classmethod
    def can_open(cls, path, *args, **kwargs):
        """
        Check if file is AMUSE HDF5 format.
        
        Parameters:
        - path: File path to check
        
        Returns:
        bool: True if file contains 'particles' group
        """
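As a rough illustration, this detection amounts to testing for a top-level 'particles' group with h5py; is_amuse_file below is a hypothetical helper sketching that check, not the library's actual implementation:

```python
import h5py

def is_amuse_file(path):
    # Hypothetical sketch: AMUSE-style files carry a top-level 'particles' group
    try:
        with h5py.File(path, 'r') as f:
            return 'particles' in f
    except OSError:
        # h5py raises OSError for missing or non-HDF5 files
        return False
```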

Gadget2 Format Support

Reader for HDF5 files created by the Gadget2 N-body simulation code.

class Hdf5MemoryMappedGadget(DatasetMemoryMapped):
    """
    Implements reading Gadget2 HDF5 files.
    
    Gadget2 is a cosmological N-body/SPH simulation code that outputs
    HDF5 files with particle data organized by particle type.
    """
    def __init__(self, path, particle_name=None, particle_type=None, fs_options={}, fs=None):
        """
        Initialize Gadget2 HDF5 dataset reader.
        
        Parameters:
        - path: Path to Gadget2 HDF5 file (can include #<particle_type>)
        - particle_name: Name of particle type ("gas", "halo", "disk", "bulge", "stars", "dm")
        - particle_type: Numeric particle type (0-5)
        - fs_options: Filesystem options
        - fs: Filesystem implementation
        
        Note: one of particle_name, particle_type, or a #<type> suffix in the path must be specified
        """

    @classmethod
    def can_open(cls, path, fs_options={}, fs=None, *args, **kwargs):
        """
        Check if file is Gadget2 HDF5 format with specified particle type.
        
        Parameters:
        - path: File path (may include #<particle_type>)
        - fs_options: Filesystem options
        - fs: Filesystem implementation
        
        Returns:
        bool: True if file contains the specified particle type data
        """

    @classmethod
    def get_options(cls, path):
        """Get available options for Gadget2 file."""
        
    @classmethod
    def option_to_args(cls, option):
        """Convert option to constructor arguments."""
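The `#<particle_type>` path convention described above can be illustrated with a small parsing sketch; split_gadget_path is a hypothetical helper, not part of the package:

```python
def split_gadget_path(path):
    # Split 'snapshot.hdf5#0' into the file path and numeric particle type
    if '#' in path:
        base, fragment = path.rsplit('#', 1)
        return base, int(fragment)
    return path, None

split_gadget_path('snapshot_001.hdf5#0')  # ('snapshot_001.hdf5', 0)
split_gadget_path('snapshot_001.hdf5')    # ('snapshot_001.hdf5', None)
```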

Usage Examples

Reading Standard HDF5 Files

import vaex
from vaex.hdf5.dataset import Hdf5MemoryMapped

# Automatic detection via vaex.open
df = vaex.open('data.hdf5')

# Direct instantiation
dataset = Hdf5MemoryMapped('data.hdf5')
df = vaex.from_dataset(dataset)

# Reading specific group
dataset = Hdf5MemoryMapped('data.hdf5', group='/table')

# Reading from remote storage
dataset = Hdf5MemoryMapped('s3://bucket/data.hdf5', 
                          fs_options={'anon': True})

Creating New HDF5 Files

import vaex
from vaex.hdf5.dataset import Hdf5MemoryMapped
import numpy as np

# Create empty file with specified structure
dataset = Hdf5MemoryMapped.create(
    'new_data.hdf5', 
    N=1000,
    column_names=['x', 'y', 'z', 'velocity'],
    dtypes=[np.float64, np.float64, np.float64, np.float32],
    write=True
)

# Populate with data via the dataset's memory-mapped column arrays
dataset.columns['x'][:] = np.random.random(1000)
dataset.columns['y'][:] = np.random.random(1000)
# ... continue with data population
df = vaex.from_dataset(dataset)

Reading AMUSE Files

# AMUSE files auto-detected by vaex.open
df = vaex.open('amuse_simulation.hdf5')

# Direct instantiation
from vaex.hdf5.dataset import AmuseHdf5MemoryMapped
dataset = AmuseHdf5MemoryMapped('amuse_simulation.hdf5')
df = vaex.from_dataset(dataset)

Reading Gadget2 Files

# Using path with particle type
df_gas = vaex.open('snapshot_001.hdf5#0')  # Gas particles
df_dm = vaex.open('snapshot_001.hdf5#5')   # Dark matter particles

# Using particle name
from vaex.hdf5.dataset import Hdf5MemoryMappedGadget
dataset = Hdf5MemoryMappedGadget('snapshot_001.hdf5', particle_name='gas')
df = vaex.from_dataset(dataset)

# Using particle type number
dataset = Hdf5MemoryMappedGadget('snapshot_001.hdf5', particle_type=0)

Constants

gadget_particle_names = ["gas", "halo", "disk", "bulge", "stars", "dm"]

List of Gadget2 particle type names, ordered so that a name's list index is its numeric particle type (0-5).
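Because the list is ordered by particle type, a name can be converted to its numeric type with a plain index lookup:

```python
gadget_particle_names = ["gas", "halo", "disk", "bulge", "stars", "dm"]

# The list index is the numeric Gadget2 particle type
gadget_particle_names.index("gas")  # 0
gadget_particle_names.index("dm")   # 5
```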

Error Handling

All dataset readers may raise:

  • FileNotFoundError: If the specified file doesn't exist
  • OSError: For file permission, I/O, or low-level HDF5 format errors (h5py surfaces HDF5 failures as OSError)
  • ValueError: For invalid parameters or unsupported data formats
  • KeyError: If specified groups or datasets don't exist in the file

Install with Tessl CLI

npx tessl i tessl/pypi-vaex-hdf5
