HDF5 file support for vaex DataFrame library with memory-mapped access and specialized format readers
npx @tessl/cli install tessl/pypi-vaex-hdf5@0.14.0

HDF5 file support for the Vaex high-performance Python library that enables lazy out-of-core DataFrame operations on large datasets. It offers memory-mapped HDF5 file reading capabilities with zero-copy access patterns, supports various HDF5 dataset formats including scientific data from Gadget simulations and the AMUSE astrophysics framework, and provides efficient data export functionality to HDF5 format.

pip install vaex-hdf5

import vaex.hdf5.dataset
import vaex.hdf5.export
import vaex.hdf5.writer
import vaex.hdf5.utils

For direct dataset access:

from vaex.hdf5.dataset import Hdf5MemoryMapped, AmuseHdf5MemoryMapped, Hdf5MemoryMappedGadget

import vaex
# Reading HDF5 files (automatic detection via vaex.open)
df = vaex.open('data.hdf5')
# Reading specialized formats
df_amuse = vaex.open('simulation.hdf5') # AMUSE format auto-detected
df_gadget = vaex.open('snapshot.hdf5#0') # Gadget format with particle type
# Exporting to HDF5
df = vaex.from_csv('data.csv')
df.export('output.hdf5')
# Manual dataset creation
from vaex.hdf5.dataset import Hdf5MemoryMapped
dataset = Hdf5MemoryMapped.create('new_file.hdf5', N=1000,
column_names=['x', 'y', 'z'])
# High-performance writing with Writer
from vaex.hdf5.writer import Writer
with Writer('output.hdf5') as writer:
writer.layout(df)
writer.write(df)

The vaex-hdf5 package is built around several key components:
The package integrates seamlessly with the broader Vaex ecosystem through entry points that register HDF5 dataset openers, enabling automatic format detection and optimal performance for billion-row datasets through lazy evaluation and memory mapping techniques.
Memory-mapped reading of HDF5 files with support for standard vaex format, AMUSE scientific data format, and Gadget2 simulation format. Provides zero-copy access patterns and automatic format detection.
class Hdf5MemoryMapped:
def __init__(self, path, write=False, fs_options={}, fs=None, nommap=None, group=None, _fingerprint=None): ...
@classmethod
def create(cls, path, N, column_names, dtypes=None, write=True): ...
@classmethod
def can_open(cls, path, fs_options={}, fs=None, group=None, **kwargs): ...
def write_meta(self): ...
def close(self): ...
class AmuseHdf5MemoryMapped(Hdf5MemoryMapped):
def __init__(self, path, write=False, fs_options={}, fs=None): ...
class Hdf5MemoryMappedGadget(DatasetMemoryMapped):
def __init__(self, path, particle_name=None, particle_type=None, fs_options={}, fs=None): ...

High-level functions for exporting vaex DataFrames to HDF5 format with support for both version 1 and version 2 formats, compression options, and streaming export for large datasets.
def export_hdf5(dataset, path, column_names=None, byteorder="=", shuffle=False,
selection=False, progress=None, virtual=True, sort=None,
ascending=True, parallel=True): ...
def export_hdf5_v1(dataset, path, column_names=None, byteorder="=", shuffle=False,
selection=False, progress=None, virtual=True): ...

Low-level writer classes for streaming large datasets to HDF5 format with optimal memory usage, parallel writing support, and specialized column writers for different data types.
class Writer:
def __init__(self, path, group="/table", mode="w", byteorder="="): ...
def layout(self, df, progress=None): ...
def write(self, df, chunk_size=int(1e5), parallel=True, progress=None,
column_count=1, export_threads=0): ...
def close(self): ...
def __enter__(self): ...
def __exit__(self, *args): ...

Low-level utilities for memory mapping HDF5 datasets and arrays with support for masked arrays and different storage layouts.
def mmap_array(mmap, file, offset, dtype, shape): ...
def h5mmap(mmap, file, data, mask=None): ...

# Common type aliases used throughout the API
PathLike = Union[str, Path]
FileSystemOptions = Dict[str, Any]
FileSystem = Any # fsspec filesystem
ProgressCallback = Callable[[float], bool]
ByteOrder = Literal["=", "<", ">"]