HDF5 file support for vaex DataFrame library with memory-mapped access and specialized format readers
Low-level writer classes for streaming large datasets to HDF5 format with optimal memory usage, parallel writing support, and specialized column writers for different data types.
The primary interface for high-performance HDF5 writing with memory mapping and parallel processing support.
class Writer:
"""
High-level HDF5 writer optimized for large DataFrame export.
Provides streaming write capabilities using memory mapping for optimal
performance and supports parallel column writing.
"""
def __init__(self, path, group="/table", mode="w", byteorder="="):
"""
Initialize HDF5 writer.
Parameters:
- path: Output file path
- group: HDF5 group path for table data (default: "/table")
- mode: File open mode ("w" for write, "a" for append)
- byteorder: Byte order ("=" for native, "<" for little endian, ">" for big endian)
"""
def layout(self, df, progress=None):
"""
Set up file layout and allocate space for DataFrame.
This must be called before write(). Analyzes the DataFrame schema,
calculates storage requirements, and pre-allocates HDF5 datasets.
Parameters:
- df: DataFrame to analyze and prepare for writing
- progress: Progress callback for layout operations
Raises:
AssertionError: If layout() called twice
ValueError: If DataFrame is empty
"""
def write(self, df, chunk_size=100000, parallel=True, progress=None,
column_count=1, export_threads=0):
"""
Write DataFrame data to HDF5 file.
Streams data in chunks using memory mapping for optimal performance.
Parameters:
- df: DataFrame to write (must match layout() DataFrame)
- chunk_size: Number of rows to process per chunk (rounded to multiple of 8)
- parallel: Enable parallel processing within vaex
- progress: Progress callback for write operations
- column_count: Number of columns to process simultaneously
- export_threads: Number of threads for column writing (0 for single-threaded)
Raises:
AssertionError: If layout() not called first
ValueError: If DataFrame is empty
"""
def close(self):
"""
Close writer and clean up resources.
Closes memory maps, file handles, and HDF5 file.
Must be called when finished writing.
"""
def __enter__(self):
"""Context manager entry."""
def __exit__(self, *args):
"""Context manager exit - automatically calls close()."""Specialized writers for different data types with optimized storage strategies.
class ColumnWriterDictionaryEncoded:
"""
Writer for dictionary-encoded (categorical) columns.
Stores unique values in a dictionary and indices separately
for efficient storage of categorical data.
"""
def __init__(self, h5parent, name, dtype, values, shape, has_null, byteorder="=", df=None):
"""
Initialize dictionary-encoded column writer.
Parameters:
- h5parent: Parent HDF5 group
- name: Column name
- dtype: Dictionary-encoded data type
- values: Array of unique category values
- shape: Column shape (rows, ...)
- has_null: Whether column contains null values
- byteorder: Byte order for numeric data
- df: Source DataFrame for extracting index values
Raises:
ValueError: If encoded index contains null values
"""
def mmap(self, mmap, file):
"""Set up memory mapping for writing."""
def write(self, values):
"""Write index values for a chunk of data."""
def write_extra(self):
"""Write the dictionary values."""
@property
def progress(self):
"""Get writing progress as fraction (0-1)."""class ColumnWriterPrimitive:
"""
Writer for primitive data types (numeric, datetime, boolean).
Handles standard data types with optional null bitmaps and masks.
"""
def __init__(self, h5parent, name, dtype, shape, has_mask, has_null, byteorder="="):
"""
Initialize primitive column writer.
Parameters:
- h5parent: Parent HDF5 group
- name: Column name
- dtype: Data type
- shape: Column shape (rows, ...)
- has_mask: Whether column has a mask array
- has_null: Whether column has null values (Arrow format)
- byteorder: Byte order
Raises:
ValueError: If both has_mask and has_null are True
"""
def mmap(self, mmap, file):
"""Set up memory mapping for writing."""
def write(self, values):
"""Write a chunk of values."""
def write_extra(self):
"""Write any extra data (no-op for primitives)."""
@property
def progress(self):
"""Get writing progress as fraction (0-1)."""class ColumnWriterString:
"""
Writer for variable-length string columns.
Uses efficient Arrow-style storage with separate data and index arrays.
"""
def __init__(self, h5parent, name, dtype, shape, byte_length, has_null):
"""
Initialize string column writer.
Parameters:
- h5parent: Parent HDF5 group
- name: Column name
- dtype: String data type
- shape: Column shape (rows,)
- byte_length: Total bytes needed for all strings
- has_null: Whether column contains null values
"""
def mmap(self, mmap, file):
"""Set up memory mapping for writing."""
def write(self, values):
"""Write a chunk of string values."""
def write_extra(self):
"""Write any extra data (no-op for strings)."""
@property
def progress(self):
"""Get writing progress as fraction (0-1)."""from vaex.hdf5.writer import Writer
import vaex
# Load large DataFrame
df = vaex.open('large_dataset.parquet')
# Context manager ensures proper cleanup
with Writer('output.hdf5') as writer:
writer.layout(df)
writer.write(df)

with Writer('output.hdf5', group='/data', byteorder='<') as writer:
# Set up layout with progress tracking
def layout_progress(fraction):
print(f"Layout progress: {fraction*100:.1f}%")
writer.layout(df, progress=layout_progress)
# Write with custom settings
def write_progress(fraction):
print(f"Write progress: {fraction*100:.1f}%")
writer.write(df,
chunk_size=50000, # Smaller chunks
parallel=True, # Enable parallel processing
progress=write_progress,
column_count=2, # Process 2 columns at once
export_threads=4) # Use 4 writer threads

# For very large datasets with limited memory
with Writer('huge_output.hdf5') as writer:
writer.layout(df)
# Use smaller chunks and disable threading
writer.write(df,
chunk_size=10000,
parallel=False,
export_threads=0)

# Write filtered DataFrame
df_filtered = df[df.score > 0.8]
with Writer('filtered_output.hdf5') as writer:
writer.layout(df_filtered)
writer.write(df_filtered)

# Without context manager (not recommended)
writer = Writer('output.hdf5')
try:
writer.layout(df)
writer.write(df)
finally:
writer.close() # Always close to prevent resource leaks

Choose chunk size based on available memory and data characteristics:
# For large datasets with sufficient memory
writer.write(df, chunk_size=1000000) # 1M rows per chunk
# For memory-constrained environments
writer.write(df, chunk_size=10000) # 10K rows per chunk
# Chunk size is automatically rounded to a multiple of 8

Configure parallelism based on system resources:
# CPU-intensive workloads
writer.write(df, parallel=True, export_threads=4)
# I/O-intensive workloads
writer.write(df, parallel=True, export_threads=1)
# Single-threaded for debugging
writer.write(df, parallel=False, export_threads=0)

The writer automatically uses memory mapping when possible, controlled by the
VAEX_USE_MMAP environment variable:
USE_MMAP = True  # Environment variable VAEX_USE_MMAP (default: True)
max_int32 = 2147483647  # Maximum 32-bit integer value

The writer automatically selects appropriate column writers:
- ColumnWriterPrimitive: numeric, datetime, and boolean columns
- ColumnWriterString: variable-length string columns
- ColumnWriterDictionaryEncoded: categorical (dictionary-encoded) columns
- TypeError: raised for unsupported data types

The writer creates HDF5 files with this structure:
/table/ # Main table group
├── columns/ # Column data group
│ ├── column1/ # Individual column group
│ │ └── data # Column data array
│ ├── column2/ # String column example
│ │ ├── data # String bytes
│ │ ├── indices # String offsets
│ │ └── null_bitmap # Null value bitmap (if needed)
│ └── column3/ # Dictionary-encoded example
│ ├── indices/ # Category indices
│ │ └── data
│ └── dictionary/ # Category values
│ └── data
└── @attrs
├── type: "table"
    └── column_order: "column1,column2,column3"

Writer classes may raise:
- AssertionError: If methods called in wrong order
- ValueError: For invalid parameters or empty DataFrames
- TypeError: For unsupported data types
- OSError: For file system errors
- h5py.H5Error: For HDF5 writing errors
- MemoryError: If insufficient memory for operations

Install with Tessl CLI:
npx tessl i tessl/pypi-vaex-hdf5