Hierarchical datasets for Python, built on the HDF5 library, for managing extremely large amounts of data.
PyTables provides advanced compression and filtering capabilities using multiple algorithms including zlib, blosc, blosc2, bzip2, and lzo. The filtering system optimizes storage efficiency and I/O performance while maintaining data integrity through checksums and error detection.
class Filters:
    """Container for the compression and filtering options of a dataset.

    This is an API summary stub: it documents the constructor signature and
    the read-only properties, not the implementation.
    """

    def __init__(
        self,
        complevel: int = 0,
        complib: str = "zlib",
        shuffle: bool = True,
        bitshuffle: bool = False,
        fletcher32: bool = False,
    ) -> None:
        """Configure compression and filtering options.

        Parameters:
        - complevel (int): Compression level (0-9, 0=no compression)
        - complib (str): Compression library ("zlib", "blosc", "blosc2", "bzip2", "lzo")
        - shuffle (bool): Enable byte-shuffling filter
        - bitshuffle (bool): Enable bit-shuffling filter (blosc/blosc2 only)
        - fletcher32 (bool): Enable Fletcher32 checksum

        NOTE(review): upstream PyTables accepts further arguments (e.g.
        ``least_significant_digit``) that this summary omits -- confirm
        against the installed version.
        """

    @property
    def complevel(self) -> int:
        """Compression level (0-9)."""

    @property
    def complib(self) -> str:
        """Compression library name."""

    @property
    def shuffle(self) -> bool:
        """Byte-shuffling filter status."""

    @property
    def bitshuffle(self) -> bool:
        """Bit-shuffling filter status."""

    @property
    def fletcher32(self) -> bool:
        """Fletcher32 checksum status."""


def set_blosc_max_threads(nthreads: int):
    """Set maximum number of threads for Blosc compression.

    Parameters:
    - nthreads (int): Maximum threads (0 for automatic)

    NOTE(review): upstream PyTables documents that this call returns the
    previous thread limit -- confirm before relying on the return value.
    """
def set_blosc2_max_threads(nthreads: int):
    """Set maximum number of threads for Blosc2 compression.

    Parameters:
    - nthreads (int): Maximum threads (0 for automatic)

    NOTE(review): the return value is not documented here; the Blosc
    counterpart upstream returns the previous limit -- confirm whether this
    one does too.
    """
def blosc_compcode_to_compname(compcode: int) -> str:
    """Convert Blosc compression code to name.

    Parameters:
    - compcode (int): Compression code

    Returns:
    str: Compression algorithm name
    """
def blosc2_compcode_to_compname(compcode: int) -> str:
    """Convert Blosc2 compression code to name.

    Parameters:
    - compcode (int): Compression code

    Returns:
    str: Compression algorithm name
    """
def blosc_get_complib_info() -> dict:
    """Get information about available Blosc compression libraries.

    Returns:
    dict: Library information including versions and supported algorithms
    """
def blosc2_get_complib_info() -> dict:
    """Get information about available Blosc2 compression libraries.

    Returns:
    dict: Library information including versions and supported algorithms
    """
# NOTE(review): upstream PyTables appears to expose these as functions that
# report the compressors available in the linked Blosc/Blosc2 build; the
# static lists below are illustrative -- confirm against the installed version.
blosc_compressor_list = ["blosclz", "lz4", "lz4hc", "snappy", "zlib", "zstd"]
"""List of available Blosc compressors."""

blosc2_compressor_list = ["blosclz", "lz4", "lz4hc", "zlib", "zstd"]
"""List of available Blosc2 compressors."""

# Usage example: the statements that follow use the `tb` alias.
import tables as tb
import numpy as np

# Different compression configurations
filters_none = tb.Filters(complevel=0)  # No compression
filters_zlib = tb.Filters(complevel=6, complib="zlib", shuffle=True)
filters_blosc = tb.Filters(complevel=5, complib="blosc", shuffle=True, fletcher32=True)
filters_blosc2 = tb.Filters(complevel=1, complib="blosc2", bitshuffle=True)

with tb.open_file("compressed.h5", "w") as h5file:
    # Create arrays with different compression.
    # Uniform random values are close to incompressible, so expect only
    # modest differences; structured data shows the filters to better effect.
    data = np.random.random((1000, 1000))

    array_none = h5file.create_carray("/", "uncompressed", tb.Float64Atom(),
                                      data.shape, filters=filters_none)
    array_zlib = h5file.create_carray("/", "zlib_compressed", tb.Float64Atom(),
                                      data.shape, filters=filters_zlib)
    array_blosc = h5file.create_carray("/", "blosc_compressed", tb.Float64Atom(),
                                       data.shape, filters=filters_blosc)
    # The Blosc2 configuration was defined above but never exercised.
    array_blosc2 = h5file.create_carray("/", "blosc2_compressed", tb.Float64Atom(),
                                        data.shape, filters=filters_blosc2)

    # Fill with same data
    array_none[:] = data
    array_zlib[:] = data
    array_blosc[:] = data
    array_blosc2[:] = data

    # Flush before measuring so chunks still buffered in memory are written;
    # NOTE(review): size_on_disk may under-report otherwise -- confirm on
    # your PyTables/HDF5 version.
    h5file.flush()

    # Compare sizes
    print(f"Uncompressed: {array_none.size_on_disk} bytes")
    print(f"Zlib: {array_zlib.size_on_disk} bytes")
    print(f"Blosc: {array_blosc.size_on_disk} bytes")
    print(f"Blosc2: {array_blosc2.size_on_disk} bytes")
# Configure Blosc threading
tb.set_blosc_max_threads(4) # Use 4 threads for compression

Install with Tessl CLI:

npx tessl i tessl/pypi-tables