CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-h5netcdf

netCDF4 file access via h5py with hierarchical and legacy APIs for scientific computing

69

0.83x
Overview
Eval results
Files

docs/variables.md

Variables and Data Access

Variables are the primary data containers in netCDF4 files, storing multidimensional arrays with associated metadata. They support various data types, compression options, and chunking strategies for efficient storage and access.

Capabilities

Variable Creation

Create variables with specified dimensions, data types, and storage options.

def create_variable(self, name: str, dimensions: tuple = (), dtype=None,
                    data=None, fillvalue=None, chunks: tuple = None,
                    chunking_heuristic: str = None, compression: str = None,
                    compression_opts: int = None, shuffle: bool = False,
                    fletcher32: bool = False, **kwargs) -> Variable:
    """
    Create a new variable in the group.

    Args:
        name (str): Variable name
        dimensions (tuple): Tuple of dimension names
        dtype: NumPy dtype or UserType for the variable data
        data: Initial data to store (optional)
        fillvalue: Fill value for missing data
        chunks (tuple): Chunk sizes for each dimension
        chunking_heuristic (str): Auto-chunking approach ('h5py' or 'h5netcdf')
        compression (str): Compression method ('gzip', 'lzf', 'szip')
        compression_opts (int): Compression level (0-9 for gzip)
        shuffle (bool): Apply shuffle filter before compression
        fletcher32 (bool): Apply Fletcher32 checksum
        **kwargs: Additional HDF5 dataset creation parameters

    Returns:
        Variable: The newly created variable
    """
    ...

Variable Properties

Access variable metadata and configuration.

class Variable(BaseVariable):
    """A named multidimensional array stored in a netCDF4/HDF5 file,
    exposing its dimensions, shape, data type, and attributes."""

    @property
    def name(self) -> str:
        """Variable name."""
        ...

    @property
    def dimensions(self) -> tuple:
        """Tuple of dimension names, one per axis of the variable."""
        ...

    @property
    def shape(self) -> tuple:
        """Current shape of the variable."""
        ...

    @property
    def ndim(self) -> int:
        """Number of dimensions."""
        ...

    @property
    def dtype(self) -> np.dtype:
        """NumPy data type of the stored values."""
        ...

    @property
    def datatype(self):
        """NetCDF datatype; may be a plain NumPy dtype or a user-defined
        type (EnumType, VLType, CompoundType)."""
        ...

    @property
    def attrs(self) -> Attributes:
        """Variable attributes (metadata key/value mapping)."""
        ...

Storage Configuration Properties

Access information about variable storage and compression.

@property
def chunks(self) -> tuple:
    """Chunk sizes for each dimension; None if the variable is not
    chunked (stored contiguously)."""
    ...

@property
def compression(self) -> str:
    """Compression method used ('gzip', 'lzf', 'szip'), or None when the
    data is stored uncompressed."""
    ...

@property
def compression_opts(self) -> int:
    """Compression options/level (e.g. 0-9 for gzip)."""
    ...

@property
def shuffle(self) -> bool:
    """Whether the byte-shuffle filter is applied before compression."""
    ...

@property
def fletcher32(self) -> bool:
    """Whether the Fletcher32 checksum filter is applied."""
    ...

Data Access

Read and write variable data using NumPy-style indexing.

def __getitem__(self, key) -> np.ndarray:
    """
    Read data from the variable using NumPy-style indexing.

    Indexing mirrors ``numpy.ndarray``: integers, slices, and tuples
    of indices/slices are accepted.

    Args:
        key: Index specification (int, slice, tuple of indices/slices)

    Returns:
        np.ndarray: The requested data
    """
    ...

def __setitem__(self, key, value) -> None:
    """
    Write data to the variable using NumPy-style indexing.

    Args:
        key: Index specification (int, slice, tuple of indices/slices)
        value: Data to write (scalar, array, or array-like)
    """
    ...
    
def __len__(self) -> int:
    """
    Return the size of the first dimension.

    Implements the sequence protocol so ``len(var)`` behaves like
    ``len(arr)`` on a NumPy array.

    Returns:
        int: Size of first dimension
    """
    ...

NumPy Integration

Seamless integration with NumPy arrays and operations.

def __array__(self, *args, **kwargs) -> np.ndarray:
    """NumPy array interface: ``np.asarray(var)`` loads the variable's
    entire contents into memory as an ndarray."""
    ...
    
def __repr__(self) -> str:
    """Human-readable string representation of the variable."""
    ...

Usage Examples

Basic Variable Operations

import h5netcdf
import numpy as np

with h5netcdf.File('variables.nc', 'w') as f:
    # Define the grid: time x lat x lon
    for dim_name, dim_size in (('time', 100), ('lat', 180), ('lon', 360)):
        f.dimensions[dim_name] = dim_size

    # Plain 3-D single-precision variable
    temp = f.create_variable('temperature', ('time', 'lat', 'lon'), dtype='f4')

    # Attach descriptive metadata
    temp.attrs['units'] = 'K'
    temp.attrs['long_name'] = 'Air Temperature'
    temp.attrs['valid_range'] = [200.0, 350.0]

    # Store one time slice of synthetic data
    temp[0, :, :] = np.random.random((180, 360)) * 50 + 273.15

    # Round-trip: read the slice back and inspect the variable
    first_timestep = temp[0, :, :]
    print(f"Temperature shape: {temp.shape}")
    print(f"Temperature dtype: {temp.dtype}")
Advanced Indexing

with h5netcdf.File('indexing.nc', 'r') as f:
    temp = f.variables['temperature']

    # NumPy-style slicing works directly on the variable
    all_data = temp[:]                       # everything
    first_time = temp[0, :, :]               # first time slice
    subset = temp[10:20, 50:100, 100:200]    # 3-D sub-box
    single_point = temp[15, 90, 180]         # one scalar value

    # List-of-indices ("fancy") selection
    specific_times = temp[[0, 5, 10], :, :]

    # Strided selection along the time axis
    every_10th = temp[::10, :, :]

Chunking and Compression

with h5netcdf.File('compressed.nc', 'w') as f:
    f.dimensions['time'] = None  # unlimited / appendable
    f.dimensions['lat'] = 721
    f.dimensions['lon'] = 1440

    # Collect the storage configuration in one place
    storage_opts = dict(
        dtype='f4',
        chunks=(1, 361, 720),    # chunk shape
        compression='gzip',      # compression method
        compression_opts=6,      # compression level
        shuffle=True,            # shuffle filter
        fletcher32=True,         # checksum
    )
    temp = f.create_variable('temperature', ('time', 'lat', 'lon'),
                             **storage_opts)

    # Verify the layout that was actually applied
    print(f"Chunks: {temp.chunks}")
    print(f"Compression: {temp.compression}")
    print(f"Compression level: {temp.compression_opts}")
    print(f"Shuffle: {temp.shuffle}")
    print(f"Fletcher32: {temp.fletcher32}")

Fill Values and Missing Data

with h5netcdf.File('missing_data.nc', 'w') as f:
    f.dimensions['time'] = 10
    f.dimensions['station'] = 50

    # Unwritten elements take the fill value
    temp = f.create_variable('temperature', ('time', 'station'),
                             dtype='f4', fillvalue=-999.0)

    # Only half of the stations report at t=0; the rest stay filled
    temp[0, :25] = np.random.random(25) * 30 + 273.15

    # Mask out the fill value when reading
    data = temp[:]
    valid_mask = data != -999.0
    valid_data = data[valid_mask]
    print(f"Valid measurements: {len(valid_data)}")

Working with Different Data Types

with h5netcdf.File('data_types.nc', 'w') as f:
    f.dimensions['n'] = 100

    # Signed 32-bit integers
    int_var = f.create_variable('integers', ('n',), dtype='i4')
    int_var[:] = np.arange(100)

    # Double-precision floats
    float_var = f.create_variable('floats', ('n',), dtype='f8')
    float_var[:] = np.random.random(100)

    # Fixed-width strings: one character per cell along a length axis
    f.dimensions['str_len'] = 20
    str_var = f.create_variable('strings', ('n', 'str_len'), dtype='S1')

    # Boolean-like flags stored as 8-bit integers
    bool_var = f.create_variable('flags', ('n',), dtype='i1')
    bool_var[:] = np.random.choice([0, 1], 100)

Unlimited Dimensions

with h5netcdf.File('unlimited.nc', 'w') as f:
    # 'time' grows on demand; 'station' is fixed
    f.dimensions['time'] = None
    f.dimensions['station'] = 10

    temp = f.create_variable('temperature', ('time', 'station'), dtype='f4')

    # Append along the unlimited axis one record at a time
    for step in range(5):
        temp[step, :] = np.random.random(10) * 30 + 273.15

    print(f"Current time dimension size: {f.dimensions['time'].size}")
    print(f"Variable shape: {temp.shape}")

Coordinate Variables

with h5netcdf.File('coordinates.nc', 'w') as f:
    # Grid dimensions
    f.dimensions['lat'] = 180
    f.dimensions['lon'] = 360
    f.dimensions['time'] = 12

    # Coordinate variables share their dimension's name;
    # drive their creation from a single table of specs.
    coord_specs = [
        ('lat', 'f4', np.linspace(-89.5, 89.5, 180),
         {'units': 'degrees_north', 'long_name': 'Latitude'}),
        ('lon', 'f4', np.linspace(-179.5, 179.5, 360),
         {'units': 'degrees_east', 'long_name': 'Longitude'}),
        ('time', 'f8', np.arange(12),
         {'units': 'months since 2023-01-01', 'calendar': 'standard'}),
    ]
    for cname, ctype, cvalues, cattrs in coord_specs:
        cvar = f.create_variable(cname, (cname,), dtype=ctype)
        cvar[:] = cvalues
        for akey, aval in cattrs.items():
            cvar.attrs[akey] = aval

    # Data variable referencing these coordinates
    temp = f.create_variable('temperature', ('time', 'lat', 'lon'), dtype='f4')
    temp.attrs['coordinates'] = 'time lat lon'

Performance Considerations

Chunking Strategy

  • Time series data: Chunk along time dimension for efficient appends
  • Spatial data: Chunk to match typical access patterns (e.g., geographic tiles)
  • Rule of thumb: Aim for chunk sizes of 10KB to 1MB

Compression Guidelines

  • gzip: Good general-purpose compression, level 6 is often optimal
  • lzf: Faster compression/decompression, lower ratio
  • szip: Good for scientific data, but may be unavailable in some builds due to patent/licensing restrictions
  • shuffle: Almost always beneficial with compression

Memory Management

# Efficient: stream the data in slabs instead of loading it all
with h5netcdf.File('large_data.nc', 'r') as f:
    temp = f.variables['temperature']

    # all_data = temp[:] would materialize the entire array in RAM

    # Walk the first axis in fixed-size steps
    step = 10
    for start in range(0, temp.shape[0], step):
        chunk = temp[start:start + step, :, :]
        # Hand each slab to the (user-supplied) processing routine
        result = process_chunk(chunk)

Install with Tessl CLI

npx tessl i tessl/pypi-h5netcdf

docs

attributes.md

dimensions.md

file-operations.md

groups.md

index.md

legacy-api.md

user-types.md

variables.md

tile.json