An implementation of chunked, compressed, N-dimensional arrays for Python
Codec classes for data compression, transformation, and encoding. These enable efficient storage through various compression algorithms and data transformations that can be applied to zarr arrays.
# API reference stubs: signatures only, every body is `...`.
# Enums are declared before the codec classes because their members are used
# as default argument values, which Python evaluates at definition time.
# NOTE(review): enum member names/values below follow this reference listing;
# verify them against the installed zarr version.


class BloscCname(Enum):
    """Blosc compression algorithms."""

    LZ4 = "lz4"
    LZ4HC = "lz4hc"
    ZLIB = "zlib"
    ZSTD = "zstd"
    BLOSCLZ = "blosclz"
    SNAPPY = "snappy"


class BloscShuffle(Enum):
    """Blosc shuffle options."""

    NOSHUFFLE = 0
    SHUFFLE = 1
    BITSHUFFLE = 2


class BloscCodec:
    """Blosc compression codec with multiple algorithms.

    Args:
        cname: Compression algorithm, given as a ``BloscCname`` member or
            its string value (e.g. ``'zstd'``).
        clevel: Compression level; higher values request more compression.
        shuffle: Shuffle option, given as a ``BloscShuffle`` member or a
            string such as ``'shuffle'``.
        typesize: Element size in bytes (e.g. ``8`` for 8-byte floats).
            Defaults to ``None``; presumably inferred in that case (confirm).
        blocksize: Block size in bytes (e.g. ``2**16`` for 64KB blocks).
            The meaning of the default ``0`` is not shown here (confirm).
        **kwargs: Additional keyword arguments.
    """

    def __init__(
        self,
        cname: Union[BloscCname, str] = 'zstd',
        clevel: int = 5,
        shuffle: Union[BloscShuffle, str] = BloscShuffle.SHUFFLE,
        typesize: Union[int, None] = None,
        blocksize: int = 0,
        **kwargs
    ): ...

    @property
    def cname(self) -> BloscCname: ...

    @property
    def clevel(self) -> int: ...

    @property
    def shuffle(self) -> BloscShuffle: ...


class GzipCodec:
    """Gzip compression codec.

    Args:
        level: Compression level.
        **kwargs: Additional keyword arguments.
    """

    def __init__(self, level: int = 6, **kwargs): ...

    @property
    def level(self) -> int: ...


class ZstdCodec:
    """Zstandard compression codec.

    Args:
        level: Compression level.
        checksum: Whether to enable the checksum option.
        **kwargs: Additional keyword arguments.
    """

    def __init__(
        self,
        level: int = 3,
        checksum: bool = False,
        **kwargs
    ): ...

    @property
    def level(self) -> int: ...

    @property
    def checksum(self) -> bool: ...


class Endian(Enum):
    """Byte order options."""

    BIG = "big"
    LITTLE = "little"
    NATIVE = "native"


class BytesCodec:
    """Array to bytes conversion codec.

    Args:
        endian: Byte order, given as an ``Endian`` member or its string
            value (e.g. ``'little'``).
        **kwargs: Additional keyword arguments.
    """

    def __init__(self, endian: Union[Endian, str] = Endian.LITTLE, **kwargs): ...

    @property
    def endian(self) -> Endian: ...


class TransposeCodec:
    """Array transposition codec for dimension reordering.

    Args:
        order: Permutation of the dimension indices, e.g. ``(1, 0)``.
        **kwargs: Additional keyword arguments.
    """

    def __init__(self, order: tuple[int, ...], **kwargs): ...

    @property
    def order(self) -> tuple[int, ...]: ...


class ShardingCodecIndexLocation(Enum):
    """Shard index storage location."""

    START = "start"
    END = "end"


class ShardingCodec:
    """Sharding codec that packs multiple inner chunks into one shard.

    Each chunk of the array is stored as a shard, and ``chunk_shape`` is the
    shape of the inner chunks held inside that shard.

    Args:
        chunk_shape: Shape of the inner chunks within each shard.
        codecs: Codecs applied to the inner chunks.
        index_codecs: Codecs applied to the shard index; ``None`` by default.
        index_location: Where the shard index is stored, given as a
            ``ShardingCodecIndexLocation`` member or its string value.
        **kwargs: Additional keyword arguments.
    """

    # ``Codec`` is defined after this class (it includes ShardingCodec), so
    # it is written as a forward reference in the annotations below.
    def __init__(
        self,
        chunk_shape: tuple[int, ...],
        codecs: list["Codec"],
        index_codecs: Union[list["Codec"], None] = None,
        index_location: Union[ShardingCodecIndexLocation, str] = ShardingCodecIndexLocation.END,
        **kwargs
    ): ...

    @property
    def chunk_shape(self) -> tuple[int, ...]: ...

    @property
    def codecs(self) -> list["Codec"]: ...

    @property
    def index_codecs(self) -> list["Codec"]: ...

    @property
    def index_location(self) -> ShardingCodecIndexLocation: ...


class VLenUTF8Codec:
    """Variable-length UTF-8 string codec."""

    def __init__(self, **kwargs): ...


class VLenBytesCodec:
    """Variable-length bytes codec."""

    def __init__(self, **kwargs): ...


class Crc32cCodec:
    """CRC32C checksum codec for data integrity."""

    def __init__(self, **kwargs): ...


# Any of the codec classes listed above.
Codec = Union[
    BloscCodec, GzipCodec, ZstdCodec, BytesCodec,
    TransposeCodec, ShardingCodec, VLenUTF8Codec,
    VLenBytesCodec, Crc32cCodec
]

# Values accepted where a compressor is expected.
CompressorLike = Union[str, dict, Codec, None]
# Values accepted where a list of filters is expected.
FiltersLike = Union[list[Codec], None]

import zarr
from zarr.codecs import BloscCodec, GzipCodec, ZstdCodec

# NOTE(review): depending on the zarr version and array format, codecs may
# need to be passed via `codecs=` rather than `compressor=` -- confirm against
# the installed zarr release.

# Create array with Blosc compression
blosc_codec = BloscCodec(cname='zstd', clevel=3, shuffle='shuffle')
arr = zarr.create(
    shape=(1000, 1000),
    chunks=(100, 100),
    compressor=blosc_codec,
)

# Use Gzip compression
gzip_codec = GzipCodec(level=6)
arr = zarr.create(shape=(500, 500), compressor=gzip_codec)

# Use Zstandard compression
zstd_codec = ZstdCodec(level=5, checksum=True)
arr = zarr.create(shape=(800, 800), compressor=zstd_codec)

# Imports for the codec-pipeline example that follows
from zarr.codecs import BloscCodec, BytesCodec, TransposeCodec
# Create multi-stage codec pipeline.
# Order matters: array-to-array codecs come first, then exactly one
# array-to-bytes codec, then any bytes-to-bytes codecs.
codecs = [
    TransposeCodec(order=(1, 0)),       # Transpose dimensions (array -> array)
    BytesCodec(endian='little'),        # Convert to bytes (array -> bytes)
    BloscCodec(cname='lz4', clevel=1),  # Compress (bytes -> bytes)
]
arr = zarr.create(
    shape=(1000, 2000),
    chunks=(100, 200),
    codecs=codecs,
)

# Imports for the sharding example that follows
from zarr.codecs import ShardingCodec, BloscCodec, BytesCodec
# Use sharding to group small chunks efficiently
shard_codec = ShardingCodec(
    chunk_shape=(10, 10),  # Inner chunk shape within each shard
    codecs=[
        # The array-to-bytes codec must precede bytes-to-bytes compression.
        BytesCodec(),
        BloscCodec(cname='zstd', clevel=3),
    ],
    index_location='end',
)
arr = zarr.create(
    shape=(10000, 10000),
    chunks=(100, 100),  # Shard (outer chunk) shape
    codecs=[shard_codec],
)

# Import for the variable-length string example that follows
from zarr.codecs import VLenUTF8Codec
# Array of variable-length strings
string_codec = VLenUTF8Codec()
str_arr = zarr.create(
    shape=(1000,),
    # NOTE(review): '<U' is NumPy's fixed-width unicode kind; confirm the
    # dtype spelling zarr expects for variable-length strings.
    dtype='<U',  # Variable-length unicode
    codecs=[string_codec],
)
str_arr[0] = "Hello, world!"
str_arr[1] = "Variable length strings work great with zarr"

# Imports for the checksum example that follows
from zarr.codecs import Crc32cCodec, BloscCodec, BytesCodec
# Add checksum for data integrity.
# The array-to-bytes codec (BytesCodec) must come before the bytes-to-bytes
# codecs (compression, checksum).
codecs = [
    BytesCodec(),
    BloscCodec(cname='zstd', clevel=3),
    Crc32cCodec(),  # Add CRC32C checksum
]
arr = zarr.create(
    shape=(5000, 5000),
    chunks=(500, 500),
    codecs=codecs,
)

# Fast compression for temporary data
# In every list below, BytesCodec (array -> bytes) precedes BloscCodec
# (bytes -> bytes), as the codec pipeline order requires.
fast_codecs = [
    BytesCodec(),
    BloscCodec(cname='lz4', clevel=1, shuffle='noshuffle'),
]

# Maximum compression for archival
archive_codecs = [
    BytesCodec(),
    BloscCodec(cname='zstd', clevel=9, shuffle='bitshuffle'),
]

# Optimize for numerical data patterns
numeric_codecs = [
    TransposeCodec(order=(2, 1, 0)),  # Reorder for better compression
    BytesCodec(),
    BloscCodec(cname='zstd', clevel=3, shuffle='shuffle'),
]

# Imports for the Blosc fine-tuning example that follows
from zarr.codecs import BloscCodec, BloscCname, BloscShuffle
# Fine-tune Blosc parameters
codec = BloscCodec(
cname=BloscCname.ZSTD,
clevel=7, # Higher compression
shuffle=BloscShuffle.BITSHUFFLE, # Better for numerical data
blocksize=2**16 # 64KB blocks
)
# Configure for specific data types
float_codec = BloscCodec(
cname='zstd',
shuffle='shuffle', # Good for floating point
typesize=8 # 8-byte floats
)
int_codec = BloscCodec(
cname='lz4hc',
shuffle='bitshuffle', # Excellent for integers
typesize=4 # 4-byte integers
)Install with Tessl CLI
npx tessl i tessl/pypi-zarr