# Codecs

Codec classes for compressing, transforming, and encoding array data. Codecs are applied to each chunk of a Zarr array as an ordered pipeline: array-to-array codecs (such as `TransposeCodec`) come first, followed by exactly one array-to-bytes codec (such as `BytesCodec`), followed by any bytes-to-bytes codecs (compression and checksums).

## Capabilities

### Compression Codecs

```python { .api }
class BloscCodec:
    """Blosc compression codec with multiple algorithms.

    Args:
        cname: Compression algorithm, given as a ``BloscCname`` member or
            its string value (for example ``'zstd'`` or ``'lz4'``).
        clevel: Compression level.
        shuffle: Shuffle filter, given as a ``BloscShuffle`` member or the
            equivalent lowercase string (for example ``'bitshuffle'``).
        typesize: Size in bytes of one array element (for example 8 for
            8-byte floats). ``None`` leaves it unspecified.
        blocksize: Blosc block size in bytes (for example ``2**16`` for
            64 KB blocks).
            NOTE(review): the default 0 presumably lets Blosc choose the
            block size - confirm.
        **kwargs: Extra keyword arguments accepted by the constructor.
    """

    def __init__(
        self,
        cname: Union[BloscCname, str] = 'zstd',
        clevel: int = 5,
        shuffle: Union[BloscShuffle, str] = BloscShuffle.SHUFFLE,
        typesize: Union[int, None] = None,
        blocksize: int = 0,
        **kwargs
    ): ...

    @property
    def cname(self) -> BloscCname: ...
    @property
    def clevel(self) -> int: ...
    @property
    def shuffle(self) -> BloscShuffle: ...
```

```python { .api }
class BloscCname(Enum):
    """Blosc compression algorithms.

    Each member's value is the lowercase algorithm name, so a plain string
    such as ``'zstd'`` maps to a member via ``BloscCname('zstd')``.
    """

    LZ4 = "lz4"
    LZ4HC = "lz4hc"
    ZLIB = "zlib"
    ZSTD = "zstd"
    BLOSCLZ = "blosclz"
    SNAPPY = "snappy"
```

```python { .api }
class BloscShuffle(Enum):
    """Blosc shuffle options.

    Member values are the lowercase strings that callers pass as
    ``BloscCodec(shuffle=...)``, so ``BloscShuffle('shuffle')`` resolves to
    ``BloscShuffle.SHUFFLE``.
    """

    NOSHUFFLE = "noshuffle"    # no shuffle filter
    SHUFFLE = "shuffle"        # byte-wise shuffle
    BITSHUFFLE = "bitshuffle"  # bit-wise shuffle
```

```python { .api }
class GzipCodec:
    """Gzip compression codec.

    Args:
        level: Gzip compression level.
        **kwargs: Extra keyword arguments accepted by the constructor.
    """

    # NOTE(review): confirm this default level against the installed
    # zarr-python release.
    def __init__(self, level: int = 6, **kwargs): ...

    @property
    def level(self) -> int: ...
```

```python { .api }
class ZstdCodec:
    """Zstandard compression codec.

    Args:
        level: Zstandard compression level.
        checksum: Whether a checksum is stored with the compressed data.
        **kwargs: Extra keyword arguments accepted by the constructor.
    """

    # NOTE(review): confirm the default level against the installed
    # zarr-python release.
    def __init__(
        self,
        level: int = 3,
        checksum: bool = False,
        **kwargs
    ): ...

    @property
    def level(self) -> int: ...
    @property
    def checksum(self) -> bool: ...
```

### Array Processing Codecs

```python { .api }
class BytesCodec:
    """Array to bytes conversion codec.

    Args:
        endian: Byte order of the encoded data, given as an ``Endian``
            member or its string value (for example ``'little'``).
        **kwargs: Extra keyword arguments accepted by the constructor.
    """

    def __init__(self, endian: Union[Endian, str] = Endian.LITTLE, **kwargs): ...

    @property
    def endian(self) -> Endian: ...
```

```python { .api }
class Endian(Enum):
    """Byte order options.

    Each member's value is its lowercase name, so ``Endian('little')``
    resolves to ``Endian.LITTLE``.
    """

    BIG = "big"
    LITTLE = "little"
    NATIVE = "native"
```

```python { .api }
class TransposeCodec:
    """Array transposition codec for dimension reordering.

    Args:
        order: New ordering of the array's dimensions, one index per
            dimension (for example ``(1, 0)`` swaps the two dimensions of
            a 2-D array).
        **kwargs: Extra keyword arguments accepted by the constructor.
    """

    def __init__(self, order: tuple[int, ...], **kwargs): ...

    @property
    def order(self) -> tuple[int, ...]: ...
```

### Advanced Codecs

```python { .api }
class ShardingCodec:
    """Sharding codec that stores many small inner chunks inside one shard.

    Each chunk of the array's chunk grid is written as a single shard, and
    that shard is subdivided into inner chunks of ``chunk_shape``.

    Args:
        chunk_shape: Shape of the inner chunks within each shard.
        codecs: Codec pipeline applied to every inner chunk.
        index_codecs: Codec pipeline applied to the shard index.
            NOTE(review): ``None`` presumably selects a default index
            pipeline - confirm.
        index_location: Where the shard index is stored within the shard,
            given as a ``ShardingCodecIndexLocation`` member or its string
            value (``'start'`` or ``'end'``).
        **kwargs: Extra keyword arguments accepted by the constructor.
    """

    def __init__(
        self,
        chunk_shape: tuple[int, ...],
        codecs: list[Codec],
        index_codecs: Union[list[Codec], None] = None,
        index_location: Union[ShardingCodecIndexLocation, str] = ShardingCodecIndexLocation.END,
        **kwargs
    ): ...

    @property
    def chunk_shape(self) -> tuple[int, ...]: ...
    @property
    def codecs(self) -> list[Codec]: ...
    @property
    def index_codecs(self) -> list[Codec]: ...
    @property
    def index_location(self) -> ShardingCodecIndexLocation: ...
```

```python { .api }
class ShardingCodecIndexLocation(Enum):
    """Shard index storage location."""

    START = "start"  # index stored at the beginning of the shard
    END = "end"      # index stored at the end of the shard
```

### String and Variable-Length Data Codecs

```python { .api }
class VLenUTF8Codec:
    """Variable-length UTF-8 string codec.

    Takes no required arguments; ``**kwargs`` are extra keyword arguments
    accepted by the constructor.
    """

    def __init__(self, **kwargs): ...
```

```python { .api }
class VLenBytesCodec:
    """Variable-length bytes codec.

    Takes no required arguments; ``**kwargs`` are extra keyword arguments
    accepted by the constructor.
    """

    def __init__(self, **kwargs): ...
```

### Checksum and Integrity Codecs

```python { .api }
class Crc32cCodec:
    """CRC32C checksum codec for data integrity.

    Takes no required arguments; ``**kwargs`` are extra keyword arguments
    accepted by the constructor.
    """

    def __init__(self, **kwargs): ...
```

## Type Definitions

```python { .api }
# Any one of the codec classes.
Codec = Union[
    BloscCodec, GzipCodec, ZstdCodec, BytesCodec,
    TransposeCodec, ShardingCodec, VLenUTF8Codec,
    VLenBytesCodec, Crc32cCodec
]

# A compressor given as a string, a dict, a codec instance, or None.
CompressorLike = Union[str, dict, Codec, None]
# A list of codec instances, or None.
FiltersLike = Union[list[Codec], None]
```

## Usage Examples

### Basic Compression

```python
import zarr
from zarr.codecs import BloscCodec, BytesCodec, GzipCodec, ZstdCodec

# Compressors are bytes-to-bytes codecs, so each pipeline below starts with
# BytesCodec to turn the chunk into bytes before it is compressed. The
# pipeline is passed through `codecs=`; the `compressor=` argument of
# zarr.create applies to Zarr format 2 arrays.

# Create array with Blosc compression
blosc_codec = BloscCodec(cname='zstd', clevel=3, shuffle='shuffle')
arr = zarr.create(
    shape=(1000, 1000),
    chunks=(100, 100),
    codecs=[BytesCodec(), blosc_codec]
)

# Use Gzip compression
gzip_codec = GzipCodec(level=6)
arr = zarr.create(shape=(500, 500), codecs=[BytesCodec(), gzip_codec])

# Use Zstandard compression
zstd_codec = ZstdCodec(level=5, checksum=True)
arr = zarr.create(shape=(800, 800), codecs=[BytesCodec(), zstd_codec])
```

### Codec Pipelines

```python
import zarr
from zarr.codecs import BloscCodec, BytesCodec, TransposeCodec

# Create multi-stage codec pipeline.
# Order matters: array-to-array codecs first, then the single
# array-to-bytes codec, then bytes-to-bytes codecs.
codecs = [
    TransposeCodec(order=(1, 0)),      # Transpose dimensions (array -> array)
    BytesCodec(endian='little'),       # Convert to bytes (array -> bytes)
    BloscCodec(cname='lz4', clevel=1)  # Compress (bytes -> bytes)
]

arr = zarr.create(
    shape=(1000, 2000),
    chunks=(100, 200),
    codecs=codecs
)
```

### Sharding for Small Chunks

```python
import zarr
from zarr.codecs import ShardingCodec, BloscCodec, BytesCodec

# Use sharding to group small chunks efficiently
shard_codec = ShardingCodec(
    chunk_shape=(10, 10),  # Inner chunk shape within each shard
    codecs=[
        BytesCodec(),                       # array -> bytes comes first
        BloscCodec(cname='zstd', clevel=3)  # then compress the bytes
    ],
    index_location='end'
)

arr = zarr.create(
    shape=(10000, 10000),
    chunks=(100, 100),  # Shard shape: each shard holds 10 x 10 inner chunks
    codecs=[shard_codec]
)
240
```
241
242
### String Data Handling
243
244
```python
245
import zarr
from zarr.codecs import VLenUTF8Codec

# Array of variable-length strings
string_codec = VLenUTF8Codec()
str_arr = zarr.create(
    shape=(1000,),
    dtype=str,  # Variable-length unicode ('<U' is NumPy's fixed-width unicode dtype)
    codecs=[string_codec]
)

str_arr[0] = "Hello, world!"
str_arr[1] = "Variable length strings work great with zarr"
```

### Data Integrity with Checksums

```python
import zarr
from zarr.codecs import Crc32cCodec, BloscCodec, BytesCodec

# Add checksum for data integrity
codecs = [
    BytesCodec(),                        # Convert to bytes first
    BloscCodec(cname='zstd', clevel=3),  # Compress the bytes
    Crc32cCodec()                        # Add CRC32C checksum of the compressed bytes
]

arr = zarr.create(
    shape=(5000, 5000),
    chunks=(500, 500),
    codecs=codecs
)
```

### Performance Optimization Examples

```python
from zarr.codecs import BloscCodec, BytesCodec, TransposeCodec

# In every pipeline the array-to-bytes codec (BytesCodec) precedes the
# bytes-to-bytes compressor.

# Fast compression for temporary data
fast_codecs = [
    BytesCodec(),
    BloscCodec(cname='lz4', clevel=1, shuffle='noshuffle')
]

# Maximum compression for archival
archive_codecs = [
    BytesCodec(),
    BloscCodec(cname='zstd', clevel=9, shuffle='bitshuffle')
]

# Optimize for numerical data patterns
numeric_codecs = [
    TransposeCodec(order=(2, 1, 0)),  # Reorder for better compression
    BytesCodec(),
    BloscCodec(cname='zstd', clevel=3, shuffle='shuffle')
]
```

### Custom Codec Configuration

```python
from zarr.codecs import BloscCodec, BloscCname, BloscShuffle

# Fine-tune Blosc parameters
codec = BloscCodec(
    cname=BloscCname.ZSTD,
    clevel=7,                         # Higher compression
    shuffle=BloscShuffle.BITSHUFFLE,  # Better for numerical data
    blocksize=2**16                   # 64KB blocks
)

# Configure for specific data types
float_codec = BloscCodec(
    cname='zstd',
    shuffle='shuffle',  # Good for floating point
    typesize=8          # 8-byte floats
)

int_codec = BloscCodec(
    cname='lz4hc',
    shuffle='bitshuffle',  # Excellent for integers
    typesize=4             # 4-byte integers
)
```