0
# Persistence and Serialization
1
2
Fast, optionally compressed persistence for Python objects, optimized for objects that contain large NumPy arrays. It provides memory-mapping support, multiple compression algorithms, and cross-platform compatibility, and serves as a replacement for pickle in scientific computing and machine learning workflows.
3
4
## Capabilities
5
6
### Object Persistence
7
8
High-performance serialization and deserialization of Python objects with special optimizations for NumPy arrays and scientific data structures.
9
10
```python { .api }
11
def dump(value, filename, compress=0, protocol=None):
12
"""
13
Persist arbitrary Python object to file with optional compression.
14
15
Parameters:
16
- value: any Python object to store
17
- filename: str, pathlib.Path, or file object for output
18
- compress: compression specification:
19
- False or 0: no compression
20
- True or 1-9: zlib compression level (True is equivalent to level 3)
21
- str: compression method ('zlib', 'gzip', 'bz2', 'lzma', 'xz', 'lz4')
22
- tuple: (method, level) for specific compression and level
23
- protocol: int, pickle protocol version (None uses pickle.DEFAULT_PROTOCOL)
24
25
Returns:
26
list of str: single-element list containing the file name when a path (str or pathlib.Path) was passed; None when a file object was passed
27
"""
28
29
def load(filename, mmap_mode=None, ensure_native_byte_order="auto"):
30
"""
31
Reconstruct Python object from file created with joblib.dump.
32
33
Parameters:
34
- filename: str, pathlib.Path, or file object to read from
35
- mmap_mode: memory mapping mode for NumPy arrays (has no effect on compressed files):
36
- None: load normally into memory
37
- 'r+': read-write memory mapping
38
- 'r': read-only memory mapping
39
- 'w+': treated as 'r+' when loading (read-write memory mapping of the existing file)
40
- 'c': copy-on-write memory mapping
41
- ensure_native_byte_order: byte order handling:
42
- "auto": automatic conversion if needed
43
- True: force native byte order conversion
44
- False: preserve original byte order
45
46
Returns:
47
Reconstructed Python object
48
"""
49
```
50
51
**Basic Usage Examples:**
52
53
```python
54
from joblib import dump, load
55
import numpy as np
56
57
# Simple object persistence
58
data = {'array': np.random.random(1000), 'metadata': {'version': 1}}
59
dump(data, 'data.pkl')
60
loaded_data = load('data.pkl')
61
62
# With compression
63
large_array = np.random.random((10000, 1000))
64
dump(large_array, 'large_data.pkl', compress=3) # zlib level 3
65
loaded_array = load('large_data.pkl')
66
67
# Different compression methods
68
dump(data, 'data_gzip.pkl', compress='gzip')
69
dump(data, 'data_bz2.pkl', compress=('bz2', 9)) # bz2 level 9
70
dump(data, 'data_lz4.pkl', compress='lz4') # Fast compression
71
72
# File objects
73
with open('output.pkl', 'wb') as f:
74
dump(data, f, compress=True)
75
76
with open('output.pkl', 'rb') as f:
77
loaded_data = load(f)
78
```
79
80
**Memory Mapping Examples:**
81
82
```python
83
import numpy as np
84
from joblib import dump, load
85
86
# Create and save large array
87
huge_array = np.random.random((50000, 1000))
88
dump(huge_array, 'huge_array.pkl')
89
90
# Memory map for efficient access without loading into RAM
91
mapped_array = load('huge_array.pkl', mmap_mode='r')
92
print(f"Array shape: {mapped_array.shape}")
93
print(f"Mean of first 1000 elements: {np.mean(mapped_array[:1000, :])}")
94
95
# Read-write memory mapping
96
mapped_rw = load('huge_array.pkl', mmap_mode='r+')
97
mapped_rw[0, 0] = 999.0 # Modifies the file directly
98
99
# Copy-on-write mapping (changes don't affect original file)
100
mapped_cow = load('huge_array.pkl', mmap_mode='c')
101
mapped_cow[0, 0] = 888.0 # Creates a copy when modified
102
```
103
104
**Advanced Persistence Patterns:**
105
106
```python
107
from joblib import dump, load
108
import numpy as np
109
from pathlib import Path
110
111
# Custom objects with __getstate__/__setstate__
112
class CustomModel:
113
def __init__(self, weights, metadata):
114
self.weights = weights
115
self.metadata = metadata
116
self._fitted = False
117
118
def fit(self, data):
119
self._fitted = True
120
return self
121
122
def __getstate__(self):
123
# Custom serialization logic
124
state = self.__dict__.copy()
125
# Remove unpicklable attributes if needed
126
return state
127
128
def __setstate__(self, state):
129
# Custom deserialization logic
130
self.__dict__.update(state)
131
132
# Serialize complex model
133
model = CustomModel(np.random.random((100, 50)), {'version': '1.0'})
134
model.fit(training_data)
135
136
dump(model, 'trained_model.pkl', compress=True)
137
loaded_model = load('trained_model.pkl')
138
139
# Batch processing with efficient I/O
140
def save_batch(data_batch, batch_id, output_dir):
141
filename = Path(output_dir) / f'batch_{batch_id:04d}.pkl'
142
dump(data_batch, filename, compress='lz4') # Fast compression
143
144
def load_batch(batch_id, output_dir):
145
filename = Path(output_dir) / f'batch_{batch_id:04d}.pkl'
146
return load(filename)
147
148
# Process large dataset in batches
149
output_dir = Path('./processed_batches')
150
output_dir.mkdir(exist_ok=True)
151
152
# Save batches
153
for i, batch in enumerate(data_batches):
154
processed_batch = process_data(batch)
155
save_batch(processed_batch, i, output_dir)
156
157
# Load specific batches as needed
158
batch_5 = load_batch(5, output_dir)
159
```
160
161
## Compression Options
162
163
### Available Compression Methods
164
165
```python
166
# No compression (fastest I/O, largest files)
167
dump(data, 'data.pkl', compress=False)
168
169
# Zlib compression (good balance; the default method when only a level is given)
170
dump(data, 'data.pkl', compress=True) # Level 3 (True maps to zlib level 3)
171
dump(data, 'data.pkl', compress=6) # Level 6
172
dump(data, 'data.pkl', compress='zlib') # Method name
173
174
# Gzip compression (widely compatible)
175
dump(data, 'data.pkl', compress='gzip')
176
dump(data, 'data.pkl', compress=('gzip', 9)) # Maximum compression
177
178
# Bzip2 compression (high compression ratio, slower)
179
dump(data, 'data.pkl', compress='bz2')
180
dump(data, 'data.pkl', compress=('bz2', 9))
181
182
# LZMA/XZ compression (highest compression, slowest)
183
dump(data, 'data.pkl', compress='lzma')
184
dump(data, 'data.pkl', compress='xz')
185
186
# LZ4 compression (fastest compression, lower ratio)
187
dump(data, 'data.pkl', compress='lz4') # Requires python-lz4 package
188
```
189
190
### Compression Performance Comparison
191
192
```python
193
import os
import time
194
import numpy as np
195
from joblib import dump, load
196
197
# Generate test data
198
large_data = {
199
'arrays': [np.random.random((1000, 1000)) for _ in range(5)],
200
'sparse_data': np.zeros((10000, 10000)),
201
'metadata': {'created': time.time(), 'size': 'large'}
202
}
203
204
# Test different compression methods
205
methods = [
206
(False, "No compression"),
207
(1, "Zlib level 1"),
208
(6, "Zlib level 6"),
209
('gzip', "Gzip"),
210
('bz2', "Bzip2"),
211
('lz4', "LZ4"),
212
]
213
214
for compress, description in methods:
215
start_time = time.time()
216
dump(large_data, f'test_{description.lower().replace(" ", "_")}.pkl', compress=compress)
217
dump_time = time.time() - start_time
218
219
start_time = time.time()
220
loaded_data = load(f'test_{description.lower().replace(" ", "_")}.pkl')
221
load_time = time.time() - start_time
222
223
file_size = os.path.getsize(f'test_{description.lower().replace(" ", "_")}.pkl')
224
print(f"{description}: {dump_time:.2f}s dump, {load_time:.2f}s load, {file_size/1024**2:.1f}MB")
225
```
226
227
## Memory Mapping Strategies
228
229
### Efficient Large Data Access
230
231
```python
232
from joblib import dump, load
233
import numpy as np
234
235
# Save large dataset
236
dataset = {
237
'features': np.random.random((100000, 200)),
238
'labels': np.random.randint(0, 10, 100000),
239
'metadata': {'samples': 100000, 'features': 200}
240
}
241
242
dump(dataset, 'large_dataset.pkl')
243
244
# Memory map for efficient partial access
245
mapped_data = load('large_dataset.pkl', mmap_mode='r')
246
247
# Access subset without loading entire array
248
subset_features = mapped_data['features'][1000:2000] # Only loads this slice
249
subset_labels = mapped_data['labels'][1000:2000]
250
251
# Process data in chunks to manage memory
252
def process_in_chunks(data, chunk_size=1000):
253
n_samples = data['features'].shape[0]
254
results = []
255
256
for start in range(0, n_samples, chunk_size):
257
end = min(start + chunk_size, n_samples)
258
chunk_features = data['features'][start:end]
259
chunk_labels = data['labels'][start:end]
260
261
# Process chunk
262
chunk_result = process_chunk(chunk_features, chunk_labels)
263
results.append(chunk_result)
264
265
return results
266
267
# Process without loading entire dataset into memory
268
results = process_in_chunks(mapped_data)
269
```
270
271
### Cross-Platform Compatibility
272
273
```python
274
from joblib import dump, load
275
import numpy as np
276
277
# Ensure consistent byte order across platforms
278
data = np.random.random(1000).astype(np.float64)
279
dump(data, 'cross_platform_data.pkl')
280
281
# Load with automatic byte order handling
282
loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order="auto")
283
284
# Force byte order conversion if needed
285
loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order=True)
286
287
# Preserve original byte order
288
loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order=False)
289
```
290
291
## Integration with Scientific Computing
292
293
### NumPy Array Optimizations
294
295
```python
296
import numpy as np
297
from joblib import dump, load
298
299
# Joblib automatically optimizes NumPy array storage
300
arrays = {
301
'float32_array': np.random.random(10000).astype(np.float32),
302
'int64_array': np.arange(10000, dtype=np.int64),
303
'complex_array': np.random.random(5000) + 1j * np.random.random(5000),
304
'structured_array': np.array([(i, f'item_{i}') for i in range(1000)],
305
dtype=[('id', 'i4'), ('name', 'U10')])
306
}
307
308
# Efficient storage with type preservation
309
dump(arrays, 'numpy_arrays.pkl', compress=True)
310
loaded_arrays = load('numpy_arrays.pkl')
311
312
# Verify types are preserved
313
assert loaded_arrays['float32_array'].dtype == np.float32
314
assert loaded_arrays['structured_array'].dtype.names == ('id', 'name')
315
```
316
317
### Machine Learning Model Persistence
318
319
```python
320
from joblib import dump, load
321
import numpy as np
322
323
# Example scikit-learn style model
324
class SimpleLinearRegression:
325
def __init__(self):
326
self.weights = None
327
self.bias = None
328
self.training_history = []
329
330
def fit(self, X, y):
331
# Simple linear regression fitting
332
self.weights = np.linalg.lstsq(X, y, rcond=None)[0]
333
self.bias = np.mean(y - X @ self.weights)
334
self.training_history.append({'samples': len(X), 'features': X.shape[1]})
335
return self
336
337
def predict(self, X):
338
return X @ self.weights + self.bias
339
340
# Train and save model
341
X_train = np.random.random((1000, 10))
342
y_train = X_train @ np.random.random(10) + np.random.random() * 0.1
343
344
model = SimpleLinearRegression()
345
model.fit(X_train, y_train)
346
347
# Persist trained model
348
dump(model, 'trained_model.pkl', compress=True)
349
350
# Load model for inference
351
loaded_model = load('trained_model.pkl')
352
predictions = loaded_model.predict(X_test)
353
```