Tessl Tile for pypi/joblib@1.5.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

index.md memory-caching.md parallel-processing.md persistence-serialization.md utilities-infrastructure.md

persistence-serialization.mddocs/

0
# Persistence and Serialization
1

2
Fast compressed persistence optimized for Python objects containing large NumPy arrays. Provides memory mapping support, multiple compression algorithms, and cross-platform compatibility as a replacement for pickle, specifically designed for scientific computing and machine learning workflows.
3

4
## Capabilities
5

6
### Object Persistence
7

8
High-performance serialization and deserialization of Python objects with special optimizations for NumPy arrays and scientific data structures.
9

10
```python { .api }
11
def dump(value, filename, compress=0, protocol=None):
12
    """
13
    Persist arbitrary Python object to file with optional compression.
14

15
    Parameters:
16
    - value: any Python object to store
17
    - filename: str, pathlib.Path, or file object for output
18
    - compress: compression specification:
19
        - False or 0: no compression
20
        - True or 1-9: zlib compression; True uses level 3, an int selects that level
21
        - str: compression method ('zlib', 'gzip', 'bz2', 'lzma', 'xz', 'lz4')
22
        - tuple: (method, level) for specific compression and level
23
    - protocol: int, pickle protocol version (None for pickle.DEFAULT_PROTOCOL)
24

25
    Returns:
26
    list of str: [filename] if a path (str or pathlib.Path) was passed, None if a file object was passed
27
    """
28

29
def load(filename, mmap_mode=None, ensure_native_byte_order="auto"):
30
    """
31
    Reconstruct Python object from file created with joblib.dump.
32

33
    Parameters:
34
    - filename: str, pathlib.Path, or file object to read from
35
    - mmap_mode: memory mapping mode for NumPy arrays:
36
        - None: load normally into memory
37
        - 'r+': read-write memory mapping
38
        - 'r': read-only memory mapping
39
        - 'w+': treated as 'r+' when loading (the existing file is not overwritten)
40
        - 'c': copy-on-write memory mapping
41
    - ensure_native_byte_order: byte order handling:
42
        - "auto": same as True when mmap_mode is None, otherwise False
43
        - True: force native byte order conversion
44
        - False: preserve original byte order
45

46
    Returns:
47
    Reconstructed Python object
48
    """
49
```
50

51
**Basic Usage Examples:**
52

53
```python
54
from joblib import dump, load
55
import numpy as np
56

57
# Simple object persistence
58
data = {'array': np.random.random(1000), 'metadata': {'version': 1}}
59
dump(data, 'data.pkl')
60
loaded_data = load('data.pkl')
61

62
# With compression
63
large_array = np.random.random((10000, 1000))
64
dump(large_array, 'large_data.pkl', compress=3)  # zlib level 3
65
loaded_array = load('large_data.pkl')
66

67
# Different compression methods
68
dump(data, 'data_gzip.pkl', compress='gzip')
69
dump(data, 'data_bz2.pkl', compress=('bz2', 9))  # bz2 level 9
70
dump(data, 'data_lz4.pkl', compress='lz4')       # Fast compression
71

72
# File objects
73
with open('output.pkl', 'wb') as f:
74
    dump(data, f, compress=True)
75

76
with open('output.pkl', 'rb') as f:
77
    loaded_data = load(f)
78
```
79

80
**Memory Mapping Examples:**
81

82
```python
83
import numpy as np
84
from joblib import dump, load
85

86
# Create and save large array
87
huge_array = np.random.random((50000, 1000))
88
dump(huge_array, 'huge_array.pkl')
89

90
# Memory map for efficient access without loading into RAM
91
mapped_array = load('huge_array.pkl', mmap_mode='r')
92
print(f"Array shape: {mapped_array.shape}")
93
print(f"Mean of first 1000 elements: {np.mean(mapped_array[:1000, :])}")
94

95
# Read-write memory mapping
96
mapped_rw = load('huge_array.pkl', mmap_mode='r+')
97
mapped_rw[0, 0] = 999.0  # Modifies the file directly
98

99
# Copy-on-write mapping (changes don't affect original file)
100
mapped_cow = load('huge_array.pkl', mmap_mode='c')
101
mapped_cow[0, 0] = 888.0  # Creates a copy when modified
102
```
103

104
**Advanced Persistence Patterns:**
105

106
```python
107
from joblib import dump, load
108
import numpy as np
109
from pathlib import Path
110

111
# Custom objects with __getstate__/__setstate__
112
class CustomModel:
113
    def __init__(self, weights, metadata):
114
        self.weights = weights
115
        self.metadata = metadata
116
        self._fitted = False
117
    
118
    def fit(self, data):
119
        self._fitted = True
120
        return self
121
    
122
    def __getstate__(self):
123
        # Custom serialization logic
124
        state = self.__dict__.copy()
125
        # Remove unpicklable attributes if needed
126
        return state
127
    
128
    def __setstate__(self, state):
129
        # Custom deserialization logic
130
        self.__dict__.update(state)
131

132
# Serialize complex model
133
model = CustomModel(np.random.random((100, 50)), {'version': '1.0'})
134
model.fit(training_data)
135

136
dump(model, 'trained_model.pkl', compress=True)
137
loaded_model = load('trained_model.pkl')
138

139
# Batch processing with efficient I/O
140
def save_batch(data_batch, batch_id, output_dir):
141
    filename = Path(output_dir) / f'batch_{batch_id:04d}.pkl'
142
    dump(data_batch, filename, compress='lz4')  # Fast compression
143

144
def load_batch(batch_id, output_dir):
145
    filename = Path(output_dir) / f'batch_{batch_id:04d}.pkl'
146
    return load(filename)
147

148
# Process large dataset in batches
149
output_dir = Path('./processed_batches')
150
output_dir.mkdir(exist_ok=True)
151

152
# Save batches
153
for i, batch in enumerate(data_batches):
154
    processed_batch = process_data(batch)
155
    save_batch(processed_batch, i, output_dir)
156

157
# Load specific batches as needed
158
batch_5 = load_batch(5, output_dir)
159
```
160

161
## Compression Options
162

163
### Available Compression Methods
164

165
```python
166
# No compression (fastest I/O, largest files)
167
dump(data, 'data.pkl', compress=False)
168

169
# Zlib compression (good balance; the method used when compress is True or an int)
170
dump(data, 'data.pkl', compress=True)   # Level 3
171
dump(data, 'data.pkl', compress=6)      # Level 6
172
dump(data, 'data.pkl', compress='zlib') # Method name
173

174
# Gzip compression (widely compatible)
175
dump(data, 'data.pkl', compress='gzip')
176
dump(data, 'data.pkl', compress=('gzip', 9))  # Maximum compression
177

178
# Bzip2 compression (high compression ratio, slower)
179
dump(data, 'data.pkl', compress='bz2')
180
dump(data, 'data.pkl', compress=('bz2', 9))
181

182
# LZMA/XZ compression (highest compression, slowest)
183
dump(data, 'data.pkl', compress='lzma')
184
dump(data, 'data.pkl', compress='xz')
185

186
# LZ4 compression (fastest compression, lower ratio)
187
dump(data, 'data.pkl', compress='lz4')  # Requires python-lz4 package
188
```
189

190
### Compression Performance Comparison
191

192
```python
193
import os
import time
194
import numpy as np
195
from joblib import dump, load
196

197
# Generate test data
198
large_data = {
199
    'arrays': [np.random.random((1000, 1000)) for _ in range(5)],
200
    'sparse_data': np.zeros((10000, 10000)),
201
    'metadata': {'created': time.time(), 'size': 'large'}
202
}
203

204
# Test different compression methods
205
methods = [
206
    (False, "No compression"),
207
    (1, "Zlib level 1"),
208
    (6, "Zlib level 6"),
209
    ('gzip', "Gzip"),
210
    ('bz2', "Bzip2"),
211
    ('lz4', "LZ4"),
212
]
213

214
for compress, description in methods:
215
    start_time = time.time()
216
    dump(large_data, f'test_{description.lower().replace(" ", "_")}.pkl', compress=compress)
217
    dump_time = time.time() - start_time
218
    
219
    start_time = time.time()
220
    loaded_data = load(f'test_{description.lower().replace(" ", "_")}.pkl')
221
    load_time = time.time() - start_time
222
    
223
    file_size = os.path.getsize(f'test_{description.lower().replace(" ", "_")}.pkl')
224
    print(f"{description}: {dump_time:.2f}s dump, {load_time:.2f}s load, {file_size/1024**2:.1f}MB")
225
```
226

227
## Memory Mapping Strategies
228

229
### Efficient Large Data Access
230

231
```python
232
from joblib import dump, load
233
import numpy as np
234

235
# Save large dataset
236
dataset = {
237
    'features': np.random.random((100000, 200)),
238
    'labels': np.random.randint(0, 10, 100000),
239
    'metadata': {'samples': 100000, 'features': 200}
240
}
241

242
dump(dataset, 'large_dataset.pkl')
243

244
# Memory map for efficient partial access
245
mapped_data = load('large_dataset.pkl', mmap_mode='r')
246

247
# Access subset without loading entire array
248
subset_features = mapped_data['features'][1000:2000]  # Only loads this slice
249
subset_labels = mapped_data['labels'][1000:2000]
250

251
# Process data in chunks to manage memory
252
def process_in_chunks(data, chunk_size=1000):
253
    n_samples = data['features'].shape[0]
254
    results = []
255
    
256
    for start in range(0, n_samples, chunk_size):
257
        end = min(start + chunk_size, n_samples)
258
        chunk_features = data['features'][start:end]
259
        chunk_labels = data['labels'][start:end]
260
        
261
        # Process chunk
262
        chunk_result = process_chunk(chunk_features, chunk_labels)
263
        results.append(chunk_result)
264
    
265
    return results
266

267
# Process without loading entire dataset into memory
268
results = process_in_chunks(mapped_data)
269
```
270

271
### Cross-Platform Compatibility
272

273
```python
274
from joblib import dump, load
275
import numpy as np
276

277
# Ensure consistent byte order across platforms
278
data = np.random.random(1000).astype(np.float64)
279
dump(data, 'cross_platform_data.pkl')
280

281
# Load with automatic byte order handling
282
loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order="auto")
283

284
# Force byte order conversion if needed
285
loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order=True)
286

287
# Preserve original byte order
288
loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order=False)
289
```
290

291
## Integration with Scientific Computing
292

293
### NumPy Array Optimizations
294

295
```python
296
import numpy as np
297
from joblib import dump, load
298

299
# Joblib automatically optimizes NumPy array storage
300
arrays = {
301
    'float32_array': np.random.random(10000).astype(np.float32),
302
    'int64_array': np.arange(10000, dtype=np.int64),
303
    'complex_array': np.random.random(5000) + 1j * np.random.random(5000),
304
    'structured_array': np.array([(i, f'item_{i}') for i in range(1000)], 
305
                                dtype=[('id', 'i4'), ('name', 'U10')])
306
}
307

308
# Efficient storage with type preservation
309
dump(arrays, 'numpy_arrays.pkl', compress=True)
310
loaded_arrays = load('numpy_arrays.pkl')
311

312
# Verify types are preserved
313
assert loaded_arrays['float32_array'].dtype == np.float32
314
assert loaded_arrays['structured_array'].dtype.names == ('id', 'name')
315
```
316

317
### Machine Learning Model Persistence
318

319
```python
320
from joblib import dump, load
321
import numpy as np
322

323
# Example scikit-learn style model
324
class SimpleLinearRegression:
325
    def __init__(self):
326
        self.weights = None
327
        self.bias = None
328
        self.training_history = []
329
    
330
    def fit(self, X, y):
331
        # Simple linear regression fitting
332
        self.weights = np.linalg.lstsq(X, y, rcond=None)[0]
333
        self.bias = np.mean(y - X @ self.weights)
334
        self.training_history.append({'samples': len(X), 'features': X.shape[1]})
335
        return self
336
    
337
    def predict(self, X):
338
        return X @ self.weights + self.bias
339

340
# Train and save model
341
X_train = np.random.random((1000, 10))
342
y_train = X_train @ np.random.random(10) + np.random.random() * 0.1
343

344
model = SimpleLinearRegression()
345
model.fit(X_train, y_train)
346

347
# Persist trained model
348
dump(model, 'trained_model.pkl', compress=True)
349

350
# Load model for inference
351
loaded_model = load('trained_model.pkl')
352
predictions = loaded_model.predict(X_test)
353
```

Version

Tile

Files

persistence-serialization.md docs/

persistence-serialization.mddocs/