or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

index.md memory-caching.md parallel-processing.md persistence-serialization.md utilities-infrastructure.md

persistence-serialization.mddocs/

0

# Persistence and Serialization

1

2

Fast compressed persistence optimized for Python objects containing large NumPy arrays. Provides memory mapping support, multiple compression algorithms, and cross-platform compatibility as a replacement for pickle, specifically designed for scientific computing and machine learning workflows.

3

4

## Capabilities

5

6

### Object Persistence

7

8

High-performance serialization and deserialization of Python objects with special optimizations for NumPy arrays and scientific data structures.

9

10

```python { .api }

11

def dump(value, filename, compress=0, protocol=None):

12

"""

13

Persist arbitrary Python object to file with optional compression.

14

15

Parameters:

16

- value: any Python object to store

17

- filename: str, pathlib.Path, or file object for output

18

- compress: compression specification:

19

- False or 0: no compression

20

- True: zlib compression at level 3; int 1-9: zlib compression at the given level

21

- str: compression method ('zlib', 'gzip', 'bz2', 'lzma', 'xz', 'lz4')

22

- tuple: (method, level) for specific compression and level

23

- protocol: int, pickle protocol version (None for highest available)

24

25

Returns:

26

list of str: single-element list containing the filename when a path (str or pathlib.Path) was passed; None when a file object was passed

27

"""

28

29

def load(filename, mmap_mode=None, ensure_native_byte_order="auto"):

30

"""

31

Reconstruct Python object from file created with joblib.dump.

32

33

Parameters:

34

- filename: str, pathlib.Path, or file object to read from

35

- mmap_mode: memory mapping mode for NumPy arrays (has no effect on compressed files):

36

- None: load normally into memory

37

- 'r+': read-write memory mapping

38

- 'r': read-only memory mapping

39

- 'w+': write memory mapping

40

- 'c': copy-on-write memory mapping

41

- ensure_native_byte_order: byte order handling:

42

- "auto": automatic conversion if needed

43

- True: force native byte order conversion

44

- False: preserve original byte order

45

46

Returns:

47

Reconstructed Python object

48

"""

49

```

50

51

**Basic Usage Examples:**

52

53

```python

54

from joblib import dump, load

55

import numpy as np

56

57

# Simple object persistence

58

data = {'array': np.random.random(1000), 'metadata': {'version': 1}}

59

dump(data, 'data.pkl')

60

loaded_data = load('data.pkl')

61

62

# With compression

63

large_array = np.random.random((10000, 1000))

64

dump(large_array, 'large_data.pkl', compress=3) # zlib level 3

65

loaded_array = load('large_data.pkl')

66

67

# Different compression methods

68

dump(data, 'data_gzip.pkl', compress='gzip')

69

dump(data, 'data_bz2.pkl', compress=('bz2', 9)) # bz2 level 9

70

dump(data, 'data_lz4.pkl', compress='lz4') # Fast compression

71

72

# File objects

73

with open('output.pkl', 'wb') as f:

74

dump(data, f, compress=True)

75

76

with open('output.pkl', 'rb') as f:

77

loaded_data = load(f)

78

```

79

80

**Memory Mapping Examples:**

81

82

```python

83

import numpy as np

84

from joblib import dump, load

85

86

# Create and save large array

87

huge_array = np.random.random((50000, 1000))

88

dump(huge_array, 'huge_array.pkl')

89

90

# Memory map for efficient access without loading into RAM

91

mapped_array = load('huge_array.pkl', mmap_mode='r')

92

print(f"Array shape: {mapped_array.shape}")

93

print(f"Mean of first 1000 elements: {np.mean(mapped_array[:1000, :])}")

94

95

# Read-write memory mapping

96

mapped_rw = load('huge_array.pkl', mmap_mode='r+')

97

mapped_rw[0, 0] = 999.0 # Modifies the file directly

98

99

# Copy-on-write mapping (changes don't affect original file)

100

mapped_cow = load('huge_array.pkl', mmap_mode='c')

101

mapped_cow[0, 0] = 888.0 # Creates a copy when modified

102

```

103

104

**Advanced Persistence Patterns:**

105

106

```python

107

from joblib import dump, load

108

import numpy as np

109

from pathlib import Path

110

111

# Custom objects with __getstate__/__setstate__

112

class CustomModel:

113

def __init__(self, weights, metadata):

114

self.weights = weights

115

self.metadata = metadata

116

self._fitted = False

117

118

def fit(self, data):

119

self._fitted = True

120

return self

121

122

def __getstate__(self):

123

# Custom serialization logic

124

state = self.__dict__.copy()

125

# Remove unpicklable attributes if needed

126

return state

127

128

def __setstate__(self, state):

129

# Custom deserialization logic

130

self.__dict__.update(state)

131

132

# Serialize complex model

133

model = CustomModel(np.random.random((100, 50)), {'version': '1.0'})

134

model.fit(training_data)

135

136

dump(model, 'trained_model.pkl', compress=True)

137

loaded_model = load('trained_model.pkl')

138

139

# Batch processing with efficient I/O

140

def save_batch(data_batch, batch_id, output_dir):

141

filename = Path(output_dir) / f'batch_{batch_id:04d}.pkl'

142

dump(data_batch, filename, compress='lz4') # Fast compression

143

144

def load_batch(batch_id, output_dir):

145

filename = Path(output_dir) / f'batch_{batch_id:04d}.pkl'

146

return load(filename)

147

148

# Process large dataset in batches

149

output_dir = Path('./processed_batches')

150

output_dir.mkdir(exist_ok=True)

151

152

# Save batches

153

for i, batch in enumerate(data_batches):

154

processed_batch = process_data(batch)

155

save_batch(processed_batch, i, output_dir)

156

157

# Load specific batches as needed

158

batch_5 = load_batch(5, output_dir)

159

```

160

161

## Compression Options

162

163

### Available Compression Methods

164

165

```python

166

# No compression (fastest I/O, largest files)

167

dump(data, 'data.pkl', compress=False)

168

169

# Zlib compression (good balance; the default method when compression is enabled)

170

dump(data, 'data.pkl', compress=True) # zlib level 3

171

dump(data, 'data.pkl', compress=6) # Level 6

172

dump(data, 'data.pkl', compress='zlib') # Method name

173

174

# Gzip compression (widely compatible)

175

dump(data, 'data.pkl', compress='gzip')

176

dump(data, 'data.pkl', compress=('gzip', 9)) # Maximum compression

177

178

# Bzip2 compression (high compression ratio, slower)

179

dump(data, 'data.pkl', compress='bz2')

180

dump(data, 'data.pkl', compress=('bz2', 9))

181

182

# LZMA/XZ compression (highest compression, slowest)

183

dump(data, 'data.pkl', compress='lzma')

184

dump(data, 'data.pkl', compress='xz')

185

186

# LZ4 compression (fastest compression, lower ratio)

187

dump(data, 'data.pkl', compress='lz4') # Requires python-lz4 package

188

```

189

190

### Compression Performance Comparison

191

192

```python

193

import os

import time

194

import numpy as np

195

from joblib import dump, load

196

197

# Generate test data

198

large_data = {

199

'arrays': [np.random.random((1000, 1000)) for _ in range(5)],

200

'sparse_data': np.zeros((10000, 10000)),

201

'metadata': {'created': time.time(), 'size': 'large'}

202

}

203

204

# Test different compression methods

205

methods = [

206

(False, "No compression"),

207

(1, "Zlib level 1"),

208

(6, "Zlib level 6"),

209

('gzip', "Gzip"),

210

('bz2', "Bzip2"),

211

('lz4', "LZ4"),

212

]

213

214

for compress, description in methods:

215

start_time = time.time()

216

dump(large_data, f'test_{description.lower().replace(" ", "_")}.pkl', compress=compress)

217

dump_time = time.time() - start_time

218

219

start_time = time.time()

220

loaded_data = load(f'test_{description.lower().replace(" ", "_")}.pkl')

221

load_time = time.time() - start_time

222

223

file_size = os.path.getsize(f'test_{description.lower().replace(" ", "_")}.pkl')

224

print(f"{description}: {dump_time:.2f}s dump, {load_time:.2f}s load, {file_size/1024**2:.1f}MB")

225

```

226

227

## Memory Mapping Strategies

228

229

### Efficient Large Data Access

230

231

```python

232

from joblib import dump, load

233

import numpy as np

234

235

# Save large dataset

236

dataset = {

237

'features': np.random.random((100000, 200)),

238

'labels': np.random.randint(0, 10, 100000),

239

'metadata': {'samples': 100000, 'features': 200}

240

}

241

242

dump(dataset, 'large_dataset.pkl')

243

244

# Memory map for efficient partial access

245

mapped_data = load('large_dataset.pkl', mmap_mode='r')

246

247

# Access subset without loading entire array

248

subset_features = mapped_data['features'][1000:2000] # Only loads this slice

249

subset_labels = mapped_data['labels'][1000:2000]

250

251

# Process data in chunks to manage memory

252

def process_in_chunks(data, chunk_size=1000):

253

n_samples = data['features'].shape[0]

254

results = []

255

256

for start in range(0, n_samples, chunk_size):

257

end = min(start + chunk_size, n_samples)

258

chunk_features = data['features'][start:end]

259

chunk_labels = data['labels'][start:end]

260

261

# Process chunk

262

chunk_result = process_chunk(chunk_features, chunk_labels)

263

results.append(chunk_result)

264

265

return results

266

267

# Process without loading entire dataset into memory

268

results = process_in_chunks(mapped_data)

269

```

270

271

### Cross-Platform Compatibility

272

273

```python

274

from joblib import dump, load

275

import numpy as np

276

277

# Ensure consistent byte order across platforms

278

data = np.random.random(1000).astype(np.float64)

279

dump(data, 'cross_platform_data.pkl')

280

281

# Load with automatic byte order handling

282

loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order="auto")

283

284

# Force byte order conversion if needed

285

loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order=True)

286

287

# Preserve original byte order

288

loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order=False)

289

```

290

291

## Integration with Scientific Computing

292

293

### NumPy Array Optimizations

294

295

```python

296

import numpy as np

297

from joblib import dump, load

298

299

# Joblib automatically optimizes NumPy array storage

300

arrays = {

301

'float32_array': np.random.random(10000).astype(np.float32),

302

'int64_array': np.arange(10000, dtype=np.int64),

303

'complex_array': np.random.random(5000) + 1j * np.random.random(5000),

304

'structured_array': np.array([(i, f'item_{i}') for i in range(1000)],

305

dtype=[('id', 'i4'), ('name', 'U10')])

306

}

307

308

# Efficient storage with type preservation

309

dump(arrays, 'numpy_arrays.pkl', compress=True)

310

loaded_arrays = load('numpy_arrays.pkl')

311

312

# Verify types are preserved

313

assert loaded_arrays['float32_array'].dtype == np.float32

314

assert loaded_arrays['structured_array'].dtype.names == ('id', 'name')

315

```

316

317

### Machine Learning Model Persistence

318

319

```python

320

from joblib import dump, load

321

import numpy as np

322

323

# Example scikit-learn style model

324

class SimpleLinearRegression:

325

def __init__(self):

326

self.weights = None

327

self.bias = None

328

self.training_history = []

329

330

def fit(self, X, y):

331

# Simple linear regression fitting

332

self.weights = np.linalg.lstsq(X, y, rcond=None)[0]

333

self.bias = np.mean(y - X @ self.weights)

334

self.training_history.append({'samples': len(X), 'features': X.shape[1]})

335

return self

336

337

def predict(self, X):

338

return X @ self.weights + self.bias

339

340

# Train and save model

341

X_train = np.random.random((1000, 10))

342

y_train = X_train @ np.random.random(10) + np.random.random() * 0.1

343

344

model = SimpleLinearRegression()

345

model.fit(X_train, y_train)

346

347

# Persist trained model

348

dump(model, 'trained_model.pkl', compress=True)

349

350

# Load model for inference

351

loaded_model = load('trained_model.pkl')

352

X_test = np.random.random((100, 10))

predictions = loaded_model.predict(X_test)

353

```