"""Lightweight pipelining with Python functions for disk-caching, parallel
computing, and fast compressed persistence.

This module covers fast compressed persistence optimized for Python objects
containing large NumPy arrays. It provides memory-mapping support, multiple
compression algorithms, and cross-platform compatibility as a replacement for
pickle, specifically designed for scientific computing and machine-learning
workflows: high-performance serialization and deserialization of Python
objects with special optimizations for NumPy arrays and scientific data
structures.
"""
def dump(value, filename, compress=0, protocol=None):
    """Persist an arbitrary Python object to file with optional compression.

    Parameters
    ----------
    value : any
        Python object to store.
    filename : str, pathlib.Path, or file object
        Destination for the serialized data.
    compress : bool, int, str, or tuple, default=0
        Compression specification:
        - False or 0: no compression
        - True or 1-9: zlib compression at the given level
        - str: compression method ('zlib', 'gzip', 'bz2', 'lzma', 'xz', 'lz4')
        - tuple: (method, level) for a specific method and level
    protocol : int, optional
        Pickle protocol version (None selects the highest available).

    Returns
    -------
    str or None
        The filename if a string was passed, None otherwise.
    """
def load(filename, mmap_mode=None, ensure_native_byte_order="auto"):
    """Reconstruct a Python object from a file created with joblib.dump.

    Parameters
    ----------
    filename : str, pathlib.Path, or file object
        File to read from.
    mmap_mode : {None, 'r+', 'r', 'w+', 'c'}, optional
        Memory-mapping mode for NumPy arrays:
        - None: load arrays fully into memory
        - 'r+': read-write memory mapping
        - 'r': read-only memory mapping
        - 'w+': write memory mapping
        - 'c': copy-on-write memory mapping
    ensure_native_byte_order : {"auto", True, False}, default="auto"
        Byte-order handling:
        - "auto": automatic conversion if needed
        - True: force native byte order conversion
        - False: preserve the original byte order

    Returns
    -------
    object
        The reconstructed Python object.
    """


# Basic Usage Examples:
# Basic usage: simple persistence, compression levels, and file objects.
from joblib import dump, load
import numpy as np

# Simple object persistence
data = {'array': np.random.random(1000), 'metadata': {'version': 1}}
dump(data, 'data.pkl')
loaded_data = load('data.pkl')

# With compression
large_array = np.random.random((10000, 1000))
dump(large_array, 'large_data.pkl', compress=3)  # zlib level 3
loaded_array = load('large_data.pkl')

# Different compression methods
dump(data, 'data_gzip.pkl', compress='gzip')
dump(data, 'data_bz2.pkl', compress=('bz2', 9))  # bz2 level 9
dump(data, 'data_lz4.pkl', compress='lz4')  # Fast compression

# File objects work too: dump/load accept an already-open binary stream.
with open('output.pkl', 'wb') as f:
    dump(data, f, compress=True)
with open('output.pkl', 'rb') as f:
    loaded_data = load(f)

# Memory Mapping Examples:
# Memory mapping: access arrays on disk without loading them into RAM.
import numpy as np
from joblib import dump, load

# Create and save a large array
huge_array = np.random.random((50000, 1000))
dump(huge_array, 'huge_array.pkl')

# Memory map for efficient access without loading into RAM
mapped_array = load('huge_array.pkl', mmap_mode='r')
print(f"Array shape: {mapped_array.shape}")
print(f"Mean of first 1000 elements: {np.mean(mapped_array[:1000, :])}")

# Read-write memory mapping
mapped_rw = load('huge_array.pkl', mmap_mode='r+')
mapped_rw[0, 0] = 999.0  # Modifies the file directly

# Copy-on-write mapping (changes don't affect original file)
mapped_cow = load('huge_array.pkl', mmap_mode='c')
mapped_cow[0, 0] = 888.0  # Creates a copy when modified

# Advanced Persistence Patterns:
from joblib import dump, load
import numpy as np
from pathlib import Path
# Custom objects with __getstate__/__setstate__
class CustomModel:
    """Example model demonstrating custom __getstate__/__setstate__ hooks."""

    def __init__(self, weights, metadata):
        self.weights = weights      # model parameters
        self.metadata = metadata    # arbitrary descriptive information
        self._fitted = False        # flipped to True once fit() has run

    def fit(self, data):
        """Mark the model as fitted; returns self for call chaining."""
        self._fitted = True
        return self

    def __getstate__(self):
        """Custom serialization logic: snapshot the instance dict.

        Remove unpicklable attributes here if needed.
        """
        state = self.__dict__.copy()
        return state

    def __setstate__(self, state):
        """Custom deserialization logic: restore the instance dict."""
        self.__dict__.update(state)
# Serialize a complex model (training data defined here so the example runs;
# the original referenced an undefined `training_data` name).
training_data = np.random.random((1000, 50))
model = CustomModel(np.random.random((100, 50)), {'version': '1.0'})
model.fit(training_data)
dump(model, 'trained_model.pkl', compress=True)
loaded_model = load('trained_model.pkl')
# Batch processing with efficient I/O
def save_batch(data_batch, batch_id, output_dir):
    """Persist one batch as batch_<id>.pkl using fast LZ4 compression."""
    filename = Path(output_dir) / f'batch_{batch_id:04d}.pkl'
    dump(data_batch, filename, compress='lz4')  # Fast compression
def load_batch(batch_id, output_dir):
    """Load the batch previously written by save_batch for the same id."""
    filename = Path(output_dir) / f'batch_{batch_id:04d}.pkl'
    return load(filename)
# Process a large dataset in batches.
# NOTE(review): `data_batches` and `process_data` must be supplied by the
# caller — presumably an iterable of batches and a per-batch transform;
# they are not defined in this file.
output_dir = Path('./processed_batches')
output_dir.mkdir(exist_ok=True)

# Save batches
for i, batch in enumerate(data_batches):
    processed_batch = process_data(batch)
    save_batch(processed_batch, i, output_dir)

# Load specific batches as needed
batch_5 = load_batch(5, output_dir)

# No compression (fastest I/O, largest files)
dump(data, 'data.pkl', compress=False)

# Zlib compression (good balance, default)
dump(data, 'data.pkl', compress=True)    # Level 1
dump(data, 'data.pkl', compress=6)       # Level 6
dump(data, 'data.pkl', compress='zlib')  # Method name

# Gzip compression (widely compatible)
dump(data, 'data.pkl', compress='gzip')
dump(data, 'data.pkl', compress=('gzip', 9))  # Maximum compression

# Bzip2 compression (high compression ratio, slower)
dump(data, 'data.pkl', compress='bz2')
dump(data, 'data.pkl', compress=('bz2', 9))

# LZMA/XZ compression (highest compression, slowest)
dump(data, 'data.pkl', compress='lzma')
dump(data, 'data.pkl', compress='xz')

# LZ4 compression (fastest compression, lower ratio)
dump(data, 'data.pkl', compress='lz4')  # Requires python-lz4 package

import time
import os
import time
import numpy as np
from joblib import dump, load

# Generate test data
large_data = {
    'arrays': [np.random.random((1000, 1000)) for _ in range(5)],
    'sparse_data': np.zeros((10000, 10000)),
    'metadata': {'created': time.time(), 'size': 'large'}
}

# Benchmark dump time, load time, and file size for several settings.
methods = [
    (False, "No compression"),
    (1, "Zlib level 1"),
    (6, "Zlib level 6"),
    ('gzip', "Gzip"),
    ('bz2', "Bzip2"),
    ('lz4', "LZ4"),
]
for compress, description in methods:
    # Build the output filename once per method instead of three times.
    path = f'test_{description.lower().replace(" ", "_")}.pkl'

    start_time = time.time()
    dump(large_data, path, compress=compress)
    dump_time = time.time() - start_time

    start_time = time.time()
    loaded_data = load(path)
    load_time = time.time() - start_time

    file_size = os.path.getsize(path)
    print(f"{description}: {dump_time:.2f}s dump, {load_time:.2f}s load, {file_size/1024**2:.1f}MB")
from joblib import dump, load
import numpy as np

# Save a large dataset
dataset = {
    'features': np.random.random((100000, 200)),
    'labels': np.random.randint(0, 10, 100000),
    'metadata': {'samples': 100000, 'features': 200}
}
dump(dataset, 'large_dataset.pkl')

# Memory map for efficient partial access
mapped_data = load('large_dataset.pkl', mmap_mode='r')

# Access a subset without loading the entire array
subset_features = mapped_data['features'][1000:2000]  # Only loads this slice
subset_labels = mapped_data['labels'][1000:2000]


def process_in_chunks(data, chunk_size=1000):
    """Process `data` in fixed-size row chunks to limit peak memory use.

    NOTE(review): relies on a `process_chunk(features, labels)` function
    that must be defined by the caller; it is not defined in this file.
    """
    n_samples = data['features'].shape[0]
    results = []
    for start in range(0, n_samples, chunk_size):
        end = min(start + chunk_size, n_samples)
        chunk_features = data['features'][start:end]
        chunk_labels = data['labels'][start:end]
        # Process chunk
        chunk_result = process_chunk(chunk_features, chunk_labels)
        results.append(chunk_result)
    return results


# Process without loading the entire dataset into memory
results = process_in_chunks(mapped_data)
from joblib import dump, load
import numpy as np

# Ensure consistent byte order across platforms
data = np.random.random(1000).astype(np.float64)
dump(data, 'cross_platform_data.pkl')

# Load with automatic byte order handling
loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order="auto")

# Force byte order conversion if needed
loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order=True)

# Preserve original byte order
loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order=False)
import numpy as np
from joblib import dump, load

# Joblib automatically optimizes NumPy array storage
arrays = {
    'float32_array': np.random.random(10000).astype(np.float32),
    'int64_array': np.arange(10000, dtype=np.int64),
    'complex_array': np.random.random(5000) + 1j * np.random.random(5000),
    'structured_array': np.array([(i, f'item_{i}') for i in range(1000)],
                                 dtype=[('id', 'i4'), ('name', 'U10')])
}

# Efficient storage with type preservation
dump(arrays, 'numpy_arrays.pkl', compress=True)
loaded_arrays = load('numpy_arrays.pkl')

# Verify types are preserved
assert loaded_arrays['float32_array'].dtype == np.float32
assert loaded_arrays['structured_array'].dtype.names == ('id', 'name')
import numpy as np
# Example scikit-learn style model
class SimpleLinearRegression:
    """Minimal scikit-learn-style linear model (least squares, no regularization)."""

    def __init__(self):
        self.weights = None          # coefficient vector, set by fit()
        self.bias = None             # scalar intercept, set by fit()
        self.training_history = []   # one record appended per fit() call

    def fit(self, X, y):
        """Fit by least squares; record the training set size. Returns self."""
        self.weights = np.linalg.lstsq(X, y, rcond=None)[0]
        # Intercept: mean residual of the bias-free least-squares fit.
        self.bias = np.mean(y - X @ self.weights)
        self.training_history.append({'samples': len(X), 'features': X.shape[1]})
        return self

    def predict(self, X):
        """Return predictions X @ weights + bias."""
        return X @ self.weights + self.bias
# Train and save the model
X_train = np.random.random((1000, 10))
y_train = X_train @ np.random.random(10) + np.random.random() * 0.1
model = SimpleLinearRegression()
model.fit(X_train, y_train)

# Persist the trained model
dump(model, 'trained_model.pkl', compress=True)

# Load the model for inference (held-out data defined here so the example
# runs; the original referenced an undefined `X_test` name).
X_test = np.random.random((100, 10))
loaded_model = load('trained_model.pkl')
predictions = loaded_model.predict(X_test)

# Install with Tessl CLI
# Install with the Tessl CLI: `npx tessl i tessl/pypi-joblib`