"""Lightweight pipelining with Python functions for disk-caching, parallel
computing, and fast compressed persistence.

This module covers fast compressed persistence optimized for Python objects
containing large NumPy arrays. It provides memory-mapping support, multiple
compression algorithms, and cross-platform compatibility as a replacement for
pickle, specifically designed for scientific computing and machine-learning
workflows: high-performance serialization and deserialization of Python
objects with special optimizations for NumPy arrays and scientific data
structures.
"""
def dump(value, filename, compress=0, protocol=None):
    """Persist an arbitrary Python object to file with optional compression.

    Parameters
    ----------
    value : any
        Python object to store.
    filename : str, pathlib.Path, or file object
        Destination for the serialized data.
    compress : bool, int, str, or tuple, default=0
        Compression specification:
        - False or 0: no compression
        - True or 1-9: zlib compression at the given level
        - str: compression method ('zlib', 'gzip', 'bz2', 'lzma', 'xz', 'lz4')
        - tuple: (method, level) for a specific method and level
    protocol : int, optional
        Pickle protocol version (None selects the highest available).

    Returns
    -------
    str or None
        The filename if a string was passed, None otherwise.
    """
def load(filename, mmap_mode=None, ensure_native_byte_order="auto"):
    """Reconstruct a Python object from a file created with joblib.dump.

    Parameters
    ----------
    filename : str, pathlib.Path, or file object
        File to read from.
    mmap_mode : {None, 'r+', 'r', 'w+', 'c'}, optional
        Memory-mapping mode for NumPy arrays:
        - None: load arrays fully into memory
        - 'r+': read-write memory mapping
        - 'r': read-only memory mapping
        - 'w+': write memory mapping
        - 'c': copy-on-write memory mapping
    ensure_native_byte_order : {"auto", True, False}, default="auto"
        Byte-order handling:
        - "auto": automatic conversion if needed
        - True: force native byte order conversion
        - False: preserve the original byte order

    Returns
    -------
    object
        The reconstructed Python object.
    """


# Basic Usage Examples:
# Basic usage: simple persistence, compression levels, and file objects.
from joblib import dump, load
import numpy as np

# Simple object persistence
data = {'array': np.random.random(1000), 'metadata': {'version': 1}}
dump(data, 'data.pkl')
loaded_data = load('data.pkl')

# With compression
large_array = np.random.random((10000, 1000))
dump(large_array, 'large_data.pkl', compress=3)  # zlib level 3
loaded_array = load('large_data.pkl')

# Different compression methods
dump(data, 'data_gzip.pkl', compress='gzip')
dump(data, 'data_bz2.pkl', compress=('bz2', 9))  # bz2 level 9
dump(data, 'data_lz4.pkl', compress='lz4')  # Fast compression

# File objects work too: dump/load accept an already-open binary stream.
with open('output.pkl', 'wb') as f:
    dump(data, f, compress=True)
with open('output.pkl', 'rb') as f:
    loaded_data = load(f)

# Memory Mapping Examples:
# Memory mapping: access arrays on disk without loading them into RAM.
import numpy as np
from joblib import dump, load

# Create and save a large array
huge_array = np.random.random((50000, 1000))
dump(huge_array, 'huge_array.pkl')

# Memory map for efficient access without loading into RAM
mapped_array = load('huge_array.pkl', mmap_mode='r')
print(f"Array shape: {mapped_array.shape}")
print(f"Mean of first 1000 elements: {np.mean(mapped_array[:1000, :])}")

# Read-write memory mapping
mapped_rw = load('huge_array.pkl', mmap_mode='r+')
mapped_rw[0, 0] = 999.0  # Modifies the file directly

# Copy-on-write mapping (changes don't affect original file)
mapped_cow = load('huge_array.pkl', mmap_mode='c')
mapped_cow[0, 0] = 888.0  # Creates a copy when modified

# Advanced Persistence Patterns:
from joblib import dump, load
import numpy as np
from pathlib import Path
# Custom objects with __getstate__/__setstate__
class CustomModel:
    """Example model demonstrating custom __getstate__/__setstate__ hooks."""

    def __init__(self, weights, metadata):
        self.weights = weights      # model parameters
        self.metadata = metadata    # arbitrary descriptive information
        self._fitted = False        # flipped to True once fit() has run

    def fit(self, data):
        """Mark the model as fitted; returns self for call chaining."""
        self._fitted = True
        return self

    def __getstate__(self):
        """Custom serialization logic: snapshot the instance dict.

        Remove unpicklable attributes here if needed.
        """
        state = self.__dict__.copy()
        return state

    def __setstate__(self, state):
        """Custom deserialization logic: restore the instance dict."""
        self.__dict__.update(state)
# Serialize a complex model (training data defined here so the example runs;
# the original referenced an undefined `training_data` name).
training_data = np.random.random((1000, 50))
model = CustomModel(np.random.random((100, 50)), {'version': '1.0'})
model.fit(training_data)
dump(model, 'trained_model.pkl', compress=True)
loaded_model = load('trained_model.pkl')
# Batch processing with efficient I/O
def save_batch(data_batch, batch_id, output_dir):
    """Persist one batch as batch_<id>.pkl using fast LZ4 compression."""
    filename = Path(output_dir) / f'batch_{batch_id:04d}.pkl'
    dump(data_batch, filename, compress='lz4')  # Fast compression
def load_batch(batch_id, output_dir):
    """Load the batch previously written by save_batch for the same id."""
    filename = Path(output_dir) / f'batch_{batch_id:04d}.pkl'
    return load(filename)
# Process a large dataset in batches.
# NOTE(review): `data_batches` and `process_data` must be supplied by the
# caller — presumably an iterable of batches and a per-batch transform;
# they are not defined in this file.
output_dir = Path('./processed_batches')
output_dir.mkdir(exist_ok=True)

# Save batches
for i, batch in enumerate(data_batches):
    processed_batch = process_data(batch)
    save_batch(processed_batch, i, output_dir)

# Load specific batches as needed
batch_5 = load_batch(5, output_dir)

# No compression (fastest I/O, largest files)
dump(data, 'data.pkl', compress=False)

# Zlib compression (good balance, default)
dump(data, 'data.pkl', compress=True)    # Level 1
dump(data, 'data.pkl', compress=6)       # Level 6
dump(data, 'data.pkl', compress='zlib')  # Method name

# Gzip compression (widely compatible)
dump(data, 'data.pkl', compress='gzip')
dump(data, 'data.pkl', compress=('gzip', 9))  # Maximum compression

# Bzip2 compression (high compression ratio, slower)
dump(data, 'data.pkl', compress='bz2')
dump(data, 'data.pkl', compress=('bz2', 9))

# LZMA/XZ compression (highest compression, slowest)
dump(data, 'data.pkl', compress='lzma')
dump(data, 'data.pkl', compress='xz')

# LZ4 compression (fastest compression, lower ratio)
dump(data, 'data.pkl', compress='lz4')  # Requires python-lz4 package

import time
import os
import time
import numpy as np
from joblib import dump, load

# Generate test data
large_data = {
    'arrays': [np.random.random((1000, 1000)) for _ in range(5)],
    'sparse_data': np.zeros((10000, 10000)),
    'metadata': {'created': time.time(), 'size': 'large'}
}

# Benchmark dump time, load time, and file size for several settings.
methods = [
    (False, "No compression"),
    (1, "Zlib level 1"),
    (6, "Zlib level 6"),
    ('gzip', "Gzip"),
    ('bz2', "Bzip2"),
    ('lz4', "LZ4"),
]
for compress, description in methods:
    # Build the output filename once per method instead of three times.
    path = f'test_{description.lower().replace(" ", "_")}.pkl'

    start_time = time.time()
    dump(large_data, path, compress=compress)
    dump_time = time.time() - start_time

    start_time = time.time()
    loaded_data = load(path)
    load_time = time.time() - start_time

    file_size = os.path.getsize(path)
    print(f"{description}: {dump_time:.2f}s dump, {load_time:.2f}s load, {file_size/1024**2:.1f}MB")
from joblib import dump, load
import numpy as np

# Save a large dataset
dataset = {
    'features': np.random.random((100000, 200)),
    'labels': np.random.randint(0, 10, 100000),
    'metadata': {'samples': 100000, 'features': 200}
}
dump(dataset, 'large_dataset.pkl')

# Memory map for efficient partial access
mapped_data = load('large_dataset.pkl', mmap_mode='r')

# Access a subset without loading the entire array
subset_features = mapped_data['features'][1000:2000]  # Only loads this slice
subset_labels = mapped_data['labels'][1000:2000]


def process_in_chunks(data, chunk_size=1000):
    """Process `data` in fixed-size row chunks to limit peak memory use.

    NOTE(review): relies on a `process_chunk(features, labels)` function
    that must be defined by the caller; it is not defined in this file.
    """
    n_samples = data['features'].shape[0]
    results = []
    for start in range(0, n_samples, chunk_size):
        end = min(start + chunk_size, n_samples)
        chunk_features = data['features'][start:end]
        chunk_labels = data['labels'][start:end]
        # Process chunk
        chunk_result = process_chunk(chunk_features, chunk_labels)
        results.append(chunk_result)
    return results


# Process without loading the entire dataset into memory
results = process_in_chunks(mapped_data)
from joblib import dump, load
import numpy as np

# Ensure consistent byte order across platforms
data = np.random.random(1000).astype(np.float64)
dump(data, 'cross_platform_data.pkl')

# Load with automatic byte order handling
loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order="auto")

# Force byte order conversion if needed
loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order=True)

# Preserve original byte order
loaded_data = load('cross_platform_data.pkl', ensure_native_byte_order=False)
import numpy as np
from joblib import dump, load

# Joblib automatically optimizes NumPy array storage
arrays = {
    'float32_array': np.random.random(10000).astype(np.float32),
    'int64_array': np.arange(10000, dtype=np.int64),
    'complex_array': np.random.random(5000) + 1j * np.random.random(5000),
    'structured_array': np.array([(i, f'item_{i}') for i in range(1000)],
                                 dtype=[('id', 'i4'), ('name', 'U10')])
}

# Efficient storage with type preservation
dump(arrays, 'numpy_arrays.pkl', compress=True)
loaded_arrays = load('numpy_arrays.pkl')

# Verify types are preserved
assert loaded_arrays['float32_array'].dtype == np.float32
assert loaded_arrays['structured_array'].dtype.names == ('id', 'name')
import numpy as np
# Example scikit-learn style model
class SimpleLinearRegression:
    """Minimal scikit-learn-style linear model (least squares, no regularization)."""

    def __init__(self):
        self.weights = None          # coefficient vector, set by fit()
        self.bias = None             # scalar intercept, set by fit()
        self.training_history = []   # one record appended per fit() call

    def fit(self, X, y):
        """Fit by least squares; record the training set size. Returns self."""
        self.weights = np.linalg.lstsq(X, y, rcond=None)[0]
        # Intercept: mean residual of the bias-free least-squares fit.
        self.bias = np.mean(y - X @ self.weights)
        self.training_history.append({'samples': len(X), 'features': X.shape[1]})
        return self

    def predict(self, X):
        """Return predictions X @ weights + bias."""
        return X @ self.weights + self.bias
# Train and save the model
X_train = np.random.random((1000, 10))
y_train = X_train @ np.random.random(10) + np.random.random() * 0.1
model = SimpleLinearRegression()
model.fit(X_train, y_train)

# Persist the trained model
dump(model, 'trained_model.pkl', compress=True)

# Load the model for inference (held-out data defined here so the example
# runs; the original referenced an undefined `X_test` name).
X_test = np.random.random((100, 10))
loaded_model = load('trained_model.pkl')
predictions = loaded_model.predict(X_test)

# Install with Tessl CLI
# Install with the Tessl CLI: `npx tessl i tessl/pypi-joblib`