A library to parallelize pandas operations on all available CPUs with minimal code changes
```bash
npx @tessl/cli install tessl/pypi-pandarallel@1.6.0
```

An easy-to-use library that parallelizes pandas operations across all available CPUs with minimal code changes. Pandarallel turns standard pandas methods into parallelized versions through a single method-call change, from df.apply() to df.parallel_apply(), and offers optional progress bars with seamless integration into existing pandas workflows.
```bash
pip install pandarallel
```

```python
from pandarallel import pandarallel
import pandas as pd
import math

# Initialize pandarallel to enable parallel processing
pandarallel.initialize(progress_bar=True)

# Create sample DataFrame
df = pd.DataFrame({
    'a': [1, 2, 3, 4, 5],
    'b': [0.1, 0.2, 0.3, 0.4, 0.5]
})

# Define a function to apply
def compute_function(row):
    return math.sin(row.a**2) + math.sin(row.b**2)

# Use parallel version instead of regular apply
result = df.parallel_apply(compute_function, axis=1)

# Works with Series too
series_result = df.a.parallel_apply(lambda x: math.sqrt(x**2))

# And with groupby operations
grouped_result = df.groupby('a').parallel_apply(lambda group: group.b.sum())
```

Configure pandarallel to enable parallel processing and add parallel methods to pandas objects.
```python
from typing import Optional

@classmethod
def initialize(
    cls,
    shm_size_mb=None,
    nb_workers=None,
    progress_bar=False,
    verbose=2,
    use_memory_fs: Optional[bool] = None
) -> None:
    """
    Initialize pandarallel and add parallel methods to pandas objects.

    Args:
        shm_size_mb (int, optional): Shared memory size in MB (deprecated parameter)
        nb_workers (int, optional): Number of worker processes. Defaults to the number of physical CPU cores (detected automatically)
        progress_bar (bool): Enable progress bars during parallel operations. Default: False
        verbose (int): Verbosity level (0=silent, 1=warnings only, 2=info). Default: 2
        use_memory_fs (bool, optional): Use a memory file system for data transfer. Auto-detected if None

    Returns:
        None
    """
```

Parallelized versions of DataFrame operations that maintain the same API as their pandas counterparts.
```python
def parallel_apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwargs):
    """
    Parallel version of DataFrame.apply().

    Args:
        func (function): Function to apply to each column or row
        axis (int or str): Axis along which func is applied: 0 or 'index' applies func to each column, 1 or 'columns' applies func to each row
        raw (bool): Pass a raw ndarray instead of a Series to func
        result_type (str): Control the return type ('expand', 'reduce', or 'broadcast')
        args (tuple): Positional arguments to pass to func
        **kwargs: Additional keyword arguments to pass to func

    Returns:
        Series or DataFrame: Result of applying func
    """

def parallel_applymap(self, func, na_action=None, **kwargs):
    """
    Parallel version of DataFrame.applymap().

    Args:
        func (function): Function to apply to each element
        na_action (str): Action to take for NaN values ('ignore' or None)
        **kwargs: Additional keyword arguments to pass to func

    Returns:
        DataFrame: Result of applying func to each element
    """
```

Parallelized versions of Series operations.
```python
def parallel_apply(self, func, convert_dtype=True, args=(), *, by_row='compat', **kwargs):
    """
    Parallel version of Series.apply().

    Args:
        func (function): Function to apply to each element
        convert_dtype (bool): Try to infer a better dtype for elementwise function results
        args (tuple): Positional arguments to pass to func
        by_row (str or bool): If 'compat' (default), apply func to each element; if False, pass the whole Series to func
        **kwargs: Additional keyword arguments to pass to func

    Returns:
        Series or DataFrame: Result of applying func
    """

def parallel_map(self, arg, na_action=None, *args, **kwargs):
    """
    Parallel version of Series.map().

    Args:
        arg (function, dict, or Series): Mapping function or correspondence
        na_action (str): Action to take for NaN values ('ignore' or None)
        *args: Additional positional arguments to pass to the mapping function
        **kwargs: Additional keyword arguments to pass to the mapping function

    Returns:
        Series: Result of mapping values
    """
```

Parallelized versions of GroupBy operations.
```python
def parallel_apply(self, func, *args, **kwargs):
    """
    Parallel version of GroupBy.apply() for DataFrameGroupBy.

    Args:
        func (function): Function to apply to each group
        *args: Positional arguments to pass to func
        **kwargs: Keyword arguments to pass to func

    Returns:
        Series or DataFrame: Result of applying func to each group
    """
```

Parallelized versions of rolling window operations.
```python
def parallel_apply(self, func, raw=False, engine=None, engine_kwargs=None, args=(), **kwargs):
    """
    Parallel version of Rolling.apply().

    Args:
        func (function): Function to apply to each rolling window
        raw (bool): Pass a raw ndarray instead of a Series to func
        engine (str): Execution engine ('cython' or 'numba')
        engine_kwargs (dict): Engine-specific kwargs
        args (tuple): Positional arguments to pass to func
        **kwargs: Additional keyword arguments to pass to func

    Returns:
        Series or DataFrame: Result of applying func to rolling windows
    """
```

Parallelized versions of rolling operations on grouped data.
```python
def parallel_apply(self, func, raw=False, engine=None, engine_kwargs=None, args=(), **kwargs):
    """
    Parallel version of RollingGroupby.apply().

    Args:
        func (function): Function to apply to each rolling group window
        raw (bool): Pass a raw ndarray instead of a Series to func
        engine (str): Execution engine ('cython' or 'numba')
        engine_kwargs (dict): Engine-specific kwargs
        args (tuple): Positional arguments to pass to func
        **kwargs: Additional keyword arguments to pass to func

    Returns:
        Series or DataFrame: Result of applying func to rolling group windows
    """
```
```python
def parallel_apply(self, func, raw=False, engine=None, engine_kwargs=None, args=(), **kwargs):
    """
    Parallel version of ExpandingGroupby.apply().

    Args:
        func (function): Function to apply to each expanding group window
        raw (bool): Pass a raw ndarray instead of a Series to func
        engine (str): Execution engine ('cython' or 'numba')
        engine_kwargs (dict): Engine-specific kwargs
        args (tuple): Positional arguments to pass to func
        **kwargs: Additional keyword arguments to pass to func

    Returns:
        Series or DataFrame: Result of applying func to expanding group windows
    """
```
```python
import pandas as pd
import numpy as np
import math
from pandarallel import pandarallel

# Initialize with progress bars
pandarallel.initialize(progress_bar=True, nb_workers=4)

# Create sample data
df = pd.DataFrame({
    'a': np.random.randint(1, 8, 1000000),
    'b': np.random.rand(1000000)
})

# Parallel apply on rows (axis=1)
def row_function(row):
    return math.sin(row.a**2) + math.sin(row.b**2)

result = df.parallel_apply(row_function, axis=1)

# Parallel applymap on each element
def element_function(x):
    return math.sin(x**2) - math.cos(x**2)

result = df.parallel_applymap(element_function)
```

```python
# Parallel apply on Series
series = pd.Series(np.random.rand(1000000) + 1)

def series_function(x, power=2, bias=0):
    return math.log10(math.sqrt(math.exp(x**power))) + bias

result = series.parallel_apply(series_function, args=(2,), bias=3)

# Parallel map with dictionary
mapping = {i: i**2 for i in range(1, 100)}
result = series.parallel_map(mapping)
```

```python
# Create grouped data
df_grouped = pd.DataFrame({
    'group': np.random.randint(1, 100, 1000000),
    'value': np.random.rand(1000000)
})

def group_function(group_df):
    total = 0
    for item in group_df.value:
        total += math.log10(math.sqrt(math.exp(item**2)))
    return total / len(group_df.value)

result = df_grouped.groupby('group').parallel_apply(group_function)
```

```python
# Rolling window with parallel apply
df_rolling = pd.DataFrame({
    'values': range(100000)
})

def rolling_function(window):
    return window.iloc[0] + window.iloc[1]**2 + window.iloc[2]**3

# Use bracket access: df_rolling.values would return the underlying
# NumPy array rather than the column named 'values'
result = df_rolling['values'].rolling(4).parallel_apply(rolling_function, raw=False)
```

```python
# Use specific number of workers
pandarallel.initialize(nb_workers=8)

# Use all physical CPU cores (default)
pandarallel.initialize()
```

```python
# Enable progress bars
pandarallel.initialize(progress_bar=True)
# Disable progress bars (default)
pandarallel.initialize(progress_bar=False)
```

```python
# Force the memory file system (faster for large data)
pandarallel.initialize(use_memory_fs=True)
# Force use of pipes (more compatible)
pandarallel.initialize(use_memory_fs=False)
# Auto-detect (default) - uses memory fs if /dev/shm is available
pandarallel.initialize()
```

```python
# Silent mode
pandarallel.initialize(verbose=0)
# Show warnings only
pandarallel.initialize(verbose=1)
# Show info messages (default)
pandarallel.initialize(verbose=2)
```

All parallel methods keep the same error handling behavior as their pandas counterparts: if an exception is raised in any worker process, the whole operation fails and the exception is re-raised in the main process.
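For example (a minimal sketch; the failing function and data are illustrative):

```python
import pandas as pd
from pandarallel import pandarallel

pandarallel.initialize()

df = pd.DataFrame({'a': [1, 2, 0, 4]})

def might_fail(row):
    if row.a == 0:
        raise ValueError("zero not allowed")
    return 1 / row.a

try:
    df.parallel_apply(might_fail, axis=1)
except ValueError as exc:
    # The worker's exception surfaces here, in the parent process
    print(f"parallel_apply failed: {exc}")
```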
Common considerations:

- The memory file system (use_memory_fs=True) provides better performance for large datasets (see the sketch below)
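If you prefer to make that choice explicitly rather than rely on auto-detection, one possible sketch, assuming the memory file system is backed by /dev/shm on Linux:

```python
import os
from pandarallel import pandarallel

# pandarallel auto-detects this when use_memory_fs is None; checking
# /dev/shm manually is only useful to log or override the decision
use_fs = os.path.isdir('/dev/shm')
pandarallel.initialize(use_memory_fs=use_fs)
```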