Toolbox for imbalanced dataset in machine learning
—
Helper functions and classes for validating sampling strategies, checking neighbor objects, docstring substitution, and creating custom samplers with functional approaches.
Imbalanced-learn provides comprehensive utility functions that support the core sampling functionality. These utilities handle parameter validation, strategy checking, neighbor object verification, and provide tools for creating custom sampling workflows.
{ .api }
def check_sampling_strategy(
sampling_strategy,
y,
sampling_type,
**kwargs
) -> dict

Sampling target validation for samplers.
Parameters:
float, str, dict, list or callable): Sampling information to sample the data set
float: For under-sampling methods, it corresponds to the ratio α_us defined by N_rM = α_us × N_m where N_rM and N_m are the number of samples in the majority class after resampling and the number of samples in the minority class, respectively. For over-sampling methods, it correspond to the ratio α_os defined by N_rm = α_os × N_m where N_rm and N_M are the number of samples in the minority class after resampling and the number of samples in the majority class, respectivelystr: Specify the class targeted by the resampling. Possible choices are: 'minority', 'majority', 'not minority', 'not majority', 'all', 'auto'dict: The keys correspond to the targeted classes. The values correspond to the desired number of samples for each targeted classlist: The list contains the targeted classes. Used only for cleaning methodscallable: Function taking y and returns a dict. The keys correspond to the targeted classes. The values correspond to the desired number of samples for each classndarray of shape (n_samples,)): The target array{'over-sampling', 'under-sampling', 'clean-sampling'}): The type of sampling. Can be either 'over-sampling', 'under-sampling', or 'clean-sampling'dict): Dictionary of additional keyword arguments to pass to sampling_strategy when this is a callableReturns:
dict): The converted and validated sampling target. Returns a dictionary with the key being the class target and the value being the desired number of samplesStrategy Types:
# Target minority class only (over-sampling)
strategy = check_sampling_strategy('minority', y, 'over-sampling')
# Target majority class only (under-sampling)
strategy = check_sampling_strategy('majority', y, 'under-sampling')
# Target all classes except minority
strategy = check_sampling_strategy('not minority', y, 'under-sampling')
# Target all classes except majority
strategy = check_sampling_strategy('not majority', y, 'over-sampling')
# Target all classes
strategy = check_sampling_strategy('all', y, 'over-sampling')
# Auto strategy (equivalent to 'not majority' for over-sampling, 'not minority' for under-sampling)
strategy = check_sampling_strategy('auto', y, 'over-sampling')from collections import Counter
# Specify exact number of samples per class
y = [0, 0, 0, 1, 1, 2]
strategy = {0: 100, 1: 80, 2: 60} # Target samples for each class
validated = check_sampling_strategy(strategy, y, 'over-sampling')# For binary classification - ratio between classes
y_binary = [0, 0, 0, 0, 1] # Imbalanced binary
# Under-sampling: desired minority/majority ratio = 0.5, i.e. the majority class is reduced to twice the minority class size
strategy = check_sampling_strategy(0.5, y_binary, 'under-sampling')
# Over-sampling: grow the minority class to 80% of the majority class size
# (a float sampling_strategy must be in (0, 1]; values > 1 raise ValueError)
strategy = check_sampling_strategy(0.8, y_binary, 'over-sampling')

def custom_strategy(y):
    """Return a dict targeting 80% of the majority-class count for every class.

    Parameters
    ----------
    y : array-like
        Target labels.

    Returns
    -------
    dict
        Mapping of class label to the desired number of samples.
    """
    from collections import Counter

    counter = Counter(y)
    # Balance every class to 80% of the largest class's size.
    target_size = int(0.8 * max(counter.values()))
    return {cls: target_size for cls in counter.keys()}
# Use callable strategy
strategy = check_sampling_strategy(custom_strategy, y, 'under-sampling'){ .api }
def check_neighbors_object(
nn_name,
nn_object,
additional_neighbor=0
) -> object

Check that the object is consistent with a k-nearest-neighbors estimator.
Parameters:
nn_name (str): The name associated with the object, used to raise an error if needed
nn_object (int or KNeighborsMixin): The object to be checked
additional_neighbor (int, default=0): Some algorithms need an additional neighbor
Returns:
KNeighborsMixin): The k-NN objectFunctionality:
nn_object is an integer, creates a NearestNeighbors object with n_neighbors=nn_object + additional_neighbornn_object is already a neighbors object, returns a clone of itUsage Examples:
from imblearn.utils import check_neighbors_object
from sklearn.neighbors import NearestNeighbors
# From integer - creates NearestNeighbors(n_neighbors=5)
nn = check_neighbors_object('k_neighbors', 5)
# From existing object - clones it
existing_nn = NearestNeighbors(n_neighbors=3, metric='manhattan')
nn = check_neighbors_object('k_neighbors', existing_nn)
# With additional neighbors (for algorithms that need k+1 neighbors)
nn = check_neighbors_object('k_neighbors', 5, additional_neighbor=1) # Creates with 6 neighbors{ .api }
def check_target_type(
y,
indicate_one_vs_all=False
) -> ndarray | tuple[ndarray, bool]Check the target types to be conform to the current samplers.
Parameters:
ndarray): The array containing the targetbool, default=False): Either to indicate if the targets are encoded in a one-vs-all fashionReturns:
ndarray): The returned targetbool, optional): Indicate if the target was originally encoded in a one-vs-all fashion. Only returned if indicate_one_vs_all=TrueTarget Type Handling:
Example:
import numpy as np
from imblearn.utils import check_target_type
# Regular multiclass target
y_multiclass = np.array([0, 1, 2, 0, 1, 2])
y_checked = check_target_type(y_multiclass)
# One-vs-all encoded (multilabel-indicator that's actually multiclass)
y_ovr = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]])
y_converted, is_ovr = check_target_type(y_ovr, indicate_one_vs_all=True)
# y_converted becomes [0, 1, 2, 0], is_ovr is True
# True multilabel (not supported - raises error)
y_multilabel = np.array([[1, 1, 0], [0, 1, 1], [1, 0, 1]])
# check_target_type(y_multilabel) # Raises ValueError{ .api }
class Substitution:
def __init__(self, *args, **kwargs): ...
def __call__(self, obj): ...Decorate a function's or a class' docstring to perform string substitution on it.
Parameters:
tuple): Positional arguments for substitution (mutually exclusive with kwargs)dict): Keyword arguments for substitution (mutually exclusive with args)Usage: The decorator performs string formatting on docstrings using the provided arguments.
Example:
from imblearn.utils import Substitution
# Define reusable docstring components
_random_state_docstring = """random_state : int, RandomState instance, default=None
Control the randomization of the algorithm.
- If int, random_state is the seed used by the random number generator;
- If RandomState instance, random_state is the random number generator;
- If None, the random number generator is the RandomState instance used
by np.random."""
# Use as decorator with keyword arguments
@Substitution(random_state=_random_state_docstring)
# NOTE: the decorator performs str.format-style substitution on the docstring,
# so the {random_state} placeholder below is replaced with the shared
# parameter description at definition time.
def my_function(X, y, random_state=None):
    """Apply sampling to dataset.
    Parameters
    ----------
    X : array-like
        Input data.
    y : array-like
        Target values.
    {random_state}
    Returns
    -------
    X_resampled, y_resampled : arrays
        Resampled data and targets.
    """
    pass
# Use with positional arguments
@Substitution("This is a substituted description")
def another_function():
"""{}
More details here.
"""
pass{ .api }
class FunctionSampler:
def __init__(
self,
*,
func=None,
accept_sparse=True,
kw_args=None,
validate=True
): ...
def fit(self, X, y): ...
def fit_resample(self, X, y): ...Construct a sampler from calling an arbitrary callable.
Parameters:
callable, default=None): The callable to use for the transformation. This will be passed the same arguments as transform, with args and kwargs forwarded. If func is None, then func will be the identity functionbool, default=True): Whether sparse input are supported. By default, sparse inputs are supporteddict, default=None): The keyword argument expected by funcbool, default=True): Whether or not to bypass the validation of X and y. Turning-off validation allows to use the FunctionSampler with any type of dataAttributes:
dict): Dictionary containing the information to sample the dataset. The keys corresponds to the class labels from which to sample and the values are the number of samples to sampleint): Number of features in the input datasetndarray of shape (n_features_in_,)): Names of features seen during fit. Defined only when X has feature names that are all stringsMethods:
def fit(self, X, y) -> FunctionSamplerCheck inputs and statistics of the sampler.
def fit_resample(self, X, y) -> tuple[ndarray, ndarray]Resample the dataset using the provided function.
Basic Usage:
from imblearn import FunctionSampler
import numpy as np
# Simple function to select first 10 samples
def select_first_ten(X, y):
    """Return the first ten samples of X together with their targets."""
    head = slice(0, 10)
    return X[head], y[head]
sampler = FunctionSampler(func=select_first_ten)
X_res, y_res = sampler.fit_resample(X, y)Using Existing Samplers:
from imblearn import FunctionSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
def custom_undersampling(X, y, sampling_strategy, random_state):
    """Resample (X, y) by delegating to a RandomUnderSampler built from the given options."""
    sampler = RandomUnderSampler(
        sampling_strategy=sampling_strategy,
        random_state=random_state,
    )
    return sampler.fit_resample(X, y)
# Create functional sampler
sampler = FunctionSampler(
func=custom_undersampling,
kw_args={
'sampling_strategy': 'auto',
'random_state': 42
}
)
X_res, y_res = sampler.fit_resample(X, y)
print(f'Resampled distribution: {Counter(y_res)}')Advanced Custom Logic:
import numpy as np
from sklearn.cluster import KMeans
def cluster_based_sampling(X, y, n_clusters=3, random_state=None):
    """Under-sample the majority class by drawing evenly from KMeans clusters.

    All minority-class samples are kept. The majority class is partitioned
    into ``n_clusters`` KMeans clusters and roughly
    ``len(minority) // n_clusters`` samples are drawn without replacement
    from each cluster, so the retained majority samples preserve the class's
    spatial structure.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix.
    y : ndarray of shape (n_samples,)
        Target labels.
    n_clusters : int, default=3
        Number of KMeans clusters for the majority class.
    random_state : int or None, default=None
        Seed controlling both the clustering and the per-cluster draws.

    Returns
    -------
    X_resampled, y_resampled : ndarray
        The resampled feature matrix and target array.
    """
    from collections import Counter

    # Identify the majority class and every other (minority) class.
    counter = Counter(y)
    majority_class = max(counter, key=counter.get)
    minority_classes = [cls for cls in counter.keys() if cls != majority_class]

    # Keep all minority class samples untouched.
    minority_mask = np.isin(y, minority_classes)
    X_minority = X[minority_mask]
    y_minority = y[minority_mask]

    # Cluster the majority class so sampling respects its structure.
    majority_mask = y == majority_class
    X_majority = X[majority_mask]
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    clusters = kmeans.fit_predict(X_majority)

    # FIX: the original drew from the unseeded global np.random even though
    # random_state was accepted; use a seeded generator for reproducibility.
    rng = np.random.RandomState(random_state)

    # Draw (up to) an equal share of the minority size from each cluster.
    target_per_cluster = len(y_minority) // n_clusters
    X_sampled_list = []
    for cluster_id in range(n_clusters):
        cluster_indices = np.where(clusters == cluster_id)[0]
        if len(cluster_indices) > 0:
            selected = rng.choice(
                cluster_indices,
                size=min(target_per_cluster, len(cluster_indices)),
                replace=False,
            )
            X_sampled_list.append(X_majority[selected])

    # Stitch the kept minority samples and the thinned majority back together.
    X_majority_sampled = np.vstack(X_sampled_list)
    y_majority_sampled = np.full(len(X_majority_sampled), majority_class)
    X_resampled = np.vstack([X_minority, X_majority_sampled])
    y_resampled = np.concatenate([y_minority, y_majority_sampled])
    return X_resampled, y_resampled
# Use custom cluster-based sampling
sampler = FunctionSampler(
func=cluster_based_sampling,
kw_args={'n_clusters': 5, 'random_state': 42}
)
X_res, y_res = sampler.fit_resample(X, y){ .api }
def is_sampler(estimator) -> boolReturn True if the given estimator is a sampler, False otherwise.
Parameters:
object): Estimator to testReturns:
bool): True if estimator is a sampler, otherwise FalseDetection Logic:
_estimator_type == "sampler" attributesampler_tags in estimator tagsExample:
from imblearn.utils import is_sampler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
# Test imblearn sampler
smote = SMOTE()
print(is_sampler(smote)) # True
# Test sklearn classifier
rf = RandomForestClassifier()
print(is_sampler(rf)) # False
# Test custom sampler
custom_sampler = FunctionSampler()
print(is_sampler(custom_sampler)) # Truefrom imblearn.pipeline import Pipeline
from imblearn import FunctionSampler
from sklearn.ensemble import RandomForestClassifier
# Create custom sampling function
def outlier_removal_sampling(X, y, contamination=0.1):
    """Drop IsolationForest outliers from (X, y), then balance with RandomUnderSampler."""
    from sklearn.ensemble import IsolationForest
    from imblearn.under_sampling import RandomUnderSampler

    # Keep only the points the forest labels as inliers (prediction == 1).
    detector = IsolationForest(contamination=contamination, random_state=42)
    inlier_mask = detector.fit_predict(X) == 1
    X_clean, y_clean = X[inlier_mask], y[inlier_mask]

    # Balance the cleaned dataset with standard random under-sampling.
    return RandomUnderSampler(random_state=42).fit_resample(X_clean, y_clean)
# Use in pipeline
pipeline = Pipeline([
('outlier_sampling', FunctionSampler(func=outlier_removal_sampling)),
('classifier', RandomForestClassifier())
])
pipeline.fit(X, y)
predictions = pipeline.predict(X_test)from sklearn.model_selection import cross_val_score
from imblearn.utils import check_sampling_strategy
# Validate strategy before cross-validation
def safe_sampler_factory(strategy_type='auto'):
    """Build a FunctionSampler that re-validates its strategy against each fold's labels."""
    def create_sampler(X, y):
        # Validate the strategy against the labels actually present in this
        # fold, then oversample with SMOTE using the validated targets.
        validated = check_sampling_strategy(strategy_type, y, 'over-sampling')
        from imblearn.over_sampling import SMOTE
        smote = SMOTE(sampling_strategy=validated, random_state=42)
        return smote.fit_resample(X, y)
    return FunctionSampler(func=create_sampler)
# Use in cross-validation
sampler = safe_sampler_factory('not majority')
pipeline = Pipeline([('sampling', sampler), ('classifier', RandomForestClassifier())])
scores = cross_val_score(pipeline, X, y, cv=5)from imblearn.utils import check_sampling_strategy, check_target_type
def robust_sampling_pipeline(X, y, sampling_strategy='auto'):
    """Validate inputs and oversample with SMOTE, returning (X, y) unchanged on failure."""
    try:
        # Make sure the target encoding is one the samplers accept.
        y_validated = check_target_type(y)
        # Convert the strategy into an explicit, validated per-class mapping.
        strategy = check_sampling_strategy(sampling_strategy, y_validated, 'over-sampling')
        # Apply the oversampler with the validated strategy.
        from imblearn.over_sampling import SMOTE
        return SMOTE(sampling_strategy=strategy).fit_resample(X, y_validated)
    except ValueError as e:
        # Validation problem: report it and fall back to the identity transform.
        print(f"Validation error: {e}")
        return X, y
    except Exception as e:
        # Any other sampling failure: report it and fall back likewise.
        print(f"Sampling error: {e}")
        return X, y
# Use robust pipeline
X_res, y_res = robust_sampling_pipeline(X, y, 'not majority')

Install with Tessl CLI
npx tessl i tessl/pypi-imbalanced-learn