Toolbox for imbalanced datasets in machine learning
—
Ensemble methods combine multiple base learners to improve classification performance beyond what individual models can achieve. However, traditional ensemble methods often struggle with imbalanced datasets where minority classes are underrepresented. The imbalanced-learn library provides specialized ensemble classifiers that integrate resampling techniques directly into the ensemble learning process.
These ensemble methods address class imbalance by applying resampling strategies during training, ensuring that each base learner in the ensemble receives balanced training data. This approach leads to improved performance on minority classes while maintaining overall classification accuracy.
The ensemble module includes four main approaches:
A bagging classifier with additional balancing that applies resampling to each bootstrap sample before training base estimators.
class BalancedBaggingClassifier(BaggingClassifier):
    """A bagging classifier with additional balancing.

    Each bootstrap sample is resampled (random under-sampling by
    default, or the provided ``sampler``) before the corresponding base
    estimator is trained, so every learner sees balanced data.
    """

    def __init__(
        self,
        estimator=None,
        n_estimators=10,
        *,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        oob_score=False,
        warm_start=False,
        sampling_strategy="auto",
        replacement=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
        sampler=None,
    ):
        # Signature stub: parameters mirror sklearn's BaggingClassifier plus
        # the resampling controls (sampling_strategy, replacement, sampler).
        ...

    def fit(self, X, y):
        """Build a Bagging ensemble of estimators from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples.
        y : array-like of shape (n_samples,)
            The target values (class labels).

        Returns
        -------
        self : object
            Fitted estimator.
        """

    def predict(self, X):
        """Predict class for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            The predicted classes.
        """

    def predict_proba(self, X):
        """Predict class probabilities for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        p : ndarray of shape (n_samples, n_classes)
            The class probabilities of the input samples.
        """

from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Build a synthetic two-class dataset with a 1:9 class ratio.
dataset_params = dict(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    class_sep=2,
    weights=[0.1, 0.9],
    random_state=10,
)
X, y = make_classification(**dataset_params)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit a bagging ensemble that balances every bootstrap sample.
bbc = BalancedBaggingClassifier(n_estimators=10, random_state=42)
bbc.fit(X_train, y_train)

# Predict hard labels for the held-out split.
y_pred = bbc.predict(X_test)
y_proba = bbc.predict_proba(X_test)

A balanced random forest classifier that applies random under-sampling to balance each bootstrap sample during forest construction.
class BalancedRandomForestClassifier(RandomForestClassifier):
    """A balanced random forest classifier.

    A random under-sampler balances the sample drawn for each tree
    before the tree is built, so every tree trains on balanced data.
    """

    def __init__(
        self,
        n_estimators=100,
        *,
        criterion="gini",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features="sqrt",
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        bootstrap=False,
        oob_score=False,
        sampling_strategy="all",
        replacement=True,
        n_jobs=None,
        random_state=None,
        verbose=0,
        warm_start=False,
        class_weight=None,
        ccp_alpha=0.0,
        max_samples=None,
        monotonic_cst=None,
    ):
        # Signature stub: parameters mirror sklearn's RandomForestClassifier
        # plus the resampling controls (sampling_strategy, replacement).
        ...

    def fit(self, X, y, sample_weight=None):
        """Build a forest of trees from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples.
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels).
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        self : object
            The fitted instance.
        """

    def predict(self, X):
        """Predict class for samples in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            The predicted classes.
        """

    def predict_proba(self, X):
        """Predict class probabilities for samples in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        p : ndarray of shape (n_samples, n_classes)
            The class probabilities.
        """

from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification

# Synthetic three-class dataset with a mild 20/30/50 imbalance.
X, y = make_classification(
    n_samples=1000,
    n_classes=3,
    n_informative=4,
    weights=[0.2, 0.3, 0.5],
    random_state=0,
)

# Shallow balanced forest: each tree trains on an under-sampled draw
# of the full data (bootstrapping disabled).
forest_params = {
    "n_estimators": 10,
    "sampling_strategy": "all",
    "replacement": True,
    "max_depth": 2,
    "random_state": 0,
    "bootstrap": False,
}
brf = BalancedRandomForestClassifier(**forest_params)
brf.fit(X, y)

# Predict labels for the training data.
y_pred = brf.predict(X)
feature_importances = brf.feature_importances_

Bag of balanced boosted learners, also known as EasyEnsemble. This classifier is an ensemble of AdaBoost learners trained on different balanced bootstrap samples obtained by random under-sampling.
class EasyEnsembleClassifier(BaggingClassifier):
    """Bag of balanced boosted learners (EasyEnsemble).

    An ensemble of AdaBoost learners, each trained on a different
    balanced bootstrap sample obtained by random under-sampling.
    """

    def __init__(
        self,
        n_estimators=10,
        estimator=None,
        *,
        warm_start=False,
        sampling_strategy="auto",
        replacement=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
    ):
        # Signature stub: bagging controls plus the resampling controls
        # (sampling_strategy, replacement).
        ...

    def fit(self, X, y):
        """Build a Bagging ensemble of estimators from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples.
        y : array-like of shape (n_samples,)
            The target values (class labels).

        Returns
        -------
        self : object
            Fitted estimator.
        """

    def predict(self, X):
        """Predict class for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            The predicted classes.
        """

    def predict_proba(self, X):
        """Predict class probabilities for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        p : ndarray of shape (n_samples, n_classes)
            The class probabilities.
        """

from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Two-class dataset where the first class holds only 10% of samples.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    class_sep=2,
    weights=[0.1, 0.9],
    random_state=10,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Each ensemble member is an AdaBoost model fitted on a balanced draw.
ada_estimator = AdaBoostClassifier(n_estimators=10, algorithm="SAMME")
eec = EasyEnsembleClassifier(
    n_estimators=10, estimator=ada_estimator, random_state=42
)
eec.fit(X_train, y_train)

# Hard-label predictions on the held-out split.
y_pred = eec.predict(X_test)
y_proba = eec.predict_proba(X_test)

Random under-sampling integrated into the learning of AdaBoost. During learning, the class imbalance is alleviated by random under-sampling of the dataset at each iteration of the boosting algorithm.
class RUSBoostClassifier(AdaBoostClassifier):
    """Random under-sampling integrated into AdaBoost learning.

    The dataset is randomly under-sampled at each boosting iteration,
    so every weak learner trains on balanced data.
    """

    def __init__(
        self,
        estimator=None,
        *,
        n_estimators=50,
        learning_rate=1.0,
        algorithm="deprecated",
        sampling_strategy="auto",
        replacement=False,
        random_state=None,
    ):
        # Signature stub: AdaBoost controls plus the resampling controls
        # (sampling_strategy, replacement).
        ...

    def fit(self, X, y, sample_weight=None):
        """Build a boosted classifier from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples.
        y : array-like of shape (n_samples,)
            The target values (class labels).
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        self : object
            Returns self.
        """

    def predict(self, X):
        """Predict classes for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            The predicted classes.
        """

    def predict_proba(self, X):
        """Predict class probabilities for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        p : ndarray of shape (n_samples, n_classes)
            The class probabilities.
        """

from imblearn.ensemble import RUSBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification

# Mildly imbalanced three-class problem (20/30/50 split).
X, y = make_classification(
    n_samples=1000,
    n_classes=3,
    n_informative=4,
    weights=[0.2, 0.3, 0.5],
    random_state=0,
)

# Boost shallow trees, under-sampling the data at every iteration.
base_estimator = DecisionTreeClassifier(max_depth=2)
rusboost = RUSBoostClassifier(
    estimator=base_estimator,
    n_estimators=10,
    learning_rate=1.0,
    sampling_strategy="auto",
    random_state=0,
)
rusboost.fit(X, y)

# Predictions plus per-iteration diagnostics of the boosted ensemble.
y_pred = rusboost.predict(X)
y_proba = rusboost.predict_proba(X)
print(f"Estimator weights: {rusboost.estimator_weights_}")
print(f"Estimator errors: {rusboost.estimator_errors_}")

All imbalanced-learn ensemble classifiers extend their corresponding scikit-learn base classes:
- sklearn.ensemble.BaggingClassifier (for BalancedBaggingClassifier)
- sklearn.ensemble.RandomForestClassifier (for BalancedRandomForestClassifier)
- sklearn.ensemble.BaggingClassifier (for EasyEnsembleClassifier)
- sklearn.ensemble.AdaBoostClassifier (for RUSBoostClassifier)

This inheritance ensures compatibility with scikit-learn's API while adding resampling capabilities.
Each ensemble method integrates resampling differently:
Bagging approaches (BalancedBaggingClassifier, EasyEnsembleClassifier) apply resampling to each bootstrap sample before training individual estimators
Random Forest (BalancedRandomForestClassifier) applies resampling before constructing each tree, then optionally applies additional bootstrapping
Boosting (RUSBoostClassifier) applies resampling at each boosting iteration, ensuring balanced training data throughout the adaptive process
All ensemble classifiers can be used within scikit-learn pipelines:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier

# Standard two-step pipeline: scale features, then fit a balanced forest.
steps = [
    ('scaler', StandardScaler()),
    ('classifier', BalancedRandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

This modular design enables easy integration into existing machine learning workflows while providing the benefits of balanced ensemble learning for imbalanced datasets.
Install with Tessl CLI
npx tessl i tessl/pypi-imbalanced-learn