Toolbox for imbalanced datasets in machine learning
npx @tessl/cli install tessl/pypi-imbalanced-learn@0.14.0

A comprehensive Python toolbox for dealing with imbalanced datasets in machine learning. Provides over-sampling, under-sampling, and combination methods that integrate seamlessly with scikit-learn's API and pipeline system, enabling fair and robust machine learning models on class-imbalanced data.
pip install imbalanced-learn

import imblearn

Common imports for sampling algorithms:
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, EditedNearestNeighbours
from imblearn.combine import SMOTEENN, SMOTETomek

Pipeline integration:
from imblearn.pipeline import Pipeline, make_pipeline

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
# Create imbalanced dataset example
X = np.random.rand(1000, 4)
y = np.random.choice([0, 1], size=1000, p=[0.9, 0.1]) # 90% class 0, 10% class 1
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Apply SMOTE oversampling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
# Or use in pipeline
pipeline = Pipeline([
('sampling', SMOTE(random_state=42)),
('classifier', RandomForestClassifier(random_state=42))
])
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

Imbalanced-learn follows scikit-learn's design patterns and API conventions:
- Samplers inherit from BaseSampler and implement the fit_resample() method
- A sampling_strategy parameter across all algorithms
- Integration with scikit-learn's Pipeline through a custom Pipeline class
- A random_state parameter for reproducibility

Methods to increase minority class samples by generating synthetic examples or duplicating existing ones. Includes SMOTE variants, ADASYN, and random over-sampling approaches.
class SMOTE:
def __init__(self, sampling_strategy='auto', random_state=None, k_neighbors=5, n_jobs=None): ...
def fit_resample(self, X, y): ...
class ADASYN:
def __init__(self, sampling_strategy='auto', random_state=None, n_neighbors=5, n_jobs=None): ...
def fit_resample(self, X, y): ...
class RandomOverSampler:
def __init__(self, sampling_strategy='auto', random_state=None, shrinkage=None): ...
def fit_resample(self, X, y): ...

Methods to reduce majority class samples by removing redundant or noisy examples. Includes random under-sampling, prototype selection, and cleaning techniques.
class RandomUnderSampler:
def __init__(self, sampling_strategy='auto', random_state=None, replacement=False): ...
def fit_resample(self, X, y): ...
class TomekLinks:
def __init__(self, sampling_strategy='auto', n_jobs=None): ...
def fit_resample(self, X, y): ...
class EditedNearestNeighbours:
def __init__(self, sampling_strategy='auto', n_neighbors=3, kind_sel='all', n_jobs=None): ...
def fit_resample(self, X, y): ...

Methods that apply both over-sampling and under-sampling sequentially to balance datasets using complementary techniques.
class SMOTEENN:
def __init__(self, sampling_strategy='auto', random_state=None, smote=None, enn=None, n_jobs=None): ...
def fit_resample(self, X, y): ...
class SMOTETomek:
def __init__(self, sampling_strategy='auto', random_state=None, smote=None, tomek=None, n_jobs=None): ...
def fit_resample(self, X, y): ...

Ensemble classifiers that incorporate sampling techniques during training to handle class imbalance effectively.
class BalancedBaggingClassifier:
def __init__(self, base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0,
bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False,
sampling_strategy='auto', replacement=False, n_jobs=None, random_state=None, verbose=0): ...
def fit(self, X, y, sample_weight=None): ...
def predict(self, X): ...
class BalancedRandomForestClassifier:
def __init__(self, n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2,
min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto',
max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False,
sampling_strategy='auto', replacement=False, n_jobs=None, random_state=None,
verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None): ...
def fit(self, X, y, sample_weight=None): ...
def predict(self, X): ...

Specialized metrics designed to evaluate model performance on imbalanced datasets, providing more meaningful insights than traditional accuracy-based measures.
def sensitivity_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
sample_weight=None, zero_division='warn'): ...
def specificity_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
sample_weight=None, zero_division='warn'): ...
def geometric_mean_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
sample_weight=None, correction=0.0, zero_division='warn'): ...
def classification_report_imbalanced(y_true, y_pred, labels=None, target_names=None,
sample_weight=None, digits=2, output_dict=False,
zero_division='warn'): ...

Extended pipeline functionality for combining sampling and classification steps, ensuring proper cross-validation and avoiding data leakage.
class Pipeline:
def __init__(self, steps, memory=None, verbose=False): ...
def fit(self, X, y=None, **fit_params): ...
def predict(self, X, **predict_params): ...
def fit_resample(self, X, y): ...
def make_pipeline(*steps, memory=None, verbose=False): ...

Cross-validation and model selection tools adapted for imbalanced datasets, including instance hardness-based splitting strategies.
class InstanceHardnessCV:
def __init__(self, estimator, cv=5, n_jobs=None, verbose=0, pre_dispatch='2*n_jobs',
scoring=None, return_train_score=False): ...
def fit(self, X, y, groups=None, **fit_params): ...
def split(self, X, y, groups=None): ...

Utilities for handling imbalanced datasets in deep learning frameworks, including balanced batch generators for Keras and TensorFlow.
class BalancedBatchGenerator:
def __init__(self, X, y, sampling_strategy='auto', random_state=None, **kwargs): ...
def __call__(self): ...
def balanced_batch_generator(X, y, sampling_strategy='auto', batch_size=32, random_state=None): ...

Helper functions for validating sampling strategies, checking neighbor objects, and other utility operations.
def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): ...
def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): ...
def check_target_type(y, indicate_one_vs_all=False): ...
class FunctionSampler:
def __init__(self, func=None, accept_sparse=True, kw_args=None, validate=True): ...
def fit(self, X, y): ...
def fit_resample(self, X, y): ...

Functions for creating imbalanced datasets and fetching benchmark datasets for testing and evaluation.
def make_imbalance(X, y, sampling_strategy=None, random_state=None, verbose=False, **kwargs): ...
def fetch_datasets(data_home=None, filter_data=None, download_if_missing=True,
return_X_y=False, as_frame=False): ...

Core base classes and type definitions used throughout the package:
class BaseSampler:
def __init__(self): ...
def fit(self, X, y): ...
def fit_resample(self, X, y): ...
def _validate_params(self): ...
class SamplerMixin:
def fit_resample(self, X, y): ...
def is_sampler(estimator): ...