Toolbox for imbalanced dataset in machine learning
—
Functions for creating imbalanced datasets and fetching benchmark datasets for testing and evaluation of imbalanced learning algorithms.
Imbalanced-learn provides utilities for working with imbalanced datasets, including functions to create artificially imbalanced datasets from balanced ones and to fetch real-world benchmark datasets specifically curated for imbalanced learning research.
{ .api }
def make_imbalance(
X,
y,
*,
sampling_strategy=None,
random_state=None,
verbose=False,
**kwargs
) -> tuple[ndarray, ndarray]

Turn a dataset into an imbalanced dataset with a specific sampling strategy.
Parameters:
X ({array-like, dataframe} of shape (n_samples, n_features)): Matrix containing the data to be imbalanced.
y (array-like of shape (n_samples,)): Corresponding label for each sample in X.
sampling_strategy (dict or callable, default=None): Ratio to use for resampling the data set.
- dict: The keys correspond to the targeted classes. The values correspond to the desired number of samples for each targeted class.
- callable: Function taking y and returning a dict. The keys correspond to the targeted classes. The values correspond to the desired number of samples for each class.
random_state (int, RandomState instance or None, default=None): If int, random_state is the seed used by the random number generator; if RandomState instance, random_state is the random number generator; if None, the random number generator is the RandomState instance used by np.random.
verbose (bool, default=False): Show information regarding the sampling.
**kwargs (dict): Dictionary of additional keyword arguments to pass to sampling_strategy.

Returns:
X_resampled ({ndarray, dataframe} of shape (n_samples_new, n_features)): The array containing the imbalanced data.
y_resampled (ndarray of shape (n_samples_new,)): The corresponding label of X_resampled.

Algorithm:
The function uses RandomUnderSampler internally to reduce the number of samples in specified classes, creating imbalanced distributions from balanced datasets.
Basic Usage:
from collections import Counter
from sklearn.datasets import load_iris
from imblearn.datasets import make_imbalance
# Load balanced dataset
data = load_iris()
X, y = data.data, data.target
print(f'Distribution before imbalancing: {Counter(y)}')
# Distribution before imbalancing: Counter({0: 50, 1: 50, 2: 50})
# Create imbalanced dataset
X_res, y_res = make_imbalance(
X, y,
sampling_strategy={0: 10, 1: 20, 2: 30},
random_state=42
)
print(f'Distribution after imbalancing: {Counter(y_res)}')
# Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})

Using Callable Strategies:
def progressive_imbalance(y):
    """Create progressively more imbalanced classes.

    Each class, taken in sorted label order, is assigned half the target
    sample count of the previous one, starting from a base of 100.
    """
    from collections import Counter
    base_size = 100
    ordered_labels = sorted(Counter(y).keys())
    # Exponentially decreasing class sizes: 100, 50, 25, 12, ...
    return {label: base_size // (2 ** rank)
            for rank, label in enumerate(ordered_labels)}
# Apply progressive imbalance
X_prog, y_prog = make_imbalance(
X, y,
sampling_strategy=progressive_imbalance,
random_state=42,
verbose=True
)

Multi-class Imbalance Patterns:
from sklearn.datasets import make_classification
# Create multi-class dataset
X, y = make_classification(
n_classes=5,
n_samples=1000,
n_features=10,
n_informative=8,
n_redundant=1,
n_clusters_per_class=1,
weights=[0.2, 0.2, 0.2, 0.2, 0.2], # Initially balanced
random_state=42
)
print(f"Original distribution: {Counter(y)}")
# Create different imbalance patterns
strategies = {
'mild_imbalance': {0: 150, 1: 120, 2: 100, 3: 80, 4: 50},
'severe_imbalance': {0: 200, 1: 50, 2: 25, 3: 15, 4: 10},
'binary_like': {0: 250, 1: 250, 2: 10, 3: 10, 4: 10}
}
for name, strategy in strategies.items():
X_imb, y_imb = make_imbalance(X, y, sampling_strategy=strategy, random_state=42)
print(f"{name}: {Counter(y_imb)}")

{ .api }
def fetch_datasets(
*,
data_home=None,
filter_data=None,
download_if_missing=True,
random_state=None,
shuffle=False,
verbose=False
) -> OrderedDict

Load the benchmark datasets from Zenodo, downloading them if necessary.
Parameters:
data_home (str, default=None): Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
filter_data (tuple of str/int, default=None): A tuple containing the ID or the name of the datasets to be returned. Refer to the dataset table to get the ID and name of the datasets.
download_if_missing (bool, default=True): If False, raise an IOError if the data is not locally available instead of trying to download the data from the source site.
random_state (int, RandomState instance or None, default=None): Random state for shuffling the dataset. If int, random_state is the seed used by the random number generator; if RandomState instance, random_state is the random number generator; if None, the random number generator is the RandomState instance used by np.random.
shuffle (bool, default=False): Whether to shuffle dataset.
verbose (bool, default=False): Show information regarding the fetching.

Returns:
OrderedDict of Bunch objects: The ordering is defined by filter_data. Each Bunch object (referred to as a dataset) has the following attributes:
data (ndarray of shape (n_samples, n_features)): The input data.
target (ndarray of shape (n_samples,)): The target values.
DESCR (str): Description of the dataset.

The collection contains 27 real-world imbalanced datasets from various domains:
| ID | Name | Repository & Target | Ratio | #S | #F |
|---|---|---|---|---|---|
| 1 | ecoli | UCI, target: imU | 8.6:1 | 336 | 7 |
| 2 | optical_digits | UCI, target: 8 | 9.1:1 | 5,620 | 64 |
| 3 | satimage | UCI, target: 4 | 9.3:1 | 6,435 | 36 |
| 4 | pen_digits | UCI, target: 5 | 9.4:1 | 10,992 | 16 |
| 5 | abalone | UCI, target: 7 | 9.7:1 | 4,177 | 10 |
| 6 | sick_euthyroid | UCI, target: sick euthyroid | 9.8:1 | 3,163 | 42 |
| 7 | spectrometer | UCI, target: >=44 | 11:1 | 531 | 93 |
| 8 | car_eval_34 | UCI, target: good, v good | 12:1 | 1,728 | 21 |
| 9 | isolet | UCI, target: A, B | 12:1 | 7,797 | 617 |
| 10 | us_crime | UCI, target: >0.65 | 12:1 | 1,994 | 100 |
| 11 | yeast_ml8 | LIBSVM, target: 8 | 13:1 | 2,417 | 103 |
| 12 | scene | LIBSVM, target: >one label | 13:1 | 2,407 | 294 |
| 13 | libras_move | UCI, target: 1 | 14:1 | 360 | 90 |
| 14 | thyroid_sick | UCI, target: sick | 15:1 | 3,772 | 52 |
| 15 | coil_2000 | KDD, CoIL, target: minority | 16:1 | 9,822 | 85 |
| 16 | arrhythmia | UCI, target: 06 | 17:1 | 452 | 278 |
| 17 | solar_flare_m0 | UCI, target: M->0 | 19:1 | 1,389 | 32 |
| 18 | oil | UCI, target: minority | 22:1 | 937 | 49 |
| 19 | car_eval_4 | UCI, target: vgood | 26:1 | 1,728 | 21 |
| 20 | wine_quality | UCI, wine, target: <=4 | 26:1 | 4,898 | 11 |
| 21 | letter_img | UCI, target: Z | 26:1 | 20,000 | 16 |
| 22 | yeast_me2 | UCI, target: ME2 | 28:1 | 1,484 | 8 |
| 23 | webpage | LIBSVM, w7a, target: minority | 33:1 | 34,780 | 300 |
| 24 | ozone_level | UCI, ozone, data | 34:1 | 2,536 | 72 |
| 25 | mammography | UCI, target: minority | 42:1 | 11,183 | 6 |
| 26 | protein_homo | KDD CUP 2004, minority | 111:1 | 145,751 | 74 |
| 27 | abalone_19 | UCI, target: 19 | 130:1 | 4,177 | 10 |
Dataset Categories:
Suitable for quick experimentation and algorithm development:
# Fetch small datasets for rapid prototyping
small_datasets = fetch_datasets(filter_data=('ecoli', 'libras_move', 'arrhythmia'))
for name, dataset in small_datasets.items():
print(f"{name}: {dataset.data.shape} samples, ratio ~{dataset.DESCR}")

Good balance of complexity and computational efficiency:
# Medium-sized datasets for thorough evaluation
medium_datasets = fetch_datasets(
filter_data=('satimage', 'abalone', 'sick_euthyroid', 'coil_2000')
)

For scalability testing and real-world performance evaluation:
# Large datasets for scalability testing
large_datasets = fetch_datasets(
filter_data=('pen_digits', 'isolet', 'letter_img', 'webpage', 'protein_homo')
)

Usage Examples:
from imblearn.datasets import fetch_datasets
from collections import Counter
# Download all benchmark datasets
all_datasets = fetch_datasets(verbose=True)
# Analyze dataset characteristics
for name, dataset in all_datasets.items():
counter = Counter(dataset.target)
n_samples, n_features = dataset.data.shape
ratio = max(counter.values()) / min(counter.values())
print(f"{name}:")
print(f" Samples: {n_samples}, Features: {n_features}")
print(f" Classes: {len(counter)}, Ratio: {ratio:.1f}:1")
print(f" Distribution: {dict(counter)}")
print()

# Fetch datasets by name
datasets_by_name = fetch_datasets(
filter_data=('ecoli', 'mammography', 'abalone_19'),
shuffle=True,
random_state=42
)
# Fetch datasets by ID
datasets_by_id = fetch_datasets(
filter_data=(1, 25, 27), # Same as above
shuffle=True,
random_state=42
)
# Access individual datasets
ecoli = datasets_by_name['ecoli']
X, y = ecoli.data, ecoli.target
print(f"Ecoli dataset: {X.shape}, classes: {Counter(y)}")

from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
# Evaluate algorithm across multiple datasets
def evaluate_on_datasets(dataset_names, n_runs=5):
    """Evaluate sampling + classification across datasets.

    Fetches the named benchmark datasets, runs 5-fold cross-validated
    F1-macro for a SMOTE + random-forest pipeline on each, and returns a
    dict of per-dataset score summaries and dataset characteristics.

    NOTE(review): n_runs is currently unused — presumably intended to
    repeat the evaluation; confirm before relying on it.
    """
    fetched = fetch_datasets(filter_data=dataset_names)

    # One estimator: SMOTE oversampling feeding a random forest.
    model = Pipeline([
        ('sampling', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    summary = {}
    for ds_name, ds in fetched.items():
        fold_scores = cross_val_score(
            model, ds.data, ds.target,
            cv=5, scoring='f1_macro'
        )
        summary[ds_name] = {
            'mean_score': fold_scores.mean(),
            'std_score': fold_scores.std(),
            'dataset_info': {
                'n_samples': ds.data.shape[0],
                'n_features': ds.data.shape[1],
                'n_classes': len(Counter(ds.target))
            }
        }
    return summary
# Run evaluation
results = evaluate_on_datasets([
'ecoli', 'optical_digits', 'satimage', 'abalone', 'mammography'
])
for name, result in results.items():
info = result['dataset_info']
print(f"{name}:")
print(f" F1-macro: {result['mean_score']:.3f} ± {result['std_score']:.3f}")
print(f" Dataset: {info['n_samples']} samples, {info['n_features']} features, {info['n_classes']} classes")

from imblearn.datasets import fetch_datasets
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
import pandas as pd
def comprehensive_benchmark():
    """Systematic evaluation across datasets and methods.

    Runs 5-fold stratified cross-validation for every (dataset, sampler)
    pair — including a no-resampling baseline — and returns the metrics
    as a tidy pandas DataFrame with one row per pair.
    """
    # Hoisted out of the sampler loop: re-importing on every iteration
    # only re-runs the (cached) import machinery for no benefit.
    from imblearn.pipeline import Pipeline

    # Representative datasets spanning size and imbalance severity.
    dataset_selection = {
        'small_mild': 'ecoli',            # Small, mild imbalance
        'medium_moderate': 'abalone',     # Medium, moderate imbalance
        'large_mild': 'pen_digits',       # Large, mild imbalance
        'small_severe': 'libras_move',    # Small, severe imbalance
        'medium_severe': 'car_eval_4',    # Medium, severe imbalance
        'large_extreme': 'mammography'    # Large, extreme imbalance
    }

    # Sampling methods under comparison; None marks the no-sampling baseline.
    samplers = {
        'baseline': None,
        'smote': SMOTE(random_state=42),
        'adasyn': ADASYN(random_state=42),
        'borderline': BorderlineSMOTE(random_state=42),
        'under_random': RandomUnderSampler(random_state=42),
        'under_enn': EditedNearestNeighbours(),
        'smoteenn': SMOTEENN(random_state=42),
        'smotetomek': SMOTETomek(random_state=42)
    }

    # Fetch all selected datasets in one call.
    datasets = fetch_datasets(filter_data=tuple(dataset_selection.values()))

    results = []
    for category, dataset_name in dataset_selection.items():
        dataset = datasets[dataset_name]
        X, y = dataset.data, dataset.target
        print(f"Evaluating on {dataset_name} ({category})...")

        for sampler_name, sampler in samplers.items():
            if sampler is None:
                # Baseline without sampling
                pipeline = RandomForestClassifier(random_state=42)
            else:
                # Pipeline with sampling
                pipeline = Pipeline([
                    ('sampling', sampler),
                    ('classifier', RandomForestClassifier(random_state=42))
                ])

            # Fixed shuffled stratified split so every pair sees the same folds.
            cv_results = cross_validate(
                pipeline, X, y,
                cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],
                return_train_score=False
            )

            # One row per (dataset, sampler) pair.
            results.append({
                'dataset': dataset_name,
                'category': category,
                'sampler': sampler_name,
                'accuracy': cv_results['test_accuracy'].mean(),
                'f1_macro': cv_results['test_f1_macro'].mean(),
                'precision_macro': cv_results['test_precision_macro'].mean(),
                'recall_macro': cv_results['test_recall_macro'].mean(),
                'accuracy_std': cv_results['test_accuracy'].std(),
                'f1_std': cv_results['test_f1_macro'].std()
            })

    # Convert to DataFrame for analysis
    return pd.DataFrame(results)
# Run benchmark
benchmark_results = comprehensive_benchmark()
# Analyze results
print("\nBest F1-macro scores by dataset:")
best_by_dataset = benchmark_results.loc[benchmark_results.groupby('dataset')['f1_macro'].idxmax()]
print(best_by_dataset[['dataset', 'sampler', 'f1_macro', 'f1_std']])
print("\nAverage performance by sampler:")
avg_by_sampler = benchmark_results.groupby('sampler')[['accuracy', 'f1_macro']].mean()
print(avg_by_sampler.round(3))

def create_research_dataset_suite():
    """Create controlled imbalanced datasets for research.

    Generates four synthetic datasets with distinct imbalance profiles,
    pushes the multiclass one to a harsher distribution via
    make_imbalance, prints each dataset's characteristics, and returns a
    dict of {name: {'data': X, 'target': y}}.
    """
    from sklearn.datasets import make_classification

    # Dataset recipes: class count, class weights, size, and feature mix.
    configs = {
        'binary_mild': {
            'n_classes': 2, 'weights': [0.7, 0.3], 'n_samples': 1000,
            'n_features': 20, 'n_informative': 15, 'n_redundant': 2
        },
        'binary_severe': {
            'n_classes': 2, 'weights': [0.9, 0.1], 'n_samples': 1000,
            'n_features': 20, 'n_informative': 15, 'n_redundant': 2
        },
        'multiclass_progressive': {
            'n_classes': 5, 'weights': [0.4, 0.25, 0.2, 0.1, 0.05], 'n_samples': 2000,
            'n_features': 30, 'n_informative': 20, 'n_redundant': 5
        },
        'high_dimensional': {
            'n_classes': 3, 'weights': [0.6, 0.3, 0.1], 'n_samples': 1500,
            'n_features': 100, 'n_informative': 50, 'n_redundant': 20
        }
    }

    suite = {}
    for suite_name, recipe in configs.items():
        # Generate the base dataset for this recipe.
        X, y = make_classification(random_state=42, **recipe)

        # The progressive multiclass set gets an even more extreme skew.
        if suite_name == 'multiclass_progressive':
            harsher = {0: 600, 1: 300, 2: 150, 3: 75, 4: 25}
            X, y = make_imbalance(X, y, sampling_strategy=harsher, random_state=42)

        suite[suite_name] = {'data': X, 'target': y}

        # Report the resulting class balance.
        class_counts = Counter(y)
        ratio = max(class_counts.values()) / min(class_counts.values())
        print(f"{suite_name}:")
        print(f" Shape: {X.shape}")
        print(f" Classes: {dict(class_counts)}")
        print(f" Imbalance ratio: {ratio:.1f}:1")
        print()

    return suite
# Create research datasets
research_data = create_research_dataset_suite()

# Ensure reproducible results
def reproducible_evaluation(dataset_names, random_state=42):
    """Reproducible benchmark evaluation.

    Fetches the named datasets shuffled with a fixed random_state, then
    derives and reports a deterministically imbalanced variant of each.
    """
    # A single random_state drives both shuffling and imbalancing.
    benchmark = fetch_datasets(
        filter_data=dataset_names,
        shuffle=True,
        random_state=random_state
    )

    for ds_name, ds in benchmark.items():
        print(f"Dataset: {ds_name}")
        print(f" Original shape: {ds.data.shape}")

        # Create a reproducible imbalanced version of this dataset.
        X_imb, y_imb = make_imbalance(
            ds.data, ds.target,
            sampling_strategy={0: 100, 1: 50},  # Example strategy
            random_state=random_state,
            verbose=True
        )

        print(f" Imbalanced shape: {X_imb.shape}")
        print(f" Class distribution: {Counter(y_imb)}")
        print()
# Run reproducible evaluation
reproducible_evaluation(['ecoli', 'abalone'], random_state=42)

The datasets module provides essential tools for both creating controlled imbalanced datasets and accessing real-world benchmark datasets, enabling comprehensive evaluation and research in imbalanced learning.
Install with Tessl CLI
npx tessl i tessl/pypi-imbalanced-learn