Machine Learning Library Extensions providing essential tools for day-to-day data science tasks
—
Utilities for loading common machine learning datasets and generating synthetic data for experimentation and learning.
Standard datasets for classification tasks.
def iris_data():
    """
    Load the Iris flower classification dataset.

    Returns:
    - X: array, feature matrix (150 samples, 4 features)
    - y: array, class labels (3 classes: setosa, versicolor, virginica)
    """
def wine_data():
    """
    Load the Wine recognition dataset.

    Returns:
    - X: array, feature matrix (178 samples, 13 features)
    - y: array, class labels (3 wine types)
    """

Standard datasets for regression tasks.
def boston_housing_data():
    """
    Load the Boston Housing dataset.

    Returns:
    - X: array, feature matrix (506 samples, 13 features)
    - y: array, housing prices (regression targets)
    """
def autompg_data():
    """
    Load the Auto MPG dataset for regression.

    Returns:
    - X: array, feature matrix (398 samples, 7 features)
    - y: array, miles per gallon (regression target)
    """

Image recognition datasets.
def mnist_data():
    """
    Load the MNIST handwritten digit dataset.

    Returns:
    - X: array, image data (70000 samples, 784 features)
    - y: array, digit labels (0-9)

    NOTE(review): mlxtend's own documentation describes `mnist_data` as a
    5000-sample subset (500 instances per digit); confirm the 70000 figure
    above against the installed version.
    """
def loadlocal_mnist(images_path, labels_path):
    """
    Load MNIST data from local files.

    Parameters:
    - images_path: str, path to images file
    - labels_path: str, path to labels file

    Returns:
    - X: array, image data
    - y: array, digit labels
    """

Functions for generating synthetic datasets.
def three_blobs_data():
    """
    Generate three Gaussian blobs for clustering.

    Returns:
    - X: array, feature matrix (150 samples, 2 features)
    - y: array, cluster labels (3 clusters)
    """
def make_multiplexer_dataset(address_bits, sample_size, positive_class_ratio=0.5):
    """
    Generate multiplexer boolean function dataset.

    Parameters:
    - address_bits: int, number of address bits
    - sample_size: int, number of samples to generate
    - positive_class_ratio: float, ratio of positive class samples

    Returns:
    - X: array, binary feature matrix
    - y: array, binary class labels
    """

from mlxtend.data import iris_data, wine_data, boston_housing_data
import matplotlib.pyplot as plt

# Iris (classification): report sample, feature and class counts
X_iris, y_iris = iris_data()
print(f"Iris dataset: {X_iris.shape[0]} samples, {X_iris.shape[1]} features, {len(set(y_iris))} classes")

# Wine (classification): same summary
X_wine, y_wine = wine_data()
print(f"Wine dataset: {X_wine.shape[0]} samples, {X_wine.shape[1]} features, {len(set(y_wine))} classes")

# Boston Housing (regression): targets are continuous, so no class count
X_boston, y_boston = boston_housing_data()
print(f"Boston Housing: {X_boston.shape[0]} samples, {X_boston.shape[1]} features")

# Scatter the first two Iris feature columns, one colour per class label
plt.figure(figsize=(8, 6))
sepal_length = X_iris[:, 0]
sepal_width = X_iris[:, 1]
colors = ['red', 'green', 'blue']
for i, color in enumerate(colors):
    # boolean mask selecting the samples whose label equals this class index
    mask = y_iris == i
    plt.scatter(sepal_length[mask], sepal_width[mask], c=color, label=f'Class {i}')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.title('Iris Dataset Visualization')
plt.legend()
plt.show()

from mlxtend.data import mnist_data
import matplotlib.pyplot as plt
import numpy as np

# Fetch the digit images together with their labels
X, y = mnist_data()
print(f"MNIST dataset: {X.shape[0]} samples, {X.shape[1]} features")

# Show the first ten samples on a 2 x 5 grid of axes
n_rows, n_cols = 2, 5
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 4))
for i, ax in enumerate(axes.flat):
    # each sample is a flat vector; restore the 28x28 layout before drawing
    ax.imshow(X[i].reshape(28, 28), cmap='gray')
    ax.set_title(f'Label: {y[i]}')
    ax.axis('off')
plt.tight_layout()
plt.show()

from mlxtend.data import three_blobs_data, make_multiplexer_dataset
import matplotlib.pyplot as plt
# numpy was missing from this example: the class-distribution print that
# closes it calls np.bincount, which raised NameError when run standalone.
import numpy as np

# Generate three blobs for clustering and colour each point by its label
X_blobs, y_blobs = three_blobs_data()
plt.figure(figsize=(8, 6))
plt.scatter(X_blobs[:, 0], X_blobs[:, 1], c=y_blobs, cmap='viridis')
plt.title('Three Blobs Dataset')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# Generate multiplexer dataset (positive_class_ratio left at its default)
X_mult, y_mult = make_multiplexer_dataset(address_bits=2, sample_size=1000)
print(f"Multiplexer dataset: {X_mult.shape[0]} samples, {X_mult.shape[1]} features")
print(f"Class distribution: {np.bincount(y_mult)}")

Install with Tessl CLI

npx tessl i tessl/pypi-mlxtend