Machine Learning Library Extensions providing essential tools for day-to-day data science tasks
Tools for selecting optimal feature subsets and extracting new features through dimensionality reduction techniques. All transformers follow scikit-learn's fit/transform API and can be used in scikit-learn pipelines.
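Because every transformer here implements fit/transform, it can be composed with scikit-learn estimators. A minimal pipeline sketch (the KNN classifier, dataset, and parameter values are illustrative, not part of this API):

from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, n_features=10, random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)

# The selector is an ordinary pipeline step; downstream steps see only
# the selected features
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('select', SequentialFeatureSelector(knn, k_features=5, forward=True, cv=5)),
    ('clf', knn),
])
pipe.fit(X, y)
print(pipe.score(X, y))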
Forward or backward sequential feature selection to find optimal feature subsets based on cross-validation performance.
class SequentialFeatureSelector:
def __init__(self, estimator, k_features=1, forward=True, floating=False,
verbose=0, scoring=None, cv=5, n_jobs=1,
pre_dispatch='2*n_jobs', clone_estimator=True):
"""
Sequential Feature Selector for optimal feature subset selection.
Parameters:
- estimator: sklearn-compatible estimator
- k_features: int or tuple, number of features to select
- forward: bool, forward (True) or backward (False) selection
- floating: bool, use floating selection
- verbose: int, verbosity level
- scoring: str or callable, scoring metric
- cv: int, cross-validation folds
- n_jobs: int, number of parallel jobs
- pre_dispatch: str, pre-dispatch parameter for joblib
- clone_estimator: bool, clone the estimator
"""
def fit(self, X, y, custom_feature_names=None):
"""Fit feature selector"""
def transform(self, X):
"""Transform features by selecting optimal subset"""
def fit_transform(self, X, y):
"""Fit and transform features"""
def get_metric_dict(self):
"""Get performance metrics for each subset size"""
Attributes:
- k_feature_names_: names of the selected features
- k_feature_idx_: indices of the selected features
- k_score_: cross-validation score of the selected feature subset
- subsets_: dictionary with subset information for each step

Evaluates all possible feature combinations to find the optimal subset.
class ExhaustiveFeatureSelector:
def __init__(self, estimator, min_features=1, max_features=1,
print_progress=True, scoring='accuracy', cv=5, n_jobs=1,
pre_dispatch='2*n_jobs', clone_estimator=True):
"""
Exhaustive Feature Selector evaluating all combinations.
Parameters:
- estimator: sklearn-compatible estimator
- min_features: int, minimum number of features
- max_features: int, maximum number of features
- print_progress: bool, print progress
- scoring: str or callable, scoring metric
- cv: int, cross-validation folds
- n_jobs: int, number of parallel jobs
- pre_dispatch: str, pre-dispatch parameter
- clone_estimator: bool, clone the estimator
"""
def fit(self, X, y, custom_feature_names=None):
"""Fit exhaustive feature selector"""
def transform(self, X):
"""Transform features using best subset"""
def fit_transform(self, X, y):
"""Fit and transform features"""
Attributes:
- best_idx_: indices of the best feature subset
- best_feature_names_: names of the best features
- best_score_: cross-validation score of the best subset
- subsets_: dictionary with all evaluated subsets
class ColumnSelector:
def __init__(self, cols=None, drop_axis=False):
"""
Column selector for feature matrices.
Parameters:
- cols: list, column indices or names to select
- drop_axis: bool, drop axis if single column selected
"""
def fit(self, X, y=None):
"""Fit column selector (no-op)"""
def transform(self, X):
"""Select specified columns"""
def fit_transform(self, X, y=None):
"""Fit and transform columns"""Principal Component Analysis for dimensionality reduction and feature extraction.
class PrincipalComponentAnalysis:
def __init__(self, n_components=None, solver='svd', eta=0.01, epochs=100,
minibatches=None, random_seed=None, print_progress=0):
"""
Principal Component Analysis implementation.
Parameters:
- n_components: int, number of components to keep
- solver: str, solver algorithm ('svd' or 'eigen')
- eta: float, learning rate (for gradient-based solver)
- epochs: int, number of epochs (for gradient-based solver)
- minibatches: int, number of minibatches
- random_seed: int, random seed
- print_progress: int, print progress frequency
"""
def fit(self, X, y=None):
"""Fit PCA model"""
def transform(self, X):
"""Apply dimensionality reduction"""
def fit_transform(self, X, y=None):
"""Fit and transform data"""
Attributes:
- components_: principal axes in feature space
- explained_variance_ratio_: percentage of variance explained by each component
- mean_: per-feature empirical mean
- eigenvalues_: eigenvalues of the covariance matrix
- loadings_: the loadings matrix

Linear Discriminant Analysis for supervised dimensionality reduction and classification.
class LinearDiscriminantAnalysis:
def __init__(self, n_discriminants=None):
"""
Linear Discriminant Analysis implementation.
Parameters:
- n_discriminants: int, number of discriminants to keep
"""
def fit(self, X, y):
"""Fit LDA model"""
def transform(self, X):
"""Apply LDA transformation"""
def fit_transform(self, X, y):
"""Fit and transform data"""
Attributes:
- scalings_: scaling factors for each discriminant
- explained_variance_ratio_: percentage of variance explained by each discriminant
- mean_: overall mean of the data
- means_: class means
- eigenvalues_: eigenvalues in descending order

Kernel PCA using a Radial Basis Function (RBF) kernel for non-linear dimensionality reduction.
class RBFKernelPCA:
def __init__(self, gamma=15.0, n_components=None, copy_X=True):
"""
RBF Kernel PCA for non-linear dimensionality reduction.
Parameters:
- gamma: float, RBF kernel parameter
- n_components: int, number of components to keep
- copy_X: bool, copy input data
"""
def fit(self, X, y=None):
"""Fit RBF Kernel PCA model"""
def transform(self, X):
"""Apply kernel PCA transformation"""
def fit_transform(self, X, y=None):
"""Fit and transform data"""
Attributes:
- alphas_: eigenvectors of the kernel matrix
- eigenvals_: eigenvalues of the kernel matrix
- X_fit_: training data used for kernel computation
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Create dataset
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create feature selector
clf = RandomForestClassifier(random_state=42)
sfs = SequentialFeatureSelector(clf, k_features=10, forward=True, scoring='accuracy', cv=5)
# Fit and transform features
sfs.fit(X_train, y_train)
X_train_selected = sfs.transform(X_train)
X_test_selected = sfs.transform(X_test)
# Get selected feature information
print("Selected features:", sfs.k_feature_names_)
print("Best score:", sfs.k_score_)from mlxtend.feature_extraction import PrincipalComponentAnalysis
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
# Create dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
# Apply PCA
pca = PrincipalComponentAnalysis(n_components=2)
X_pca = pca.fit_transform(X)
# Plot results
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('PCA Visualization')
plt.colorbar()
plt.show()
# Print variance explained
print("Explained variance ratio:", pca.explained_variance_ratio_)from mlxtend.feature_extraction import LinearDiscriminantAnalysis
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
# Create multi-class dataset
X, y = make_classification(n_samples=1000, n_features=10, n_classes=3,
n_informative=3, random_state=42)
# Apply LDA
lda = LinearDiscriminantAnalysis(n_discriminants=2)
X_lda = lda.fit_transform(X, y)
# Plot results
plt.figure(figsize=(8, 6))
plt.scatter(X_lda[:, 0], X_lda[:, 1], c=y, cmap='viridis')
plt.xlabel('First Linear Discriminant')
plt.ylabel('Second Linear Discriminant')
plt.title('LDA Visualization')
plt.colorbar()
plt.show()
# Print variance explained
print("Explained variance ratio:", lda.explained_variance_ratio_)Install with Tessl CLI
npx tessl i tessl/pypi-mlxtend