Machine Learning Library Extensions providing essential tools for day-to-day data science tasks
Tools for selecting optimal feature subsets and extracting new features through dimensionality reduction techniques. All transformers follow scikit-learn's fit/transform API and can be used in scikit-learn pipelines.
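Because every transformer here implements fit/transform, it can be composed with scikit-learn estimators. A minimal pipeline sketch (the KNN classifier, dataset, and parameter values are illustrative, not part of this API):

from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, n_features=10, random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)

# The selector is an ordinary pipeline step; downstream steps see only
# the selected features
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('select', SequentialFeatureSelector(knn, k_features=5, forward=True, cv=5)),
    ('clf', knn),
])
pipe.fit(X, y)
print(pipe.score(X, y))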
Forward or backward sequential feature selection to find optimal feature subsets based on cross-validation performance.
class SequentialFeatureSelector:
def __init__(self, estimator, k_features=1, forward=True, floating=False,
verbose=0, scoring=None, cv=5, n_jobs=1,
pre_dispatch='2*n_jobs', clone_estimator=True):
"""
Sequential Feature Selector for optimal feature subset selection.
Parameters:
- estimator: sklearn-compatible estimator
- k_features: int or tuple, number of features to select
- forward: bool, forward (True) or backward (False) selection
- floating: bool, use floating selection
- verbose: int, verbosity level
- scoring: str or callable, scoring metric
- cv: int, cross-validation folds
- n_jobs: int, number of parallel jobs
- pre_dispatch: str, pre-dispatch parameter for joblib
- clone_estimator: bool, clone the estimator
"""
def fit(self, X, y, custom_feature_names=None):
"""Fit feature selector"""
def transform(self, X):
"""Transform features by selecting optimal subset"""
def fit_transform(self, X, y):
"""Fit and transform features"""
def get_metric_dict(self):
"""Get performance metrics for each subset size"""
Attributes:
- k_feature_names_: names of the selected features
- k_feature_idx_: indices of the selected features
- k_score_: cross-validation score of the selected feature subset
- subsets_: dictionary with subset information for each step

Evaluates all possible feature combinations to find the optimal subset.
class ExhaustiveFeatureSelector:
def __init__(self, estimator, min_features=1, max_features=1,
print_progress=True, scoring='accuracy', cv=5, n_jobs=1,
pre_dispatch='2*n_jobs', clone_estimator=True):
"""
Exhaustive Feature Selector evaluating all combinations.
Parameters:
- estimator: sklearn-compatible estimator
- min_features: int, minimum number of features
- max_features: int, maximum number of features
- print_progress: bool, print progress
- scoring: str or callable, scoring metric
- cv: int, cross-validation folds
- n_jobs: int, number of parallel jobs
- pre_dispatch: str, pre-dispatch parameter
- clone_estimator: bool, clone the estimator
"""
def fit(self, X, y, custom_feature_names=None):
"""Fit exhaustive feature selector"""
def transform(self, X):
"""Transform features using best subset"""
def fit_transform(self, X, y):
"""Fit and transform features"""
Attributes:
- best_idx_: indices of the best feature subset
- best_feature_names_: names of the best features
- best_score_: cross-validation score of the best subset
- subsets_: dictionary with all evaluated subsets
class ColumnSelector:
def __init__(self, cols=None, drop_axis=False):
"""
Column selector for feature matrices.
Parameters:
- cols: list, column indices or names to select
- drop_axis: bool, drop axis if single column selected
"""
def fit(self, X, y=None):
"""Fit column selector (no-op)"""
def transform(self, X):
"""Select specified columns"""
def fit_transform(self, X, y=None):
"""Fit and transform columns"""Principal Component Analysis for dimensionality reduction and feature extraction.
class PrincipalComponentAnalysis:
def __init__(self, n_components=None, solver='svd', eta=0.01, epochs=100,
minibatches=None, random_seed=None, print_progress=0):
"""
Principal Component Analysis implementation.
Parameters:
- n_components: int, number of components to keep
- solver: str, solver algorithm ('svd' or 'eigen')
- eta: float, learning rate (for gradient-based solver)
- epochs: int, number of epochs (for gradient-based solver)
- minibatches: int, number of minibatches
- random_seed: int, random seed
- print_progress: int, print progress frequency
"""
def fit(self, X, y=None):
"""Fit PCA model"""
def transform(self, X):
"""Apply dimensionality reduction"""
def fit_transform(self, X, y=None):
"""Fit and transform data"""
Attributes:
- components_: principal axes in feature space
- explained_variance_ratio_: percentage of variance explained by each component
- mean_: per-feature empirical mean
- eigenvalues_: eigenvalues of the covariance matrix
- loadings_: the loadings matrix

Linear Discriminant Analysis for supervised dimensionality reduction and classification.
class LinearDiscriminantAnalysis:
def __init__(self, n_discriminants=None):
"""
Linear Discriminant Analysis implementation.
Parameters:
- n_discriminants: int, number of discriminants to keep
"""
def fit(self, X, y):
"""Fit LDA model"""
def transform(self, X):
"""Apply LDA transformation"""
def fit_transform(self, X, y):
"""Fit and transform data"""
Attributes:
- scalings_: scaling factors for each discriminant
- explained_variance_ratio_: percentage of variance explained by each discriminant
- mean_: overall mean of the data
- means_: class means
- eigenvalues_: eigenvalues in descending order

Kernel PCA using a Radial Basis Function (RBF) kernel for non-linear dimensionality reduction.
class RBFKernelPCA:
def __init__(self, gamma=15.0, n_components=None, copy_X=True):
"""
RBF Kernel PCA for non-linear dimensionality reduction.
Parameters:
- gamma: float, RBF kernel parameter
- n_components: int, number of components to keep
- copy_X: bool, copy input data
"""
def fit(self, X, y=None):
"""Fit RBF Kernel PCA model"""
def transform(self, X):
"""Apply kernel PCA transformation"""
def fit_transform(self, X, y=None):
"""Fit and transform data"""
Attributes:
- alphas_: eigenvectors of the kernel matrix
- eigenvals_: eigenvalues of the kernel matrix
- X_fit_: training data used for kernel computation
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Create dataset
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create feature selector
clf = RandomForestClassifier(random_state=42)
sfs = SequentialFeatureSelector(clf, k_features=10, forward=True, scoring='accuracy', cv=5)
# Fit and transform features
sfs.fit(X_train, y_train)
X_train_selected = sfs.transform(X_train)
X_test_selected = sfs.transform(X_test)
# Get selected feature information
print("Selected features:", sfs.k_feature_names_)
print("Best score:", sfs.k_score_)from mlxtend.feature_extraction import PrincipalComponentAnalysis
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
# Create dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
# Apply PCA
pca = PrincipalComponentAnalysis(n_components=2)
X_pca = pca.fit_transform(X)
# Plot results
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('PCA Visualization')
plt.colorbar()
plt.show()
# Print variance explained
print("Explained variance ratio:", pca.explained_variance_ratio_)from mlxtend.feature_extraction import LinearDiscriminantAnalysis
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
# Create multi-class dataset
X, y = make_classification(n_samples=1000, n_features=10, n_classes=3,
n_informative=3, random_state=42)
# Apply LDA
lda = LinearDiscriminantAnalysis(n_discriminants=2)
X_lda = lda.fit_transform(X, y)
# Plot results
plt.figure(figsize=(8, 6))
plt.scatter(X_lda[:, 0], X_lda[:, 1], c=y, cmap='viridis')
plt.xlabel('First Linear Discriminant')
plt.ylabel('Second Linear Discriminant')
plt.title('LDA Visualization')
plt.colorbar()
plt.show()
# Print variance explained
print("Explained variance ratio:", lda.explained_variance_ratio_)Install with Tessl CLI
npx tessl i tessl/pypi-mlxtend