Python implementations of metric learning algorithms.

Helper classes and functions for working with metric learning algorithms, including constraint generation and data preprocessing utilities. The `Constraints` class builds constraint pairs from labeled data, enabling easy conversion from supervised learning problems to weakly-supervised metric learning.
class Constraints:
def __init__(self, partial_labels):
"""
Build constraints from labeled data.
Parameters:
- partial_labels: array-like, shape=(n_samples,), labels with -1 for unknown
"""
def positive_negative_pairs(self, n_constraints, same_length=False, random_state=None):
"""
Generate positive and negative pairs from labeled data.
Parameters:
- n_constraints: int, number of positive and negative constraints to generate
- same_length: bool, whether to ensure same number of positive and negative pairs
- random_state: int or None, random state for reproducibility
Returns:
- positive_pairs: array-like, shape=(n_pos, 2), pairs with same label
- negative_pairs: array-like, shape=(n_neg, 2), pairs with different labels
"""
def chunks(self, n_chunks=100, chunk_size=2, random_state=None):
"""
Generate chunks of similar items for RCA algorithm.
Parameters:
- n_chunks: int, number of chunks to generate
- chunk_size: int, number of items per chunk
- random_state: int or None, random state for reproducibility
Returns:
- chunks: array-like, shape=(n_samples,), 1D array of chunk indicators
where -1 indicates that the point does not belong to any chunk
"""
def generate_knntriplets(self, X, k_genuine, k_impostor):
"""
Generate triplets from labeled data using k-nearest neighbors.
Parameters:
- X: array-like, shape=(n_samples, n_features), input data
- k_genuine: int, number of neighbors of the same class to consider
- k_impostor: int, number of neighbors of different classes to consider
Returns:
- triplets: array-like, shape=(n_constraints, 3), 2D array of triplet indicators
"""Usage examples:
from metric_learn import Constraints
from sklearn.datasets import load_iris
import numpy as np
# Load sample data
X, y = load_iris(return_X_y=True)
# Create constraints generator from labels
constraints = Constraints(y)
# Generate positive and negative pairs
pos_pairs, neg_pairs = constraints.positive_negative_pairs(n_constraints=200)
# Combine into format expected by weakly-supervised algorithms
pairs = np.vstack([pos_pairs, neg_pairs])
pair_labels = np.hstack([np.ones(len(pos_pairs)), -np.ones(len(neg_pairs))])
print("Generated pairs shape:", pairs.shape)
print("Pair labels shape:", pair_labels.shape)
print("Unique pair labels:", np.unique(pair_labels)) # [-1, 1]
# Use with weakly-supervised algorithms
from metric_learn import ITML
itml = ITML(preprocessor=X)
itml.fit(pairs, pair_labels)

The metric-learn package supports various constraint formats for different algorithms:
Most common format for weakly-supervised learning:
from metric_learn import Constraints, ITML, LSML
from sklearn.datasets import make_classification
# Generate sample data
X, y = make_classification(n_samples=200, n_features=5, n_classes=3, random_state=42)
# Generate pair constraints
constraints = Constraints(y)
pos_pairs, neg_pairs = constraints.positive_negative_pairs(n_constraints=250)
pairs = np.vstack([pos_pairs, neg_pairs])
pair_labels = np.hstack([np.ones(len(pos_pairs)), -np.ones(len(neg_pairs))])
# Use with different algorithms
algorithms = [
ITML(preprocessor=X),
LSML(preprocessor=X)
]
for algo in algorithms:
algo.fit(pairs, pair_labels)
print(f"{algo.__class__.__name__} fitted with {len(pairs)} constraints")RCA uses a different constraint format based on chunks of similar items:
from metric_learn import RCA
import numpy as np
# Create chunks manually
chunks = [
[0, 1, 2], # Chunk 1: indices of similar items
[3, 4, 5], # Chunk 2: indices of similar items
[6, 7, 8, 9], # Chunk 3: indices of similar items
[10, 11] # Chunk 4: indices of similar items
]
rca = RCA(dim=3)
rca.fit(chunks)
# Generate chunks from class labels
def labels_to_chunks(y):
    """Convert class labels to RCA chunk format.

    Produces one chunk (a list of sample indices) per class label,
    keeping only classes that contain at least two samples, since a
    chunk of similar items needs two or more members.
    """
    index_groups = (np.flatnonzero(y == label).tolist() for label in np.unique(y))
    return [group for group in index_groups if len(group) > 1]
# Example usage
from sklearn.datasets import load_digits
X, y = load_digits(return_X_y=True)
# Convert labels to chunks using Constraints class
y_subset = y[:100]
constraints = Constraints(y_subset)
chunks = constraints.chunks(n_chunks=20, chunk_size=3)
rca = RCA(dim=10)
rca.fit(chunks)
X_transformed = rca.transform(X[:100])

While not exported as separate utilities, metric-learn algorithms include preprocessing capabilities:
from metric_learn import ITML
import numpy as np
# Your dataset
X = np.random.randn(100, 8)
# Index-based constraints (more memory efficient)
pairs_idx = [(0, 1), (2, 5), (10, 20), (15, 25)]
y = [1, -1, 1, -1] # 1 for similar, -1 for dissimilar
# Method 1: Use preprocessor parameter
itml_with_preprocessor = ITML(preprocessor=X)
itml_with_preprocessor.fit(pairs_idx, y)
# Method 2: Convert indices to actual data pairs
pairs_data = np.array([[X[i], X[j]] for i, j in pairs_idx])
itml_direct = ITML()
itml_direct.fit(pairs_data, y)
# Both methods are equivalent

from metric_learn import ITML
import numpy as np
def custom_preprocessor(indices):
    """Custom preprocessor that applies transformations before metric learning.

    Parameters:
    - indices: 2D array of shape (n_pairs, 2), index pairs into your_data

    Returns:
    - 3D array of shape (n_pairs, 2, n_features) of transformed point pairs
    """
    # Look up and transform both points of every index pair.
    transformed_pairs = [
        [your_transform_function(your_data[left]),
         your_transform_function(your_data[right])]
        for left, right in indices
    ]
    return np.array(transformed_pairs)
# Use custom preprocessor
itml = ITML(preprocessor=custom_preprocessor)
itml.fit(pairs_idx, y)

__version__: str
"""Package version string"""Usage:
import metric_learn
print("Metric-learn version:", metric_learn.__version__)Common patterns for integrating metric-learn with scikit-learn workflows:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from metric_learn import LMNN
# Create pipeline with metric learning
pipeline = Pipeline([
('scaler', StandardScaler()),
('metric_learner', LMNN(k=3)),
('classifier', KNeighborsClassifier(n_neighbors=3))
])
# Note: This requires custom handling since LMNN needs labels in fit()
# Better approach:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Learn metric
lmnn = LMNN(k=3)
lmnn.fit(X_train_scaled, y_train)
# Transform data
X_train_transformed = lmnn.transform(X_train_scaled)
X_test_transformed = lmnn.transform(X_test_scaled)
# Classify
knn = KNeighborsClassifier(n_neighbors=3, metric=lmnn.get_metric())
knn.fit(X_train_scaled, y_train) # Use original scaled data for metric computation
accuracy = knn.score(X_test_scaled, y_test)

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from metric_learn import NCA
import numpy as np
def metric_learning_cv_score(X, y, metric_learner, classifier, cv=5):
    """Custom cross-validation for metric learning algorithms.

    Parameters:
    - X: array-like, shape=(n_samples, n_features), input data
    - y: array-like, shape=(n_samples,), class labels
    - metric_learner: supervised metric learner exposing fit/transform/get_params
    - classifier: scikit-learn-style classifier exposing fit/score/get_params
    - cv: int, number of folds

    Returns:
    - numpy array of per-fold accuracy scores
    """
    from sklearn.model_selection import KFold

    splitter = KFold(n_splits=cv, shuffle=True, random_state=42)
    fold_scores = []
    for train_idx, test_idx in splitter.split(X):
        X_tr, y_tr = X[train_idx], y[train_idx]
        X_te, y_te = X[test_idx], y[test_idx]
        # Re-instantiate fresh, unfitted copies so folds never share state.
        learner = type(metric_learner)(**metric_learner.get_params())
        learner.fit(X_tr, y_tr)
        model = type(classifier)(**classifier.get_params())
        # Score the classifier in the learned metric space.
        model.fit(learner.transform(X_tr), y_tr)
        fold_scores.append(model.score(learner.transform(X_te), y_te))
    return np.array(fold_scores)
# Usage example
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
nca = NCA(max_iter=100)
knn = KNeighborsClassifier(n_neighbors=3)
scores = metric_learning_cv_score(X, y, nca, knn, cv=5)
print(f"CV scores: {scores}")
print(f"Mean CV score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")Install with Tessl CLI
npx tessl i tessl/pypi-metric-learn