Python implementations of metric learning algorithms.

Helper classes and functions for working with metric learning algorithms, including constraint generation and data preprocessing utilities. The `Constraints` class builds constraint pairs from labeled data, enabling easy conversion from supervised learning problems to weakly-supervised metric learning.
class Constraints:
def __init__(self, partial_labels):
"""
Build constraints from labeled data.
Parameters:
- partial_labels: array-like, shape=(n_samples,), labels with -1 for unknown
"""
def positive_negative_pairs(self, n_constraints, same_length=False, random_state=None):
"""
Generate positive and negative pairs from labeled data.
Parameters:
- n_constraints: int, number of positive and negative constraints to generate
- same_length: bool, whether to ensure same number of positive and negative pairs
- random_state: int or None, random state for reproducibility
Returns:
- positive_pairs: array-like, shape=(n_pos, 2), pairs with same label
- negative_pairs: array-like, shape=(n_neg, 2), pairs with different labels
"""
def chunks(self, n_chunks=100, chunk_size=2, random_state=None):
"""
Generate chunks of similar items for RCA algorithm.
Parameters:
- n_chunks: int, number of chunks to generate
- chunk_size: int, number of items per chunk
- random_state: int or None, random state for reproducibility
Returns:
- chunks: array-like, shape=(n_samples,), 1D array of chunk indicators
where -1 indicates that the point does not belong to any chunk
"""
def generate_knntriplets(self, X, k_genuine, k_impostor):
"""
Generate triplets from labeled data using k-nearest neighbors.
Parameters:
- X: array-like, shape=(n_samples, n_features), input data
- k_genuine: int, number of neighbors of the same class to consider
- k_impostor: int, number of neighbors of different classes to consider
Returns:
- triplets: array-like, shape=(n_constraints, 3), 2D array of triplet indicators
"""Usage examples:
from metric_learn import Constraints
from sklearn.datasets import load_iris
import numpy as np
# Load sample data
X, y = load_iris(return_X_y=True)
# Create constraints generator from labels
constraints = Constraints(y)
# Generate positive and negative pairs
pos_pairs, neg_pairs = constraints.positive_negative_pairs(n_constraints=200)
# Combine into format expected by weakly-supervised algorithms
pairs = np.vstack([pos_pairs, neg_pairs])
pair_labels = np.hstack([np.ones(len(pos_pairs)), -np.ones(len(neg_pairs))])
print("Generated pairs shape:", pairs.shape)
print("Pair labels shape:", pair_labels.shape)
print("Unique pair labels:", np.unique(pair_labels)) # [-1, 1]
# Use with weakly-supervised algorithms
from metric_learn import ITML
itml = ITML(preprocessor=X)
itml.fit(pairs, pair_labels)

The metric-learn package supports various constraint formats for different algorithms:
Most common format for weakly-supervised learning:
from metric_learn import Constraints, ITML, LSML
from sklearn.datasets import make_classification
# Generate sample data
X, y = make_classification(n_samples=200, n_features=5, n_classes=3, random_state=42)
# Generate pair constraints
constraints = Constraints(y)
pos_pairs, neg_pairs = constraints.positive_negative_pairs(n_constraints=250)
pairs = np.vstack([pos_pairs, neg_pairs])
pair_labels = np.hstack([np.ones(len(pos_pairs)), -np.ones(len(neg_pairs))])
# Use with different algorithms
algorithms = [
ITML(preprocessor=X),
LSML(preprocessor=X)
]
for algo in algorithms:
algo.fit(pairs, pair_labels)
print(f"{algo.__class__.__name__} fitted with {len(pairs)} constraints")RCA uses a different constraint format based on chunks of similar items:
from metric_learn import RCA
import numpy as np
# Create chunks manually
chunks = [
[0, 1, 2], # Chunk 1: indices of similar items
[3, 4, 5], # Chunk 2: indices of similar items
[6, 7, 8, 9], # Chunk 3: indices of similar items
[10, 11] # Chunk 4: indices of similar items
]
rca = RCA(dim=3)
rca.fit(chunks)
# Generate chunks from class labels
def labels_to_chunks(y):
    """Convert class labels to RCA chunk format.

    Produces one chunk (a list of sample indices) per class label,
    keeping only classes that contain at least two samples, since a
    chunk of similar items needs two or more members.
    """
    index_groups = (np.flatnonzero(y == label).tolist() for label in np.unique(y))
    return [group for group in index_groups if len(group) > 1]
# Example usage
from sklearn.datasets import load_digits
X, y = load_digits(return_X_y=True)
# Convert labels to chunks using Constraints class
y_subset = y[:100]
constraints = Constraints(y_subset)
chunks = constraints.chunks(n_chunks=20, chunk_size=3)
rca = RCA(dim=10)
rca.fit(chunks)
X_transformed = rca.transform(X[:100])

While not exported as separate utilities, metric-learn algorithms include preprocessing capabilities:
from metric_learn import ITML
import numpy as np
# Your dataset
X = np.random.randn(100, 8)
# Index-based constraints (more memory efficient)
pairs_idx = [(0, 1), (2, 5), (10, 20), (15, 25)]
y = [1, -1, 1, -1] # 1 for similar, -1 for dissimilar
# Method 1: Use preprocessor parameter
itml_with_preprocessor = ITML(preprocessor=X)
itml_with_preprocessor.fit(pairs_idx, y)
# Method 2: Convert indices to actual data pairs
pairs_data = np.array([[X[i], X[j]] for i, j in pairs_idx])
itml_direct = ITML()
itml_direct.fit(pairs_data, y)
# Both methods are equivalent

from metric_learn import ITML
import numpy as np
def custom_preprocessor(indices):
    """Custom preprocessor that applies transformations before metric learning.

    Parameters:
    - indices: 2D array of shape (n_pairs, 2), index pairs into your_data

    Returns:
    - 3D array of shape (n_pairs, 2, n_features) of transformed point pairs
    """
    # Look up and transform both points of every index pair.
    transformed_pairs = [
        [your_transform_function(your_data[left]),
         your_transform_function(your_data[right])]
        for left, right in indices
    ]
    return np.array(transformed_pairs)
# Use custom preprocessor
itml = ITML(preprocessor=custom_preprocessor)
itml.fit(pairs_idx, y)

__version__: str
"""Package version string"""Usage:
import metric_learn
print("Metric-learn version:", metric_learn.__version__)Common patterns for integrating metric-learn with scikit-learn workflows:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from metric_learn import LMNN
# Create pipeline with metric learning
pipeline = Pipeline([
('scaler', StandardScaler()),
('metric_learner', LMNN(k=3)),
('classifier', KNeighborsClassifier(n_neighbors=3))
])
# Note: This requires custom handling since LMNN needs labels in fit()
# Better approach:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Learn metric
lmnn = LMNN(k=3)
lmnn.fit(X_train_scaled, y_train)
# Transform data
X_train_transformed = lmnn.transform(X_train_scaled)
X_test_transformed = lmnn.transform(X_test_scaled)
# Classify
knn = KNeighborsClassifier(n_neighbors=3, metric=lmnn.get_metric())
knn.fit(X_train_scaled, y_train) # Use original scaled data for metric computation
accuracy = knn.score(X_test_scaled, y_test)

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from metric_learn import NCA
import numpy as np
def metric_learning_cv_score(X, y, metric_learner, classifier, cv=5):
    """Custom cross-validation for metric learning algorithms.

    Parameters:
    - X: array-like, shape=(n_samples, n_features), input data
    - y: array-like, shape=(n_samples,), class labels
    - metric_learner: supervised metric learner exposing fit/transform/get_params
    - classifier: scikit-learn-style classifier exposing fit/score/get_params
    - cv: int, number of folds

    Returns:
    - numpy array of per-fold accuracy scores
    """
    from sklearn.model_selection import KFold

    splitter = KFold(n_splits=cv, shuffle=True, random_state=42)
    fold_scores = []
    for train_idx, test_idx in splitter.split(X):
        X_tr, y_tr = X[train_idx], y[train_idx]
        X_te, y_te = X[test_idx], y[test_idx]
        # Re-instantiate fresh, unfitted copies so folds never share state.
        learner = type(metric_learner)(**metric_learner.get_params())
        learner.fit(X_tr, y_tr)
        model = type(classifier)(**classifier.get_params())
        # Score the classifier in the learned metric space.
        model.fit(learner.transform(X_tr), y_tr)
        fold_scores.append(model.score(learner.transform(X_te), y_te))
    return np.array(fold_scores)
# Usage example
from sklearn.datasets import load_wine
X, y = load_wine(return_X_y=True)
nca = NCA(max_iter=100)
knn = KNeighborsClassifier(n_neighbors=3)
scores = metric_learning_cv_score(X, y, nca, knn, cv=5)
print(f"CV scores: {scores}")
print(f"Mean CV score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")Install with Tessl CLI
npx tessl i tessl/pypi-metric-learn