0
# Utilities
1
2
Helper classes and functions for working with metric learning algorithms, including constraint generation and data preprocessing utilities.
3
4
## Capabilities
5
6
### Constraints Class
7
8
Helper class for generating constraint pairs from labeled data, enabling easy conversion from supervised learning problems to weakly-supervised metric learning.
9
10
```python { .api }
11
class Constraints:
    """Generate weakly-supervised constraints (pairs, chunks, triplets) from labels."""

    def __init__(self, partial_labels):
        """
        Build constraints from labeled data.

        Parameters:
        - partial_labels: array-like, shape=(n_samples,), labels with -1 for unknown
        """

    def positive_negative_pairs(self, n_constraints, same_length=False, random_state=None):
        """
        Generate positive and negative pairs from labeled data.

        Parameters:
        - n_constraints: int, number of positive and negative constraints to generate
        - same_length: bool, whether to ensure same number of positive and negative pairs
        - random_state: int or None, random state for reproducibility

        Returns:
        Four aligned 1D arrays of sample indices (not two arrays of pairs):
        - a: array-like, shape=(n_pos,), left elements of positive (same label) pairs
        - b: array-like, shape=(n_pos,), right elements of positive (same label) pairs
        - c: array-like, shape=(n_neg,), left elements of negative (different label) pairs
        - d: array-like, shape=(n_neg,), right elements of negative (different label) pairs

        Use np.column_stack((a, b)) and np.column_stack((c, d)) to obtain
        (n, 2) arrays of positive and negative index pairs.
        """

    def chunks(self, n_chunks=100, chunk_size=2, random_state=None):
        """
        Generate chunks of similar items for RCA algorithm.

        Parameters:
        - n_chunks: int, number of chunks to generate
        - chunk_size: int, number of items per chunk
        - random_state: int or None, random state for reproducibility

        Returns:
        - chunks: array-like, shape=(n_samples,), 1D array of chunk indicators
          where -1 indicates that the point does not belong to any chunk
        """

    def generate_knntriplets(self, X, k_genuine, k_impostor):
        """
        Generate triplets from labeled data using k-nearest neighbors.

        Parameters:
        - X: array-like, shape=(n_samples, n_features), input data
        - k_genuine: int, number of neighbors of the same class to consider
        - k_impostor: int, number of neighbors of different classes to consider

        Returns:
        - triplets: array-like, shape=(n_constraints, 3), 2D array of triplet indicators
        """
60
```
61
62
Usage examples:
63
64
```python
65
from metric_learn import Constraints
from sklearn.datasets import load_iris
import numpy as np

# Load sample data
X, y = load_iris(return_X_y=True)

# Create constraints generator from labels
constraints = Constraints(y)

# Generate positive and negative pairs.
# positive_negative_pairs returns four aligned 1D index arrays:
# (a[i], b[i]) share a label, (c[i], d[i]) have different labels.
a, b, c, d = constraints.positive_negative_pairs(n_constraints=200)
pos_pairs = np.column_stack((a, b))
neg_pairs = np.column_stack((c, d))

# Combine into format expected by weakly-supervised algorithms
pairs = np.vstack([pos_pairs, neg_pairs])
pair_labels = np.hstack([np.ones(len(pos_pairs)), -np.ones(len(neg_pairs))])

print("Generated pairs shape:", pairs.shape)
print("Pair labels shape:", pair_labels.shape)
print("Unique pair labels:", np.unique(pair_labels))  # [-1.  1.]

# Use with weakly-supervised algorithms: the pairs hold row indices into X,
# so X is supplied as the preprocessor
from metric_learn import ITML
itml = ITML(preprocessor=X)
itml.fit(pairs, pair_labels)
90
```
91
92
### Working with Different Constraint Types
93
94
The metric-learn package supports various constraint formats for different algorithms:
95
96
#### Pair Constraints
97
98
Most common format for weakly-supervised learning:
99
100
```python
101
import numpy as np
from metric_learn import Constraints, ITML, LSML
from sklearn.datasets import make_classification

# Generate sample data. With 3 classes and the default 2 clusters per class,
# make_classification needs n_informative >= 3 (the default of 2 raises).
X, y = make_classification(n_samples=200, n_features=5, n_informative=3,
                           n_classes=3, random_state=42)

# Generate pair constraints as four aligned 1D index arrays:
# (a[i], b[i]) share a label, (c[i], d[i]) have different labels.
# same_length=True keeps the arrays aligned so they can also form quadruplets.
constraints = Constraints(y)
a, b, c, d = constraints.positive_negative_pairs(n_constraints=250, same_length=True)
pos_pairs = np.column_stack((a, b))
neg_pairs = np.column_stack((c, d))
pairs = np.vstack([pos_pairs, neg_pairs])
pair_labels = np.hstack([np.ones(len(pos_pairs)), -np.ones(len(neg_pairs))])

# Use with different algorithms

# ITML learns from labeled pairs
itml = ITML(preprocessor=X)
itml.fit(pairs, pair_labels)
print(f"{itml.__class__.__name__} fitted with {len(pairs)} constraints")

# LSML learns from quadruplets (a, b, c, d), meaning that a and b should be
# closer to each other than c and d; it does not take pair labels
quadruplets = np.column_stack((a, b, c, d))
lsml = LSML(preprocessor=X)
lsml.fit(quadruplets)
print(f"{lsml.__class__.__name__} fitted with {len(quadruplets)} constraints")
122
```
123
124
#### Chunk Constraints for RCA
125
126
RCA uses a different constraint format based on chunks of similar items:
127
128
```python
129
from metric_learn import RCA
import numpy as np

# Toy dataset: 12 points with 5 features
X = np.random.randn(12, 5)

# Create chunks manually: chunks[i] is the chunk id of point i
# (use -1 for points that do not belong to any chunk)
chunks = np.array([
    0, 0, 0,     # Chunk 0: points 0-2 are similar
    1, 1, 1,     # Chunk 1: points 3-5 are similar
    2, 2, 2, 2,  # Chunk 2: points 6-9 are similar
    3, 3,        # Chunk 3: points 10-11 are similar
])

# RCA is fitted on the data together with the chunk indicators
rca = RCA(n_components=3)
rca.fit(X, chunks)
142
143
# Generate chunks from class labels
144
def labels_to_chunks(y):
    """Convert class labels to RCA chunk format.

    Parameters:
    - y: array-like, shape=(n_samples,), class labels

    Returns:
    - chunks: ndarray of int, shape=(n_samples,), 1D array of chunk indicators;
      samples sharing a label get the same chunk id (0, 1, ... in sorted label
      order) and -1 marks samples that do not belong to any chunk
    """
    y = np.asarray(y)
    chunks = np.full(y.shape[0], -1, dtype=int)
    next_chunk_id = 0
    for label in np.unique(y):
        members = y == label
        if members.sum() > 1:  # Need at least 2 items per chunk
            chunks[members] = next_chunk_id
            # Ids stay contiguous even when singleton labels are skipped
            next_chunk_id += 1
    return chunks
153
154
# Example usage
155
from metric_learn import Constraints
from sklearn.datasets import load_digits
X, y = load_digits(return_X_y=True)

# Convert labels to chunks using Constraints class
X_subset, y_subset = X[:100], y[:100]
constraints = Constraints(y_subset)
# 1D array with one chunk id per sample (-1 = not in any chunk)
chunks = constraints.chunks(n_chunks=20, chunk_size=3)

# RCA needs the data the chunk indicators refer to, in the same row order
rca = RCA(n_components=10)
rca.fit(X_subset, chunks)
X_transformed = rca.transform(X_subset)
166
```
167
168
### Data Preprocessing Utilities
169
170
While not exported as separate utilities, metric-learn algorithms include preprocessing capabilities:
171
172
#### Using Preprocessors
173
174
```python
175
from metric_learn import ITML
import numpy as np

# Your dataset
X = np.random.randn(100, 8)

# Constraints given as row indices into X (more memory efficient)
pairs_idx = [(0, 1), (2, 5), (10, 20), (15, 25)]
y = [1, -1, 1, -1]  # 1 for similar, -1 for dissimilar

# Method 1: let the estimator look the points up through its preprocessor
itml_with_preprocessor = ITML(preprocessor=X)
itml_with_preprocessor.fit(pairs_idx, y)

# Method 2: look the points up yourself; indexing X with the (n_pairs, 2)
# index array yields the (n_pairs, 2, n_features) array of actual data pairs
pairs_data = X[np.asarray(pairs_idx)]
itml_direct = ITML()
itml_direct.fit(pairs_data, y)
193
194
# Both methods are equivalent
195
```
196
197
#### Custom Preprocessor Functions
198
199
```python
200
from metric_learn import ITML
201
import numpy as np
202
203
def custom_preprocessor(indices):
    """Custom preprocessor that applies transformations before metric learning.

    A callable preprocessor maps point indicators to points; it is not handed
    whole pairs. For pair/tuple learners it is applied to one column of the
    tuples array at a time.

    Parameters:
    - indices: array-like, shape=(n_points,), 1D sequence of point indicators

    Returns:
    - points: ndarray, shape=(n_points, n_features), one transformed point
      per indicator, in the same order
    """
    # `your_data` and `your_transform_function` are placeholders: substitute
    # your own data store and per-point transformation
    return np.array([your_transform_function(your_data[i]) for i in indices])
214
215
# Use custom preprocessor
216
itml = ITML(preprocessor=custom_preprocessor)
217
itml.fit(pairs_idx, y)
218
```
219
220
### Package Version Information
221
222
```python { .api }
223
__version__: str
224
"""Package version string"""
225
```
226
227
Usage:
228
229
```python
230
import metric_learn
231
print("Metric-learn version:", metric_learn.__version__)
232
```
233
234
### Integration Utilities
235
236
Common patterns for integrating metric-learn with scikit-learn workflows:
237
238
#### Pipeline Integration
239
240
```python
241
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from metric_learn import LMNN

# Sample data so the example runs on its own
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Create pipeline with metric learning
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('metric_learner', LMNN(n_neighbors=3)),
    ('classifier', KNeighborsClassifier(n_neighbors=3))
])

# Pipeline.fit forwards y to every step, so the supervised LMNN step
# receives the labels it needs without any custom handling
pipeline.fit(X_train, y_train)
pipeline_accuracy = pipeline.score(X_test, y_test)

# The same workflow written out step by step:

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Learn metric
lmnn = LMNN(n_neighbors=3)
lmnn.fit(X_train_scaled, y_train)

# Transform data (a plain KNeighborsClassifier could be fitted on these instead)
X_train_transformed = lmnn.transform(X_train_scaled)
X_test_transformed = lmnn.transform(X_test_scaled)

# Classify with the learned metric plugged into k-NN
knn = KNeighborsClassifier(n_neighbors=3, metric=lmnn.get_metric())
knn.fit(X_train_scaled, y_train)  # Use original scaled data for metric computation
accuracy = knn.score(X_test_scaled, y_test)
276
```
277
278
#### Cross-Validation with Metric Learning
279
280
```python
281
from sklearn.model_selection import cross_val_score
282
from sklearn.neighbors import KNeighborsClassifier
283
from metric_learn import NCA
284
import numpy as np
285
286
def metric_learning_cv_score(X, y, metric_learner, classifier, cv=5, random_state=42):
    """Custom cross-validation for metric learning algorithms.

    For every fold, a fresh copy of the metric learner is fitted on the
    training split, both splits are transformed with it, and a fresh copy of
    the classifier is fitted and scored on the transformed data.

    Parameters:
    - X: array-like, shape=(n_samples, n_features), input data
    - y: array-like, shape=(n_samples,), labels
    - metric_learner: estimator with fit(X, y) and transform(X); left unfitted
    - classifier: estimator with fit(X, y) and score(X, y); left unfitted
    - cv: int, number of folds
    - random_state: int or None, seed used to shuffle the folds

    Returns:
    - scores: ndarray, shape=(cv,), classifier score on each held-out fold
    """
    from sklearn.base import clone
    from sklearn.model_selection import KFold

    # Coerce to arrays so the integer index arrays from KFold work on lists too
    X = np.asarray(X)
    y = np.asarray(y)

    kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)
    scores = []

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Fit metric learner; clone() gives an unfitted copy and, unlike
        # type(est)(**est.get_params()), also handles nested estimators
        fold_learner = clone(metric_learner)
        fold_learner.fit(X_train, y_train)

        # Transform data
        X_train_transformed = fold_learner.transform(X_train)
        X_test_transformed = fold_learner.transform(X_test)

        # Fit and score classifier
        fold_classifier = clone(classifier)
        fold_classifier.fit(X_train_transformed, y_train)
        scores.append(fold_classifier.score(X_test_transformed, y_test))

    return np.array(scores)
312
313
# Usage example
314
from sklearn.datasets import load_wine
315
X, y = load_wine(return_X_y=True)
316
317
nca = NCA(max_iter=100)
318
knn = KNeighborsClassifier(n_neighbors=3)
319
320
scores = metric_learning_cv_score(X, y, nca, knn, cv=5)
321
print(f"CV scores: {scores}")
322
print(f"Mean CV score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
323
```