or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

base-classes.md · index.md · specialized-algorithms.md · supervised-algorithms.md · utilities.md · weakly-supervised-algorithms.md

utilities.mddocs/

0

# Utilities

1

2

Helper classes and functions for working with metric learning algorithms, including constraint generation and data preprocessing utilities.

3

4

## Capabilities

5

6

### Constraints Class

7

8

Helper class for generating constraint pairs from labeled data, enabling easy conversion from supervised learning problems to weakly-supervised metric learning.

9

10

```python { .api }

11

class Constraints:
    """Helper class for generating constraints from labeled data.

    Enables conversion from supervised learning problems to
    weakly-supervised metric learning.
    """

    def __init__(self, partial_labels):
        """
        Build constraints from labeled data.

        Parameters:
        - partial_labels: array-like, shape=(n_samples,), labels with -1 for unknown
        """

    def positive_negative_pairs(self, n_constraints, same_length=False, random_state=None):
        """
        Generate positive and negative pairs from labeled data.

        Parameters:
        - n_constraints: int, number of positive and negative constraints to generate
        - same_length: bool, whether to ensure same number of positive and negative pairs
        - random_state: int or None, random state for reproducibility

        Returns:
        Four 1D index arrays; (a[i], b[i]) is a pair with the same label and
        (c[i], d[i]) is a pair with different labels.
        - a: array-like, shape=(n_pos,), left elements of positive pairs
        - b: array-like, shape=(n_pos,), right elements of positive pairs
        - c: array-like, shape=(n_neg,), left elements of negative pairs
        - d: array-like, shape=(n_neg,), right elements of negative pairs
        """

    def chunks(self, n_chunks=100, chunk_size=2, random_state=None):
        """
        Generate chunks of similar items for RCA algorithm.

        Parameters:
        - n_chunks: int, number of chunks to generate
        - chunk_size: int, number of items per chunk
        - random_state: int or None, random state for reproducibility

        Returns:
        - chunks: array-like, shape=(n_samples,), 1D array of chunk indicators
          where -1 indicates that the point does not belong to any chunk
        """

    def generate_knntriplets(self, X, k_genuine, k_impostor):
        """
        Generate triplets from labeled data using k-nearest neighbors.

        Parameters:
        - X: array-like, shape=(n_samples, n_features), input data
        - k_genuine: int, number of neighbors of the same class to consider
        - k_impostor: int, number of neighbors of different classes to consider

        Returns:
        - triplets: array-like, shape=(n_constraints, 3), 2D array of triplet indicators
        """

60

```

61

62

Usage examples:

63

64

```python

65

from metric_learn import Constraints

66

from sklearn.datasets import load_iris

67

import numpy as np

68

69

# Load sample data

70

X, y = load_iris(return_X_y=True)

71

72

# Create constraints generator from labels

73

constraints = Constraints(y)

74

75

# Generate positive and negative pairs
# positive_negative_pairs returns four 1D index arrays: (a[i], b[i]) share a
# label, (c[i], d[i]) do not. Stack them column-wise to get (n, 2) index pairs.
a, b, c, d = constraints.positive_negative_pairs(n_constraints=200)
pos_pairs = np.column_stack([a, b])
neg_pairs = np.column_stack([c, d])

# Combine into format expected by weakly-supervised algorithms
pairs = np.vstack([pos_pairs, neg_pairs])
pair_labels = np.hstack([np.ones(len(pos_pairs)), -np.ones(len(neg_pairs))])

81

82

print("Generated pairs shape:", pairs.shape)

83

print("Pair labels shape:", pair_labels.shape)

84

print("Unique pair labels:", np.unique(pair_labels)) # [-1, 1]

85

86

# Use with weakly-supervised algorithms

87

from metric_learn import ITML

88

itml = ITML(preprocessor=X)

89

itml.fit(pairs, pair_labels)

90

```

91

92

### Working with Different Constraint Types

93

94

The metric-learn package supports various constraint formats for different algorithms:

95

96

#### Pair Constraints

97

98

Most common format for weakly-supervised learning:

99

100

```python

101

import numpy as np
from metric_learn import Constraints, ITML, MMC
from sklearn.datasets import make_classification

# Generate sample data
# n_informative=3 is required here: make_classification needs
# n_classes * n_clusters_per_class <= 2 ** n_informative, and the default
# n_informative=2 is too small for 3 classes with 2 clusters each.
X, y = make_classification(
    n_samples=200, n_features=5, n_informative=3, n_classes=3, random_state=42
)

# Generate pair constraints
# positive_negative_pairs returns four 1D index arrays: (a[i], b[i]) share a
# label, (c[i], d[i]) do not.
constraints = Constraints(y)
a, b, c, d = constraints.positive_negative_pairs(n_constraints=250)
pos_pairs = np.column_stack([a, b])
neg_pairs = np.column_stack([c, d])
pairs = np.vstack([pos_pairs, neg_pairs])
pair_labels = np.hstack([np.ones(len(pos_pairs)), -np.ones(len(neg_pairs))])

# Use with different algorithms
# Only pair-based learners belong in this list; LSML is fitted on
# quadruplets and cannot consume (pairs, pair_labels).
algorithms = [
    ITML(preprocessor=X),
    MMC(preprocessor=X),
]

for algo in algorithms:
    algo.fit(pairs, pair_labels)
    print(f"{algo.__class__.__name__} fitted with {len(pairs)} constraints")

122

```

123

124

#### Chunk Constraints for RCA

125

126

RCA uses a different constraint format based on chunks of similar items:

127

128

```python

129

from metric_learn import RCA

130

import numpy as np

131

132

# Create chunks manually
# RCA is fitted on the data together with one chunk id per sample
# (-1 would mark a sample that belongs to no chunk).
X = np.random.randn(12, 5)
chunks = np.array([
    0, 0, 0,     # Chunk 1: samples 0-2 are similar
    1, 1, 1,     # Chunk 2: samples 3-5 are similar
    2, 2, 2, 2,  # Chunk 3: samples 6-9 are similar
    3, 3,        # Chunk 4: samples 10-11 are similar
])

# NOTE(review): n_components is the parameter name in current metric-learn
# releases (older ones used num_dims / dim) - confirm against the installed version.
rca = RCA(n_components=3)
rca.fit(X, chunks)

142

143

# Generate chunks from class labels

144

def labels_to_chunks(y):
    """Group sample positions by class label.

    Parameters:
    - y: array-like, shape=(n_samples,), class label of each sample

    Returns:
    - chunks: list of lists of int, one entry per label that occurs at least
      twice, holding the positions of that label in ``y``; entries follow the
      sorted order of the labels

    NOTE(review): metric-learn documents ``RCA.fit(X, chunks)`` as taking a
    1D array of chunk ids, not a list of index lists - confirm this layout is
    what the caller needs before passing the result to RCA.
    """
    # Coerce once so the comparison below is an element-wise array comparison
    # regardless of the container the labels arrive in.
    y = np.asarray(y)
    chunks = []
    for label in np.unique(y):
        chunk_indices = np.where(y == label)[0].tolist()
        if len(chunk_indices) > 1:  # Need at least 2 items per chunk
            chunks.append(chunk_indices)
    return chunks

153

154

# Example usage

155

from sklearn.datasets import load_digits

156

X, y = load_digits(return_X_y=True)

157

158

# Convert labels to chunks using Constraints class
from metric_learn import Constraints

X_subset, y_subset = X[:100], y[:100]
constraints = Constraints(y_subset)
chunks = constraints.chunks(n_chunks=20, chunk_size=3)

# NOTE(review): n_components is the parameter name in current metric-learn
# releases (older ones used num_dims / dim) - confirm against the installed version.
rca = RCA(n_components=10)
# RCA is fitted on the data together with its per-sample chunk ids
rca.fit(X_subset, chunks)
X_transformed = rca.transform(X_subset)

166

```

167

168

### Data Preprocessing Utilities

169

170

While not exported as separate utilities, metric-learn algorithms include preprocessing capabilities:

171

172

#### Using Preprocessors

173

174

```python

175

from metric_learn import ITML

176

import numpy as np

177

178

# Your dataset

179

X = np.random.randn(100, 8)

180

181

# Index-based constraints (more memory efficient)

182

pairs_idx = [(0, 1), (2, 5), (10, 20), (15, 25)]

183

y = [1, -1, 1, -1] # 1 for similar, -1 for dissimilar

184

185

# Method 1: Use preprocessor parameter

186

itml_with_preprocessor = ITML(preprocessor=X)

187

itml_with_preprocessor.fit(pairs_idx, y)

188

189

# Method 2: Convert indices to actual data pairs

190

pairs_data = np.array([[X[i], X[j]] for i, j in pairs_idx])

191

itml_direct = ITML()

192

itml_direct.fit(pairs_data, y)

193

194

# Both methods are equivalent

195

```

196

197

#### Custom Preprocessor Functions

198

199

```python

200

from metric_learn import ITML

201

import numpy as np

202

203

def custom_preprocessor(indices):
    """Custom preprocessor that applies transformations before metric learning.

    ``indices`` is a 2D array of shape (n_pairs, 2); the result is a 3D array
    of shape (n_pairs, 2, n_features) holding the transformed samples.
    """
    # Look up both members of every index pair and apply the custom
    # transformation to each before stacking them.
    return np.array([
        [your_transform_function(your_data[left]),
         your_transform_function(your_data[right])]
        for left, right in indices
    ])

214

215

# Use custom preprocessor

216

itml = ITML(preprocessor=custom_preprocessor)

217

itml.fit(pairs_idx, y)

218

```

219

220

### Package Version Information

221

222

```python { .api }

223

__version__: str

224

"""Package version string"""

225

```

226

227

Usage:

228

229

```python

230

import metric_learn

231

print("Metric-learn version:", metric_learn.__version__)

232

```

233

234

### Integration Utilities

235

236

Common patterns for integrating metric-learn with scikit-learn workflows:

237

238

#### Pipeline Integration

239

240

```python

241

from sklearn.pipeline import Pipeline

242

from sklearn.preprocessing import StandardScaler

243

from sklearn.neighbors import KNeighborsClassifier

244

from metric_learn import LMNN

245

246

# Create pipeline with metric learning

247

pipeline = Pipeline([

248

('scaler', StandardScaler()),

249

('metric_learner', LMNN(k=3)),

250

('classifier', KNeighborsClassifier(n_neighbors=3))

251

])

252

253

# Note: This requires custom handling since LMNN needs labels in fit()

254

# Better approach:

255

from sklearn.model_selection import train_test_split

256

257

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

258

259

# Scale data

260

scaler = StandardScaler()

261

X_train_scaled = scaler.fit_transform(X_train)

262

X_test_scaled = scaler.transform(X_test)

263

264

# Learn metric

265

lmnn = LMNN(k=3)

266

lmnn.fit(X_train_scaled, y_train)

267

268

# Transform data

269

X_train_transformed = lmnn.transform(X_train_scaled)

270

X_test_transformed = lmnn.transform(X_test_scaled)

271

272

# Classify

273

knn = KNeighborsClassifier(n_neighbors=3, metric=lmnn.get_metric())

274

knn.fit(X_train_scaled, y_train) # Use original scaled data for metric computation

275

accuracy = knn.score(X_test_scaled, y_test)

276

```

277

278

#### Cross-Validation with Metric Learning

279

280

```python

281

from sklearn.model_selection import cross_val_score

282

from sklearn.neighbors import KNeighborsClassifier

283

from metric_learn import NCA

284

import numpy as np

285

286

def metric_learning_cv_score(X, y, metric_learner, classifier, cv=5):
    """Custom cross-validation for metric learning algorithms.

    For every fold, an unfitted copy of ``metric_learner`` is fitted on the
    training split, both splits are passed through its ``transform``, and an
    unfitted copy of ``classifier`` is fitted and scored on the transformed
    data. The estimators passed in are never fitted themselves.

    Parameters:
    - X: input data, indexable with an integer index array
    - y: labels, indexable with an integer index array
    - metric_learner: estimator providing fit(X, y) and transform(X)
    - classifier: estimator providing fit(X, y) and score(X, y)
    - cv: int, number of folds (shuffled K-fold with random_state=42)

    Returns:
    - scores: numpy array with one classifier score per fold
    """
    from sklearn.base import clone
    from sklearn.model_selection import KFold

    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    scores = []

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Fit metric learner
        # clone() is used instead of type(est)(**est.get_params()): get_params()
        # is deep by default, so its nested "a__b" keys break the constructor
        # call for estimators that wrap other estimators.
        metric_learner_copy = clone(metric_learner)
        metric_learner_copy.fit(X_train, y_train)

        # Transform data
        X_train_transformed = metric_learner_copy.transform(X_train)
        X_test_transformed = metric_learner_copy.transform(X_test)

        # Fit and score classifier
        classifier_copy = clone(classifier)
        classifier_copy.fit(X_train_transformed, y_train)
        scores.append(classifier_copy.score(X_test_transformed, y_test))

    return np.array(scores)

312

313

# Usage example

314

from sklearn.datasets import load_wine

315

X, y = load_wine(return_X_y=True)

316

317

nca = NCA(max_iter=100)

318

knn = KNeighborsClassifier(n_neighbors=3)

319

320

scores = metric_learning_cv_score(X, y, nca, knn, cv=5)

321

print(f"CV scores: {scores}")

322

print(f"Mean CV score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

323

```